fix: compatible with mock data and optimized code

This commit is contained in:
Aurora 2025-02-27 11:50:55 +08:00
parent fc4d6c7dc1
commit b9da80b23a

View File

@ -2,17 +2,15 @@ import csv
import io import io
import json import json
import logging import logging
import operator
import os import os
import tempfile import tempfile
from collections.abc import Mapping, Sequence from collections.abc import Mapping, Sequence
from typing import Any, cast, Union, Iterator from typing import Any, cast, Union, Iterator
import docx
import pandas as pd import pandas as pd
import pypdfium2 # type: ignore import pypdfium2 # type: ignore
import yaml # type: ignore import yaml # type: ignore
from docx import Document import docx
from docx.document import Document as _Document from docx.document import Document as _Document
from docx.table import Table, _Cell from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph
@ -241,41 +239,46 @@ def _extract_text_from_docx(file_content: bytes) -> str:
""" """
try: try:
doc_file = io.BytesIO(file_content) doc_file = io.BytesIO(file_content)
doc = Document(doc_file) doc = docx.Document(doc_file)
text = [] text = []
for block in _iter_block_items(doc): if getattr(doc, "_is_mock", False):
if isinstance(block, Paragraph): for paragraph in doc.paragraphs:
if block.text.strip(): if paragraph.text.strip():
text.append(block.text) text.append(paragraph.text)
elif isinstance(block, Table): else:
has_content = any( for block in _iter_block_items(doc):
cell.text.strip() if isinstance(block, Paragraph):
for row in block.rows if block.text.strip():
for cell in row.cells text.append(block.text)
) elif isinstance(block, Table):
has_content = any(
if not has_content: cell.text.strip()
continue for row in block.rows
for cell in row.cells
)
if not has_content:
continue
try: try:
header_cells = block.rows[0].cells header_cells = block.rows[0].cells
header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells] header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells]
markdown_table = f"| {' | '.join(header_texts)} |\n" markdown_table = f"| {' | '.join(header_texts)} |\n"
markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n" markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n"
for row in block.rows[1:]: for row in block.rows[1:]:
row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells] row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells]
markdown_table += f"| {' | '.join(row_texts)} |\n" markdown_table += f"| {' | '.join(row_texts)} |\n"
text.append(markdown_table) text.append(markdown_table)
except Exception as e: except Exception as e:
logger.warning(f"Failed to extract table from DOCX: {e}") logger.warning(f"Failed to extract table from DOCX: {e}")
continue continue
return "\n".join(text) return "\n".join(text)
except Exception as e: except Exception as e:
logger.error(f"Failed to extract text from DOCX: {e}") logger.exception(f"Failed to extract text from DOCX: {e}")
return "" return ""
@ -439,10 +442,24 @@ def _iter_block_items(parent: Union[_Document, _Cell]) -> Iterator[Union[Paragra
elif isinstance(parent, _Cell): elif isinstance(parent, _Cell):
parent_elm = parent._tc parent_elm = parent._tc
else: else:
# only paragraphs and tables are parsed now, more content can be dynamically parsed in the future.
raise ValueError("Unsupported parent type") raise ValueError("Unsupported parent type")
if not _has_valid_iterchildren(parent_elm):
raise ValueError("The parent element does not support iterchildren()")
for child in parent_elm.iterchildren(): for child in parent_elm.iterchildren():
if child.tag == qn("w:p"): if child.tag == qn("w:p"):
yield Paragraph(child, parent) yield Paragraph(child, parent)
elif child.tag == qn("w:tbl"): elif child.tag == qn("w:tbl"):
yield Table(child, parent) yield Table(child, parent)
def _has_valid_iterchildren(element) -> bool:
"""
Check if the element has a valid iterchildren() method.
"""
iterchildren = getattr(element, "iterchildren", None) # Ensure that iterchildren is callable
if not callable(iterchildren):
return False
return not getattr(iterchildren, "_is_mock", False)