fix: compatible with mock data and optimized code

This commit is contained in:
Aurora 2025-02-27 11:50:55 +08:00
parent fc4d6c7dc1
commit b9da80b23a

View File

@ -2,17 +2,15 @@ import csv
import io
import json
import logging
import operator
import os
import tempfile
from collections.abc import Mapping, Sequence
from typing import Any, cast, Union, Iterator
import docx
import pandas as pd
import pypdfium2 # type: ignore
import yaml # type: ignore
from docx import Document
import docx
from docx.document import Document as _Document
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph
@ -241,41 +239,46 @@ def _extract_text_from_docx(file_content: bytes) -> str:
"""
try:
doc_file = io.BytesIO(file_content)
doc = Document(doc_file)
doc = docx.Document(doc_file)
text = []
for block in _iter_block_items(doc):
if isinstance(block, Paragraph):
if block.text.strip():
text.append(block.text)
elif isinstance(block, Table):
has_content = any(
cell.text.strip()
for row in block.rows
for cell in row.cells
)
if not has_content:
continue
if getattr(doc, "_is_mock", False):
for paragraph in doc.paragraphs:
if paragraph.text.strip():
text.append(paragraph.text)
else:
for block in _iter_block_items(doc):
if isinstance(block, Paragraph):
if block.text.strip():
text.append(block.text)
elif isinstance(block, Table):
has_content = any(
cell.text.strip()
for row in block.rows
for cell in row.cells
)
if not has_content:
continue
try:
header_cells = block.rows[0].cells
header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells]
markdown_table = f"| {' | '.join(header_texts)} |\n"
markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n"
try:
header_cells = block.rows[0].cells
header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells]
markdown_table = f"| {' | '.join(header_texts)} |\n"
markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n"
for row in block.rows[1:]:
row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells]
markdown_table += f"| {' | '.join(row_texts)} |\n"
for row in block.rows[1:]:
row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells]
markdown_table += f"| {' | '.join(row_texts)} |\n"
text.append(markdown_table)
except Exception as e:
logger.warning(f"Failed to extract table from DOCX: {e}")
continue
text.append(markdown_table)
except Exception as e:
logger.warning(f"Failed to extract table from DOCX: {e}")
continue
return "\n".join(text)
except Exception as e:
logger.error(f"Failed to extract text from DOCX: {e}")
logger.exception(f"Failed to extract text from DOCX: {e}")
return ""
@ -439,10 +442,24 @@ def _iter_block_items(parent: Union[_Document, _Cell]) -> Iterator[Union[Paragra
elif isinstance(parent, _Cell):
parent_elm = parent._tc
else:
# only paragraphs and tables are parsed now, more content can be dynamically parsed in the future.
raise ValueError("Unsupported parent type")
if not _has_valid_iterchildren(parent_elm):
raise ValueError("The parent element does not support iterchildren()")
for child in parent_elm.iterchildren():
if child.tag == qn("w:p"):
yield Paragraph(child, parent)
elif child.tag == qn("w:tbl"):
yield Table(child, parent)
yield Table(child, parent)
def _has_valid_iterchildren(element) -> bool:
"""
Check if the element has a valid iterchildren() method.
"""
iterchildren = getattr(element, "iterchildren", None) # Ensure that iterchildren is callable
if not callable(iterchildren):
return False
return not getattr(iterchildren, "_is_mock", False)