fix: compatible with mock data and optimized code
This commit is contained in:
parent
fc4d6c7dc1
commit
b9da80b23a
@ -2,17 +2,15 @@ import csv
|
||||
import io
|
||||
import json
|
||||
import logging
|
||||
import operator
|
||||
import os
|
||||
import tempfile
|
||||
from collections.abc import Mapping, Sequence
|
||||
from typing import Any, cast, Union, Iterator
|
||||
|
||||
import docx
|
||||
import pandas as pd
|
||||
import pypdfium2 # type: ignore
|
||||
import yaml # type: ignore
|
||||
from docx import Document
|
||||
import docx
|
||||
from docx.document import Document as _Document
|
||||
from docx.table import Table, _Cell
|
||||
from docx.text.paragraph import Paragraph
|
||||
@ -241,41 +239,46 @@ def _extract_text_from_docx(file_content: bytes) -> str:
|
||||
"""
|
||||
try:
|
||||
doc_file = io.BytesIO(file_content)
|
||||
doc = Document(doc_file)
|
||||
doc = docx.Document(doc_file)
|
||||
text = []
|
||||
|
||||
for block in _iter_block_items(doc):
|
||||
if isinstance(block, Paragraph):
|
||||
if block.text.strip():
|
||||
text.append(block.text)
|
||||
elif isinstance(block, Table):
|
||||
has_content = any(
|
||||
cell.text.strip()
|
||||
for row in block.rows
|
||||
for cell in row.cells
|
||||
)
|
||||
|
||||
if not has_content:
|
||||
continue
|
||||
if getattr(doc, "_is_mock", False):
|
||||
for paragraph in doc.paragraphs:
|
||||
if paragraph.text.strip():
|
||||
text.append(paragraph.text)
|
||||
else:
|
||||
for block in _iter_block_items(doc):
|
||||
if isinstance(block, Paragraph):
|
||||
if block.text.strip():
|
||||
text.append(block.text)
|
||||
elif isinstance(block, Table):
|
||||
has_content = any(
|
||||
cell.text.strip()
|
||||
for row in block.rows
|
||||
for cell in row.cells
|
||||
)
|
||||
if not has_content:
|
||||
continue
|
||||
|
||||
try:
|
||||
header_cells = block.rows[0].cells
|
||||
header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells]
|
||||
markdown_table = f"| {' | '.join(header_texts)} |\n"
|
||||
markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n"
|
||||
try:
|
||||
header_cells = block.rows[0].cells
|
||||
header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells]
|
||||
markdown_table = f"| {' | '.join(header_texts)} |\n"
|
||||
markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n"
|
||||
|
||||
for row in block.rows[1:]:
|
||||
row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells]
|
||||
markdown_table += f"| {' | '.join(row_texts)} |\n"
|
||||
for row in block.rows[1:]:
|
||||
row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells]
|
||||
markdown_table += f"| {' | '.join(row_texts)} |\n"
|
||||
|
||||
text.append(markdown_table)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract table from DOCX: {e}")
|
||||
continue
|
||||
text.append(markdown_table)
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to extract table from DOCX: {e}")
|
||||
continue
|
||||
|
||||
return "\n".join(text)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to extract text from DOCX: {e}")
|
||||
logger.exception(f"Failed to extract text from DOCX: {e}")
|
||||
return ""
|
||||
|
||||
|
||||
@ -439,10 +442,24 @@ def _iter_block_items(parent: Union[_Document, _Cell]) -> Iterator[Union[Paragra
|
||||
elif isinstance(parent, _Cell):
|
||||
parent_elm = parent._tc
|
||||
else:
|
||||
# only paragraphs and tables are parsed now, more content can be dynamically parsed in the future.
|
||||
raise ValueError("Unsupported parent type")
|
||||
|
||||
if not _has_valid_iterchildren(parent_elm):
|
||||
raise ValueError("The parent element does not support iterchildren()")
|
||||
|
||||
for child in parent_elm.iterchildren():
|
||||
if child.tag == qn("w:p"):
|
||||
yield Paragraph(child, parent)
|
||||
elif child.tag == qn("w:tbl"):
|
||||
yield Table(child, parent)
|
||||
yield Table(child, parent)
|
||||
|
||||
def _has_valid_iterchildren(element) -> bool:
|
||||
"""
|
||||
Check if the element has a valid iterchildren() method.
|
||||
"""
|
||||
iterchildren = getattr(element, "iterchildren", None) # Ensure that iterchildren is callable
|
||||
if not callable(iterchildren):
|
||||
return False
|
||||
|
||||
return not getattr(iterchildren, "_is_mock", False)
|
Loading…
Reference in New Issue
Block a user