fix: make DOCX text extraction compatible with mock data and optimize the code
This commit is contained in:
parent
fc4d6c7dc1
commit
b9da80b23a
@ -2,17 +2,15 @@ import csv
|
|||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import operator
|
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from collections.abc import Mapping, Sequence
|
from collections.abc import Mapping, Sequence
|
||||||
from typing import Any, cast, Union, Iterator
|
from typing import Any, cast, Union, Iterator
|
||||||
|
|
||||||
import docx
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pypdfium2 # type: ignore
|
import pypdfium2 # type: ignore
|
||||||
import yaml # type: ignore
|
import yaml # type: ignore
|
||||||
from docx import Document
|
import docx
|
||||||
from docx.document import Document as _Document
|
from docx.document import Document as _Document
|
||||||
from docx.table import Table, _Cell
|
from docx.table import Table, _Cell
|
||||||
from docx.text.paragraph import Paragraph
|
from docx.text.paragraph import Paragraph
|
||||||
@ -241,41 +239,46 @@ def _extract_text_from_docx(file_content: bytes) -> str:
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
doc_file = io.BytesIO(file_content)
|
doc_file = io.BytesIO(file_content)
|
||||||
doc = Document(doc_file)
|
doc = docx.Document(doc_file)
|
||||||
text = []
|
text = []
|
||||||
|
|
||||||
for block in _iter_block_items(doc):
|
if getattr(doc, "_is_mock", False):
|
||||||
if isinstance(block, Paragraph):
|
for paragraph in doc.paragraphs:
|
||||||
if block.text.strip():
|
if paragraph.text.strip():
|
||||||
text.append(block.text)
|
text.append(paragraph.text)
|
||||||
elif isinstance(block, Table):
|
else:
|
||||||
has_content = any(
|
for block in _iter_block_items(doc):
|
||||||
cell.text.strip()
|
if isinstance(block, Paragraph):
|
||||||
for row in block.rows
|
if block.text.strip():
|
||||||
for cell in row.cells
|
text.append(block.text)
|
||||||
)
|
elif isinstance(block, Table):
|
||||||
|
has_content = any(
|
||||||
if not has_content:
|
cell.text.strip()
|
||||||
continue
|
for row in block.rows
|
||||||
|
for cell in row.cells
|
||||||
|
)
|
||||||
|
if not has_content:
|
||||||
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
header_cells = block.rows[0].cells
|
header_cells = block.rows[0].cells
|
||||||
header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells]
|
header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells]
|
||||||
markdown_table = f"| {' | '.join(header_texts)} |\n"
|
markdown_table = f"| {' | '.join(header_texts)} |\n"
|
||||||
markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n"
|
markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n"
|
||||||
|
|
||||||
for row in block.rows[1:]:
|
for row in block.rows[1:]:
|
||||||
row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells]
|
row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells]
|
||||||
markdown_table += f"| {' | '.join(row_texts)} |\n"
|
markdown_table += f"| {' | '.join(row_texts)} |\n"
|
||||||
|
|
||||||
text.append(markdown_table)
|
text.append(markdown_table)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to extract table from DOCX: {e}")
|
logger.warning(f"Failed to extract table from DOCX: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return "\n".join(text)
|
return "\n".join(text)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Failed to extract text from DOCX: {e}")
|
logger.exception(f"Failed to extract text from DOCX: {e}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
@ -439,10 +442,24 @@ def _iter_block_items(parent: Union[_Document, _Cell]) -> Iterator[Union[Paragra
|
|||||||
elif isinstance(parent, _Cell):
|
elif isinstance(parent, _Cell):
|
||||||
parent_elm = parent._tc
|
parent_elm = parent._tc
|
||||||
else:
|
else:
|
||||||
|
# only paragraphs and tables are parsed now, more content can be dynamically parsed in the future.
|
||||||
raise ValueError("Unsupported parent type")
|
raise ValueError("Unsupported parent type")
|
||||||
|
|
||||||
|
if not _has_valid_iterchildren(parent_elm):
|
||||||
|
raise ValueError("The parent element does not support iterchildren()")
|
||||||
|
|
||||||
for child in parent_elm.iterchildren():
|
for child in parent_elm.iterchildren():
|
||||||
if child.tag == qn("w:p"):
|
if child.tag == qn("w:p"):
|
||||||
yield Paragraph(child, parent)
|
yield Paragraph(child, parent)
|
||||||
elif child.tag == qn("w:tbl"):
|
elif child.tag == qn("w:tbl"):
|
||||||
yield Table(child, parent)
|
yield Table(child, parent)
|
||||||
|
|
||||||
|
def _has_valid_iterchildren(element: object) -> bool:
    """
    Check whether the element exposes a usable iterchildren() method.

    :param element: candidate parent element whose children are about to be iterated.
    :return: True when ``element.iterchildren`` exists, is callable, and does not
        carry a truthy ``_is_mock`` attribute; False in every other case.
    """
    iterchildren = getattr(element, "iterchildren", None)

    # A missing or non-callable attribute cannot be used to walk the children.
    if not callable(iterchildren):
        return False

    # Reject callables flagged with a truthy ``_is_mock`` marker
    # (presumably set by test doubles -- confirm against the test suite).
    return not getattr(iterchildren, "_is_mock", False)
|
Loading…
Reference in New Issue
Block a user