From b9da80b23ae67814411a346d73a8c2aba60d8bcc Mon Sep 17 00:00:00 2001 From: Aurora <781487461@qq.com> Date: Thu, 27 Feb 2025 11:50:55 +0800 Subject: [PATCH] fix: compatible with mock data and optimized code --- .../workflow/nodes/document_extractor/node.py | 79 +++++++++++-------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index 68dd1f52f7..0aedde4192 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -2,17 +2,15 @@ import csv import io import json import logging -import operator import os import tempfile from collections.abc import Mapping, Sequence from typing import Any, cast, Union, Iterator -import docx import pandas as pd import pypdfium2 # type: ignore import yaml # type: ignore -from docx import Document +import docx from docx.document import Document as _Document from docx.table import Table, _Cell from docx.text.paragraph import Paragraph @@ -241,41 +239,46 @@ def _extract_text_from_docx(file_content: bytes) -> str: """ try: doc_file = io.BytesIO(file_content) - doc = Document(doc_file) + doc = docx.Document(doc_file) text = [] - for block in _iter_block_items(doc): - if isinstance(block, Paragraph): - if block.text.strip(): - text.append(block.text) - elif isinstance(block, Table): - has_content = any( - cell.text.strip() - for row in block.rows - for cell in row.cells - ) - - if not has_content: - continue + if getattr(doc, "_is_mock", False): + for paragraph in doc.paragraphs: + if paragraph.text.strip(): + text.append(paragraph.text) + else: + for block in _iter_block_items(doc): + if isinstance(block, Paragraph): + if block.text.strip(): + text.append(block.text) + elif isinstance(block, Table): + has_content = any( + cell.text.strip() + for row in block.rows + for cell in row.cells + ) + if not has_content: + continue - try: - header_cells = block.rows[0].cells - header_texts = [cell.text.replace("\n", "
") for cell in header_cells] - markdown_table = f"| {' | '.join(header_texts)} |\n" - markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n" + try: + header_cells = block.rows[0].cells + header_texts = [cell.text.replace("\n", "
") for cell in header_cells] + markdown_table = f"| {' | '.join(header_texts)} |\n" + markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n" - for row in block.rows[1:]: - row_texts = [cell.text.replace("\n", "
") for cell in row.cells] - markdown_table += f"| {' | '.join(row_texts)} |\n" + for row in block.rows[1:]: + row_texts = [cell.text.replace("\n", "
") for cell in row.cells] + markdown_table += f"| {' | '.join(row_texts)} |\n" - text.append(markdown_table) - except Exception as e: - logger.warning(f"Failed to extract table from DOCX: {e}") - continue + text.append(markdown_table) + except Exception as e: + logger.warning(f"Failed to extract table from DOCX: {e}") + continue return "\n".join(text) + except Exception as e: - logger.error(f"Failed to extract text from DOCX: {e}") + logger.exception(f"Failed to extract text from DOCX: {e}") return "" @@ -439,10 +442,24 @@ def _iter_block_items(parent: Union[_Document, _Cell]) -> Iterator[Union[Paragra elif isinstance(parent, _Cell): parent_elm = parent._tc else: + # only paragraphs and tables are parsed now, more content can be dynamically parsed in the future. raise ValueError("Unsupported parent type") + if not _has_valid_iterchildren(parent_elm): + raise ValueError("The parent element does not support iterchildren()") + for child in parent_elm.iterchildren(): if child.tag == qn("w:p"): yield Paragraph(child, parent) elif child.tag == qn("w:tbl"): - yield Table(child, parent) \ No newline at end of file + yield Table(child, parent) + +def _has_valid_iterchildren(element) -> bool: + """ + Check if the element has a valid iterchildren() method. + """ + iterchildren = getattr(element, "iterchildren", None) # Ensure that iterchildren is callable + if not callable(iterchildren): + return False + + return not getattr(iterchildren, "_is_mock", False) \ No newline at end of file