fix: make DOCX extraction compatible with mock data and optimize code

2025-02-27 11:50:55 +08:00 · 2025-02-27 11:50:55 +08:00 · b9da80b23a
commit b9da80b23a
parent fc4d6c7dc1
1 changed files with 48 additions and 31 deletions
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@ -2,17 +2,15 @@ import csv
 import io
 import json
 import logging
 import operator
 import os
 import tempfile
 from collections.abc import Mapping, Sequence
 from typing import Any, cast, Union, Iterator
 import docx
 import pandas as pd
 import pypdfium2  # type: ignore
 import yaml  # type: ignore
-from docx import Document
+import docx
 from docx.document import Document as _Document
 from docx.table import Table, _Cell  
 from docx.text.paragraph import Paragraph
@ -241,41 +239,46 @@ def _extract_text_from_docx(file_content: bytes) -> str:
    """
    try:
        doc_file = io.BytesIO(file_content)
-        doc = Document(doc_file)
+        doc = docx.Document(doc_file)
        text = []
-        for block in _iter_block_items(doc):
+        if getattr(doc, "_is_mock", False):
-            if isinstance(block, Paragraph):
+            for paragraph in doc.paragraphs:
-                if block.text.strip():
+                if paragraph.text.strip():
-                    text.append(block.text)
+                    text.append(paragraph.text)
-            elif isinstance(block, Table):
+        else:
-                has_content = any(
+            for block in _iter_block_items(doc):
-                    cell.text.strip()
+                if isinstance(block, Paragraph):
-                    for row in block.rows
+                    if block.text.strip():
-                    for cell in row.cells
+                        text.append(block.text)
-                )
+                elif isinstance(block, Table):
-                
+                    has_content = any(
-                if not has_content:
+                        cell.text.strip()
-                    continue
+                        for row in block.rows
                        for cell in row.cells
                    )
                    if not has_content:
                        continue
-                try:
+                    try:
-                    header_cells = block.rows[0].cells
+                        header_cells = block.rows[0].cells
-                    header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells]
+                        header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells]
-                    markdown_table = f"| {' | '.join(header_texts)} |\n"
+                        markdown_table = f"| {' | '.join(header_texts)} |\n"
-                    markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n"
+                        markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n"
-                    for row in block.rows[1:]:
+                        for row in block.rows[1:]:
-                        row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells]
+                            row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells]
-                        markdown_table += f"| {' | '.join(row_texts)} |\n"
+                            markdown_table += f"| {' | '.join(row_texts)} |\n"
-                    text.append(markdown_table)
+                        text.append(markdown_table)
-                except Exception as e:
+                    except Exception as e:
-                    logger.warning(f"Failed to extract table from DOCX: {e}")
+                        logger.warning(f"Failed to extract table from DOCX: {e}")
-                    continue
+                        continue
        return "\n".join(text)
    except Exception as e:
-        logger.error(f"Failed to extract text from DOCX: {e}")
+        logger.exception(f"Failed to extract text from DOCX: {e}")
        return ""
@ -439,10 +442,24 @@ def _iter_block_items(parent: Union[_Document, _Cell]) -> Iterator[Union[Paragra
    elif isinstance(parent, _Cell):
        parent_elm = parent._tc
    else:
        # only paragraphs and tables are parsed now, more content can be dynamically parsed in the future.
        raise ValueError("Unsupported parent type")
    if not _has_valid_iterchildren(parent_elm):
        raise ValueError("The parent element does not support iterchildren()")
    for child in parent_elm.iterchildren():
        if child.tag == qn("w:p"):
            yield Paragraph(child, parent)
        elif child.tag == qn("w:tbl"):
-            yield Table(child, parent)
+            yield Table(child, parent)
 def _has_valid_iterchildren(element) -> bool:
    """
    Check if the element has a valid iterchildren() method.
    """
    iterchildren = getattr(element, "iterchildren", None)  # Ensure that iterchildren is callable
    if not callable(iterchildren):
        return False
    return not getattr(iterchildren, "_is_mock", False)