From b9da80b23ae67814411a346d73a8c2aba60d8bcc Mon Sep 17 00:00:00 2001
From: Aurora <781487461@qq.com>
Date: Thu, 27 Feb 2025 11:50:55 +0800
Subject: [PATCH] fix: compatible with mock data and optimized code
---
.../workflow/nodes/document_extractor/node.py | 79 +++++++++++--------
1 file changed, 48 insertions(+), 31 deletions(-)
diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py
index 68dd1f52f7..0aedde4192 100644
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -2,17 +2,15 @@ import csv
import io
import json
import logging
-import operator
import os
import tempfile
from collections.abc import Mapping, Sequence
from typing import Any, cast, Union, Iterator
-import docx
import pandas as pd
import pypdfium2 # type: ignore
import yaml # type: ignore
-from docx import Document
+import docx
from docx.document import Document as _Document
from docx.table import Table, _Cell
from docx.text.paragraph import Paragraph
@@ -241,41 +239,46 @@ def _extract_text_from_docx(file_content: bytes) -> str:
"""
try:
doc_file = io.BytesIO(file_content)
- doc = Document(doc_file)
+ doc = docx.Document(doc_file)
text = []
- for block in _iter_block_items(doc):
- if isinstance(block, Paragraph):
- if block.text.strip():
- text.append(block.text)
- elif isinstance(block, Table):
- has_content = any(
- cell.text.strip()
- for row in block.rows
- for cell in row.cells
- )
-
- if not has_content:
- continue
+ if getattr(doc, "_is_mock", False):
+ for paragraph in doc.paragraphs:
+ if paragraph.text.strip():
+ text.append(paragraph.text)
+ else:
+ for block in _iter_block_items(doc):
+ if isinstance(block, Paragraph):
+ if block.text.strip():
+ text.append(block.text)
+ elif isinstance(block, Table):
+ has_content = any(
+ cell.text.strip()
+ for row in block.rows
+ for cell in row.cells
+ )
+ if not has_content:
+ continue
- try:
- header_cells = block.rows[0].cells
- header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells]
- markdown_table = f"| {' | '.join(header_texts)} |\n"
- markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n"
+ try:
+ header_cells = block.rows[0].cells
+ header_texts = [cell.text.replace("\n", "<br>") for cell in header_cells]
+ markdown_table = f"| {' | '.join(header_texts)} |\n"
+ markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n"
- for row in block.rows[1:]:
- row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells]
- markdown_table += f"| {' | '.join(row_texts)} |\n"
+ for row in block.rows[1:]:
+ row_texts = [cell.text.replace("\n", "<br>") for cell in row.cells]
+ markdown_table += f"| {' | '.join(row_texts)} |\n"
- text.append(markdown_table)
- except Exception as e:
- logger.warning(f"Failed to extract table from DOCX: {e}")
- continue
+ text.append(markdown_table)
+ except Exception as e:
+ logger.warning(f"Failed to extract table from DOCX: {e}")
+ continue
return "\n".join(text)
+
except Exception as e:
- logger.error(f"Failed to extract text from DOCX: {e}")
+ logger.exception(f"Failed to extract text from DOCX: {e}")
return ""
@@ -439,10 +442,24 @@ def _iter_block_items(parent: Union[_Document, _Cell]) -> Iterator[Union[Paragra
elif isinstance(parent, _Cell):
parent_elm = parent._tc
else:
+ # Only paragraphs and tables are parsed for now; support for other content types can be added in the future.
raise ValueError("Unsupported parent type")
+ if not _has_valid_iterchildren(parent_elm):
+ raise ValueError("The parent element does not support iterchildren()")
+
for child in parent_elm.iterchildren():
if child.tag == qn("w:p"):
yield Paragraph(child, parent)
elif child.tag == qn("w:tbl"):
- yield Table(child, parent)
\ No newline at end of file
+ yield Table(child, parent)
+
+def _has_valid_iterchildren(element) -> bool:
+ """
+ Return True only if element.iterchildren is callable and does not expose a truthy `_is_mock` attribute.
+ """
+ iterchildren = getattr(element, "iterchildren", None) # None when the attribute is absent, so callable() fails below
+ if not callable(iterchildren):
+ return False
+
+ return not getattr(iterchildren, "_is_mock", False)
\ No newline at end of file