diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index 07abe345dd..68dd1f52f7 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -6,14 +6,17 @@ import operator import os import tempfile from collections.abc import Mapping, Sequence -from typing import Any, cast +from typing import Any, cast, Union, Iterator import docx import pandas as pd import pypdfium2 # type: ignore import yaml # type: ignore -from docx.table import Table +from docx import Document +from docx.document import Document as _Document +from docx.table import Table, _Cell from docx.text.paragraph import Paragraph +from docx.oxml.ns import qn from configs import dify_config from core.file import File, FileTransferMethod, file_manager @@ -238,60 +241,42 @@ def _extract_text_from_docx(file_content: bytes) -> str: """ try: doc_file = io.BytesIO(file_content) - doc = docx.Document(doc_file) + doc = Document(doc_file) text = [] - # Keep track of paragraph and table positions - content_items: list[tuple[int, str, Table | Paragraph]] = [] - - # Process paragraphs and tables - for i, paragraph in enumerate(doc.paragraphs): - if paragraph.text.strip(): - content_items.append((i, "paragraph", paragraph)) - - for i, table in enumerate(doc.tables): - content_items.append((i, "table", table)) - - # Sort content items based on their original position - content_items.sort(key=operator.itemgetter(0)) - - # Process sorted content - for _, item_type, item in content_items: - if item_type == "paragraph": - if isinstance(item, Table): - continue - text.append(item.text) - elif item_type == "table": - # Process tables - if not isinstance(item, Table): + for block in _iter_block_items(doc): + if isinstance(block, Paragraph): + if block.text.strip(): + text.append(block.text) + elif isinstance(block, Table): + has_content = any( + cell.text.strip() + for row in block.rows + for cell in row.cells + ) + + if not has_content: continue + try: - # Check if any cell in the table has text - has_content = False - for row in item.rows: - if any(cell.text.strip() for cell in row.cells): - has_content = True - break + header_cells = block.rows[0].cells + header_texts = [cell.text.replace("\n", "
") for cell in header_cells] + markdown_table = f"| {' | '.join(header_texts)} |\n" + markdown_table += f"| {' | '.join(['---'] * len(header_cells))} |\n" - if has_content: - cell_texts = [cell.text.replace("\n", "
") for cell in item.rows[0].cells] - markdown_table = f"| {' | '.join(cell_texts)} |\n" - markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n" + for row in block.rows[1:]: + row_texts = [cell.text.replace("\n", "
") for cell in row.cells] + markdown_table += f"| {' | '.join(row_texts)} |\n" - for row in item.rows[1:]: - # Replace newlines with
in each cell - row_cells = [cell.text.replace("\n", "
") for cell in row.cells] - markdown_table += "| " + " | ".join(row_cells) + " |\n" - - text.append(markdown_table) + text.append(markdown_table) except Exception as e: - logger.warning(f"Failed to extract table from DOC: {e}") + logger.warning(f"Failed to extract table from DOCX: {e}") continue return "\n".join(text) - except Exception as e: - raise TextExtractionError(f"Failed to extract text from DOCX: {str(e)}") from e + logger.error(f"Failed to extract text from DOCX: {e}") + return "" def _download_file_content(file: File) -> bytes: @@ -440,3 +425,24 @@ def _extract_text_from_msg(file_content: bytes) -> str: return "\n".join([str(element) for element in elements]) except Exception as e: raise TextExtractionError(f"Failed to extract text from MSG: {str(e)}") from e + + +def _iter_block_items(parent: Union[_Document, _Cell]) -> Iterator[Union[Paragraph, Table]]: + """ + Yield each paragraph and table child within *parent*, in document order. + Each returned value is an instance of either Paragraph or Table. + """ + if isinstance(parent, _Document): + parent_elm = parent.element.body + elif isinstance(parent, Table): + parent_elm = parent._element + elif isinstance(parent, _Cell): + parent_elm = parent._tc + else: + raise ValueError("Unsupported parent type") + + for child in parent_elm.iterchildren(): + if child.tag == qn("w:p"): + yield Paragraph(child, parent) + elif child.tag == qn("w:tbl"): + yield Table(child, parent) \ No newline at end of file