diff --git a/api/core/workflow/nodes/document_extractor/node.py b/api/core/workflow/nodes/document_extractor/node.py index 0b1dc611c5..38b382a7b6 100644 --- a/api/core/workflow/nodes/document_extractor/node.py +++ b/api/core/workflow/nodes/document_extractor/node.py @@ -2,6 +2,7 @@ import csv import io import json import logging +import operator import os import tempfile from typing import cast @@ -10,6 +11,8 @@ import docx import pandas as pd import pypdfium2 # type: ignore import yaml # type: ignore +from docx.table import Table +from docx.text.paragraph import Paragraph from configs import dify_config from core.file import File, FileTransferMethod, file_manager @@ -189,35 +192,56 @@ def _extract_text_from_doc(file_content: bytes) -> str: doc_file = io.BytesIO(file_content) doc = docx.Document(doc_file) text = [] - # Process paragraphs - for paragraph in doc.paragraphs: - if paragraph.text.strip(): - text.append(paragraph.text) - # Process tables - for table in doc.tables: - # Table header - try: - # table maybe cause errors so ignore it. - if len(table.rows) > 0 and table.rows[0].cells is not None: + # Keep track of paragraph and table positions + content_items: list[tuple[int, str, Table | Paragraph]] = [] + + # Process paragraphs and tables + for i, paragraph in enumerate(doc.paragraphs): + if paragraph.text.strip(): + content_items.append((i, "paragraph", paragraph)) + + for i, table in enumerate(doc.tables): + content_items.append((i, "table", table)) + + # Sort content items based on their original position + content_items.sort(key=operator.itemgetter(0)) + + # Process sorted content + for _, item_type, item in content_items: + if item_type == "paragraph": + if isinstance(item, Table): + continue + text.append(item.text) + elif item_type == "table": + # Process tables + if not isinstance(item, Table): + continue + try: # Check if any cell in the table has text has_content = False - for row in table.rows: + for row in item.rows: if any(cell.text.strip() for cell in row.cells): has_content = True break if has_content: - markdown_table = "| " + " | ".join(cell.text for cell in table.rows[0].cells) + " |\n" - markdown_table += "| " + " | ".join(["---"] * len(table.rows[0].cells)) + " |\n" - for row in table.rows[1:]: - markdown_table += "| " + " | ".join(cell.text for cell in row.cells) + " |\n" + cell_texts = [cell.text.replace("\n", "
") for cell in item.rows[0].cells] + markdown_table = f"| {' | '.join(cell_texts)} |\n" + markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n" + + for row in item.rows[1:]: + # Replace newlines with
in each cell + row_cells = [cell.text.replace("\n", "
") for cell in row.cells] + markdown_table += "| " + " | ".join(row_cells) + " |\n" + text.append(markdown_table) - except Exception as e: - logger.warning(f"Failed to extract table from DOC/DOCX: {e}") - continue + except Exception as e: + logger.warning(f"Failed to extract table from DOC/DOCX: {e}") + continue return "\n".join(text) + except Exception as e: raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e