Fix newline characters in tables during document parsing (#12112)

Co-authored-by: hisir <admin@qq.com>
2025-01-07 17:26:24 +08:00 · 2025-01-07 17:26:24 +08:00 · 41f39bf3fc
commit 41f39bf3fc
parent 9677144015
1 changed files with 42 additions and 18 deletions
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@ -2,6 +2,7 @@ import csv
 import io
 import json
 import logging
+import operator
 import os
 import tempfile
 from typing import cast
@ -10,6 +11,8 @@ import docx
 import pandas as pd
 import pypdfium2  # type: ignore
 import yaml  # type: ignore
+from docx.table import Table
+from docx.text.paragraph import Paragraph

 from configs import dify_config
 from core.file import File, FileTransferMethod, file_manager
@ -189,35 +192,56 @@ def _extract_text_from_doc(file_content: bytes) -> str:
        doc_file = io.BytesIO(file_content)
        doc = docx.Document(doc_file)
        text = []
-        # Process paragraphs
-        for paragraph in doc.paragraphs:
-            if paragraph.text.strip():
-                text.append(paragraph.text)

-        # Process tables
-        for table in doc.tables:
-            # Table header
-            try:
-                # table maybe cause errors so ignore it.
-                if len(table.rows) > 0 and table.rows[0].cells is not None:
+        # Keep track of paragraph and table positions
+        content_items: list[tuple[int, str, Table | Paragraph]] = []
+
+        # Process paragraphs and tables
+        for i, paragraph in enumerate(doc.paragraphs):
+            if paragraph.text.strip():
+                content_items.append((i, "paragraph", paragraph))
+
+        for i, table in enumerate(doc.tables):
+            content_items.append((i, "table", table))
+
+        # Sort by index within doc.paragraphs / doc.tables (independent counters, so not true document order)
+        content_items.sort(key=operator.itemgetter(0))
+
+        # Process sorted content
+        for _, item_type, item in content_items:
+            if item_type == "paragraph":
+                if isinstance(item, Table):
+                    continue
+                text.append(item.text)
+            elif item_type == "table":
+                # Process tables
+                if not isinstance(item, Table):
+                    continue
+                try:
                    # Check if any cell in the table has text
                    has_content = False
-                    for row in table.rows:
+                    for row in item.rows:
                        if any(cell.text.strip() for cell in row.cells):
                            has_content = True
                            break

                    if has_content:
-                        markdown_table = "| " + " | ".join(cell.text for cell in table.rows[0].cells) + " |\n"
-                        markdown_table += "| " + " | ".join(["---"] * len(table.rows[0].cells)) + " |\n"
-                        for row in table.rows[1:]:
-                            markdown_table += "| " + " | ".join(cell.text for cell in row.cells) + " |\n"
+                        cell_texts = [cell.text.replace("\n", "<br>") for cell in item.rows[0].cells]
+                        markdown_table = f"| {' | '.join(cell_texts)} |\n"
+                        markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n"
+
+                        for row in item.rows[1:]:
+                            # Replace newlines with <br> in each cell
+                            row_cells = [cell.text.replace("\n", "<br>") for cell in row.cells]
+                            markdown_table += "| " + " | ".join(row_cells) + " |\n"
+
                        text.append(markdown_table)
-            except Exception as e:
-                logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
-                continue
+                except Exception as e:
+                    logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
+                    continue

        return "\n".join(text)
+
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e