Fix newline characters in tables during document parsing (#12112)

Co-authored-by: hisir <admin@qq.com>
This commit is contained in:
hisir 2025-01-07 17:26:24 +08:00 committed by GitHub
parent 9677144015
commit 41f39bf3fc
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -2,6 +2,7 @@ import csv
import io import io
import json import json
import logging import logging
import operator
import os import os
import tempfile import tempfile
from typing import cast from typing import cast
@@ -10,6 +11,8 @@ import docx
import pandas as pd import pandas as pd
import pypdfium2 # type: ignore import pypdfium2 # type: ignore
import yaml # type: ignore import yaml # type: ignore
from docx.table import Table
from docx.text.paragraph import Paragraph
from configs import dify_config from configs import dify_config
from core.file import File, FileTransferMethod, file_manager from core.file import File, FileTransferMethod, file_manager
@@ -189,35 +192,56 @@ def _extract_text_from_doc(file_content: bytes) -> str:
doc_file = io.BytesIO(file_content) doc_file = io.BytesIO(file_content)
doc = docx.Document(doc_file) doc = docx.Document(doc_file)
text = [] text = []
# Process paragraphs
for paragraph in doc.paragraphs:
if paragraph.text.strip():
text.append(paragraph.text)
# Process tables # Keep track of paragraph and table positions
for table in doc.tables: content_items: list[tuple[int, str, Table | Paragraph]] = []
# Table header
try: # Process paragraphs and tables
# table maybe cause errors so ignore it. for i, paragraph in enumerate(doc.paragraphs):
if len(table.rows) > 0 and table.rows[0].cells is not None: if paragraph.text.strip():
content_items.append((i, "paragraph", paragraph))
for i, table in enumerate(doc.tables):
content_items.append((i, "table", table))
# Sort content items based on their original position
content_items.sort(key=operator.itemgetter(0))
# Process sorted content
for _, item_type, item in content_items:
if item_type == "paragraph":
if isinstance(item, Table):
continue
text.append(item.text)
elif item_type == "table":
# Process tables
if not isinstance(item, Table):
continue
try:
# Check if any cell in the table has text # Check if any cell in the table has text
has_content = False has_content = False
for row in table.rows: for row in item.rows:
if any(cell.text.strip() for cell in row.cells): if any(cell.text.strip() for cell in row.cells):
has_content = True has_content = True
break break
if has_content: if has_content:
markdown_table = "| " + " | ".join(cell.text for cell in table.rows[0].cells) + " |\n" cell_texts = [cell.text.replace("\n", "<br>") for cell in item.rows[0].cells]
markdown_table += "| " + " | ".join(["---"] * len(table.rows[0].cells)) + " |\n" markdown_table = f"| {' | '.join(cell_texts)} |\n"
for row in table.rows[1:]: markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n"
markdown_table += "| " + " | ".join(cell.text for cell in row.cells) + " |\n"
for row in item.rows[1:]:
# Replace newlines with <br> in each cell
row_cells = [cell.text.replace("\n", "<br>") for cell in row.cells]
markdown_table += "| " + " | ".join(row_cells) + " |\n"
text.append(markdown_table) text.append(markdown_table)
except Exception as e: except Exception as e:
logger.warning(f"Failed to extract table from DOC/DOCX: {e}") logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
continue continue
return "\n".join(text) return "\n".join(text)
except Exception as e: except Exception as e:
raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e