Fix newline characters in tables during document parsing (#12112)
Co-authored-by: hisir <admin@qq.com>
This commit is contained in:
parent
9677144015
commit
41f39bf3fc
@ -2,6 +2,7 @@ import csv
|
|||||||
import io
|
import io
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import operator
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import cast
|
from typing import cast
|
||||||
@ -10,6 +11,8 @@ import docx
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pypdfium2 # type: ignore
|
import pypdfium2 # type: ignore
|
||||||
import yaml # type: ignore
|
import yaml # type: ignore
|
||||||
|
from docx.table import Table
|
||||||
|
from docx.text.paragraph import Paragraph
|
||||||
|
|
||||||
from configs import dify_config
|
from configs import dify_config
|
||||||
from core.file import File, FileTransferMethod, file_manager
|
from core.file import File, FileTransferMethod, file_manager
|
||||||
@ -189,35 +192,56 @@ def _extract_text_from_doc(file_content: bytes) -> str:
|
|||||||
doc_file = io.BytesIO(file_content)
|
doc_file = io.BytesIO(file_content)
|
||||||
doc = docx.Document(doc_file)
|
doc = docx.Document(doc_file)
|
||||||
text = []
|
text = []
|
||||||
# Process paragraphs
|
|
||||||
for paragraph in doc.paragraphs:
|
|
||||||
if paragraph.text.strip():
|
|
||||||
text.append(paragraph.text)
|
|
||||||
|
|
||||||
# Process tables
|
# Keep track of paragraph and table positions
|
||||||
for table in doc.tables:
|
content_items: list[tuple[int, str, Table | Paragraph]] = []
|
||||||
# Table header
|
|
||||||
try:
|
# Process paragraphs and tables
|
||||||
# A table may raise errors while being read, so skip it on failure.
|
for i, paragraph in enumerate(doc.paragraphs):
|
||||||
if len(table.rows) > 0 and table.rows[0].cells is not None:
|
if paragraph.text.strip():
|
||||||
|
content_items.append((i, "paragraph", paragraph))
|
||||||
|
|
||||||
|
for i, table in enumerate(doc.tables):
|
||||||
|
content_items.append((i, "table", table))
|
||||||
|
|
||||||
|
# Sort content items by their index (NOTE: indices come from separate enumerations of doc.paragraphs and doc.tables, so this is not true document order)
|
||||||
|
content_items.sort(key=operator.itemgetter(0))
|
||||||
|
|
||||||
|
# Process sorted content
|
||||||
|
for _, item_type, item in content_items:
|
||||||
|
if item_type == "paragraph":
|
||||||
|
if isinstance(item, Table):
|
||||||
|
continue
|
||||||
|
text.append(item.text)
|
||||||
|
elif item_type == "table":
|
||||||
|
# Process tables
|
||||||
|
if not isinstance(item, Table):
|
||||||
|
continue
|
||||||
|
try:
|
||||||
# Check if any cell in the table has text
|
# Check if any cell in the table has text
|
||||||
has_content = False
|
has_content = False
|
||||||
for row in table.rows:
|
for row in item.rows:
|
||||||
if any(cell.text.strip() for cell in row.cells):
|
if any(cell.text.strip() for cell in row.cells):
|
||||||
has_content = True
|
has_content = True
|
||||||
break
|
break
|
||||||
|
|
||||||
if has_content:
|
if has_content:
|
||||||
markdown_table = "| " + " | ".join(cell.text for cell in table.rows[0].cells) + " |\n"
|
cell_texts = [cell.text.replace("\n", "<br>") for cell in item.rows[0].cells]
|
||||||
markdown_table += "| " + " | ".join(["---"] * len(table.rows[0].cells)) + " |\n"
|
markdown_table = f"| {' | '.join(cell_texts)} |\n"
|
||||||
for row in table.rows[1:]:
|
markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n"
|
||||||
markdown_table += "| " + " | ".join(cell.text for cell in row.cells) + " |\n"
|
|
||||||
|
for row in item.rows[1:]:
|
||||||
|
# Replace newlines with <br> in each cell
|
||||||
|
row_cells = [cell.text.replace("\n", "<br>") for cell in row.cells]
|
||||||
|
markdown_table += "| " + " | ".join(row_cells) + " |\n"
|
||||||
|
|
||||||
text.append(markdown_table)
|
text.append(markdown_table)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
|
logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return "\n".join(text)
|
return "\n".join(text)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e
|
raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user