diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py
index d3886b8e77..17c4b3c212 100644
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -428,7 +428,7 @@ class IndexingRunner:
                 chunk_size=segmentation["max_tokens"],
                 chunk_overlap=chunk_overlap,
                 fixed_separator=separator,
-                separators=["\n\n", "。", ".", " ", ""],
+                separators=["\n\n", "。", ". ", " ", ""],
                 embedding_model_instance=embedding_model_instance
             )
         else:
@@ -436,7 +436,7 @@ class IndexingRunner:
             character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
                 chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
                 chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'],
-                separators=["\n\n", "。", ".", " ", ""],
+                separators=["\n\n", "。", ". ", " ", ""],
                 embedding_model_instance=embedding_model_instance
             )

diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py
index 093965e10e..39dc0996ac 100644
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -16,7 +16,6 @@ from core.rag.extractor.markdown_extractor import MarkdownExtractor
 from core.rag.extractor.notion_extractor import NotionExtractor
 from core.rag.extractor.pdf_extractor import PdfExtractor
 from core.rag.extractor.text_extractor import TextExtractor
-from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
 from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
 from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
 from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
@@ -108,7 +107,7 @@ class ExtractProcessor:
             elif file_extension in ['.htm', '.html']:
                 extractor = HtmlExtractor(file_path)
             elif file_extension in ['.docx']:
-                extractor = UnstructuredWordExtractor(file_path, unstructured_api_url)
+                extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
             elif file_extension == '.csv':
                 extractor = CSVExtractor(file_path, autodetect_encoding=True)
             elif file_extension == '.msg':
@@ -137,7 +136,7 @@ class ExtractProcessor:
             elif file_extension in ['.htm', '.html']:
                 extractor = HtmlExtractor(file_path)
             elif file_extension in ['.docx']:
-                extractor = WordExtractor(file_path)
+                extractor = WordExtractor(file_path, upload_file.tenant_id, upload_file.created_by)
             elif file_extension == '.csv':
                 extractor = CSVExtractor(file_path, autodetect_encoding=True)
             elif file_extension == 'epub':
diff --git a/api/core/rag/extractor/word_extractor.py b/api/core/rag/extractor/word_extractor.py
index 6bf47a76f0..5b858c6c4c 100644
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -1,12 +1,20 @@
 """Abstract interface for document loader implementations."""
+import datetime
+import mimetypes
 import os
 import tempfile
+import uuid
 from urllib.parse import urlparse

 import requests
+from docx import Document as DocxDocument
+from flask import current_app

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
+from extensions.ext_database import db
+from extensions.ext_storage import storage
+from models.model import UploadFile


 class WordExtractor(BaseExtractor):
@@ -17,9 +25,12 @@ class WordExtractor(BaseExtractor):
         file_path: Path to the file to load.
""" - def __init__(self, file_path: str): + def __init__(self, file_path: str, tenant_id: str, user_id: str): """Initialize with file path.""" self.file_path = file_path + self.tenant_id = tenant_id + self.user_id = user_id + if "~" in self.file_path: self.file_path = os.path.expanduser(self.file_path) @@ -45,12 +56,7 @@ class WordExtractor(BaseExtractor): def extract(self) -> list[Document]: """Load given path as single page.""" - from docx import Document as docx_Document - - document = docx_Document(self.file_path) - doc_texts = [paragraph.text for paragraph in document.paragraphs] - content = '\n'.join(doc_texts) - + content = self.parse_docx(self.file_path, 'storage') return [Document( page_content=content, metadata={"source": self.file_path}, @@ -61,3 +67,111 @@ class WordExtractor(BaseExtractor): """Check if the url is valid.""" parsed = urlparse(url) return bool(parsed.netloc) and bool(parsed.scheme) + + def _extract_images_from_docx(self, doc, image_folder): + os.makedirs(image_folder, exist_ok=True) + image_count = 0 + image_map = {} + + for rel in doc.part.rels.values(): + if "image" in rel.target_ref: + image_count += 1 + image_ext = rel.target_ref.split('.')[-1] + # user uuid as file name + file_uuid = str(uuid.uuid4()) + file_key = 'image_files/' + self.tenant_id + '/' + file_uuid + '.' + image_ext + mime_type, _ = mimetypes.guess_type(file_key) + + storage.save(file_key, rel.target_part.blob) + # save file to db + config = current_app.config + upload_file = UploadFile( + tenant_id=self.tenant_id, + storage_type=config['STORAGE_TYPE'], + key=file_key, + name=file_key, + size=0, + extension=image_ext, + mime_type=mime_type, + created_by=self.user_id, + created_at=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None), + used=True, + used_by=self.user_id, + used_at=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None) + ) + + db.session.add(upload_file) + db.session.commit() + image_map[rel.target_part] = f"![image]({current_app.config.get('CONSOLE_API_URL')}/files/{upload_file.id}/image-preview)" + + return image_map + + def _table_to_markdown(self, table): + markdown = "" + # deal with table headers + header_row = table.rows[0] + headers = [cell.text for cell in header_row.cells] + markdown += "| " + " | ".join(headers) + " |\n" + markdown += "| " + " | ".join(["---"] * len(headers)) + " |\n" + # deal with table rows + for row in table.rows[1:]: + row_cells = [cell.text for cell in row.cells] + markdown += "| " + " | ".join(row_cells) + " |\n" + + return markdown + + def _parse_paragraph(self, paragraph, image_map): + paragraph_content = [] + for run in paragraph.runs: + if run.element.xpath('.//a:blip'): + for blip in run.element.xpath('.//a:blip'): + embed_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed') + if embed_id: + rel_target = run.part.rels[embed_id].target_ref + if rel_target in image_map: + paragraph_content.append(image_map[rel_target]) + if run.text.strip(): + paragraph_content.append(run.text.strip()) + return ' '.join(paragraph_content) if paragraph_content else '' + + def parse_docx(self, docx_path, image_folder): + doc = DocxDocument(docx_path) + os.makedirs(image_folder, exist_ok=True) + + content = [] + + image_map = self._extract_images_from_docx(doc, image_folder) + + def parse_paragraph(paragraph): + paragraph_content = [] + for run in paragraph.runs: + if run.element.tag.endswith('r'): + drawing_elements = run.element.findall( + 
+                        './/{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing')
+                    for drawing in drawing_elements:
+                        blip_elements = drawing.findall(
+                            './/{http://schemas.openxmlformats.org/drawingml/2006/main}blip')
+                        for blip in blip_elements:
+                            embed_id = blip.get(
+                                '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
+                            if embed_id:
+                                image_part = doc.part.related_parts.get(embed_id)
+                                if image_part in image_map:
+                                    paragraph_content.append(image_map[image_part])
+                if run.text.strip():
+                    paragraph_content.append(run.text.strip())
+            return ''.join(paragraph_content) if paragraph_content else ''
+
+        paragraphs = doc.paragraphs.copy()
+        tables = doc.tables.copy()
+        for element in doc.element.body:
+            if element.tag.endswith('p'):  # paragraph
+                para = paragraphs.pop(0)
+                parsed_paragraph = parse_paragraph(para)
+                if parsed_paragraph:
+                    content.append(parsed_paragraph)
+            elif element.tag.endswith('tbl'):  # table
+                table = tables.pop(0)
+                content.append(self._table_to_markdown(table))
+        return '\n'.join(content)
+
diff --git a/api/core/rag/index_processor/index_processor_base.py b/api/core/rag/index_processor/index_processor_base.py
index c51634fee1..23d2451e2d 100644
--- a/api/core/rag/index_processor/index_processor_base.py
+++ b/api/core/rag/index_processor/index_processor_base.py
@@ -57,7 +57,7 @@ class BaseIndexProcessor(ABC):
                 chunk_size=segmentation["max_tokens"],
                 chunk_overlap=segmentation.get('chunk_overlap', 0),
                 fixed_separator=separator,
-                separators=["\n\n", "。", ".", " ", ""],
+                separators=["\n\n", "。", ". ", " ", ""],
                 embedding_model_instance=embedding_model_instance
             )
         else:
@@ -65,7 +65,7 @@ class BaseIndexProcessor(ABC):
             character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
                 chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
                 chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'],
-                separators=["\n\n", "。", ".", " ", ""],
+                separators=["\n\n", "。", ". ", " ", ""],
                 embedding_model_instance=embedding_model_instance
             )

diff --git a/api/core/rag/retrieval/dataset_retrieval.py b/api/core/rag/retrieval/dataset_retrieval.py
index 147837f715..08c7df96dd 100644
--- a/api/core/rag/retrieval/dataset_retrieval.py
+++ b/api/core/rag/retrieval/dataset_retrieval.py
@@ -144,9 +144,9 @@ class DatasetRetrieval:
                                                                                                    float('inf')))
                 for segment in sorted_segments:
                     if segment.answer:
-                        document_context_list.append(f'question:{segment.content} answer:{segment.answer}')
+                        document_context_list.append(f'question:{segment.get_sign_content()} answer:{segment.answer}')
                     else:
-                        document_context_list.append(segment.content)
+                        document_context_list.append(segment.get_sign_content())
             if show_retrieve_source:
                 context_list = []
                 resource_number = 1
diff --git a/api/core/splitter/text_splitter.py b/api/core/splitter/text_splitter.py
index 5eeb237a96..4cb3b6bf8a 100644
--- a/api/core/splitter/text_splitter.py
+++ b/api/core/splitter/text_splitter.py
@@ -94,7 +94,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
             documents.append(new_doc)
         return documents

-    def split_documents(self, documents: Iterable[Document] ) -> list[Document]:
+    def split_documents(self, documents: Iterable[Document]) -> list[Document]:
         """Split documents."""
         texts, metadatas = [], []
         for doc in documents:
diff --git a/api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py b/api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py
index cc55a906d0..b67863eb43 100644
--- a/api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py
+++ b/api/core/tools/tool/dataset_retriever/dataset_multi_retriever_tool.py
@@ -99,9 +99,9 @@ class DatasetMultiRetrieverTool(DatasetRetrieverBaseTool):
                                                                                    float('inf')))
         for segment in sorted_segments:
             if segment.answer:
-                document_context_list.append(f'question:{segment.content} answer:{segment.answer}')
+                document_context_list.append(f'question:{segment.get_sign_content()} answer:{segment.answer}')
             else:
-                document_context_list.append(segment.content)
+                document_context_list.append(segment.get_sign_content())
         if self.return_resource:
             context_list = []
             resource_number = 1
diff --git a/api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py b/api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py
index c253c2cc81..af45fc66f2 100644
--- a/api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py
+++ b/api/core/tools/tool/dataset_retriever/dataset_retriever_tool.py
@@ -105,9 +105,9 @@ class DatasetRetrieverTool(DatasetRetrieverBaseTool):
                                                                                    float('inf')))
         for segment in sorted_segments:
             if segment.answer:
-                document_context_list.append(f'question:{segment.content} answer:{segment.answer}')
+                document_context_list.append(f'question:{segment.get_sign_content()} answer:{segment.answer}')
             else:
-                document_context_list.append(segment.content)
+                document_context_list.append(segment.get_sign_content())
         if self.return_resource:
             context_list = []
             resource_number = 1
diff --git a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py
index 04e95dbe02..e9dfb75f17 100644
--- a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py
+++ b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py
@@ -191,9 +191,9 @@ class KnowledgeRetrievalNode(BaseNode):
                     'title': document.name
                 }
                 if segment.answer:
-                    source['content'] = f'question:{segment.content} \nanswer:{segment.answer}'
+                    source['content'] = f'question:{segment.get_sign_content()} \nanswer:{segment.answer}'
                 else:
-                    source['content'] = segment.content
+                    source['content'] = segment.get_sign_content()
                 context_list.append(source)
                 resource_number += 1
         return context_list
diff --git a/api/models/dataset.py b/api/models/dataset.py
index 01b068fa2a..ba791e6d23 100644
--- a/api/models/dataset.py
+++ b/api/models/dataset.py
@@ -1,8 +1,15 @@
+import base64
+import hashlib
+import hmac
 import json
 import logging
+import os
 import pickle
+import re
+import time
 from json import JSONDecodeError

+from flask import current_app
 from sqlalchemy import func
 from sqlalchemy.dialects.postgresql import JSONB

@@ -414,6 +421,26 @@ class DocumentSegment(db.Model):
             DocumentSegment.position == self.position + 1
         ).first()

+    def get_sign_content(self):
+        pattern = r"/files/([a-f0-9\-]+)/image-preview"
+        text = self.content
+        match = re.search(pattern, text)
+
+        if match:
+            upload_file_id = match.group(1)
+            nonce = os.urandom(16).hex()
+            timestamp = str(int(time.time()))
+            data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
+            secret_key = current_app.config['SECRET_KEY'].encode()
+            sign = hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
+            encoded_sign = base64.urlsafe_b64encode(sign).decode()
+
+            params = f"timestamp={timestamp}&nonce={nonce}&sign={encoded_sign}"
+            replacement = r"\g<0>?{params}".format(params=params)
+            text = re.sub(pattern, replacement, text)
+        return text
+
+
 class AppDatasetJoin(db.Model):
     __tablename__ = 'app_dataset_joins'
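
Reviewer note on the signing scheme above: get_sign_content() rewrites each /files/<upload_file_id>/image-preview link to carry a timestamp, a nonce, and an HMAC-SHA256 signature computed with SECRET_KEY over "image-preview|{upload_file_id}|{timestamp}|{nonce}". The sketch below (not part of this patch) shows the matching check the file-preview endpoint would have to perform; the helper name verify_image_preview_signature and the 300-second expiry window are assumptions for illustration only.

    import base64
    import hashlib
    import hmac
    import time

    from flask import current_app


    def verify_image_preview_signature(upload_file_id: str, timestamp: str, nonce: str, sign: str,
                                       expiry_seconds: int = 300) -> bool:
        # Rebuild the exact string signed by DocumentSegment.get_sign_content().
        data_to_sign = f"image-preview|{upload_file_id}|{timestamp}|{nonce}"
        secret_key = current_app.config['SECRET_KEY'].encode()
        recalculated = base64.urlsafe_b64encode(
            hmac.new(secret_key, data_to_sign.encode(), hashlib.sha256).digest()
        ).decode()

        # Constant-time comparison, then reject stale links (expiry policy is assumed, not in the diff).
        if not hmac.compare_digest(recalculated, sign):
            return False
        return int(time.time()) - int(timestamp) <= expiry_seconds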