diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py
index abf21b84f5..0cd9f9f646 100644
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -62,7 +62,8 @@ class IndexingRunner:
                 text_docs = self._extract(index_processor, dataset_document, processing_rule.to_dict())
 
                 # transform
-                documents = self._transform(index_processor, dataset, text_docs, processing_rule.to_dict())
+                documents = self._transform(index_processor, dataset, text_docs, dataset_document.doc_language,
+                                            processing_rule.to_dict())
                 # save segment
                 self._load_segments(dataset, dataset_document, documents)
 
@@ -120,7 +121,8 @@ class IndexingRunner:
                 text_docs = self._extract(index_processor, dataset_document, processing_rule.to_dict())
 
                 # transform
-                documents = self._transform(index_processor, dataset, text_docs, processing_rule.to_dict())
+                documents = self._transform(index_processor, dataset, text_docs, dataset_document.doc_language,
+                                            processing_rule.to_dict())
                 # save segment
                 self._load_segments(dataset, dataset_document, documents)
 
@@ -750,7 +752,7 @@ class IndexingRunner:
         index_processor.load(dataset, documents)
 
     def _transform(self, index_processor: BaseIndexProcessor, dataset: Dataset,
-                   text_docs: list[Document], process_rule: dict) -> list[Document]:
+                   text_docs: list[Document], doc_language: str, process_rule: dict) -> list[Document]:
         # get embedding model instance
         embedding_model_instance = None
         if dataset.indexing_technique == 'high_quality':
@@ -768,7 +770,8 @@
             )
 
         documents = index_processor.transform(text_docs, embedding_model_instance=embedding_model_instance,
-                                              process_rule=process_rule)
+                                              process_rule=process_rule, tenant_id=dataset.tenant_id,
+                                              doc_language=doc_language)
 
         return documents
 
diff --git a/api/core/rag/index_processor/processor/qa_index_processor.py b/api/core/rag/index_processor/processor/qa_index_processor.py
index f61c728b49..0d81c419d6 100644
--- a/api/core/rag/index_processor/processor/qa_index_processor.py
+++ b/api/core/rag/index_processor/processor/qa_index_processor.py
@@ -7,7 +7,6 @@ from typing import Optional
 
 import pandas as pd
 from flask import Flask, current_app
-from flask_login import current_user
 from werkzeug.datastructures import FileStorage
 
 from core.generator.llm_generator import LLMGenerator
@@ -31,7 +30,7 @@ class QAIndexProcessor(BaseIndexProcessor):
 
     def transform(self, documents: list[Document], **kwargs) -> list[Document]:
         splitter = self._get_splitter(processing_rule=kwargs.get('process_rule'),
-                                      embedding_model_instance=None)
+                                      embedding_model_instance=kwargs.get('embedding_model_instance'))
 
         # Split the text documents into nodes.
         all_documents = []
@@ -66,10 +65,10 @@ class QAIndexProcessor(BaseIndexProcessor):
            for doc in sub_documents:
                document_format_thread = threading.Thread(target=self._format_qa_document, kwargs={
                    'flask_app': current_app._get_current_object(),
-                    'tenant_id': current_user.current_tenant.id,
+                    'tenant_id': kwargs.get('tenant_id'),
                    'document_node': doc,
                    'all_qa_documents': all_qa_documents,
-                    'document_language': kwargs.get('document_language', 'English')})
+                    'document_language': kwargs.get('doc_language', 'English')})
                threads.append(document_format_thread)
                document_format_thread.start()
            for thread in threads: