From 9c9352bc73606f7847e8a18b48e67fbfa1b631d6 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Fri, 27 Sep 2024 16:17:45 +0800 Subject: [PATCH] update to external knowledge api --- .../console/datasets/test_external.py | 8 +--- api/services/external_knowledge_service.py | 46 +++++++++++++------ api/tasks/external_document_indexing_task.py | 16 +++++-- 3 files changed, 44 insertions(+), 26 deletions(-) diff --git a/api/controllers/console/datasets/test_external.py b/api/controllers/console/datasets/test_external.py index da57cb338a..17b0f77925 100644 --- a/api/controllers/console/datasets/test_external.py +++ b/api/controllers/console/datasets/test_external.py @@ -13,13 +13,7 @@ class TestExternalApi(Resource): @account_initialization_required def post(self): parser = reqparse.RequestParser() - parser.add_argument( - "retrieval_setting", - nullable=False, - required=True, - type=dict, - location="json" - ) + parser.add_argument("retrieval_setting", nullable=False, required=True, type=dict, location="json") parser.add_argument( "query", nullable=False, diff --git a/api/services/external_knowledge_service.py b/api/services/external_knowledge_service.py index 5fc789ded0..e634944e7a 100644 --- a/api/services/external_knowledge_service.py +++ b/api/services/external_knowledge_service.py @@ -20,9 +20,12 @@ from models.dataset import ( ExternalKnowledgeBindings, ) from models.model import UploadFile -from services.entities.external_knowledge_entities.external_knowledge_entities import ExternalKnowledgeApiSetting, Authorization +from services.entities.external_knowledge_entities.external_knowledge_entities import ( + Authorization, + ExternalKnowledgeApiSetting, +) from services.errors.dataset import DatasetNameDuplicateError -from urllib.parse import urlparse + class ExternalDatasetService: @staticmethod @@ -61,14 +64,14 @@ class ExternalDatasetService: db.session.add(external_knowledge_api) db.session.commit() return external_knowledge_api - + @staticmethod def check_endpoint_and_api_key(settings: dict): if "endpoint" not in settings or not settings["endpoint"]: raise ValueError("endpoint is required") if "api_key" not in settings or not settings["api_key"]: raise ValueError("api_key is required") - + endpoint = f"{settings['endpoint']}/retrieval" api_key = settings["api_key"] if not validators.url(endpoint): @@ -90,7 +93,9 @@ class ExternalDatasetService: @staticmethod def update_external_knowledge_api(tenant_id, user_id, external_knowledge_api_id, args) -> ExternalKnowledgeApis: - external_knowledge_api = ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id, tenant_id=tenant_id).first() + external_knowledge_api = ExternalKnowledgeApis.query.filter_by( + id=external_knowledge_api_id, tenant_id=tenant_id + ).first() if external_knowledge_api is None: raise ValueError("api template not found") @@ -105,7 +110,9 @@ class ExternalDatasetService: @staticmethod def delete_external_knowledge_api(tenant_id: str, external_knowledge_api_id: str): - external_knowledge_api = ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id, tenant_id=tenant_id).first() + external_knowledge_api = ExternalKnowledgeApis.query.filter_by( + id=external_knowledge_api_id, tenant_id=tenant_id + ).first() if external_knowledge_api is None: raise ValueError("api template not found") @@ -130,7 +137,9 @@ class ExternalDatasetService: @staticmethod def document_create_args_validate(tenant_id: str, external_knowledge_api_id: str, process_parameter: dict): - external_knowledge_api = ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id, tenant_id=tenant_id).first() + external_knowledge_api = ExternalKnowledgeApis.query.filter_by( + id=external_knowledge_api_id, tenant_id=tenant_id + ).first() if external_knowledge_api is None: raise ValueError("api template not found") settings = json.loads(external_knowledge_api.settings) @@ -150,7 +159,9 @@ class ExternalDatasetService: raise ValueError("data source is required") process_parameter = args.get("process_parameter") - external_knowledge_api = ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id, tenant_id=tenant_id).first() + external_knowledge_api = ExternalKnowledgeApis.query.filter_by( + id=external_knowledge_api_id, tenant_id=tenant_id + ).first() if external_knowledge_api is None: raise ValueError("api template not found") @@ -204,7 +215,9 @@ class ExternalDatasetService: return dataset @staticmethod - def process_external_api(settings: ExternalKnowledgeApiSetting, files: Union[None, dict[str, Any]]) -> httpx.Response: + def process_external_api( + settings: ExternalKnowledgeApiSetting, files: Union[None, dict[str, Any]] + ) -> httpx.Response: """ do http request depending on api bundle """ @@ -322,7 +335,9 @@ class ExternalDatasetService: "headers": headers, "params": request_params, } - response = ExternalDatasetService.process_external_api(ExternalKnowledgeApiSetting(**external_knowledge_api_setting), None) + response = ExternalDatasetService.process_external_api( + ExternalKnowledgeApiSetting(**external_knowledge_api_setting), None + ) if response.status_code == 200: return response.json().get("records", []) return [] @@ -338,7 +353,10 @@ class ExternalDatasetService: response = client.retrieve( knowledgeBaseId=external_knowledge_id, retrievalConfiguration={ - "vectorSearchConfiguration": {"numberOfResults": retrieval_setting.get("top_k"), "overrideSearchType": "HYBRID"} + "vectorSearchConfiguration": { + "numberOfResults": retrieval_setting.get("top_k"), + "overrideSearchType": "HYBRID", + } }, retrievalQuery={"text": query}, ) @@ -347,7 +365,7 @@ class ExternalDatasetService: if response.get("retrievalResults"): retrieval_results = response.get("retrievalResults") for retrieval_result in retrieval_results: - if retrieval_result.get("score") < retrieval_setting.get("score_threshold", .0): + if retrieval_result.get("score") < retrieval_setting.get("score_threshold", 0.0): continue result = { "metadata": retrieval_result.get("metadata"), @@ -356,6 +374,4 @@ class ExternalDatasetService: "content": retrieval_result.get("content").get("text"), } results.append(result) - return { - "records": results - } + return {"records": results} diff --git a/api/tasks/external_document_indexing_task.py b/api/tasks/external_document_indexing_task.py index bfd2f155f7..6fc719ae8d 100644 --- a/api/tasks/external_document_indexing_task.py +++ b/api/tasks/external_document_indexing_task.py @@ -14,7 +14,9 @@ from services.external_knowledge_service import ExternalDatasetService @shared_task(queue="dataset") -def external_document_indexing_task(dataset_id: str, external_knowledge_api_id: str, data_source: dict, process_parameter: dict): +def external_document_indexing_task( + dataset_id: str, external_knowledge_api_id: str, data_source: dict, process_parameter: dict +): """ Async process document :param dataset_id: @@ -35,14 +37,18 @@ def external_document_indexing_task(dataset_id: str, external_knowledge_api_id: # get external api template external_knowledge_api = ( db.session.query(ExternalKnowledgeApis) - .filter(ExternalKnowledgeApis.id == external_knowledge_api_id, ExternalKnowledgeApis.tenant_id == dataset.tenant_id) + .filter( + ExternalKnowledgeApis.id == external_knowledge_api_id, ExternalKnowledgeApis.tenant_id == dataset.tenant_id + ) .first() ) if not external_knowledge_api: logging.info( click.style( - "Processed external dataset: {} failed, api template: {} not exit.".format(dataset_id, external_knowledge_api_id), + "Processed external dataset: {} failed, api template: {} not exit.".format( + dataset_id, external_knowledge_api_id + ), fg="red", ) ) @@ -59,7 +65,9 @@ def external_document_indexing_task(dataset_id: str, external_knowledge_api_id: if file: files[file.id] = (file.name, storage.load_once(file.key), file.mime_type) try: - settings = ExternalDatasetService.get_external_knowledge_api_settings(json.loads(external_knowledge_api.settings)) + settings = ExternalDatasetService.get_external_knowledge_api_settings( + json.loads(external_knowledge_api.settings) + ) # assemble headers headers = ExternalDatasetService.assembling_headers(settings.authorization, settings.headers)