dify/api/services/external_knowledge_service.py

378 lines
15 KiB
Python
Raw Normal View History

2024-08-20 11:13:29 +08:00
import json
2024-08-21 16:25:18 +08:00
import random
import time
2024-08-20 16:18:35 +08:00
from copy import deepcopy
2024-08-20 11:13:29 +08:00
from datetime import datetime, timezone
2024-09-18 14:36:51 +08:00
from typing import Any, Optional, Union
2024-08-20 16:18:35 +08:00
2024-09-24 18:00:45 +08:00
import boto3
2024-08-20 16:18:35 +08:00
import httpx
2024-09-27 16:02:59 +08:00
import validators
2024-08-20 11:13:29 +08:00
2024-09-24 18:00:45 +08:00
# from tasks.external_document_indexing_task import external_document_indexing_task
2024-09-24 17:52:16 +08:00
from configs import dify_config
2024-08-20 12:47:51 +08:00
from core.helper import ssrf_proxy
2024-08-20 11:13:29 +08:00
from extensions.ext_database import db
from models.dataset import (
Dataset,
Document,
2024-09-25 12:37:23 +08:00
ExternalKnowledgeApis,
2024-09-18 14:36:51 +08:00
ExternalKnowledgeBindings,
2024-08-20 11:13:29 +08:00
)
2024-08-21 16:25:18 +08:00
from models.model import UploadFile
2024-09-27 16:17:45 +08:00
from services.entities.external_knowledge_entities.external_knowledge_entities import (
Authorization,
ExternalKnowledgeApiSetting,
)
2024-09-11 16:59:19 +08:00
from services.errors.dataset import DatasetNameDuplicateError
2024-09-27 16:17:45 +08:00
2024-08-20 11:13:29 +08:00
class ExternalDatasetService:
    """Service-layer helpers for external knowledge APIs and the datasets bound to them.

    All methods are static/class methods; persistence goes through ``db.session``
    and the ``ExternalKnowledgeApis`` / ``ExternalKnowledgeBindings`` models.
    """

    @staticmethod
    def get_external_knowledge_apis(page, per_page, tenant_id, search=None) -> tuple[list[ExternalKnowledgeApis], int]:
        """Return one page of a tenant's external knowledge APIs, newest first.

        :param page: page number passed to ``paginate``.
        :param per_page: page size passed to ``paginate`` (capped at 100).
        :param tenant_id: tenant whose APIs are listed.
        :param search: optional substring matched case-insensitively against the name.
        :return: ``(items, total)`` for the requested page.
        """
        query = ExternalKnowledgeApis.query.filter(ExternalKnowledgeApis.tenant_id == tenant_id).order_by(
            ExternalKnowledgeApis.created_at.desc()
        )
        if search:
            query = query.filter(ExternalKnowledgeApis.name.ilike(f"%{search}%"))

        # error_out=False: an out-of-range page is not turned into an error by paginate.
        pagination = query.paginate(page=page, per_page=per_page, max_per_page=100, error_out=False)
        return pagination.items, pagination.total

    @classmethod
    def validate_api_list(cls, api_settings: dict):
        """Check that ``api_settings`` carries a non-empty endpoint and api_key.

        :raises ValueError: if the settings are empty, or if ``endpoint`` or
            ``api_key`` is missing or empty.
        """
        if not api_settings:
            raise ValueError("api list is empty")
        # .get() covers both "key missing" and "value empty"; the previous
        # `not in ... and not ...[key]` form raised KeyError for a missing key
        # and accepted an empty value.
        if not api_settings.get("endpoint"):
            raise ValueError("endpoint is required")
        if not api_settings.get("api_key"):
            raise ValueError("api_key is required")

    @staticmethod
    def create_external_knowledge_api(tenant_id: str, user_id: str, args: dict) -> ExternalKnowledgeApis:
        """Validate the supplied settings and persist a new external knowledge API.

        :param args: expects ``name``, optional ``description`` and ``settings``.
        :raises ValueError: if the settings fail ``check_endpoint_and_api_key``.
        :return: the committed ``ExternalKnowledgeApis`` row.
        """
        settings = args.get("settings")
        ExternalDatasetService.check_endpoint_and_api_key(settings)

        external_knowledge_api = ExternalKnowledgeApis(
            tenant_id=tenant_id,
            created_by=user_id,
            updated_by=user_id,
            name=args.get("name"),
            description=args.get("description", ""),
            # Settings are stored as a JSON string.
            settings=json.dumps(settings, ensure_ascii=False),
        )
        db.session.add(external_knowledge_api)
        db.session.commit()
        return external_knowledge_api

    @staticmethod
    def check_endpoint_and_api_key(settings: dict):
        """Validate endpoint/api_key and probe ``<endpoint>/retrieval`` once.

        :param settings: dict with ``endpoint`` and ``api_key``; ``None`` is
            treated like an empty dict.
        :raises ValueError: if a field is missing/empty, the URL is invalid, the
            request fails, or the probe answers 502, 404 or 403.
        """
        # create_external_knowledge_api passes args.get("settings"), which may be None.
        settings = settings or {}
        if not settings.get("endpoint"):
            raise ValueError("endpoint is required")
        if not settings.get("api_key"):
            raise ValueError("api_key is required")

        endpoint = f"{settings['endpoint']}/retrieval"
        api_key = settings["api_key"]
        if not validators.url(endpoint):
            raise ValueError(f"invalid endpoint: {endpoint}")

        try:
            # The endpoint is user-supplied, so the probe goes through ssrf_proxy,
            # the same helper process_external_api uses for the real request.
            response = ssrf_proxy.post(url=endpoint, headers={"Authorization": f"Bearer {api_key}"})
        except Exception as e:
            raise ValueError(f"failed to connect to the endpoint: {endpoint}") from e

        if response.status_code == 502:
            raise ValueError(f"Bad Gateway: failed to connect to the endpoint: {endpoint}")
        if response.status_code == 404:
            raise ValueError(f"Not Found: failed to connect to the endpoint: {endpoint}")
        if response.status_code == 403:
            # NOTE(review): this message echoes the api_key back to the caller;
            # consider masking it if these errors can reach logs.
            raise ValueError(f"Forbidden: Authorization failed with api_key: {api_key}")

    @staticmethod
    def get_external_knowledge_api(external_knowledge_api_id: str) -> Optional[ExternalKnowledgeApis]:
        """Look up an external knowledge API by id.

        The lookup is not tenant-scoped. Returns ``None`` when no row matches.
        """
        return ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id).first()

    @staticmethod
    def update_external_knowledge_api(tenant_id, user_id, external_knowledge_api_id, args) -> ExternalKnowledgeApis:
        """Overwrite name, description and settings of a tenant's external knowledge API.

        :raises ValueError: if no API with that id exists for the tenant.
        :return: the updated, committed row.
        """
        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_api_id, tenant_id=tenant_id
        ).first()
        if external_knowledge_api is None:
            raise ValueError("api template not found")

        external_knowledge_api.name = args.get("name")
        external_knowledge_api.description = args.get("description", "")
        external_knowledge_api.settings = json.dumps(args.get("settings"), ensure_ascii=False)
        external_knowledge_api.updated_by = user_id
        # Stored as a naive datetime holding the current UTC time.
        external_knowledge_api.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
        db.session.commit()
        return external_knowledge_api

    @staticmethod
    def delete_external_knowledge_api(tenant_id: str, external_knowledge_api_id: str):
        """Delete a tenant's external knowledge API.

        :raises ValueError: if no API with that id exists for the tenant.
        """
        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_api_id, tenant_id=tenant_id
        ).first()
        if external_knowledge_api is None:
            raise ValueError("api template not found")

        db.session.delete(external_knowledge_api)
        db.session.commit()

    @staticmethod
    def external_knowledge_api_use_check(external_knowledge_api_id: str) -> tuple[bool, int]:
        """Report whether any binding references the API, and how many do.

        :return: ``(True, count)`` when at least one binding exists, else ``(False, 0)``.
        """
        count = ExternalKnowledgeBindings.query.filter_by(external_knowledge_api_id=external_knowledge_api_id).count()
        if count > 0:
            return True, count
        return False, 0

    @staticmethod
    def get_external_knowledge_binding_with_dataset_id(tenant_id: str, dataset_id: str) -> ExternalKnowledgeBindings:
        """Fetch the binding that links ``dataset_id`` to an external knowledge API.

        :raises ValueError: if the tenant has no binding for that dataset.
        """
        external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
            dataset_id=dataset_id, tenant_id=tenant_id
        ).first()
        if not external_knowledge_binding:
            raise ValueError("external knowledge binding not found")
        return external_knowledge_binding

    @staticmethod
    def document_create_args_validate(tenant_id: str, external_knowledge_api_id: str, process_parameter: dict):
        """Ensure every required ``document_process_setting`` parameter is supplied.

        :param process_parameter: values keyed by parameter name.
        :raises ValueError: if the API is not found, or a parameter flagged
            ``required`` has no truthy value in ``process_parameter``.
        """
        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_api_id, tenant_id=tenant_id
        ).first()
        if external_knowledge_api is None:
            raise ValueError("api template not found")

        settings = json.loads(external_knowledge_api.settings)
        # Settings written by create/update in this class are a single dict;
        # iterating that dict would yield its string keys and crash on .get().
        # Normalise to a list so both shapes are handled.
        setting_items = [settings] if isinstance(settings, dict) else settings
        for setting in setting_items:
            custom_parameters = setting.get("document_process_setting")
            if not custom_parameters:
                continue
            for parameter in custom_parameters:
                if parameter.get("required", False) and not process_parameter.get(parameter.get("name")):
                    raise ValueError(f'{parameter.get("name")} is required')

    @staticmethod
    def init_external_dataset(tenant_id: str, user_id: str, args: dict, created_from: str = "web"):
        """Create an external dataset plus one ``Document`` per referenced upload file.

        Only ``data_source["type"] == "upload_file"`` creates documents; file ids
        with no matching ``UploadFile`` row for the tenant are skipped.

        :param args: expects ``external_knowledge_api_id``, ``data_source``,
            ``name`` and optional ``description``.
        :raises ValueError: if ``data_source`` is missing or the API is not found.
        :return: the committed ``Dataset``.
        """
        external_knowledge_api_id = args.get("external_knowledge_api_id")

        data_source = args.get("data_source")
        if data_source is None:
            raise ValueError("data source is required")

        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_api_id, tenant_id=tenant_id
        ).first()
        if external_knowledge_api is None:
            raise ValueError("api template not found")

        dataset = Dataset(
            tenant_id=tenant_id,
            name=args.get("name"),
            description=args.get("description", ""),
            provider="external",
            created_by=user_id,
        )
        db.session.add(dataset)
        # Flush so dataset.id is available before the documents are created.
        db.session.flush()

        last_document = Document.query.filter_by(dataset_id=dataset.id).order_by(Document.position.desc()).first()
        position = last_document.position + 1 if last_document else 1
        # Batch id: timestamp plus a random 6-digit suffix, shared by all documents of this call.
        batch = time.strftime("%Y%m%d%H%M%S") + str(random.randint(100000, 999999))
        document_ids = []
        if data_source["type"] == "upload_file":
            upload_file_list = data_source["info_list"]["file_info_list"]["file_ids"]
            for file_id in upload_file_list:
                file = (
                    db.session.query(UploadFile)
                    .filter(UploadFile.tenant_id == dataset.tenant_id, UploadFile.id == file_id)
                    .first()
                )
                if not file:
                    continue
                data_source_info = {
                    "upload_file_id": file_id,
                }
                document = Document(
                    tenant_id=dataset.tenant_id,
                    dataset_id=dataset.id,
                    position=position,
                    data_source_type=data_source["type"],
                    data_source_info=json.dumps(data_source_info),
                    batch=batch,
                    name=file.name,
                    created_from=created_from,
                    created_by=user_id,
                )
                position += 1
                db.session.add(document)
                db.session.flush()
                document_ids.append(document.id)
        db.session.commit()
        # TODO: indexing is currently disabled; the intended call was
        # external_document_indexing_task.delay(
        #     dataset.id, external_knowledge_api_id, data_source, args.get("process_parameter"))
        return dataset

    @staticmethod
    def process_external_api(
        settings: ExternalKnowledgeApiSetting, files: Union[None, dict[str, Any]]
    ) -> httpx.Response:
        """
        do http request depending on api bundle

        The HTTP verb is taken from ``settings.request_method`` and dispatched to
        the function of that name on ``ssrf_proxy``; ``settings.params`` is sent
        as a JSON-encoded body and redirects are followed.
        """
        kwargs = {
            "url": settings.url,
            "headers": settings.headers,
            "follow_redirects": True,
        }
        send = getattr(ssrf_proxy, settings.request_method)
        return send(data=json.dumps(settings.params), files=files, **kwargs)

    @staticmethod
    def assembling_headers(authorization: Authorization, headers: Optional[dict] = None) -> dict[str, Any]:
        """Build request headers, adding the auth header for ``api-key`` authorization.

        Both arguments are deep-copied, so neither the caller's ``authorization``
        nor its ``headers`` dict is modified.

        :raises ValueError: for ``api-key`` authorization without a config or api_key.
        :return: a new headers dict.
        """
        authorization = deepcopy(authorization)
        headers = deepcopy(headers) if headers else {}

        if authorization.type == "api-key":
            if authorization.config is None:
                raise ValueError("authorization config is required")
            if authorization.config.api_key is None:
                raise ValueError("api_key is required")

            # Default header name when none is configured.
            if not authorization.config.header:
                authorization.config.header = "Authorization"

            if authorization.config.type == "bearer":
                headers[authorization.config.header] = f"Bearer {authorization.config.api_key}"
            elif authorization.config.type == "basic":
                headers[authorization.config.header] = f"Basic {authorization.config.api_key}"
            elif authorization.config.type == "custom":
                headers[authorization.config.header] = authorization.config.api_key

        return headers

    @staticmethod
    def get_external_knowledge_api_settings(settings: dict) -> ExternalKnowledgeApiSetting:
        """Parse a raw settings dict into an ``ExternalKnowledgeApiSetting``."""
        return ExternalKnowledgeApiSetting.parse_obj(settings)

    @staticmethod
    def create_external_dataset(tenant_id: str, user_id: str, args: dict) -> Dataset:
        """Create an external dataset and bind it to an external knowledge API.

        :param args: expects ``name``, ``external_knowledge_api_id``,
            ``external_knowledge_id`` and optionally ``description`` and
            ``external_retrieval_model``.
        :raises DatasetNameDuplicateError: if the tenant already has a dataset with that name.
        :raises ValueError: if the referenced API does not exist for the tenant.
        :return: the committed ``Dataset``.
        """
        # check if dataset name already exists
        if Dataset.query.filter_by(name=args.get("name"), tenant_id=tenant_id).first():
            raise DatasetNameDuplicateError(f"Dataset with name {args.get('name')} already exists.")

        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=args.get("external_knowledge_api_id"), tenant_id=tenant_id
        ).first()
        if external_knowledge_api is None:
            raise ValueError("api template not found")

        dataset = Dataset(
            tenant_id=tenant_id,
            name=args.get("name"),
            description=args.get("description", ""),
            provider="external",
            retrieval_model=args.get("external_retrieval_model"),
            created_by=user_id,
        )
        db.session.add(dataset)
        # Flush so dataset.id exists for the binding below.
        db.session.flush()

        external_knowledge_binding = ExternalKnowledgeBindings(
            tenant_id=tenant_id,
            dataset_id=dataset.id,
            external_knowledge_api_id=args.get("external_knowledge_api_id"),
            external_knowledge_id=args.get("external_knowledge_id"),
            created_by=user_id,
        )
        db.session.add(external_knowledge_binding)
        db.session.commit()
        return dataset

    @staticmethod
    def fetch_external_knowledge_retrieval(
        tenant_id: str, dataset_id: str, query: str, external_retrieval_parameters: dict
    ) -> list:
        """Query the external knowledge API bound to ``dataset_id``.

        :param external_retrieval_parameters: read for ``top_k`` and ``score_threshold``.
        :raises ValueError: if the binding or the bound API cannot be found.
        :return: the ``records`` list from a 200 response, otherwise ``[]``.
        """
        external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(
            dataset_id=dataset_id, tenant_id=tenant_id
        ).first()
        if not external_knowledge_binding:
            raise ValueError("external knowledge binding not found")

        external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
            id=external_knowledge_binding.external_knowledge_api_id
        ).first()
        if not external_knowledge_api:
            raise ValueError("external api template not found")

        settings = json.loads(external_knowledge_api.settings)
        headers = {"Content-Type": "application/json"}
        if settings.get("api_key"):
            headers["Authorization"] = f"Bearer {settings.get('api_key')}"

        request_params = {
            "retrieval_setting": {
                "top_k": external_retrieval_parameters.get("top_k"),
                "score_threshold": external_retrieval_parameters.get("score_threshold"),
            },
            "query": query,
            "knowledge_id": external_knowledge_binding.external_knowledge_id,
        }

        # NOTE(review): check_endpoint_and_api_key probes "<endpoint>/retrieval",
        # but this request targets a different path - confirm which one the
        # external service actually implements.
        external_knowledge_api_setting = {
            "url": f"{settings.get('endpoint')}/dify/external-knowledge/retrieval-documents",
            "request_method": "post",
            "headers": headers,
            "params": request_params,
        }
        response = ExternalDatasetService.process_external_api(
            ExternalKnowledgeApiSetting(**external_knowledge_api_setting), None
        )
        # Any non-200 answer is treated as "no results".
        if response.status_code == 200:
            return response.json().get("records", [])
        return []

    @staticmethod
    def test_external_knowledge_retrieval(retrieval_setting: dict, query: str, external_knowledge_id: str):
        """Run a hybrid retrieval against a Bedrock knowledge base and normalise the hits.

        :param retrieval_setting: read for ``top_k`` and ``score_threshold``; a
            missing or ``None`` threshold is treated as ``0.0``.
        :param external_knowledge_id: passed as the Bedrock ``knowledgeBaseId``.
        :return: ``{"records": [...]}`` with ``metadata``, ``score``, ``title`` and
            ``content`` per hit; results scoring below the threshold are dropped.
        """
        client = boto3.client(
            "bedrock-agent-runtime",
            aws_secret_access_key=dify_config.AWS_SECRET_ACCESS_KEY,
            aws_access_key_id=dify_config.AWS_ACCESS_KEY_ID,
            region_name="us-east-1",
        )
        response = client.retrieve(
            knowledgeBaseId=external_knowledge_id,
            retrievalConfiguration={
                "vectorSearchConfiguration": {
                    "numberOfResults": retrieval_setting.get("top_k"),
                    "overrideSearchType": "HYBRID",
                }
            },
            retrievalQuery={"text": query},
        )

        results = []
        response_metadata = response.get("ResponseMetadata") or {}
        if response_metadata.get("HTTPStatusCode") != 200:
            return {"records": results}

        # `.get(key, 0.0)` still yields None when the key is present with a None
        # value, which made the comparison below raise TypeError.
        score_threshold = retrieval_setting.get("score_threshold") or 0.0
        for retrieval_result in response.get("retrievalResults") or []:
            score = retrieval_result.get("score")
            # A missing score is compared as 0.0 instead of raising.
            if (score or 0.0) < score_threshold:
                continue
            metadata = retrieval_result.get("metadata")
            content = retrieval_result.get("content") or {}
            results.append(
                {
                    "metadata": metadata,
                    "score": score,
                    "title": (metadata or {}).get("x-amz-bedrock-kb-source-uri"),
                    "content": content.get("text"),
                }
            )
        return {"records": results}