Merge branch 'feat/external-knowledge-api' of github.com:langgenius/dify into feat/external-knowledge-api

This commit is contained in:
Yi 2024-09-25 13:05:57 +08:00
commit d6c604a356
13 changed files with 293 additions and 209 deletions

View File

@ -111,7 +111,7 @@ class DatasetListApi(Resource):
help="Invalid indexing technique.",
)
parser.add_argument(
"external_api_template_id",
"external_knowledge_api_id",
type=str,
nullable=True,
required=False,
@ -144,7 +144,7 @@ class DatasetListApi(Resource):
account=current_user,
permission=DatasetPermissionEnum.ONLY_ME,
provider=args["provider"],
external_api_template_id=args["external_api_template_id"],
external_knowledge_api_id=args["external_knowledge_api_id"],
external_knowledge_id=args["external_knowledge_id"],
)
except services.errors.dataset.DatasetNameDuplicateError:
@ -234,6 +234,33 @@ class DatasetApi(Resource):
)
parser.add_argument("retrieval_model", type=dict, location="json", help="Invalid retrieval model.")
parser.add_argument("partial_member_list", type=list, location="json", help="Invalid parent user list.")
parser.add_argument(
"external_retrieval_model",
type=dict,
required=False,
nullable=True,
location="json",
help="Invalid external retrieval model.",
)
parser.add_argument(
"external_knowledge_id",
type=str,
required=False,
nullable=True,
location="json",
help="Invalid external knowledge id.",
)
parser.add_argument(
"external_knowledge_api_id",
type=str,
required=False,
nullable=True,
location="json",
help="Invalid external knowledge api id.",
)
args = parser.parse_args()
data = request.get_json()

View File

@ -23,7 +23,7 @@ def _validate_name(name):
def _validate_description_length(description):
if len(description) > 400:
if description and len(description) > 400:
raise ValueError("Description cannot exceed 400 characters.")
return description
@ -37,12 +37,12 @@ class ExternalApiTemplateListApi(Resource):
limit = request.args.get("limit", default=20, type=int)
search = request.args.get("keyword", default=None, type=str)
api_templates, total = ExternalDatasetService.get_external_api_templates(
external_knowledge_apis, total = ExternalDatasetService.get_external_knowledge_apis(
page, limit, current_user.current_tenant_id, search
)
response = {
"data": [item.to_dict() for item in api_templates],
"has_more": len(api_templates) == limit,
"data": [item.to_dict() for item in external_knowledge_apis],
"has_more": len(external_knowledge_apis) == limit,
"limit": limit,
"total": total,
"page": page,
@ -61,13 +61,6 @@ class ExternalApiTemplateListApi(Resource):
help="Name is required. Name must be between 1 to 100 characters.",
type=_validate_name,
)
parser.add_argument(
"description",
nullable=True,
required=False,
help="Description is required. Description must be between 1 to 400 characters.",
type=_validate_description_length,
)
parser.add_argument(
"settings",
type=dict,
@ -84,13 +77,13 @@ class ExternalApiTemplateListApi(Resource):
raise Forbidden()
try:
api_template = ExternalDatasetService.create_api_template(
external_knowledge_api = ExternalDatasetService.create_external_knowledge_api(
tenant_id=current_user.current_tenant_id, user_id=current_user.id, args=args
)
except services.errors.dataset.DatasetNameDuplicateError:
raise DatasetNameDuplicateError()
return api_template.to_dict(), 201
return external_knowledge_api.to_dict(), 201
class ExternalApiTemplateApi(Resource):
@ -99,17 +92,17 @@ class ExternalApiTemplateApi(Resource):
@account_initialization_required
def get(self, external_knowledge_api_id):
external_knowledge_api_id = str(external_knowledge_api_id)
api_template = ExternalDatasetService.get_api_template(external_knowledge_api_id)
if api_template is None:
external_knowledge_api = ExternalDatasetService.get_external_knowledge_api(external_knowledge_api_id)
if external_knowledge_api is None:
raise NotFound("API template not found.")
return api_template.to_dict(), 200
return external_knowledge_api.to_dict(), 200
@setup_required
@login_required
@account_initialization_required
def patch(self, api_template_id):
api_template_id = str(api_template_id)
def patch(self, external_knowledge_api_id):
external_knowledge_api_id = str(external_knowledge_api_id)
parser = reqparse.RequestParser()
parser.add_argument(
@ -119,13 +112,6 @@ class ExternalApiTemplateApi(Resource):
help="type is required. Name must be between 1 to 100 characters.",
type=_validate_name,
)
parser.add_argument(
"description",
nullable=True,
required=False,
help="description is required. Description must be between 1 to 400 characters.",
type=_validate_description_length,
)
parser.add_argument(
"settings",
type=dict,
@ -136,27 +122,27 @@ class ExternalApiTemplateApi(Resource):
args = parser.parse_args()
ExternalDatasetService.validate_api_list(args["settings"])
api_template = ExternalDatasetService.update_api_template(
external_knowledge_api = ExternalDatasetService.update_external_knowledge_api(
tenant_id=current_user.current_tenant_id,
user_id=current_user.id,
api_template_id=api_template_id,
external_knowledge_api_id=external_knowledge_api_id,
args=args,
)
return api_template.to_dict(), 200
return external_knowledge_api.to_dict(), 200
@setup_required
@login_required
@account_initialization_required
def delete(self, api_template_id):
api_template_id = str(api_template_id)
def delete(self, external_knowledge_api_id):
external_knowledge_api_id = str(external_knowledge_api_id)
# The role of the current user in the ta table must be admin, owner, or editor
if not current_user.is_editor or current_user.is_dataset_operator:
raise Forbidden()
ExternalDatasetService.delete_api_template(current_user.current_tenant_id, api_template_id)
return {"result": "success"}, 204
ExternalDatasetService.delete_external_knowledge_api(current_user.current_tenant_id, external_knowledge_api_id)
return {"result": "success"}, 200
class ExternalApiUseCheckApi(Resource):
@ -166,8 +152,10 @@ class ExternalApiUseCheckApi(Resource):
def get(self, external_knowledge_api_id):
external_knowledge_api_id = str(external_knowledge_api_id)
external_api_template_is_using = ExternalDatasetService.external_api_template_use_check(external_knowledge_api_id)
return {"is_using": external_api_template_is_using}, 200
external_knowledge_api_is_using, count = ExternalDatasetService.external_knowledge_api_use_check(
external_knowledge_api_id
)
return {"is_using": external_knowledge_api_is_using, "count": count}, 200
class ExternalDatasetInitApi(Resource):
@ -180,7 +168,7 @@ class ExternalDatasetInitApi(Resource):
raise Forbidden()
parser = reqparse.RequestParser()
parser.add_argument("api_template_id", type=str, required=True, nullable=True, location="json")
parser.add_argument("external_knowledge_api_id", type=str, required=True, nullable=True, location="json")
# parser.add_argument('name', nullable=False, required=True,
# help='name is required. Name must be between 1 to 100 characters.',
# type=_validate_name)
@ -196,7 +184,7 @@ class ExternalDatasetInitApi(Resource):
# validate args
ExternalDatasetService.document_create_args_validate(
current_user.current_tenant_id, args["api_template_id"], args["process_parameter"]
current_user.current_tenant_id, args["external_knowledge_api_id"], args["process_parameter"]
)
try:
@ -222,7 +210,7 @@ class ExternalDatasetCreateApi(Resource):
raise Forbidden()
parser = reqparse.RequestParser()
parser.add_argument("external_api_template_id", type=str, required=True, nullable=False, location="json")
parser.add_argument("external_knowledge_api_id", type=str, required=True, nullable=False, location="json")
parser.add_argument("external_knowledge_id", type=str, required=True, nullable=False, location="json")
parser.add_argument(
"name",

View File

@ -83,7 +83,7 @@ class DatasetListApi(DatasetApiResource):
nullable=False,
)
parser.add_argument(
"external_api_template_id",
"external_knowledge_api_id",
type=str,
nullable=True,
required=False,
@ -112,7 +112,7 @@ class DatasetListApi(DatasetApiResource):
account=current_user,
permission=args["permission"],
provider=args["provider"],
external_api_template_id=args["external_api_template_id"],
external_knowledge_api_id=args["external_knowledge_api_id"],
external_knowledge_id=args["external_knowledge_id"],
)
except services.errors.dataset.DatasetNameDuplicateError:

View File

@ -112,11 +112,7 @@ class DatasetRetrieval:
continue
# pass if dataset is not available
if (
dataset
and dataset.available_document_count == 0
and dataset.provider != "external"
):
if dataset and dataset.available_document_count == 0 and dataset.provider != "external":
continue
available_datasets.append(dataset)

View File

@ -38,9 +38,20 @@ dataset_retrieval_model_fields = {
"score_threshold_enabled": fields.Boolean,
"score_threshold": fields.Float,
}
external_retrieval_model_fields = {
"top_k": fields.Integer,
"score_threshold": fields.Float,
}
tag_fields = {"id": fields.String, "name": fields.String, "type": fields.String}
external_knowledge_info_fields = {
"external_knowledge_id": fields.String,
"external_knowledge_api_id": fields.String,
"external_knowledge_api_name": fields.String,
"external_knowledge_api_endpoint": fields.String,
}
dataset_detail_fields = {
"id": fields.String,
"name": fields.String,
@ -61,6 +72,8 @@ dataset_detail_fields = {
"embedding_available": fields.Boolean,
"retrieval_model_dict": fields.Nested(dataset_retrieval_model_fields),
"tags": fields.List(fields.Nested(tag_fields)),
"external_knowledge_info": fields.Nested(external_knowledge_info_fields),
"external_retrieval_model": fields.Nested(external_retrieval_model_fields, allow_null=True),
}
dataset_query_detail_fields = {

View File

@ -2,7 +2,7 @@ from flask_restful import fields
from libs.helper import TimestampField
api_template_query_detail_fields = {
external_knowledge_api_query_detail_fields = {
"id": fields.String,
"name": fields.String,
"setting": fields.String,

View File

@ -12,7 +12,7 @@ from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = '6af6a521a53e'
down_revision = 'ec3df697ebbb'
down_revision = 'd57ba9ebb251'
branch_labels = None
depends_on = None

View File

@ -1,8 +1,8 @@
"""external_knowledge
"""external_knowledge_api
Revision ID: ec3df697ebbb
Revises: 675b5321501b
Create Date: 2024-09-18 06:59:54.048478
Revision ID: 33f5fac87f29
Revises: 6af6a521a53e
Create Date: 2024-09-25 04:34:57.249436
"""
from alembic import op
@ -11,15 +11,15 @@ import sqlalchemy as sa
from sqlalchemy.dialects import postgresql
# revision identifiers, used by Alembic.
revision = 'ec3df697ebbb'
down_revision = '675b5321501b'
revision = '33f5fac87f29'
down_revision = '6af6a521a53e'
branch_labels = None
depends_on = None
def upgrade():
# ### commands auto generated by Alembic - please adjust! ###
op.create_table('external_api_templates',
op.create_table('external_knowledge_apis',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
sa.Column('name', sa.String(length=255), nullable=False),
sa.Column('description', sa.String(length=255), nullable=False),
@ -29,16 +29,16 @@ def upgrade():
sa.Column('created_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
sa.Column('updated_by', models.types.StringUUID(), nullable=True),
sa.Column('updated_at', sa.DateTime(), server_default=sa.text('CURRENT_TIMESTAMP(0)'), nullable=False),
sa.PrimaryKeyConstraint('id', name='external_api_template_pkey')
sa.PrimaryKeyConstraint('id', name='external_knowledge_apis_pkey')
)
with op.batch_alter_table('external_api_templates', schema=None) as batch_op:
batch_op.create_index('external_api_templates_name_idx', ['name'], unique=False)
batch_op.create_index('external_api_templates_tenant_idx', ['tenant_id'], unique=False)
with op.batch_alter_table('external_knowledge_apis', schema=None) as batch_op:
batch_op.create_index('external_knowledge_apis_name_idx', ['name'], unique=False)
batch_op.create_index('external_knowledge_apis_tenant_idx', ['tenant_id'], unique=False)
op.create_table('external_knowledge_bindings',
sa.Column('id', models.types.StringUUID(), server_default=sa.text('uuid_generate_v4()'), nullable=False),
sa.Column('tenant_id', models.types.StringUUID(), nullable=False),
sa.Column('external_api_template_id', models.types.StringUUID(), nullable=False),
sa.Column('external_knowledge_api_id', models.types.StringUUID(), nullable=False),
sa.Column('dataset_id', models.types.StringUUID(), nullable=False),
sa.Column('external_knowledge_id', sa.Text(), nullable=False),
sa.Column('created_by', models.types.StringUUID(), nullable=False),
@ -49,7 +49,7 @@ def upgrade():
)
with op.batch_alter_table('external_knowledge_bindings', schema=None) as batch_op:
batch_op.create_index('external_knowledge_bindings_dataset_idx', ['dataset_id'], unique=False)
batch_op.create_index('external_knowledge_bindings_external_api_template_idx', ['external_api_template_id'], unique=False)
batch_op.create_index('external_knowledge_bindings_external_knowledge_api_idx', ['external_knowledge_api_id'], unique=False)
batch_op.create_index('external_knowledge_bindings_external_knowledge_idx', ['external_knowledge_id'], unique=False)
batch_op.create_index('external_knowledge_bindings_tenant_idx', ['tenant_id'], unique=False)
@ -58,17 +58,16 @@ def upgrade():
def downgrade():
# ### commands auto generated by Alembic - please adjust! ###
with op.batch_alter_table('external_knowledge_bindings', schema=None) as batch_op:
batch_op.drop_index('external_knowledge_bindings_tenant_idx')
batch_op.drop_index('external_knowledge_bindings_external_knowledge_idx')
batch_op.drop_index('external_knowledge_bindings_external_api_template_idx')
batch_op.drop_index('external_knowledge_bindings_external_knowledge_api_idx')
batch_op.drop_index('external_knowledge_bindings_dataset_idx')
op.drop_table('external_knowledge_bindings')
with op.batch_alter_table('external_api_templates', schema=None) as batch_op:
batch_op.drop_index('external_api_templates_tenant_idx')
batch_op.drop_index('external_api_templates_name_idx')
with op.batch_alter_table('external_knowledge_apis', schema=None) as batch_op:
batch_op.drop_index('external_knowledge_apis_tenant_idx')
batch_op.drop_index('external_knowledge_apis_name_idx')
op.drop_table('external_api_templates')
op.drop_table('external_knowledge_apis')
# ### end Alembic commands ###

View File

@ -171,6 +171,29 @@ class Dataset(db.Model):
return tags or []
@property
def external_knowledge_info(self):
    """Describe the external knowledge source bound to this dataset.

    Returns:
        ``None`` when the dataset's provider is not ``"external"``, when no
        binding row exists for this dataset, or when the bound external
        knowledge API row cannot be found. Otherwise a dict with the keys
        ``external_knowledge_id``, ``external_knowledge_api_id``,
        ``external_knowledge_api_name`` and
        ``external_knowledge_api_endpoint``.
    """
    if self.provider != "external":
        return None
    external_knowledge_binding = (
        db.session.query(ExternalKnowledgeBindings).filter(ExternalKnowledgeBindings.dataset_id == self.id).first()
    )
    if not external_knowledge_binding:
        return None
    external_knowledge_api = (
        db.session.query(ExternalKnowledgeApis)
        .filter(ExternalKnowledgeApis.id == external_knowledge_binding.external_knowledge_api_id)
        .first()
    )
    if not external_knowledge_api:
        return None
    # settings holds JSON text. A missing, malformed or non-object value must
    # not break serialization of the whole dataset, so fall back to an empty
    # endpoint instead of letting the decode error propagate.
    try:
        settings = json.loads(external_knowledge_api.settings)
    except (TypeError, ValueError):
        settings = None
    endpoint = settings.get("endpoint", "") if isinstance(settings, dict) else ""
    return {
        "external_knowledge_id": external_knowledge_binding.external_knowledge_id,
        "external_knowledge_api_id": external_knowledge_api.id,
        "external_knowledge_api_name": external_knowledge_api.name,
        "external_knowledge_api_endpoint": endpoint,
    }
@staticmethod
def gen_collection_name_by_id(dataset_id: str) -> str:
normalized_dataset_id = dataset_id.replace("-", "_")
@ -698,12 +721,12 @@ class DatasetPermission(db.Model):
created_at = db.Column(db.DateTime, nullable=False, server_default=db.text("CURRENT_TIMESTAMP(0)"))
class ExternalApiTemplates(db.Model):
__tablename__ = "external_api_templates"
class ExternalKnowledgeApis(db.Model):
__tablename__ = "external_knowledge_apis"
__table_args__ = (
db.PrimaryKeyConstraint("id", name="external_api_template_pkey"),
db.Index("external_api_templates_tenant_idx", "tenant_id"),
db.Index("external_api_templates_name_idx", "name"),
db.PrimaryKeyConstraint("id", name="external_knowledge_apis_pkey"),
db.Index("external_knowledge_apis_tenant_idx", "tenant_id"),
db.Index("external_knowledge_apis_name_idx", "name"),
)
id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
@ -723,6 +746,7 @@ class ExternalApiTemplates(db.Model):
"name": self.name,
"description": self.description,
"settings": self.settings_dict,
"dataset_bindings": self.dataset_bindings,
"created_by": self.created_by,
"created_at": self.created_at.isoformat(),
}
@ -734,6 +758,21 @@ class ExternalApiTemplates(db.Model):
except JSONDecodeError:
return None
@property
def dataset_bindings(self):
    """List the datasets bound to this external knowledge API.

    Returns:
        A list of ``{"id": ..., "name": ...}`` dicts, one per dataset found
        for the binding rows that reference this API; an empty list when
        there are no bindings.
    """
    external_knowledge_bindings = (
        db.session.query(ExternalKnowledgeBindings)
        .filter(ExternalKnowledgeBindings.external_knowledge_api_id == self.id)
        .all()
    )
    dataset_ids = [binding.dataset_id for binding in external_knowledge_bindings]
    if not dataset_ids:
        # Nothing is bound: skip the second round trip rather than issuing an
        # IN query over an empty list.
        return []
    datasets = db.session.query(Dataset).filter(Dataset.id.in_(dataset_ids)).all()
    return [{"id": dataset.id, "name": dataset.name} for dataset in datasets]
class ExternalKnowledgeBindings(db.Model):
__tablename__ = "external_knowledge_bindings"
@ -742,12 +781,12 @@ class ExternalKnowledgeBindings(db.Model):
db.Index("external_knowledge_bindings_tenant_idx", "tenant_id"),
db.Index("external_knowledge_bindings_dataset_idx", "dataset_id"),
db.Index("external_knowledge_bindings_external_knowledge_idx", "external_knowledge_id"),
db.Index("external_knowledge_bindings_external_api_template_idx", "external_api_template_id"),
db.Index("external_knowledge_bindings_external_knowledge_api_idx", "external_knowledge_api_id"),
)
id = db.Column(StringUUID, nullable=False, server_default=db.text("uuid_generate_v4()"))
tenant_id = db.Column(StringUUID, nullable=False)
external_api_template_id = db.Column(StringUUID, nullable=False)
external_knowledge_api_id = db.Column(StringUUID, nullable=False)
dataset_id = db.Column(StringUUID, nullable=False)
external_knowledge_id = db.Column(db.Text, nullable=False)
created_by = db.Column(StringUUID, nullable=False)

View File

@ -143,7 +143,7 @@ class DatasetService:
account: Account,
permission: Optional[str] = None,
provider: str = "vendor",
external_api_template_id: Optional[str] = None,
external_knowledge_api_id: Optional[str] = None,
external_knowledge_id: Optional[str] = None,
):
# check if dataset name already exists
@ -167,14 +167,14 @@ class DatasetService:
db.session.add(dataset)
db.session.flush()
if provider == "external" and external_api_template_id:
external_api_template = ExternalDatasetService.get_api_template(external_api_template_id)
if not external_api_template:
if provider == "external" and external_knowledge_api_id:
external_knowledge_api = ExternalDatasetService.get_external_knowledge_api(external_knowledge_api_id)
if not external_knowledge_api:
raise ValueError("External API template not found.")
external_knowledge_binding = ExternalKnowledgeBindings(
tenant_id=tenant_id,
dataset_id=dataset.id,
external_api_template_id=external_api_template_id,
external_knowledge_api_id=external_knowledge_api_id,
external_knowledge_id=external_knowledge_id,
created_by=account.id,
)
@ -184,7 +184,7 @@ class DatasetService:
return dataset
@staticmethod
def get_dataset(dataset_id):
def get_dataset(dataset_id) -> Dataset:
return Dataset.query.filter_by(id=dataset_id).first()
@staticmethod
@ -225,81 +225,103 @@ class DatasetService:
@staticmethod
def update_dataset(dataset_id, data, user):
data.pop("partial_member_list", None)
filtered_data = {k: v for k, v in data.items() if v is not None or k == "description"}
dataset = DatasetService.get_dataset(dataset_id)
DatasetService.check_dataset_permission(dataset, user)
action = None
if dataset.indexing_technique != data["indexing_technique"]:
# if update indexing_technique
if data["indexing_technique"] == "economy":
action = "remove"
filtered_data["embedding_model"] = None
filtered_data["embedding_model_provider"] = None
filtered_data["collection_binding_id"] = None
elif data["indexing_technique"] == "high_quality":
action = "add"
# get embedding model setting
try:
model_manager = ModelManager()
embedding_model = model_manager.get_model_instance(
tenant_id=current_user.current_tenant_id,
provider=data["embedding_model_provider"],
model_type=ModelType.TEXT_EMBEDDING,
model=data["embedding_model"],
)
filtered_data["embedding_model"] = embedding_model.model
filtered_data["embedding_model_provider"] = embedding_model.provider
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
embedding_model.provider, embedding_model.model
)
filtered_data["collection_binding_id"] = dataset_collection_binding.id
except LLMBadRequestError:
raise ValueError(
"No Embedding Model available. Please configure a valid provider "
"in the Settings -> Model Provider."
)
except ProviderTokenNotInitError as ex:
raise ValueError(ex.description)
else:
if dataset.provider == "external":
dataset.retrieval_model = data.get("external_retrieval_model", None)
dataset.name = data.get("name", dataset.name)
dataset.description = data.get("description", "")
external_knowledge_id = data.get("external_knowledge_id", None)
db.session.add(dataset)
if not external_knowledge_id:
raise ValueError("External knowledge id is required.")
external_knowledge_api_id = data.get("external_knowledge_api_id", None)
if not external_knowledge_api_id:
raise ValueError("External knowledge api id is required.")
external_knowledge_binding = ExternalKnowledgeBindings.query.filter_by(dataset_id=dataset_id).first()
if (
data["embedding_model_provider"] != dataset.embedding_model_provider
or data["embedding_model"] != dataset.embedding_model
external_knowledge_binding.external_knowledge_id != external_knowledge_id
or external_knowledge_binding.external_knowledge_api_id != external_knowledge_api_id
):
action = "update"
try:
model_manager = ModelManager()
embedding_model = model_manager.get_model_instance(
tenant_id=current_user.current_tenant_id,
provider=data["embedding_model_provider"],
model_type=ModelType.TEXT_EMBEDDING,
model=data["embedding_model"],
)
filtered_data["embedding_model"] = embedding_model.model
filtered_data["embedding_model_provider"] = embedding_model.provider
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
embedding_model.provider, embedding_model.model
)
filtered_data["collection_binding_id"] = dataset_collection_binding.id
except LLMBadRequestError:
raise ValueError(
"No Embedding Model available. Please configure a valid provider "
"in the Settings -> Model Provider."
)
except ProviderTokenNotInitError as ex:
raise ValueError(ex.description)
external_knowledge_binding.external_knowledge_id = external_knowledge_id
external_knowledge_binding.external_knowledge_api_id = external_knowledge_api_id
db.session.add(external_knowledge_binding)
db.session.commit()
else:
data.pop("partial_member_list", None)
filtered_data = {k: v for k, v in data.items() if v is not None or k == "description"}
action = None
if dataset.indexing_technique != data["indexing_technique"]:
# if update indexing_technique
if data["indexing_technique"] == "economy":
action = "remove"
filtered_data["embedding_model"] = None
filtered_data["embedding_model_provider"] = None
filtered_data["collection_binding_id"] = None
elif data["indexing_technique"] == "high_quality":
action = "add"
# get embedding model setting
try:
model_manager = ModelManager()
embedding_model = model_manager.get_model_instance(
tenant_id=current_user.current_tenant_id,
provider=data["embedding_model_provider"],
model_type=ModelType.TEXT_EMBEDDING,
model=data["embedding_model"],
)
filtered_data["embedding_model"] = embedding_model.model
filtered_data["embedding_model_provider"] = embedding_model.provider
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
embedding_model.provider, embedding_model.model
)
filtered_data["collection_binding_id"] = dataset_collection_binding.id
except LLMBadRequestError:
raise ValueError(
"No Embedding Model available. Please configure a valid provider "
"in the Settings -> Model Provider."
)
except ProviderTokenNotInitError as ex:
raise ValueError(ex.description)
else:
if (
data["embedding_model_provider"] != dataset.embedding_model_provider
or data["embedding_model"] != dataset.embedding_model
):
action = "update"
try:
model_manager = ModelManager()
embedding_model = model_manager.get_model_instance(
tenant_id=current_user.current_tenant_id,
provider=data["embedding_model_provider"],
model_type=ModelType.TEXT_EMBEDDING,
model=data["embedding_model"],
)
filtered_data["embedding_model"] = embedding_model.model
filtered_data["embedding_model_provider"] = embedding_model.provider
dataset_collection_binding = DatasetCollectionBindingService.get_dataset_collection_binding(
embedding_model.provider, embedding_model.model
)
filtered_data["collection_binding_id"] = dataset_collection_binding.id
except LLMBadRequestError:
raise ValueError(
"No Embedding Model available. Please configure a valid provider "
"in the Settings -> Model Provider."
)
except ProviderTokenNotInitError as ex:
raise ValueError(ex.description)
filtered_data["updated_by"] = user.id
filtered_data["updated_at"] = datetime.datetime.now()
filtered_data["updated_by"] = user.id
filtered_data["updated_at"] = datetime.datetime.now()
# update Retrieval model
filtered_data["retrieval_model"] = data["retrieval_model"]
# update Retrieval model
filtered_data["retrieval_model"] = data["retrieval_model"]
dataset.query.filter_by(id=dataset_id).update(filtered_data)
dataset.query.filter_by(id=dataset_id).update(filtered_data)
db.session.commit()
if action:
deal_dataset_vector_index_task.delay(dataset_id, action)
db.session.commit()
if action:
deal_dataset_vector_index_task.delay(dataset_id, action)
return dataset
@staticmethod

View File

@ -19,7 +19,7 @@ class ProcessStatusSetting(BaseModel):
url: str
class ApiTemplateSetting(BaseModel):
class ExternalKnowledgeApiSetting(BaseModel):
url: str
request_method: str
headers: Optional[dict] = None

View File

@ -15,26 +15,26 @@ from extensions.ext_database import db
from models.dataset import (
Dataset,
Document,
ExternalApiTemplates,
ExternalKnowledgeApis,
ExternalKnowledgeBindings,
)
from models.model import UploadFile
from services.entities.external_knowledge_entities.external_knowledge_entities import ApiTemplateSetting, Authorization
from services.entities.external_knowledge_entities.external_knowledge_entities import ExternalKnowledgeApiSetting, Authorization
from services.errors.dataset import DatasetNameDuplicateError
class ExternalDatasetService:
@staticmethod
def get_external_knowledge_apis(page, per_page, tenant_id, search=None) -> tuple[list[ExternalKnowledgeApis], int]:
    """Return one page of a tenant's external knowledge APIs, newest first.

    The plural name is deliberate: the list controller calls
    ``get_external_knowledge_apis``, and the singular
    ``get_external_knowledge_api`` is the single-row lookup by id defined
    later in this class. Sharing the singular name would let that later
    definition replace this one.

    Args:
        page: Page number passed to ``paginate``.
        per_page: Requested page size; ``paginate`` is called with
            ``max_per_page=100``.
        tenant_id: Only rows with this tenant id are returned.
        search: Optional substring matched case-insensitively against the
            API name.

    Returns:
        A ``(items, total)`` tuple taken from the pagination result.
    """
    query = ExternalKnowledgeApis.query.filter(ExternalKnowledgeApis.tenant_id == tenant_id).order_by(
        ExternalKnowledgeApis.created_at.desc()
    )
    if search:
        query = query.filter(ExternalKnowledgeApis.name.ilike(f"%{search}%"))

    # error_out=False: an out-of-range page produces an empty result instead
    # of aborting the request.
    external_knowledge_apis = query.paginate(page=page, per_page=per_page, max_per_page=100, error_out=False)

    return external_knowledge_apis.items, external_knowledge_apis.total
@classmethod
def validate_api_list(cls, api_settings: dict):
@ -46,8 +46,8 @@ class ExternalDatasetService:
raise ValueError("api_key is required")
@staticmethod
def create_api_template(tenant_id: str, user_id: str, args: dict) -> ExternalApiTemplates:
api_template = ExternalApiTemplates(
def create_external_knowledge_api(tenant_id: str, user_id: str, args: dict) -> ExternalKnowledgeApis:
external_knowledge_api = ExternalKnowledgeApis(
tenant_id=tenant_id,
created_by=user_id,
updated_by=user_id,
@ -56,44 +56,44 @@ class ExternalDatasetService:
settings=json.dumps(args.get("settings"), ensure_ascii=False),
)
db.session.add(api_template)
db.session.add(external_knowledge_api)
db.session.commit()
return api_template
return external_knowledge_api
@staticmethod
def get_api_template(external_knowledge_api_id: str) -> ExternalApiTemplates:
return ExternalApiTemplates.query.filter_by(id=external_knowledge_api_id).first()
def get_external_knowledge_api(external_knowledge_api_id: str) -> ExternalKnowledgeApis:
return ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id).first()
@staticmethod
def update_api_template(tenant_id, user_id, api_template_id, args) -> ExternalApiTemplates:
api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
if api_template is None:
def update_external_knowledge_api(tenant_id, user_id, external_knowledge_api_id, args) -> ExternalKnowledgeApis:
external_knowledge_api = ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id, tenant_id=tenant_id).first()
if external_knowledge_api is None:
raise ValueError("api template not found")
api_template.name = args.get("name")
api_template.description = args.get("description", "")
api_template.settings = json.dumps(args.get("settings"), ensure_ascii=False)
api_template.updated_by = user_id
api_template.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
external_knowledge_api.name = args.get("name")
external_knowledge_api.description = args.get("description", "")
external_knowledge_api.settings = json.dumps(args.get("settings"), ensure_ascii=False)
external_knowledge_api.updated_by = user_id
external_knowledge_api.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
db.session.commit()
return api_template
return external_knowledge_api
@staticmethod
def delete_api_template(tenant_id: str, api_template_id: str):
api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
if api_template is None:
def delete_external_knowledge_api(tenant_id: str, external_knowledge_api_id: str):
external_knowledge_api = ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id, tenant_id=tenant_id).first()
if external_knowledge_api is None:
raise ValueError("api template not found")
db.session.delete(api_template)
db.session.delete(external_knowledge_api)
db.session.commit()
@staticmethod
def external_api_template_use_check(external_knowledge_api_id: str) -> bool:
count = ExternalKnowledgeBindings.query.filter_by(external_api_template_id=external_knowledge_api_id).count()
def external_knowledge_api_use_check(external_knowledge_api_id: str) -> tuple[bool, int]:
count = ExternalKnowledgeBindings.query.filter_by(external_knowledge_api_id=external_knowledge_api_id).count()
if count > 0:
return True
return False
return True, count
return False, 0
@staticmethod
def get_external_knowledge_binding_with_dataset_id(tenant_id: str, dataset_id: str) -> ExternalKnowledgeBindings:
@ -105,11 +105,11 @@ class ExternalDatasetService:
return external_knowledge_binding
@staticmethod
def document_create_args_validate(tenant_id: str, api_template_id: str, process_parameter: dict):
api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
if api_template is None:
def document_create_args_validate(tenant_id: str, external_knowledge_api_id: str, process_parameter: dict):
external_knowledge_api = ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id, tenant_id=tenant_id).first()
if external_knowledge_api is None:
raise ValueError("api template not found")
settings = json.loads(api_template.settings)
settings = json.loads(external_knowledge_api.settings)
for setting in settings:
custom_parameters = setting.get("document_process_setting")
if custom_parameters:
@ -119,15 +119,15 @@ class ExternalDatasetService:
@staticmethod
def init_external_dataset(tenant_id: str, user_id: str, args: dict, created_from: str = "web"):
api_template_id = args.get("api_template_id")
external_knowledge_api_id = args.get("external_knowledge_api_id")
data_source = args.get("data_source")
if data_source is None:
raise ValueError("data source is required")
process_parameter = args.get("process_parameter")
api_template = ExternalApiTemplates.query.filter_by(id=api_template_id, tenant_id=tenant_id).first()
if api_template is None:
external_knowledge_api = ExternalKnowledgeApis.query.filter_by(id=external_knowledge_api_id, tenant_id=tenant_id).first()
if external_knowledge_api is None:
raise ValueError("api template not found")
dataset = Dataset(
@ -175,12 +175,12 @@ class ExternalDatasetService:
db.session.flush()
document_ids.append(document.id)
db.session.commit()
# external_document_indexing_task.delay(dataset.id, api_template_id, data_source, process_parameter)
# external_document_indexing_task.delay(dataset.id, external_knowledge_api_id, data_source, process_parameter)
return dataset
@staticmethod
def process_external_api(settings: ApiTemplateSetting, files: Union[None, dict[str, Any]]) -> httpx.Response:
def process_external_api(settings: ExternalKnowledgeApiSetting, files: Union[None, dict[str, Any]]) -> httpx.Response:
"""
do http request depending on api bundle
"""
@ -222,19 +222,19 @@ class ExternalDatasetService:
return headers
@staticmethod
def get_api_template_settings(settings: dict) -> ApiTemplateSetting:
    """Build an ``ApiTemplateSetting`` from a plain settings dict via ``parse_obj``."""
    parsed_settings = ApiTemplateSetting.parse_obj(settings)
    return parsed_settings
def get_external_knowledge_api_settings(settings: dict) -> ExternalKnowledgeApiSetting:
    """Build an ``ExternalKnowledgeApiSetting`` from a plain settings dict via ``parse_obj``."""
    parsed_settings = ExternalKnowledgeApiSetting.parse_obj(settings)
    return parsed_settings
@staticmethod
def create_external_dataset(tenant_id: str, user_id: str, args: dict) -> Dataset:
# check if dataset name already exists
if Dataset.query.filter_by(name=args.get("name"), tenant_id=tenant_id).first():
raise DatasetNameDuplicateError(f"Dataset with name {args.get('name')} already exists.")
api_template = ExternalApiTemplates.query.filter_by(
id=args.get("external_api_template_id"), tenant_id=tenant_id
external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
id=args.get("external_knowledge_api_id"), tenant_id=tenant_id
).first()
if api_template is None:
if external_knowledge_api is None:
raise ValueError("api template not found")
dataset = Dataset(
@ -252,7 +252,7 @@ class ExternalDatasetService:
external_knowledge_binding = ExternalKnowledgeBindings(
tenant_id=tenant_id,
dataset_id=dataset.id,
external_api_template_id=args.get("external_api_template_id"),
external_knowledge_api_id=args.get("external_knowledge_api_id"),
external_knowledge_id=args.get("external_knowledge_id"),
created_by=user_id,
)
@ -272,13 +272,13 @@ class ExternalDatasetService:
if not external_knowledge_binding:
raise ValueError("external knowledge binding not found")
external_api_template = ExternalApiTemplates.query.filter_by(
id=external_knowledge_binding.external_api_template_id
external_knowledge_api = ExternalKnowledgeApis.query.filter_by(
id=external_knowledge_binding.external_knowledge_api_id
).first()
if not external_api_template:
if not external_knowledge_api:
raise ValueError("external api template not found")
settings = json.loads(external_api_template.settings)
settings = json.loads(external_knowledge_api.settings)
headers = {"Content-Type": "application/json"}
if settings.get("api_key"):
headers["Authorization"] = f"Bearer {settings.get('api_key')}"
@ -286,13 +286,13 @@ class ExternalDatasetService:
external_retrieval_parameters["query"] = query
external_retrieval_parameters["external_knowledge_id"] = external_knowledge_binding.external_knowledge_id
api_template_setting = {
external_knowledge_api_setting = {
"url": f"{settings.get('endpoint')}/dify/external-knowledge/retrieval-documents",
"request_method": "post",
"headers": headers,
"params": external_retrieval_parameters,
}
response = ExternalDatasetService.process_external_api(ApiTemplateSetting(**api_template_setting), None)
response = ExternalDatasetService.process_external_api(ExternalKnowledgeApiSetting(**external_knowledge_api_setting), None)
if response.status_code == 200:
return response.json()
return []

View File

@ -8,17 +8,17 @@ from celery import shared_task
from core.indexing_runner import DocumentIsPausedException
from extensions.ext_database import db
from extensions.ext_storage import storage
from models.dataset import Dataset, ExternalApiTemplates
from models.dataset import Dataset, ExternalKnowledgeApis
from models.model import UploadFile
from services.external_knowledge_service import ExternalDatasetService
@shared_task(queue="dataset")
def external_document_indexing_task(dataset_id: str, api_template_id: str, data_source: dict, process_parameter: dict):
def external_document_indexing_task(dataset_id: str, external_knowledge_api_id: str, data_source: dict, process_parameter: dict):
"""
Async process document
:param dataset_id:
:param api_template_id:
:param external_knowledge_api_id:
:param data_source:
:param process_parameter:
Usage: external_document_indexing_task.delay(dataset_id, document_id)
@ -33,16 +33,16 @@ def external_document_indexing_task(dataset_id: str, api_template_id: str, data_
return
# get external api template
api_template = (
db.session.query(ExternalApiTemplates)
.filter(ExternalApiTemplates.id == api_template_id, ExternalApiTemplates.tenant_id == dataset.tenant_id)
external_knowledge_api = (
db.session.query(ExternalKnowledgeApis)
.filter(ExternalKnowledgeApis.id == external_knowledge_api_id, ExternalKnowledgeApis.tenant_id == dataset.tenant_id)
.first()
)
if not api_template:
if not external_knowledge_api:
logging.info(
click.style(
"Processed external dataset: {} failed, api template: {} not exit.".format(dataset_id, api_template_id),
"Processed external dataset: {} failed, api template: {} not exit.".format(dataset_id, external_knowledge_api_id),
fg="red",
)
)
@ -59,7 +59,7 @@ def external_document_indexing_task(dataset_id: str, api_template_id: str, data_
if file:
files[file.id] = (file.name, storage.load_once(file.key), file.mime_type)
try:
settings = ExternalDatasetService.get_api_template_settings(json.loads(api_template.settings))
settings = ExternalDatasetService.get_external_knowledge_api_settings(json.loads(external_knowledge_api.settings))
# assemble headers
headers = ExternalDatasetService.assembling_headers(settings.authorization, settings.headers)