From 5b4589951dc80d3543fab9a05e7d4df43efcce65 Mon Sep 17 00:00:00 2001 From: jyong <718720800@qq.com> Date: Wed, 5 Mar 2025 16:10:04 +0800 Subject: [PATCH] fix metadata --- api/core/rag/retrieval/template_prompts.py | 66 +++++++++++++++++++ .../knowledge_retrieval/template_prompts.py | 66 +++++++++++++++++++ api/models/dataset.py | 2 +- .../knowledge_entities/knowledge_entities.py | 4 +- api/services/metadata_service.py | 51 +++++++------- 5 files changed, 164 insertions(+), 25 deletions(-) create mode 100644 api/core/rag/retrieval/template_prompts.py create mode 100644 api/core/workflow/nodes/knowledge_retrieval/template_prompts.py diff --git a/api/core/rag/retrieval/template_prompts.py b/api/core/rag/retrieval/template_prompts.py new file mode 100644 index 0000000000..7abd55d798 --- /dev/null +++ b/api/core/rag/retrieval/template_prompts.py @@ -0,0 +1,66 @@ +METADATA_FILTER_SYSTEM_PROMPT = """ + ### Job Description + You are a text metadata extraction engine that extracts text's metadata based on user input and sets the metadata value + ### Task + Your task is to ONLY extract the metadata that exist in the input text from the provided metadata list and use the following operators ["=", "!=", ">", "<", ">=", "<="] to express logical relationships, then return result in JSON format with the key "metadata_field_name" and value "metadata_field_value" and comparison operator "comparison_operator". + ### Format + The input text is in the variable input_text. Metadata are specified as a list in the variable metadata_fields. + ### Constraint + DO NOT include anything other than the JSON array in your response. 
+""" # noqa: E501 + +METADATA_FILTER_USER_PROMPT_1 = """ + { "input_text": "I want to know which company’s email address test@example.com is?", + "metadata_fields": ["filename", "email", "phone", "address"] + } +""" + +METADATA_FILTER_ASSISTANT_PROMPT_1 = """ +```json + {"metadata_map": [ + {"metadata_field_name": "email", "metadata_field_value": "test@example.com", "comparison_operator": "="} + ] + } +``` +""" + +METADATA_FILTER_USER_PROMPT_2 = """ + {"input_text": "What are the movies with a score of more than 9 in 2024?", + "metadata_fields": ["name", "year", "rating", "country"]} +""" + +METADATA_FILTER_ASSISTANT_PROMPT_2 = """ +```json + {"metadata_map": [ + {"metadata_field_name": "year", "metadata_field_value": "2024", "comparison_operator": "="}, + {"metadata_field_name": "rating", "metadata_field_value": "9", "comparison_operator": ">"} + ]} +``` +""" + +METADATA_FILTER_USER_PROMPT_3 = """ + {{"input_text": "{input_text}", + "metadata_fields": {metadata_fields}}} +""" + +METADATA_FILTER_COMPLETION_PROMPT = """ +### Job Description +You are a text metadata extraction engine that extracts text's metadata based on user input and sets the metadata value +### Task +Your task is to ONLY extract the metadata that exist in the input text from the provided metadata list and use the following operators ["=", "!=", ">", "<", ">=", "<="] to express logical relationships, then return result in JSON format with the key "metadata_field_name" and value "metadata_field_value" and comparison operator "comparison_operator". +### Format +The input text is in the variable input_text. Metadata are specified as a list in the variable metadata_fields. +### Constraint +DO NOT include anything other than the JSON array in your response. +### Example +Here is the chat example between human and assistant. 
+ +User:{{"input_text": "I want to know which company’s email address test@example.com is?", "metadata_fields": ["filename", "email", "phone", "address"]}} +Assistant:{{"metadata_map": [{{"metadata_field_name": "email", "metadata_field_value": "test@example.com", "comparison_operator": "="}}]}} +User:{{"input_text": "What are the movies with a score of more than 9 in 2024?", "metadata_fields": ["name", "year", "rating", "country"]}} +Assistant:{{"metadata_map": [{{"metadata_field_name": "year", "metadata_field_value": "2024", "comparison_operator": "="}}, {{"metadata_field_name": "rating", "metadata_field_value": "9", "comparison_operator": ">"}}]}} + +### User Input +{{"input_text" : "{input_text}", "metadata_fields" : {metadata_fields}}} +### Assistant Output +""" # noqa: E501 diff --git a/api/core/workflow/nodes/knowledge_retrieval/template_prompts.py b/api/core/workflow/nodes/knowledge_retrieval/template_prompts.py new file mode 100644 index 0000000000..7abd55d798 --- /dev/null +++ b/api/core/workflow/nodes/knowledge_retrieval/template_prompts.py @@ -0,0 +1,66 @@ +METADATA_FILTER_SYSTEM_PROMPT = """ + ### Job Description + You are a text metadata extraction engine that extracts text's metadata based on user input and sets the metadata value + ### Task + Your task is to ONLY extract the metadata that exist in the input text from the provided metadata list and use the following operators ["=", "!=", ">", "<", ">=", "<="] to express logical relationships, then return result in JSON format with the key "metadata_field_name" and value "metadata_field_value" and comparison operator "comparison_operator". + ### Format + The input text is in the variable input_text. Metadata are specified as a list in the variable metadata_fields. + ### Constraint + DO NOT include anything other than the JSON array in your response. 
+""" # noqa: E501 + +METADATA_FILTER_USER_PROMPT_1 = """ + { "input_text": "I want to know which company’s email address test@example.com is?", + "metadata_fields": ["filename", "email", "phone", "address"] + } +""" + +METADATA_FILTER_ASSISTANT_PROMPT_1 = """ +```json + {"metadata_map": [ + {"metadata_field_name": "email", "metadata_field_value": "test@example.com", "comparison_operator": "="} + ] + } +``` +""" + +METADATA_FILTER_USER_PROMPT_2 = """ + {"input_text": "What are the movies with a score of more than 9 in 2024?", + "metadata_fields": ["name", "year", "rating", "country"]} +""" + +METADATA_FILTER_ASSISTANT_PROMPT_2 = """ +```json + {"metadata_map": [ + {"metadata_field_name": "year", "metadata_field_value": "2024", "comparison_operator": "="}, + {"metadata_field_name": "rating", "metadata_field_value": "9", "comparison_operator": ">"} + ]} +``` +""" + +METADATA_FILTER_USER_PROMPT_3 = """ + {{"input_text": "{input_text}", + "metadata_fields": {metadata_fields}}} +""" + +METADATA_FILTER_COMPLETION_PROMPT = """ +### Job Description +You are a text metadata extraction engine that extracts text's metadata based on user input and sets the metadata value +### Task +Your task is to ONLY extract the metadata that exist in the input text from the provided metadata list and use the following operators ["=", "!=", ">", "<", ">=", "<="] to express logical relationships, then return result in JSON format with the key "metadata_field_name" and value "metadata_field_value" and comparison operator "comparison_operator". +### Format +The input text is in the variable input_text. Metadata are specified as a list in the variable metadata_fields. +### Constraint +DO NOT include anything other than the JSON array in your response. +### Example +Here is the chat example between human and assistant. 
+ +User:{{"input_text": "I want to know which company’s email address test@example.com is?", "metadata_fields": ["filename", "email", "phone", "address"]}} +Assistant:{{"metadata_map": [{{"metadata_field_name": "email", "metadata_field_value": "test@example.com", "comparison_operator": "="}}]}} +User:{{"input_text": "What are the movies with a score of more than 9 in 2024?", "metadata_fields": ["name", "year", "rating", "country"]}} +Assistant:{{"metadata_map": [{{"metadata_field_name": "year", "metadata_field_value": "2024", "comparison_operator": "="}}, {{"metadata_field_name": "rating", "metadata_field_value": "9", "comparison_operator": ">"}}]}} + +### User Input +{{"input_text" : "{input_text}", "metadata_fields" : {metadata_fields}}} +### Assistant Output +""" # noqa: E501 diff --git a/api/models/dataset.py b/api/models/dataset.py index 4bd0b0ea6f..b19ec05414 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -442,7 +442,7 @@ class Document(db.Model): # type: ignore[name-defined] "id": metadata.id, "name": metadata.name, "type": metadata.type, - "value": self.doc_metadata.get(metadata.type), + "value": self.doc_metadata.get(metadata.name), } metadata_list.append(metadata_dict) # deal built-in fields diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py index 7d0f545f9e..29e00ab68a 100644 --- a/api/services/entities/knowledge_entities/knowledge_entities.py +++ b/api/services/entities/knowledge_entities/knowledge_entities.py @@ -133,7 +133,7 @@ class MetadataArgs(BaseModel): class MetadataUpdateArgs(BaseModel): name: str - value: str + value: Optional[str | int | float] = None class MetadataValueUpdateArgs(BaseModel): @@ -143,7 +143,7 @@ class MetadataValueUpdateArgs(BaseModel): class MetadataDetail(BaseModel): id: str name: str - value: str + value: Optional[str | int | float] = None class DocumentMetadataOperation(BaseModel): diff --git 
a/api/services/metadata_service.py b/api/services/metadata_service.py index 45814ee066..fd0738772b 100644 --- a/api/services/metadata_service.py +++ b/api/services/metadata_service.py @@ -105,12 +105,15 @@ class MetadataService: if documents: for document in documents: if not document.doc_metadata: - document.doc_metadata = {} - document.doc_metadata[BuiltInField.document_name] = document.name - document.doc_metadata[BuiltInField.uploader] = document.uploader - document.doc_metadata[BuiltInField.upload_date] = document.upload_date.timestamp() - document.doc_metadata[BuiltInField.last_update_date] = document.last_update_date.timestamp() - document.doc_metadata[BuiltInField.source] = document.data_source_type + doc_metadata = {} + else: + doc_metadata = document.doc_metadata + doc_metadata[BuiltInField.document_name.value] = document.name + doc_metadata[BuiltInField.uploader.value] = document.uploader + doc_metadata[BuiltInField.upload_date.value] = document.upload_date.timestamp() + doc_metadata[BuiltInField.last_update_date.value] = document.last_update_date.timestamp() + doc_metadata[BuiltInField.source.value] = document.data_source_type + document.doc_metadata = doc_metadata db.session.add(document) db.session.commit() except Exception: @@ -131,11 +134,13 @@ class MetadataService: document_ids = [] if documents: for document in documents: - document.doc_metadata.pop(BuiltInField.document_name) - document.doc_metadata.pop(BuiltInField.uploader) - document.doc_metadata.pop(BuiltInField.upload_date) - document.doc_metadata.pop(BuiltInField.last_update_date) - document.doc_metadata.pop(BuiltInField.source) + doc_metadata = document.doc_metadata + doc_metadata.pop(BuiltInField.document_name) + doc_metadata.pop(BuiltInField.uploader) + doc_metadata.pop(BuiltInField.upload_date) + doc_metadata.pop(BuiltInField.last_update_date) + doc_metadata.pop(BuiltInField.source) + document.doc_metadata = doc_metadata db.session.add(document) document_ids.append(document.id) 
db.session.commit() @@ -150,18 +155,21 @@ class MetadataService: lock_key = f"document_metadata_lock_{operation.document_id}" try: MetadataService.knowledge_base_metadata_lock_check(None, operation.document_id) - document = DocumentService.get_document(operation.document_id) + document = DocumentService.get_document(dataset.id, operation.document_id) if document is None: raise ValueError("Document not found.") - document.doc_metadata = {} - for metadata_value in metadata_args.fields: - document.doc_metadata[metadata_value.name] = metadata_value.value - if dataset.built_in_fields: - document.doc_metadata[BuiltInField.document_name] = document.name - document.doc_metadata[BuiltInField.uploader] = document.uploader - document.doc_metadata[BuiltInField.upload_date] = document.upload_date.timestamp() - document.doc_metadata[BuiltInField.last_update_date] = document.last_update_date.timestamp() - document.doc_metadata[BuiltInField.source] = document.data_source_type + doc_metadata = {} + for metadata_value in operation.metadata_list: + doc_metadata[metadata_value.name] = metadata_value.value + if dataset.built_in_field_enabled: + doc_metadata[BuiltInField.document_name.value] = document.name + doc_metadata[BuiltInField.uploader.value] = document.uploader + doc_metadata[BuiltInField.upload_date.value] = document.upload_date.timestamp() + doc_metadata[BuiltInField.last_update_date.value] = document.last_update_date.timestamp() + doc_metadata[BuiltInField.source.value] = document.data_source_type + document.doc_metadata = doc_metadata + db.session.add(document) + db.session.commit() # deal metadata bindding DatasetMetadataBinding.query.filter_by(document_id=operation.document_id).delete() for metadata_value in operation.metadata_list: @@ -173,7 +181,6 @@ class MetadataService: created_by=current_user.id, ) db.session.add(dataset_metadata_binding) - db.session.add(document) db.session.commit() except Exception: logging.exception("Update documents metadata failed")