From 5b4589951dc80d3543fab9a05e7d4df43efcce65 Mon Sep 17 00:00:00 2001
From: jyong <718720800@qq.com>
Date: Wed, 5 Mar 2025 16:10:04 +0800
Subject: [PATCH] fix metadata

---
 api/core/rag/retrieval/template_prompts.py    | 66 +++++++++++++++++++
 .../knowledge_retrieval/template_prompts.py   | 66 +++++++++++++++++++
 api/models/dataset.py                         |  2 +-
 .../knowledge_entities/knowledge_entities.py  |  4 +-
 api/services/metadata_service.py              | 51 +++++++-------
 5 files changed, 164 insertions(+), 25 deletions(-)
 create mode 100644 api/core/rag/retrieval/template_prompts.py
 create mode 100644 api/core/workflow/nodes/knowledge_retrieval/template_prompts.py
diff --git a/api/core/rag/retrieval/template_prompts.py b/api/core/rag/retrieval/template_prompts.py
new file mode 100644
index 0000000000..7abd55d798
--- /dev/null
+++ b/api/core/rag/retrieval/template_prompts.py
@@ -0,0 +1,66 @@
+# LLM prompts for metadata-filter extraction; doubled braces assume str.format rendering (confirm with caller).
+METADATA_FILTER_SYSTEM_PROMPT = """
+    ### Job Description
+    You are a text metadata extract engine that extract text's metadata based on user input and set the metadata value
+    ### Task
+    Your task is to ONLY extract the metadatas that exist in the input text from the provided metadata list and Use the following operators ["=", "!=", ">", "<", ">=", "<="] to express logical relationships, then return result in JSON format with the key "metadata_field_name" and value "metadata_field_value" and comparison operator "comparison_operator".
+    ### Format
+    The input text is in the variable input_text. Metadata are specified as a list in the variable metadata_fields.
+    ### Constraint
+    DO NOT include anything other than the JSON array in your response.
+"""  # noqa: E501
+
+METADATA_FILTER_USER_PROMPT_1 = """
+    { "input_text": "I want to know which company’s email address test@example.com is?",
+    "metadata_fields": ["filename", "email", "phone", "address"]
+    }
+"""  # few-shot example 1: user turn
+
+METADATA_FILTER_ASSISTANT_PROMPT_1 = """
+```json
+    {"metadata_map": [
+        {"metadata_field_name": "email", "metadata_field_value": "test@example.com", "comparison_operator": "="}
+    ]
+    }
+```
+"""  # few-shot example 1: assistant reply
+
+METADATA_FILTER_USER_PROMPT_2 = """
+    {"input_text": "What are the movies with a score of more than 9 in 2024?",
+    "metadata_fields": ["name", "year", "rating", "country"]}
+"""  # few-shot example 2: user turn
+
+METADATA_FILTER_ASSISTANT_PROMPT_2 = """
+```json
+    {"metadata_map": [
+        {"metadata_field_name": "year", "metadata_field_value": "2024", "comparison_operator": "="},
+        {"metadata_field_name": "rating", "metadata_field_value": "9", "comparison_operator": ">"}
+    ]}
+```
+"""  # few-shot example 2: assistant reply
+
+METADATA_FILTER_USER_PROMPT_3 = """
+    {{"input_text": "{input_text}", "metadata_fields": {metadata_fields}}}
+"""  # request template; placeholders: input_text, metadata_fields
+
+METADATA_FILTER_COMPLETION_PROMPT = """
+### Job Description
+You are a text metadata extract engine that extract text's metadata based on user input and set the metadata value
+### Task
+Your task is to ONLY extract the metadatas that exist in the input text from the provided metadata list and Use the following operators ["=", "!=", ">", "<", ">=", "<="] to express logical relationships, then return result in JSON format with the key "metadata_field_name" and value "metadata_field_value" and comparison operator "comparison_operator".
+### Format
+The input text is in the variable input_text. Metadata are specified as a list in the variable metadata_fields.
+### Constraint 
+DO NOT include anything other than the JSON array in your response.
+### Example
+Here is the chat example between human and assistant, inside <example></example> XML tags.
+<example>
+User:{{"input_text": "I want to know which company’s email address test@example.com is?", "metadata_fields": ["filename", "email", "phone", "address"]}}
+Assistant:{{"metadata_map": [{{"metadata_field_name": "email", "metadata_field_value": "test@example.com", "comparison_operator": "="}}]}}
+User:{{"input_text": "What are the movies with a score of more than 9 in 2024?", "metadata_fields": ["name", "year", "rating", "country"]}}
+Assistant:{{"metadata_map": [{{"metadata_field_name": "year", "metadata_field_value": "2024", "comparison_operator": "="}}, {{"metadata_field_name": "rating", "metadata_field_value": "9", "comparison_operator": ">"}}]}}
+</example> 
+### User Input
+{{"input_text" : "{input_text}", "metadata_fields" : {metadata_fields}}}
+### Assistant Output
+"""  # noqa: E501
diff --git a/api/core/workflow/nodes/knowledge_retrieval/template_prompts.py b/api/core/workflow/nodes/knowledge_retrieval/template_prompts.py
new file mode 100644
index 0000000000..7abd55d798
--- /dev/null
+++ b/api/core/workflow/nodes/knowledge_retrieval/template_prompts.py
@@ -0,0 +1,66 @@
+# LLM prompts for metadata-filter extraction; doubled braces assume str.format rendering (confirm with caller).
+METADATA_FILTER_SYSTEM_PROMPT = """
+    ### Job Description
+    You are a text metadata extract engine that extract text's metadata based on user input and set the metadata value
+    ### Task
+    Your task is to ONLY extract the metadatas that exist in the input text from the provided metadata list and Use the following operators ["=", "!=", ">", "<", ">=", "<="] to express logical relationships, then return result in JSON format with the key "metadata_field_name" and value "metadata_field_value" and comparison operator "comparison_operator".
+    ### Format
+    The input text is in the variable input_text. Metadata are specified as a list in the variable metadata_fields.
+    ### Constraint
+    DO NOT include anything other than the JSON array in your response.
+"""  # noqa: E501
+
+METADATA_FILTER_USER_PROMPT_1 = """
+    { "input_text": "I want to know which company’s email address test@example.com is?",
+    "metadata_fields": ["filename", "email", "phone", "address"]
+    }
+"""  # few-shot example 1: user turn
+
+METADATA_FILTER_ASSISTANT_PROMPT_1 = """
+```json
+    {"metadata_map": [
+        {"metadata_field_name": "email", "metadata_field_value": "test@example.com", "comparison_operator": "="}
+    ]
+    }
+```
+"""  # few-shot example 1: assistant reply
+
+METADATA_FILTER_USER_PROMPT_2 = """
+    {"input_text": "What are the movies with a score of more than 9 in 2024?",
+    "metadata_fields": ["name", "year", "rating", "country"]}
+"""  # few-shot example 2: user turn
+
+METADATA_FILTER_ASSISTANT_PROMPT_2 = """
+```json
+    {"metadata_map": [
+        {"metadata_field_name": "year", "metadata_field_value": "2024", "comparison_operator": "="},
+        {"metadata_field_name": "rating", "metadata_field_value": "9", "comparison_operator": ">"}
+    ]}
+```
+"""  # few-shot example 2: assistant reply
+
+METADATA_FILTER_USER_PROMPT_3 = """
+    {{"input_text": "{input_text}", "metadata_fields": {metadata_fields}}}
+"""  # request template; placeholders: input_text, metadata_fields
+
+METADATA_FILTER_COMPLETION_PROMPT = """
+### Job Description
+You are a text metadata extract engine that extract text's metadata based on user input and set the metadata value
+### Task
+Your task is to ONLY extract the metadatas that exist in the input text from the provided metadata list and Use the following operators ["=", "!=", ">", "<", ">=", "<="] to express logical relationships, then return result in JSON format with the key "metadata_field_name" and value "metadata_field_value" and comparison operator "comparison_operator".
+### Format
+The input text is in the variable input_text. Metadata are specified as a list in the variable metadata_fields.
+### Constraint 
+DO NOT include anything other than the JSON array in your response.
+### Example
+Here is the chat example between human and assistant, inside <example></example> XML tags.
+<example>
+User:{{"input_text": "I want to know which company’s email address test@example.com is?", "metadata_fields": ["filename", "email", "phone", "address"]}}
+Assistant:{{"metadata_map": [{{"metadata_field_name": "email", "metadata_field_value": "test@example.com", "comparison_operator": "="}}]}}
+User:{{"input_text": "What are the movies with a score of more than 9 in 2024?", "metadata_fields": ["name", "year", "rating", "country"]}}
+Assistant:{{"metadata_map": [{{"metadata_field_name": "year", "metadata_field_value": "2024", "comparison_operator": "="}}, {{"metadata_field_name": "rating", "metadata_field_value": "9", "comparison_operator": ">"}}]}}
+</example> 
+### User Input
+{{"input_text" : "{input_text}", "metadata_fields" : {metadata_fields}}}
+### Assistant Output
+"""  # noqa: E501
diff --git a/api/models/dataset.py b/api/models/dataset.py
index 4bd0b0ea6f..b19ec05414 100644
--- a/api/models/dataset.py
+++ b/api/models/dataset.py
@@ -442,7 +442,7 @@ class Document(db.Model):  # type: ignore[name-defined]
                     "id": metadata.id,
                     "name": metadata.name,
                     "type": metadata.type,
-                    "value": self.doc_metadata.get(metadata.type),
+                    "value": self.doc_metadata.get(metadata.name),
                 }
                 metadata_list.append(metadata_dict)
             # deal built-in fields
diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py
index 7d0f545f9e..29e00ab68a 100644
--- a/api/services/entities/knowledge_entities/knowledge_entities.py
+++ b/api/services/entities/knowledge_entities/knowledge_entities.py
@@ -133,7 +133,7 @@ class MetadataArgs(BaseModel):
 
 class MetadataUpdateArgs(BaseModel):
     name: str
-    value: str
+    value: Optional[str | int | float] = None
 
 
 class MetadataValueUpdateArgs(BaseModel):
@@ -143,7 +143,7 @@ class MetadataValueUpdateArgs(BaseModel):
 class MetadataDetail(BaseModel):
     id: str
     name: str
-    value: str
+    value: Optional[str | int | float] = None
 
 
 class DocumentMetadataOperation(BaseModel):
diff --git a/api/services/metadata_service.py b/api/services/metadata_service.py
index 45814ee066..fd0738772b 100644
--- a/api/services/metadata_service.py
+++ b/api/services/metadata_service.py
@@ -105,12 +105,15 @@ class MetadataService:
             if documents:
                 for document in documents:
                     if not document.doc_metadata:
-                        document.doc_metadata = {}
-                    document.doc_metadata[BuiltInField.document_name] = document.name
-                    document.doc_metadata[BuiltInField.uploader] = document.uploader
-                    document.doc_metadata[BuiltInField.upload_date] = document.upload_date.timestamp()
-                    document.doc_metadata[BuiltInField.last_update_date] = document.last_update_date.timestamp()
-                    document.doc_metadata[BuiltInField.source] = document.data_source_type
+                        doc_metadata = {}
+                    else:
+                        doc_metadata = document.doc_metadata
+                    doc_metadata[BuiltInField.document_name.value] = document.name
+                    doc_metadata[BuiltInField.uploader.value] = document.uploader
+                    doc_metadata[BuiltInField.upload_date.value] = document.upload_date.timestamp()
+                    doc_metadata[BuiltInField.last_update_date.value] = document.last_update_date.timestamp()
+                    doc_metadata[BuiltInField.source.value] = document.data_source_type
+                    document.doc_metadata = doc_metadata
                     db.session.add(document)
                 db.session.commit()
         except Exception:
@@ -131,11 +134,13 @@ class MetadataService:
             document_ids = []
             if documents:
                 for document in documents:
-                    document.doc_metadata.pop(BuiltInField.document_name)
-                    document.doc_metadata.pop(BuiltInField.uploader)
-                    document.doc_metadata.pop(BuiltInField.upload_date)
-                    document.doc_metadata.pop(BuiltInField.last_update_date)
-                    document.doc_metadata.pop(BuiltInField.source)
+                    doc_metadata = document.doc_metadata
+                    doc_metadata.pop(BuiltInField.document_name)
+                    doc_metadata.pop(BuiltInField.uploader)
+                    doc_metadata.pop(BuiltInField.upload_date)
+                    doc_metadata.pop(BuiltInField.last_update_date)
+                    doc_metadata.pop(BuiltInField.source)
+                    document.doc_metadata = doc_metadata
                     db.session.add(document)
                     document_ids.append(document.id)
             db.session.commit()
@@ -150,18 +155,21 @@ class MetadataService:
             lock_key = f"document_metadata_lock_{operation.document_id}"
             try:
                 MetadataService.knowledge_base_metadata_lock_check(None, operation.document_id)
-                document = DocumentService.get_document(operation.document_id)
+                document = DocumentService.get_document(dataset.id, operation.document_id)
                 if document is None:
                     raise ValueError("Document not found.")
-                document.doc_metadata = {}
-                for metadata_value in metadata_args.fields:
-                    document.doc_metadata[metadata_value.name] = metadata_value.value
-                if dataset.built_in_fields:
-                    document.doc_metadata[BuiltInField.document_name] = document.name
-                    document.doc_metadata[BuiltInField.uploader] = document.uploader
-                    document.doc_metadata[BuiltInField.upload_date] = document.upload_date.timestamp()
-                    document.doc_metadata[BuiltInField.last_update_date] = document.last_update_date.timestamp()
-                    document.doc_metadata[BuiltInField.source] = document.data_source_type
+                doc_metadata = {}
+                for metadata_value in operation.metadata_list:
+                    doc_metadata[metadata_value.name] = metadata_value.value
+                if dataset.built_in_field_enabled:
+                    doc_metadata[BuiltInField.document_name.value] = document.name
+                    doc_metadata[BuiltInField.uploader.value] = document.uploader
+                    doc_metadata[BuiltInField.upload_date.value] = document.upload_date.timestamp()
+                    doc_metadata[BuiltInField.last_update_date.value] = document.last_update_date.timestamp()
+                    doc_metadata[BuiltInField.source.value] = document.data_source_type
+                document.doc_metadata = doc_metadata
+                db.session.add(document)
+                db.session.commit()
                 # deal metadata bindding
                 DatasetMetadataBinding.query.filter_by(document_id=operation.document_id).delete()
                 for metadata_value in operation.metadata_list:
@@ -173,7 +181,6 @@ class MetadataService:
                         created_by=current_user.id,
                     )
                     db.session.add(dataset_metadata_binding)
-                db.session.add(document)
                 db.session.commit()
             except Exception:
                 logging.exception("Update documents metadata failed")