From d135677c25aa632d13b6a4db8eca68365cd4b660 Mon Sep 17 00:00:00 2001 From: Jyong <76649700+JohnJyong@users.noreply.github.com> Date: Thu, 20 Mar 2025 01:38:15 +0800 Subject: [PATCH 1/7] add vdb document id index (#16244) Co-authored-by: crazywoola <427733928@qq.com> --- .../vdb/elasticsearch/elasticsearch_vector.py | 3 +- api/core/rag/datasource/vdb/field.py | 1 + .../datasource/vdb/qdrant/qdrant_vector.py | 4 +++ .../tidb_on_qdrant/tidb_on_qdrant_vector.py | 36 ++++++++----------- .../datasource/vdb/tidb_vector/tidb_vector.py | 2 ++ 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_vector.py b/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_vector.py index 093368b0cc..033d05a077 100644 --- a/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_vector.py +++ b/api/core/rag/datasource/vdb/elasticsearch/elasticsearch_vector.py @@ -196,7 +196,8 @@ class ElasticSearchVector(BaseVector): Field.METADATA_KEY.value: { "type": "object", "properties": { - "doc_id": {"type": "keyword"} # Map doc_id to keyword type + "doc_id": {"type": "keyword"}, # Map doc_id to keyword type + "document_id": {"type": "keyword"}, # Map document_id to keyword type }, }, } diff --git a/api/core/rag/datasource/vdb/field.py b/api/core/rag/datasource/vdb/field.py index a64407bce1..9887e21b7c 100644 --- a/api/core/rag/datasource/vdb/field.py +++ b/api/core/rag/datasource/vdb/field.py @@ -11,3 +11,4 @@ class Field(Enum): TEXT_KEY = "text" PRIMARY_KEY = "id" DOC_ID = "metadata.doc_id" + DOCUMENT_ID = "metadata.document_id" diff --git a/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py b/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py index 73ce8201fd..4efd90667a 100644 --- a/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py +++ b/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py @@ -134,6 +134,10 @@ class QdrantVector(BaseVector): self._client.create_payload_index( collection_name, Field.DOC_ID.value, 
field_schema=PayloadSchemaType.KEYWORD ) + # create document_id payload index + self._client.create_payload_index( + collection_name, Field.DOCUMENT_ID.value, field_schema=PayloadSchemaType.KEYWORD + ) # create full text index text_index_params = TextIndexParams( type=TextIndexType.TEXT, diff --git a/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py b/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py index ae4baeb17e..6a61fe9496 100644 --- a/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py +++ b/api/core/rag/datasource/vdb/tidb_on_qdrant/tidb_on_qdrant_vector.py @@ -144,6 +144,10 @@ class TidbOnQdrantVector(BaseVector): self._client.create_payload_index( collection_name, Field.DOC_ID.value, field_schema=PayloadSchemaType.KEYWORD ) + # create document_id payload index + self._client.create_payload_index( + collection_name, Field.DOCUMENT_ID.value, field_schema=PayloadSchemaType.KEYWORD + ) # create full text index text_index_params = TextIndexParams( type=TextIndexType.TEXT, @@ -318,23 +322,17 @@ class TidbOnQdrantVector(BaseVector): def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]: from qdrant_client.http import models - filter = models.Filter( - must=[ - models.FieldCondition( - key="group_id", - match=models.MatchValue(value=self._group_id), - ), - ], - ) + filter = None document_ids_filter = kwargs.get("document_ids_filter") if document_ids_filter: - if filter.must: - filter.must.append( + filter = models.Filter( + must=[ models.FieldCondition( key="metadata.document_id", match=models.MatchAny(any=document_ids_filter), ) - ) + ], + ) results = self._client.search( collection_name=self._collection_name, query_vector=query_vector, @@ -369,23 +367,17 @@ class TidbOnQdrantVector(BaseVector): """ from qdrant_client.http import models - scroll_filter = models.Filter( - must=[ - models.FieldCondition( - key="page_content", - match=models.MatchText(text=query), - ) - ] - ) + 
scroll_filter = None document_ids_filter = kwargs.get("document_ids_filter") if document_ids_filter: - if scroll_filter.must: - scroll_filter.must.append( + scroll_filter = models.Filter( + must=[ models.FieldCondition( key="metadata.document_id", match=models.MatchAny(any=document_ids_filter), ) - ) + ] + ) response = self._client.scroll( collection_name=self._collection_name, scroll_filter=scroll_filter, diff --git a/api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py b/api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py index 77c5786042..efa68059e5 100644 --- a/api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py +++ b/api/core/rag/datasource/vdb/tidb_vector/tidb_vector.py @@ -105,10 +105,12 @@ class TiDBVector(BaseVector): text TEXT NOT NULL, meta JSON NOT NULL, doc_id VARCHAR(64) AS (JSON_UNQUOTE(JSON_EXTRACT(meta, '$.doc_id'))) STORED, + document_id VARCHAR(64) AS (JSON_UNQUOTE(JSON_EXTRACT(meta, '$.document_id'))) STORED, vector VECTOR({dimension}) NOT NULL, create_time DATETIME DEFAULT CURRENT_TIMESTAMP, update_time DATETIME DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, KEY (doc_id), + KEY (document_id), VECTOR INDEX idx_vector (({tidb_dist_func}(vector))) USING HNSW ); """) From daad5824bf6c23aec1c2169b3866c647af96393a Mon Sep 17 00:00:00 2001 From: wyy-holding <59436937+wyy-holding@users.noreply.github.com> Date: Thu, 20 Mar 2025 09:28:09 +0800 Subject: [PATCH 2/7] add kubernetes yaml for dify by docker-compose.yaml (#16246) --- README.md | 1 + README_AR.md | 1 + README_BN.md | 1 + README_CN.md | 1 + README_DE.md | 1 + README_ES.md | 1 + README_FR.md | 1 + README_JA.md | 1 + README_KL.md | 1 + README_KR.md | 1 + README_PT.md | 1 + README_SI.md | 1 + README_TR.md | 1 + README_TW.md | 1 + README_VI.md | 1 + 15 files changed, 15 insertions(+) diff --git a/README.md b/README.md index c97bf9cf3a..87ebc9bafc 100644 --- a/README.md +++ b/README.md @@ -206,6 +206,7 @@ If you'd like to configure a highly-available setup, there are 
community-contrib - [Helm Chart by @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [Helm Chart by @magicsong](https://github.com/magicsong/ai-charts) - [YAML file by @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [YAML file by @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### Using Terraform for Deployment diff --git a/README_AR.md b/README_AR.md index 7f167a5a2d..e58f59da5d 100644 --- a/README_AR.md +++ b/README_AR.md @@ -189,6 +189,7 @@ docker compose up -d - [رسم بياني Helm من قبل @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [رسم بياني Helm من قبل @magicsong](https://github.com/magicsong/ai-charts) - [ملف YAML من قبل @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [ملف YAML من قبل @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### استخدام Terraform للتوزيع diff --git a/README_BN.md b/README_BN.md index 1329ddf1ed..3ebc81af5d 100644 --- a/README_BN.md +++ b/README_BN.md @@ -205,6 +205,7 @@ GitHub-এ ডিফাইকে স্টার দিয়ে রাখুন - [Helm Chart by @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [Helm Chart by @magicsong](https://github.com/magicsong/ai-charts) - [YAML file by @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [YAML file by @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### টেরাফর্ম ব্যবহার করে ডিপ্লয় diff --git a/README_CN.md b/README_CN.md index d4fa930c2f..33e34423ff 100644 --- a/README_CN.md +++ b/README_CN.md @@ -207,6 +207,7 @@ docker compose up -d - [Helm Chart by @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [Helm Chart by @magicsong](https://github.com/magicsong/ai-charts) - [YAML 文件 by @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [YAML file by @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### 使用 Terraform 部署 diff --git a/README_DE.md b/README_DE.md index d260d17136..b3b9bf3221 100644 --- a/README_DE.md +++ b/README_DE.md @@ -207,6 +207,7 @@ Falls Sie eine hochverfügbare Konfiguration 
einrichten möchten, gibt es von de - [Helm Chart by @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [Helm Chart by @magicsong](https://github.com/magicsong/ai-charts) - [YAML file by @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [YAML file by @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### Terraform für die Bereitstellung verwenden diff --git a/README_ES.md b/README_ES.md index 1547ce8271..d14afdd2eb 100644 --- a/README_ES.md +++ b/README_ES.md @@ -207,6 +207,7 @@ Si desea configurar una configuración de alta disponibilidad, la comunidad prop - [Gráfico Helm por @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [Gráfico Helm por @magicsong](https://github.com/magicsong/ai-charts) - [Ficheros YAML por @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [Ficheros YAML por @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### Uso de Terraform para el despliegue diff --git a/README_FR.md b/README_FR.md index 5ff9fe2ded..031196303e 100644 --- a/README_FR.md +++ b/README_FR.md @@ -205,6 +205,7 @@ Si vous souhaitez configurer une configuration haute disponibilité, la communau - [Helm Chart par @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [Helm Chart par @magicsong](https://github.com/magicsong/ai-charts) - [Fichier YAML par @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [Fichier YAML par @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### Utilisation de Terraform pour le déploiement diff --git a/README_JA.md b/README_JA.md index 6575c5f113..3b7a6f50db 100644 --- a/README_JA.md +++ b/README_JA.md @@ -206,6 +206,7 @@ docker compose up -d - [Helm Chart by @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [Helm Chart by @magicsong](https://github.com/magicsong/ai-charts) - [YAML file by @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [YAML file by @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### Terraformを使用したデプロイ diff --git 
a/README_KL.md b/README_KL.md index 2ad3744f15..ccadb77274 100644 --- a/README_KL.md +++ b/README_KL.md @@ -205,6 +205,7 @@ If you'd like to configure a highly-available setup, there are community-contrib - [Helm Chart by @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [Helm Chart by @magicsong](https://github.com/magicsong/ai-charts) - [YAML file by @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [YAML file by @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### Terraform atorlugu pilersitsineq diff --git a/README_KR.md b/README_KR.md index 3f9ea20099..c1a98f8b68 100644 --- a/README_KR.md +++ b/README_KR.md @@ -199,6 +199,7 @@ Dify를 Kubernetes에 배포하고 프리미엄 스케일링 설정을 구성했 - [Helm Chart by @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [Helm Chart by @magicsong](https://github.com/magicsong/ai-charts) - [YAML file by @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [YAML file by @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### Terraform을 사용한 배포 diff --git a/README_PT.md b/README_PT.md index 90b508c8f6..5b3c782645 100644 --- a/README_PT.md +++ b/README_PT.md @@ -205,6 +205,7 @@ Se deseja configurar uma instalação de alta disponibilidade, há [Helm Charts] - [Helm Chart de @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [Helm Chart de @magicsong](https://github.com/magicsong/ai-charts) - [Arquivo YAML por @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [Arquivo YAML por @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### Usando o Terraform para Implantação diff --git a/README_SI.md b/README_SI.md index 5b7c9611f9..7c0867c776 100644 --- a/README_SI.md +++ b/README_SI.md @@ -205,6 +205,7 @@ Star Dify on GitHub and be instantly notified of new releases. 
- [Helm Chart by @LeoQuote](https://github.com/douban/charts/tree/master/charts/dify) - [Helm Chart by @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [YAML file by @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [YAML file by @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### Uporaba Terraform za uvajanje diff --git a/README_TR.md b/README_TR.md index 7af8582b7d..f8890b00ef 100644 --- a/README_TR.md +++ b/README_TR.md @@ -198,6 +198,7 @@ Yüksek kullanılabilirliğe sahip bir kurulum yapılandırmak isterseniz, Dify' - [@LeoQuote tarafından Helm Chart](https://github.com/douban/charts/tree/master/charts/dify) - [@BorisPolonsky tarafından Helm Chart](https://github.com/BorisPolonsky/dify-helm) - [@Winson-030 tarafından YAML dosyası](https://github.com/Winson-030/dify-kubernetes) +- [@wyy-holding tarafından YAML dosyası](https://github.com/wyy-holding/dify-k8s) #### Dağıtım için Terraform Kullanımı diff --git a/README_TW.md b/README_TW.md index 4bfc81a25e..260f1e80ac 100644 --- a/README_TW.md +++ b/README_TW.md @@ -204,6 +204,7 @@ Dify 的所有功能都提供相應的 API,因此您可以輕鬆地將 Dify - [由 @LeoQuote 提供的 Helm Chart](https://github.com/douban/charts/tree/master/charts/dify) - [由 @BorisPolonsky 提供的 Helm Chart](https://github.com/BorisPolonsky/dify-helm) - [由 @Winson-030 提供的 YAML 文件](https://github.com/Winson-030/dify-kubernetes) +- [由 @wyy-holding 提供的 YAML 文件](https://github.com/wyy-holding/dify-k8s) ### 使用 Terraform 進行部署 diff --git a/README_VI.md b/README_VI.md index 2f64541285..15d2d5ae80 100644 --- a/README_VI.md +++ b/README_VI.md @@ -200,6 +200,7 @@ Nếu bạn muốn cấu hình một cài đặt có độ sẵn sàng cao, có - [Helm Chart bởi @LeoQuote](https://github.com/douban/charts/tree/master/charts/dify) - [Helm Chart bởi @BorisPolonsky](https://github.com/BorisPolonsky/dify-helm) - [Tệp YAML bởi @Winson-030](https://github.com/Winson-030/dify-kubernetes) +- [Tệp YAML bởi @wyy-holding](https://github.com/wyy-holding/dify-k8s) #### Sử dụng Terraform để Triển khai 
From 285314da1c35edcf08aa97e73101457a24f6bce6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E3=83=AA=E3=82=A4=E3=83=8E=20Lin?= Date: Thu, 20 Mar 2025 09:28:42 +0800 Subject: [PATCH 3/7] fix: update workflow doc (#16251) --- .../develop/template/template_workflow.en.mdx | 37 ++++++++++--------- .../develop/template/template_workflow.ja.mdx | 36 ++++++++++-------- .../develop/template/template_workflow.zh.mdx | 36 +++++++++--------- 3 files changed, 60 insertions(+), 49 deletions(-) diff --git a/web/app/components/develop/template/template_workflow.en.mdx b/web/app/components/develop/template/template_workflow.en.mdx index 27c0d26505..c8b4b614c8 100644 --- a/web/app/components/develop/template/template_workflow.en.mdx +++ b/web/app/components/develop/template/template_workflow.en.mdx @@ -43,18 +43,9 @@ Workflow applications offers non-session support and is ideal for translation, a - `inputs` (object) Required Allows the entry of various variable values defined by the App. The `inputs` parameter contains multiple key/value pairs, with each key corresponding to a specific variable and each value being the specific value for that variable. - The workflow application requires at least one key/value pair to be inputted. - If the variable is of File type, specify an object that has the keys described in `files` below. - - `response_mode` (string) Required - The mode of response return, supporting: - - `streaming` Streaming mode (recommended), implements a typewriter-like output through SSE ([Server-Sent Events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events)). - - `blocking` Blocking mode, returns result after execution is complete. (Requests may be interrupted if the process is long) - Due to Cloudflare restrictions, the request will be interrupted without a return after 100 seconds. - - `user` (string) Required - User identifier, used to define the identity of the end-user for retrieval and statistics. 
- Should be uniquely defined by the developer within the application. - - `files` (array[object]) Optional - File list, suitable for inputting files combined with text understanding and answering questions, available only when the model supports file parsing and understanding capability. + The workflow application requires at least one key/value pair to be inputted. The variable can be of File Array type. + A File Array type variable is suitable for inputting files combined with text understanding and answering questions, available only when the model supports file parsing and understanding capability. + If the variable is of File Array type, the corresponding value should be a list whose elements contain the following attributes: - `type` (string) Supported type: - `document` ('TXT', 'MD', 'MARKDOWN', 'PDF', 'HTML', 'XLSX', 'XLS', 'DOCX', 'CSV', 'EML', 'MSG', 'PPTX', 'PPT', 'XML', 'EPUB') - `image` ('JPG', 'JPEG', 'PNG', 'GIF', 'WEBP', 'SVG') @@ -65,6 +56,15 @@ Workflow applications offers non-session support and is ideal for translation, a - `url` (string) Image URL (when the transfer method is `remote_url`) - `upload_file_id` (string) Uploaded file ID, which must be obtained by uploading through the File Upload API in advance (when the transfer method is `local_file`) + - `response_mode` (string) Required + The mode of response return, supporting: + - `streaming` Streaming mode (recommended), implements a typewriter-like output through SSE ([Server-Sent Events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events)). + - `blocking` Blocking mode, returns result after execution is complete. (Requests may be interrupted if the process is long) + Due to Cloudflare restrictions, the request will be interrupted without a return after 100 seconds. + - `user` (string) Required + User identifier, used to define the identity of the end-user for retrieval and statistics. 
+ Should be uniquely defined by the developer within the application. + ### Response When `response_mode` is `blocking`, return a CompletionResponse object. When `response_mode` is `streaming`, return a ChunkCompletionResponse stream. @@ -190,15 +190,18 @@ Workflow applications offers non-session support and is ideal for translation, a ``` - + ```json {{ title: 'File variable example' }} { "inputs": { - "{variable_name}": { + "{variable_name}": + [ + { "transfer_method": "local_file", "upload_file_id": "{upload_file_id}", "type": "{document_type}" - } + } + ] } } ``` @@ -279,11 +282,11 @@ Workflow applications offers non-session support and is ideal for translation, a data = { "inputs": { - "orig_mail": { + "orig_mail": [{ "transfer_method": "local_file", "upload_file_id": file_id, "type": "document" - } + }] }, "response_mode": response_mode, "user": user diff --git a/web/app/components/develop/template/template_workflow.ja.mdx b/web/app/components/develop/template/template_workflow.ja.mdx index 9e66973db7..3bd56807ae 100644 --- a/web/app/components/develop/template/template_workflow.ja.mdx +++ b/web/app/components/develop/template/template_workflow.ja.mdx @@ -43,18 +43,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - `inputs` (object) 必須 アプリで定義されたさまざまな変数値の入力を許可します。 `inputs`パラメータには複数のキー/値ペアが含まれ、各キーは特定の変数に対応し、各値はその変数の特定の値です。 - ワークフローアプリケーションは少なくとも1つのキー/値ペアの入力を必要とします。 - 変数がファイルタイプの場合、以下の`files`で説明されているキーを持つオブジェクトを指定してください。 - - `response_mode` (string) 必須 - 応答の返却モードを指定します。サポートされているモード: - - `streaming` ストリーミングモード(推奨)、SSE([Server-Sent Events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events))を通じてタイプライターのような出力を実装します。 - - `blocking` ブロッキングモード、実行完了後に結果を返します。(プロセスが長い場合、リクエストが中断される可能性があります) - Cloudflareの制限により、100秒後に応答がない場合、リクエストは中断されます。 - - `user` (string) 必須 - ユーザー識別子、エンドユーザーのアイデンティティを定義するために使用されます。 - アプリケーション内で開発者によって一意に定義される必要があります。 - - `files` (array[object]) オプション + 
ワークフローアプリケーションは少なくとも1つのキー/値ペアの入力を必要とします。値はファイルリストである場合もあります。 ファイルリストは、テキスト理解と質問への回答を組み合わせたファイルの入力に適しています。モデルがファイルの解析と理解機能をサポートしている場合にのみ使用できます。 + + 変数がファイルリストの場合、リストの各要素は以下の属性を持つ必要があります。 - `type` (string) サポートされているタイプ: - `document` ('TXT', 'MD', 'MARKDOWN', 'PDF', 'HTML', 'XLSX', 'XLS', 'DOCX', 'CSV', 'EML', 'MSG', 'PPTX', 'PPT', 'XML', 'EPUB') - `image` ('JPG', 'JPEG', 'PNG', 'GIF', 'WEBP', 'SVG') @@ -65,6 +57,17 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - `url` (string) 画像URL(転送方法が`remote_url`の場合) - `upload_file_id` (string) アップロードされたファイルID、事前にファイルアップロードAPIを通じて取得する必要があります(転送方法が`local_file`の場合) + - `response_mode` (string) 必須 + 応答の返却モードを指定します。サポートされているモード: + - `streaming` ストリーミングモード(推奨)、SSE([Server-Sent Events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events))を通じてタイプライターのような出力を実装します。 + - `blocking` ブロッキングモード、実行完了後に結果を返します。(プロセスが長い場合、リクエストが中断される可能性があります) + Cloudflareの制限により、100秒後に応答がない場合、リクエストは中断されます。 + - `user` (string) 必須 + ユーザー識別子、エンドユーザーのアイデンティティを定義するために使用されます。 + アプリケーション内で開発者によって一意に定義される必要があります。 + - `files` (array[object]) オプション + + ### 応答 `response_mode`が`blocking`の場合、CompletionResponseオブジェクトを返します。 `response_mode`が`streaming`の場合、ChunkCompletionResponseストリームを返します。 @@ -194,11 +197,14 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from ```json {{ title: 'ファイル変数の例' }} { "inputs": { - "{variable_name}": { + "{variable_name}": + [ + { "transfer_method": "local_file", "upload_file_id": "{upload_file_id}", "type": "{document_type}" - } + } + ] } } ``` @@ -279,11 +285,11 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from data = { "inputs": { - "orig_mail": { + "orig_mail": [{ "transfer_method": "local_file", "upload_file_id": file_id, "type": "document" - } + }] }, "response_mode": response_mode, "user": user diff --git a/web/app/components/develop/template/template_workflow.zh.mdx 
b/web/app/components/develop/template/template_workflow.zh.mdx index 40dfb863a0..c687fa1a51 100644 --- a/web/app/components/develop/template/template_workflow.zh.mdx +++ b/web/app/components/develop/template/template_workflow.zh.mdx @@ -41,18 +41,8 @@ Workflow 应用无会话支持,适合用于翻译/文章写作/总结 AI 等 ### Request Body - `inputs` (object) Required 允许传入 App 定义的各变量值。 - inputs 参数包含了多组键值对(Key/Value pairs),每组的键对应一个特定变量,每组的值则是该变量的具体值。 - 如果变量是文件类型,请指定一个包含以下 `files` 中所述键的对象。 - - `response_mode` (string) Required - 返回响应模式,支持: - - `streaming` 流式模式(推荐)。基于 SSE(**[Server-Sent Events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events)**)实现类似打字机输出方式的流式返回。 - - `blocking` 阻塞模式,等待执行完毕后返回结果。(请求若流程较长可能会被中断)。 - 由于 Cloudflare 限制,请求会在 100 秒超时无返回后中断。 - - `user` (string) Required - 用户标识,用于定义终端用户的身份,方便检索、统计。 - 由开发者定义规则,需保证用户标识在应用内唯一。 - - `files` (array[object]) Optional - 文件列表,适用于传入文件结合文本理解并回答问题,仅当模型支持该类型文件解析能力时可用。 + inputs 参数包含了多组键值对(Key/Value pairs),每组的键对应一个特定变量,每组的值则是该变量的具体值。变量可以是文件列表类型。 + 文件列表类型变量适用于传入文件结合文本理解并回答问题,仅当模型支持该类型文件解析能力时可用。如果该变量是文件列表类型,该变量对应的值应是列表格式,其中每个元素应包含以下内容: - `type` (string) 支持类型: - `document` 具体类型包含:'TXT', 'MD', 'MARKDOWN', 'PDF', 'HTML', 'XLSX', 'XLS', 'DOCX', 'CSV', 'EML', 'MSG', 'PPTX', 'PPT', 'XML', 'EPUB' - `image` 具体类型包含:'JPG', 'JPEG', 'PNG', 'GIF', 'WEBP', 'SVG' @@ -62,6 +52,15 @@ Workflow 应用无会话支持,适合用于翻译/文章写作/总结 AI 等 - `transfer_method` (string) 传递方式,`remote_url` 图片地址 / `local_file` 上传文件 - `url` (string) 图片地址(仅当传递方式为 `remote_url` 时) - `upload_file_id` (string) (string) 上传文件 ID(仅当传递方式为 `local_file` 时) + - `response_mode` (string) Required + 返回响应模式,支持: + - `streaming` 流式模式(推荐)。基于 SSE(**[Server-Sent Events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events)**)实现类似打字机输出方式的流式返回。 + - `blocking` 阻塞模式,等待执行完毕后返回结果。(请求若流程较长可能会被中断)。 + 由于 Cloudflare 限制,请求会在 100 秒超时无返回后中断。 + - `user` (string) Required + 用户标识,用于定义终端用户的身份,方便检索、统计。 + 由开发者定义规则,需保证用户标识在应用内唯一。 + ### Response 当 `response_mode` 为 `blocking` 时,返回 
CompletionResponse object。 @@ -184,15 +183,18 @@ Workflow 应用无会话支持,适合用于翻译/文章写作/总结 AI 等 }' ``` - + ```json {{ title: 'File variable example' }} { "inputs": { - "{variable_name}": { + "{variable_name}": + [ + { "transfer_method": "local_file", "upload_file_id": "{upload_file_id}", "type": "{document_type}" - } + } + ] } } ``` @@ -273,11 +275,11 @@ Workflow 应用无会话支持,适合用于翻译/文章写作/总结 AI 等 data = { "inputs": { - "orig_mail": { + "orig_mail": [{ "transfer_method": "local_file", "upload_file_id": file_id, "type": "document" - } + }] }, "response_mode": response_mode, "user": user From 79118f51c2522bf518becbee74b02e96eb09685f Mon Sep 17 00:00:00 2001 From: Ning Date: Thu, 20 Mar 2025 09:38:46 +0800 Subject: [PATCH 4/7] fix: dify-web docker MAX_TOOLS_NUM environment value not work (#16241) --- web/docker/entrypoint.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/web/docker/entrypoint.sh b/web/docker/entrypoint.sh index d0ee56b889..797b61081a 100755 --- a/web/docker/entrypoint.sh +++ b/web/docker/entrypoint.sh @@ -27,5 +27,6 @@ export NEXT_PUBLIC_TEXT_GENERATION_TIMEOUT_MS=${TEXT_GENERATION_TIMEOUT_MS} export NEXT_PUBLIC_CSP_WHITELIST=${CSP_WHITELIST} export NEXT_PUBLIC_TOP_K_MAX_VALUE=${TOP_K_MAX_VALUE} export NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH} +export NEXT_PUBLIC_MAX_TOOLS_NUM=${MAX_TOOLS_NUM} pm2 start /app/web/server.js --name dify-web --cwd /app/web -i ${PM2_INSTANCES} --no-daemon From c1f3d968bfa36f44ccf23860fd7f8d80f2423407 Mon Sep 17 00:00:00 2001 From: GuanMu Date: Thu, 20 Mar 2025 10:55:37 +0800 Subject: [PATCH 5/7] fix: enhance React imports in LLM panel component #16282 (#16283) --- web/app/components/workflow/nodes/llm/panel.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/web/app/components/workflow/nodes/llm/panel.tsx b/web/app/components/workflow/nodes/llm/panel.tsx index c655188536..e1264ad89e 100644 --- a/web/app/components/workflow/nodes/llm/panel.tsx +++ 
b/web/app/components/workflow/nodes/llm/panel.tsx @@ -1,5 +1,5 @@ import type { FC } from 'react' -import React from 'react' +import React, { useCallback } from 'react' import { useTranslation } from 'react-i18next' import MemoryConfig from '../_base/components/memory-config' import VarReferencePicker from '../_base/components/variable/var-reference-picker' From 2c9af712a2936df8b020c3825d4e85be09801c4f Mon Sep 17 00:00:00 2001 From: Jyong <76649700+JohnJyong@users.noreply.github.com> Date: Thu, 20 Mar 2025 14:33:32 +0800 Subject: [PATCH 6/7] Fix/create document by api with metadata (#16307) Co-authored-by: zxhlyh --- api/commands.py | 79 ++++++- .../service_api/dataset/document.py | 96 -------- api/extensions/ext_commands.py | 6 +- api/services/dataset_service.py | 15 -- .../knowledge_entities/knowledge_entities.py | 1 - .../datasets/template/template.en.mdx | 200 ----------------- .../datasets/template/template.zh.mdx | 205 ------------------ 7 files changed, 75 insertions(+), 527 deletions(-) diff --git a/api/commands.py b/api/commands.py index df67f29aff..94e7e74e36 100644 --- a/api/commands.py +++ b/api/commands.py @@ -20,7 +20,7 @@ from libs.helper import email as email_validate from libs.password import hash_password, password_pattern, valid_password from libs.rsa import generate_key_pair from models import Tenant -from models.dataset import Dataset, DatasetCollectionBinding, DocumentSegment +from models.dataset import Dataset, DatasetCollectionBinding, DatasetMetadata, DatasetMetadataBinding, DocumentSegment from models.dataset import Document as DatasetDocument from models.model import Account, App, AppAnnotationSetting, AppMode, Conversation, MessageAnnotation from models.provider import Provider, ProviderModel @@ -483,14 +483,11 @@ def convert_to_agent_apps(): click.echo(click.style("Conversion complete. 
Converted {} agent apps.".format(len(proceeded_app_ids)), fg="green")) -@click.command("add-qdrant-doc-id-index", help="Add Qdrant doc_id index.") +@click.command("add-qdrant-index", help="Add Qdrant index.") @click.option("--field", default="metadata.doc_id", prompt=False, help="Index field , default is metadata.doc_id.") -def add_qdrant_doc_id_index(field: str): - click.echo(click.style("Starting Qdrant doc_id index creation.", fg="green")) - vector_type = dify_config.VECTOR_STORE - if vector_type != "qdrant": - click.echo(click.style("This command only supports Qdrant vector store.", fg="red")) - return +def add_qdrant_index(field: str): + click.echo(click.style("Starting Qdrant index creation.", fg="green")) + create_count = 0 try: @@ -539,6 +536,72 @@ def add_qdrant_doc_id_index(field: str): click.echo(click.style(f"Index creation complete. Created {create_count} collection indexes.", fg="green")) +@click.command("old-metadata-migration", help="Old metadata migration.") +def old_metadata_migration(): + """ + Backfill DatasetMetadata and DatasetMetadataBinding rows from each document's legacy doc_metadata keys, 50 documents per page.
+ """ + click.echo(click.style("Starting old metadata migration.", fg="green")) + + page = 1 + while True: + try: + documents = ( + DatasetDocument.query.filter(DatasetDocument.doc_metadata.is_not(None)) + .order_by(DatasetDocument.created_at.desc()) + .paginate(page=page, per_page=50) + ) + except NotFound: + break + if not documents: + break + for document in documents: + if document.doc_metadata: + doc_metadata = document.doc_metadata + for key, value in doc_metadata.items(): + dataset_metadata = ( + db.session.query(DatasetMetadata) + .filter(DatasetMetadata.dataset_id == document.dataset_id, DatasetMetadata.name == key) + .first() + ) + if not dataset_metadata: + dataset_metadata = DatasetMetadata( + tenant_id=document.tenant_id, + dataset_id=document.dataset_id, + name=key, + type="string", + created_by=document.created_by, + ) + db.session.add(dataset_metadata) + db.session.flush() + dataset_metadata_binding = DatasetMetadataBinding( + tenant_id=document.tenant_id, + dataset_id=document.dataset_id, + metadata_id=dataset_metadata.id, + document_id=document.id, + created_by=document.created_by, + ) + db.session.add(dataset_metadata_binding) + else: + dataset_metadata_binding = DatasetMetadataBinding.query.filter( + DatasetMetadataBinding.dataset_id == document.dataset_id, + DatasetMetadataBinding.document_id == document.id, + DatasetMetadataBinding.metadata_id == dataset_metadata.id, + ).first() + if not dataset_metadata_binding: + dataset_metadata_binding = DatasetMetadataBinding( + tenant_id=document.tenant_id, + dataset_id=document.dataset_id, + metadata_id=dataset_metadata.id, + document_id=document.id, + created_by=document.created_by, + ) + db.session.add(dataset_metadata_binding) + db.session.commit() + page += 1 + click.echo(click.style("Old metadata migration completed.", fg="green")) + + @click.command("create-tenant", help="Create account and tenant.") @click.option("--email", prompt=True, help="Tenant account email.") @click.option("--name", 
prompt=True, help="Workspace name.") diff --git a/api/controllers/service_api/dataset/document.py b/api/controllers/service_api/dataset/document.py index d4e67b6596..995444ee48 100644 --- a/api/controllers/service_api/dataset/document.py +++ b/api/controllers/service_api/dataset/document.py @@ -18,7 +18,6 @@ from controllers.service_api.app.error import ( from controllers.service_api.dataset.error import ( ArchivedDocumentImmutableError, DocumentIndexingError, - InvalidMetadataError, ) from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check from core.errors.error import ProviderTokenNotInitError @@ -51,8 +50,6 @@ class DocumentAddByTextApi(DatasetApiResource): "indexing_technique", type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False, location="json" ) parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") - parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json") - parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json") args = parser.parse_args() dataset_id = str(dataset_id) @@ -65,28 +62,6 @@ class DocumentAddByTextApi(DatasetApiResource): if not dataset.indexing_technique and not args["indexing_technique"]: raise ValueError("indexing_technique is required.") - # Validate metadata if provided - if args.get("doc_type") or args.get("doc_metadata"): - if not args.get("doc_type") or not args.get("doc_metadata"): - raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") - - if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: - raise InvalidMetadataError( - "Invalid doc_type. 
Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) - ) - - if not isinstance(args["doc_metadata"], dict): - raise InvalidMetadataError("doc_metadata must be a dictionary") - - # Validate metadata schema based on doc_type - if args["doc_type"] != "others": - metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] - for key, value in args["doc_metadata"].items(): - if key in metadata_schema and not isinstance(value, metadata_schema[key]): - raise InvalidMetadataError(f"Invalid type for metadata field {key}") - # set to MetaDataConfig - args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} - text = args.get("text") name = args.get("name") if text is None or name is None: @@ -133,8 +108,6 @@ class DocumentUpdateByTextApi(DatasetApiResource): "doc_language", type=str, default="English", required=False, nullable=False, location="json" ) parser.add_argument("retrieval_model", type=dict, required=False, nullable=False, location="json") - parser.add_argument("doc_type", type=str, required=False, nullable=True, location="json") - parser.add_argument("doc_metadata", type=dict, required=False, nullable=True, location="json") args = parser.parse_args() dataset_id = str(dataset_id) tenant_id = str(tenant_id) @@ -146,29 +119,6 @@ class DocumentUpdateByTextApi(DatasetApiResource): # indexing_technique is already set in dataset since this is an update args["indexing_technique"] = dataset.indexing_technique - # Validate metadata if provided - if args.get("doc_type") or args.get("doc_metadata"): - if not args.get("doc_type") or not args.get("doc_metadata"): - raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") - - if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: - raise InvalidMetadataError( - "Invalid doc_type. 
Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) - ) - - if not isinstance(args["doc_metadata"], dict): - raise InvalidMetadataError("doc_metadata must be a dictionary") - - # Validate metadata schema based on doc_type - if args["doc_type"] != "others": - metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] - for key, value in args["doc_metadata"].items(): - if key in metadata_schema and not isinstance(value, metadata_schema[key]): - raise InvalidMetadataError(f"Invalid type for metadata field {key}") - - # set to MetaDataConfig - args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} - if args["text"]: text = args.get("text") name = args.get("name") @@ -216,29 +166,6 @@ class DocumentAddByFileApi(DatasetApiResource): if "doc_language" not in args: args["doc_language"] = "English" - # Validate metadata if provided - if args.get("doc_type") or args.get("doc_metadata"): - if not args.get("doc_type") or not args.get("doc_metadata"): - raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") - - if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: - raise InvalidMetadataError( - "Invalid doc_type. 
Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) - ) - - if not isinstance(args["doc_metadata"], dict): - raise InvalidMetadataError("doc_metadata must be a dictionary") - - # Validate metadata schema based on doc_type - if args["doc_type"] != "others": - metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] - for key, value in args["doc_metadata"].items(): - if key in metadata_schema and not isinstance(value, metadata_schema[key]): - raise InvalidMetadataError(f"Invalid type for metadata field {key}") - - # set to MetaDataConfig - args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} - # get dataset info dataset_id = str(dataset_id) tenant_id = str(tenant_id) @@ -306,29 +233,6 @@ class DocumentUpdateByFileApi(DatasetApiResource): if "doc_language" not in args: args["doc_language"] = "English" - # Validate metadata if provided - if args.get("doc_type") or args.get("doc_metadata"): - if not args.get("doc_type") or not args.get("doc_metadata"): - raise InvalidMetadataError("Both doc_type and doc_metadata must be provided when adding metadata") - - if args["doc_type"] not in DocumentService.DOCUMENT_METADATA_SCHEMA: - raise InvalidMetadataError( - "Invalid doc_type. 
Must be one of: " + ", ".join(DocumentService.DOCUMENT_METADATA_SCHEMA.keys()) - ) - - if not isinstance(args["doc_metadata"], dict): - raise InvalidMetadataError("doc_metadata must be a dictionary") - - # Validate metadata schema based on doc_type - if args["doc_type"] != "others": - metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[args["doc_type"]] - for key, value in args["doc_metadata"].items(): - if key in metadata_schema and not isinstance(value, metadata_schema[key]): - raise InvalidMetadataError(f"Invalid type for metadata field {key}") - - # set to MetaDataConfig - args["metadata"] = {"doc_type": args["doc_type"], "doc_metadata": args["doc_metadata"]} - # get dataset info dataset_id = str(dataset_id) tenant_id = str(tenant_id) diff --git a/api/extensions/ext_commands.py b/api/extensions/ext_commands.py index 3f5ae539c5..92996f75e5 100644 --- a/api/extensions/ext_commands.py +++ b/api/extensions/ext_commands.py @@ -3,7 +3,7 @@ from dify_app import DifyApp def init_app(app: DifyApp): from commands import ( - add_qdrant_doc_id_index, + add_qdrant_index, convert_to_agent_apps, create_tenant, extract_plugins, @@ -11,6 +11,7 @@ def init_app(app: DifyApp): fix_app_site_missing, install_plugins, migrate_data_for_plugin, + old_metadata_migration, reset_email, reset_encrypt_key_pair, reset_password, @@ -24,7 +25,7 @@ def init_app(app: DifyApp): reset_encrypt_key_pair, vdb_migrate, convert_to_agent_apps, - add_qdrant_doc_id_index, + add_qdrant_index, create_tenant, upgrade_db, fix_app_site_missing, @@ -32,6 +33,7 @@ def init_app(app: DifyApp): extract_plugins, extract_unique_plugins, install_plugins, + old_metadata_migration, ] for cmd in cmds_to_register: app.cli.add_command(cmd) diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index 7ce4e4af22..d3654a3d48 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -46,7 +46,6 @@ from models.source import DataSourceOauthBinding from 
services.entities.knowledge_entities.knowledge_entities import ( ChildChunkUpdateArgs, KnowledgeConfig, - MetaDataConfig, RerankingModel, RetrievalModel, SegmentUpdateArgs, @@ -999,9 +998,6 @@ class DocumentService: document.data_source_info = json.dumps(data_source_info) document.batch = batch document.indexing_status = "waiting" - if knowledge_config.metadata: - document.doc_type = knowledge_config.metadata.doc_type - document.metadata = knowledge_config.metadata.doc_metadata db.session.add(document) documents.append(document) duplicate_document_ids.append(document.id) @@ -1018,7 +1014,6 @@ class DocumentService: account, file_name, batch, - knowledge_config.metadata, ) db.session.add(document) db.session.flush() @@ -1076,7 +1071,6 @@ class DocumentService: account, truncated_page_name, batch, - knowledge_config.metadata, ) db.session.add(document) db.session.flush() @@ -1117,7 +1111,6 @@ class DocumentService: account, document_name, batch, - knowledge_config.metadata, ) db.session.add(document) db.session.flush() @@ -1155,7 +1148,6 @@ class DocumentService: account: Account, name: str, batch: str, - metadata: Optional[MetaDataConfig] = None, ): document = Document( tenant_id=dataset.tenant_id, @@ -1180,9 +1172,6 @@ class DocumentService: BuiltInField.last_update_date: datetime.datetime.now(datetime.UTC).strftime("%Y-%m-%d %H:%M:%S"), BuiltInField.source: data_source_type, } - if metadata is not None: - doc_metadata.update(metadata.doc_metadata) - document.doc_type = metadata.doc_type if doc_metadata: document.doc_metadata = doc_metadata return document @@ -1297,10 +1286,6 @@ class DocumentService: # update document name if document_data.name: document.name = document_data.name - # update doc_type and doc_metadata if provided - if document_data.metadata is not None: - document.doc_metadata = document_data.metadata.doc_metadata - document.doc_type = document_data.metadata.doc_type # update document to be waiting document.indexing_status = "waiting" 
document.completed_at = None diff --git a/api/services/entities/knowledge_entities/knowledge_entities.py b/api/services/entities/knowledge_entities/knowledge_entities.py index 37c0fb49e5..51ce596e5c 100644 --- a/api/services/entities/knowledge_entities/knowledge_entities.py +++ b/api/services/entities/knowledge_entities/knowledge_entities.py @@ -128,7 +128,6 @@ class KnowledgeConfig(BaseModel): embedding_model: Optional[str] = None embedding_model_provider: Optional[str] = None name: Optional[str] = None - metadata: Optional[MetaDataConfig] = None class SegmentUpdateArgs(BaseModel): diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx index 7d32f8cebe..a5f4c40ef6 100644 --- a/web/app/(commonLayout)/datasets/template/template.en.mdx +++ b/web/app/(commonLayout)/datasets/template/template.en.mdx @@ -47,44 +47,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi Document content - - Type of document (optional): - - book Book - - web_page Web page - - paper Academic paper/article - - social_media_post Social media post - - wikipedia_entry Wikipedia entry - - personal_document Personal document - - business_document Business document - - im_chat_log Chat log - - synced_from_notion Notion document - - synced_from_github GitHub document - - others Other document types - - - Document metadata (required if doc_type is provided). 
Fields vary by doc_type: - For book: - - title Book title - - language Book language - - author Book author - - publisher Publisher name - - publication_date Publication date - - isbn ISBN number - - category Book category - - For web_page: - - title Page title - - url Page URL - - language Page language - - publish_date Publish date - - author/publisher Author or publisher - - topic/keywords Topic or keywords - - description Page description - - Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type. - - For doc_type "others", any valid JSON object is accepted - Index mode - high_quality High quality: embedding using embedding model, built as vector database index @@ -233,68 +195,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi - hierarchical_model Parent-child mode - qa_model Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions - - doc_type Type of document (optional) - - book Book - Document records a book or publication - - web_page Web page - Document records web page content - - paper Academic paper/article - Document records academic paper or research article - - social_media_post Social media post - Content from social media posts - - wikipedia_entry Wikipedia entry - Content from Wikipedia entries - - personal_document Personal document - Documents related to personal content - - business_document Business document - Documents related to business content - - im_chat_log Chat log - Records of instant messaging chats - - synced_from_notion Notion document - Documents synchronized from Notion - - synced_from_github GitHub document - Documents synchronized from GitHub - - others Other document types - Other document types not listed above - - - doc_metadata Document metadata (required if doc_type is provided) - Fields vary by doc_type: - - For book: - - title 
Book title - Title of the book - - language Book language - Language of the book - - author Book author - Author of the book - - publisher Publisher name - Name of the publishing house - - publication_date Publication date - Date when the book was published - - isbn ISBN number - International Standard Book Number - - category Book category - Category or genre of the book - - For web_page: - - title Page title - Title of the web page - - url Page URL - URL address of the web page - - language Page language - Language of the web page - - publish_date Publish date - Date when the web page was published - - author/publisher Author or publisher - Author or publisher of the web page - - topic/keywords Topic or keywords - Topics or keywords of the web page - - description Page description - Description of the web page content - - Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type. - For doc_type "others", any valid JSON object is accepted - - doc_language In Q&A mode, specify the language of the document, for example: English, Chinese - process_rule Processing rules @@ -407,44 +307,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi Knowledge description (optional) - - Type of document (optional): - - book Book - - web_page Web page - - paper Academic paper/article - - social_media_post Social media post - - wikipedia_entry Wikipedia entry - - personal_document Personal document - - business_document Business document - - im_chat_log Chat log - - synced_from_notion Notion document - - synced_from_github GitHub document - - others Other document types - - - Document metadata (required if doc_type is provided). 
Fields vary by doc_type: - For book: - - title Book title - - language Book language - - author Book author - - publisher Publisher name - - publication_date Publication date - - isbn ISBN number - - category Book category - - For web_page: - - title Page title - - url Page URL - - language Page language - - publish_date Publish date - - author/publisher Author or publisher - - topic/keywords Topic or keywords - - description Page description - - Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type. - - For doc_type "others", any valid JSON object is accepted - Index technique (optional) - high_quality High quality @@ -762,67 +624,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi - separator Segmentation identifier. Currently, only one delimiter is allowed. The default is *** - max_tokens The maximum length (tokens) must be validated to be shorter than the length of the parent chunk - chunk_overlap Define the overlap between adjacent chunks (optional) - - doc_type Type of document (optional) - - book Book - Document records a book or publication - - web_page Web page - Document records web page content - - paper Academic paper/article - Document records academic paper or research article - - social_media_post Social media post - Content from social media posts - - wikipedia_entry Wikipedia entry - Content from Wikipedia entries - - personal_document Personal document - Documents related to personal content - - business_document Business document - Documents related to business content - - im_chat_log Chat log - Records of instant messaging chats - - synced_from_notion Notion document - Documents synchronized from Notion - - synced_from_github GitHub document - Documents synchronized from GitHub - - others Other document types - Other document types not listed above - - - doc_metadata Document 
metadata (required if doc_type is provided) - Fields vary by doc_type: - - For book: - - title Book title - Title of the book - - language Book language - Language of the book - - author Book author - Author of the book - - publisher Publisher name - Name of the publishing house - - publication_date Publication date - Date when the book was published - - isbn ISBN number - International Standard Book Number - - category Book category - Category or genre of the book - - For web_page: - - title Page title - Title of the web page - - url Page URL - URL address of the web page - - language Page language - Language of the web page - - publish_date Publish date - Date when the web page was published - - author/publisher Author or publisher - Author or publisher of the web page - - topic/keywords Topic or keywords - Topics or keywords of the web page - - description Page description - Description of the web page content - - Please check [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) for more details on the fields required for each doc_type. 
- For doc_type "others", any valid JSON object is accepted @@ -1528,7 +1329,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2", "data_source_type": "upload_file", "name": "readme.txt", - "doc_type": null } }, "score": 3.730463140527718e-05, diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx index 8bd3d8d5eb..282849f3db 100644 --- a/web/app/(commonLayout)/datasets/template/template.zh.mdx +++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx @@ -47,46 +47,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi 文档内容 - - 文档类型(选填) - - book 图书 Book - - web_page 网页 Web page - - paper 学术论文/文章 Academic paper/article - - social_media_post 社交媒体帖子 Social media post - - wikipedia_entry 维基百科条目 Wikipedia entry - - personal_document 个人文档 Personal document - - business_document 商业文档 Business document - - im_chat_log 即时通讯记录 Chat log - - synced_from_notion Notion同步文档 Notion document - - synced_from_github GitHub同步文档 GitHub document - - others 其他文档类型 Other document types - - - - 文档元数据(如提供文档类型则必填)。字段因文档类型而异: - - 针对图书 For book: - - title 书名 Book title - - language 图书语言 Book language - - author 作者 Book author - - publisher 出版社 Publisher name - - publication_date 出版日期 Publication date - - isbn ISBN号码 ISBN number - - category 图书分类 Book category - - 针对网页 For web_page: - - title 页面标题 Page title - - url 页面网址 Page URL - - language 页面语言 Page language - - publish_date 发布日期 Publish date - - author/publisher 作者/发布者 Author or publisher - - topic/keywords 主题/关键词 Topic or keywords - - description 页面描述 Page description - - 请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。 - - 针对"其他"类型文档,接受任何有效的JSON对象 - 索引方式 - high_quality 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引 @@ -234,68 +194,6 @@ import { Row, Col, Properties, Property, 
Heading, SubProperty, PropertyInstructi - text_model text 文档直接 embedding,经济模式默认为该模式 - hierarchical_model parent-child 模式 - qa_model Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding - - doc_type 文档类型(选填)Type of document (optional) - - book 图书 - 文档记录一本书籍或出版物 - - web_page 网页 - 网页内容的文档记录 - - paper 学术论文/文章 - 学术论文或研究文章的记录 - - social_media_post 社交媒体帖子 - 社交媒体上的帖子内容 - - wikipedia_entry 维基百科条目 - 维基百科的词条内容 - - personal_document 个人文档 - 个人相关的文档记录 - - business_document 商业文档 - 商业相关的文档记录 - - im_chat_log 即时通讯记录 - 即时通讯的聊天记录 - - synced_from_notion Notion同步文档 - 从Notion同步的文档内容 - - synced_from_github GitHub同步文档 - 从GitHub同步的文档内容 - - others 其他文档类型 - 其他未列出的文档类型 - - - doc_metadata 文档元数据(如提供文档类型则必填 - 字段因文档类型而异 - - 针对图书类型 For book: - - title 书名 - 书籍的标题 - - language 图书语言 - 书籍的语言 - - author 作者 - 书籍的作者 - - publisher 出版社 - 出版社的名称 - - publication_date 出版日期 - 书籍的出版日期 - - isbn ISBN号码 - 书籍的ISBN编号 - - category 图书分类 - 书籍的分类类别 - - 针对网页类型 For web_page: - - title 页面标题 - 网页的标题 - - url 页面网址 - 网页的URL地址 - - language 页面语言 - 网页的语言 - - publish_date 发布日期 - 网页的发布日期 - - author/publisher 作者/发布者 - 网页的作者或发布者 - - topic/keywords 主题/关键词 - 网页的主题或关键词 - - description 页面描述 - 网页的描述信息 - - 请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。 - - 针对"其他"类型文档,接受任何有效的JSON对象 - doc_language 在 Q&A 模式下,指定文档的语言,例如:EnglishChinese @@ -606,46 +504,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi 文档内容(选填) - - 文档类型(选填) - - book 图书 Book - - web_page 网页 Web page - - paper 学术论文/文章 Academic paper/article - - social_media_post 社交媒体帖子 Social media post - - wikipedia_entry 维基百科条目 Wikipedia entry - - personal_document 个人文档 Personal document - - business_document 商业文档 Business document - - im_chat_log 即时通讯记录 Chat log - - synced_from_notion Notion同步文档 Notion document - - synced_from_github GitHub同步文档 GitHub document - - others 其他文档类型 Other document types - - - - 文档元数据(如提供文档类型则必填)。字段因文档类型而异: - - 针对图书 For book: - - title 书名 Book title - - 
language 图书语言 Book language - - author 作者 Book author - - publisher 出版社 Publisher name - - publication_date 出版日期 Publication date - - isbn ISBN号码 ISBN number - - category 图书分类 Book category - - 针对网页 For web_page: - - title 页面标题 Page title - - url 页面网址 Page URL - - language 页面语言 Page language - - publish_date 发布日期 Publish date - - author/publisher 作者/发布者 Author or publisher - - topic/keywords 主题/关键词 Topic or keywords - - description 页面描述 Page description - - 请查看 [api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。 - - 针对"其他"类型文档,接受任何有效的JSON对象 - 处理规则(选填) - mode (string) 清洗、分段模式 ,automatic 自动 / custom 自定义 @@ -766,68 +624,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi - separator 分段标识符,目前仅允许设置一个分隔符。默认为 *** - max_tokens 最大长度 (token) 需要校验小于父级的长度 - chunk_overlap 分段重叠指的是在对数据进行分段时,段与段之间存在一定的重叠部分(选填) - - doc_type 文档类型(选填)Type of document (optional) - - book 图书 - 文档记录一本书籍或出版物 - - web_page 网页 - 网页内容的文档记录 - - paper 学术论文/文章 - 学术论文或研究文章的记录 - - social_media_post 社交媒体帖子 - 社交媒体上的帖子内容 - - wikipedia_entry 维基百科条目 - 维基百科的词条内容 - - personal_document 个人文档 - 个人相关的文档记录 - - business_document 商业文档 - 商业相关的文档记录 - - im_chat_log 即时通讯记录 - 即时通讯的聊天记录 - - synced_from_notion Notion同步文档 - 从Notion同步的文档内容 - - synced_from_github GitHub同步文档 - 从GitHub同步的文档内容 - - others 其他文档类型 - 其他未列出的文档类型 - - - doc_metadata 文档元数据(如提供文档类型则必填 - 字段因文档类型而异 - - 针对图书类型 For book: - - title 书名 - 书籍的标题 - - language 图书语言 - 书籍的语言 - - author 作者 - 书籍的作者 - - publisher 出版社 - 出版社的名称 - - publication_date 出版日期 - 书籍的出版日期 - - isbn ISBN号码 - 书籍的ISBN编号 - - category 图书分类 - 书籍的分类类别 - - 针对网页类型 For web_page: - - title 页面标题 - 网页的标题 - - url 页面网址 - 网页的URL地址 - - language 页面语言 - 网页的语言 - - publish_date 发布日期 - 网页的发布日期 - - author/publisher 作者/发布者 - 网页的作者或发布者 - - topic/keywords 主题/关键词 - 网页的主题或关键词 - - description 页面描述 - 网页的描述信息 - - 请查看 
[api/services/dataset_service.py](https://github.com/langgenius/dify/blob/main/api/services/dataset_service.py#L475) 了解各文档类型所需字段的详细信息。 - - 针对"其他"类型文档,接受任何有效的JSON对象 @@ -1534,7 +1330,6 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi "id": "a8c6c36f-9f5d-4d7a-8472-f5d7b75d71d2", "data_source_type": "upload_file", "name": "readme.txt", - "doc_type": null } }, "score": 3.730463140527718e-05, From 3e84c77bbb361dbcea7e9d54ec583c23a236580a Mon Sep 17 00:00:00 2001 From: Jyong <76649700+JohnJyong@users.noreply.github.com> Date: Thu, 20 Mar 2025 14:38:32 +0800 Subject: [PATCH 7/7] fix enable dataset metadata built-in field when dataset is empty (#16290) --- api/services/metadata_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/api/services/metadata_service.py b/api/services/metadata_service.py index a43b970a39..4cd2f9e8cb 100644 --- a/api/services/metadata_service.py +++ b/api/services/metadata_service.py @@ -137,7 +137,7 @@ class MetadataService: doc_metadata[BuiltInField.source.value] = MetadataDataSource[document.data_source_type].value document.doc_metadata = doc_metadata db.session.add(document) - db.session.commit() + db.session.commit() except Exception: logging.exception("Enable built-in field failed") finally: