From 9709df1ed76f32eed3083d1e85075da6a3a4f8fc Mon Sep 17 00:00:00 2001 From: nut <1132402956@qq.com> Date: Tue, 31 Dec 2024 14:36:43 +0800 Subject: [PATCH 1/2] feat: Weaviate supports Chinese bm25 #12223 --- api/.env.example | 4 +++ api/configs/middleware/vdb/weaviate_config.py | 5 ++++ .../vdb/weaviate/weaviate_vector.py | 30 +++++++++++++------ docker/.env.example | 4 +++ docker/docker-compose-template.yaml | 2 ++ docker/docker-compose.yaml | 3 ++ docker/middleware.env.example | 4 +++ 7 files changed, 43 insertions(+), 9 deletions(-) diff --git a/api/.env.example b/api/.env.example index cc3e868717..1aaeb6d703 100644 --- a/api/.env.example +++ b/api/.env.example @@ -142,6 +142,10 @@ WEAVIATE_ENDPOINT=http://localhost:8080 WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih WEAVIATE_GRPC_ENABLED=false WEAVIATE_BATCH_SIZE=100 +#If it is a Chinese full-text index, please set it to 'gse' +#https://weaviate.io/developers/weaviate/config-refs/schema#gse-and-trigram-tokenization-methods +#https://pkg.go.dev/github.com/go-ego/gse#section-readme +WEAVIATE_TOKENIZATION= # Qdrant configuration, use `http://localhost:6333` for local mode or `https://your-qdrant-cluster-url.qdrant.io` for remote mode QDRANT_URL=http://localhost:6333 diff --git a/api/configs/middleware/vdb/weaviate_config.py b/api/configs/middleware/vdb/weaviate_config.py index 25000e8bde..b94664e4f4 100644 --- a/api/configs/middleware/vdb/weaviate_config.py +++ b/api/configs/middleware/vdb/weaviate_config.py @@ -28,3 +28,8 @@ class WeaviateConfig(BaseSettings): description="Number of objects to be processed in a single batch operation (default is 100)", default=100, ) + + WEAVIATE_TOKENIZATION: Optional[str] = Field( + description="Tokenization for Weaviate", + default=None, + ) \ No newline at end of file diff --git a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py index 68d043a19f..6f18879565 100644 --- 
a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py +++ b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py @@ -248,15 +248,27 @@ class WeaviateVector(BaseVector): return docs def _default_schema(self, index_name: str) -> dict: - return { - "class": index_name, - "properties": [ - { - "name": "text", - "dataType": ["text"], - } - ], - } + if dify_config.WEAVIATE_TOKENIZATION: + return { + "class": index_name, + "properties": [ + { + "name": "text", + "dataType": ["text"], + "tokenization": dify_config.WEAVIATE_TOKENIZATION + } + ], + } + else: + return { + "class": index_name, + "properties": [ + { + "name": "text", + "dataType": ["text"], + } + ], + } def _json_serializable(self, value: Any) -> Any: if isinstance(value, datetime.datetime): diff --git a/docker/.env.example b/docker/.env.example index 50ba856bd3..d494d63348 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -383,6 +383,10 @@ VECTOR_STORE=weaviate # The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`. WEAVIATE_ENDPOINT=http://weaviate:8080 WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih +#If it is a Chinese full-text index, please set it to 'gse' +#https://weaviate.io/developers/weaviate/config-refs/schema#gse-and-trigram-tokenization-methods +#https://pkg.go.dev/github.com/go-ego/gse#section-readme +WEAVIATE_TOKENIZATION= # The Qdrant endpoint URL. Only available when VECTOR_STORE is `qdrant`. QDRANT_URL=http://qdrant:6333 diff --git a/docker/docker-compose-template.yaml b/docker/docker-compose-template.yaml index d4e0ba49d0..1b960c139d 100644 --- a/docker/docker-compose-template.yaml +++ b/docker/docker-compose-template.yaml @@ -220,6 +220,7 @@ services: # The Weaviate vector store. 
weaviate: image: semitechnologies/weaviate:1.19.0 + # For a Chinese full-text index, update the Weaviate version, e.g. image: semitechnologies/weaviate:1.28.0 profiles: - '' - weaviate @@ -240,6 +241,7 @@ services: AUTHENTICATION_APIKEY_USERS: ${WEAVIATE_AUTHENTICATION_APIKEY_USERS:-hello@dify.ai} AUTHORIZATION_ADMINLIST_ENABLED: ${WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED:-true} AUTHORIZATION_ADMINLIST_USERS: ${WEAVIATE_AUTHORIZATION_ADMINLIST_USERS:-hello@dify.ai} + # For a Chinese full-text index, add ENABLE_TOKENIZER_GSE: 'true' here and set WEAVIATE_TOKENIZATION to 'gse' # Qdrant vector store. # (if used, you need to set VECTOR_STORE to qdrant in the api & worker service.) diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index b82659d959..be04925d6d 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -327,6 +327,7 @@ x-shared-env: &shared-api-worker-env WEAVIATE_AUTHENTICATION_APIKEY_USERS: ${WEAVIATE_AUTHENTICATION_APIKEY_USERS:-hello@dify.ai} WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED: ${WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED:-true} WEAVIATE_AUTHORIZATION_ADMINLIST_USERS: ${WEAVIATE_AUTHORIZATION_ADMINLIST_USERS:-hello@dify.ai} + WEAVIATE_TOKENIZATION: ${WEAVIATE_TOKENIZATION:-} CHROMA_SERVER_AUTHN_CREDENTIALS: ${CHROMA_SERVER_AUTHN_CREDENTIALS:-difyai123456} CHROMA_SERVER_AUTHN_PROVIDER: ${CHROMA_SERVER_AUTHN_PROVIDER:-chromadb.auth.token_authn.TokenAuthenticationServerProvider} CHROMA_IS_PERSISTENT: ${CHROMA_IS_PERSISTENT:-TRUE} @@ -607,6 +608,7 @@ services: # The Weaviate vector store.
weaviate: image: semitechnologies/weaviate:1.19.0 + # For a Chinese full-text index, update the Weaviate version, e.g. image: semitechnologies/weaviate:1.28.0 profiles: - '' - weaviate @@ -627,6 +629,7 @@ services: AUTHENTICATION_APIKEY_USERS: ${WEAVIATE_AUTHENTICATION_APIKEY_USERS:-hello@dify.ai} AUTHORIZATION_ADMINLIST_ENABLED: ${WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED:-true} AUTHORIZATION_ADMINLIST_USERS: ${WEAVIATE_AUTHORIZATION_ADMINLIST_USERS:-hello@dify.ai} + # For a Chinese full-text index, add ENABLE_TOKENIZER_GSE: 'true' here and set WEAVIATE_TOKENIZATION to 'gse' # Qdrant vector store. # (if used, you need to set VECTOR_STORE to qdrant in the api & worker service.) diff --git a/docker/middleware.env.example b/docker/middleware.env.example index c4ce9f0114..7dd974bc9a 100644 --- a/docker/middleware.env.example +++ b/docker/middleware.env.example @@ -78,6 +78,10 @@ WEAVIATE_AUTHENTICATION_APIKEY_USERS=hello@dify.ai WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED=true WEAVIATE_AUTHORIZATION_ADMINLIST_USERS=hello@dify.ai WEAVIATE_HOST_VOLUME=./volumes/weaviate +# For a Chinese full-text index, set WEAVIATE_TOKENIZATION to 'gse'. +#https://weaviate.io/developers/weaviate/config-refs/schema#gse-and-trigram-tokenization-methods +#https://pkg.go.dev/github.com/go-ego/gse#section-readme +WEAVIATE_TOKENIZATION= # ------------------------------ # Docker Compose Service Expose Host Port Configurations From 23c80f8ef30b84cc5f931f12d87d1901fc3b9382 Mon Sep 17 00:00:00 2001 From: nut <1132402956@qq.com> Date: Tue, 31 Dec 2024 15:06:40 +0800 Subject: [PATCH 2/2] feat: Weaviate supports Chinese bm25 #12223 --- api/configs/middleware/vdb/weaviate_config.py | 2 +- api/core/rag/datasource/vdb/weaviate/weaviate_vector.py | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/api/configs/middleware/vdb/weaviate_config.py b/api/configs/middleware/vdb/weaviate_config.py index b94664e4f4..26ca41da07 100644 --- a/api/configs/middleware/vdb/weaviate_config.py +++
b/api/configs/middleware/vdb/weaviate_config.py @@ -32,4 +32,4 @@ class WeaviateConfig(BaseSettings): WEAVIATE_TOKENIZATION: Optional[str] = Field( description="Tokenization for Weaviate", default=None, - ) \ No newline at end of file + ) diff --git a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py index 6f18879565..521eefadf4 100644 --- a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py +++ b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py @@ -252,11 +252,7 @@ class WeaviateVector(BaseVector): return { "class": index_name, "properties": [ - { - "name": "text", - "dataType": ["text"], - "tokenization": dify_config.WEAVIATE_TOKENIZATION - } + {"name": "text", "dataType": ["text"], "tokenization": dify_config.WEAVIATE_TOKENIZATION} ], } else: