diff --git a/api/.env.example b/api/.env.example index 151ed14120..3b4c1a13af 100644 --- a/api/.env.example +++ b/api/.env.example @@ -145,6 +145,10 @@ WEAVIATE_ENDPOINT=http://localhost:8080 WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih WEAVIATE_GRPC_ENABLED=false WEAVIATE_BATCH_SIZE=100 +# Tokenization method for the Weaviate text property. Set it to 'gse' for Chinese full-text indexing; leave it empty to use Weaviate's default. +#https://weaviate.io/developers/weaviate/config-refs/schema#gse-and-trigram-tokenization-methods +#https://pkg.go.dev/github.com/go-ego/gse#section-readme +WEAVIATE_TOKENIZATION= # Qdrant configuration, use `http://localhost:6333` for local mode or `https://your-qdrant-cluster-url.qdrant.io` for remote mode QDRANT_URL=http://localhost:6333 diff --git a/api/configs/middleware/vdb/weaviate_config.py b/api/configs/middleware/vdb/weaviate_config.py index 25000e8bde..26ca41da07 100644 --- a/api/configs/middleware/vdb/weaviate_config.py +++ b/api/configs/middleware/vdb/weaviate_config.py @@ -28,3 +28,8 @@ class WeaviateConfig(BaseSettings): description="Number of objects to be processed in a single batch operation (default is 100)", default=100, ) + + WEAVIATE_TOKENIZATION: Optional[str] = Field( + description="Tokenization for Weaviate", + default=None, + ) diff --git a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py index 01eaf947f1..536d73f380 100644 --- a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py +++ b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py @@ -258,15 +258,23 @@ class WeaviateVector(BaseVector): return docs def _default_schema(self, index_name: str) -> dict: - return { - "class": index_name, - "properties": [ - { - "name": "text", - "dataType": ["text"], - } - ], - } + if dify_config.WEAVIATE_TOKENIZATION: + return { + "class": index_name, + "properties": [ + {"name": "text", "dataType": ["text"], "tokenization": dify_config.WEAVIATE_TOKENIZATION} + ], + } + else: + return { + "class": index_name, + "properties": [ + {
+ "name": "text", + "dataType": ["text"], + } + ], + } def _json_serializable(self, value: Any) -> Any: if isinstance(value, datetime.datetime): diff --git a/docker/.env.example b/docker/.env.example index b295f5cdf0..6455f0d7d9 100644 --- a/docker/.env.example +++ b/docker/.env.example @@ -389,6 +389,10 @@ VECTOR_STORE=weaviate # The Weaviate endpoint URL. Only available when VECTOR_STORE is `weaviate`. WEAVIATE_ENDPOINT=http://weaviate:8080 WEAVIATE_API_KEY=WVF5YThaHlkYwhGUSmCRgsX3tD5ngdN8pkih +# Tokenization method for the Weaviate text property. Set it to 'gse' for Chinese full-text indexing; leave it empty to use Weaviate's default. +#https://weaviate.io/developers/weaviate/config-refs/schema#gse-and-trigram-tokenization-methods +#https://pkg.go.dev/github.com/go-ego/gse#section-readme +WEAVIATE_TOKENIZATION= # The Qdrant endpoint URL. Only available when VECTOR_STORE is `qdrant`. QDRANT_URL=http://qdrant:6333 diff --git a/docker/docker-compose-template.yaml b/docker/docker-compose-template.yaml index 44f725d343..e1ef8fd8be 100644 --- a/docker/docker-compose-template.yaml +++ b/docker/docker-compose-template.yaml @@ -243,6 +243,7 @@ services: # The Weaviate vector store. weaviate: image: semitechnologies/weaviate:1.19.0 + # image: semitechnologies/weaviate:1.28.0 # For Chinese full-text indexing (gse tokenization), use this newer Weaviate image instead profiles: - '' - weaviate @@ -263,6 +264,7 @@ services: AUTHENTICATION_APIKEY_USERS: ${WEAVIATE_AUTHENTICATION_APIKEY_USERS:-hello@dify.ai} AUTHORIZATION_ADMINLIST_ENABLED: ${WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED:-true} AUTHORIZATION_ADMINLIST_USERS: ${WEAVIATE_AUTHORIZATION_ADMINLIST_USERS:-hello@dify.ai} + # ENABLE_TOKENIZER_GSE: 'true' # Uncomment for Chinese full-text indexing, and set WEAVIATE_TOKENIZATION to 'gse' # Qdrant vector store. # (if used, you need to set VECTOR_STORE to qdrant in the api & worker service.)
diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index bb02b27062..428b97ae59 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -345,6 +345,7 @@ x-shared-env: &shared-api-worker-env WEAVIATE_AUTHENTICATION_APIKEY_USERS: ${WEAVIATE_AUTHENTICATION_APIKEY_USERS:-hello@dify.ai} WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED: ${WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED:-true} WEAVIATE_AUTHORIZATION_ADMINLIST_USERS: ${WEAVIATE_AUTHORIZATION_ADMINLIST_USERS:-hello@dify.ai} + WEAVIATE_TOKENIZATION: ${WEAVIATE_TOKENIZATION:-} CHROMA_SERVER_AUTHN_CREDENTIALS: ${CHROMA_SERVER_AUTHN_CREDENTIALS:-difyai123456} CHROMA_SERVER_AUTHN_PROVIDER: ${CHROMA_SERVER_AUTHN_PROVIDER:-chromadb.auth.token_authn.TokenAuthenticationServerProvider} CHROMA_IS_PERSISTENT: ${CHROMA_IS_PERSISTENT:-TRUE} @@ -673,6 +674,7 @@ services: # The Weaviate vector store. weaviate: image: semitechnologies/weaviate:1.19.0 + # image: semitechnologies/weaviate:1.28.0 # For Chinese full-text indexing (gse tokenization), use this newer Weaviate image instead profiles: - '' - weaviate @@ -693,6 +695,7 @@ services: AUTHENTICATION_APIKEY_USERS: ${WEAVIATE_AUTHENTICATION_APIKEY_USERS:-hello@dify.ai} AUTHORIZATION_ADMINLIST_ENABLED: ${WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED:-true} AUTHORIZATION_ADMINLIST_USERS: ${WEAVIATE_AUTHORIZATION_ADMINLIST_USERS:-hello@dify.ai} + # ENABLE_TOKENIZER_GSE: 'true' # Uncomment for Chinese full-text indexing, and set WEAVIATE_TOKENIZATION to 'gse' # Qdrant vector store. # (if used, you need to set VECTOR_STORE to qdrant in the api & worker service.)
diff --git a/docker/middleware.env.example b/docker/middleware.env.example index d01f9abe53..63bef4b578 100644 --- a/docker/middleware.env.example +++ b/docker/middleware.env.example @@ -78,6 +78,10 @@ WEAVIATE_AUTHENTICATION_APIKEY_USERS=hello@dify.ai WEAVIATE_AUTHORIZATION_ADMINLIST_ENABLED=true WEAVIATE_AUTHORIZATION_ADMINLIST_USERS=hello@dify.ai WEAVIATE_HOST_VOLUME=./volumes/weaviate +# Tokenization method for the Weaviate text property. Set it to 'gse' for Chinese full-text indexing; leave it empty to use Weaviate's default. +#https://weaviate.io/developers/weaviate/config-refs/schema#gse-and-trigram-tokenization-methods +#https://pkg.go.dev/github.com/go-ego/gse#section-readme +WEAVIATE_TOKENIZATION= # ------------------------------ # Docker Compose Service Expose Host Port Configurations