Compare commits

10 Commits

Author       SHA1        Message                                                                          Date
jyong        3f34b7e103  add score threshold                                                              2024-04-25 19:25:08 +08:00
quicksand    a401a73eb7  Merge branch 'main' into main                                                    2024-04-25 17:02:00 +08:00
quicksandzn  01e27def9e  optimize: reformat and move test to api/tests/integration_tests/vdb/tcvectordb  2024-04-25 16:15:32 +08:00
quicksand    617fec0dad  Merge branch 'langgenius:main' into main                                         2024-04-23 10:59:29 +08:00
quicksandzn  e3d5d2f7ae  remove comments                                                                  2024-04-19 09:08:23 +08:00
quicksandzn  90dca38ce7  optimize: config prefix                                                          2024-04-18 12:01:45 +08:00
quicksandzn  fe905ea696  optimize: test tencent vdb                                                       2024-04-18 11:29:48 +08:00
quicksandzn  96aeb34f7d  remove .env                                                                      2024-04-18 09:15:36 +08:00
quicksandzn  75bbfb5bc7  optimize:add requirements                                                        2024-04-17 19:16:28 +08:00
quicksandzn  324a0baf22  feat: support tencent vdb                                                        2024-04-17 19:14:00 +08:00
11 changed files with 304 additions and 4 deletions

View File

@@ -90,6 +90,15 @@ RELYT_USER=postgres
RELYT_PASSWORD=postgres
RELYT_DATABASE=postgres
# Tencent configuration
TENCENT_VECTOR_DB_URL=http://127.0.0.1
TENCENT_VECTOR_DB_API_KEY=dify
TENCENT_VECTOR_DB_TIMEOUT=30
TENCENT_VECTOR_DB_USERNAME=dify
TENCENT_VECTOR_DB_DATABASE=dify
TENCENT_VECTOR_DB_SHARD=1
TENCENT_VECTOR_DB_REPLICAS=2

# Upload configuration
UPLOAD_FILE_SIZE_LIMIT=15
UPLOAD_FILE_BATCH_LIMIT=5
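
(For context: TENCENT_VECTOR_DB_SHARD and TENCENT_VECTOR_DB_REPLICAS are passed through to the collection's shard and replica counts when it is first created; see _create_collection in the new tencent_vector.py below.)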

View File

@@ -305,6 +305,14 @@ def migrate_knowledge_vector_database():
                    "vector_store": {"class_prefix": collection_name}
                }
                dataset.index_struct = json.dumps(index_struct_dict)
            elif vector_type == "tencent":
                dataset_id = dataset.id
                collection_name = Dataset.gen_collection_name_by_id(dataset_id)
                index_struct_dict = {
                    "type": 'tencent',
                    "vector_store": {"class_prefix": collection_name}
                }
                dataset.index_struct = json.dumps(index_struct_dict)
            else:
                raise ValueError(f"Vector store {config.get('VECTOR_STORE')} is not supported.")

View File

@@ -245,6 +245,15 @@ class Config:
        self.RELYT_PASSWORD = get_env('RELYT_PASSWORD')
        self.RELYT_DATABASE = get_env('RELYT_DATABASE')

        # tencent settings
        self.TENCENT_VECTOR_DB_URL = get_env('TENCENT_VECTOR_DB_URL')
        self.TENCENT_VECTOR_DB_API_KEY = get_env('TENCENT_VECTOR_DB_API_KEY')
        self.TENCENT_VECTOR_DB_TIMEOUT = get_env('TENCENT_VECTOR_DB_TIMEOUT')
        self.TENCENT_VECTOR_DB_USERNAME = get_env('TENCENT_VECTOR_DB_USERNAME')
        self.TENCENT_VECTOR_DB_DATABASE = get_env('TENCENT_VECTOR_DB_DATABASE')
        self.TENCENT_VECTOR_DB_SHARD = get_env('TENCENT_VECTOR_DB_SHARD')
        self.TENCENT_VECTOR_DB_REPLICAS = get_env('TENCENT_VECTOR_DB_REPLICAS')
# ------------------------
# Mail Configurations.
# ------------------------

View File

@@ -476,7 +476,7 @@ class DatasetRetrievalSettingApi(Resource):
    @account_initialization_required
    def get(self):
        vector_type = current_app.config['VECTOR_STORE']
-       if vector_type == 'milvus':
+       if vector_type == 'milvus' or vector_type == 'tencent':
            return {
                'retrieval_method': [
                    'semantic_search'
@@ -498,7 +498,7 @@ class DatasetRetrievalSettingMockApi(Resource):
    @account_initialization_required
    def get(self, vector_type):
-       if vector_type == 'milvus':
+       if vector_type == 'milvus' or vector_type == 'tencent':
            return {
                'retrieval_method': [
                    'semantic_search'
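
Note: tencent is grouped with milvus here because the integration only implements semantic (vector) retrieval; search_by_full_text in the new TencentVector class below returns an empty list, so full-text and hybrid retrieval methods are not offered for this store.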

View File

@@ -0,0 +1,183 @@
import json
from typing import Any, Optional

import tcvectordb
from pydantic import BaseModel
from tcvectordb.model import document, enum
from tcvectordb.model import index as vdb_index
from tcvectordb.model.document import Filter

from core.rag.datasource.vdb.vector_base import BaseVector
from core.rag.models.document import Document
from extensions.ext_redis import redis_client


class TencentConfig(BaseModel):
    url: str
    api_key: Optional[str]
    timeout: float = 30
    username: Optional[str]
    database: Optional[str]
    index_type: str = "HNSW"
    metric_type: str = "L2"
    shard: int = 1
    replicas: int = 2

    def to_tencent_params(self):
        return {
            'url': self.url,
            'username': self.username,
            'key': self.api_key,
            'timeout': self.timeout
        }


class TencentVector(BaseVector):
    field_id: str = "id"
    field_vector: str = "vector"
    field_text: str = "text"
    field_metadata: str = "metadata"

    def __init__(self, collection_name: str, config: TencentConfig):
        super().__init__(collection_name)
        self._client_config = config
        self._client = tcvectordb.VectorDBClient(**self._client_config.to_tencent_params())
        self._db = self._init_database()

    def _init_database(self):
        # Reuse the configured database if it already exists; otherwise create it.
        for db in self._client.list_databases():
            if db.database_name == self._client_config.database:
                return self._client.database(self._client_config.database)
        return self._client.create_database(database_name=self._client_config.database)

    def get_type(self) -> str:
        return 'tencent'

    def to_index_struct(self) -> dict:
        return {
            "type": self.get_type(),
            "vector_store": {"class_prefix": self._collection_name}
        }

    def _create_collection(self, dimension: int) -> None:
        lock_name = 'vector_indexing_lock_{}'.format(self._collection_name)
        # Serialize collection creation across workers with a Redis lock.
        with redis_client.lock(lock_name, timeout=20):
            collections = self._db.list_collections()
            for collection in collections:
                if collection.collection_name == self._collection_name:
                    self.collection = collection
                    return

            # Map the configured names onto the SDK enums.
            index_type = None
            for k, v in enum.IndexType.__members__.items():
                if k == self._client_config.index_type:
                    index_type = v
            if index_type is None:
                raise ValueError("unsupported index_type")
            metric_type = None
            for k, v in enum.MetricType.__members__.items():
                if k == self._client_config.metric_type:
                    metric_type = v
            if metric_type is None:
                raise ValueError("unsupported metric_type")

            params = vdb_index.HNSWParams(m=16, efconstruction=200)
            index = vdb_index.Index(
                vdb_index.FilterIndex(
                    self.field_id, enum.FieldType.String, enum.IndexType.PRIMARY_KEY
                ),
                vdb_index.VectorIndex(
                    self.field_vector,
                    dimension,
                    index_type,
                    metric_type,
                    params,
                ),
                vdb_index.FilterIndex(
                    self.field_text, enum.FieldType.String, enum.IndexType.FILTER
                ),
                vdb_index.FilterIndex(
                    self.field_metadata, enum.FieldType.String, enum.IndexType.FILTER
                ),
            )
            self.collection = self._db.create_collection(
                name=self._collection_name,
                shard=self._client_config.shard,
                replicas=self._client_config.replicas,
                description="Collection for Dify",
                index=index,
            )

    def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
        self._create_collection(len(embeddings[0]))
        self.add_texts(texts, embeddings)

    def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
        docs = []
        for i in range(len(embeddings)):
            metadata = json.dumps(metadatas[i])
            doc = document.Document(
                # doc_id doubles as the primary key (field_id), so id-based deletes work
                id=metadatas[i]["doc_id"],
                vector=embeddings[i],
                text=texts[i],
                metadata=metadata,
            )
            docs.append(doc)
        self.collection.upsert(docs, self._client_config.timeout)

    def text_exists(self, id: str) -> bool:
        docs = self._db.collection(self._collection_name).query(document_ids=[id])
        return docs is not None and len(docs) > 0

    def delete_by_ids(self, ids: list[str]) -> None:
        self._db.collection(self._collection_name).delete(document_ids=ids)

    def delete_by_metadata_field(self, key: str, value: str) -> None:
        docs = self._db.collection(self._collection_name).query(filter=Filter(Filter.In(key, [value])))
        if docs and len(docs) > 0:
            self.collection.delete(document_ids=[doc['id'] for doc in docs])

    def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:
        res = self._db.collection(self._collection_name).search(
            vectors=[query_vector],
            params=document.HNSWSearchParams(ef=kwargs.get("ef", 10)),
            retrieve_vector=False,
            limit=kwargs.get('top_k', 4),
            timeout=self._client_config.timeout,
        )
        score_threshold = kwargs.get("score_threshold") or 0.0
        return self._get_search_res(res, score_threshold)

    def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
        # Full-text search is not implemented for this store; only vector search is supported.
        return []

    def _get_search_res(self, res, score_threshold):
        docs = []
        if res is None or len(res) == 0:
            return docs

        for result in res[0]:
            meta = result.get(self.field_metadata)
            if meta is not None:
                meta = json.loads(meta)
            # The server returns a distance-like score; convert it to a similarity
            # and record it on the hit only when it clears the threshold.
            score = 1 - result.get("score")
            if score > score_threshold:
                meta['score'] = score
            doc = Document(page_content=result.get(self.field_text), metadata=meta)
            docs.append(doc)
        return docs

    def delete(self) -> None:
        self._db.drop_collection(name=self._collection_name)
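
A minimal usage sketch for the class above; the endpoint, key, collection name, and 3-dimensional embeddings are placeholders, and a reachable Tencent VectorDB instance is assumed:

from core.rag.datasource.vdb.tencent.tencent_vector import TencentConfig, TencentVector
from core.rag.models.document import Document

vector = TencentVector(
    collection_name='example-001',
    config=TencentConfig(
        url='http://127.0.0.1',  # placeholder endpoint
        api_key='your-api-key',  # placeholder credential
        timeout=30,
        username='dify',
        database='dify',
        shard=1,
        replicas=2,
    ),
)
docs = [Document(page_content='hello world', metadata={'doc_id': 'doc1', 'document_id': 'foo1'})]
vector.create(texts=docs, embeddings=[[0.1, 0.2, 0.3]])  # creates the collection, then upserts
hits = vector.search_by_vector(query_vector=[0.1, 0.2, 0.3], top_k=4)
vector.delete()  # drops the collection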

View File

@@ -25,7 +25,6 @@ class Vector:
    def _init_vector(self) -> BaseVector:
        config = current_app.config
        vector_type = config.get('VECTOR_STORE')
        if self._dataset.index_struct_dict:
            vector_type = self._dataset.index_struct_dict['type']
@@ -138,6 +137,31 @@ class Vector:
                ),
                dim=dim
            )
        elif vector_type == "tencent":
            from core.rag.datasource.vdb.tencent.tencent_vector import TencentConfig, TencentVector
            if self._dataset.index_struct_dict:
                class_prefix: str = self._dataset.index_struct_dict['vector_store']['class_prefix']
                collection_name = class_prefix
            else:
                dataset_id = self._dataset.id
                collection_name = Dataset.gen_collection_name_by_id(dataset_id)
                index_struct_dict = {
                    "type": 'tencent',
                    "vector_store": {"class_prefix": collection_name}
                }
                self._dataset.index_struct = json.dumps(index_struct_dict)
            return TencentVector(
                collection_name=collection_name,
                config=TencentConfig(
                    url=config.get('TENCENT_VECTOR_DB_URL'),
                    api_key=config.get('TENCENT_VECTOR_DB_API_KEY'),
                    timeout=config.get('TENCENT_VECTOR_DB_TIMEOUT'),
                    username=config.get('TENCENT_VECTOR_DB_USERNAME'),
                    database=config.get('TENCENT_VECTOR_DB_DATABASE'),
                    shard=config.get('TENCENT_VECTOR_DB_SHARD'),
                    replicas=config.get('TENCENT_VECTOR_DB_REPLICAS'),
                )
            )
        else:
            raise ValueError(f"Vector store {config.get('VECTOR_STORE')} is not supported.")
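
For reference, the index_struct persisted above is a small JSON document: on later loads _init_vector reads its type field to re-select this branch and its class_prefix to reuse the same collection. A sketch (the collection name here is illustrative, not the real Dataset.gen_collection_name_by_id output):

import json

struct = json.loads('{"type": "tencent", "vector_store": {"class_prefix": "vector_index_example"}}')
assert struct["type"] == "tencent"
collection_name = struct["vector_store"]["class_prefix"]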

View File

@@ -80,4 +80,5 @@ lxml==5.1.0
xlrd~=2.0.1
pydantic~=1.10.0
pgvecto-rs==0.1.4
-oss2==2.15.0
+oss2==2.15.0
+tcvectordb==1.3.2
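
When working outside the requirements file, the pinned client can be installed directly with pip install tcvectordb==1.3.2.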

View File

@@ -0,0 +1,58 @@
import pytest

from core.rag.datasource.vdb.tencent.tencent_vector import TencentConfig, TencentVector
from core.rag.models.document import Document
from extensions.ext_redis import redis_client


def _create_tencent_vector() -> TencentVector:
    tencent_vector = TencentVector(
        collection_name='test-001',
        config=TencentConfig(
            url="http://10.6.x.x",
            api_key="nTZ**********************",
            timeout=30,
            username="dify",
            database="dify",
            shard=1,
            replicas=2,
        )
    )
    documents = [
        Document(page_content="This is document 1", metadata={"doc_id": "doc1", "document_id": "foo1"}),
        Document(page_content="This is document 2", metadata={"doc_id": "doc2", "document_id": "foo2"}),
    ]
    embeddings = [[0.2123, 0.23, 0.213], [0.2123, 0.22, 0.213]]
    tencent_vector.create(texts=documents, embeddings=embeddings)
    return tencent_vector


@pytest.fixture(autouse=True)
def mock_redis_lock(mocker):
    mocker.patch.object(redis_client, "lock")


def test_text_exists():
    tencent_vector = _create_tencent_vector()
    assert tencent_vector.text_exists(id="doc1") is True


def test_delete_by_ids():
    tencent_vector = _create_tencent_vector()
    tencent_vector.delete_by_ids(ids=['doc2'])


def test_delete_by_metadata_field():
    tencent_vector = _create_tencent_vector()
    tencent_vector.delete_by_metadata_field(key="document_id", value="foo1")


def test_search_by_vector():
    tencent_vector = _create_tencent_vector()
    res = tencent_vector.search_by_vector(query_vector=[0.3123, 0.43, 0.213])
    assert len(res) > 0


def test_delete():
    tencent_vector = _create_tencent_vector()
    tencent_vector.delete()
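
Note: these are integration tests (under api/tests/integration_tests/vdb/tcvectordb), so they expect a reachable Tencent VectorDB instance; the url and api_key above are redacted placeholders and must be replaced with real values to run them.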

View File

@@ -229,6 +229,14 @@ services:
      RELYT_USER: postgres
      RELYT_PASSWORD: difyai123456
      RELYT_DATABASE: postgres
      # tencent configurations
      TENCENT_VECTOR_DB_URL: http://127.0.0.1
      TENCENT_VECTOR_DB_API_KEY: dify
      TENCENT_VECTOR_DB_TIMEOUT: 30
      TENCENT_VECTOR_DB_USERNAME: dify
      TENCENT_VECTOR_DB_DATABASE: dify
      TENCENT_VECTOR_DB_SHARD: 1
      TENCENT_VECTOR_DB_REPLICAS: 2
    depends_on:
      - db
      - redis