Compare commits
main...quicksandz (10 commits)
Commits:
- 3f34b7e103
- a401a73eb7
- 01e27def9e
- 617fec0dad
- e3d5d2f7ae
- 90dca38ce7
- fe905ea696
- 96aeb34f7d
- 75bbfb5bc7
- 324a0baf22
@@ -90,6 +90,15 @@ RELYT_USER=postgres
 RELYT_PASSWORD=postgres
 RELYT_DATABASE=postgres
 
+# Tencent configuration
+TENCENT_VECTOR_DB_URL=http://127.0.0.1
+TENCENT_VECTOR_DB_API_KEY=dify
+TENCENT_VECTOR_DB_TIMEOUT=30
+TENCENT_VECTOR_DB_USERNAME=dify
+TENCENT_VECTOR_DB_DATABASE=dify
+TENCENT_VECTOR_DB_SHARD=1
+TENCENT_VECTOR_DB_REPLICAS=2
+
 # Upload configuration
 UPLOAD_FILE_SIZE_LIMIT=15
 UPLOAD_FILE_BATCH_LIMIT=5
@@ -305,6 +305,14 @@ def migrate_knowledge_vector_database():
                     "vector_store": {"class_prefix": collection_name}
                 }
                 dataset.index_struct = json.dumps(index_struct_dict)
+            elif vector_type == "tencent":
+                dataset_id = dataset.id
+                collection_name = Dataset.gen_collection_name_by_id(dataset_id)
+                index_struct_dict = {
+                    "type": 'tencent',
+                    "vector_store": {"class_prefix": collection_name}
+                }
+                dataset.index_struct = json.dumps(index_struct_dict)
             else:
                 raise ValueError(f"Vector store {config.get('VECTOR_STORE')} is not supported.")
 
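For context, the new migration branch only records which store backs an existing dataset. Below is a minimal sketch of the index_struct it writes for a Tencent-backed dataset; the collection name is a placeholder, since the real value comes from Dataset.gen_collection_name_by_id(dataset_id):

```python
import json

# Placeholder collection name; the migration derives it from the dataset id.
collection_name = "<collection-name-for-dataset>"

index_struct_dict = {
    "type": "tencent",
    "vector_store": {"class_prefix": collection_name},
}

# This JSON string is what ends up in dataset.index_struct.
print(json.dumps(index_struct_dict))
# {"type": "tencent", "vector_store": {"class_prefix": "<collection-name-for-dataset>"}}
```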
@@ -245,6 +245,15 @@ class Config:
         self.RELYT_PASSWORD = get_env('RELYT_PASSWORD')
         self.RELYT_DATABASE = get_env('RELYT_DATABASE')
 
+        # tencent settings
+        self.TENCENT_VECTOR_DB_URL = get_env('TENCENT_VECTOR_DB_URL')
+        self.TENCENT_VECTOR_DB_API_KEY = get_env('TENCENT_VECTOR_DB_API_KEY')
+        self.TENCENT_VECTOR_DB_TIMEOUT = get_env('TENCENT_VECTOR_DB_TIMEOUT')
+        self.TENCENT_VECTOR_DB_USERNAME = get_env('TENCENT_VECTOR_DB_USERNAME')
+        self.TENCENT_VECTOR_DB_DATABASE = get_env('TENCENT_VECTOR_DB_DATABASE')
+        self.TENCENT_VECTOR_DB_SHARD = get_env('TENCENT_VECTOR_DB_SHARD')
+        self.TENCENT_VECTOR_DB_REPLICAS = get_env('TENCENT_VECTOR_DB_REPLICAS')
+
         # ------------------------
         # Mail Configurations.
         # ------------------------
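A small aside on types: get_env returns strings, so a value such as TENCENT_VECTOR_DB_TIMEOUT=30 arrives as "30". A sketch, assuming pydantic's usual coercion (pydantic~=1.10 is pinned later in this PR), of how the numeric fields on TencentConfig end up as numbers when the factory builds the config from these settings:

```python
from pydantic import BaseModel

class TimeoutDemo(BaseModel):
    # Mirrors the timeout field on TencentConfig further down in this PR.
    timeout: float = 30

# Environment variables are read as strings; pydantic coerces them to the declared type.
print(TimeoutDemo(timeout="30").timeout)  # 30.0
```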
@@ -476,7 +476,7 @@ class DatasetRetrievalSettingApi(Resource):
     @account_initialization_required
     def get(self):
         vector_type = current_app.config['VECTOR_STORE']
-        if vector_type == 'milvus':
+        if vector_type == 'milvus' or vector_type == 'tencent':
             return {
                 'retrieval_method': [
                     'semantic_search'
@@ -498,7 +498,7 @@ class DatasetRetrievalSettingMockApi(Resource):
     @account_initialization_required
     def get(self, vector_type):
 
-        if vector_type == 'milvus':
+        if vector_type == 'milvus' or vector_type == 'tencent':
             return {
                 'retrieval_method': [
                     'semantic_search'
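Both hunks make the same point: the retrieval-setting endpoints now treat 'tencent' exactly like 'milvus'. A hedged illustration of the response shape implied by the visible context (the hunks are truncated, so any additional entries in the real payload are not shown here):

```python
# Expected (partial) payload for vector_type in {'milvus', 'tencent'} after this change.
expected_partial_response = {
    'retrieval_method': [
        'semantic_search',
    ]
}
```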
api/core/rag/datasource/vdb/tencent/__init__.py (new file, 0 lines)

api/core/rag/datasource/vdb/tencent/tencent_vector.py (new file, 183 lines)
@@ -0,0 +1,183 @@
import json
from typing import Any, Optional

import tcvectordb
from pydantic import BaseModel
from tcvectordb.model import document, enum
from tcvectordb.model import index as vdb_index
from tcvectordb.model.document import Filter

from core.rag.datasource.vdb.vector_base import BaseVector
from core.rag.models.document import Document
from extensions.ext_redis import redis_client


class TencentConfig(BaseModel):
    url: str
    api_key: Optional[str]
    timeout: float = 30
    username: Optional[str]
    database: Optional[str]
    index_type: str = "HNSW"
    metric_type: str = "L2"
    shard: int = 1,
    replicas: int = 2,

    def to_tencent_params(self):
        return {
            'url': self.url,
            'username': self.username,
            'key': self.api_key,
            'timeout': self.timeout
        }


class TencentVector(BaseVector):
    field_id: str = "id"
    field_vector: str = "vector"
    field_text: str = "text"
    field_metadata: str = "metadata"

    def __init__(self, collection_name: str, config: TencentConfig):
        super().__init__(collection_name)
        self._client_config = config
        self._client = tcvectordb.VectorDBClient(**self._client_config.to_tencent_params())
        self._db = self._init_database()

    def _init_database(self):
        exists = False
        for db in self._client.list_databases():
            if db.database_name == self._client_config.database:
                exists = True
                break
        if exists:
            return self._client.database(self._client_config.database)
        else:
            return self._client.create_database(database_name=self._client_config.database)

    def get_type(self) -> str:
        return 'tencent'

    def to_index_struct(self) -> dict:
        return {
            "type": self.get_type(),
            "vector_store": {"class_prefix": self._collection_name}
        }

    def _create_collection(self, dimension: int) -> None:
        lock_name = 'vector_indexing_lock_{}'.format(self._collection_name)
        with redis_client.lock(lock_name, timeout=20):
            collections = self._db.list_collections()
            for collection in collections:
                if collection.collection_name == self._collection_name:
                    self.collection = collection
                    return
            index_type = None
            for k, v in enum.IndexType.__members__.items():
                if k == self._client_config.index_type:
                    index_type = v
            if index_type is None:
                raise ValueError("unsupported index_type")
            metric_type = None
            for k, v in enum.MetricType.__members__.items():
                if k == self._client_config.metric_type:
                    metric_type = v
            if metric_type is None:
                raise ValueError("unsupported metric_type")
            params = vdb_index.HNSWParams(m=16, efconstruction=200)
            index = vdb_index.Index(
                vdb_index.FilterIndex(
                    self.field_id, enum.FieldType.String, enum.IndexType.PRIMARY_KEY
                ),
                vdb_index.VectorIndex(
                    self.field_vector,
                    dimension,
                    index_type,
                    metric_type,
                    params,
                ),
                vdb_index.FilterIndex(
                    self.field_text, enum.FieldType.String, enum.IndexType.FILTER
                ),
                vdb_index.FilterIndex(
                    self.field_metadata, enum.FieldType.String, enum.IndexType.FILTER
                ),
            )

            self.collection = self._db.create_collection(
                name=self._collection_name,
                shard=self._client_config.shard,
                replicas=self._client_config.replicas,
                description="Collection for Dify",
                index=index,
            )

    def create(self, texts: list[Document], embeddings: list[list[float]], **kwargs):
        self._create_collection(len(embeddings[0]))
        self.add_texts(texts, embeddings)

    def add_texts(self, documents: list[Document], embeddings: list[list[float]], **kwargs):
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
        total_count = len(embeddings)
        docs = []
        for id in range(0, total_count):
            if metadatas is None:
                continue
            metadata = json.dumps(metadatas[id])
            doc = document.Document(
                id=metadatas[id]["doc_id"],
                vector=embeddings[id],
                text=texts[id],
                metadata=metadata,
            )
            docs.append(doc)
        self.collection.upsert(docs, self._client_config.timeout)

    def text_exists(self, id: str) -> bool:
        docs = self._db.collection(self._collection_name).query(document_ids=[id])
        if docs and len(docs) > 0:
            return True
        return False

    def delete_by_ids(self, ids: list[str]) -> None:
        self._db.collection(self._collection_name).delete(document_ids=ids)

    def delete_by_metadata_field(self, key: str, value: str) -> None:
        docs = self._db.collection(self._collection_name).query(filter=Filter(Filter.In(key, [value])))
        if docs and len(docs) > 0:
            self.collection.delete(document_ids=[doc['id'] for doc in docs])

    def search_by_vector(self, query_vector: list[float], **kwargs: Any) -> list[Document]:

        res = self._db.collection(self._collection_name).search(vectors=[query_vector],
                                                                 params=document.HNSWSearchParams(
                                                                     ef=kwargs.get("ef", 10)),
                                                                 retrieve_vector=False,
                                                                 limit=kwargs.get('top_k', 4),
                                                                 timeout=self._client_config.timeout,
                                                                 )
        score_threshold = kwargs.get("score_threshold", .0) if kwargs.get('score_threshold', .0) else 0.0
        return self._get_search_res(res, score_threshold)

    def search_by_full_text(self, query: str, **kwargs: Any) -> list[Document]:
        return []

    def _get_search_res(self, res, score_threshold):
        docs = []
        if res is None or len(res) == 0:
            return docs

        for result in res[0]:
            meta = result.get(self.field_metadata)
            if meta is not None:
                meta = json.loads(meta)
            score = 1 - result.get("score")
            if score > score_threshold:
                meta['score'] = score
                doc = Document(page_content=result.get(self.field_text), metadata=meta)
                docs.append(doc)
        return docs

    def delete(self) -> None:
        self._db.drop_collection(name=self._collection_name)
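A minimal usage sketch of the class above, adapted from the integration test added later in this PR. The endpoint and API key are placeholders, and it assumes a reachable Tencent Cloud VectorDB plus the Redis client used for the indexing lock (the test mocks that lock):

```python
from core.rag.datasource.vdb.tencent.tencent_vector import TencentConfig, TencentVector
from core.rag.models.document import Document

vector = TencentVector(
    collection_name="test-001",
    config=TencentConfig(
        url="http://10.6.x.x",   # placeholder endpoint
        api_key="<api-key>",     # placeholder credential
        timeout=30,
        username="dify",
        database="dify",
        shard=1,
        replicas=2,
    ),
)

docs = [
    Document(page_content="This is document 1", metadata={"doc_id": "doc1", "document_id": "foo1"}),
    Document(page_content="This is document 2", metadata={"doc_id": "doc2", "document_id": "foo2"}),
]
embeddings = [[0.2123, 0.23, 0.213], [0.2123, 0.22, 0.213]]

vector.create(texts=docs, embeddings=embeddings)   # creates the collection, then upserts the docs
results = vector.search_by_vector(query_vector=[0.3123, 0.43, 0.213], top_k=4)
for doc in results:
    print(doc.page_content, doc.metadata.get("score"))

vector.delete()                                    # drops the collection when done
```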
@@ -25,7 +25,6 @@ class Vector:
     def _init_vector(self) -> BaseVector:
         config = current_app.config
         vector_type = config.get('VECTOR_STORE')
-
         if self._dataset.index_struct_dict:
             vector_type = self._dataset.index_struct_dict['type']
 
@@ -138,6 +137,31 @@ class Vector:
                 ),
                 dim=dim
             )
+        elif vector_type == "tencent":
+            from core.rag.datasource.vdb.tencent.tencent_vector import TencentConfig, TencentVector
+            if self._dataset.index_struct_dict:
+                class_prefix: str = self._dataset.index_struct_dict['vector_store']['class_prefix']
+                collection_name = class_prefix
+            else:
+                dataset_id = self._dataset.id
+                collection_name = Dataset.gen_collection_name_by_id(dataset_id)
+                index_struct_dict = {
+                    "type": 'tencent',
+                    "vector_store": {"class_prefix": collection_name}
+                }
+                self._dataset.index_struct = json.dumps(index_struct_dict)
+            return TencentVector(
+                collection_name=collection_name,
+                config=TencentConfig(
+                    url=config.get('TENCENT_VECTOR_DB_URL'),
+                    api_key=config.get('TENCENT_VECTOR_DB_API_KEY'),
+                    timeout=config.get('TENCENT_VECTOR_DB_TIMEOUT'),
+                    username=config.get('TENCENT_VECTOR_DB_USERNAME'),
+                    database=config.get('TENCENT_VECTOR_DB_DATABASE'),
+                    shard=config.get('TENCENT_VECTOR_DB_SHARD'),
+                    replicas=config.get('TENCENT_VECTOR_DB_REPLICAS'),
+                )
+            )
         else:
             raise ValueError(f"Vector store {config.get('VECTOR_STORE')} is not supported.")
 
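Taken together with the earlier _init_vector hunk, the selection logic reads as follows. This is a distilled illustration, not the factory itself: an existing dataset keeps the store type recorded in its index_struct, otherwise the configured VECTOR_STORE decides, and "tencent" now resolves to TencentVector.

```python
from typing import Optional

def resolve_vector_type(configured: Optional[str], index_struct_dict: Optional[dict]) -> Optional[str]:
    # A dataset that already has an index_struct stays on its recorded store.
    if index_struct_dict:
        return index_struct_dict["type"]
    # Otherwise the deployment-wide VECTOR_STORE setting wins.
    return configured

assert resolve_vector_type("tencent", None) == "tencent"
assert resolve_vector_type("tencent", {"type": "milvus"}) == "milvus"
```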
@@ -80,4 +80,5 @@ lxml==5.1.0
 xlrd~=2.0.1
 pydantic~=1.10.0
 pgvecto-rs==0.1.4
 oss2==2.15.0
+tcvectordb==1.3.2
api/tests/integration_tests/vdb/tcvectordb/test_tencent.py (new file, 58 lines)
@@ -0,0 +1,58 @@
import pytest

from core.rag.datasource.vdb.tencent.tencent_vector import TencentConfig, TencentVector
from core.rag.models.document import Document
from extensions.ext_redis import redis_client


def _create_tencent_vector() -> TencentVector:
    tencent_vector = TencentVector(
        collection_name='test-001',
        config=TencentConfig(
            url="http://10.6.x.x",
            api_key="nTZ**********************",
            timeout=30,
            username="dify",
            database="dify",
            shard=1,
            replicas=2,
        )
    )
    documents = [
        Document(page_content="This is document 1", metadata={"doc_id": "doc1", "document_id": "foo1"}),
        Document(page_content="This is document 2", metadata={"doc_id": "doc2", "document_id": "foo2"}),
    ]
    embeddings = [[0.2123, 0.23, 0.213], [0.2123, 0.22, 0.213]]
    tencent_vector.create(texts=documents, embeddings=embeddings)

    return tencent_vector


@pytest.fixture(autouse=True)
def mock_redis_lock(mocker):
    mocker.patch.object(redis_client, "lock")


def test_text_exists():
    tencent_vector = _create_tencent_vector()
    assert tencent_vector.text_exists(id="doc1") is True


def test_delete_by_ids():
    tencent_vector = _create_tencent_vector()
    tencent_vector.delete_by_ids(ids=['doc2'])


def test_delete_by_metadata_field():
    tencent_vector = _create_tencent_vector()
    tencent_vector.delete_by_metadata_field(key="document_id", value="foo1")


def test_search_by_vector():
    tencent_vector = _create_tencent_vector()
    res = tencent_vector.search_by_vector(query_vector=[0.3123, 0.43, 0.213])
    assert len(res) > 0

def test_delete():
    tencent_vector = _create_tencent_vector()
    tencent_vector.delete()
@@ -229,6 +229,14 @@ services:
       RELYT_USER: postgres
       RELYT_PASSWORD: difyai123456
       RELYT_DATABASE: postgres
+      # tencent configurations
+      TENCENT_VECTOR_DB_URL: http://127.0.0.1
+      TENCENT_VECTOR_DB_API_KEY: dify
+      TENCENT_VECTOR_DB_TIMEOUT: 30
+      TENCENT_VECTOR_DB_USERNAME: dify
+      TENCENT_VECTOR_DB_DATABASE: dify
+      TENCENT_VECTOR_DB_SHARD: 1
+      TENCENT_VECTOR_DB_REPLICAS: 2
     depends_on:
       - db
       - redis