diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e1c087a6cd..992126551c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -155,4 +155,4 @@ And that's it! Once your PR is merged, you will be featured as a contributor in
## Getting Help
-If you ever get stuck or got a burning question while contributing, simply shoot your queries our way via the related GitHub issue, or hop onto our [Discord](https://discord.gg/AhzKf7dNgk) for a quick chat.
+If you ever get stuck or have a burning question while contributing, simply shoot your queries our way via the related GitHub issue, or hop onto our [Discord](https://discord.gg/8Tpq4AcN9c) for a quick chat.
diff --git a/CONTRIBUTING_CN.md b/CONTRIBUTING_CN.md
index 6adfed6b6c..08c5a0a4bd 100644
--- a/CONTRIBUTING_CN.md
+++ b/CONTRIBUTING_CN.md
@@ -152,4 +152,4 @@ Dify的后端使用Python编写,使用[Flask](https://flask.palletsprojects.co
## 获取帮助
-如果你在贡献过程中遇到困难或者有任何问题,可以通过相关的 GitHub 问题提出你的疑问,或者加入我们的 [Discord](https://discord.gg/AhzKf7dNgk) 进行快速交流。
+如果你在贡献过程中遇到困难或者有任何问题,可以通过相关的 GitHub 问题提出你的疑问,或者加入我们的 [Discord](https://discord.gg/8Tpq4AcN9c) 进行快速交流。
diff --git a/README.md b/README.md
index 80a60e9cad..154fdd8adb 100644
--- a/README.md
+++ b/README.md
@@ -131,7 +131,7 @@ At the same time, please consider supporting Dify by sharing it on social media
### Translations
-We are looking for contributors to help with translating Dify to languages other than Mandarin or English. If you are interested in helping, please see the [i18n README](https://github.com/langgenius/dify/blob/main/web/i18n/README.md) for more information, and leave us a comment in the `global-users` channel of our [Discord Community Server](https://discord.gg/AhzKf7dNgk).
+We are looking for contributors to help with translating Dify to languages other than Mandarin or English. If you are interested in helping, please see the [i18n README](https://github.com/langgenius/dify/blob/main/web/i18n/README.md) for more information, and leave us a comment in the `global-users` channel of our [Discord Community Server](https://discord.gg/8Tpq4AcN9c).
## Community & Support
diff --git a/api/commands.py b/api/commands.py
index b82d4d5d5d..9f1dc95281 100644
--- a/api/commands.py
+++ b/api/commands.py
@@ -109,19 +109,20 @@ def reset_encrypt_key_pair():
click.echo(click.style('Sorry, only support SELF_HOSTED mode.', fg='red'))
return
- tenant = db.session.query(Tenant).first()
- if not tenant:
- click.echo(click.style('Sorry, no workspace found. Please enter /install to initialize.', fg='red'))
- return
+ tenants = db.session.query(Tenant).all()
+ if not tenants:
+     click.echo(click.style('Sorry, no workspace found. Please enter /install to initialize.', fg='red'))
+     return
+
+ for tenant in tenants:
- tenant.encrypt_public_key = generate_key_pair(tenant.id)
+     tenant.encrypt_public_key = generate_key_pair(tenant.id)
- db.session.query(Provider).filter(Provider.provider_type == 'custom').delete()
- db.session.query(ProviderModel).delete()
- db.session.commit()
+ db.session.query(Provider).filter(Provider.provider_type == 'custom', Provider.tenant_id == tenant.id).delete()
+ db.session.query(ProviderModel).filter(ProviderModel.tenant_id == tenant.id).delete()
+ db.session.commit()
- click.echo(click.style('Congratulations! '
- 'the asymmetric key pair of workspace {} has been reset.'.format(tenant.id), fg='green'))
+ click.echo(click.style('Congratulations! '
+ 'the asymmetric key pair of workspace {} has been reset.'.format(tenant.id), fg='green'))
@click.command('vdb-migrate', help='migrate vector db.')
diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py
index dd46aa27dc..94c7d18c55 100644
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -1,3 +1,4 @@
+import concurrent.futures
import datetime
import json
import logging
@@ -650,17 +651,44 @@ class IndexingRunner:
# chunk nodes by chunk size
indexing_start_at = time.perf_counter()
tokens = 0
- chunk_size = 100
+ chunk_size = 10
embedding_model_type_instance = None
if embedding_model_instance:
embedding_model_type_instance = embedding_model_instance.model_type_instance
embedding_model_type_instance = cast(TextEmbeddingModel, embedding_model_type_instance)
+ with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+ futures = []
+ for i in range(0, len(documents), chunk_size):
+ chunk_documents = documents[i:i + chunk_size]
+ futures.append(executor.submit(self._process_chunk, current_app._get_current_object(), index_processor,
+ chunk_documents, dataset,
+ dataset_document, embedding_model_instance,
+ embedding_model_type_instance))
- for i in range(0, len(documents), chunk_size):
+ for future in futures:
+ tokens += future.result()
+
+ indexing_end_at = time.perf_counter()
+
+ # update document status to completed
+ self._update_document_index_status(
+ document_id=dataset_document.id,
+ after_indexing_status="completed",
+ extra_update_params={
+ DatasetDocument.tokens: tokens,
+ DatasetDocument.completed_at: datetime.datetime.utcnow(),
+ DatasetDocument.indexing_latency: indexing_end_at - indexing_start_at,
+ }
+ )
+
+ def _process_chunk(self, flask_app, index_processor, chunk_documents, dataset, dataset_document,
+ embedding_model_instance, embedding_model_type_instance):
+ with flask_app.app_context():
# check document is paused
self._check_document_paused_status(dataset_document.id)
- chunk_documents = documents[i:i + chunk_size]
+
+ tokens = 0
if dataset.indexing_technique == 'high_quality' or embedding_model_type_instance:
tokens += sum(
embedding_model_type_instance.get_num_tokens(
@@ -670,9 +698,9 @@ class IndexingRunner:
)
for document in chunk_documents
)
+
# load index
index_processor.load(dataset, chunk_documents)
- db.session.add(dataset)
document_ids = [document.metadata['doc_id'] for document in chunk_documents]
db.session.query(DocumentSegment).filter(
@@ -687,18 +715,7 @@ class IndexingRunner:
db.session.commit()
- indexing_end_at = time.perf_counter()
-
- # update document status to completed
- self._update_document_index_status(
- document_id=dataset_document.id,
- after_indexing_status="completed",
- extra_update_params={
- DatasetDocument.tokens: tokens,
- DatasetDocument.completed_at: datetime.datetime.utcnow(),
- DatasetDocument.indexing_latency: indexing_end_at - indexing_start_at,
- }
- )
+ return tokens
def _check_document_paused_status(self, document_id: str):
indexing_cache_key = 'document_{}_is_paused'.format(document_id)
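The indexing change above fans document chunks out to a thread pool and hands the real Flask application object to each worker so database access keeps working off the main thread. A minimal standalone sketch of that pattern, assuming only Flask and the standard library (process_chunk and the token counting here are illustrative, not the actual IndexingRunner API):
import concurrent.futures
from flask import Flask, current_app

app = Flask(__name__)

def process_chunk(flask_app, chunk):
    # Each worker thread opens its own application context, mirroring
    # _process_chunk above, so app-bound extensions (e.g. the DB session)
    # remain usable outside the request thread.
    with flask_app.app_context():
        return len(chunk)  # stand-in for per-chunk token counting

def run(documents, chunk_size=10):
    with app.app_context():
        flask_app = current_app._get_current_object()  # the real app, not the proxy
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            futures = [
                executor.submit(process_chunk, flask_app, documents[i:i + chunk_size])
                for i in range(0, len(documents), chunk_size)
            ]
            return sum(future.result() for future in futures)

if __name__ == '__main__':
    print(run(list(range(95))))  # prints 95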
diff --git a/api/core/model_runtime/model_providers/bedrock/bedrock.yaml b/api/core/model_runtime/model_providers/bedrock/bedrock.yaml
index 65e9e50e1b..19ce51ddcd 100644
--- a/api/core/model_runtime/model_providers/bedrock/bedrock.yaml
+++ b/api/core/model_runtime/model_providers/bedrock/bedrock.yaml
@@ -78,4 +78,4 @@ provider_credential_schema:
placeholder:
en_US: A model you have access to (e.g. amazon.titan-text-lite-v1) for validation.
zh_Hans: 为了进行验证,请输入一个您可用的模型名称 (例如:amazon.titan-text-lite-v1)
-
\ No newline at end of file
+
diff --git a/api/core/rag/datasource/vdb/milvus/milvus_vector.py b/api/core/rag/datasource/vdb/milvus/milvus_vector.py
index 203b7eff37..f62d603d8d 100644
--- a/api/core/rag/datasource/vdb/milvus/milvus_vector.py
+++ b/api/core/rag/datasource/vdb/milvus/milvus_vector.py
@@ -144,6 +144,16 @@ class MilvusVector(BaseVector):
utility.drop_collection(self._collection_name, None, using=alias)
def text_exists(self, id: str) -> bool:
+ alias = uuid4().hex
+ if self._client_config.secure:
+ uri = "https://" + str(self._client_config.host) + ":" + str(self._client_config.port)
+ else:
+ uri = "http://" + str(self._client_config.host) + ":" + str(self._client_config.port)
+ connections.connect(alias=alias, uri=uri, user=self._client_config.user, password=self._client_config.password)
+
+ from pymilvus import utility
+ if not utility.has_collection(self._collection_name, using=alias):
+ return False
result = self._client.query(collection_name=self._collection_name,
filter=f'metadata["doc_id"] == "{id}"',
diff --git a/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py b/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py
index 6bd4b5c340..436e6b5f6a 100644
--- a/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py
+++ b/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py
@@ -275,6 +275,13 @@ class QdrantVector(BaseVector):
)
def text_exists(self, id: str) -> bool:
+ all_collection_name = []
+ collections_response = self._client.get_collections()
+ collection_list = collections_response.collections
+ for collection in collection_list:
+ all_collection_name.append(collection.name)
+ if self._collection_name not in all_collection_name:
+ return False
response = self._client.retrieve(
collection_name=self._collection_name,
ids=[id]
diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py
index 27ae15a025..71fc07967c 100644
--- a/api/core/rag/datasource/vdb/vector_factory.py
+++ b/api/core/rag/datasource/vdb/vector_factory.py
@@ -128,8 +128,8 @@ class Vector:
if kwargs.get('duplicate_check', False):
documents = self._filter_duplicate_texts(documents)
embeddings = self._embeddings.embed_documents([document.page_content for document in documents])
- self._vector_processor.add_texts(
- documents=documents,
+ self._vector_processor.create(
+ texts=documents,
embeddings=embeddings,
**kwargs
)
diff --git a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py
index 6e317115b8..5d24ee9fd2 100644
--- a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py
+++ b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py
@@ -134,6 +134,11 @@ class WeaviateVector(BaseVector):
def text_exists(self, id: str) -> bool:
collection_name = self._collection_name
+ schema = self._default_schema(self._collection_name)
+
+ # check whether the index already exists
+ if not self._client.schema.contains(schema):
+ return False
result = self._client.query.get(collection_name).with_additional(["id"]).with_where({
"path": ["doc_id"],
"operator": "Equal",
diff --git a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py
index b37981a30d..34a4e85e97 100644
--- a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py
@@ -53,7 +53,7 @@ class UnstructuredWordExtractor(BaseExtractor):
elements = partition_docx(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title
- chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+ chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()
diff --git a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
index 1d92bbbee6..f6ae8fad53 100644
--- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
@@ -43,7 +43,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
pass
from unstructured.chunking.title import chunk_by_title
- chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+ chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()
diff --git a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
index 3ac04ddc17..3d63446fef 100644
--- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
@@ -38,7 +38,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
elements = partition_md(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
- chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+ chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()
diff --git a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
index d4b72e37eb..34d3e8021a 100644
--- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
@@ -28,7 +28,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
elements = partition_msg(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
- chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+ chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()
diff --git a/api/core/rag/extractor/unstructured/unstructured_text_extractor.py b/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
index 5af21b2b1d..cc67f2b866 100644
--- a/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
@@ -28,7 +28,7 @@ class UnstructuredTextExtractor(BaseExtractor):
elements = partition_text(filename=self._file_path, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
- chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+ chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()
diff --git a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
index b08ff63a1c..5600fb075d 100644
--- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
@@ -28,7 +28,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
from unstructured.chunking.title import chunk_by_title
- chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
+ chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
documents = []
for chunk in chunks:
text = chunk.text.strip()
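All of the unstructured extractors above raise combine_text_under_n_chars from 0 to 2000, so short title-delimited sections get merged instead of each becoming a tiny chunk. A quick way to see the effect, sketched against the public chunk_by_title API (the input path is illustrative):
from unstructured.chunking.title import chunk_by_title
from unstructured.partition.text import partition_text

elements = partition_text(filename="example.txt")  # any partitioned document works here

# combine_text_under_n_chars=0 keeps every small section as its own chunk;
# 2000 merges consecutive small sections up toward max_characters.
fragmented = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
combined = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
print(len(fragmented), len(combined))  # the second count is usually smaller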
diff --git a/api/core/tools/provider/builtin/devdocs/_assets/icon.svg b/api/core/tools/provider/builtin/devdocs/_assets/icon.svg
new file mode 100644
index 0000000000..c7a19fabfb
--- /dev/null
+++ b/api/core/tools/provider/builtin/devdocs/_assets/icon.svg
@@ -0,0 +1,4 @@
+
+
\ No newline at end of file
diff --git a/api/core/tools/provider/builtin/devdocs/devdocs.py b/api/core/tools/provider/builtin/devdocs/devdocs.py
new file mode 100644
index 0000000000..25cbe4d053
--- /dev/null
+++ b/api/core/tools/provider/builtin/devdocs/devdocs.py
@@ -0,0 +1,21 @@
+from core.tools.errors import ToolProviderCredentialValidationError
+from core.tools.provider.builtin.devdocs.tools.searchDevDocs import SearchDevDocsTool
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
+class DevDocsProvider(BuiltinToolProviderController):
+ def _validate_credentials(self, credentials: dict) -> None:
+ try:
+ SearchDevDocsTool().fork_tool_runtime(
+ meta={
+ "credentials": credentials,
+ }
+ ).invoke(
+ user_id='',
+ tool_parameters={
+ "doc": "python~3.12",
+ "topic": "library/code",
+ },
+ )
+ except Exception as e:
+ raise ToolProviderCredentialValidationError(str(e))
\ No newline at end of file
diff --git a/api/core/tools/provider/builtin/devdocs/devdocs.yaml b/api/core/tools/provider/builtin/devdocs/devdocs.yaml
new file mode 100644
index 0000000000..1db226fc4b
--- /dev/null
+++ b/api/core/tools/provider/builtin/devdocs/devdocs.yaml
@@ -0,0 +1,10 @@
+identity:
+ author: Richards Tu
+ name: devdocs
+ label:
+ en_US: DevDocs
+ zh_Hans: DevDocs
+ description:
+ en_US: Get official developer documentation on DevDocs.
+ zh_Hans: 从DevDocs获取官方开发者文档。
+ icon: icon.svg
diff --git a/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.py b/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.py
new file mode 100644
index 0000000000..1a244c5db3
--- /dev/null
+++ b/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.py
@@ -0,0 +1,42 @@
+from typing import Any, Union
+
+import requests
+from pydantic import BaseModel, Field
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
+class SearchDevDocsInput(BaseModel):
+ doc: str = Field(..., description="The name of the documentation.")
+ topic: str = Field(..., description="The path of the section/topic.")
+
+
+class SearchDevDocsTool(BuiltinTool):
+ def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
+ """
+ Invokes the DevDocs search tool with the given user ID and tool parameters.
+
+ Args:
+ user_id (str): The ID of the user invoking the tool.
+ tool_parameters (dict[str, Any]): The parameters for the tool, including 'doc' and 'topic'.
+
+ Returns:
+ ToolInvokeMessage | list[ToolInvokeMessage]: The result of the tool invocation, which can be a single message or a list of messages.
+ """
+ doc = tool_parameters.get('doc', '')
+ topic = tool_parameters.get('topic', '')
+
+ if not doc:
+ return self.create_text_message('Please provide the documentation name.')
+ if not topic:
+ return self.create_text_message('Please provide the topic path.')
+
+ url = f"https://documents.devdocs.io/{doc}/{topic}.html"
+ response = requests.get(url)
+
+ if response.status_code == 200:
+ content = response.text
+ return self.create_text_message(self.summary(user_id=user_id, content=content))
+ else:
+ return self.create_text_message(f"Failed to retrieve the documentation. Status code: {response.status_code}")
\ No newline at end of file
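The tool is a thin wrapper over DevDocs' document endpoint, so it can be exercised outside Dify with a plain HTTP request. A minimal sketch using the same URL scheme and the doc/topic pair from the provider's credential check (the timeout is an added precaution, not part of the tool code):
import requests

doc, topic = "python~3.12", "library/code"
url = f"https://documents.devdocs.io/{doc}/{topic}.html"
response = requests.get(url, timeout=10)

if response.status_code == 200:
    print(response.text[:200])  # raw HTML for the requested topic
else:
    print(f"Failed to retrieve the documentation. Status code: {response.status_code}")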
diff --git a/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.yaml b/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.yaml
new file mode 100644
index 0000000000..2476db9da4
--- /dev/null
+++ b/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.yaml
@@ -0,0 +1,34 @@
+identity:
+ name: searchDevDocs
+ author: Richards Tu
+ label:
+ en_US: Search Developer Docs
+ zh_Hans: 搜索开发者文档
+description:
+ human:
+ en_US: A tool for searching for a specific topic and path in DevDocs based on the provided documentation name and topic. Don't forget to add a few shots (examples) in the system prompt; for example, the documentation name should be like \"vuex~4\", \"css\", or \"python~3.12\", while the topic should be like \"guide/actions\" for Vuex 4, \"display-box\" for CSS, or \"library/code\" for Python 3.12.
+ zh_Hans: 一个用于根据提供的文档名称和主题,在DevDocs中搜索特定主题和路径的工具。不要忘记在系统提示词中添加一些示例;例如,文档名称应该是\"vuex~4\"、\"css\"或\"python~3.12\",而主题应该是\"guide/actions\"用于Vuex 4,\"display-box\"用于CSS,或\"library/code\"用于Python 3.12。
+ llm: A tool for searching for specific developer documentation in DevDocs based on the provided documentation name and topic.
+parameters:
+ - name: doc
+ type: string
+ required: true
+ label:
+ en_US: Documentation name
+ zh_Hans: 文档名称
+ human_description:
+ en_US: The name of the documentation.
+ zh_Hans: 文档名称。
+ llm_description: The name of the documentation, such as \"vuex~4\", \"css\", or \"python~3.12\". The exact value should be identified by the user.
+ form: llm
+ - name: topic
+ type: string
+ required: true
+ label:
+ en_US: Topic name
+ zh_Hans: 主题名称
+ human_description:
+ en_US: The path of the section/topic.
+ zh_Hans: 文档主题的路径。
+ llm_description: The path of the section/topic, such as \"guide/actions\" for Vuex 4, \"display-box\" for CSS, or \"library/code\" for Python 3.12.
+ form: llm
diff --git a/api/libs/login.py b/api/libs/login.py
index 5c03cfe957..14085fe603 100644
--- a/api/libs/login.py
+++ b/api/libs/login.py
@@ -53,7 +53,7 @@ def login_required(func):
def decorated_view(*args, **kwargs):
auth_header = request.headers.get('Authorization')
admin_api_key_enable = os.getenv('ADMIN_API_KEY_ENABLE', default='False')
- if admin_api_key_enable:
+ if admin_api_key_enable.lower() == 'true':
if auth_header:
if ' ' not in auth_header:
raise Unauthorized('Invalid Authorization header format. Expected \'Bearer \' format.')
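The login fix works around a classic environment-variable pitfall: os.getenv returns a string, and any non-empty string, including the default 'False', is truthy, so the old check always took the admin-API-key branch. A short illustration:
import os

os.environ.setdefault('ADMIN_API_KEY_ENABLE', 'False')
admin_api_key_enable = os.getenv('ADMIN_API_KEY_ENABLE', default='False')

print(bool(admin_api_key_enable))              # True  -- 'False' is a non-empty string
print(admin_api_key_enable.lower() == 'true')  # False -- only an explicit "true" enables the path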
diff --git a/api/services/account_service.py b/api/services/account_service.py
index 103af7f79c..7fc61e40e3 100644
--- a/api/services/account_service.py
+++ b/api/services/account_service.py
@@ -435,11 +435,13 @@ class RegisterService:
if open_id is not None or provider is not None:
AccountService.link_account_integrate(provider, open_id, account)
+ if current_app.config['EDITION'] != 'SELF_HOSTED':
+ tenant = TenantService.create_tenant(f"{account.name}'s Workspace")
- tenant = TenantService.create_tenant(f"{account.name}'s Workspace")
+ TenantService.create_tenant_member(tenant, account, role='owner')
+ account.current_tenant = tenant
- TenantService.create_tenant_member(tenant, account, role='owner')
- account.current_tenant = tenant
+ tenant_was_created.send(tenant)
db.session.commit()
except Exception as e:
@@ -447,8 +449,6 @@ class RegisterService:
logging.error(f'Register failed: {e}')
raise AccountRegisterError(f'Registration failed: {e}') from e
- tenant_was_created.send(tenant)
-
return account
@classmethod
@@ -461,7 +461,6 @@ class RegisterService:
name = email.split('@')[0]
account = cls.register(email=email, name=name, language=language, status=AccountStatus.PENDING)
-
# Create new tenant member for invited tenant
TenantService.create_tenant_member(tenant, account, role)
TenantService.switch_tenant(account, tenant.id)