diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e1c087a6cd..992126551c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -155,4 +155,4 @@ And that's it! Once your PR is merged, you will be featured as a contributor in ## Getting Help -If you ever get stuck or got a burning question while contributing, simply shoot your queries our way via the related GitHub issue, or hop onto our [Discord](https://discord.gg/AhzKf7dNgk) for a quick chat. +If you ever get stuck or got a burning question while contributing, simply shoot your queries our way via the related GitHub issue, or hop onto our [Discord](https://discord.gg/8Tpq4AcN9c) for a quick chat. diff --git a/CONTRIBUTING_CN.md b/CONTRIBUTING_CN.md index 6adfed6b6c..08c5a0a4bd 100644 --- a/CONTRIBUTING_CN.md +++ b/CONTRIBUTING_CN.md @@ -152,4 +152,4 @@ Dify的后端使用Python编写,使用[Flask](https://flask.palletsprojects.co ## 获取帮助 -如果你在贡献过程中遇到困难或者有任何问题,可以通过相关的 GitHub 问题提出你的疑问,或者加入我们的 [Discord](https://discord.gg/AhzKf7dNgk) 进行快速交流。 +如果你在贡献过程中遇到困难或者有任何问题,可以通过相关的 GitHub 问题提出你的疑问,或者加入我们的 [Discord](https://discord.gg/8Tpq4AcN9c) 进行快速交流。 diff --git a/README.md b/README.md index 80a60e9cad..154fdd8adb 100644 --- a/README.md +++ b/README.md @@ -131,7 +131,7 @@ At the same time, please consider supporting Dify by sharing it on social media ### Translations -We are looking for contributors to help with translating Dify to languages other than Mandarin or English. If you are interested in helping, please see the [i18n README](https://github.com/langgenius/dify/blob/main/web/i18n/README.md) for more information, and leave us a comment in the `global-users` channel of our [Discord Community Server](https://discord.gg/AhzKf7dNgk). +We are looking for contributors to help with translating Dify to languages other than Mandarin or English. 
If you are interested in helping, please see the [i18n README](https://github.com/langgenius/dify/blob/main/web/i18n/README.md) for more information, and leave us a comment in the `global-users` channel of our [Discord Community Server](https://discord.gg/8Tpq4AcN9c). ## Community & Support diff --git a/api/commands.py b/api/commands.py index b82d4d5d5d..9f1dc95281 100644 --- a/api/commands.py +++ b/api/commands.py @@ -109,19 +109,20 @@ def reset_encrypt_key_pair(): click.echo(click.style('Sorry, only support SELF_HOSTED mode.', fg='red')) return - tenant = db.session.query(Tenant).first() - if not tenant: - click.echo(click.style('Sorry, no workspace found. Please enter /install to initialize.', fg='red')) - return + tenants = db.session.query(Tenant).all() + for tenant in tenants: + if not tenant: + click.echo(click.style('Sorry, no workspace found. Please enter /install to initialize.', fg='red')) + return - tenant.encrypt_public_key = generate_key_pair(tenant.id) + tenant.encrypt_public_key = generate_key_pair(tenant.id) - db.session.query(Provider).filter(Provider.provider_type == 'custom').delete() - db.session.query(ProviderModel).delete() - db.session.commit() + db.session.query(Provider).filter(Provider.provider_type == 'custom', Provider.tenant_id == tenant.id).delete() + db.session.query(ProviderModel).filter(ProviderModel.tenant_id == tenant.id).delete() + db.session.commit() - click.echo(click.style('Congratulations! ' - 'the asymmetric key pair of workspace {} has been reset.'.format(tenant.id), fg='green')) + click.echo(click.style('Congratulations! 
' + 'the asymmetric key pair of workspace {} has been reset.'.format(tenant.id), fg='green')) @click.command('vdb-migrate', help='migrate vector db.') diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index dd46aa27dc..94c7d18c55 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -1,3 +1,4 @@ +import concurrent.futures import datetime import json import logging @@ -650,17 +651,44 @@ class IndexingRunner: # chunk nodes by chunk size indexing_start_at = time.perf_counter() tokens = 0 - chunk_size = 100 + chunk_size = 10 embedding_model_type_instance = None if embedding_model_instance: embedding_model_type_instance = embedding_model_instance.model_type_instance embedding_model_type_instance = cast(TextEmbeddingModel, embedding_model_type_instance) + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + futures = [] + for i in range(0, len(documents), chunk_size): + chunk_documents = documents[i:i + chunk_size] + futures.append(executor.submit(self._process_chunk, current_app._get_current_object(), index_processor, + chunk_documents, dataset, + dataset_document, embedding_model_instance, + embedding_model_type_instance)) - for i in range(0, len(documents), chunk_size): + for future in futures: + tokens += future.result() + + indexing_end_at = time.perf_counter() + + # update document status to completed + self._update_document_index_status( + document_id=dataset_document.id, + after_indexing_status="completed", + extra_update_params={ + DatasetDocument.tokens: tokens, + DatasetDocument.completed_at: datetime.datetime.utcnow(), + DatasetDocument.indexing_latency: indexing_end_at - indexing_start_at, + } + ) + + def _process_chunk(self, flask_app, index_processor, chunk_documents, dataset, dataset_document, + embedding_model_instance, embedding_model_type_instance): + with flask_app.app_context(): # check document is paused self._check_document_paused_status(dataset_document.id) - chunk_documents = 
documents[i:i + chunk_size] + + tokens = 0 if dataset.indexing_technique == 'high_quality' or embedding_model_type_instance: tokens += sum( embedding_model_type_instance.get_num_tokens( @@ -670,9 +698,9 @@ class IndexingRunner: ) for document in chunk_documents ) + # load index index_processor.load(dataset, chunk_documents) - db.session.add(dataset) document_ids = [document.metadata['doc_id'] for document in chunk_documents] db.session.query(DocumentSegment).filter( @@ -687,18 +715,7 @@ class IndexingRunner: db.session.commit() - indexing_end_at = time.perf_counter() - - # update document status to completed - self._update_document_index_status( - document_id=dataset_document.id, - after_indexing_status="completed", - extra_update_params={ - DatasetDocument.tokens: tokens, - DatasetDocument.completed_at: datetime.datetime.utcnow(), - DatasetDocument.indexing_latency: indexing_end_at - indexing_start_at, - } - ) + return tokens def _check_document_paused_status(self, document_id: str): indexing_cache_key = 'document_{}_is_paused'.format(document_id) diff --git a/api/core/model_runtime/model_providers/bedrock/bedrock.yaml b/api/core/model_runtime/model_providers/bedrock/bedrock.yaml index 65e9e50e1b..19ce51ddcd 100644 --- a/api/core/model_runtime/model_providers/bedrock/bedrock.yaml +++ b/api/core/model_runtime/model_providers/bedrock/bedrock.yaml @@ -78,4 +78,4 @@ provider_credential_schema: placeholder: en_US: A model you have access to (e.g. amazon.titan-text-lite-v1) for validation. 
zh_Hans: 为了进行验证,请输入一个您可用的模型名称 (例如:amazon.titan-text-lite-v1) - \ No newline at end of file + diff --git a/api/core/rag/datasource/vdb/milvus/milvus_vector.py b/api/core/rag/datasource/vdb/milvus/milvus_vector.py index 203b7eff37..f62d603d8d 100644 --- a/api/core/rag/datasource/vdb/milvus/milvus_vector.py +++ b/api/core/rag/datasource/vdb/milvus/milvus_vector.py @@ -144,6 +144,16 @@ class MilvusVector(BaseVector): utility.drop_collection(self._collection_name, None, using=alias) def text_exists(self, id: str) -> bool: + alias = uuid4().hex + if self._client_config.secure: + uri = "https://" + str(self._client_config.host) + ":" + str(self._client_config.port) + else: + uri = "http://" + str(self._client_config.host) + ":" + str(self._client_config.port) + connections.connect(alias=alias, uri=uri, user=self._client_config.user, password=self._client_config.password) + + from pymilvus import utility + if not utility.has_collection(self._collection_name, using=alias): + return False result = self._client.query(collection_name=self._collection_name, filter=f'metadata["doc_id"] == "{id}"', diff --git a/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py b/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py index 6bd4b5c340..436e6b5f6a 100644 --- a/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py +++ b/api/core/rag/datasource/vdb/qdrant/qdrant_vector.py @@ -275,6 +275,13 @@ class QdrantVector(BaseVector): ) def text_exists(self, id: str) -> bool: + all_collection_name = [] + collections_response = self._client.get_collections() + collection_list = collections_response.collections + for collection in collection_list: + all_collection_name.append(collection.name) + if self._collection_name not in all_collection_name: + return False response = self._client.retrieve( collection_name=self._collection_name, ids=[id] diff --git a/api/core/rag/datasource/vdb/vector_factory.py b/api/core/rag/datasource/vdb/vector_factory.py index 27ae15a025..71fc07967c 100644 --- 
a/api/core/rag/datasource/vdb/vector_factory.py +++ b/api/core/rag/datasource/vdb/vector_factory.py @@ -128,8 +128,8 @@ class Vector: if kwargs.get('duplicate_check', False): documents = self._filter_duplicate_texts(documents) embeddings = self._embeddings.embed_documents([document.page_content for document in documents]) - self._vector_processor.add_texts( - documents=documents, + self._vector_processor.create( + texts=documents, embeddings=embeddings, **kwargs ) diff --git a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py index 6e317115b8..5d24ee9fd2 100644 --- a/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py +++ b/api/core/rag/datasource/vdb/weaviate/weaviate_vector.py @@ -134,6 +134,11 @@ class WeaviateVector(BaseVector): def text_exists(self, id: str) -> bool: collection_name = self._collection_name + schema = self._default_schema(self._collection_name) + + # check whether the index already exists + if not self._client.schema.contains(schema): + return False result = self._client.query.get(collection_name).with_additional(["id"]).with_where({ "path": ["doc_id"], "operator": "Equal", diff --git a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py index b37981a30d..34a4e85e97 100644 --- a/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_doc_extractor.py @@ -53,7 +53,7 @@ class UnstructuredWordExtractor(BaseExtractor): elements = partition_docx(filename=self._file_path) from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) documents = [] for chunk in chunks: text = chunk.text.strip() diff --git a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py 
b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py index 1d92bbbee6..f6ae8fad53 100644 --- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py @@ -43,7 +43,7 @@ class UnstructuredEmailExtractor(BaseExtractor): pass from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) documents = [] for chunk in chunks: text = chunk.text.strip() diff --git a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py index 3ac04ddc17..3d63446fef 100644 --- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py @@ -38,7 +38,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor): elements = partition_md(filename=self._file_path, api_url=self._api_url) from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) documents = [] for chunk in chunks: text = chunk.text.strip() diff --git a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py index d4b72e37eb..34d3e8021a 100644 --- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py @@ -28,7 +28,7 @@ class UnstructuredMsgExtractor(BaseExtractor): elements = partition_msg(filename=self._file_path, api_url=self._api_url) from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, 
combine_text_under_n_chars=0) + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) documents = [] for chunk in chunks: text = chunk.text.strip() diff --git a/api/core/rag/extractor/unstructured/unstructured_text_extractor.py b/api/core/rag/extractor/unstructured/unstructured_text_extractor.py index 5af21b2b1d..cc67f2b866 100644 --- a/api/core/rag/extractor/unstructured/unstructured_text_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_text_extractor.py @@ -28,7 +28,7 @@ class UnstructuredTextExtractor(BaseExtractor): elements = partition_text(filename=self._file_path, api_url=self._api_url) from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) documents = [] for chunk in chunks: text = chunk.text.strip() diff --git a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py index b08ff63a1c..5600fb075d 100644 --- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py @@ -28,7 +28,7 @@ class UnstructuredXmlExtractor(BaseExtractor): elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url) from unstructured.chunking.title import chunk_by_title - chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0) + chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000) documents = [] for chunk in chunks: text = chunk.text.strip() diff --git a/api/core/tools/provider/builtin/devdocs/_assets/icon.svg b/api/core/tools/provider/builtin/devdocs/_assets/icon.svg new file mode 100644 index 0000000000..c7a19fabfb --- /dev/null +++ b/api/core/tools/provider/builtin/devdocs/_assets/icon.svg @@ -0,0 +1,4 @@ 
+ + + + \ No newline at end of file diff --git a/api/core/tools/provider/builtin/devdocs/devdocs.py b/api/core/tools/provider/builtin/devdocs/devdocs.py new file mode 100644 index 0000000000..25cbe4d053 --- /dev/null +++ b/api/core/tools/provider/builtin/devdocs/devdocs.py @@ -0,0 +1,21 @@ +from core.tools.errors import ToolProviderCredentialValidationError +from core.tools.provider.builtin.devdocs.tools.searchDevDocs import SearchDevDocsTool +from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController + + +class DevDocsProvider(BuiltinToolProviderController): + def _validate_credentials(self, credentials: dict) -> None: + try: + SearchDevDocsTool().fork_tool_runtime( + meta={ + "credentials": credentials, + } + ).invoke( + user_id='', + tool_parameters={ + "doc": "python~3.12", + "topic": "library/code", + }, + ) + except Exception as e: + raise ToolProviderCredentialValidationError(str(e)) \ No newline at end of file diff --git a/api/core/tools/provider/builtin/devdocs/devdocs.yaml b/api/core/tools/provider/builtin/devdocs/devdocs.yaml new file mode 100644 index 0000000000..1db226fc4b --- /dev/null +++ b/api/core/tools/provider/builtin/devdocs/devdocs.yaml @@ -0,0 +1,10 @@ +identity: + author: Richards Tu + name: devdocs + label: + en_US: DevDocs + zh_Hans: DevDocs + description: + en_US: Get official developer documentations on DevDocs. 
+ zh_Hans: 从DevDocs获取官方开发者文档。 + icon: icon.svg diff --git a/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.py b/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.py new file mode 100644 index 0000000000..1a244c5db3 --- /dev/null +++ b/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.py @@ -0,0 +1,42 @@ +from typing import Any, Union + +import requests +from pydantic import BaseModel, Field + +from core.tools.entities.tool_entities import ToolInvokeMessage +from core.tools.tool.builtin_tool import BuiltinTool + + +class SearchDevDocsInput(BaseModel): + doc: str = Field(..., description="The name of the documentation.") + topic: str = Field(..., description="The path of the section/topic.") + + +class SearchDevDocsTool(BuiltinTool): + def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]: + """ + Invokes the DevDocs search tool with the given user ID and tool parameters. + + Args: + user_id (str): The ID of the user invoking the tool. + tool_parameters (dict[str, Any]): The parameters for the tool, including 'doc' and 'topic'. + + Returns: + ToolInvokeMessage | list[ToolInvokeMessage]: The result of the tool invocation, which can be a single message or a list of messages. + """ + doc = tool_parameters.get('doc', '') + topic = tool_parameters.get('topic', '') + + if not doc: + return self.create_text_message('Please provide the documentation name.') + if not topic: + return self.create_text_message('Please provide the topic path.') + + url = f"https://documents.devdocs.io/{doc}/{topic}.html" + response = requests.get(url) + + if response.status_code == 200: + content = response.text + return self.create_text_message(self.summary(user_id=user_id, content=content)) + else: + return self.create_text_message(f"Failed to retrieve the documentation. 
Status code: {response.status_code}") \ No newline at end of file diff --git a/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.yaml b/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.yaml new file mode 100644 index 0000000000..2476db9da4 --- /dev/null +++ b/api/core/tools/provider/builtin/devdocs/tools/searchDevDocs.yaml @@ -0,0 +1,34 @@ +identity: + name: searchDevDocs + author: Richards Tu + label: + en_US: Search Developer Docs + zh_Hans: 搜索开发者文档 +description: + human: + en_US: A tool for searching for a specific topic and path in DevDocs based on the provided documentation name and topic. Don't forget to add some shots in the system prompt; for example, the documentation name should be like \"vuex~4\", \"css\", or \"python~3.12\", while the topic should be like \"guide/actions\" for Vuex 4, \"display-box\" for CSS, or \"library/code\" for Python 3.12. + zh_Hans: 一个用于根据提供的文档名称和主题,在DevDocs中搜索特定主题和路径的工具。不要忘记在系统提示词中添加一些示例;例如,文档名称应该是\"vuex~4\"、\"css\"或\"python~3.12\",而主题应该是\"guide/actions\"用于Vuex 4,\"display-box\"用于CSS,或\"library/code\"用于Python 3.12。 + llm: A tool for searching for specific developer documentation in DevDocs based on the provided documentation name and topic. +parameters: + - name: doc + type: string + required: true + label: + en_US: Documentation name + zh_Hans: 文档名称 + human_description: + en_US: The name of the documentation. + zh_Hans: 文档名称。 + llm_description: The name of the documentation, such as \"vuex~4\", \"css\", or \"python~3.12\". The exact value should be identified by the user. + form: llm + - name: topic + type: string + required: true + label: + en_US: Topic name + zh_Hans: 主题名称 + human_description: + en_US: The path of the section/topic. + zh_Hans: 文档主题的路径。 + llm_description: The path of the section/topic, such as \"guide/actions\" for Vuex 4, \"display-box\" for CSS, or \"library/code\" for Python 3.12. 
+ form: llm diff --git a/api/libs/login.py b/api/libs/login.py index 5c03cfe957..14085fe603 100644 --- a/api/libs/login.py +++ b/api/libs/login.py @@ -53,7 +53,7 @@ def login_required(func): def decorated_view(*args, **kwargs): auth_header = request.headers.get('Authorization') admin_api_key_enable = os.getenv('ADMIN_API_KEY_ENABLE', default='False') - if admin_api_key_enable: + if admin_api_key_enable.lower() == 'true': if auth_header: if ' ' not in auth_header: raise Unauthorized('Invalid Authorization header format. Expected \'Bearer \' format.') diff --git a/api/services/account_service.py b/api/services/account_service.py index 103af7f79c..7fc61e40e3 100644 --- a/api/services/account_service.py +++ b/api/services/account_service.py @@ -435,11 +435,13 @@ class RegisterService: if open_id is not None or provider is not None: AccountService.link_account_integrate(provider, open_id, account) + if current_app.config['EDITION'] != 'SELF_HOSTED': + tenant = TenantService.create_tenant(f"{account.name}'s Workspace") - tenant = TenantService.create_tenant(f"{account.name}'s Workspace") + TenantService.create_tenant_member(tenant, account, role='owner') + account.current_tenant = tenant - TenantService.create_tenant_member(tenant, account, role='owner') - account.current_tenant = tenant + tenant_was_created.send(tenant) db.session.commit() except Exception as e: @@ -447,8 +449,6 @@ class RegisterService: logging.error(f'Register failed: {e}') raise AccountRegisterError(f'Registration failed: {e}') from e - tenant_was_created.send(tenant) - return account @classmethod @@ -461,7 +461,6 @@ class RegisterService: name = email.split('@')[0] account = cls.register(email=email, name=name, language=language, status=AccountStatus.PENDING) - # Create new tenant member for invited tenant TenantService.create_tenant_member(tenant, account, role) TenantService.switch_tenant(account, tenant.id)