diff --git a/api/controllers/service_api/dataset/segment.py b/api/controllers/service_api/dataset/segment.py
index 25ae43f2ad..9cbcb15a7a 100644
--- a/api/controllers/service_api/dataset/segment.py
+++ b/api/controllers/service_api/dataset/segment.py
@@ -13,10 +13,13 @@ from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError
 from core.model_manager import ModelManager
 from core.model_runtime.entities.model_entities import ModelType
 from extensions.ext_database import db
-from fields.segment_fields import segment_fields
-from models.dataset import Dataset, DocumentSegment
+from controllers.service_api.dataset.error import ChildChunkDeleteIndexError, ChildChunkIndexingError
+from fields.segment_fields import child_chunk_fields, segment_fields
+from models.dataset import ChildChunk, Dataset, DocumentSegment
 from services.dataset_service import DatasetService, DocumentService, SegmentService
 from services.entities.knowledge_entities.knowledge_entities import SegmentUpdateArgs
+from services.errors.chunk import ChildChunkDeleteIndexError as ChildChunkDeleteIndexServiceError
+from services.errors.chunk import ChildChunkIndexingError as ChildChunkIndexingServiceError
 
 
 class SegmentApi(DatasetApiResource):
@@ -195,7 +198,210 @@ class DatasetSegmentApi(DatasetApiResource):
         return {"data": marshal(segment, segment_fields), "doc_form": document.doc_form}, 200
 
 
+class ChildChunkAddApi(DatasetApiResource):
+    """Resource for child chunks."""
+
+    @cloud_edition_billing_resource_check("vector_space", "dataset")
+    @cloud_edition_billing_knowledge_limit_check("add_segment", "dataset")
+    def post(self, tenant_id, dataset_id, document_id, segment_id):
+        """Create child chunk."""
+        # check dataset
+        dataset_id = str(dataset_id)
+        tenant_id = str(tenant_id)
+        dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first()
+        if not dataset:
+            raise NotFound("Dataset not found.")
+
+        # check document
+        document_id = str(document_id)
+        document = DocumentService.get_document(dataset.id, document_id)
+        if not document:
+            raise NotFound("Document not found.")
+
+        # check segment
+        segment_id = str(segment_id)
+        segment = DocumentSegment.query.filter(
+            DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_user.current_tenant_id
+        ).first()
+        if not segment:
+            raise NotFound("Segment not found.")
+
+        # check embedding model setting
+        if dataset.indexing_technique == "high_quality":
+            try:
+                model_manager = ModelManager()
+                model_manager.get_model_instance(
+                    tenant_id=current_user.current_tenant_id,
+                    provider=dataset.embedding_model_provider,
+                    model_type=ModelType.TEXT_EMBEDDING,
+                    model=dataset.embedding_model,
+                )
+            except LLMBadRequestError:
+                raise ProviderNotInitializeError(
+                    "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider."
+                )
+            except ProviderTokenNotInitError as ex:
+                raise ProviderNotInitializeError(ex.description)
+
+        # validate args
+        parser = reqparse.RequestParser()
+        parser.add_argument("content", type=str, required=True, nullable=False, location="json")
+        args = parser.parse_args()
+
+        try:
+            child_chunk = SegmentService.create_child_chunk(args.get("content"), segment, document, dataset)
+        except ChildChunkIndexingServiceError as e:
+            raise ChildChunkIndexingError(str(e))
+
+        return {"data": marshal(child_chunk, child_chunk_fields)}, 200
+
+    def get(self, tenant_id, dataset_id, document_id, segment_id):
+        """Get child chunks."""
+        # check dataset
+        dataset_id = str(dataset_id)
+        tenant_id = str(tenant_id)
+        dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first()
+        if not dataset:
+            raise NotFound("Dataset not found.")
+
+        # check document
+        document_id = str(document_id)
+        document = DocumentService.get_document(dataset.id, document_id)
+        if not document:
+            raise NotFound("Document not found.")
+
+        # check segment
+        segment_id = str(segment_id)
+        segment = DocumentSegment.query.filter(
+            DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_user.current_tenant_id
+        ).first()
+        if not segment:
+            raise NotFound("Segment not found.")
+
+        parser = reqparse.RequestParser()
+        parser.add_argument("limit", type=int, default=20, location="args")
+        parser.add_argument("keyword", type=str, default=None, location="args")
+        parser.add_argument("page", type=int, default=1, location="args")
+        args = parser.parse_args()
+
+        page = args["page"]
+        limit = min(args["limit"], 100)
+        keyword = args["keyword"]
+
+        child_chunks = SegmentService.get_child_chunks(segment_id, document_id, dataset_id, page, limit, keyword)
+
+        return {
+            "data": marshal(child_chunks.items, child_chunk_fields),
+            "total": child_chunks.total,
+            "total_pages": child_chunks.pages,
+            "page": page,
+            "limit": limit,
+        }, 200
+
+
+class ChildChunkUpdateApi(DatasetApiResource):
+    """Resource for updating child chunks."""
+
+    def delete(self, tenant_id, dataset_id, document_id, segment_id, child_chunk_id):
+        """Delete child chunk."""
+        # check dataset
+        dataset_id = str(dataset_id)
+        tenant_id = str(tenant_id)
+        dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first()
+        if not dataset:
+            raise NotFound("Dataset not found.")
+
+        # check document
+        document_id = str(document_id)
+        document = DocumentService.get_document(dataset.id, document_id)
+        if not document:
+            raise NotFound("Document not found.")
+
+        # check segment
+        segment_id = str(segment_id)
+        segment = DocumentSegment.query.filter(
+            DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_user.current_tenant_id
+        ).first()
+        if not segment:
+            raise NotFound("Segment not found.")
+
+        # check child chunk
+        child_chunk_id = str(child_chunk_id)
+        child_chunk = ChildChunk.query.filter(
+            ChildChunk.id == str(child_chunk_id), ChildChunk.tenant_id == current_user.current_tenant_id
+        ).first()
+        if not child_chunk:
+            raise NotFound("Child chunk not found.")
+
+        try:
+            SegmentService.delete_child_chunk(child_chunk, dataset)
+        except ChildChunkDeleteIndexServiceError as e:
+            raise ChildChunkDeleteIndexError(str(e))
+
+        return {"result": "success"}, 200
+
+    @cloud_edition_billing_resource_check("vector_space", "dataset")
+    def patch(self, tenant_id, dataset_id, document_id, segment_id, child_chunk_id):
+        """Update child chunk."""
+        # check dataset
+        dataset_id = str(dataset_id)
+        tenant_id = str(tenant_id)
+        dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first()
+        if not dataset:
+            raise NotFound("Dataset not found.")
+
+        # check document
+        document_id = str(document_id)
+        document = DocumentService.get_document(dataset.id, document_id)
+        if not document:
+            raise NotFound("Document not found.")
+
+        # check segment
+        segment_id = str(segment_id)
+        segment = DocumentSegment.query.filter(
+            DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_user.current_tenant_id
+        ).first()
+        if not segment:
+            raise NotFound("Segment not found.")
+
+        # check child chunk
+        child_chunk_id = str(child_chunk_id)
+        child_chunk = ChildChunk.query.filter(
+            ChildChunk.id == str(child_chunk_id), ChildChunk.tenant_id == current_user.current_tenant_id
+        ).first()
+        if not child_chunk:
+            raise NotFound("Child chunk not found.")
+
+        # validate args
+        parser = reqparse.RequestParser()
+        parser.add_argument("content", type=str, required=True, nullable=False, location="json")
+        args = parser.parse_args()
+
+        try:
+            child_chunk = SegmentService.update_child_chunk(
+                args.get("content"), child_chunk, segment, document, dataset
+            )
+        except ChildChunkIndexingServiceError as e:
+            raise ChildChunkIndexingError(str(e))
+
+        return {"data": marshal(child_chunk, child_chunk_fields)}, 200
+
+
 api.add_resource(SegmentApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments")
 api.add_resource(
     DatasetSegmentApi, "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments/<uuid:segment_id>"
 )
+api.add_resource(
+    ChildChunkAddApi,
+    "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments/<uuid:segment_id>/child_chunks"
+)
+api.add_resource(
+    ChildChunkUpdateApi,
+    "/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments/<uuid:segment_id>/child_chunks/<uuid:child_chunk_id>"
+)
diff --git a/move_section.js b/move_section.js
new file mode 100644
index 0000000000..ac20869fd4
--- /dev/null
+++ b/move_section.js
@@ -0,0 +1 @@
+const fs = require("fs"); const path = require("path"); const filePath = path.join(process.cwd(), "web/app/(commonLayout)/datasets/template/template.zh.mdx"); let content = fs.readFileSync(filePath, "utf8"); const lines = content.split("\n"); const retrievalSection = lines.slice(1234, 1335).join("\n"); content = lines.slice(0, 1234).concat(lines.slice(1335)).join("\n"); content += "\n\n" + retrievalSection; fs.writeFileSync(filePath, content);
diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx
index db72ef9a08..69cf9552b1 100644
--- a/web/app/(commonLayout)/datasets/template/template.zh.mdx
+++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx
@@ -881,182 +881,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
-<Heading
-  url='/datasets/{dataset_id}/documents/{batch}/indexing-status'
-  method='GET'
-  title='获取文档嵌入状态(进度)'
-  name='#indexing_status'
-/>
-<Row>
-  <Col>
-    ### Path
-    <Properties>
-      <Property name='dataset_id' type='string' key='dataset_id'>
-        知识库 ID
-      </Property>
-      <Property name='batch' type='string' key='batch'>
-        上传文档的批次号
-      </Property>
-    </Properties>
-  </Col>
-  <Col sticky>
-    ```bash {{ title: 'cURL' }}
-    curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{batch}/indexing-status' \
-    --header 'Authorization: Bearer {api_key}' \
-    ```
-    ```json {{ title: 'Response' }}
-    {
-      "data": [{
-        "id": "",
-        "indexing_status": "indexing",
-        "processing_started_at": 1681623462.0,
-        "parsing_completed_at": 1681623462.0,
-        "cleaning_completed_at": 1681623462.0,
-        "splitting_completed_at": 1681623462.0,
-        "completed_at": null,
-        "paused_at": null,
-        "error": null,
-        "stopped_at": null,
-        "completed_segments": 24,
-        "total_segments": 100
-      }]
-    }
-    ```
-  </Col>
-</Row>
-<hr className='ml-0 mr-0' />
-<Heading
-  url='/datasets/{dataset_id}/documents/{document_id}'
-  method='DELETE'
-  title='删除文档'
-  name='#delete_document'
-/>
-<Row>
-  <Col>
-    ### Path
-    <Properties>
-      <Property name='dataset_id' type='string' key='dataset_id'>
-        知识库 ID
-      </Property>
-      <Property name='document_id' type='string' key='document_id'>
-        文档 ID
-      </Property>
-    </Properties>
-  </Col>
-  <Col sticky>
-    ```bash {{ title: 'cURL' }}
-    curl --location --request DELETE '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}' \
-    --header 'Authorization: Bearer {api_key}' \
-    ```
-    ```json {{ title: 'Response' }}
-    {
-      "result": "success"
-    }
-    ```
-  </Col>
-</Row>
-<hr className='ml-0 mr-0' />
-<Heading
-  url='/datasets/{dataset_id}/documents'
-  method='GET'
-  title='知识库文档列表'
-  name='#dataset_document_list'
-/>
-<Row>
-  <Col>
-    ### Path
-    <Properties>
-      <Property name='dataset_id' type='string' key='dataset_id'>
-        知识库 ID
-      </Property>
-    </Properties>
-    ### Query
-    <Properties>
-      <Property name='keyword' type='string' key='keyword'>
-        搜索关键词,可选,目前仅搜索文档名称
-      </Property>
-      <Property name='page' type='string' key='page'>
-        页码,可选
-      </Property>
-      <Property name='limit' type='string' key='limit'>
-        返回条数,可选,默认 20,范围 1-100
-      </Property>
-    </Properties>
-  </Col>
-  <Col sticky>
-    ```bash {{ title: 'cURL' }}
-    curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents' \
-    --header 'Authorization: Bearer {api_key}' \
-    ```
-    ```json {{ title: 'Response' }}
-    {
-      "data": [
-        {
-          "id": "",
-          "position": 1,
-          "data_source_type": "file_upload",
-          "data_source_info": null,
-          "dataset_process_rule_id": null,
-          "name": "dify",
-          "created_from": "",
-          "created_by": "",
-          "created_at": 1681623639,
-          "tokens": 0,
-          "indexing_status": "waiting",
-          "error": null,
-          "enabled": true,
-          "disabled_at": null,
-          "disabled_by": null,
-          "archived": false
-        },
-      ],
-      "has_more": false,
-      "limit": 20,
-      "total": 9,
-      "page": 1
-    }
-    ```
-  </Col>
-</Row>
-<hr className='ml-0 mr-0' />
-<hr className='ml-0 mr-0' />
@@ -1351,6 +1179,311 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
+<hr className='ml-0 mr-0' />
+
+<Heading
+  url='/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks'
+  method='POST'
+  title='新增文档子分段'
+  name='#create_child_chunk'
+/>
+<Row>
+  <Col>
+    ### Path
+    <Properties>
+      <Property name='dataset_id' type='string' key='dataset_id'>
+        知识库 ID
+      </Property>
+      <Property name='document_id' type='string' key='document_id'>
+        文档 ID
+      </Property>
+      <Property name='segment_id' type='string' key='segment_id'>
+        分段 ID
+      </Property>
+    </Properties>
+    ### Request Body
+    <Properties>
+      <Property name='content' type='string' key='content'>
+        子分段内容
+      </Property>
+    </Properties>
+  </Col>
+  <Col sticky>
+    ```bash {{ title: 'cURL' }}
+    curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks' \
+    --header 'Authorization: Bearer {api_key}' \
+    --header 'Content-Type: application/json' \
+    --data-raw '{
+        "content": "子分段内容"
+    }'
+    ```
+    ```json {{ title: 'Response' }}
+    {
+      "data": {
+        "id": "",
+        "segment_id": "",
+        "content": "子分段内容",
+        "word_count": 25,
+        "tokens": 0,
+        "index_node_id": "",
+        "index_node_hash": "",
+        "status": "completed",
+        "created_by": "",
+        "created_at": 1695312007,
+        "indexing_at": 1695312007,
+        "completed_at": 1695312007,
+        "error": null,
+        "stopped_at": null
+      }
+    }
+    ```
+  </Col>
+</Row>
+<hr className='ml-0 mr-0' />
+<Heading
+  url='/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks'
+  method='GET'
+  title='查询文档子分段'
+  name='#get_child_chunks'
+/>
+<Row>
+  <Col>
+    ### Path
+    <Properties>
+      <Property name='dataset_id' type='string' key='dataset_id'>
+        知识库 ID
+      </Property>
+      <Property name='document_id' type='string' key='document_id'>
+        文档 ID
+      </Property>
+      <Property name='segment_id' type='string' key='segment_id'>
+        分段 ID
+      </Property>
+    </Properties>
+    ### Query
+    <Properties>
+      <Property name='keyword' type='string' key='keyword'>
+        搜索关键词(选填)
+      </Property>
+      <Property name='page' type='string' key='page'>
+        页码(选填,默认1)
+      </Property>
+      <Property name='limit' type='string' key='limit'>
+        每页数量(选填,默认20,最大100)
+      </Property>
+    </Properties>
+  </Col>
+  <Col sticky>
+    ```bash {{ title: 'cURL' }}
+    curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks?page=1&limit=20' \
+    --header 'Authorization: Bearer {api_key}'
+    ```
+    ```json {{ title: 'Response' }}
+    {
+      "data": [{
+        "id": "",
+        "segment_id": "",
+        "content": "子分段内容",
+        "word_count": 25,
+        "tokens": 0,
+        "index_node_id": "",
+        "index_node_hash": "",
+        "status": "completed",
+        "created_by": "",
+        "created_at": 1695312007,
+        "indexing_at": 1695312007,
+        "completed_at": 1695312007,
+        "error": null,
+        "stopped_at": null
+      }],
+      "total": 1,
+      "total_pages": 1,
+      "page": 1,
+      "limit": 20
+    }
+    ```
+  </Col>
+</Row>
+<hr className='ml-0 mr-0' />
+<Heading
+  url='/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}'
+  method='DELETE'
+  title='删除文档子分段'
+  name='#delete_child_chunk'
+/>
+<Row>
+  <Col>
+    ### Path
+    <Properties>
+      <Property name='dataset_id' type='string' key='dataset_id'>
+        知识库 ID
+      </Property>
+      <Property name='document_id' type='string' key='document_id'>
+        文档 ID
+      </Property>
+      <Property name='segment_id' type='string' key='segment_id'>
+        分段 ID
+      </Property>
+      <Property name='child_chunk_id' type='string' key='child_chunk_id'>
+        子分段 ID
+      </Property>
+    </Properties>
+  </Col>
+  <Col sticky>
+    ```bash {{ title: 'cURL' }}
+    curl --location --request DELETE '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}' \
+    --header 'Authorization: Bearer {api_key}'
+    ```
+    ```json {{ title: 'Response' }}
+    {
+      "result": "success"
+    }
+    ```
+  </Col>
+</Row>
+<hr className='ml-0 mr-0' />
+<Row>
+  <Col>
+    ### 错误信息
+    <Properties>
+      <Property name='code' type='string' key='code'>
+        返回的错误代码
+      </Property>
+    </Properties>
+    <Properties>
+      <Property name='status' type='number' key='status'>
+        返回的错误状态
+      </Property>
+    </Properties>
+    <Properties>
+      <Property name='message' type='string' key='message'>
+        返回的错误信息
+      </Property>
+    </Properties>
+  </Col>
+  <Col sticky>
+    ```json {{ title: 'Response' }}
+    {
+      "code": "no_file_uploaded",
+      "message": "Please upload your file.",
+      "status": 400
+    }
+    ```
+  </Col>
+</Row>
+<hr className='ml-0 mr-0' />
+<Heading
+  url='/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}'
+  method='PATCH'
+  title='更新文档子分段'
+  name='#update_child_chunk'
+/>
+<Row>
+  <Col>
+    ### Path
+    <Properties>
+      <Property name='dataset_id' type='string' key='dataset_id'>
+        知识库 ID
+      </Property>
+      <Property name='document_id' type='string' key='document_id'>
+        文档 ID
+      </Property>
+      <Property name='segment_id' type='string' key='segment_id'>
+        分段 ID
+      </Property>
+      <Property name='child_chunk_id' type='string' key='child_chunk_id'>
+        子分段 ID
+      </Property>
+    </Properties>
+    ### Request Body
+    <Properties>
+      <Property name='content' type='string' key='content'>
+        子分段内容
+      </Property>
+    </Properties>
+  </Col>
+  <Col sticky>
+    ```bash {{ title: 'cURL' }}
+    curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}' \
+    --header 'Authorization: Bearer {api_key}' \
+    --header 'Content-Type: application/json' \
+    --data-raw '{
+        "content": "更新的子分段内容"
+    }'
+    ```
+    ```json {{ title: 'Response' }}
+    {
+      "data": {
+        "id": "",
+        "segment_id": "",
+        "content": "更新的子分段内容",
+        "word_count": 25,
+        "tokens": 0,
+        "index_node_id": "",
+        "index_node_hash": "",
+        "status": "completed",
+        "created_by": "",
+        "created_at": 1695312007,
+        "indexing_at": 1695312007,
+        "completed_at": 1695312007,
+        "error": null,
+        "stopped_at": null
+      }
+    }
+    ```
+  </Col>
+</Row>
-
-<Row>
-  <Col>
-    ### 错误信息
-    <Properties>
-      <Property name='code' type='string' key='code'>
-        返回的错误代码
-      </Property>
-    </Properties>
-    <Properties>
-      <Property name='status' type='number' key='status'>
-        返回的错误状态
-      </Property>
-    </Properties>
-    <Properties>
-      <Property name='message' type='string' key='message'>
-        返回的错误信息
-      </Property>
-    </Properties>
-  </Col>
-  <Col sticky>
-    ```json {{ title: 'Response' }}
-    {
-      "code": "no_file_uploaded",
-      "message": "Please upload your file.",
-      "status": 400
-    }
-    ```
-  </Col>
-</Row>
@@ -1652,4 +1753,4 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
-
+
\ No newline at end of file