diff --git a/api/controllers/service_api/dataset/segment.py b/api/controllers/service_api/dataset/segment.py index 25ae43f2ad..233cd37df4 100644 --- a/api/controllers/service_api/dataset/segment.py +++ b/api/controllers/service_api/dataset/segment.py @@ -13,10 +13,20 @@ from core.errors.error import LLMBadRequestError, ProviderTokenNotInitError from core.model_manager import ModelManager from core.model_runtime.entities.model_entities import ModelType from extensions.ext_database import db -from fields.segment_fields import segment_fields -from models.dataset import Dataset, DocumentSegment +from fields.segment_fields import child_chunk_fields, segment_fields +from models.dataset import Dataset from services.dataset_service import DatasetService, DocumentService, SegmentService from services.entities.knowledge_entities.knowledge_entities import SegmentUpdateArgs +from services.errors.chunk import ( + ChildChunkDeleteIndexError, + ChildChunkIndexingError, +) +from services.errors.chunk import ( + ChildChunkDeleteIndexError as ChildChunkDeleteIndexServiceError, +) +from services.errors.chunk import ( + ChildChunkIndexingError as ChildChunkIndexingServiceError, +) class SegmentApi(DatasetApiResource): @@ -70,7 +80,7 @@ class SegmentApi(DatasetApiResource): return {"error": "Segments is required"}, 400 def get(self, tenant_id, dataset_id, document_id): - """Create single segment.""" + """Get segments.""" # check dataset dataset_id = str(dataset_id) tenant_id = str(tenant_id) @@ -104,21 +114,13 @@ class SegmentApi(DatasetApiResource): parser.add_argument("keyword", type=str, default=None, location="args") args = parser.parse_args() - status_list = args["status"] - keyword = args["keyword"] - - query = DocumentSegment.query.filter( - DocumentSegment.document_id == str(document_id), DocumentSegment.tenant_id == current_user.current_tenant_id + segments, total = SegmentService.get_segments( + document_id=document_id, + tenant_id=current_user.current_tenant_id, + 
status_list=args["status"], + keyword=args["keyword"], ) - if status_list: - query = query.filter(DocumentSegment.status.in_(status_list)) - - if keyword: - query = query.where(DocumentSegment.content.ilike(f"%{keyword}%")) - - total = query.count() - segments = query.order_by(DocumentSegment.position).all() return {"data": marshal(segments, segment_fields), "doc_form": document.doc_form, "total": total}, 200 @@ -138,9 +140,8 @@ class DatasetSegmentApi(DatasetApiResource): if not document: raise NotFound("Document not found.") # check segment - segment = DocumentSegment.query.filter( - DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_user.current_tenant_id - ).first() + segment_id = str(segment_id) + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) if not segment: raise NotFound("Segment not found.") SegmentService.delete_segment(segment, document, dataset) @@ -179,9 +180,7 @@ class DatasetSegmentApi(DatasetApiResource): raise ProviderNotInitializeError(ex.description) # check segment segment_id = str(segment_id) - segment = DocumentSegment.query.filter( - DocumentSegment.id == str(segment_id), DocumentSegment.tenant_id == current_user.current_tenant_id - ).first() + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) if not segment: raise NotFound("Segment not found.") @@ -190,12 +189,198 @@ class DatasetSegmentApi(DatasetApiResource): parser.add_argument("segment", type=dict, required=False, nullable=True, location="json") args = parser.parse_args() - SegmentService.segment_create_args_validate(args["segment"], document) - segment = SegmentService.update_segment(SegmentUpdateArgs(**args["segment"]), segment, document, dataset) - return {"data": marshal(segment, segment_fields), "doc_form": document.doc_form}, 200 + updated_segment = SegmentService.update_segment( + SegmentUpdateArgs(**args["segment"]), segment, document, 
dataset + ) + return {"data": marshal(updated_segment, segment_fields), "doc_form": document.doc_form}, 200 + + +class ChildChunkApi(DatasetApiResource): + """Resource for child chunks.""" + + @cloud_edition_billing_resource_check("vector_space", "dataset") + @cloud_edition_billing_knowledge_limit_check("add_segment", "dataset") + def post(self, tenant_id, dataset_id, document_id, segment_id): + """Create child chunk.""" + # check dataset + dataset_id = str(dataset_id) + tenant_id = str(tenant_id) + dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() + if not dataset: + raise NotFound("Dataset not found.") + + # check document + document_id = str(document_id) + document = DocumentService.get_document(dataset.id, document_id) + if not document: + raise NotFound("Document not found.") + + # check segment + segment_id = str(segment_id) + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) + if not segment: + raise NotFound("Segment not found.") + + # check embedding model setting + if dataset.indexing_technique == "high_quality": + try: + model_manager = ModelManager() + model_manager.get_model_instance( + tenant_id=current_user.current_tenant_id, + provider=dataset.embedding_model_provider, + model_type=ModelType.TEXT_EMBEDDING, + model=dataset.embedding_model, + ) + except LLMBadRequestError: + raise ProviderNotInitializeError( + "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider." 
+ ) + except ProviderTokenNotInitError as ex: + raise ProviderNotInitializeError(ex.description) + + # validate args + parser = reqparse.RequestParser() + parser.add_argument("content", type=str, required=True, nullable=False, location="json") + args = parser.parse_args() + + try: + child_chunk = SegmentService.create_child_chunk(args.get("content"), segment, document, dataset) + except ChildChunkIndexingServiceError as e: + raise ChildChunkIndexingError(str(e)) + + return {"data": marshal(child_chunk, child_chunk_fields)}, 200 + + def get(self, tenant_id, dataset_id, document_id, segment_id): + """Get child chunks.""" + # check dataset + dataset_id = str(dataset_id) + tenant_id = str(tenant_id) + dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() + if not dataset: + raise NotFound("Dataset not found.") + + # check document + document_id = str(document_id) + document = DocumentService.get_document(dataset.id, document_id) + if not document: + raise NotFound("Document not found.") + + # check segment + segment_id = str(segment_id) + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) + if not segment: + raise NotFound("Segment not found.") + + parser = reqparse.RequestParser() + parser.add_argument("limit", type=int, default=20, location="args") + parser.add_argument("keyword", type=str, default=None, location="args") + parser.add_argument("page", type=int, default=1, location="args") + args = parser.parse_args() + + page = args["page"] + limit = min(args["limit"], 100) + keyword = args["keyword"] + + child_chunks = SegmentService.get_child_chunks(segment_id, document_id, dataset_id, page, limit, keyword) + + return { + "data": marshal(child_chunks.items, child_chunk_fields), + "total": child_chunks.total, + "total_pages": child_chunks.pages, + "page": page, + "limit": limit, + }, 200 + + +class DatasetChildChunkApi(DatasetApiResource): + """Resource for 
updating child chunks.""" + + def delete(self, tenant_id, dataset_id, document_id, segment_id, child_chunk_id): + """Delete child chunk.""" + # check dataset + dataset_id = str(dataset_id) + tenant_id = str(tenant_id) + dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() + if not dataset: + raise NotFound("Dataset not found.") + + # check document + document_id = str(document_id) + document = DocumentService.get_document(dataset.id, document_id) + if not document: + raise NotFound("Document not found.") + + # check segment + segment_id = str(segment_id) + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) + if not segment: + raise NotFound("Segment not found.") + + # check child chunk + child_chunk_id = str(child_chunk_id) + child_chunk = SegmentService.get_child_chunk_by_id( + child_chunk_id=child_chunk_id, tenant_id=current_user.current_tenant_id + ) + if not child_chunk: + raise NotFound("Child chunk not found.") + + try: + SegmentService.delete_child_chunk(child_chunk, dataset) + except ChildChunkDeleteIndexServiceError as e: + raise ChildChunkDeleteIndexError(str(e)) + + return {"result": "success"}, 200 + + @cloud_edition_billing_resource_check("vector_space", "dataset") + def patch(self, tenant_id, dataset_id, document_id, segment_id, child_chunk_id): + """Update child chunk.""" + # check dataset + dataset_id = str(dataset_id) + tenant_id = str(tenant_id) + dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() + if not dataset: + raise NotFound("Dataset not found.") + + # get document + document = DocumentService.get_document(dataset_id, document_id) + if not document: + raise NotFound("Document not found.") + + # get segment + segment = SegmentService.get_segment_by_id(segment_id=segment_id, tenant_id=current_user.current_tenant_id) + if not segment: + raise NotFound("Segment not found.") + 
+ # get child chunk + child_chunk = SegmentService.get_child_chunk_by_id( + child_chunk_id=child_chunk_id, tenant_id=current_user.current_tenant_id + ) + if not child_chunk: + raise NotFound("Child chunk not found.") + + # validate args + parser = reqparse.RequestParser() + parser.add_argument("content", type=str, required=True, nullable=False, location="json") + args = parser.parse_args() + + try: + child_chunk = SegmentService.update_child_chunk( + args.get("content"), child_chunk, segment, document, dataset + ) + except ChildChunkIndexingServiceError as e: + raise ChildChunkIndexingError(str(e)) + + return {"data": marshal(child_chunk, child_chunk_fields)}, 200 api.add_resource(SegmentApi, "/datasets//documents//segments") api.add_resource( DatasetSegmentApi, "/datasets//documents//segments/" ) +api.add_resource( + ChildChunkApi, "/datasets//documents//segments//child_chunks" +) +api.add_resource( + DatasetChildChunkApi, + "/datasets//documents//segments//child_chunks/", +) diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index d3654a3d48..8f88cecbe6 100644 --- a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -2140,6 +2140,88 @@ class SegmentService: query = query.where(ChildChunk.content.ilike(f"%{keyword}%")) return query.paginate(page=page, per_page=limit, max_per_page=100, error_out=False) + @classmethod + def get_child_chunk_by_id(cls, child_chunk_id: str, tenant_id: str) -> Optional[ChildChunk]: + """Get a child chunk by its ID.""" + result = ChildChunk.query.filter(ChildChunk.id == child_chunk_id, ChildChunk.tenant_id == tenant_id).first() + return result if isinstance(result, ChildChunk) else None + + @classmethod + def get_segments( + cls, document_id: str, tenant_id: str, status_list: list[str] | None = None, keyword: str | None = None + ): + """Get segments for a document with optional filtering.""" + query = DocumentSegment.query.filter( + DocumentSegment.document_id == document_id, 
DocumentSegment.tenant_id == tenant_id + ) + + if status_list: + query = query.filter(DocumentSegment.status.in_(status_list)) + + if keyword: + query = query.filter(DocumentSegment.content.ilike(f"%{keyword}%")) + + segments = query.order_by(DocumentSegment.position.asc()).all() + total = len(segments) + + return segments, total + + @classmethod + def update_segment_by_id( + cls, tenant_id: str, dataset_id: str, document_id: str, segment_id: str, segment_data: dict, user_id: str + ) -> tuple[DocumentSegment, Document]: + """Update a segment by its ID with validation and checks.""" + # check dataset + dataset = db.session.query(Dataset).filter(Dataset.tenant_id == tenant_id, Dataset.id == dataset_id).first() + if not dataset: + raise NotFound("Dataset not found.") + + # check user's model setting + DatasetService.check_dataset_model_setting(dataset) + + # check document + document = DocumentService.get_document(dataset_id, document_id) + if not document: + raise NotFound("Document not found.") + + # check embedding model setting if high quality + if dataset.indexing_technique == "high_quality": + try: + model_manager = ModelManager() + model_manager.get_model_instance( + tenant_id=user_id, + provider=dataset.embedding_model_provider, + model_type=ModelType.TEXT_EMBEDDING, + model=dataset.embedding_model, + ) + except LLMBadRequestError: + raise ValueError( + "No Embedding Model available. Please configure a valid provider in the Settings -> Model Provider." 
+ ) + except ProviderTokenNotInitError as ex: + raise ValueError(ex.description) + + # check segment + segment = DocumentSegment.query.filter( + DocumentSegment.id == segment_id, DocumentSegment.tenant_id == user_id + ).first() + if not segment: + raise NotFound("Segment not found.") + + # validate and update segment + cls.segment_create_args_validate(segment_data, document) + updated_segment = cls.update_segment(SegmentUpdateArgs(**segment_data), segment, document, dataset) + + return updated_segment, document + + @classmethod + def get_segment_by_id(cls, segment_id: str, tenant_id: str) -> Optional[DocumentSegment]: + """Get a segment by its ID.""" + result = DocumentSegment.query.filter( + DocumentSegment.id == segment_id, DocumentSegment.tenant_id == tenant_id + ).first() + return result if isinstance(result, DocumentSegment) else None + class DatasetCollectionBindingService: @classmethod diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx index a5f4c40ef6..1c8ac9bcbd 100644 --- a/web/app/(commonLayout)/datasets/template/template.en.mdx +++ b/web/app/(commonLayout)/datasets/template/template.en.mdx @@ -1148,6 +1148,276 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
+ + + + ### Params + + + Knowledge ID + + + Document ID + + + Segment ID + + + + ### Request Body + + + Child chunk content + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "content": "Child chunk content" + }' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "", + "segment_id": "", + "content": "Child chunk content", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + } + } + ``` + + + + +
+ + + + + ### Params + + + Knowledge ID + + + Document ID + + + Segment ID + + + + ### Query + + + Search keyword (optional) + + + Page number (optional, default: 1) + + + Items per page (optional, default: 20, max: 100) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks?page=1&limit=20' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "data": [{ + "id": "", + "segment_id": "", + "content": "Child chunk content", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + }], + "total": 1, + "total_pages": 1, + "page": 1, + "limit": 20 + } + ``` + + + + +
+ + + + + ### Params + + + Knowledge ID + + + Document ID + + + Segment ID + + + Child Chunk ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request DELETE '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "result": "success" + } + ``` + + + + +
+ + + + + ### Params + + + Knowledge ID + + + Document ID + + + Segment ID + + + Child Chunk ID + + + + ### Request Body + + + Child chunk content + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "content": "Updated child chunk content" + }' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "", + "segment_id": "", + "content": "Updated child chunk content", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + } + } + ``` + + + + +
+ -
+
\ No newline at end of file diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx index 282849f3db..3c9ab2e3ae 100644 --- a/web/app/(commonLayout)/datasets/template/template.zh.mdx +++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx @@ -1149,6 +1149,310 @@ import { Row, Col, Properties, Property, Heading, SubProperty, PropertyInstructi
+ + + + ### Path + + + 知识库 ID + + + 文档 ID + + + 分段 ID + + + + ### Request Body + + + 子分段内容 + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request POST '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "content": "子分段内容" + }' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "", + "segment_id": "", + "content": "子分段内容", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + } + } + ``` + + + + +
+ + + + + ### Path + + + 知识库 ID + + + 文档 ID + + + 分段 ID + + + + ### Query + + + 搜索关键词(选填) + + + 页码(选填,默认1) + + + 每页数量(选填,默认20,最大100) + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request GET '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks?page=1&limit=20' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "data": [{ + "id": "", + "segment_id": "", + "content": "子分段内容", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + }], + "total": 1, + "total_pages": 1, + "page": 1, + "limit": 20 + } + ``` + + + + +
+ + + + + ### Path + + + 知识库 ID + + + 文档 ID + + + 分段 ID + + + 子分段 ID + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request DELETE '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}' \ + --header 'Authorization: Bearer {api_key}' + ``` + + + ```json {{ title: 'Response' }} + { + "result": "success" + } + ``` + + + + +
+ + + + ### 错误信息 + + + 返回的错误代码 + + + + + 返回的错误状态 + + + + + 返回的错误信息 + + + + + + ```json {{ title: 'Response' }} + { + "code": "no_file_uploaded", + "message": "Please upload your file.", + "status": 400 + } + ``` + + + + +
+ + + + + ### Path + + + 知识库 ID + + + 文档 ID + + + 分段 ID + + + 子分段 ID + + + + ### Request Body + + + 子分段内容 + + + + + + ```bash {{ title: 'cURL' }} + curl --location --request PATCH '${props.apiBaseUrl}/datasets/{dataset_id}/documents/{document_id}/segments/{segment_id}/child_chunks/{child_chunk_id}' \ + --header 'Authorization: Bearer {api_key}' \ + --header 'Content-Type: application/json' \ + --data-raw '{ + "content": "更新的子分段内容" + }' + ``` + + + ```json {{ title: 'Response' }} + { + "data": { + "id": "", + "segment_id": "", + "content": "更新的子分段内容", + "word_count": 25, + "tokens": 0, + "index_node_id": "", + "index_node_hash": "", + "status": "completed", + "created_by": "", + "created_at": 1695312007, + "indexing_at": 1695312007, + "completed_at": 1695312007, + "error": null, + "stopped_at": null + } + } + ``` + + + + +
+