From 09d759d196e02d632d785be7110e796cb3d647c4 Mon Sep 17 00:00:00 2001 From: Wu Tianwei <30284043+WTW0313@users.noreply.github.com> Date: Thu, 2 Jan 2025 16:07:21 +0800 Subject: [PATCH] fix: Fix parent child retrieval issues (#12206) Co-authored-by: NFish Co-authored-by: nite-knite --- .../datasets/template/template.en.mdx | 35 +++++- .../datasets/template/template.zh.mdx | 39 +++++- .../params-config/config-content.tsx | 119 ++++++++---------- .../dataset-config/params-config/index.tsx | 37 +++--- .../dataset-config/settings-modal/index.tsx | 20 +-- .../components/app/configuration/index.tsx | 6 +- .../datasets/common/check-rerank-model.ts | 15 +-- .../index.tsx | 5 +- .../common/retrieval-method-config/index.tsx | 116 ++++++++++------- .../common/retrieval-param-config/index.tsx | 102 +++++++-------- .../create/embedding-process/index.tsx | 3 + .../datasets/create/step-two/index.tsx | 104 +++++++-------- .../datasets/create/step-two/option-card.tsx | 4 +- .../documents/detail/completed/index.tsx | 24 ++-- .../detail/completed/segment-list.tsx | 2 +- .../datasets/documents/detail/index.tsx | 18 ++- .../components/datasets/documents/index.tsx | 18 ++- .../components/datasets/documents/list.tsx | 38 +++--- .../components/child-chunks-item.tsx | 2 +- .../components/chunk-detail-modal.tsx | 2 +- .../hit-testing/components/result-item.tsx | 9 +- .../datasets/hit-testing/components/score.tsx | 8 +- .../components/datasets/hit-testing/index.tsx | 2 +- .../hit-testing/modify-retrieval-modal.tsx | 16 +-- .../datasets/settings/form/index.tsx | 26 ++-- .../model-selector/model-trigger.tsx | 1 + .../components/retrieval-config.tsx | 9 +- .../nodes/knowledge-retrieval/use-config.ts | 2 +- .../nodes/knowledge-retrieval/utils.ts | 33 +++-- web/i18n/en-US/app-debug.ts | 2 +- web/i18n/en-US/workflow.ts | 4 +- web/i18n/zh-Hans/app-debug.ts | 2 +- web/i18n/zh-Hans/workflow.ts | 2 +- web/service/knowledge/use-document.ts | 8 +- 34 files changed, 446 insertions(+), 387 deletions(-) diff --git a/web/app/(commonLayout)/datasets/template/template.en.mdx b/web/app/(commonLayout)/datasets/template/template.en.mdx index d3dcfc4b24..f2db83e47e 100644 --- a/web/app/(commonLayout)/datasets/template/template.en.mdx +++ b/web/app/(commonLayout)/datasets/template/template.en.mdx @@ -52,6 +52,15 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - high_quality High quality: embedding using embedding model, built as vector database index - economy Economy: Build using inverted index of keyword table index + + Format of indexed content + - text_model Text documents are directly embedded; `economy` mode defaults to using this form + - hierarchical_model Parent-child mode + - qa_model Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions + + + In Q&A mode, specify the language of the document, for example: English, Chinese + Processing rules - mode (string) Cleaning, segmentation mode, automatic / custom @@ -65,6 +74,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) Segmentation rules - separator Custom segment identifier, currently only allows one delimiter to be set. Default is \n - max_tokens Maximum length (token) defaults to 1000 + - parent_mode Retrieval mode of parent chunks: full-doc full text retrieval / paragraph paragraph retrieval + - subchunk_segmentation (object) Child chunk rules + - separator Segmentation identifier. Currently, only one delimiter is allowed. The default is *** + - max_tokens The maximum length (tokens) must be validated to be shorter than the length of the parent chunk @@ -155,6 +168,13 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - high_quality High quality: embedding using embedding model, built as vector database index - economy Economy: Build using inverted index of keyword table index + - doc_form Format of indexed content + - text_model Text documents are directly embedded; `economy` mode defaults to using this form + - hierarchical_model Parent-child mode + - qa_model Q&A Mode: Generates Q&A pairs for segmented documents and then embeds the questions + + - doc_language In Q&A mode, specify the language of the document, for example: English, Chinese + - process_rule Processing rules - mode (string) Cleaning, segmentation mode, automatic / custom - rules (object) Custom rules (in automatic mode, this field is empty) @@ -167,6 +187,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) Segmentation rules - separator Custom segment identifier, currently only allows one delimiter to be set. Default is \n - max_tokens Maximum length (token) defaults to 1000 + - parent_mode Retrieval mode of parent chunks: full-doc full text retrieval / paragraph paragraph retrieval + - subchunk_segmentation (object) Child chunk rules + - separator Segmentation identifier. Currently, only one delimiter is allowed. The default is *** + - max_tokens The maximum length (tokens) must be validated to be shorter than the length of the parent chunk Files that need to be uploaded. @@ -449,6 +473,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) Segmentation rules - separator Custom segment identifier, currently only allows one delimiter to be set. Default is \n - max_tokens Maximum length (token) defaults to 1000 + - parent_mode Retrieval mode of parent chunks: full-doc full text retrieval / paragraph paragraph retrieval + - subchunk_segmentation (object) Child chunk rules + - separator Segmentation identifier. Currently, only one delimiter is allowed. The default is *** + - max_tokens The maximum length (tokens) must be validated to be shorter than the length of the parent chunk @@ -546,6 +574,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) Segmentation rules - separator Custom segment identifier, currently only allows one delimiter to be set. Default is \n - max_tokens Maximum length (token) defaults to 1000 + - parent_mode Retrieval mode of parent chunks: full-doc full text retrieval / paragraph paragraph retrieval + - subchunk_segmentation (object) Child chunk rules + - separator Segmentation identifier. Currently, only one delimiter is allowed. The default is *** + - max_tokens The maximum length (tokens) must be validated to be shorter than the length of the parent chunk @@ -984,7 +1016,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from @@ -1009,6 +1041,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - answer (text) Answer content, passed if the knowledge is in Q&A mode (optional) - keywords (list) Keyword (optional) - enabled (bool) False / true (optional) + - regenerate_child_chunks (bool) Whether to regenerate child chunks (optional) diff --git a/web/app/(commonLayout)/datasets/template/template.zh.mdx b/web/app/(commonLayout)/datasets/template/template.zh.mdx index db15ede9fc..24418dea57 100644 --- a/web/app/(commonLayout)/datasets/template/template.zh.mdx +++ b/web/app/(commonLayout)/datasets/template/template.zh.mdx @@ -52,6 +52,15 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - high_quality 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引 - economy 经济:使用 keyword table index 的倒排索引进行构建 + + 索引内容的形式 + - text_model text 文档直接 embedding,经济模式默认为该模式 + - hierarchical_model parent-child 模式 + - qa_model Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding + + + 在 Q&A 模式下,指定文档的语言,例如:EnglishChinese + 处理规则 - mode (string) 清洗、分段模式 ,automatic 自动 / custom 自定义 @@ -63,8 +72,12 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - remove_urls_emails 删除 URL、电子邮件地址 - enabled (bool) 是否选中该规则,不传入文档 ID 时代表默认值 - segmentation (object) 分段规则 - - separator 自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n + - separator 自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n - max_tokens 最大长度(token)默认为 1000 + - parent_mode 父分段的召回模式 full-doc 全文召回 / paragraph 段落召回 + - subchunk_segmentation (object) 子分段规则 + - separator 分段标识符,目前仅允许设置一个分隔符。默认为 *** + - max_tokens 最大长度 (token) 需要校验小于父级的长度 @@ -155,6 +168,13 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - high_quality 高质量:使用 embedding 模型进行嵌入,构建为向量数据库索引 - economy 经济:使用 keyword table index 的倒排索引进行构建 + - doc_form 索引内容的形式 + - text_model text 文档直接 embedding,经济模式默认为该模式 + - hierarchical_model parent-child 模式 + - qa_model Q&A 模式:为分片文档生成 Q&A 对,然后对问题进行 embedding + + - doc_language 在 Q&A 模式下,指定文档的语言,例如:EnglishChinese + - process_rule 处理规则 - mode (string) 清洗、分段模式 ,automatic 自动 / custom 自定义 - rules (object) 自定义规则(自动模式下,该字段为空) @@ -167,6 +187,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) 分段规则 - separator 自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n - max_tokens 最大长度(token)默认为 1000 + - parent_mode 父分段的召回模式 full-doc 全文召回 / paragraph 段落召回 + - subchunk_segmentation (object) 子分段规则 + - separator 分段标识符,目前仅允许设置一个分隔符。默认为 *** + - max_tokens 最大长度 (token) 需要校验小于父级的长度 需要上传的文件。 @@ -411,7 +435,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from @@ -449,6 +473,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) 分段规则 - separator 自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n - max_tokens 最大长度(token)默认为 1000 + - parent_mode 父分段的召回模式 full-doc 全文召回 / paragraph 段落召回 + - subchunk_segmentation (object) 子分段规则 + - separator 分段标识符,目前仅允许设置一个分隔符。默认为 *** + - max_tokens 最大长度 (token) 需要校验小于父级的长度 @@ -508,7 +536,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from @@ -546,6 +574,10 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - segmentation (object) 分段规则 - separator 自定义分段标识符,目前仅允许设置一个分隔符。默认为 \n - max_tokens 最大长度(token)默认为 1000 + - parent_mode 父分段的召回模式 full-doc 全文召回 / paragraph 段落召回 + - subchunk_segmentation (object) 子分段规则 + - separator 分段标识符,目前仅允许设置一个分隔符。默认为 *** + - max_tokens 最大长度 (token) 需要校验小于父级的长度 @@ -1009,6 +1041,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty, Paragraph } from - answer (text) 答案内容,非必填,如果知识库的模式为 Q&A 模式则传值 - keywords (list) 关键字,非必填 - enabled (bool) false/true,非必填 + - regenerate_child_chunks (bool) 是否重新生成子分段,非必填 diff --git a/web/app/components/app/configuration/dataset-config/params-config/config-content.tsx b/web/app/components/app/configuration/dataset-config/params-config/config-content.tsx index dcb2b1a3fd..3744c6a56b 100644 --- a/web/app/components/app/configuration/dataset-config/params-config/config-content.tsx +++ b/web/app/components/app/configuration/dataset-config/params-config/config-content.tsx @@ -59,36 +59,24 @@ const ConfigContent: FC = ({ const { modelList: rerankModelList, - defaultModel: rerankDefaultModel, - currentModel: isRerankDefaultModelValid, } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank) const { currentModel: currentRerankModel, } = useCurrentProviderAndModel( rerankModelList, - rerankDefaultModel - ? { - ...rerankDefaultModel, - provider: rerankDefaultModel.provider.provider, - } - : undefined, + { + provider: datasetConfigs.reranking_model?.reranking_provider_name, + model: datasetConfigs.reranking_model?.reranking_model_name, + }, ) - const rerankModel = (() => { - if (datasetConfigs.reranking_model?.reranking_provider_name) { - return { - provider_name: datasetConfigs.reranking_model.reranking_provider_name, - model_name: datasetConfigs.reranking_model.reranking_model_name, - } + const rerankModel = useMemo(() => { + return { + provider_name: datasetConfigs?.reranking_model?.reranking_provider_name ?? '', + model_name: datasetConfigs?.reranking_model?.reranking_model_name ?? '', } - else if (rerankDefaultModel) { - return { - provider_name: rerankDefaultModel.provider.provider, - model_name: rerankDefaultModel.model, - } - } - })() + }, [datasetConfigs.reranking_model]) const handleParamChange = (key: string, value: number) => { if (key === 'top_k') { @@ -133,6 +121,12 @@ const ConfigContent: FC = ({ } const handleRerankModeChange = (mode: RerankingModeEnum) => { + if (mode === datasetConfigs.reranking_mode) + return + + if (mode === RerankingModeEnum.RerankingModel && !currentRerankModel) + Toast.notify({ type: 'error', message: t('workflow.errorMsg.rerankModelRequired') }) + onChange({ ...datasetConfigs, reranking_mode: mode, @@ -162,31 +156,25 @@ const ConfigContent: FC = ({ const canManuallyToggleRerank = useMemo(() => { return (selectedDatasetsMode.allInternal && selectedDatasetsMode.allEconomic) - || selectedDatasetsMode.allExternal + || selectedDatasetsMode.allExternal }, [selectedDatasetsMode.allEconomic, selectedDatasetsMode.allExternal, selectedDatasetsMode.allInternal]) const showRerankModel = useMemo(() => { if (!canManuallyToggleRerank) return true - else if (canManuallyToggleRerank && !isRerankDefaultModelValid) - return false return datasetConfigs.reranking_enable - }, [canManuallyToggleRerank, datasetConfigs.reranking_enable, isRerankDefaultModelValid]) + }, [datasetConfigs.reranking_enable, canManuallyToggleRerank]) - const handleDisabledSwitchClick = useCallback(() => { - if (!currentRerankModel && !showRerankModel) + const handleDisabledSwitchClick = useCallback((enable: boolean) => { + if (!currentRerankModel && enable) Toast.notify({ type: 'error', message: t('workflow.errorMsg.rerankModelRequired') }) - }, [currentRerankModel, showRerankModel, t]) - - useEffect(() => { - if (canManuallyToggleRerank && showRerankModel !== datasetConfigs.reranking_enable) { - onChange({ - ...datasetConfigs, - reranking_enable: showRerankModel, - }) - } - }, [canManuallyToggleRerank, showRerankModel, datasetConfigs, onChange]) + onChange({ + ...datasetConfigs, + reranking_enable: enable, + }) + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [currentRerankModel, datasetConfigs, onChange]) return (
@@ -267,24 +255,12 @@ const ConfigContent: FC = ({
{ selectedDatasetsMode.allEconomic && !selectedDatasetsMode.mixtureInternalAndExternal && ( -
- { - if (canManuallyToggleRerank) { - onChange({ - ...datasetConfigs, - reranking_enable: v, - }) - } - }} - /> -
+ ) }
{t('common.modelProvider.rerankModel.key')}
@@ -298,21 +274,24 @@ const ConfigContent: FC = ({ triggerClassName='ml-1 w-4 h-4' />
-
- { - onChange({ - ...datasetConfigs, - reranking_model: { - reranking_provider_name: v.provider, - reranking_model_name: v.model, - }, - }) - }} - modelList={rerankModelList} - /> -
+ { + showRerankModel && ( +
+ { + onChange({ + ...datasetConfigs, + reranking_model: { + reranking_provider_name: v.provider, + reranking_model_name: v.model, + }, + }) + }} + modelList={rerankModelList} + /> +
+ )}
) } diff --git a/web/app/components/app/configuration/dataset-config/params-config/index.tsx b/web/app/components/app/configuration/dataset-config/params-config/index.tsx index 7f7a4799d1..acd1955943 100644 --- a/web/app/components/app/configuration/dataset-config/params-config/index.tsx +++ b/web/app/components/app/configuration/dataset-config/params-config/index.tsx @@ -10,7 +10,7 @@ import Modal from '@/app/components/base/modal' import Button from '@/app/components/base/button' import { RETRIEVE_TYPE } from '@/types/app' import Toast from '@/app/components/base/toast' -import { useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks' +import { useCurrentProviderAndModel, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks' import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations' import { RerankingModeEnum } from '@/models/datasets' import type { DataSet } from '@/models/datasets' @@ -41,17 +41,27 @@ const ParamsConfig = ({ }, [datasetConfigs]) const { - defaultModel: rerankDefaultModel, - currentModel: isRerankDefaultModelValid, + modelList: rerankModelList, + currentModel: rerankDefaultModel, currentProvider: rerankDefaultProvider, } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank) + const { + currentModel: isCurrentRerankModelValid, + } = useCurrentProviderAndModel( + rerankModelList, + { + provider: tempDataSetConfigs.reranking_model?.reranking_provider_name ?? '', + model: tempDataSetConfigs.reranking_model?.reranking_model_name ?? '', + }, + ) + const isValid = () => { let errMsg = '' if (tempDataSetConfigs.retrieval_model === RETRIEVE_TYPE.multiWay) { if (tempDataSetConfigs.reranking_enable && tempDataSetConfigs.reranking_mode === RerankingModeEnum.RerankingModel - && !isRerankDefaultModelValid + && !isCurrentRerankModelValid ) errMsg = t('appDebug.datasetConfig.rerankModelRequired') } @@ -66,16 +76,7 @@ const ParamsConfig = ({ const handleSave = () => { if (!isValid()) return - const config = { ...tempDataSetConfigs } - if (config.retrieval_model === RETRIEVE_TYPE.multiWay - && config.reranking_mode === RerankingModeEnum.RerankingModel - && !config.reranking_model) { - config.reranking_model = { - reranking_provider_name: rerankDefaultModel?.provider?.provider, - reranking_model_name: rerankDefaultModel?.model, - } as any - } - setDatasetConfigs(config) + setDatasetConfigs(tempDataSetConfigs) setRerankSettingModalOpen(false) } @@ -94,14 +95,14 @@ const ParamsConfig = ({ reranking_enable: restConfigs.reranking_enable, }, selectedDatasets, selectedDatasets, { provider: rerankDefaultProvider?.provider, - model: isRerankDefaultModelValid?.model, + model: rerankDefaultModel?.model, }) setTempDataSetConfigs({ ...retrievalConfig, - reranking_model: restConfigs.reranking_model && { - reranking_provider_name: restConfigs.reranking_model.reranking_provider_name, - reranking_model_name: restConfigs.reranking_model.reranking_model_name, + reranking_model: { + reranking_provider_name: retrievalConfig.reranking_model?.provider || '', + reranking_model_name: retrievalConfig.reranking_model?.model || '', }, retrieval_model, score_threshold_enabled, diff --git a/web/app/components/app/configuration/dataset-config/settings-modal/index.tsx b/web/app/components/app/configuration/dataset-config/settings-modal/index.tsx index 7a347a1899..506406cfe0 100644 --- a/web/app/components/app/configuration/dataset-config/settings-modal/index.tsx +++ b/web/app/components/app/configuration/dataset-config/settings-modal/index.tsx @@ -12,7 +12,7 @@ import Divider from '@/app/components/base/divider' import Button from '@/app/components/base/button' import Input from '@/app/components/base/input' import Textarea from '@/app/components/base/textarea' -import { type DataSet, RerankingModeEnum } from '@/models/datasets' +import { type DataSet } from '@/models/datasets' import { useToastContext } from '@/app/components/base/toast' import { updateDatasetSetting } from '@/service/datasets' import { useAppContext } from '@/context/app-context' @@ -21,7 +21,7 @@ import type { RetrievalConfig } from '@/types/app' import RetrievalSettings from '@/app/components/datasets/external-knowledge-base/create/RetrievalSettings' import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config' import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config' -import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model' +import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model' import { AlertTriangle } from '@/app/components/base/icons/src/vender/solid/alertsAndFeedback' import PermissionSelector from '@/app/components/datasets/settings/permission-selector' import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector' @@ -99,8 +99,6 @@ const SettingsModal: FC = ({ } if ( !isReRankModelSelected({ - rerankDefaultModel, - isRerankDefaultModelValid: !!isRerankDefaultModelValid, rerankModelList, retrievalConfig, indexMethod, @@ -109,14 +107,6 @@ const SettingsModal: FC = ({ notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') }) return } - const postRetrievalConfig = ensureRerankModelSelected({ - rerankDefaultModel: rerankDefaultModel!, - retrievalConfig: { - ...retrievalConfig, - reranking_enable: retrievalConfig.reranking_mode === RerankingModeEnum.RerankingModel, - }, - indexMethod, - }) try { setLoading(true) const { id, name, description, permission } = localeCurrentDataset @@ -128,8 +118,8 @@ const SettingsModal: FC = ({ permission, indexing_technique: indexMethod, retrieval_model: { - ...postRetrievalConfig, - score_threshold: postRetrievalConfig.score_threshold_enabled ? postRetrievalConfig.score_threshold : 0, + ...retrievalConfig, + score_threshold: retrievalConfig.score_threshold_enabled ? retrievalConfig.score_threshold : 0, }, embedding_model: localeCurrentDataset.embedding_model, embedding_model_provider: localeCurrentDataset.embedding_model_provider, @@ -157,7 +147,7 @@ const SettingsModal: FC = ({ onSave({ ...localeCurrentDataset, indexing_technique: indexMethod, - retrieval_model_dict: postRetrievalConfig, + retrieval_model_dict: retrievalConfig, }) } catch (e) { diff --git a/web/app/components/app/configuration/index.tsx b/web/app/components/app/configuration/index.tsx index d3719a7696..b4289a105a 100644 --- a/web/app/components/app/configuration/index.tsx +++ b/web/app/components/app/configuration/index.tsx @@ -287,9 +287,9 @@ const Configuration: FC = () => { setDatasetConfigs({ ...retrievalConfig, - reranking_model: restConfigs.reranking_model && { - reranking_provider_name: restConfigs.reranking_model.reranking_provider_name, - reranking_model_name: restConfigs.reranking_model.reranking_model_name, + reranking_model: { + reranking_provider_name: retrievalConfig?.reranking_model?.provider || '', + reranking_model_name: retrievalConfig?.reranking_model?.model || '', }, retrieval_model, score_threshold_enabled, diff --git a/web/app/components/datasets/common/check-rerank-model.ts b/web/app/components/datasets/common/check-rerank-model.ts index 581c2bb69a..ccb8c45a09 100644 --- a/web/app/components/datasets/common/check-rerank-model.ts +++ b/web/app/components/datasets/common/check-rerank-model.ts @@ -6,14 +6,10 @@ import type { import { RerankingModeEnum } from '@/models/datasets' export const isReRankModelSelected = ({ - rerankDefaultModel, - isRerankDefaultModelValid, retrievalConfig, rerankModelList, indexMethod, }: { - rerankDefaultModel?: DefaultModelResponse - isRerankDefaultModelValid: boolean retrievalConfig: RetrievalConfig rerankModelList: Model[] indexMethod?: string @@ -25,12 +21,17 @@ export const isReRankModelSelected = ({ return provider?.models.find(({ model }) => model === retrievalConfig.reranking_model?.reranking_model_name) } - if (isRerankDefaultModelValid) - return !!rerankDefaultModel - return false })() + if ( + indexMethod === 'high_quality' + && ([RETRIEVE_METHOD.semantic, RETRIEVE_METHOD.fullText].includes(retrievalConfig.search_method)) + && retrievalConfig.reranking_enable + && !rerankModelSelected + ) + return false + if ( indexMethod === 'high_quality' && (retrievalConfig.search_method === RETRIEVE_METHOD.hybrid && retrievalConfig.reranking_mode !== RerankingModeEnum.WeightedScore) diff --git a/web/app/components/datasets/common/economical-retrieval-method-config/index.tsx b/web/app/components/datasets/common/economical-retrieval-method-config/index.tsx index 9236858ae4..5183b7a94e 100644 --- a/web/app/components/datasets/common/economical-retrieval-method-config/index.tsx +++ b/web/app/components/datasets/common/economical-retrieval-method-config/index.tsx @@ -10,11 +10,13 @@ import { RETRIEVE_METHOD } from '@/types/app' import type { RetrievalConfig } from '@/types/app' type Props = { + disabled?: boolean value: RetrievalConfig onChange: (value: RetrievalConfig) => void } const EconomicalRetrievalMethodConfig: FC = ({ + disabled = false, value, onChange, }) => { @@ -22,7 +24,8 @@ const EconomicalRetrievalMethodConfig: FC = ({ return (
- } + } title={t('dataset.retrieval.invertedIndex.title')} description={t('dataset.retrieval.invertedIndex.description')} isActive activeHeaderClassName='bg-dataset-option-card-purple-gradient' diff --git a/web/app/components/datasets/common/retrieval-method-config/index.tsx b/web/app/components/datasets/common/retrieval-method-config/index.tsx index 9ab157571b..aee978cc53 100644 --- a/web/app/components/datasets/common/retrieval-method-config/index.tsx +++ b/web/app/components/datasets/common/retrieval-method-config/index.tsx @@ -1,6 +1,6 @@ 'use client' import type { FC } from 'react' -import React from 'react' +import React, { useCallback } from 'react' import { useTranslation } from 'react-i18next' import Image from 'next/image' import RetrievalParamConfig from '../retrieval-param-config' @@ -10,7 +10,7 @@ import { retrievalIcon } from '../../create/icons' import type { RetrievalConfig } from '@/types/app' import { RETRIEVE_METHOD } from '@/types/app' import { useProviderContext } from '@/context/provider-context' -import { useDefaultModel } from '@/app/components/header/account-setting/model-provider-page/hooks' +import { useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks' import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations' import { DEFAULT_WEIGHTED_SCORE, @@ -20,54 +20,87 @@ import { import Badge from '@/app/components/base/badge' type Props = { + disabled?: boolean value: RetrievalConfig onChange: (value: RetrievalConfig) => void } const RetrievalMethodConfig: FC = ({ - value: passValue, + disabled = false, + value, onChange, }) => { const { t } = useTranslation() const { supportRetrievalMethods } = useProviderContext() - const { data: rerankDefaultModel } = useDefaultModel(ModelTypeEnum.rerank) - const value = (() => { - if (!passValue.reranking_model.reranking_model_name) { - return { - ...passValue, - reranking_model: { - reranking_provider_name: rerankDefaultModel?.provider.provider || '', - reranking_model_name: rerankDefaultModel?.model || '', - }, - reranking_mode: passValue.reranking_mode || (rerankDefaultModel ? RerankingModeEnum.RerankingModel : RerankingModeEnum.WeightedScore), - weights: passValue.weights || { - weight_type: WeightedScoreEnum.Customized, - vector_setting: { - vector_weight: DEFAULT_WEIGHTED_SCORE.other.semantic, - embedding_provider_name: '', - embedding_model_name: '', - }, - keyword_setting: { - keyword_weight: DEFAULT_WEIGHTED_SCORE.other.keyword, - }, - }, - } + const { + defaultModel: rerankDefaultModel, + currentModel: isRerankDefaultModelValid, + } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank) + + const onSwitch = useCallback((retrieveMethod: RETRIEVE_METHOD) => { + if ([RETRIEVE_METHOD.semantic, RETRIEVE_METHOD.fullText].includes(retrieveMethod)) { + onChange({ + ...value, + search_method: retrieveMethod, + ...(!value.reranking_model.reranking_model_name + ? { + reranking_model: { + reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider?.provider ?? '' : '', + reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '', + }, + reranking_enable: !!isRerankDefaultModelValid, + } + : { + reranking_enable: true, + }), + }) } - return passValue - })() + if (retrieveMethod === RETRIEVE_METHOD.hybrid) { + onChange({ + ...value, + search_method: retrieveMethod, + ...(!value.reranking_model.reranking_model_name + ? { + reranking_model: { + reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider?.provider ?? '' : '', + reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '', + }, + reranking_enable: !!isRerankDefaultModelValid, + reranking_mode: isRerankDefaultModelValid ? RerankingModeEnum.RerankingModel : RerankingModeEnum.WeightedScore, + } + : { + reranking_enable: true, + reranking_mode: RerankingModeEnum.RerankingModel, + }), + ...(!value.weights + ? { + weights: { + weight_type: WeightedScoreEnum.Customized, + vector_setting: { + vector_weight: DEFAULT_WEIGHTED_SCORE.other.semantic, + embedding_provider_name: '', + embedding_model_name: '', + }, + keyword_setting: { + keyword_weight: DEFAULT_WEIGHTED_SCORE.other.keyword, + }, + }, + } + : {}), + }) + } + }, [value, rerankDefaultModel, isRerankDefaultModelValid, onChange]) + return (
{supportRetrievalMethods.includes(RETRIEVE_METHOD.semantic) && ( - } + } title={t('dataset.retrieval.semantic_search.title')} description={t('dataset.retrieval.semantic_search.description')} isActive={ value.search_method === RETRIEVE_METHOD.semantic } - onSwitched={() => onChange({ - ...value, - search_method: RETRIEVE_METHOD.semantic, - })} + onSwitched={() => onSwitch(RETRIEVE_METHOD.semantic)} effectImg={Effect.src} activeHeaderClassName='bg-dataset-option-card-purple-gradient' > @@ -78,17 +111,14 @@ const RetrievalMethodConfig: FC = ({ /> )} - {supportRetrievalMethods.includes(RETRIEVE_METHOD.semantic) && ( - } + {supportRetrievalMethods.includes(RETRIEVE_METHOD.fullText) && ( + } title={t('dataset.retrieval.full_text_search.title')} description={t('dataset.retrieval.full_text_search.description')} isActive={ value.search_method === RETRIEVE_METHOD.fullText } - onSwitched={() => onChange({ - ...value, - search_method: RETRIEVE_METHOD.fullText, - })} + onSwitched={() => onSwitch(RETRIEVE_METHOD.fullText)} effectImg={Effect.src} activeHeaderClassName='bg-dataset-option-card-purple-gradient' > @@ -99,8 +129,8 @@ const RetrievalMethodConfig: FC = ({ /> )} - {supportRetrievalMethods.includes(RETRIEVE_METHOD.semantic) && ( - } + {supportRetrievalMethods.includes(RETRIEVE_METHOD.hybrid) && ( + } title={
{t('dataset.retrieval.hybrid_search.title')}
@@ -110,11 +140,7 @@ const RetrievalMethodConfig: FC = ({ description={t('dataset.retrieval.hybrid_search.description')} isActive={ value.search_method === RETRIEVE_METHOD.hybrid } - onSwitched={() => onChange({ - ...value, - search_method: RETRIEVE_METHOD.hybrid, - reranking_enable: true, - })} + onSwitched={() => onSwitch(RETRIEVE_METHOD.hybrid)} effectImg={Effect.src} activeHeaderClassName='bg-dataset-option-card-purple-gradient' > diff --git a/web/app/components/datasets/common/retrieval-param-config/index.tsx b/web/app/components/datasets/common/retrieval-param-config/index.tsx index 5136ac1659..93d555a34d 100644 --- a/web/app/components/datasets/common/retrieval-param-config/index.tsx +++ b/web/app/components/datasets/common/retrieval-param-config/index.tsx @@ -1,6 +1,6 @@ 'use client' import type { FC } from 'react' -import React, { useCallback } from 'react' +import React, { useCallback, useMemo } from 'react' import { useTranslation } from 'react-i18next' import Image from 'next/image' @@ -39,8 +39,8 @@ const RetrievalParamConfig: FC = ({ const { t } = useTranslation() const canToggleRerankModalEnable = type !== RETRIEVE_METHOD.hybrid const isEconomical = type === RETRIEVE_METHOD.invertedIndex + const isHybridSearch = type === RETRIEVE_METHOD.hybrid const { - defaultModel: rerankDefaultModel, modelList: rerankModelList, } = useModelListAndDefaultModel(ModelTypeEnum.rerank) @@ -48,35 +48,28 @@ const RetrievalParamConfig: FC = ({ currentModel, } = useCurrentProviderAndModel( rerankModelList, - rerankDefaultModel - ? { - ...rerankDefaultModel, - provider: rerankDefaultModel.provider.provider, - } - : undefined, + { + provider: value.reranking_model?.reranking_provider_name ?? '', + model: value.reranking_model?.reranking_model_name ?? '', + }, ) - const handleDisabledSwitchClick = useCallback(() => { - if (!currentModel) + const handleDisabledSwitchClick = useCallback((enable: boolean) => { + if (enable && !currentModel) Toast.notify({ type: 'error', message: t('workflow.errorMsg.rerankModelRequired') }) - }, [currentModel, rerankDefaultModel, t]) + onChange({ + ...value, + reranking_enable: enable, + }) + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [currentModel, onChange, value]) - const isHybridSearch = type === RETRIEVE_METHOD.hybrid - - const rerankModel = (() => { - if (value.reranking_model) { - return { - provider_name: value.reranking_model.reranking_provider_name, - model_name: value.reranking_model.reranking_model_name, - } + const rerankModel = useMemo(() => { + return { + provider_name: value.reranking_model.reranking_provider_name, + model_name: value.reranking_model.reranking_model_name, } - else if (rerankDefaultModel) { - return { - provider_name: rerankDefaultModel.provider.provider, - model_name: rerankDefaultModel.model, - } - } - })() + }, [value.reranking_model]) const handleChangeRerankMode = (v: RerankingModeEnum) => { if (v === value.reranking_mode) @@ -100,6 +93,8 @@ const RetrievalParamConfig: FC = ({ }, } } + if (v === RerankingModeEnum.RerankingModel && !currentModel) + Toast.notify({ type: 'error', message: t('workflow.errorMsg.rerankModelRequired') }) onChange(result) } @@ -122,22 +117,11 @@ const RetrievalParamConfig: FC = ({
{canToggleRerankModalEnable && ( -
- { - onChange({ - ...value, - reranking_enable: v, - }) - }} - disabled={!currentModel} - /> -
+ )}
{t('common.modelProvider.rerankModel.key')} @@ -148,21 +132,23 @@ const RetrievalParamConfig: FC = ({ />
- { - onChange({ - ...value, - reranking_model: { - reranking_provider_name: v.provider, - reranking_model_name: v.model, - }, - }) - }} - /> + { + value.reranking_enable && ( + { + onChange({ + ...value, + reranking_model: { + reranking_provider_name: v.provider, + reranking_model_name: v.model, + }, + }) + }} + /> + ) + }
)} { @@ -255,10 +241,8 @@ const RetrievalParamConfig: FC = ({ { value.reranking_mode !== RerankingModeEnum.WeightedScore && ( { onChange({ ...value, diff --git a/web/app/components/datasets/create/embedding-process/index.tsx b/web/app/components/datasets/create/embedding-process/index.tsx index 201333ffce..ead593d272 100644 --- a/web/app/components/datasets/create/embedding-process/index.tsx +++ b/web/app/components/datasets/create/embedding-process/index.tsx @@ -30,6 +30,7 @@ import { useProviderContext } from '@/context/provider-context' import { sleep } from '@/utils' import { RETRIEVE_METHOD } from '@/types/app' import Tooltip from '@/app/components/base/tooltip' +import { useInvalidDocumentList } from '@/service/knowledge/use-document' type Props = { datasetId: string @@ -207,7 +208,9 @@ const EmbeddingProcess: FC = ({ datasetId, batchId, documents = [], index }) const router = useRouter() + const invalidDocumentList = useInvalidDocumentList() const navToDocumentList = () => { + invalidDocumentList() router.push(`/datasets/${datasetId}/documents`) } const navToApiDocs = () => { diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx index 0d7202967a..c0e6a0f1d3 100644 --- a/web/app/components/datasets/create/step-two/index.tsx +++ b/web/app/components/datasets/create/step-two/index.tsx @@ -31,17 +31,17 @@ import LanguageSelect from './language-select' import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs' import cn from '@/utils/classnames' import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DocumentItem, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets' +import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets' import Button from '@/app/components/base/button' import FloatRightContainer from '@/app/components/base/float-right-container' import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config' import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config' import { type RetrievalConfig } from '@/types/app' -import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model' +import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model' import Toast from '@/app/components/base/toast' import type { NotionPage } from '@/models/common' import { DataSourceProvider } from '@/models/common' -import { ChunkingMode, DataSourceType, RerankingModeEnum } from '@/models/datasets' import { useDatasetDetailContext } from '@/context/dataset-detail' import I18n from '@/context/i18n' import { RETRIEVE_METHOD } from '@/types/app' @@ -90,17 +90,13 @@ type StepTwoProps = { onCancel?: () => void } -export enum SegmentType { - AUTO = 'automatic', - CUSTOM = 'custom', -} export enum IndexingType { QUALIFIED = 'high_quality', ECONOMICAL = 'economy', } const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n' -const DEFAULT_MAXMIMUM_CHUNK_LENGTH = 500 +const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500 const DEFAULT_OVERLAP = 50 type ParentChildConfig = { @@ -131,7 +127,6 @@ const StepTwo = ({ isSetting, documentDetail, isAPIKeySet, - onSetting, datasetId, indexingType, dataSourceType: inCreatePageDataSourceType, @@ -162,12 +157,12 @@ const StepTwo = ({ const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type) const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type - const [segmentationType, setSegmentationType] = useState(SegmentType.CUSTOM) + const [segmentationType, setSegmentationType] = useState(ProcessMode.general) const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER) const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => { doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER)) }, []) - const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXMIMUM_CHUNK_LENGTH) // default chunk length + const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(4000) const [overlap, setOverlap] = useState(DEFAULT_OVERLAP) const [rules, setRules] = useState([]) @@ -198,7 +193,6 @@ const StepTwo = ({ ) // QA Related - const [isLanguageSelectDisabled, _setIsLanguageSelectDisabled] = useState(false) const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false) const [docForm, setDocForm] = useState( (datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text, @@ -348,7 +342,7 @@ const StepTwo = ({ } const updatePreview = () => { - if (segmentationType === SegmentType.CUSTOM && maxChunkLength > 4000) { + if (segmentationType === ProcessMode.general && maxChunkLength > 4000) { Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') }) return } @@ -373,13 +367,42 @@ const StepTwo = ({ model: defaultEmbeddingModel?.model || '', }, ) + const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || { + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: false, + reranking_model: { + reranking_provider_name: '', + reranking_model_name: '', + }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + } as RetrievalConfig) + + useEffect(() => { + if (currentDataset?.retrieval_model_dict) + return + setRetrievalConfig({ + search_method: RETRIEVE_METHOD.semantic, + reranking_enable: !!isRerankDefaultModelValid, + reranking_model: { + reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '', + reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '', + }, + top_k: 3, + score_threshold_enabled: false, + score_threshold: 0.5, + }) + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [rerankDefaultModel, isRerankDefaultModelValid]) + const getCreationParams = () => { let params - if (segmentationType === SegmentType.CUSTOM && overlap > maxChunkLength) { + if (segmentationType === ProcessMode.general && overlap > maxChunkLength) { Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') }) return } - if (segmentationType === SegmentType.CUSTOM && maxChunkLength > limitMaxChunkLength) { + if (segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength) { Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) }) return } @@ -389,7 +412,6 @@ const StepTwo = ({ doc_form: currentDocForm, doc_language: docLanguage, process_rule: getProcessRule(), - // eslint-disable-next-line @typescript-eslint/no-use-before-define retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page. embedding_model: embeddingModel.model, // Readonly embedding_model_provider: embeddingModel.provider, // Readonly @@ -400,10 +422,7 @@ const StepTwo = ({ const indexMethod = getIndexing_technique() if ( !isReRankModelSelected({ - rerankDefaultModel, - isRerankDefaultModelValid: !!isRerankDefaultModelValid, rerankModelList, - // eslint-disable-next-line @typescript-eslint/no-use-before-define retrievalConfig, indexMethod: indexMethod as string, }) @@ -411,16 +430,6 @@ const StepTwo = ({ Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') }) return } - const postRetrievalConfig = ensureRerankModelSelected({ - rerankDefaultModel: rerankDefaultModel!, - retrievalConfig: { - // eslint-disable-next-line @typescript-eslint/no-use-before-define - ...retrievalConfig, - // eslint-disable-next-line @typescript-eslint/no-use-before-define - reranking_enable: retrievalConfig.reranking_mode === RerankingModeEnum.RerankingModel, - }, - indexMethod: indexMethod as string, - }) params = { data_source: { type: dataSourceType, @@ -432,8 +441,7 @@ const StepTwo = ({ process_rule: getProcessRule(), doc_form: currentDocForm, doc_language: docLanguage, - - retrieval_model: postRetrievalConfig, + retrieval_model: retrievalConfig, embedding_model: embeddingModel.model, embedding_model_provider: embeddingModel.provider, } as CreateDocumentReq @@ -490,7 +498,6 @@ const StepTwo = ({ const getDefaultMode = () => { if (documentDetail) - // @ts-expect-error fix after api refactored setSegmentationType(documentDetail.dataset_process_rule.mode) } @@ -525,7 +532,6 @@ const StepTwo = ({ onSuccess(data) { updateIndexingTypeCache && updateIndexingTypeCache(indexType as string) updateResultCache && updateResultCache(data) - // eslint-disable-next-line @typescript-eslint/no-use-before-define updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string) }, }, @@ -545,14 +551,6 @@ const StepTwo = ({ isSetting && onSave && onSave() } - const changeToEconomicalType = () => { - if (docForm !== ChunkingMode.text) - return - - if (!hasSetIndexType) - setIndexType(IndexingType.ECONOMICAL) - } - useEffect(() => { // fetch rules if (!isSetting) { @@ -574,18 +572,6 @@ const StepTwo = ({ setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL) }, [isAPIKeySet, indexingType, datasetId]) - const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || { - search_method: RETRIEVE_METHOD.semantic, - reranking_enable: false, - reranking_model: { - reranking_provider_name: rerankDefaultModel?.provider.provider, - reranking_model_name: rerankDefaultModel?.model, - }, - top_k: 3, - score_threshold_enabled: false, - score_threshold: 0.5, - } as RetrievalConfig) - const economyDomRef = useRef(null) const isHoveringEconomy = useHover(economyDomRef) @@ -984,12 +970,14 @@ const StepTwo = ({ getIndexing_technique() === IndexingType.QUALIFIED ? ( ) : ( @@ -1010,7 +998,7 @@ const StepTwo = ({ ) : (
- + {!datasetId && }
)} @@ -1081,11 +1069,11 @@ const StepTwo = ({ } { currentDocForm !== ChunkingMode.qa - && + && }
} diff --git a/web/app/components/datasets/create/step-two/option-card.tsx b/web/app/components/datasets/create/step-two/option-card.tsx index b27be757b3..57ed50d518 100644 --- a/web/app/components/datasets/create/step-two/option-card.tsx +++ b/web/app/components/datasets/create/step-two/option-card.tsx @@ -4,7 +4,7 @@ import classNames from '@/utils/classnames' const TriangleArrow: FC> = props => ( - + ) @@ -65,7 +65,7 @@ export const OptionCard: FC = forwardRef((props, ref) => { (isActive && !noHighlight) ? 'border-[1.5px] border-components-option-card-option-selected-border' : 'border border-components-option-card-option-border', - disabled && 'opacity-50 cursor-not-allowed', + disabled && 'opacity-50 pointer-events-none', className, )} style={{ diff --git a/web/app/components/datasets/documents/detail/completed/index.tsx b/web/app/components/datasets/documents/detail/completed/index.tsx index 8385bde04b..1185c97e0f 100644 --- a/web/app/components/datasets/documents/detail/completed/index.tsx +++ b/web/app/components/datasets/documents/detail/completed/index.tsx @@ -232,6 +232,16 @@ const Completed: FC = ({ setFullScreen(false) }, []) + const onCloseNewSegmentModal = useCallback(() => { + onNewSegmentModalChange(false) + setFullScreen(false) + }, [onNewSegmentModalChange]) + + const onCloseNewChildChunkModal = useCallback(() => { + setShowNewChildSegmentModal(false) + setFullScreen(false) + }, []) + const { mutateAsync: enableSegment } = useEnableSegment() const { mutateAsync: disableSegment } = useDisableSegment() @@ -623,6 +633,7 @@ const Completed: FC = ({ = ({ { - onNewSegmentModalChange(false) - setFullScreen(false) - }} + onCancel={onCloseNewSegmentModal} onSave={resetList} viewNewlyAddedChunk={viewNewlyAddedChunk} /> @@ -651,6 +660,7 @@ const Completed: FC = ({ = ({ { - setShowNewChildSegmentModal(false) - setFullScreen(false) - }} + onCancel={onCloseNewChildChunkModal} onSave={onSaveNewChildChunk} viewNewlyAddedChildChunk={viewNewlyAddedChildChunk} /> diff --git a/web/app/components/datasets/documents/detail/completed/segment-list.tsx b/web/app/components/datasets/documents/detail/completed/segment-list.tsx index c31345ff3b..885db49db8 100644 --- a/web/app/components/datasets/documents/detail/completed/segment-list.tsx +++ b/web/app/components/datasets/documents/detail/completed/segment-list.tsx @@ -80,7 +80,7 @@ ref: ForwardedRef, checked={selectedSegmentIds.includes(segItem.id)} onCheck={() => onSelected(segItem.id)} /> -
+
= ({ datasetId, documentId }) => { const embedding = ['queuing', 'indexing', 'paused'].includes((documentDetail?.display_status || '').toLowerCase()) + const invalidChunkList = useInvalid(useSegmentListKey) + const invalidChildChunkList = useInvalid(useChildSegmentListKey) + const handleOperate = (operateName?: string) => { - if (operateName === 'delete') + if (operateName === 'delete') { backToPrev() - else + } + else { detailMutate() + setTimeout(() => { + invalidChunkList() + invalidChildChunkList() + }, 5000) + } } const mode = useMemo(() => { @@ -245,7 +255,7 @@ const DocumentDetail: FC = ({ datasetId, documentId }) => {
{isDetailLoading ? - :
{embedding diff --git a/web/app/components/datasets/documents/index.tsx b/web/app/components/datasets/documents/index.tsx index 7365ff9850..c9df2f28e2 100644 --- a/web/app/components/datasets/documents/index.tsx +++ b/web/app/components/datasets/documents/index.tsx @@ -24,6 +24,10 @@ import { DataSourceType } from '@/models/datasets' import IndexFailed from '@/app/components/datasets/common/document-status-with-action/index-failed' import { useProviderContext } from '@/context/provider-context' import cn from '@/utils/classnames' +import { useInvalidDocumentDetailKey } from '@/service/knowledge/use-document' +import { useInvalid } from '@/service/use-base' +import { useChildSegmentListKey, useSegmentListKey } from '@/service/knowledge/use-segment' + const FolderPlusIcon = ({ className }: React.SVGProps) => { return @@ -99,7 +103,7 @@ const Documents: FC = ({ datasetId }) => { return { page: currPage + 1, limit, keyword: debouncedSearchValue, fetch: isDataSourceNotion ? true : '' } }, [currPage, debouncedSearchValue, isDataSourceNotion, limit]) - const { data: documentsRes, error, mutate, isLoading: isListLoading } = useSWR( + const { data: documentsRes, mutate, isLoading: isListLoading } = useSWR( { action: 'fetchDocuments', datasetId, @@ -115,10 +119,20 @@ const Documents: FC = ({ datasetId }) => { setIsMuting(false) }, [isListLoading, isMuting]) + const invalidDocumentDetail = useInvalidDocumentDetailKey() + const invalidChunkList = useInvalid(useSegmentListKey) + const invalidChildChunkList = useInvalid(useChildSegmentListKey) + const handleUpdate = useCallback(() => { setIsMuting(true) mutate() - }, [mutate]) + invalidDocumentDetail() + setTimeout(() => { + invalidChunkList() + invalidChildChunkList() + }, 5000) + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []) const documentsWithProgress = useMemo(() => { let completedNum = 0 diff --git a/web/app/components/datasets/documents/list.tsx b/web/app/components/datasets/documents/list.tsx index 26c26414df..00ccdfddce 100644 --- a/web/app/components/datasets/documents/list.tsx +++ b/web/app/components/datasets/documents/list.tsx @@ -133,6 +133,16 @@ export const StatusItem: FC<{ {DOC_INDEX_STATUS_MAP[localStatus]?.text} + { + errorMessage && ( + {errorMessage}
+ } + triggerClassName='ml-1 w-4 h-4' + /> + ) + } { scene === 'detail' && (
@@ -152,16 +162,6 @@ export const StatusItem: FC<{
) } - { - errorMessage && ( - {errorMessage}
- } - triggerClassName='ml-1 w-4 h-4' - /> - ) - }
} @@ -561,18 +561,14 @@ const DocumentList: FC = ({
-
- - {doc?.data_source_type === DataSourceType.NOTION && - } +
+
+ {doc?.data_source_type === DataSourceType.NOTION && } {doc?.data_source_type === DataSourceType.FILE && } - {doc?.data_source_type === DataSourceType.WEB && - } - { - doc.name - } - -
+ {doc?.data_source_type === DataSourceType.WEB && } +
+ {doc.name} +
diff --git a/web/app/components/datasets/hit-testing/components/child-chunks-item.tsx b/web/app/components/datasets/hit-testing/components/child-chunks-item.tsx index 043aa3cea7..3c01e3d0b3 100644 --- a/web/app/components/datasets/hit-testing/components/child-chunks-item.tsx +++ b/web/app/components/datasets/hit-testing/components/child-chunks-item.tsx @@ -17,7 +17,7 @@ const ChildChunks: FC = ({ const { id, score, content, position } = payload return (
C-{position}
diff --git a/web/app/components/datasets/hit-testing/components/chunk-detail-modal.tsx b/web/app/components/datasets/hit-testing/components/chunk-detail-modal.tsx index 8d49cef3d0..fe2f2b8f36 100644 --- a/web/app/components/datasets/hit-testing/components/chunk-detail-modal.tsx +++ b/web/app/components/datasets/hit-testing/components/chunk-detail-modal.tsx @@ -56,7 +56,7 @@ const ChunkDetailModal: FC = ({
-
+
{content}
{!isParentChildRetrieval && keywords && keywords.length > 0 && ( diff --git a/web/app/components/datasets/hit-testing/components/result-item.tsx b/web/app/components/datasets/hit-testing/components/result-item.tsx index 36ee541161..3c8c146d53 100644 --- a/web/app/components/datasets/hit-testing/components/result-item.tsx +++ b/web/app/components/datasets/hit-testing/components/result-item.tsx @@ -43,13 +43,8 @@ const ResultItem: FC = ({ setFalse: hideDetailModal, }] = useBoolean(false) - const handleClickCard = () => { - if (!isParentChildRetrieval) - showDetailModal() - } - return ( -
+
{/* Meta info */}
@@ -66,7 +61,7 @@ const ResultItem: FC = ({ {/* Main */}
-
{content}
+
{content}
{isParentChildRetrieval && (
diff --git a/web/app/components/datasets/hit-testing/components/score.tsx b/web/app/components/datasets/hit-testing/components/score.tsx index 175b00b7e7..76914318e0 100644 --- a/web/app/components/datasets/hit-testing/components/score.tsx +++ b/web/app/components/datasets/hit-testing/components/score.tsx @@ -12,15 +12,15 @@ const Score: FC = ({ value, besideChunkName, }) => { - if (!value) + if (!value || isNaN(value)) return null - return ( -
+
score
-
{value.toFixed(2)}
+
{value?.toFixed(2)}
) diff --git a/web/app/components/datasets/hit-testing/index.tsx b/web/app/components/datasets/hit-testing/index.tsx index 30be6fb7e7..ccc200bbe6 100644 --- a/web/app/components/datasets/hit-testing/index.tsx +++ b/web/app/components/datasets/hit-testing/index.tsx @@ -192,7 +192,7 @@ const HitTesting: FC = ({ datasetId }: Props) => { }
- setIsShowModifyRetrievalModal(false)} footer={null} mask={isMobile} panelClassname='mt-16 mx-2 sm:mr-2 mb-3 !p-0 !max-w-[640px] rounded-xl'> + setIsShowModifyRetrievalModal(false)} footer={null} mask={isMobile} panelClassname='mt-16 mx-2 sm:mr-2 mb-3 !p-0 !max-w-[640px] rounded-xl'> = ({ const { modelList: rerankModelList, - defaultModel: rerankDefaultModel, - currentModel: isRerankDefaultModelValid, } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank) const handleSave = () => { if ( !isReRankModelSelected({ - rerankDefaultModel, - isRerankDefaultModelValid: !!isRerankDefaultModelValid, rerankModelList, retrievalConfig, indexMethod, @@ -56,14 +51,7 @@ const ModifyRetrievalModal: FC = ({ Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') }) return } - onSave(ensureRerankModelSelected({ - rerankDefaultModel: rerankDefaultModel!, - retrievalConfig: { - ...retrievalConfig, - reranking_enable: retrievalConfig.reranking_mode === RerankingModeEnum.RerankingModel, - }, - indexMethod, - })) + onSave(retrievalConfig) } if (!isShow) diff --git a/web/app/components/datasets/settings/form/index.tsx b/web/app/components/datasets/settings/form/index.tsx index 5f6fc00eb7..760954d6cb 100644 --- a/web/app/components/datasets/settings/form/index.tsx +++ b/web/app/components/datasets/settings/form/index.tsx @@ -17,11 +17,11 @@ import Input from '@/app/components/base/input' import Textarea from '@/app/components/base/textarea' import { ApiConnectionMod } from '@/app/components/base/icons/src/vender/solid/development' import { updateDatasetSetting } from '@/service/datasets' -import { type DataSetListResponse, RerankingModeEnum } from '@/models/datasets' +import { type DataSetListResponse } from '@/models/datasets' import DatasetDetailContext from '@/context/dataset-detail' import { type RetrievalConfig } from '@/types/app' import { useAppContext } from '@/context/app-context' -import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model' +import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model' import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector' import { useModelList, @@ -74,8 +74,6 @@ const Form = () => { ) const { modelList: rerankModelList, - defaultModel: rerankDefaultModel, - currentModel: isRerankDefaultModelValid, } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank) const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding) @@ -109,8 +107,6 @@ const Form = () => { } if ( !isReRankModelSelected({ - rerankDefaultModel, - isRerankDefaultModelValid: !!isRerankDefaultModelValid, rerankModelList, retrievalConfig, indexMethod, @@ -119,17 +115,9 @@ const Form = () => { notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') }) return } - const postRetrievalConfig = ensureRerankModelSelected({ - rerankDefaultModel: rerankDefaultModel!, - retrievalConfig: { - ...retrievalConfig, - reranking_enable: retrievalConfig.reranking_mode === RerankingModeEnum.RerankingModel, - }, - indexMethod, - }) - if (postRetrievalConfig.weights) { - postRetrievalConfig.weights.vector_setting.embedding_provider_name = currentDataset?.embedding_model_provider || '' - postRetrievalConfig.weights.vector_setting.embedding_model_name = currentDataset?.embedding_model || '' + if (retrievalConfig.weights) { + retrievalConfig.weights.vector_setting.embedding_provider_name = currentDataset?.embedding_model_provider || '' + retrievalConfig.weights.vector_setting.embedding_model_name = currentDataset?.embedding_model || '' } try { setLoading(true) @@ -141,8 +129,8 @@ const Form = () => { permission, indexing_technique: indexMethod, retrieval_model: { - ...postRetrievalConfig, - score_threshold: postRetrievalConfig.score_threshold_enabled ? postRetrievalConfig.score_threshold : 0, + ...retrievalConfig, + score_threshold: retrievalConfig.score_threshold_enabled ? retrievalConfig.score_threshold : 0, }, embedding_model: embeddingModel.model, embedding_model_provider: embeddingModel.provider, diff --git a/web/app/components/header/account-setting/model-provider-page/model-selector/model-trigger.tsx b/web/app/components/header/account-setting/model-provider-page/model-selector/model-trigger.tsx index 556a2ef66f..aba7ff5f64 100644 --- a/web/app/components/header/account-setting/model-provider-page/model-selector/model-trigger.tsx +++ b/web/app/components/header/account-setting/model-provider-page/model-selector/model-trigger.tsx @@ -36,6 +36,7 @@ const ModelTrigger: FC = ({ className={classNames( 'group flex items-center px-2 h-8 rounded-lg bg-components-input-bg-normal', !readonly && 'hover:bg-components-input-bg-hover cursor-pointer', + !!readonly && 'opacity-50', className, open && '!bg-components-input-bg-hover', model.status !== ModelStatusEnum.active && '!bg-[#FFFAEB]', diff --git a/web/app/components/workflow/nodes/knowledge-retrieval/components/retrieval-config.tsx b/web/app/components/workflow/nodes/knowledge-retrieval/components/retrieval-config.tsx index b335b62e33..d3e2079733 100644 --- a/web/app/components/workflow/nodes/knowledge-retrieval/components/retrieval-config.tsx +++ b/web/app/components/workflow/nodes/knowledge-retrieval/components/retrieval-config.tsx @@ -59,7 +59,8 @@ const RetrievalConfig: FC = ({ }, [onOpenFromPropsChange]) const { - defaultModel: rerankDefaultModel, + currentProvider: validRerankDefaultProvider, + currentModel: validRerankDefaultModel, } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank) const { multiple_retrieval_config } = payload @@ -75,8 +76,8 @@ const RetrievalConfig: FC = ({ ? undefined : (!configs.reranking_model?.reranking_provider_name ? { - provider: rerankDefaultModel?.provider?.provider || '', - model: rerankDefaultModel?.model || '', + provider: validRerankDefaultProvider?.provider || '', + model: validRerankDefaultModel?.model || '', } : { provider: configs.reranking_model?.reranking_provider_name, @@ -86,7 +87,7 @@ const RetrievalConfig: FC = ({ weights: configs.weights as any, reranking_enable: configs.reranking_enable, }) - }, [onMultipleRetrievalConfigChange, payload.retrieval_mode, rerankDefaultModel?.provider?.provider, rerankDefaultModel?.model, onRetrievalModeChange]) + }, [onMultipleRetrievalConfigChange, payload.retrieval_mode, validRerankDefaultProvider, validRerankDefaultModel, onRetrievalModeChange]) return ( { }) setInputs(newInput) // eslint-disable-next-line react-hooks/exhaustive-deps - }, [currentProvider?.provider, currentModel, rerankDefaultModel]) + }, [currentProvider?.provider, currentModel, currentRerankModel, rerankDefaultModel]) const [selectedDatasets, setSelectedDatasets] = useState([]) const [rerankModelOpen, setRerankModelOpen] = useState(false) const handleRetrievalModeChange = useCallback((newMode: RETRIEVE_TYPE) => { diff --git a/web/app/components/workflow/nodes/knowledge-retrieval/utils.ts b/web/app/components/workflow/nodes/knowledge-retrieval/utils.ts index 794fcbca4a..c7b48c1eaa 100644 --- a/web/app/components/workflow/nodes/knowledge-retrieval/utils.ts +++ b/web/app/components/workflow/nodes/knowledge-retrieval/utils.ts @@ -126,7 +126,7 @@ export const getMultipleRetrievalConfig = ( reranking_mode, reranking_model, weights, - reranking_enable: ((allInternal && allEconomic) || allExternal) ? reranking_enable : true, + reranking_enable: ((allInternal && allEconomic) || allExternal) ? reranking_enable : shouldSetWeightDefaultValue, } const setDefaultWeights = () => { @@ -152,16 +152,20 @@ export const getMultipleRetrievalConfig = ( if (allEconomic || mixtureHighQualityAndEconomic || inconsistentEmbeddingModel || allExternal || mixtureInternalAndExternal) { result.reranking_mode = RerankingModeEnum.RerankingModel - - if (rerankModelIsValid) { - result.reranking_mode = RerankingModeEnum.RerankingModel - result.reranking_model = { - provider: validRerankModel?.provider || '', - model: validRerankModel?.model || '', + if (!result.reranking_model?.provider || !result.reranking_model?.model) { + if (rerankModelIsValid) { + result.reranking_enable = true + result.reranking_model = { + provider: validRerankModel?.provider || '', + model: validRerankModel?.model || '', + } + } + else { + result.reranking_model = { + provider: '', + model: '', + } } - } - else { - result.reranking_model = undefined } } @@ -169,6 +173,7 @@ export const getMultipleRetrievalConfig = ( if (!reranking_mode) { if (validRerankModel?.provider && validRerankModel?.model) { result.reranking_mode = RerankingModeEnum.RerankingModel + result.reranking_enable = true result.reranking_model = { provider: validRerankModel.provider, model: validRerankModel.model, @@ -186,6 +191,7 @@ export const getMultipleRetrievalConfig = ( if (reranking_mode === RerankingModeEnum.WeightedScore && weights && shouldSetWeightDefaultValue) { if (rerankModelIsValid) { result.reranking_mode = RerankingModeEnum.RerankingModel + result.reranking_enable = true result.reranking_model = { provider: validRerankModel.provider || '', model: validRerankModel.model || '', @@ -199,6 +205,13 @@ export const getMultipleRetrievalConfig = ( result.reranking_mode = RerankingModeEnum.WeightedScore setDefaultWeights() } + if (reranking_mode === RerankingModeEnum.RerankingModel && rerankModelIsValid) { + result.reranking_enable = true + result.reranking_model = { + provider: validRerankModel.provider || '', + model: validRerankModel.model || '', + } + } } return result diff --git a/web/i18n/en-US/app-debug.ts b/web/i18n/en-US/app-debug.ts index 266da820a0..9c1b883871 100644 --- a/web/i18n/en-US/app-debug.ts +++ b/web/i18n/en-US/app-debug.ts @@ -483,7 +483,7 @@ const translation = { title: 'Multi-path retrieval', description: 'Based on user intent, queries across all Knowledge, retrieves relevant text from multi-sources, and selects the best results matching the user query after reranking. ', }, - rerankModelRequired: 'Rerank model is required', + rerankModelRequired: 'A configured Rerank Model is required', params: 'Params', top_k: 'Top K', top_kTip: 'Used to filter chunks that are most similar to user questions. The system will also dynamically adjust the value of Top K, according to max_tokens of the selected model.', diff --git a/web/i18n/en-US/workflow.ts b/web/i18n/en-US/workflow.ts index 0dfa8883eb..42b7048f85 100644 --- a/web/i18n/en-US/workflow.ts +++ b/web/i18n/en-US/workflow.ts @@ -183,7 +183,7 @@ const translation = { }, errorMsg: { fieldRequired: '{{field}} is required', - rerankModelRequired: 'Before turning on the Rerank Model, please confirm that the model has been successfully configured in the settings.', + rerankModelRequired: 'A configured Rerank Model is required', authRequired: 'Authorization is required', invalidJson: '{{field}} is invalid JSON', fields: { @@ -191,7 +191,7 @@ const translation = { variableValue: 'Variable Value', code: 'Code', model: 'Model', - rerankModel: 'Rerank Model', + rerankModel: 'A configured Rerank Model', visionVariable: 'Vision Variable', }, invalidVariable: 'Invalid variable', diff --git a/web/i18n/zh-Hans/app-debug.ts b/web/i18n/zh-Hans/app-debug.ts index 4e3f18ad7f..14f1358dd6 100644 --- a/web/i18n/zh-Hans/app-debug.ts +++ b/web/i18n/zh-Hans/app-debug.ts @@ -475,7 +475,7 @@ const translation = { title: '多路召回', description: '根据用户意图同时匹配所有知识库,从多路知识库查询相关文本片段,经过重排序步骤,从多路查询结果中选择匹配用户问题的最佳结果。', }, - rerankModelRequired: '请选择 Rerank 模型', + rerankModelRequired: '未配置 Rerank 模型', params: '参数设置', top_k: 'Top K', top_kTip: '用于筛选与用户问题相似度最高的文本片段。系统同时会根据选用模型上下文窗口大小动态调整分段数量。', diff --git a/web/i18n/zh-Hans/workflow.ts b/web/i18n/zh-Hans/workflow.ts index 91451b486b..93ebda4ce9 100644 --- a/web/i18n/zh-Hans/workflow.ts +++ b/web/i18n/zh-Hans/workflow.ts @@ -183,7 +183,7 @@ const translation = { }, errorMsg: { fieldRequired: '{{field}} 不能为空', - rerankModelRequired: '开启 Rerank 模型前,请务必确认模型已在设置中成功配置。', + rerankModelRequired: '未配置 Rerank 模型', authRequired: '请先授权', invalidJson: '{{field}} 是非法的 JSON', fields: { diff --git a/web/service/knowledge/use-document.ts b/web/service/knowledge/use-document.ts index 2b9981f22f..02e523bd90 100644 --- a/web/service/knowledge/use-document.ts +++ b/web/service/knowledge/use-document.ts @@ -29,6 +29,10 @@ export const useDocumentList = (payload: { }) } +export const useInvalidDocumentList = () => { + return useInvalid(useDocumentListKey) +} + const useAutoDisabledDocumentKey = [NAME_SPACE, 'autoDisabledDocument'] export const useAutoDisabledDocuments = (datasetId: string) => { return useQuery({ @@ -94,7 +98,7 @@ export const useSyncWebsite = () => { }) } -const useDocumentDetailKey = [NAME_SPACE, 'documentDetail'] +const useDocumentDetailKey = [NAME_SPACE, 'documentDetail', 'withoutMetaData'] export const useDocumentDetail = (payload: { datasetId: string documentId: string @@ -114,7 +118,7 @@ export const useDocumentMetadata = (payload: { }) => { const { datasetId, documentId, params } = payload return useQuery({ - queryKey: [...useDocumentDetailKey, 'withMetaData', datasetId, documentId], + queryKey: [...useDocumentDetailKey, 'onlyMetaData', datasetId, documentId], queryFn: () => get(`/datasets/${datasetId}/documents/${documentId}`, { params }), }) }