dify/web/app/components/datasets/create/step-two/index.tsx

950 lines
37 KiB
TypeScript
Raw Normal View History

2023-05-15 08:51:32 +08:00
'use client'
2024-12-03 14:14:37 +08:00
import type { FC, PropsWithChildren } from 'react'
import React, { useCallback, useEffect, useState } from 'react'
2023-05-15 08:51:32 +08:00
import { useTranslation } from 'react-i18next'
import { useContext } from 'use-context-selector'
2024-06-20 11:05:08 +08:00
import {
2024-11-20 16:24:06 +08:00
RiArrowLeftLine,
2024-11-20 15:25:26 +08:00
RiCloseLine,
RiSearchEyeLine,
2024-06-20 11:05:08 +08:00
} from '@remixicon/react'
2023-06-06 10:52:02 +08:00
import Link from 'next/link'
2024-11-20 10:13:29 +08:00
import Image from 'next/image'
import SettingCog from '../assets/setting-gear-mod.svg'
import OrangeEffect from '../assets/option-card-effect-orange.svg'
import FamilyMod from '../assets/family-mod.svg'
import Note from '../assets/note-mod.svg'
import FileList from '../assets/file-list-3-fill.svg'
import { indexMethodIcon } from '../icons'
import { PreviewContainer } from '../../preview/container'
import { ChunkContainer, QAPreview } from '../../chunk'
import { PreviewHeader } from '../../preview/header'
2023-06-06 10:52:02 +08:00
import s from './index.module.css'
2024-09-19 17:40:20 +08:00
import unescape from './unescape'
import escape from './escape'
2024-11-20 10:13:29 +08:00
import { OptionCard } from './option-card'
2024-11-20 15:25:26 +08:00
import LanguageSelect from './language-select'
2024-11-21 11:40:17 +08:00
import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
import cn from '@/utils/classnames'
2024-12-03 15:23:51 +08:00
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FullDocumentDetail, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
2024-12-03 17:26:45 +08:00
2023-05-15 08:51:32 +08:00
import Button from '@/app/components/base/button'
import FloatRightContainer from '@/app/components/base/float-right-container'
import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
import { type RetrievalConfig } from '@/types/app'
import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
2023-05-15 08:51:32 +08:00
import Toast from '@/app/components/base/toast'
2023-08-28 19:48:53 +08:00
import type { NotionPage } from '@/models/common'
import { DataSourceProvider } from '@/models/common'
import { DataSourceType, DocForm } from '@/models/datasets'
2023-06-19 16:32:25 +08:00
import { useDatasetDetailContext } from '@/context/dataset-detail'
import I18n from '@/context/i18n'
import { RETRIEVE_METHOD } from '@/types/app'
import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
2024-02-23 14:31:06 +08:00
import { LanguagesSupported } from '@/i18n/language'
import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
2024-04-04 15:54:59 +08:00
import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
2024-11-20 10:13:29 +08:00
import Checkbox from '@/app/components/base/checkbox'
import RadioCard from '@/app/components/base/radio-card'
2024-11-20 15:25:26 +08:00
import { MessageChatSquare } from '@/app/components/base/icons/src/public/common'
import { IS_CE_EDITION } from '@/config'
2024-11-21 11:40:17 +08:00
import Switch from '@/app/components/base/switch'
2024-11-26 17:22:02 +08:00
import Divider from '@/app/components/base/divider'
2024-12-03 17:26:45 +08:00
import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/use-datasets'
2024-12-03 15:23:51 +08:00
import Loading from '@/app/components/base/loading'
// Small semibold caption rendered as a <label>; used as the heading above
// the grouped inputs and rule lists inside the chunk-setting cards.
const TextLabel: FC<PropsWithChildren> = ({ children }) => {
  return <label className='text-text-secondary text-xs font-semibold leading-none'>{children}</label>
}
2023-05-15 08:51:32 +08:00
// Props accepted by the StepTwo component.
type StepTwoProps = {
  // When true, rules are loaded from `documentDetail` on mount and Save/Cancel buttons are rendered
  // instead of Previous/Next.
  isSetting?: boolean
  // Existing document; supplies the process rule, and (together with `datasetId`) the initial doc form and language.
  documentDetail?: FullDocumentDetail
  // When false, the high-quality index card is disabled and a warning with a settings link is shown.
  isAPIKeySet: boolean
  // Invoked when the link inside the missing-API-key warning is clicked.
  onSetting: () => void
  // Target dataset id; when absent, the "create first document" mutation is used.
  datasetId?: string
  // Index type fixed by the caller; when set, only the matching index card is rendered, in disabled style.
  indexingType?: IndexingType
  // Not read by this component.
  retrievalMethod?: string
  // Data source type used while on the create page.
  dataSourceType: DataSourceType
  // Files passed to the file indexing estimate; their ids are sent on creation.
  files: CustomFile[]
  // Notion pages passed to the Notion indexing estimate and to the creation request.
  notionPages?: NotionPage[]
  // Crawled pages passed to the website indexing estimate and to the creation request.
  websitePages?: CrawlResultItem[]
  crawlOptions?: CrawlOptions
  websiteCrawlProvider?: DataSourceProvider
  websiteCrawlJobId?: string
  // Called with -1 by the "previous step" button and with +1 after a successful create/save.
  onStepChange?: (delta: number) => void
  // Called with the locally selected index type after a successful create/save.
  updateIndexingTypeCache?: (type: string) => void
  // Called with the retrieval search method after the first document is created.
  updateRetrievalMethodCache?: (method: string) => void
  // Called with the response of the create mutation.
  updateResultCache?: (res: createDocumentResponse) => void
  // Called after a successful save when `isSetting` is true.
  onSave?: () => void
  // Click handler of the Cancel button (setting mode only).
  onCancel?: () => void
}
2024-12-03 15:23:51 +08:00
// Segmentation mode; the value is sent as `mode` of the process rule.
export enum SegmentType {
  AUTO = 'automatic',
  CUSTOM = 'custom',
}
2024-12-03 14:34:18 +08:00
// Index mode; the value is sent as `indexing_technique` when creating documents.
export enum IndexingType {
  QUALIFIED = 'high_quality',
  ECONOMICAL = 'economy',
}
2024-09-19 17:40:20 +08:00
// Default chunk separator in escaped form (a literal backslash + "n", twice).
// It is the initial value of the delimiter input and is passed through
// `unescape` before being sent as the segmentation separator.
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
2024-11-21 11:40:17 +08:00
// Delimiter and maximum length of one chunking level.
type ChunkSplitConfig = {
  delimiter: string
  maxLength: number
}

// Form state of the parent-child chunking card.
type ParentChildConfig = {
  chunkForContext: 'paragraph' | 'full_doc'
  parent: ChunkSplitConfig
  child: ChunkSplitConfig
}
2024-11-26 15:37:57 +08:00
// Values shared by both chunking levels in the default configuration.
const defaultChunkSplit = { delimiter: '\\n\\n', maxLength: 4000 }

// Initial state of the parent-child chunking card; also restored by the reset action.
const defaultParentChildConfig: ParentChildConfig = {
  chunkForContext: 'paragraph',
  parent: { ...defaultChunkSplit },
  child: { ...defaultChunkSplit },
}
2023-05-15 08:51:32 +08:00
const StepTwo = ({
isSetting,
documentDetail,
2024-06-05 00:13:29 +08:00
isAPIKeySet,
2023-05-15 08:51:32 +08:00
onSetting,
datasetId,
indexingType,
dataSourceType: inCreatePageDataSourceType,
files,
notionPages = [],
websitePages = [],
crawlOptions,
websiteCrawlProvider = DataSourceProvider.fireCrawl,
websiteCrawlJobId = '',
2023-05-15 08:51:32 +08:00
onStepChange,
updateIndexingTypeCache,
updateResultCache,
onSave,
onCancel,
updateRetrievalMethodCache,
2023-05-15 08:51:32 +08:00
}: StepTwoProps) => {
const { t } = useTranslation()
const { locale } = useContext(I18n)
const media = useBreakpoints()
const isMobile = media === MediaType.mobile

const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
// "Create page": there is no dataset id yet, or the dataset in context has no data source type.
const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
// On the create page the data source type comes from props, otherwise from the dataset in context.
const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type

const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
// The separator is stored in escaped form; an empty input falls back to the default separator.
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
const setSegmentIdentifier = useCallback((value: string) => {
  doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER)
}, [])
const [max, setMax] = useState(4000) // default chunk length
const [overlap, setOverlap] = useState(50)
const [rules, setRules] = useState<PreProcessingRule[]>([])
const [defaultConfig, setDefaultConfig] = useState<Rules>()

const hasSetIndexType = !!indexingType
// Start from the index type fixed by props when there is one, so the first render
// already matches it; otherwise pick by API key availability (same rule as the sync effect).
const [indexType, setIndexType] = useState<IndexingType>(
  indexingType || (isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL),
)

const [isLanguageSelectDisabled, setIsLanguageSelectDisabled] = useState(false)
const [docForm, setDocForm] = useState<DocForm | string>(
  (datasetId && documentDetail) ? documentDetail.doc_form : DocForm.TEXT,
)
const [docLanguage, setDocLanguage] = useState<string>(
  (datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese'),
)
const [QATipHide, setQATipHide] = useState(false)
const [qaPreviewSwitched, setQAPreviewSwitched] = useState(false)

const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)
2024-11-21 11:40:17 +08:00
2024-12-03 15:23:51 +08:00
// Index type in effect: the one fixed by props wins over the local selection.
const getIndexing_technique = () => {
  return indexingType || indexType
}
// Builds the process rule payload from the current form state. Only the custom
// segmentation mode carries pre-processing rules and segmentation settings.
const getProcessRule = () => {
  const isCustom = segmentationType === SegmentType.CUSTOM
  const payload: ProcessRule = {
    // api will check this. It will be removed after api refactored.
    rules: isCustom
      ? {
        pre_processing_rules: rules,
        segmentation: {
          separator: unescape(segmentIdentifier),
          max_tokens: max,
          chunk_overlap: overlap,
        },
      }
      : ({} as any),
    mode: segmentationType,
  }
  return payload
}
// Arguments common to the three indexing-estimate hooks below.
const estimateCommonArgs = {
  docForm: docForm as DocForm,
  docLanguage,
  indexingTechnique: getIndexing_technique() as any,
}
const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({
  ...estimateCommonArgs,
  dataSourceType: DataSourceType.FILE,
  files,
  processRule: getProcessRule(),
  dataset_id: datasetId!,
})
const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({
  ...estimateCommonArgs,
  dataSourceType: DataSourceType.NOTION,
  notionPages,
  processRule: getProcessRule(),
  dataset_id: datasetId || '',
})
const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({
  ...estimateCommonArgs,
  dataSourceType: DataSourceType.WEB,
  websitePages,
  crawlOptions,
  websiteCrawlProvider,
  websiteCrawlJobId,
  processRule: getProcessRule(),
  dataset_id: datasetId || '',
})

// Triggers the estimate request that matches the current data source type.
const fetchEstimate = useCallback(() => {
  switch (dataSourceType) {
    case DataSourceType.FILE:
      fileIndexingEstimateQuery.mutate()
      break
    case DataSourceType.NOTION:
      notionIndexingEstimateQuery.mutate()
      break
    case DataSourceType.WEB:
      websiteIndexingEstimateQuery.mutate()
      break
  }
}, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery])

// Estimate data of the active data source; any non-file, non-Notion type reads the website query.
const estimate = (() => {
  if (dataSourceType === DataSourceType.FILE)
    return fileIndexingEstimateQuery.data
  if (dataSourceType === DataSourceType.NOTION)
    return notionIndexingEstimateQuery.data
  return websiteIndexingEstimateQuery.data
})()
2024-12-03 17:26:45 +08:00
// const getIsEstimateReady = useCallback(() => {
//   if (dataSourceType === DataSourceType.FILE)
//     return fileIndexingEstimateQuery.isSuccess
//
//   if (dataSourceType === DataSourceType.NOTION)
//     return notionIndexingEstimateQuery.isSuccess
//
//   if (dataSourceType === DataSourceType.WEB)
//     return websiteIndexingEstimateQuery.isSuccess
// }, [dataSourceType, fileIndexingEstimateQuery.isSuccess, notionIndexingEstimateQuery.isSuccess, websiteIndexingEstimateQuery.isSuccess])
//
// const getFileName = (name: string) => {
//   const arr = name.split('.')
//   return arr.slice(0, -1).join('.')
// }
2023-05-15 08:51:32 +08:00
// Returns the translated label of a pre-processing rule id, or `undefined`
// when the id is not one of the three known rules.
const getRuleName = (key: string) => {
  switch (key) {
    case 'remove_extra_spaces':
      return t('datasetCreation.stepTwo.removeExtraSpaces')
    case 'remove_urls_emails':
      return t('datasetCreation.stepTwo.removeUrlEmails')
    case 'remove_stopwords':
      return t('datasetCreation.stepTwo.removeStopwords')
    default:
      return undefined
  }
}
// Toggles the `enabled` flag of the rule with the given id. Other rules are kept
// as the same object references, and the toggled rule keeps all its other fields.
const ruleChangeHandle = (id: string) => {
  setRules(rules.map(rule => (
    rule.id === id
      ? { ...rule, enabled: !rule.enabled }
      : rule
  )))
}
// Restores the segmentation inputs and pre-processing rules from the stored default
// config (when one has been loaded) and resets the parent-child card to its defaults.
const resetRules = () => {
  if (defaultConfig) {
    const { separator, max_tokens, chunk_overlap } = defaultConfig.segmentation
    setSegmentIdentifier(separator)
    setMax(max_tokens)
    // chunk_overlap may be missing from the config; keep the current overlap in that
    // case instead of writing `undefined` into the numeric state.
    if (typeof chunk_overlap === 'number')
      setOverlap(chunk_overlap)
    setRules(defaultConfig.pre_processing_rules)
  }
  setParentChildConfig(defaultParentChildConfig)
}
2024-12-03 14:14:37 +08:00
// Requests a fresh chunk preview. In custom segmentation mode a max length above
// 4000 is rejected with an error toast and no request is sent.
const updatePreview = () => {
  if (segmentationType === SegmentType.CUSTOM && max > 4000) {
    Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
    return
  }
  fetchEstimate()
  // A manual preview always shows plain chunks, not the QA preview.
  setQAPreviewSwitched(false)
}
// Rerank models: the list, the default model, and the hook's `currentModel`, which is
// later coerced to a boolean as the "default rerank model is valid" flag.
const {
  modelList: rerankModelList,
  defaultModel: rerankDefaultModel,
  currentModel: isRerankDefaultModelValid,
} = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
// Selected embedding model: the dataset's own model when it has one, otherwise the default
// text-embedding model (empty strings when that is not available at first render).
const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
  currentDataset?.embedding_model
    ? {
      provider: currentDataset.embedding_model_provider,
      model: currentDataset.embedding_model,
    }
    : {
      provider: defaultEmbeddingModel?.provider.provider || '',
      model: defaultEmbeddingModel?.model || '',
    },
)
2023-05-15 08:51:32 +08:00
// Validates the form and builds the request body for creating documents (or for
// re-processing an existing document in setting mode).
// Returns `undefined` after showing an error toast when validation fails.
const getCreationParams = () => {
  if (segmentationType === SegmentType.CUSTOM && overlap > max) {
    Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
    return
  }
  if (segmentationType === SegmentType.CUSTOM && max > 4000) {
    Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
    return
  }

  if (isSetting) {
    return {
      original_document_id: documentDetail?.id,
      doc_form: docForm,
      doc_language: docLanguage,
      process_rule: getProcessRule(),
      // eslint-disable-next-line @typescript-eslint/no-use-before-define
      retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
      embedding_model: embeddingModel.model, // Readonly
      embedding_model_provider: embeddingModel.provider, // Readonly
    } as CreateDocumentReq
  }

  // create
  const indexMethod = getIndexing_technique()
  if (
    !isReRankModelSelected({
      rerankDefaultModel,
      isRerankDefaultModelValid: !!isRerankDefaultModelValid,
      rerankModelList,
      // eslint-disable-next-line @typescript-eslint/no-use-before-define
      retrievalConfig,
      indexMethod: indexMethod as string,
    })
  ) {
    Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
    return
  }
  const postRetrievalConfig = ensureRerankModelSelected({
    rerankDefaultModel: rerankDefaultModel!,
    // eslint-disable-next-line @typescript-eslint/no-use-before-define
    retrievalConfig,
    indexMethod: indexMethod as string,
  })
  const params = {
    data_source: {
      type: dataSourceType,
      info_list: {
        data_source_type: dataSourceType,
      },
    },
    indexing_technique: getIndexing_technique(),
    process_rule: getProcessRule(),
    doc_form: docForm,
    doc_language: docLanguage,
    retrieval_model: postRetrievalConfig,
    embedding_model: embeddingModel.model,
    embedding_model_provider: embeddingModel.provider,
  } as CreateDocumentReq

  // Attach the source-specific info list.
  if (dataSourceType === DataSourceType.FILE) {
    params.data_source.info_list.file_info_list = {
      // Files without an id are left out.
      file_ids: files.map(file => file.id || '').filter(Boolean),
    }
  }
  if (dataSourceType === DataSourceType.NOTION)
    params.data_source.info_list.notion_info_list = getNotionInfo(notionPages)
  if (dataSourceType === DataSourceType.WEB) {
    params.data_source.info_list.website_info_list = getWebsiteInfo({
      websiteCrawlProvider,
      websiteCrawlJobId,
      websitePages,
    })
  }
  return params
}
2024-12-03 17:26:45 +08:00
// Loads the default process rule. On success the segmentation inputs and rules are
// filled in and the rules are remembered as the config used by the reset action.
const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({
  onSuccess(data) {
    const { segmentation, pre_processing_rules } = data.rules
    setSegmentIdentifier(segmentation.separator)
    setMax(segmentation.max_tokens)
    setOverlap(segmentation.chunk_overlap!)
    setRules(pre_processing_rules)
    setDefaultConfig(data.rules)
  },
  onError(error) {
    Toast.notify({
      type: 'error',
      message: `${error}`,
    })
  },
})
// Fills the segmentation inputs and rules from the process rule of `documentDetail`
// and remembers those rules as the config used by the reset action.
// Does nothing when no document detail was provided.
const getRulesFromDetail = () => {
  if (!documentDetail)
    return
  // Local names differ from the `rules` / `max` / `overlap` state on purpose.
  const detailRules = documentDetail.dataset_process_rule.rules
  const { separator, max_tokens, chunk_overlap } = detailRules.segmentation
  setSegmentIdentifier(separator)
  setMax(max_tokens)
  setOverlap(chunk_overlap as number)
  setRules(detailRules.pre_processing_rules)
  setDefaultConfig(detailRules)
}
// Applies the segmentation mode stored on `documentDetail`, when one was provided.
const getDefaultMode = () => {
  if (documentDetail)
    setSegmentationType(documentDetail.dataset_process_rule.mode)
}
2024-12-03 17:26:45 +08:00
// Both create mutations surface failures as an error toast with the stringified error.
const notifyCreateError = (error: unknown) => {
  Toast.notify({
    type: 'error',
    message: `${error}`,
  })
}
// Used when there is no dataset id yet.
const createFirstDocumentMutation = useCreateFirstDocument({
  onError: notifyCreateError,
})
// Used when documents are created in (or saved to) an existing dataset.
const createDocumentMutation = useCreateDocument(datasetId!, {
  onError: notifyCreateError,
})
// Drives the loading state of the Next/Save buttons.
const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending
// Click handler of the Next/Save button: validates the form, runs the matching create
// mutation, updates the caller's caches, then advances the step (and calls `onSave` in
// setting mode). Resolves to `false` when validation fails or the request fails.
const createHandle = async () => {
  const params = getCreationParams()
  if (!params)
    return false

  try {
    if (!datasetId) {
      await createFirstDocumentMutation.mutateAsync(
        params,
        {
          onSuccess(data) {
            updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
            updateResultCache && updateResultCache(data)
            // eslint-disable-next-line @typescript-eslint/no-use-before-define
            updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string)
          },
        },
      )
    }
    else {
      await createDocumentMutation.mutateAsync(params, {
        onSuccess(data) {
          updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
          updateResultCache && updateResultCache(data)
        },
      })
    }
  }
  catch {
    // `mutateAsync` rejects on failure. The mutations' own `onError` handlers already
    // show the error toast, so stop here instead of leaking an unhandled rejection
    // out of the click handler.
    return false
  }

  if (mutateDatasetRes)
    mutateDatasetRes()
  onStepChange && onStepChange(+1)
  isSetting && onSave && onSave()
}
2024-12-03 14:14:37 +08:00
// Switch handler: stores the QA doc form when the switch is on, the text form otherwise.
const handleDocformSwitch = (isQAMode: boolean) => {
  const nextDocForm = isQAMode ? DocForm.QA : DocForm.TEXT
  setDocForm(nextDocForm)
}
2024-12-03 17:26:45 +08:00
// Switches the preview to QA mode: marks the QA preview as active, disables the
// language select, and requests a new estimate.
const previewSwitch = () => {
  setQAPreviewSwitched(true)
  setIsLanguageSelectDisabled(true)
  fetchEstimate()
}
// Language select handler: stores the chosen language and, when the QA preview is
// currently shown, re-runs the QA preview.
const handleSelect = (language: string) => {
  setDocLanguage(language)
  // Switch language, re-cutter
  // NOTE(review): this runs in the same tick as setDocLanguage, so the estimate request
  // may still be built with the previous language — confirm against the estimate hooks.
  if (docForm === DocForm.QA && qaPreviewSwitched)
    previewSwitch()
}
// Click handler of the economical index card. Ignored when the index type is fixed by
// props; otherwise selects the economical type and falls back to the text doc form.
const changeToEconomicalType = () => {
  if (hasSetIndexType)
    return
  setIndexType(IndexingType.ECONOMICAL)
  setDocForm(DocForm.TEXT)
}
2023-05-15 08:51:32 +08:00
// On mount: load the default process rule when creating, or read the rules and
// segmentation mode from the existing document when in setting mode.
useEffect(() => {
  // fetch rules
  if (!isSetting) {
    fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule')
  }
  else {
    getRulesFromDetail()
    getDefaultMode()
  }
}, [])

// The QA doc form is not kept when the index type fixed by props is economical.
useEffect(() => {
  if (indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA)
    setDocForm(DocForm.TEXT)
}, [indexingType, docForm])

// Keep the local index type in sync with props: the fixed type when given, otherwise
// high quality only if an API key is set.
useEffect(() => {
  // get indexing type by props
  if (indexingType)
    setIndexType(indexingType as IndexingType)
  else
    setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
}, [isAPIKeySet, indexingType, datasetId])
2023-05-15 08:51:32 +08:00
// Retrieval settings start from the dataset's stored retrieval model when there is one;
// otherwise from a semantic-search default that points at the default rerank model.
const initialRetrievalConfig = currentDataset?.retrieval_model_dict || {
  search_method: RETRIEVE_METHOD.semantic,
  reranking_enable: false,
  reranking_model: {
    reranking_provider_name: rerankDefaultModel?.provider.provider,
    reranking_model_name: rerankDefaultModel?.model,
  },
  top_k: 3,
  score_threshold_enabled: false,
  score_threshold: 0.5,
} as RetrievalConfig
const [retrievalConfig, setRetrievalConfig] = useState(initialRetrievalConfig)
2023-05-15 08:51:32 +08:00
// Layout: the settings form on the left (chunk settings, index mode, QA option, embedding
// model, retrieval settings, navigation buttons) and the chunk preview panel on the right.
return (
  <div className='flex w-full max-h-full h-full overflow-y-auto'>
    <div className='relative h-full w-full overflow-y-scroll'>
      <div className={cn(s.form, isMobile && '!px-4')}>
        <div className={s.label}>{t('datasetCreation.stepTwo.segmentation')}</div>
        <div className='max-w-[640px]'>
          <div className='space-y-4'>
            <OptionCard
              title={t('datasetCreation.stepTwo.general')}
              icon={<Image src={SettingCog} alt={t('datasetCreation.stepTwo.general')} />}
              activeHeaderClassName='bg-gradient-to-r from-[#EFF0F9] to-[#F9FAFB]'
              description={t('datasetCreation.stepTwo.generalTip')}
              isActive={SegmentType.AUTO === segmentationType}
              onClick={() => setSegmentationType(SegmentType.AUTO)}
              actions={
                <>
                  <Button variant={'secondary-accent'} onClick={() => updatePreview()}>
                    <RiSearchEyeLine className='h-4 w-4 mr-1.5' />
                    {t('datasetCreation.stepTwo.previewChunk')}
                  </Button>
                  <Button variant={'ghost'} onClick={resetRules}>
                    {t('datasetCreation.stepTwo.reset')}
                  </Button>
                </>
              }
            >
              <div className='space-y-4'>
                <div className='flex gap-3'>
                  <DelimiterInput
                    value={segmentIdentifier}
                    onChange={e => setSegmentIdentifier(e.target.value)}
                  />
                  <MaxLengthInput
                    value={max}
                    onChange={setMax}
                  />
                  <OverlapInput
                    value={overlap}
                    min={1}
                    onChange={setOverlap}
                  />
                </div>
                <div className='space-y-2'>
                  <div className='w-full flex flex-col'>
                    <TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>
                    <div className='mt-4 space-y-2'>
                      {rules.map(rule => (
                        <div key={rule.id} className={s.ruleItem} onClick={() => {
                          ruleChangeHandle(rule.id)
                        }}>
                          <Checkbox
                            checked={rule.enabled}
                          />
                          <label className="ml-2 text-sm font-normal cursor-pointer text-gray-800">{getRuleName(rule.id)}</label>
                        </div>
                      ))}
                    </div>
                  </div>
                </div>
              </div>
            </OptionCard>
            <OptionCard
              title={t('datasetCreation.stepTwo.parentChild')}
              icon={<Image src={FamilyMod} alt={t('datasetCreation.stepTwo.parentChild')} />}
              effectImg={OrangeEffect.src}
              activeHeaderClassName='bg-gradient-to-r from-[#F9F1EE] to-[#F9FAFB]'
              description={t('datasetCreation.stepTwo.parentChildTip')}
              isActive={SegmentType.CUSTOM === segmentationType}
              onClick={() => setSegmentationType(SegmentType.CUSTOM)}
              actions={
                <>
                  <Button variant={'secondary-accent'} onClick={() => updatePreview()}>
                    <RiSearchEyeLine className='h-4 w-4 mr-1.5' />
                    {t('datasetCreation.stepTwo.previewChunk')}
                  </Button>
                  <Button variant={'ghost'} onClick={resetRules}>
                    {t('datasetCreation.stepTwo.reset')}
                  </Button>
                </>
              }
            >
              <div className='space-y-4'>
                <div className='space-y-2'>
                  <TextLabel>
                    {t('datasetCreation.stepTwo.parentChunkForContext')}
                  </TextLabel>
                  <RadioCard
                    icon={<Image src={Note} alt='' />}
                    title={t('datasetCreation.stepTwo.paragraph')}
                    description={t('datasetCreation.stepTwo.paragraphTip')}
                    isChosen={parentChildConfig.chunkForContext === 'paragraph'}
                    onChosen={() => setParentChildConfig(
                      {
                        ...parentChildConfig,
                        chunkForContext: 'paragraph',
                      },
                    )}
                    chosenConfig={
                      <div className='flex gap-2'>
                        <DelimiterInput
                          value={parentChildConfig.parent.delimiter}
                          onChange={e => setParentChildConfig({
                            ...parentChildConfig,
                            parent: {
                              ...parentChildConfig.parent,
                              delimiter: e.target.value,
                            },
                          })}
                        />
                        <MaxLengthInput
                          value={parentChildConfig.parent.maxLength}
                          onChange={value => setParentChildConfig({
                            ...parentChildConfig,
                            parent: {
                              ...parentChildConfig.parent,
                              maxLength: value,
                            },
                          })}
                        />
                      </div>
                    }
                  />
                  <RadioCard
                    icon={<Image src={FileList} alt='' />}
                    title={t('datasetCreation.stepTwo.fullDoc')}
                    description={t('datasetCreation.stepTwo.fullDocTip')}
                    onChosen={() => setParentChildConfig(
                      {
                        ...parentChildConfig,
                        chunkForContext: 'full_doc',
                      },
                    )}
                    isChosen={parentChildConfig.chunkForContext === 'full_doc'}
                  />
                </div>
                <div className='space-y-4'>
                  <TextLabel>
                    {t('datasetCreation.stepTwo.childChunkForRetrieval')}
                  </TextLabel>
                  <div className='flex gap-3 mt-2'>
                    <DelimiterInput
                      value={parentChildConfig.child.delimiter}
                      onChange={e => setParentChildConfig({
                        ...parentChildConfig,
                        child: {
                          ...parentChildConfig.child,
                          delimiter: e.target.value,
                        },
                      })}
                    />
                    <MaxLengthInput
                      value={parentChildConfig.child.maxLength}
                      onChange={value => setParentChildConfig({
                        ...parentChildConfig,
                        child: {
                          ...parentChildConfig.child,
                          maxLength: value,
                        },
                      })}
                    />
                  </div>
                  <div className='space-y-2'>
                    <TextLabel>
                      {t('datasetCreation.stepTwo.rules')}
                    </TextLabel>
                    <div className='space-y-2 mt-2'>
                      {rules.map(rule => (
                        <div key={rule.id} className={s.ruleItem} onClick={() => {
                          ruleChangeHandle(rule.id)
                        }}>
                          <Checkbox
                            checked={rule.enabled}
                          />
                          <label className="ml-2 text-sm font-normal cursor-pointer text-gray-800">{getRuleName(rule.id)}</label>
                        </div>
                      ))}
                    </div>
                  </div>
                </div>
              </div>
            </OptionCard>
          </div>
        </div>
        <Divider className='my-5' />
        <div className={s.label}>{t('datasetCreation.stepTwo.indexMode')}</div>
        <div className='max-w-[640px]'>
          <div className='flex items-center gap-3 flex-wrap sm:flex-nowrap'>
            {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
              <div
                className={cn(
                  s.radioItem,
                  s.indexItem,
                  !isAPIKeySet && s.disabled,
                  !hasSetIndexType && indexType === IndexingType.QUALIFIED && s.active,
                  hasSetIndexType && s.disabled,
                  hasSetIndexType && '!w-full !min-h-[96px]',
                )}
                onClick={() => {
                  if (isAPIKeySet)
                    setIndexType(IndexingType.QUALIFIED)
                }}
              >
                <div className='h-8 p-1.5 bg-white rounded-lg border border-components-panel-border-subtle justify-center items-center inline-flex absolute left-5 top-[18px]'>
                  <Image src={indexMethodIcon.high_quality} alt='Gold Icon' width={20} height={20} />
                </div>
                {!hasSetIndexType && <span className={cn(s.radio)} />}
                <div className={s.typeHeader}>
                  <div className={s.title}>
                    {t('datasetCreation.stepTwo.qualified')}
                    {!hasSetIndexType && <span className={s.recommendTag}>{t('datasetCreation.stepTwo.recommend')}</span>}
                  </div>
                  <div className={s.tip}>{t('datasetCreation.stepTwo.qualifiedTip')}</div>
                </div>
                {!isAPIKeySet && (
                  <div className={s.warningTip}>
                    <span>{t('datasetCreation.stepTwo.warning')}&nbsp;</span>
                    <span className={s.click} onClick={onSetting}>{t('datasetCreation.stepTwo.click')}</span>
                  </div>
                )}
              </div>
            )}
            {(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
              <div
                className={cn(
                  s.radioItem,
                  s.indexItem,
                  !hasSetIndexType && indexType === IndexingType.ECONOMICAL && s.active,
                  hasSetIndexType && s.disabled,
                  hasSetIndexType && '!w-full !min-h-[96px]',
                )}
                onClick={changeToEconomicalType}
              >
                <div className='h-8 p-1.5 bg-white rounded-lg border border-components-panel-border-subtle justify-center items-center inline-flex absolute left-5 top-[18px]'>
                  <Image src={indexMethodIcon.economical} alt='Economical Icon' width={20} height={20} />
                </div>
                {!hasSetIndexType && <span className={cn(s.radio)} />}
                <div className={s.typeHeader}>
                  <div className={s.title}>{t('datasetCreation.stepTwo.economical')}</div>
                  <div className={s.tip}>{t('datasetCreation.stepTwo.economicalTip')}</div>
                </div>
              </div>
            )}
          </div>
          {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
            <div className='mt-2 text-xs text-gray-500 font-medium'>
              {t('datasetCreation.stepTwo.indexSettingTip')}
              <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
            </div>
          )}
          {IS_CE_EDITION && indexType === IndexingType.QUALIFIED && (
            <div className='mt-2 rounded-xl bg-gray-50 border border-gray-100'>
              <div className='flex justify-between items-center px-5 py-4'>
                <div className='flex justify-center items-center w-8 h-8 rounded-lg bg-indigo-50'>
                  <MessageChatSquare className='w-4 h-4' />
                </div>
                <div className='grow mx-3'>
                  <div className='mb-0.5 text-md font-medium text-gray-900'>{t('datasetCreation.stepTwo.QATitle')}</div>
                  <div className='inline-flex items-center text-[13px] leading-[18px] text-gray-500'>
                    <span className='pr-1'>{t('datasetCreation.stepTwo.QALanguage')}</span>
                    <LanguageSelect currentLanguage={docLanguage} onSelect={handleSelect} disabled={isLanguageSelectDisabled} />
                  </div>
                </div>
                <Switch
                  defaultValue={docForm === DocForm.QA}
                  onChange={handleDocformSwitch}
                  size='md'
                />
              </div>
              {docForm === DocForm.QA && !QATipHide && (
                <div className='flex justify-between items-center px-5 py-2 bg-orange-50 border-t border-amber-100 rounded-b-xl text-[13px] leading-[18px] text-medium text-amber-500'>
                  {t('datasetCreation.stepTwo.QATip')}
                  <RiCloseLine className='w-4 h-4 text-gray-500 cursor-pointer' onClick={() => setQATipHide(true)} />
                </div>
              )}
            </div>
          )}
          {/* Embedding model */}
          {indexType === IndexingType.QUALIFIED && (
            <div className='mt-6 my-2'>
              <div className={cn(s.label, datasetId && 'flex justify-between items-center')}>{t('datasetSettings.form.embeddingModel')}</div>
              <ModelSelector
                readonly={!!datasetId}
                defaultModel={embeddingModel}
                modelList={embeddingModelList}
                onSelect={(model: DefaultModel) => {
                  setEmbeddingModel(model)
                }}
              />
              {!!datasetId && (
                <div className='mt-2 text-xs text-gray-500 font-medium'>
                  {t('datasetCreation.stepTwo.indexSettingTip')}
                  <Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
                </div>
              )}
            </div>
          )}
          <Divider className='my-5' />
          {/* Retrieval Method Config */}
          <div>
            {!datasetId
              ? (
                <div className={s.label}>
                  <div className='shrink-0 mr-4'>{t('datasetSettings.form.retrievalSetting.title')}</div>
                  <div className='leading-[18px] text-xs font-normal text-gray-500'>
                    <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-text-accent'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
                    {t('datasetSettings.form.retrievalSetting.longDescription')}
                  </div>
                </div>
              )
              : (
                <div className={cn(s.label, 'flex justify-between items-center')}>
                  <div>{t('datasetSettings.form.retrievalSetting.title')}</div>
                </div>
              )}
            <div className='max-w-[640px]'>
              {
                getIndexing_technique() === IndexingType.QUALIFIED
                  ? (
                    <RetrievalMethodConfig
                      value={retrievalConfig}
                      onChange={setRetrievalConfig}
                    />
                  )
                  : (
                    <EconomicalRetrievalMethodConfig
                      value={retrievalConfig}
                      onChange={setRetrievalConfig}
                    />
                  )
              }
            </div>
          </div>
          {!isSetting
            ? (
              <div className='flex items-center mt-8 py-2'>
                <Button onClick={() => onStepChange && onStepChange(-1)}>
                  <RiArrowLeftLine className='w-4 h-4 mr-1' />
                  {t('datasetCreation.stepTwo.previousStep')}
                </Button>
                <Button className='ml-auto' loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
              </div>
            )
            : (
              <div className='flex items-center mt-8 py-2'>
                <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
                <Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
              </div>
            )}
        </div>
      </div>
    </div>
    <FloatRightContainer isMobile={isMobile} isOpen={true} onClose={() => { }} footer={null}>
      <PreviewContainer
        header={<PreviewHeader
          title='Preview'
        >
        </PreviewHeader>}
        className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll space-y-4')}
      >
        {/* Keys include the position: two chunks (or questions) can have identical text. */}
        {qaPreviewSwitched && docForm === DocForm.QA && estimate?.qa_preview && (
          estimate?.qa_preview.map((item, index) => (
            <QAPreview key={`${index}-${item.question}`} qa={item} />
          ))
        )}
        {(docForm === DocForm.TEXT || !qaPreviewSwitched) && estimate?.preview && (
          estimate?.preview.map((item, index) => (
            <ChunkContainer
              key={`${index}-${item}`}
              label={`Chunk-${index + 1}`}
              characterCount={item.length}
            >
              {item}
            </ChunkContainer>
          ))
        )}
        {qaPreviewSwitched && docForm === DocForm.QA && !estimate?.qa_preview && (
          <div className='flex items-center justify-center h-[200px]'>
            <Loading type='area' />
          </div>
        )}
        {!qaPreviewSwitched && !estimate?.preview && (
          <div className='flex items-center justify-center h-[200px]'>
            <Loading type='area' />
          </div>
        )}
      </PreviewContainer>
    </FloatRightContainer>
  </div>
)
}
export default StepTwo