2023-05-15 08:51:32 +08:00
|
|
|
'use client'
|
2024-12-03 14:14:37 +08:00
|
|
|
import type { FC, PropsWithChildren } from 'react'
|
|
|
|
import React, { useCallback, useEffect, useState } from 'react'
|
2023-05-15 08:51:32 +08:00
|
|
|
import { useTranslation } from 'react-i18next'
|
2023-08-18 17:37:31 +08:00
|
|
|
import { useContext } from 'use-context-selector'
|
2024-06-20 11:05:08 +08:00
|
|
|
import {
|
2024-11-20 16:24:06 +08:00
|
|
|
RiArrowLeftLine,
|
2024-11-20 15:25:26 +08:00
|
|
|
RiCloseLine,
|
2024-11-20 14:55:59 +08:00
|
|
|
RiSearchEyeLine,
|
2024-06-20 11:05:08 +08:00
|
|
|
} from '@remixicon/react'
|
2023-06-06 10:52:02 +08:00
|
|
|
import Link from 'next/link'
|
2024-11-20 10:13:29 +08:00
|
|
|
import Image from 'next/image'
|
|
|
|
import SettingCog from '../assets/setting-gear-mod.svg'
|
2024-11-20 14:55:59 +08:00
|
|
|
import OrangeEffect from '../assets/option-card-effect-orange.svg'
|
|
|
|
import FamilyMod from '../assets/family-mod.svg'
|
|
|
|
import Note from '../assets/note-mod.svg'
|
|
|
|
import FileList from '../assets/file-list-3-fill.svg'
|
2024-11-25 17:57:31 +08:00
|
|
|
import { indexMethodIcon } from '../icons'
|
2024-12-04 11:52:05 +08:00
|
|
|
import { PreviewContainer } from '../../preview/container'
|
|
|
|
import { ChunkContainer, QAPreview } from '../../chunk'
|
|
|
|
import { PreviewHeader } from '../../preview/header'
|
2023-06-06 10:52:02 +08:00
|
|
|
import s from './index.module.css'
|
2024-09-19 17:40:20 +08:00
|
|
|
import unescape from './unescape'
|
|
|
|
import escape from './escape'
|
2024-11-20 10:13:29 +08:00
|
|
|
import { OptionCard } from './option-card'
|
2024-11-20 15:25:26 +08:00
|
|
|
import LanguageSelect from './language-select'
|
2024-11-21 11:40:17 +08:00
|
|
|
import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
|
2024-07-09 15:05:40 +08:00
|
|
|
import cn from '@/utils/classnames'
|
2024-12-03 15:23:51 +08:00
|
|
|
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FullDocumentDetail, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
|
2024-12-03 17:26:45 +08:00
|
|
|
|
2023-05-15 08:51:32 +08:00
|
|
|
import Button from '@/app/components/base/button'
|
2023-11-27 11:47:48 +08:00
|
|
|
import FloatRightContainer from '@/app/components/base/float-right-container'
|
2023-11-18 11:53:35 +08:00
|
|
|
import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
|
|
|
|
import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
|
|
|
|
import { type RetrievalConfig } from '@/types/app'
|
|
|
|
import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
|
2023-05-15 08:51:32 +08:00
|
|
|
import Toast from '@/app/components/base/toast'
|
2023-08-28 19:48:53 +08:00
|
|
|
import type { NotionPage } from '@/models/common'
|
2024-09-30 09:57:19 +08:00
|
|
|
import { DataSourceProvider } from '@/models/common'
|
2023-08-18 17:37:31 +08:00
|
|
|
import { DataSourceType, DocForm } from '@/models/datasets'
|
2023-06-19 16:32:25 +08:00
|
|
|
import { useDatasetDetailContext } from '@/context/dataset-detail'
|
2023-08-18 17:37:31 +08:00
|
|
|
import I18n from '@/context/i18n'
|
2023-11-18 11:53:35 +08:00
|
|
|
import { RETRIEVE_METHOD } from '@/types/app'
|
2023-11-27 11:47:48 +08:00
|
|
|
import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
|
2024-09-04 14:41:47 +08:00
|
|
|
import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
|
2024-02-23 14:31:06 +08:00
|
|
|
import { LanguagesSupported } from '@/i18n/language'
|
2024-09-04 14:41:47 +08:00
|
|
|
import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
|
|
|
|
import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
|
2024-04-04 15:54:59 +08:00
|
|
|
import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
|
2024-11-20 10:13:29 +08:00
|
|
|
import Checkbox from '@/app/components/base/checkbox'
|
2024-11-20 14:55:59 +08:00
|
|
|
import RadioCard from '@/app/components/base/radio-card'
|
2024-11-20 15:25:26 +08:00
|
|
|
import { MessageChatSquare } from '@/app/components/base/icons/src/public/common'
|
|
|
|
import { IS_CE_EDITION } from '@/config'
|
2024-11-21 11:40:17 +08:00
|
|
|
import Switch from '@/app/components/base/switch'
|
2024-11-26 17:22:02 +08:00
|
|
|
import Divider from '@/app/components/base/divider'
|
2024-12-03 17:26:45 +08:00
|
|
|
import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/use-datasets'
|
2024-12-03 15:23:51 +08:00
|
|
|
import Loading from '@/app/components/base/loading'
|
2024-11-20 14:55:59 +08:00
|
|
|
|
|
|
|
/**
 * Renders its children inside a `<label>` styled as small, semibold
 * secondary text.
 */
const TextLabel: FC<PropsWithChildren> = (props) => {
  return <label className='text-text-secondary text-xs font-semibold leading-none'>{props.children}</label>
}
|
|
|
|
|
2023-05-15 08:51:32 +08:00
|
|
|
/** Props accepted by the `StepTwo` component. */
type StepTwoProps = {
  /**
   * When true the component works on an existing document: rules and mode are
   * read from `documentDetail` instead of being fetched, the request carries
   * `original_document_id`, and `onSave` is invoked after a successful submit.
   */
  isSetting?: boolean
  /** Existing document; supplies `doc_form`, `doc_language` and the process rule. */
  documentDetail?: FullDocumentDetail
  /** With no `indexingType` given, selects the initial index type (high quality when true, economy otherwise). */
  isAPIKeySet: boolean
  /** NOTE(review): not invoked in the visible part of the component — confirm usage. */
  onSetting: () => void
  /** Target dataset; when absent the "create first document" mutation is used. */
  datasetId?: string
  /** Preset indexing technique; takes precedence over the locally selected one. */
  indexingType?: IndexingType
  /** NOTE(review): declared but not destructured by the component — confirm whether it is still needed. */
  retrievalMethod?: string
  /** Data source type used while on the create page. */
  dataSourceType: DataSourceType
  /** Files to index when the data source is a file upload. */
  files: CustomFile[]
  /** Notion pages to index; defaults to an empty list. */
  notionPages?: NotionPage[]
  /** Crawled pages to index; defaults to an empty list. */
  websitePages?: CrawlResultItem[]
  /** Crawl options forwarded to the website indexing estimate. */
  crawlOptions?: CrawlOptions
  /** Crawl provider; defaults to `DataSourceProvider.fireCrawl`. */
  websiteCrawlProvider?: DataSourceProvider
  /** Crawl job id; defaults to an empty string. */
  websiteCrawlJobId?: string
  /** Called with `+1` after a successful submit. */
  onStepChange?: (delta: number) => void
  /** Receives the selected index type after a successful submit. */
  updateIndexingTypeCache?: (type: string) => void
  /** Receives the retrieval search method after the first document is created. */
  updateRetrievalMethodCache?: (method: string) => void
  /** Receives the creation response after a successful submit. */
  updateResultCache?: (res: createDocumentResponse) => void
  /** Called after a successful submit when `isSetting` is true. */
  onSave?: () => void
  /** NOTE(review): not invoked in the visible part of the component — confirm usage. */
  onCancel?: () => void
}
|
|
|
|
|
2024-12-03 15:23:51 +08:00
|
|
|
/** Segmentation mode; the value is sent as the process rule's `mode`. */
export enum SegmentType {
  AUTO = 'automatic',
  CUSTOM = 'custom',
}
|
2024-12-03 14:34:18 +08:00
|
|
|
/** Indexing technique; the value is sent as `indexing_technique`. */
export enum IndexingType {
  QUALIFIED = 'high_quality',
  ECONOMICAL = 'economy',
}
|
|
|
|
|
2024-09-19 17:40:20 +08:00
|
|
|
// Default segment separator, kept in escaped form (the four characters `\n\n`,
// not two newlines); `unescape` is applied before it is sent as the separator.
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
|
|
|
|
|
2024-11-21 11:40:17 +08:00
|
|
|
/** Settings for parent/child chunking. */
type ParentChildConfig = {
  /** Which unit is used as context: a paragraph or the full document. */
  chunkForContext: 'paragraph' | 'full_doc'
  /** Delimiter and maximum length for parent chunks. */
  parent: {
    delimiter: string
    maxLength: number
  }
  /** Delimiter and maximum length for child chunks. */
  child: {
    delimiter: string
    maxLength: number
  }
}

/**
 * Initial parent/child settings, also used when the rules are reset.
 * Delimiters are kept in escaped form (literal `\n\n` characters).
 */
const defaultParentChildConfig: ParentChildConfig = {
  chunkForContext: 'paragraph',
  parent: {
    delimiter: '\\n\\n',
    maxLength: 4000,
  },
  child: {
    delimiter: '\\n\\n',
    maxLength: 4000,
  },
}
|
|
|
|
|
2023-05-15 08:51:32 +08:00
|
|
|
const StepTwo = ({
|
2023-06-01 23:19:36 +08:00
|
|
|
isSetting,
|
|
|
|
documentDetail,
|
2024-06-05 00:13:29 +08:00
|
|
|
isAPIKeySet,
|
2023-05-15 08:51:32 +08:00
|
|
|
onSetting,
|
|
|
|
datasetId,
|
|
|
|
indexingType,
|
2024-06-14 22:02:41 +08:00
|
|
|
dataSourceType: inCreatePageDataSourceType,
|
2023-06-21 09:44:01 +08:00
|
|
|
files,
|
2023-06-16 21:47:51 +08:00
|
|
|
notionPages = [],
|
2024-06-14 22:02:41 +08:00
|
|
|
websitePages = [],
|
|
|
|
crawlOptions,
|
2024-09-30 09:57:19 +08:00
|
|
|
websiteCrawlProvider = DataSourceProvider.fireCrawl,
|
|
|
|
websiteCrawlJobId = '',
|
2023-05-15 08:51:32 +08:00
|
|
|
onStepChange,
|
|
|
|
updateIndexingTypeCache,
|
|
|
|
updateResultCache,
|
2023-06-01 23:19:36 +08:00
|
|
|
onSave,
|
|
|
|
onCancel,
|
2024-11-25 17:57:31 +08:00
|
|
|
updateRetrievalMethodCache,
|
2023-05-15 08:51:32 +08:00
|
|
|
}: StepTwoProps) => {
|
|
|
|
const { t } = useTranslation()
|
2023-08-18 17:37:31 +08:00
|
|
|
const { locale } = useContext(I18n)
|
2023-11-27 11:47:48 +08:00
|
|
|
const media = useBreakpoints()
|
|
|
|
const isMobile = media === MediaType.mobile
|
|
|
|
|
2023-11-18 11:53:35 +08:00
|
|
|
const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
|
2024-06-14 22:02:41 +08:00
|
|
|
const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
|
|
|
|
const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
|
2023-05-15 08:51:32 +08:00
|
|
|
const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
|
2024-09-19 17:40:20 +08:00
|
|
|
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
|
|
|
|
const setSegmentIdentifier = useCallback((value: string) => {
|
|
|
|
doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER)
|
|
|
|
}, [])
|
|
|
|
const [max, setMax] = useState(4000) // default chunk length
|
2024-01-26 13:24:40 +08:00
|
|
|
const [overlap, setOverlap] = useState(50)
|
2023-05-15 08:51:32 +08:00
|
|
|
const [rules, setRules] = useState<PreProcessingRule[]>([])
|
|
|
|
const [defaultConfig, setDefaultConfig] = useState<Rules>()
|
|
|
|
const hasSetIndexType = !!indexingType
|
2024-12-04 11:52:05 +08:00
|
|
|
const [indexType, setIndexType] = useState<IndexingType>(
|
2023-10-07 17:42:16 +08:00
|
|
|
(indexingType
|
2024-06-05 00:13:29 +08:00
|
|
|
|| isAPIKeySet)
|
2023-06-06 10:52:02 +08:00
|
|
|
? IndexingType.QUALIFIED
|
|
|
|
: IndexingType.ECONOMICAL,
|
2023-05-15 08:51:32 +08:00
|
|
|
)
|
2024-10-14 13:32:13 +08:00
|
|
|
const [isLanguageSelectDisabled, setIsLanguageSelectDisabled] = useState(false)
|
2023-07-28 20:47:15 +08:00
|
|
|
const [docForm, setDocForm] = useState<DocForm | string>(
|
2023-10-07 17:42:16 +08:00
|
|
|
(datasetId && documentDetail) ? documentDetail.doc_form : DocForm.TEXT,
|
2023-07-28 20:47:15 +08:00
|
|
|
)
|
2024-08-28 08:45:51 +08:00
|
|
|
const [docLanguage, setDocLanguage] = useState<string>(
|
|
|
|
(datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese'),
|
|
|
|
)
|
2023-08-18 17:37:31 +08:00
|
|
|
const [QATipHide, setQATipHide] = useState(false)
|
2024-12-03 17:26:45 +08:00
|
|
|
const [qaPreviewSwitched, setQAPreviewSwitched] = useState(false)
|
2023-05-15 08:51:32 +08:00
|
|
|
|
2024-11-26 15:37:57 +08:00
|
|
|
const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)
|
2024-11-21 11:40:17 +08:00
|
|
|
|
2024-12-03 15:23:51 +08:00
|
|
|
const getIndexing_technique = () => indexingType || indexType
|
|
|
|
|
|
|
|
/**
 * Builds the process rule for the current segmentation settings.
 *
 * In custom mode the rule carries the pre-processing rules and the
 * segmentation values (separator unescaped, max tokens, chunk overlap);
 * in any other mode `rules` stays an empty object.
 */
const getProcessRule = () => {
  const processRule: ProcessRule = {
    rules: {} as any, // api will check this. It will be removed after api refactored.
    mode: segmentationType,
  }
  if (segmentationType === SegmentType.CUSTOM) {
    const customRules = {
      pre_processing_rules: rules,
      segmentation: {
        // State holds the escaped form; convert it back before sending.
        separator: unescape(segmentIdentifier),
        max_tokens: max,
        chunk_overlap: overlap,
      },
    }
    processRule.rules = customRules
  }
  return processRule
}
|
|
|
|
|
|
|
|
const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({
|
|
|
|
docForm: docForm as DocForm,
|
|
|
|
docLanguage,
|
|
|
|
dataSourceType: DataSourceType.FILE,
|
|
|
|
files,
|
|
|
|
indexingTechnique: getIndexing_technique() as any,
|
|
|
|
processRule: getProcessRule(),
|
|
|
|
dataset_id: datasetId!,
|
|
|
|
})
|
|
|
|
const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({
|
|
|
|
docForm: docForm as DocForm,
|
|
|
|
docLanguage,
|
|
|
|
dataSourceType: DataSourceType.NOTION,
|
|
|
|
notionPages,
|
|
|
|
indexingTechnique: getIndexing_technique() as any,
|
|
|
|
processRule: getProcessRule(),
|
|
|
|
dataset_id: datasetId || '',
|
|
|
|
})
|
|
|
|
|
|
|
|
const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({
|
|
|
|
docForm: docForm as DocForm,
|
|
|
|
docLanguage,
|
|
|
|
dataSourceType: DataSourceType.WEB,
|
|
|
|
websitePages,
|
|
|
|
crawlOptions,
|
|
|
|
websiteCrawlProvider,
|
|
|
|
websiteCrawlJobId,
|
|
|
|
indexingTechnique: getIndexing_technique() as any,
|
|
|
|
processRule: getProcessRule(),
|
|
|
|
dataset_id: datasetId || '',
|
|
|
|
})
|
|
|
|
|
|
|
|
const fetchEstimate = useCallback(() => {
|
|
|
|
if (dataSourceType === DataSourceType.FILE)
|
|
|
|
fileIndexingEstimateQuery.mutate()
|
|
|
|
|
|
|
|
if (dataSourceType === DataSourceType.NOTION)
|
|
|
|
notionIndexingEstimateQuery.mutate()
|
|
|
|
|
|
|
|
if (dataSourceType === DataSourceType.WEB)
|
|
|
|
websiteIndexingEstimateQuery.mutate()
|
|
|
|
}, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery])
|
|
|
|
|
|
|
|
const estimate
|
|
|
|
= dataSourceType === DataSourceType.FILE
|
|
|
|
? fileIndexingEstimateQuery.data
|
|
|
|
: dataSourceType === DataSourceType.NOTION
|
|
|
|
? notionIndexingEstimateQuery.data
|
|
|
|
: websiteIndexingEstimateQuery.data
|
|
|
|
|
2024-12-03 17:26:45 +08:00
|
|
|
// const getIsEstimateReady = useCallback(() => {
|
|
|
|
// if (dataSourceType === DataSourceType.FILE)
|
|
|
|
// return fileIndexingEstimateQuery.isSuccess
|
2024-12-03 15:23:51 +08:00
|
|
|
|
2024-12-03 17:26:45 +08:00
|
|
|
// if (dataSourceType === DataSourceType.NOTION)
|
|
|
|
// return notionIndexingEstimateQuery.isSuccess
|
2024-12-03 15:23:51 +08:00
|
|
|
|
2024-12-03 17:26:45 +08:00
|
|
|
// if (dataSourceType === DataSourceType.WEB)
|
|
|
|
// return websiteIndexingEstimateQuery.isSuccess
|
|
|
|
// }, [dataSourceType, fileIndexingEstimateQuery.isSuccess, notionIndexingEstimateQuery.isSuccess, websiteIndexingEstimateQuery.isSuccess])
|
2024-12-03 15:23:51 +08:00
|
|
|
|
2024-12-03 17:26:45 +08:00
|
|
|
// const getFileName = (name: string) => {
|
|
|
|
// const arr = name.split('.')
|
|
|
|
// return arr.slice(0, -1).join('.')
|
|
|
|
// }
|
2023-05-15 08:51:32 +08:00
|
|
|
|
|
|
|
/**
 * Maps a pre-processing rule id to its translated label.
 *
 * @param key - Rule id (`remove_extra_spaces`, `remove_urls_emails` or `remove_stopwords`).
 * @returns The translated label, or `undefined` for any other id.
 */
const getRuleName = (key: string) => {
  switch (key) {
    case 'remove_extra_spaces':
      return t('datasetCreation.stepTwo.removeExtraSpaces')
    case 'remove_urls_emails':
      return t('datasetCreation.stepTwo.removeUrlEmails')
    case 'remove_stopwords':
      return t('datasetCreation.stepTwo.removeStopwords')
    default:
      return undefined
  }
}
|
|
|
|
/**
 * Toggles the `enabled` flag of the pre-processing rule with the given id
 * and stores the updated list; other rules are kept as they are.
 *
 * @param id - Id of the rule to toggle.
 */
const ruleChangeHandle = (id: string) => {
  const toggled = rules.map(rule => (
    rule.id === id
      ? { id: rule.id, enabled: !rule.enabled }
      : rule
  ))
  setRules(toggled)
}
|
|
|
|
const resetRules = () => {
|
|
|
|
if (defaultConfig) {
|
2024-09-19 17:40:20 +08:00
|
|
|
setSegmentIdentifier(defaultConfig.segmentation.separator)
|
2023-05-15 08:51:32 +08:00
|
|
|
setMax(defaultConfig.segmentation.max_tokens)
|
2024-12-03 14:14:37 +08:00
|
|
|
setOverlap(defaultConfig.segmentation.chunk_overlap!)
|
2023-05-15 08:51:32 +08:00
|
|
|
setRules(defaultConfig.pre_processing_rules)
|
|
|
|
}
|
2024-11-26 15:37:57 +08:00
|
|
|
setParentChildConfig(defaultParentChildConfig)
|
2023-05-15 08:51:32 +08:00
|
|
|
}
|
|
|
|
|
2024-12-03 14:14:37 +08:00
|
|
|
const updatePreview = () => {
|
2024-10-25 15:02:36 +08:00
|
|
|
if (segmentationType === SegmentType.CUSTOM && max > 4000) {
|
|
|
|
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
|
|
|
|
return
|
|
|
|
}
|
2024-12-03 15:23:51 +08:00
|
|
|
fetchEstimate()
|
2024-12-03 17:26:45 +08:00
|
|
|
setQAPreviewSwitched(false)
|
2023-05-15 08:51:32 +08:00
|
|
|
}
|
|
|
|
|
2023-11-18 11:53:35 +08:00
|
|
|
const {
|
2024-01-02 23:42:00 +08:00
|
|
|
modelList: rerankModelList,
|
|
|
|
defaultModel: rerankDefaultModel,
|
2024-09-08 13:14:11 +08:00
|
|
|
currentModel: isRerankDefaultModelValid,
|
2024-04-04 15:54:59 +08:00
|
|
|
} = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
|
2024-09-04 14:41:47 +08:00
|
|
|
const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
|
|
|
|
const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
|
|
|
|
const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
|
|
|
|
currentDataset?.embedding_model
|
|
|
|
? {
|
|
|
|
provider: currentDataset.embedding_model_provider,
|
|
|
|
model: currentDataset.embedding_model,
|
|
|
|
}
|
|
|
|
: {
|
|
|
|
provider: defaultEmbeddingModel?.provider.provider || '',
|
|
|
|
model: defaultEmbeddingModel?.model || '',
|
|
|
|
},
|
|
|
|
)
|
2023-05-15 08:51:32 +08:00
|
|
|
const getCreationParams = () => {
|
2023-06-01 23:19:36 +08:00
|
|
|
let params
|
2024-01-26 13:24:40 +08:00
|
|
|
if (segmentationType === SegmentType.CUSTOM && overlap > max) {
|
|
|
|
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
|
|
|
|
return
|
|
|
|
}
|
2024-10-25 15:02:36 +08:00
|
|
|
if (segmentationType === SegmentType.CUSTOM && max > 4000) {
|
|
|
|
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
|
|
|
|
return
|
|
|
|
}
|
2023-06-01 23:19:36 +08:00
|
|
|
if (isSetting) {
|
|
|
|
params = {
|
|
|
|
original_document_id: documentDetail?.id,
|
2023-07-28 20:47:15 +08:00
|
|
|
doc_form: docForm,
|
2023-08-18 17:37:31 +08:00
|
|
|
doc_language: docLanguage,
|
2023-06-01 23:19:36 +08:00
|
|
|
process_rule: getProcessRule(),
|
2023-11-18 11:53:35 +08:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-use-before-define
|
|
|
|
retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
|
2024-09-04 14:41:47 +08:00
|
|
|
embedding_model: embeddingModel.model, // Readonly
|
|
|
|
embedding_model_provider: embeddingModel.provider, // Readonly
|
2023-06-01 23:19:36 +08:00
|
|
|
} as CreateDocumentReq
|
2023-06-06 10:52:02 +08:00
|
|
|
}
|
2023-11-18 11:53:35 +08:00
|
|
|
else { // create
|
|
|
|
const indexMethod = getIndexing_technique()
|
|
|
|
if (
|
|
|
|
!isReRankModelSelected({
|
|
|
|
rerankDefaultModel,
|
2024-09-08 13:14:11 +08:00
|
|
|
isRerankDefaultModelValid: !!isRerankDefaultModelValid,
|
2023-11-21 13:46:07 +08:00
|
|
|
rerankModelList,
|
2023-11-18 11:53:35 +08:00
|
|
|
// eslint-disable-next-line @typescript-eslint/no-use-before-define
|
|
|
|
retrievalConfig,
|
|
|
|
indexMethod: indexMethod as string,
|
|
|
|
})
|
|
|
|
) {
|
|
|
|
Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
|
|
|
|
return
|
|
|
|
}
|
|
|
|
const postRetrievalConfig = ensureRerankModelSelected({
|
|
|
|
rerankDefaultModel: rerankDefaultModel!,
|
|
|
|
// eslint-disable-next-line @typescript-eslint/no-use-before-define
|
|
|
|
retrievalConfig,
|
|
|
|
indexMethod: indexMethod as string,
|
|
|
|
})
|
2023-06-01 23:19:36 +08:00
|
|
|
params = {
|
|
|
|
data_source: {
|
2023-06-16 21:47:51 +08:00
|
|
|
type: dataSourceType,
|
|
|
|
info_list: {
|
|
|
|
data_source_type: dataSourceType,
|
|
|
|
},
|
2023-06-01 23:19:36 +08:00
|
|
|
},
|
|
|
|
indexing_technique: getIndexing_technique(),
|
|
|
|
process_rule: getProcessRule(),
|
2023-07-28 20:47:15 +08:00
|
|
|
doc_form: docForm,
|
2023-08-18 17:37:31 +08:00
|
|
|
doc_language: docLanguage,
|
2023-11-18 11:53:35 +08:00
|
|
|
|
|
|
|
retrieval_model: postRetrievalConfig,
|
2024-09-04 14:41:47 +08:00
|
|
|
embedding_model: embeddingModel.model,
|
|
|
|
embedding_model_provider: embeddingModel.provider,
|
2023-06-01 23:19:36 +08:00
|
|
|
} as CreateDocumentReq
|
2023-06-16 21:47:51 +08:00
|
|
|
if (dataSourceType === DataSourceType.FILE) {
|
|
|
|
params.data_source.info_list.file_info_list = {
|
2023-09-24 14:35:20 +08:00
|
|
|
file_ids: files.map(file => file.id || '').filter(Boolean),
|
2023-06-16 21:47:51 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (dataSourceType === DataSourceType.NOTION)
|
2024-12-03 15:23:51 +08:00
|
|
|
params.data_source.info_list.notion_info_list = getNotionInfo(notionPages)
|
2024-06-14 22:02:41 +08:00
|
|
|
|
2024-12-03 15:23:51 +08:00
|
|
|
if (dataSourceType === DataSourceType.WEB) {
|
|
|
|
params.data_source.info_list.website_info_list = getWebsiteInfo({
|
|
|
|
websiteCrawlProvider,
|
|
|
|
websiteCrawlJobId,
|
|
|
|
websitePages,
|
|
|
|
})
|
|
|
|
}
|
2023-06-01 23:19:36 +08:00
|
|
|
}
|
2023-05-15 08:51:32 +08:00
|
|
|
return params
|
|
|
|
}
|
|
|
|
|
2024-12-03 17:26:45 +08:00
|
|
|
const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({
|
|
|
|
onSuccess(data) {
|
|
|
|
const separator = data.rules.segmentation.separator
|
2024-09-19 17:40:20 +08:00
|
|
|
setSegmentIdentifier(separator)
|
2024-12-03 17:26:45 +08:00
|
|
|
setMax(data.rules.segmentation.max_tokens)
|
|
|
|
setOverlap(data.rules.segmentation.chunk_overlap!)
|
|
|
|
setRules(data.rules.pre_processing_rules)
|
|
|
|
setDefaultConfig(data.rules)
|
|
|
|
},
|
|
|
|
onError(error) {
|
|
|
|
Toast.notify({
|
|
|
|
type: 'error',
|
|
|
|
message: `${error}`,
|
|
|
|
})
|
|
|
|
},
|
|
|
|
})
|
2023-06-01 23:19:36 +08:00
|
|
|
|
|
|
|
const getRulesFromDetail = () => {
|
|
|
|
if (documentDetail) {
|
|
|
|
const rules = documentDetail.dataset_process_rule.rules
|
|
|
|
const separator = rules.segmentation.separator
|
|
|
|
const max = rules.segmentation.max_tokens
|
2024-01-26 13:24:40 +08:00
|
|
|
const overlap = rules.segmentation.chunk_overlap
|
2024-09-19 17:40:20 +08:00
|
|
|
setSegmentIdentifier(separator)
|
2023-06-01 23:19:36 +08:00
|
|
|
setMax(max)
|
2024-12-03 17:26:45 +08:00
|
|
|
setOverlap(overlap as number)
|
2023-06-01 23:19:36 +08:00
|
|
|
setRules(rules.pre_processing_rules)
|
|
|
|
setDefaultConfig(rules)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
const getDefaultMode = () => {
|
2023-06-06 10:52:02 +08:00
|
|
|
if (documentDetail)
|
2023-06-01 23:19:36 +08:00
|
|
|
setSegmentationType(documentDetail.dataset_process_rule.mode)
|
|
|
|
}
|
|
|
|
|
2024-12-03 17:26:45 +08:00
|
|
|
const createFirstDocumentMutation = useCreateFirstDocument({
|
|
|
|
onError(error) {
|
|
|
|
Toast.notify({
|
|
|
|
type: 'error',
|
|
|
|
message: `${error}`,
|
|
|
|
})
|
|
|
|
},
|
|
|
|
})
|
|
|
|
const createDocumentMutation = useCreateDocument(datasetId!, {
|
|
|
|
onError(error) {
|
2023-05-15 08:51:32 +08:00
|
|
|
Toast.notify({
|
|
|
|
type: 'error',
|
2024-12-03 17:26:45 +08:00
|
|
|
message: `${error}`,
|
2023-05-15 08:51:32 +08:00
|
|
|
})
|
2024-12-03 17:26:45 +08:00
|
|
|
},
|
|
|
|
})
|
|
|
|
|
|
|
|
const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending
|
|
|
|
|
|
|
|
/**
 * Validates the form, submits the creation request and, on success,
 * refreshes the dataset and moves on.
 *
 * With no `datasetId` the "create first document" mutation is used, otherwise
 * the per-dataset mutation. On success the indexing type and result caches
 * are updated (plus the retrieval method cache for a first document).
 *
 * @returns `false` when validation fails or the request is rejected;
 *          otherwise resolves with no value.
 */
const createHandle = async () => {
  const params = getCreationParams()
  if (!params)
    return false

  try {
    if (!datasetId) {
      await createFirstDocumentMutation.mutateAsync(
        params,
        {
          onSuccess(data) {
            updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
            updateResultCache && updateResultCache(data)
            // eslint-disable-next-line @typescript-eslint/no-use-before-define
            updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string)
          },
        },
      )
    }
    else {
      await createDocumentMutation.mutateAsync(params, {
        onSuccess(data) {
          updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
          updateResultCache && updateResultCache(data)
        },
      })
    }
  }
  catch {
    // `mutateAsync` rejects on failure. The mutations' own `onError` handlers
    // already show the error toast, so only stop here instead of letting the
    // rejection escape from this handler unhandled.
    return false
  }

  if (mutateDatasetRes)
    mutateDatasetRes()
  onStepChange && onStepChange(+1)
  isSetting && onSave && onSave()
}
|
|
|
|
|
2024-12-03 14:14:37 +08:00
|
|
|
/**
 * Sets the document form from the Q&A toggle.
 *
 * @param isQAMode - `true` selects `DocForm.QA`, `false` selects `DocForm.TEXT`.
 */
const handleDocformSwitch = (isQAMode: boolean) => {
  setDocForm(isQAMode ? DocForm.QA : DocForm.TEXT)
}
|
|
|
|
|
2024-12-03 17:26:45 +08:00
|
|
|
const previewSwitch = () => {
|
|
|
|
setQAPreviewSwitched(true)
|
2024-10-14 13:32:13 +08:00
|
|
|
setIsLanguageSelectDisabled(true)
|
2024-12-03 15:23:51 +08:00
|
|
|
fetchEstimate()
|
2024-10-14 13:32:13 +08:00
|
|
|
}
|
|
|
|
|
2023-08-18 17:37:31 +08:00
|
|
|
const handleSelect = (language: string) => {
|
|
|
|
setDocLanguage(language)
|
2024-10-14 13:32:13 +08:00
|
|
|
// Switch language, re-cutter
|
2024-12-03 17:26:45 +08:00
|
|
|
if (docForm === DocForm.QA && qaPreviewSwitched)
|
|
|
|
previewSwitch()
|
2023-08-18 17:37:31 +08:00
|
|
|
}
|
|
|
|
|
2023-07-28 20:47:15 +08:00
|
|
|
const changeToEconomicalType = () => {
|
|
|
|
if (!hasSetIndexType) {
|
|
|
|
setIndexType(IndexingType.ECONOMICAL)
|
|
|
|
setDocForm(DocForm.TEXT)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-05-15 08:51:32 +08:00
|
|
|
useEffect(() => {
|
|
|
|
// fetch rules
|
2023-06-01 23:19:36 +08:00
|
|
|
if (!isSetting) {
|
2024-12-03 17:26:45 +08:00
|
|
|
fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule')
|
2023-06-06 10:52:02 +08:00
|
|
|
}
|
|
|
|
else {
|
2023-06-01 23:19:36 +08:00
|
|
|
getRulesFromDetail()
|
|
|
|
getDefaultMode()
|
|
|
|
}
|
2023-05-15 08:51:32 +08:00
|
|
|
}, [])
|
|
|
|
|
2023-07-28 20:47:15 +08:00
|
|
|
useEffect(() => {
|
|
|
|
if (indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA)
|
|
|
|
setDocForm(DocForm.TEXT)
|
|
|
|
}, [indexingType, docForm])
|
|
|
|
|
2023-05-15 08:51:32 +08:00
|
|
|
useEffect(() => {
|
|
|
|
// get indexing type by props
|
2023-06-06 10:52:02 +08:00
|
|
|
if (indexingType)
|
2023-05-15 08:51:32 +08:00
|
|
|
setIndexType(indexingType as IndexingType)
|
2023-06-06 10:52:02 +08:00
|
|
|
|
|
|
|
else
|
2024-06-05 00:13:29 +08:00
|
|
|
setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
|
|
|
|
}, [isAPIKeySet, indexingType, datasetId])
|
2023-05-15 08:51:32 +08:00
|
|
|
|
2023-11-18 11:53:35 +08:00
|
|
|
/**
 * Builds the retrieval configuration used when the current dataset does not
 * provide a `retrieval_model_dict`: semantic search, reranking disabled, the
 * default rerank model (when known), top_k 3 and a disabled score threshold of 0.5.
 */
const buildFallbackRetrievalConfig = () => ({
  search_method: RETRIEVE_METHOD.semantic,
  reranking_enable: false,
  reranking_model: {
    reranking_provider_name: rerankDefaultModel?.provider.provider,
    reranking_model_name: rerankDefaultModel?.model,
  },
  top_k: 3,
  score_threshold_enabled: false,
  score_threshold: 0.5,
} as RetrievalConfig)

// The fallback is built only when the dataset has no retrieval model of its own.
const [retrievalConfig, setRetrievalConfig] = useState(
  currentDataset?.retrieval_model_dict || buildFallbackRetrievalConfig(),
)
|
|
|
|
|
2023-05-15 08:51:32 +08:00
|
|
|
return (
|
2024-11-22 13:12:24 +08:00
|
|
|
<div className='flex w-full max-h-full h-full overflow-y-auto'>
|
2024-11-21 16:19:32 +08:00
|
|
|
<div className='relative h-full w-full overflow-y-scroll'>
|
2023-11-27 11:47:48 +08:00
|
|
|
<div className={cn(s.form, isMobile && '!px-4')}>
|
2023-05-15 08:51:32 +08:00
|
|
|
<div className={s.label}>{t('datasetCreation.stepTwo.segmentation')}</div>
|
|
|
|
<div className='max-w-[640px]'>
|
2024-11-20 10:13:29 +08:00
|
|
|
<div className='space-y-4'>
|
|
|
|
<OptionCard
|
2024-11-25 17:57:31 +08:00
|
|
|
title={t('datasetCreation.stepTwo.general')}
|
|
|
|
icon={<Image src={SettingCog} alt={t('datasetCreation.stepTwo.general')} />}
|
2024-11-20 14:55:59 +08:00
|
|
|
activeHeaderClassName='bg-gradient-to-r from-[#EFF0F9] to-[#F9FAFB]'
|
2024-11-25 17:57:31 +08:00
|
|
|
description={t('datasetCreation.stepTwo.generalTip')}
|
2024-11-20 10:13:29 +08:00
|
|
|
isActive={SegmentType.AUTO === segmentationType}
|
|
|
|
onClick={() => setSegmentationType(SegmentType.AUTO)}
|
|
|
|
actions={
|
|
|
|
<>
|
2024-12-03 14:14:37 +08:00
|
|
|
<Button variant={'secondary-accent'} onClick={() => updatePreview()}>
|
2024-11-20 14:55:59 +08:00
|
|
|
<RiSearchEyeLine className='h-4 w-4 mr-1.5' />
|
2024-11-25 17:57:31 +08:00
|
|
|
{t('datasetCreation.stepTwo.previewChunk')}
|
|
|
|
</Button>
|
2024-11-26 15:37:57 +08:00
|
|
|
<Button variant={'ghost'} onClick={resetRules}>
|
2024-11-25 17:57:31 +08:00
|
|
|
{t('datasetCreation.stepTwo.reset')}
|
2024-11-20 10:13:29 +08:00
|
|
|
</Button>
|
|
|
|
</>
|
|
|
|
}
|
|
|
|
>
|
|
|
|
<div className='space-y-4'>
|
2024-11-26 14:48:49 +08:00
|
|
|
<div className='flex gap-3'>
|
2024-11-21 11:40:17 +08:00
|
|
|
<DelimiterInput
|
|
|
|
value={segmentIdentifier}
|
|
|
|
onChange={e => setSegmentIdentifier(e.target.value)}
|
|
|
|
/>
|
|
|
|
<MaxLengthInput
|
2024-11-26 15:29:31 +08:00
|
|
|
value={max}
|
2024-11-25 17:57:31 +08:00
|
|
|
onChange={setMax}
|
2024-11-21 11:40:17 +08:00
|
|
|
/>
|
|
|
|
<OverlapInput
|
2024-11-26 15:29:31 +08:00
|
|
|
value={overlap}
|
2024-11-21 11:40:17 +08:00
|
|
|
min={1}
|
2024-11-25 17:57:31 +08:00
|
|
|
onChange={setOverlap}
|
2024-11-21 11:40:17 +08:00
|
|
|
/>
|
2024-01-26 13:24:40 +08:00
|
|
|
</div>
|
2024-11-20 10:13:29 +08:00
|
|
|
<div className='space-y-2'>
|
|
|
|
<div className='w-full flex flex-col'>
|
2024-11-20 14:55:59 +08:00
|
|
|
<TextLabel>{t('datasetCreation.stepTwo.rules')}</TextLabel>
|
2024-11-20 10:13:29 +08:00
|
|
|
<div className='mt-4 space-y-2'>
|
|
|
|
{rules.map(rule => (
|
|
|
|
<div key={rule.id} className={s.ruleItem} onClick={() => {
|
|
|
|
ruleChangeHandle(rule.id)
|
|
|
|
}}>
|
|
|
|
<Checkbox
|
|
|
|
checked={rule.enabled}
|
|
|
|
/>
|
|
|
|
<label className="ml-2 text-sm font-normal cursor-pointer text-gray-800">{getRuleName(rule.id)}</label>
|
|
|
|
</div>
|
|
|
|
))}
|
|
|
|
</div>
|
2023-05-15 08:51:32 +08:00
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
</div>
|
2024-11-20 10:13:29 +08:00
|
|
|
</OptionCard>
|
|
|
|
<OptionCard
|
2024-11-25 17:57:31 +08:00
|
|
|
title={t('datasetCreation.stepTwo.parentChild')}
|
|
|
|
icon={<Image src={FamilyMod} alt={t('datasetCreation.stepTwo.parentChild')} />}
|
2024-11-20 14:55:59 +08:00
|
|
|
effectImg={OrangeEffect.src}
|
|
|
|
activeHeaderClassName='bg-gradient-to-r from-[#F9F1EE] to-[#F9FAFB]'
|
2024-11-25 17:57:31 +08:00
|
|
|
description={t('datasetCreation.stepTwo.parentChildTip')}
|
2024-11-20 10:13:29 +08:00
|
|
|
isActive={SegmentType.CUSTOM === segmentationType}
|
|
|
|
onClick={() => setSegmentationType(SegmentType.CUSTOM)}
|
2024-11-20 14:55:59 +08:00
|
|
|
actions={
|
|
|
|
<>
|
2024-12-03 14:14:37 +08:00
|
|
|
<Button variant={'secondary-accent'} onClick={() => updatePreview()}>
|
2024-11-20 14:55:59 +08:00
|
|
|
<RiSearchEyeLine className='h-4 w-4 mr-1.5' />
|
2024-11-25 17:57:31 +08:00
|
|
|
{t('datasetCreation.stepTwo.previewChunk')}
|
|
|
|
</Button>
|
|
|
|
<Button variant={'ghost'} onClick={resetRules}>
|
|
|
|
{t('datasetCreation.stepTwo.reset')}
|
2024-11-20 14:55:59 +08:00
|
|
|
</Button>
|
|
|
|
</>
|
|
|
|
}
|
2024-11-20 10:13:29 +08:00
|
|
|
>
|
|
|
|
<div className='space-y-4'>
|
2024-11-21 11:40:17 +08:00
|
|
|
<div className='space-y-2'>
|
|
|
|
<TextLabel>
|
2024-11-25 17:57:31 +08:00
|
|
|
{t('datasetCreation.stepTwo.parentChunkForContext')}
|
2024-11-21 11:40:17 +08:00
|
|
|
</TextLabel>
|
|
|
|
<RadioCard
|
|
|
|
icon={<Image src={Note} alt='' />}
|
2024-11-25 17:57:31 +08:00
|
|
|
title={t('datasetCreation.stepTwo.paragraph')}
|
|
|
|
description={t('datasetCreation.stepTwo.paragraphTip')}
|
2024-11-21 11:40:17 +08:00
|
|
|
isChosen={parentChildConfig.chunkForContext === 'paragraph'}
|
|
|
|
onChosen={() => setParentChildConfig(
|
|
|
|
{
|
|
|
|
...parentChildConfig,
|
|
|
|
chunkForContext: 'paragraph',
|
|
|
|
},
|
|
|
|
)}
|
|
|
|
chosenConfig={
|
|
|
|
<div className='flex gap-2'>
|
|
|
|
<DelimiterInput
|
|
|
|
value={parentChildConfig.parent.delimiter}
|
|
|
|
onChange={e => setParentChildConfig({
|
|
|
|
...parentChildConfig,
|
|
|
|
parent: {
|
|
|
|
...parentChildConfig.parent,
|
|
|
|
delimiter: e.target.value,
|
|
|
|
},
|
|
|
|
})}
|
|
|
|
/>
|
|
|
|
<MaxLengthInput
|
2024-11-26 15:29:31 +08:00
|
|
|
value={parentChildConfig.parent.maxLength}
|
2024-11-25 17:57:31 +08:00
|
|
|
onChange={value => setParentChildConfig({
|
2024-11-21 11:40:17 +08:00
|
|
|
...parentChildConfig,
|
|
|
|
parent: {
|
|
|
|
...parentChildConfig.parent,
|
2024-11-25 17:57:31 +08:00
|
|
|
maxLength: value,
|
2024-11-21 11:40:17 +08:00
|
|
|
},
|
|
|
|
})}
|
|
|
|
/>
|
|
|
|
</div>
|
|
|
|
}
|
|
|
|
/>
|
|
|
|
<RadioCard
|
|
|
|
icon={<Image src={FileList} alt='' />}
|
2024-11-25 17:57:31 +08:00
|
|
|
title={t('datasetCreation.stepTwo.fullDoc')}
|
|
|
|
description={t('datasetCreation.stepTwo.fullDocTip')}
|
2024-11-21 11:40:17 +08:00
|
|
|
onChosen={() => setParentChildConfig(
|
|
|
|
{
|
|
|
|
...parentChildConfig,
|
|
|
|
chunkForContext: 'full_doc',
|
|
|
|
},
|
|
|
|
)}
|
|
|
|
isChosen={parentChildConfig.chunkForContext === 'full_doc'}
|
|
|
|
/>
|
2024-11-20 14:55:59 +08:00
|
|
|
</div>
|
|
|
|
|
2024-11-26 14:48:49 +08:00
|
|
|
<div className='space-y-4'>
|
2024-11-21 11:40:17 +08:00
|
|
|
<TextLabel>
|
2024-11-25 17:57:31 +08:00
|
|
|
{t('datasetCreation.stepTwo.childChunkForRetrieval')}
|
2024-11-21 11:40:17 +08:00
|
|
|
</TextLabel>
|
2024-11-26 14:48:49 +08:00
|
|
|
<div className='flex gap-3 mt-2'>
|
2024-11-21 11:40:17 +08:00
|
|
|
<DelimiterInput
|
|
|
|
value={parentChildConfig.child.delimiter}
|
|
|
|
onChange={e => setParentChildConfig({
|
|
|
|
...parentChildConfig,
|
|
|
|
child: {
|
|
|
|
...parentChildConfig.child,
|
|
|
|
delimiter: e.target.value,
|
|
|
|
},
|
|
|
|
})}
|
|
|
|
/>
|
|
|
|
<MaxLengthInput
|
2024-11-26 15:29:31 +08:00
|
|
|
value={parentChildConfig.child.maxLength}
|
2024-11-25 17:57:31 +08:00
|
|
|
onChange={value => setParentChildConfig({
|
2024-11-21 11:40:17 +08:00
|
|
|
...parentChildConfig,
|
|
|
|
child: {
|
|
|
|
...parentChildConfig.child,
|
2024-11-25 17:57:31 +08:00
|
|
|
maxLength: value,
|
2024-11-21 11:40:17 +08:00
|
|
|
},
|
|
|
|
})}
|
|
|
|
/>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
<div className='space-y-2'>
|
2024-11-26 14:48:49 +08:00
|
|
|
<TextLabel>
|
|
|
|
{t('datasetCreation.stepTwo.rules')}
|
|
|
|
</TextLabel>
|
|
|
|
<div className='space-y-2 mt-2'>
|
|
|
|
{rules.map(rule => (
|
|
|
|
<div key={rule.id} className={s.ruleItem} onClick={() => {
|
|
|
|
ruleChangeHandle(rule.id)
|
|
|
|
}}>
|
|
|
|
<Checkbox
|
|
|
|
checked={rule.enabled}
|
|
|
|
/>
|
|
|
|
<label className="ml-2 text-sm font-normal cursor-pointer text-gray-800">{getRuleName(rule.id)}</label>
|
|
|
|
</div>
|
|
|
|
))}
|
|
|
|
</div>
|
2024-11-21 11:40:17 +08:00
|
|
|
</div>
|
2024-11-20 14:55:59 +08:00
|
|
|
</div>
|
2024-11-20 10:13:29 +08:00
|
|
|
</div>
|
|
|
|
</OptionCard>
|
2023-05-15 08:51:32 +08:00
|
|
|
</div>
|
|
|
|
</div>
|
2024-11-26 17:22:02 +08:00
|
|
|
<Divider className='my-5' />
|
2023-05-15 08:51:32 +08:00
|
|
|
<div className={s.label}>{t('datasetCreation.stepTwo.indexMode')}</div>
|
|
|
|
<div className='max-w-[640px]'>
|
2023-11-27 11:47:48 +08:00
|
|
|
<div className='flex items-center gap-3 flex-wrap sm:flex-nowrap'>
|
2023-05-15 08:51:32 +08:00
|
|
|
{(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.QUALIFIED)) && (
|
|
|
|
<div
|
|
|
|
className={cn(
|
|
|
|
s.radioItem,
|
|
|
|
s.indexItem,
|
2024-06-05 00:13:29 +08:00
|
|
|
!isAPIKeySet && s.disabled,
|
2023-05-15 08:51:32 +08:00
|
|
|
!hasSetIndexType && indexType === IndexingType.QUALIFIED && s.active,
|
|
|
|
hasSetIndexType && s.disabled,
|
2024-09-04 14:41:47 +08:00
|
|
|
hasSetIndexType && '!w-full !min-h-[96px]',
|
2023-05-15 08:51:32 +08:00
|
|
|
)}
|
|
|
|
onClick={() => {
|
2024-06-05 00:13:29 +08:00
|
|
|
if (isAPIKeySet)
|
2023-05-15 08:51:32 +08:00
|
|
|
setIndexType(IndexingType.QUALIFIED)
|
|
|
|
}}
|
|
|
|
>
|
2024-11-26 14:18:42 +08:00
|
|
|
<div className='h-8 p-1.5 bg-white rounded-lg border border-components-panel-border-subtle justify-center items-center inline-flex absolute left-5 top-[18px]'>
|
2024-11-25 17:57:31 +08:00
|
|
|
<Image src={indexMethodIcon.high_quality} alt='Gold Icon' width={20} height={20} />
|
2024-11-20 14:55:59 +08:00
|
|
|
</div>
|
2023-05-15 08:51:32 +08:00
|
|
|
{!hasSetIndexType && <span className={cn(s.radio)} />}
|
|
|
|
<div className={s.typeHeader}>
|
|
|
|
<div className={s.title}>
|
|
|
|
{t('datasetCreation.stepTwo.qualified')}
|
|
|
|
{!hasSetIndexType && <span className={s.recommendTag}>{t('datasetCreation.stepTwo.recommend')}</span>}
|
|
|
|
</div>
|
|
|
|
<div className={s.tip}>{t('datasetCreation.stepTwo.qualifiedTip')}</div>
|
|
|
|
</div>
|
2024-06-05 00:13:29 +08:00
|
|
|
{!isAPIKeySet && (
|
2023-05-15 08:51:32 +08:00
|
|
|
<div className={s.warningTip}>
|
|
|
|
<span>{t('datasetCreation.stepTwo.warning')} </span>
|
|
|
|
<span className={s.click} onClick={onSetting}>{t('datasetCreation.stepTwo.click')}</span>
|
|
|
|
</div>
|
|
|
|
)}
|
|
|
|
</div>
|
|
|
|
)}
|
|
|
|
|
|
|
|
{(!hasSetIndexType || (hasSetIndexType && indexingType === IndexingType.ECONOMICAL)) && (
|
|
|
|
<div
|
|
|
|
className={cn(
|
|
|
|
s.radioItem,
|
|
|
|
s.indexItem,
|
|
|
|
!hasSetIndexType && indexType === IndexingType.ECONOMICAL && s.active,
|
|
|
|
hasSetIndexType && s.disabled,
|
2024-09-04 14:41:47 +08:00
|
|
|
hasSetIndexType && '!w-full !min-h-[96px]',
|
2023-05-15 08:51:32 +08:00
|
|
|
)}
|
2023-07-28 20:47:15 +08:00
|
|
|
onClick={changeToEconomicalType}
|
2023-05-15 08:51:32 +08:00
|
|
|
>
|
2024-11-26 14:18:42 +08:00
|
|
|
<div className='h-8 p-1.5 bg-white rounded-lg border border-components-panel-border-subtle justify-center items-center inline-flex absolute left-5 top-[18px]'>
|
2024-11-25 17:57:31 +08:00
|
|
|
<Image src={indexMethodIcon.economical} alt='Economical Icon' width={20} height={20} />
|
2024-11-20 14:55:59 +08:00
|
|
|
</div>
|
2023-05-15 08:51:32 +08:00
|
|
|
{!hasSetIndexType && <span className={cn(s.radio)} />}
|
|
|
|
<div className={s.typeHeader}>
|
|
|
|
<div className={s.title}>{t('datasetCreation.stepTwo.economical')}</div>
|
|
|
|
<div className={s.tip}>{t('datasetCreation.stepTwo.economicalTip')}</div>
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
)}
|
|
|
|
</div>
|
2024-09-04 14:41:47 +08:00
|
|
|
{hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
|
2023-05-15 08:51:32 +08:00
|
|
|
<div className='mt-2 text-xs text-gray-500 font-medium'>
|
2024-09-07 16:59:38 +08:00
|
|
|
{t('datasetCreation.stepTwo.indexSettingTip')}
|
2024-11-26 14:18:42 +08:00
|
|
|
<Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
|
2023-05-15 08:51:32 +08:00
|
|
|
</div>
|
|
|
|
)}
|
2024-11-20 15:25:26 +08:00
|
|
|
{IS_CE_EDITION && indexType === IndexingType.QUALIFIED && (
|
2024-11-26 17:22:02 +08:00
|
|
|
<div className='mt-2 rounded-xl bg-gray-50 border border-gray-100'>
|
2024-11-20 15:25:26 +08:00
|
|
|
<div className='flex justify-between items-center px-5 py-4'>
|
|
|
|
<div className='flex justify-center items-center w-8 h-8 rounded-lg bg-indigo-50'>
|
|
|
|
<MessageChatSquare className='w-4 h-4' />
|
|
|
|
</div>
|
|
|
|
<div className='grow mx-3'>
|
2024-11-21 16:19:32 +08:00
|
|
|
<div className='mb-0.5 text-md font-medium text-gray-900'>{t('datasetCreation.stepTwo.QATitle')}</div>
|
2024-11-20 15:25:26 +08:00
|
|
|
<div className='inline-flex items-center text-[13px] leading-[18px] text-gray-500'>
|
|
|
|
<span className='pr-1'>{t('datasetCreation.stepTwo.QALanguage')}</span>
|
|
|
|
<LanguageSelect currentLanguage={docLanguage} onSelect={handleSelect} disabled={isLanguageSelectDisabled} />
|
|
|
|
</div>
|
|
|
|
</div>
|
2024-11-21 11:40:17 +08:00
|
|
|
<Switch
|
|
|
|
defaultValue={docForm === DocForm.QA}
|
2024-12-03 14:14:37 +08:00
|
|
|
onChange={handleDocformSwitch}
|
2024-11-21 11:40:17 +08:00
|
|
|
size='md'
|
|
|
|
/>
|
2024-11-20 15:25:26 +08:00
|
|
|
</div>
|
|
|
|
{docForm === DocForm.QA && !QATipHide && (
|
|
|
|
<div className='flex justify-between items-center px-5 py-2 bg-orange-50 border-t border-amber-100 rounded-b-xl text-[13px] leading-[18px] text-medium text-amber-500'>
|
|
|
|
{t('datasetCreation.stepTwo.QATip')}
|
|
|
|
<RiCloseLine className='w-4 h-4 text-gray-500 cursor-pointer' onClick={() => setQATipHide(true)} />
|
|
|
|
</div>
|
|
|
|
)}
|
|
|
|
</div>
|
|
|
|
)}
|
2024-09-04 14:41:47 +08:00
|
|
|
{/* Embedding model */}
|
|
|
|
{indexType === IndexingType.QUALIFIED && (
|
2024-11-26 17:22:02 +08:00
|
|
|
<div className='mt-6 my-2'>
|
2024-09-04 14:41:47 +08:00
|
|
|
<div className={cn(s.label, datasetId && 'flex justify-between items-center')}>{t('datasetSettings.form.embeddingModel')}</div>
|
|
|
|
<ModelSelector
|
|
|
|
readonly={!!datasetId}
|
|
|
|
defaultModel={embeddingModel}
|
|
|
|
modelList={embeddingModelList}
|
|
|
|
onSelect={(model: DefaultModel) => {
|
|
|
|
setEmbeddingModel(model)
|
|
|
|
}}
|
|
|
|
/>
|
|
|
|
{!!datasetId && (
|
|
|
|
<div className='mt-2 text-xs text-gray-500 font-medium'>
|
2024-09-07 16:59:38 +08:00
|
|
|
{t('datasetCreation.stepTwo.indexSettingTip')}
|
2024-11-26 14:18:42 +08:00
|
|
|
<Link className='text-text-accent' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
|
2024-09-04 14:41:47 +08:00
|
|
|
</div>
|
|
|
|
)}
|
|
|
|
</div>
|
|
|
|
)}
|
2024-11-26 17:22:02 +08:00
|
|
|
<Divider className='my-5' />
|
2023-11-18 11:53:35 +08:00
|
|
|
{/* Retrieval Method Config */}
|
|
|
|
<div>
|
|
|
|
{!datasetId
|
|
|
|
? (
|
|
|
|
<div className={s.label}>
|
2024-09-04 14:41:47 +08:00
|
|
|
<div className='shrink-0 mr-4'>{t('datasetSettings.form.retrievalSetting.title')}</div>
|
2023-11-18 11:53:35 +08:00
|
|
|
<div className='leading-[18px] text-xs font-normal text-gray-500'>
|
2024-11-26 14:18:42 +08:00
|
|
|
<a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-4-retrieval-settings' className='text-text-accent'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
|
2023-11-18 11:53:35 +08:00
|
|
|
{t('datasetSettings.form.retrievalSetting.longDescription')}
|
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
)
|
|
|
|
: (
|
|
|
|
<div className={cn(s.label, 'flex justify-between items-center')}>
|
|
|
|
<div>{t('datasetSettings.form.retrievalSetting.title')}</div>
|
|
|
|
</div>
|
|
|
|
)}
|
|
|
|
|
|
|
|
<div className='max-w-[640px]'>
|
2024-08-27 11:25:27 +08:00
|
|
|
{
|
|
|
|
getIndexing_technique() === IndexingType.QUALIFIED
|
|
|
|
? (
|
|
|
|
<RetrievalMethodConfig
|
2023-11-18 11:53:35 +08:00
|
|
|
value={retrievalConfig}
|
2024-08-27 11:25:27 +08:00
|
|
|
onChange={setRetrievalConfig}
|
2023-11-18 11:53:35 +08:00
|
|
|
/>
|
2024-08-27 11:25:27 +08:00
|
|
|
)
|
|
|
|
: (
|
|
|
|
<EconomicalRetrievalMethodConfig
|
|
|
|
value={retrievalConfig}
|
|
|
|
onChange={setRetrievalConfig}
|
|
|
|
/>
|
|
|
|
)
|
|
|
|
}
|
2023-11-18 11:53:35 +08:00
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
|
2023-06-06 10:52:02 +08:00
|
|
|
{!isSetting
|
|
|
|
? (
|
|
|
|
<div className='flex items-center mt-8 py-2'>
|
2024-11-20 16:24:06 +08:00
|
|
|
<Button onClick={() => onStepChange && onStepChange(-1)}>
|
|
|
|
<RiArrowLeftLine className='w-4 h-4 mr-1' />
|
|
|
|
{t('datasetCreation.stepTwo.previousStep')}
|
|
|
|
</Button>
|
|
|
|
<Button className='ml-auto' loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.nextStep')}</Button>
|
2023-06-06 10:52:02 +08:00
|
|
|
</div>
|
|
|
|
)
|
|
|
|
: (
|
|
|
|
<div className='flex items-center mt-8 py-2'>
|
2024-06-19 14:13:16 +08:00
|
|
|
<Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
|
2023-06-06 10:52:02 +08:00
|
|
|
<Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
|
|
|
|
</div>
|
|
|
|
)}
|
2023-05-15 08:51:32 +08:00
|
|
|
</div>
|
|
|
|
</div>
|
|
|
|
</div>
|
2024-11-28 10:36:24 +08:00
|
|
|
<FloatRightContainer isMobile={isMobile} isOpen={true} onClose={() => { }} footer={null}>
|
2024-12-04 11:52:05 +08:00
|
|
|
<PreviewContainer
|
|
|
|
header={<PreviewHeader
|
|
|
|
title='Preview'
|
|
|
|
>
|
|
|
|
</PreviewHeader>}
|
|
|
|
className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll space-y-4')}
|
2024-11-28 10:36:24 +08:00
|
|
|
>
|
2024-12-04 11:52:05 +08:00
|
|
|
{/* QA pair preview. The index is part of the key so repeated questions cannot produce duplicate sibling keys. */}
{qaPreviewSwitched && docForm === DocForm.QA && estimate?.qa_preview && (
  estimate?.qa_preview.map((item, index) => (
    <QAPreview key={`${index}-${item.question}`} qa={item} />
  ))
)}
{/* Plain-text chunk preview. Chunks with identical text would share a key if the text alone were used. */}
{(docForm === DocForm.TEXT || !qaPreviewSwitched) && estimate?.preview && (
  estimate?.preview.map((item, index) => (
    <ChunkContainer
      key={`${index}-${item}`}
      label={`Chunk-${index + 1}`}
      characterCount={item.length}
    >
      {item}
    </ChunkContainer>
  ))
)}
|
|
|
|
{qaPreviewSwitched && docForm === DocForm.QA && !estimate?.qa_preview && (
|
|
|
|
<div className='flex items-center justify-center h-[200px]'>
|
|
|
|
<Loading type='area' />
|
2023-05-15 08:51:32 +08:00
|
|
|
</div>
|
2024-12-04 11:52:05 +08:00
|
|
|
)}
|
|
|
|
{!qaPreviewSwitched && !estimate?.preview && (
|
|
|
|
<div className='flex items-center justify-center h-[200px]'>
|
|
|
|
<Loading type='area' />
|
|
|
|
</div>
|
|
|
|
)}
|
|
|
|
</PreviewContainer>
|
2023-11-27 11:47:48 +08:00
|
|
|
</FloatRightContainer>
|
2023-05-15 08:51:32 +08:00
|
|
|
</div>
|
|
|
|
)
|
|
|
|
}
|
|
|
|
|
|
|
|
export default StepTwo
|