Merge branch 'feat/parent-child-retrieval' of https://github.com/langgenius/dify into feat/parent-child-retrieval

This commit is contained in:
twwu 2024-12-04 14:24:54 +08:00
commit 8d74eb4946
13 changed files with 644 additions and 374 deletions

View File

@ -18,7 +18,7 @@ const dividerVariants = cva(
},
)
type DividerProps = {
export type DividerProps = {
className?: string
style?: CSSProperties
} & VariantProps<typeof dividerVariants>

View File

@ -0,0 +1,23 @@
import type { FC } from 'react'
import type { DividerProps } from '.'
import Divider from '.'
import classNames from '@/utils/classnames'
/**
 * Props for `DividerWithLabel`: everything `Divider` accepts plus the text
 * rendered between the two divider segments.
 */
export type DividerWithLabelProps = DividerProps & {
  label: string
}

/**
 * Renders `label` between two `Divider` segments laid out in a flex row.
 *
 * `className` is merged with `flex-1` and applied to both dividers; every
 * remaining prop is forwarded unchanged to both dividers as well.
 */
export const DividerWithLabel: FC<DividerWithLabelProps> = ({ label, className, ...dividerProps }) => {
  // Both segments share the exact same class list, so compute it once.
  const segmentClassName = classNames('flex-1', className)

  return (
    <div className="flex items-center gap-2 my-2">
      <Divider {...dividerProps} className={segmentClassName} />
      <span className="text-text-tertiary text-xs">{label}</span>
      <Divider {...dividerProps} className={segmentClassName} />
    </div>
  )
}
export default DividerWithLabel

View File

@ -0,0 +1,13 @@
<svg width="10" height="10" viewBox="0 0 10 10" fill="none" xmlns="http://www.w3.org/2000/svg">
<g id="Group">
<path id="Vector" d="M2.5 10H0V7.5H2.5V10Z" fill="#676F83"/>
<path id="Vector_2" d="M6.25 6.25H3.75V3.75H6.25V6.25Z" fill="#676F83"/>
<path id="Vector_3" d="M2.5 6.25H0V3.75H2.5V6.25Z" fill="#676F83"/>
<path id="Vector_4" d="M6.25 2.5H3.75V0H6.25V2.5Z" fill="#676F83"/>
<path id="Vector_5" d="M2.5 2.5H0V0H2.5V2.5Z" fill="#676F83"/>
<path id="Vector_6" d="M10 2.5H7.5V0H10V2.5Z" fill="#676F83"/>
<path id="Vector_7" d="M9.58332 7.91663H7.91666V9.58329H9.58332V7.91663Z" fill="#676F83"/>
<path id="Vector_8" d="M9.58332 4.16663H7.91666V5.83329H9.58332V4.16663Z" fill="#676F83"/>
<path id="Vector_9" d="M5.83332 7.91663H4.16666V9.58329H5.83332V7.91663Z" fill="#676F83"/>
</g>
</svg>

After

Width:  |  Height:  |  Size: 792 B

View File

@ -0,0 +1,55 @@
import type { FC, PropsWithChildren } from 'react'
import Image from 'next/image'
import SelectionMod from './assets/selection-mod-nocolor.svg'
import type { QA } from '@/models/datasets'
/** Props for `ChunkLabel`: the chunk's display name and its character count. */
export type ChunkLabelProps = {
  label: string
  characterCount: number
}

/**
 * One-line chunk caption: the selection icon, the label, a middle-dot
 * separator and the "<n> characters" counter.
 */
export const ChunkLabel: FC<ChunkLabelProps> = ({ label, characterCount }) => {
  const characterCountText = `${characterCount} characters`

  return (
    <div className='flex items-center text-text-tertiary text-xs font-medium'>
      <Image src={SelectionMod} alt="Selection Mod" width={10} height={10} />
      <p className='flex gap-2 ml-0.5'>
        <span>{label}</span>
        <span>·</span>
        <span>{characterCountText}</span>
      </p>
    </div>
  )
}
/** Props for `ChunkContainer`: the caption fields of `ChunkLabel` plus the chunk body as children. */
export type ChunkContainerProps = ChunkLabelProps & PropsWithChildren

/**
 * Renders a chunk: a `ChunkLabel` caption followed by the chunk body.
 *
 * The body wrapper is a `<div>` rather than a `<p>` because callers pass
 * block-level children (for example `QAPreview`, which renders `<div>`s).
 * A `<p>` may only contain phrasing content, so nesting those children in it
 * is invalid HTML and makes React emit DOM-nesting / hydration warnings.
 * The typography classes are unchanged, so plain-text children look the same.
 */
export const ChunkContainer: FC<ChunkContainerProps> = (props) => {
  const { label, characterCount, children } = props
  return (
    <div className='space-y-2'>
      <ChunkLabel label={label} characterCount={characterCount} />
      <div className='text-text-secondary text-sm tracking-[-0.0005em]'>
        {children}
      </div>
    </div>
  )
}
/** Props for `QAPreview`: the question/answer pair to display. */
export type QAPreviewProps = {
  qa: QA
}

/**
 * Shows a question/answer pair as two stacked rows, each prefixed with a
 * one-letter marker ("Q" / "A").
 */
export const QAPreview: FC<QAPreviewProps> = ({ qa }) => {
  const { question, answer } = qa

  return (
    <div className='space-y-2'>
      <div className='flex gap-1 items-start'>
        <label className='text-text-tertiary text-[13px] font-medium'>Q</label>
        <p className='text-text-secondary tracking-[-0.0005em]'>{question}</p>
      </div>
      <div className='flex gap-1 items-start'>
        <label className='text-text-tertiary text-[13px] font-medium'>A</label>
        <p className='text-text-secondary tracking-[-0.0005em]'>{answer}</p>
      </div>
    </div>
  )
}

View File

@ -394,19 +394,6 @@
max-width: 524px;
}
.previewHeader {
position: sticky;
top: 0;
left: 0;
padding-top: 42px;
background-color: #fff;
font-weight: 600;
font-size: 18px;
line-height: 28px;
color: #101828;
z-index: 10;
}
/*
* `fixed` must under `previewHeader` because of style override would not work
*/

View File

@ -1,17 +1,14 @@
'use client'
import type { FC, PropsWithChildren, ReactNode } from 'react'
import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
import type { FC, PropsWithChildren } from 'react'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import { useContext } from 'use-context-selector'
import { useBoolean } from 'ahooks'
import { XMarkIcon } from '@heroicons/react/20/solid'
import {
RiArrowLeftLine,
RiCloseLine,
RiSearchEyeLine,
} from '@remixicon/react'
import Link from 'next/link'
import { groupBy } from 'lodash-es'
import Image from 'next/image'
import SettingCog from '../assets/setting-gear-mod.svg'
import OrangeEffect from '../assets/option-card-effect-orange.svg'
@ -19,7 +16,9 @@ import FamilyMod from '../assets/family-mod.svg'
import Note from '../assets/note-mod.svg'
import FileList from '../assets/file-list-3-fill.svg'
import { indexMethodIcon } from '../icons'
import PreviewItem, { PreviewType } from './preview-item'
import { PreviewContainer } from '../../preview/container'
import { ChunkContainer, QAPreview } from '../../chunk'
import { PreviewHeader } from '../../preview/header'
import s from './index.module.css'
import unescape from './unescape'
import escape from './escape'
@ -27,15 +26,9 @@ import { OptionCard } from './option-card'
import LanguageSelect from './language-select'
import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
import cn from '@/utils/classnames'
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
import {
createDocument,
createFirstDocument,
fetchFileIndexingEstimate as didFetchFileIndexingEstimate,
fetchDefaultProcessRule,
} from '@/service/datasets'
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FullDocumentDetail, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
import Button from '@/app/components/base/button'
import Loading from '@/app/components/base/loading'
import FloatRightContainer from '@/app/components/base/float-right-container'
import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
@ -60,26 +53,20 @@ import { MessageChatSquare } from '@/app/components/base/icons/src/public/common
import { IS_CE_EDITION } from '@/config'
import Switch from '@/app/components/base/switch'
import Divider from '@/app/components/base/divider'
import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/use-datasets'
import Loading from '@/app/components/base/loading'
const TextLabel: FC<PropsWithChildren> = (props) => {
return <label className='text-text-secondary text-xs font-semibold leading-none'>{props.children}</label>
}
const FormField: FC<PropsWithChildren<{ label: ReactNode }>> = (props) => {
return <div className='space-y-2 flex-1'>
<TextLabel>{props.label}</TextLabel>
{props.children}
</div>
}
type ValueOf<T> = T[keyof T]
type StepTwoProps = {
isSetting?: boolean
documentDetail?: FullDocumentDetail
isAPIKeySet: boolean
onSetting: () => void
datasetId?: string
indexingType?: ValueOf<IndexingType>
indexingType?: IndexingType
retrievalMethod?: string
dataSourceType: DataSourceType
files: CustomFile[]
@ -96,11 +83,11 @@ type StepTwoProps = {
onCancel?: () => void
}
enum SegmentType {
export enum SegmentType {
AUTO = 'automatic',
CUSTOM = 'custom',
}
enum IndexingType {
export enum IndexingType {
QUALIFIED = 'high_quality',
ECONOMICAL = 'economy',
}
@ -117,7 +104,6 @@ type ParentChildConfig = {
delimiter: string
maxLength: number
}
rules: PreProcessingRule[]
}
const defaultParentChildConfig: ParentChildConfig = {
@ -130,7 +116,6 @@ const defaultParentChildConfig: ParentChildConfig = {
delimiter: '\\n\\n',
maxLength: 4000,
},
rules: [],
}
const StepTwo = ({
@ -162,10 +147,6 @@ const StepTwo = ({
const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
const scrollRef = useRef<HTMLDivElement>(null)
const [scrolled, setScrolled] = useState(false)
const previewScrollRef = useRef<HTMLDivElement>(null)
const [previewScrolled, setPreviewScrolled] = useState(false)
const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
const setSegmentIdentifier = useCallback((value: string) => {
@ -176,7 +157,7 @@ const StepTwo = ({
const [rules, setRules] = useState<PreProcessingRule[]>([])
const [defaultConfig, setDefaultConfig] = useState<Rules>()
const hasSetIndexType = !!indexingType
const [indexType, setIndexType] = useState<ValueOf<IndexingType>>(
const [indexType, setIndexType] = useState<IndexingType>(
(indexingType
|| isAPIKeySet)
? IndexingType.QUALIFIED
@ -190,37 +171,96 @@ const StepTwo = ({
(datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese'),
)
const [QATipHide, setQATipHide] = useState(false)
const [previewSwitched, setPreviewSwitched] = useState(false)
const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean()
const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
const [automaticFileIndexingEstimate, setAutomaticFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
const fileIndexingEstimate = (() => {
return segmentationType === SegmentType.AUTO ? automaticFileIndexingEstimate : customFileIndexingEstimate
})()
const [isCreating, setIsCreating] = useState(false)
const [qaPreviewSwitched, setQAPreviewSwitched] = useState(false)
const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)
const scrollHandle = (e: Event) => {
if ((e.target as HTMLDivElement).scrollTop > 0)
setScrolled(true)
const getIndexing_technique = () => indexingType || indexType
else
setScrolled(false)
const getProcessRule = () => {
const processRule: ProcessRule = {
rules: {} as any, // api will check this. It will be removed after api refactored.
mode: segmentationType,
}
if (segmentationType === SegmentType.CUSTOM) {
const ruleObj = {
pre_processing_rules: rules,
segmentation: {
separator: unescape(segmentIdentifier),
max_tokens: max,
chunk_overlap: overlap,
},
}
processRule.rules = ruleObj
}
return processRule
}
const previewScrollHandle = (e: Event) => {
if ((e.target as HTMLDivElement).scrollTop > 0)
setPreviewScrolled(true)
const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({
docForm: docForm as DocForm,
docLanguage,
dataSourceType: DataSourceType.FILE,
files,
indexingTechnique: getIndexing_technique() as any,
processRule: getProcessRule(),
dataset_id: datasetId!,
})
const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({
docForm: docForm as DocForm,
docLanguage,
dataSourceType: DataSourceType.NOTION,
notionPages,
indexingTechnique: getIndexing_technique() as any,
processRule: getProcessRule(),
dataset_id: datasetId || '',
})
else
setPreviewScrolled(false)
}
const getFileName = (name: string) => {
const arr = name.split('.')
return arr.slice(0, -1).join('.')
}
const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({
docForm: docForm as DocForm,
docLanguage,
dataSourceType: DataSourceType.WEB,
websitePages,
crawlOptions,
websiteCrawlProvider,
websiteCrawlJobId,
indexingTechnique: getIndexing_technique() as any,
processRule: getProcessRule(),
dataset_id: datasetId || '',
})
const fetchEstimate = useCallback(() => {
if (dataSourceType === DataSourceType.FILE)
fileIndexingEstimateQuery.mutate()
if (dataSourceType === DataSourceType.NOTION)
notionIndexingEstimateQuery.mutate()
if (dataSourceType === DataSourceType.WEB)
websiteIndexingEstimateQuery.mutate()
}, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery])
const estimate
= dataSourceType === DataSourceType.FILE
? fileIndexingEstimateQuery.data
: dataSourceType === DataSourceType.NOTION
? notionIndexingEstimateQuery.data
: websiteIndexingEstimateQuery.data
// const getIsEstimateReady = useCallback(() => {
// if (dataSourceType === DataSourceType.FILE)
// return fileIndexingEstimateQuery.isSuccess
// if (dataSourceType === DataSourceType.NOTION)
// return notionIndexingEstimateQuery.isSuccess
// if (dataSourceType === DataSourceType.WEB)
// return websiteIndexingEstimateQuery.isSuccess
// }, [dataSourceType, fileIndexingEstimateQuery.isSuccess, notionIndexingEstimateQuery.isSuccess, websiteIndexingEstimateQuery.isSuccess])
// const getFileName = (name: string) => {
// const arr = name.split('.')
// return arr.slice(0, -1).join('.')
// }
const getRuleName = (key: string) => {
if (key === 'remove_extra_spaces')
@ -248,129 +288,21 @@ const StepTwo = ({
if (defaultConfig) {
setSegmentIdentifier(defaultConfig.segmentation.separator)
setMax(defaultConfig.segmentation.max_tokens)
setOverlap(defaultConfig.segmentation.chunk_overlap)
setOverlap(defaultConfig.segmentation.chunk_overlap!)
setRules(defaultConfig.pre_processing_rules)
}
setParentChildConfig(defaultParentChildConfig)
}
const fetchFileIndexingEstimate = async (docForm = DocForm.TEXT, language?: string) => {
// eslint-disable-next-line @typescript-eslint/no-use-before-define
const res = await didFetchFileIndexingEstimate(getFileIndexingEstimateParams(docForm, language)!)
if (segmentationType === SegmentType.CUSTOM)
setCustomFileIndexingEstimate(res)
else
setAutomaticFileIndexingEstimate(res)
}
const confirmChangeCustomConfig = () => {
const updatePreview = () => {
if (segmentationType === SegmentType.CUSTOM && max > 4000) {
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
return
}
setCustomFileIndexingEstimate(null)
setShowPreview()
fetchFileIndexingEstimate()
setPreviewSwitched(false)
fetchEstimate()
setQAPreviewSwitched(false)
}
const getIndexing_technique = () => indexingType || indexType
const getProcessRule = () => {
const processRule: ProcessRule = {
rules: {} as any, // api will check this. It will be removed after api refactored.
mode: segmentationType,
}
if (segmentationType === SegmentType.CUSTOM) {
const ruleObj = {
pre_processing_rules: rules,
segmentation: {
separator: unescape(segmentIdentifier),
max_tokens: max,
chunk_overlap: overlap,
},
}
processRule.rules = ruleObj
}
return processRule
}
const getNotionInfo = () => {
const workspacesMap = groupBy(notionPages, 'workspace_id')
const workspaces = Object.keys(workspacesMap).map((workspaceId) => {
return {
workspaceId,
pages: workspacesMap[workspaceId],
}
})
return workspaces.map((workspace) => {
return {
workspace_id: workspace.workspaceId,
pages: workspace.pages.map((page) => {
const { page_id, page_name, page_icon, type } = page
return {
page_id,
page_name,
page_icon,
type,
}
}),
}
}) as NotionInfo[]
}
const getWebsiteInfo = () => {
return {
provider: websiteCrawlProvider,
job_id: websiteCrawlJobId,
urls: websitePages.map(page => page.source_url),
only_main_content: crawlOptions?.only_main_content,
}
}
const getFileIndexingEstimateParams = (docForm: DocForm, language?: string): IndexingEstimateParams | undefined => {
if (dataSourceType === DataSourceType.FILE) {
return {
info_list: {
data_source_type: dataSourceType,
file_info_list: {
file_ids: files.map(file => file.id) as string[],
},
},
indexing_technique: getIndexing_technique() as string,
process_rule: getProcessRule(),
doc_form: docForm,
doc_language: language || docLanguage,
dataset_id: datasetId as string,
}
}
if (dataSourceType === DataSourceType.NOTION) {
return {
info_list: {
data_source_type: dataSourceType,
notion_info_list: getNotionInfo(),
},
indexing_technique: getIndexing_technique() as string,
process_rule: getProcessRule(),
doc_form: docForm,
doc_language: language || docLanguage,
dataset_id: datasetId as string,
}
}
if (dataSourceType === DataSourceType.WEB) {
return {
info_list: {
data_source_type: dataSourceType,
website_info_list: getWebsiteInfo(),
},
indexing_technique: getIndexing_technique() as string,
process_rule: getProcessRule(),
doc_form: docForm,
doc_language: language || docLanguage,
dataset_id: datasetId as string,
}
}
}
const {
modelList: rerankModelList,
defaultModel: rerankDefaultModel,
@ -454,28 +386,35 @@ const StepTwo = ({
}
}
if (dataSourceType === DataSourceType.NOTION)
params.data_source.info_list.notion_info_list = getNotionInfo()
params.data_source.info_list.notion_info_list = getNotionInfo(notionPages)
if (dataSourceType === DataSourceType.WEB)
params.data_source.info_list.website_info_list = getWebsiteInfo()
if (dataSourceType === DataSourceType.WEB) {
params.data_source.info_list.website_info_list = getWebsiteInfo({
websiteCrawlProvider,
websiteCrawlJobId,
websitePages,
})
}
}
return params
}
const getRules = async () => {
try {
const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
const separator = res.rules.segmentation.separator
const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({
onSuccess(data) {
const separator = data.rules.segmentation.separator
setSegmentIdentifier(separator)
setMax(res.rules.segmentation.max_tokens)
setOverlap(res.rules.segmentation.chunk_overlap)
setRules(res.rules.pre_processing_rules)
setDefaultConfig(res.rules)
}
catch (err) {
console.log(err)
}
}
setMax(data.rules.segmentation.max_tokens)
setOverlap(data.rules.segmentation.chunk_overlap!)
setRules(data.rules.pre_processing_rules)
setDefaultConfig(data.rules)
},
onError(error) {
Toast.notify({
type: 'error',
message: `${error}`,
})
},
})
const getRulesFromDetail = () => {
if (documentDetail) {
@ -485,7 +424,7 @@ const StepTwo = ({
const overlap = rules.segmentation.chunk_overlap
setSegmentIdentifier(separator)
setMax(max)
setOverlap(overlap)
setOverlap(overlap as number)
setRules(rules.pre_processing_rules)
setDefaultConfig(rules)
}
@ -496,77 +435,75 @@ const StepTwo = ({
setSegmentationType(documentDetail.dataset_process_rule.mode)
}
const createFirstDocumentMutation = useCreateFirstDocument({
onError(error) {
Toast.notify({
type: 'error',
message: `${error}`,
})
},
})
const createDocumentMutation = useCreateDocument(datasetId!, {
onError(error) {
Toast.notify({
type: 'error',
message: `${error}`,
})
},
})
const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending
const createHandle = async () => {
if (isCreating)
return
setIsCreating(true)
try {
let res
const params = getCreationParams()
if (!params)
return false
setIsCreating(true)
if (!datasetId) {
res = await createFirstDocument({
body: params as CreateDocumentReq,
})
await createFirstDocumentMutation.mutateAsync(
params,
{
onSuccess(data) {
updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
updateResultCache && updateResultCache(res)
updateResultCache && updateResultCache(data)
// eslint-disable-next-line @typescript-eslint/no-use-before-define
updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string)
},
},
)
}
else {
res = await createDocument({
datasetId,
body: params as CreateDocumentReq,
})
await createDocumentMutation.mutateAsync(params, {
onSuccess(data) {
updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
updateResultCache && updateResultCache(res)
updateResultCache && updateResultCache(data)
},
})
}
if (mutateDatasetRes)
mutateDatasetRes()
onStepChange && onStepChange(+1)
isSetting && onSave && onSave()
}
catch (err) {
Toast.notify({
type: 'error',
message: `${err}`,
})
}
finally {
setIsCreating(false)
}
}
const handleSwitch = (state: boolean) => {
if (state)
const handleDocformSwitch = (isQAMode: boolean) => {
if (isQAMode)
setDocForm(DocForm.QA)
else
setDocForm(DocForm.TEXT)
}
const previewSwitch = async (language?: string) => {
setPreviewSwitched(true)
const previewSwitch = () => {
setQAPreviewSwitched(true)
setIsLanguageSelectDisabled(true)
if (segmentationType === SegmentType.AUTO)
setAutomaticFileIndexingEstimate(null)
else
setCustomFileIndexingEstimate(null)
try {
await fetchFileIndexingEstimate(DocForm.QA, language)
}
finally {
setIsLanguageSelectDisabled(false)
}
fetchEstimate()
}
const handleSelect = (language: string) => {
setDocLanguage(language)
// Switch language, re-cutter
if (docForm === DocForm.QA && previewSwitched)
previewSwitch(language)
if (docForm === DocForm.QA && qaPreviewSwitched)
previewSwitch()
}
const changeToEconomicalType = () => {
@ -579,7 +516,7 @@ const StepTwo = ({
useEffect(() => {
// fetch rules
if (!isSetting) {
getRules()
fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule')
}
else {
getRulesFromDetail()
@ -587,22 +524,6 @@ const StepTwo = ({
}
}, [])
useEffect(() => {
scrollRef.current?.addEventListener('scroll', scrollHandle)
return () => {
scrollRef.current?.removeEventListener('scroll', scrollHandle)
}
}, [])
useLayoutEffect(() => {
if (showPreview) {
previewScrollRef.current?.addEventListener('scroll', previewScrollHandle)
return () => {
previewScrollRef.current?.removeEventListener('scroll', previewScrollHandle)
}
}
}, [showPreview])
useEffect(() => {
if (indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA)
setDocForm(DocForm.TEXT)
@ -617,20 +538,6 @@ const StepTwo = ({
setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
}, [isAPIKeySet, indexingType, datasetId])
useEffect(() => {
if (segmentationType === SegmentType.AUTO) {
setAutomaticFileIndexingEstimate(null)
!isMobile && setShowPreview()
fetchFileIndexingEstimate()
setPreviewSwitched(false)
}
else {
hidePreview()
setCustomFileIndexingEstimate(null)
setPreviewSwitched(false)
}
}, [segmentationType, indexType])
const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
search_method: RETRIEVE_METHOD.semantic,
reranking_enable: false,
@ -659,7 +566,7 @@ const StepTwo = ({
onClick={() => setSegmentationType(SegmentType.AUTO)}
actions={
<>
<Button variant={'secondary-accent'}>
<Button variant={'secondary-accent'} onClick={() => updatePreview()}>
<RiSearchEyeLine className='h-4 w-4 mr-1.5' />
{t('datasetCreation.stepTwo.previewChunk')}
</Button>
@ -714,7 +621,7 @@ const StepTwo = ({
onClick={() => setSegmentationType(SegmentType.CUSTOM)}
actions={
<>
<Button variant={'secondary-accent'}>
<Button variant={'secondary-accent'} onClick={() => updatePreview()}>
<RiSearchEyeLine className='h-4 w-4 mr-1.5' />
{t('datasetCreation.stepTwo.previewChunk')}
</Button>
@ -910,7 +817,7 @@ const StepTwo = ({
</div>
<Switch
defaultValue={docForm === DocForm.QA}
onChange={handleSwitch}
onChange={handleDocformSwitch}
size='md'
/>
</div>
@ -1000,70 +907,40 @@ const StepTwo = ({
</div>
</div>
<FloatRightContainer isMobile={isMobile} isOpen={true} onClose={() => { }} footer={null}>
{showPreview && <div
ref={previewScrollRef}
className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll border-l border-[#F2F4F7]')}
<PreviewContainer
header={<PreviewHeader
title='Preview'
>
<div className={cn(s.previewHeader, previewScrolled && `${s.fixed} pb-3`)}>
<div className='flex items-center justify-between px-8'>
<div className='grow flex items-center'>
<div>{t('datasetCreation.stepTwo.previewTitle')}</div>
{docForm === DocForm.QA && !previewSwitched && (
<Button className='ml-2' variant='secondary-accent' onClick={() => previewSwitch()}>{t('datasetCreation.stepTwo.previewButton')}</Button>
</PreviewHeader>}
className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll space-y-4')}
>
{qaPreviewSwitched && docForm === DocForm.QA && estimate?.qa_preview && (
estimate?.qa_preview.map(item => (
<QAPreview key={item.question} qa={item} />
))
)}
</div>
<div className='flex items-center justify-center w-6 h-6 cursor-pointer' onClick={hidePreview}>
<XMarkIcon className='h-4 w-4'></XMarkIcon>
</div>
</div>
{docForm === DocForm.QA && !previewSwitched && (
<div className='px-8 pr-12 text-xs text-gray-500'>
<span>{t('datasetCreation.stepTwo.previewSwitchTipStart')}</span>
<span className='text-amber-600'>{t('datasetCreation.stepTwo.previewSwitchTipEnd')}</span>
</div>
{(docForm === DocForm.TEXT || !qaPreviewSwitched) && estimate?.preview && (
estimate?.preview.map((item, index) => (
<ChunkContainer
key={item}
label={`Chunk-${index + 1}`}
characterCount={item.length}
>
{item}
</ChunkContainer>
))
)}
</div>
<div className='my-4 px-8 space-y-4'>
{previewSwitched && docForm === DocForm.QA && fileIndexingEstimate?.qa_preview && (
<>
{fileIndexingEstimate?.qa_preview.map((item, index) => (
<PreviewItem type={PreviewType.QA} key={item.question} qa={item} index={index + 1} />
))}
</>
)}
{(docForm === DocForm.TEXT || !previewSwitched) && fileIndexingEstimate?.preview && (
<>
{fileIndexingEstimate?.preview.map((item, index) => (
<PreviewItem type={PreviewType.TEXT} key={item} content={item} index={index + 1} />
))}
</>
)}
{previewSwitched && docForm === DocForm.QA && !fileIndexingEstimate?.qa_preview && (
{qaPreviewSwitched && docForm === DocForm.QA && !estimate?.qa_preview && (
<div className='flex items-center justify-center h-[200px]'>
<Loading type='area' />
</div>
)}
{!previewSwitched && !fileIndexingEstimate?.preview && (
{!qaPreviewSwitched && !estimate?.preview && (
<div className='flex items-center justify-center h-[200px]'>
<Loading type='area' />
</div>
)}
</div>
</div>}
{!showPreview && (
<div className={cn(s.sideTip)}>
<div className={s.tipCard}>
<span className={s.icon} />
<div className={s.title}>{t('datasetCreation.stepTwo.sideTipTitle')}</div>
<div className={s.content}>
<p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP1')}</p>
<p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP2')}</p>
<p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP3')}</p>
<p>{t('datasetCreation.stepTwo.sideTipP4')}</p>
</div>
</div>
</div>
)}
</PreviewContainer>
</FloatRightContainer>
</div>
)

View File

@ -34,7 +34,7 @@ export const SliceContent: FC<SliceContentProps> = forwardRef((props, ref) => {
const { className, children, ...rest } = props
return <span {...rest} ref={ref} className={classNames(
baseStyle,
'px-1 bg-state-base-hover group-hover:bg-state-accent-hover-alt group-hover:text-text-primary',
'px-1 bg-state-base-hover group-hover:bg-state-accent-hover-alt group-hover:text-text-primary leading-7',
className,
)}>
{children}

View File

@ -0,0 +1,27 @@
import type { ComponentProps, FC, ReactNode } from 'react'
import { forwardRef } from 'react'
import classNames from '@/utils/classnames'
/** Props for `PreviewContainer`: any `<div>` prop plus the node rendered in the header area. */
export type PreviewContainerProps = ComponentProps<'div'> & {
  header: ReactNode
}

/**
 * Card-style wrapper for preview content.
 *
 * Renders `header` inside a bordered `<header>` strip and `children` inside a
 * padded `<main>`. The forwarded ref and all remaining props go to the outer
 * `<div>`; `className` is appended after the built-in card classes.
 */
export const PreviewContainer: FC<PreviewContainerProps> = forwardRef(
  ({ header, children, className, ...divProps }, ref) => {
    const containerClassName = classNames(
      'flex flex-col rounded-xl border-t-[0.5px] border-l-[0.5px] border-components-panel-border bg-background-default-lighter shadow shadow-shadow-shadow-5',
      className,
    )

    return (
      <div {...divProps} ref={ref} className={containerClassName}>
        <header className='py-4 pl-5 pr-3 border-b border-divider-subtle'>
          {header}
        </header>
        <main className='py-5 px-6'>
          {children}
        </main>
      </div>
    )
  },
)
// forwardRef wraps an anonymous function, so name the component explicitly.
PreviewContainer.displayName = 'PreviewContainer'

View File

@ -0,0 +1,23 @@
import type { ComponentProps, FC } from 'react'
import classNames from '@/utils/classnames'
/**
 * Props for `PreviewHeader`: any `<div>` prop, with the native `title`
 * attribute replaced by a required heading string.
 */
export type PreviewHeaderProps = Omit<ComponentProps<'div'>, 'title'> & {
  title: string
}

/**
 * Header block for a preview panel: renders `title` as a small uppercase
 * caption, followed by any `children`. Remaining props are spread onto the
 * outer `<div>`.
 */
export const PreviewHeader: FC<PreviewHeaderProps> = ({ title, className, children, ...divProps }) => {
  const wrapperClassName = classNames(
    className,
  )

  return (
    <div {...divProps} className={wrapperClassName}>
      <div className='text-text-accent text-2xs font-semibold leading-3 uppercase'>
        {title}
      </div>
      {children}
    </div>
  )
}

View File

@ -1,35 +1,76 @@
'use client'
import { useState } from 'react'
import { FormattedText } from '../components/datasets/formatted-text/formatted'
import { PreviewSlice } from '../components/datasets/formatted-text/flavours/preview-slice'
import { EditSlice } from '../components/datasets/formatted-text/flavours/edit-slice'
import { PreviewContainer } from '../components/datasets/preview/container'
import { PreviewHeader } from '../components/datasets/preview/header'
import FileIcon from '../components/base/file-icon'
import { ChevronDown } from '../components/base/icons/src/vender/solid/arrows'
import Badge from '../components/base/badge'
import { DividerWithLabel } from '../components/base/divider/with-label'
import Button from '../components/base/button'
import { ChunkContainer, QAPreview } from '../components/datasets/chunk'
import classNames from '@/utils/classnames'
export default function Page() {
const [parentChild, setParentChild] = useState(false)
const [vertical, setVertical] = useState(false)
const [qa, setQa] = useState(false)
return <div className='p-4'>
<FormattedText>
<PreviewSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' tooltip={'Child-chunk-2 · 268 Characters'} />
<PreviewSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' tooltip={'Child-chunk-2 · 268 Characters'} />
<PreviewSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' tooltip={'Child-chunk-2 · 268 Characters'} />
<PreviewSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' tooltip={'Child-chunk-2 · 268 Characters'} />
<PreviewSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' tooltip={'Child-chunk-2 · 268 Characters'} />
</FormattedText>
<div className='mt-12 flex flex-col gap-2'>
<EditSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' onDelete={function (): void {
console.log('onDelete')
} } />
<EditSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' onDelete={function (): void {
console.log('onDelete')
} } />
<EditSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' onDelete={function (): void {
console.log('onDelete')
} } />
<EditSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' onDelete={function (): void {
console.log('onDelete')
} } />
<EditSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' onDelete={function (): void {
console.log('onDelete')
} } />
<div className='flex gap-2 my-4'>
<Button onClick={() => setParentChild(!parentChild)}>
Parent-Child
</Button>
<Button onClick={() => setVertical(!vertical)}>Vertical</Button>
<Button onClick={() => setQa(!qa)}>QA</Button>
</div>
<PreviewContainer header={
<PreviewHeader title='Preview'>
<div className='flex items-center'>
<FileIcon type='pdf' className='size-4' />
<p
className='text-text-primary text-sm font-semibold mx-1'
>EOS R3 Tech Sheet.pdf</p>
<ChevronDown className='size-[18px]' />
<Badge text='276 Estimated chunks' className='ml-1' />
</div>
</PreviewHeader>
}>
<div className='space-y-6'>{parentChild
? Array.from({ length: 4 }, (_, i) => {
return <ChunkContainer
label='Parent-Chunk-01'
characterCount={521}
key={i}
>
<FormattedText className={classNames(
'w-full',
vertical && 'flex flex-col gap-2',
)}>
{Array.from({ length: 4 }, (_, i) => {
return <PreviewSlice
key={i}
label='C-1'
text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' tooltip={'Child-chunk-2 · 268 Characters'} />
})}
</FormattedText>
</ChunkContainer>
})
: Array.from({ length: 2 }, (_, i) => {
return <ChunkContainer label='Chunk-01' characterCount={521} key={i}>
{
qa
? <QAPreview qa={{
question: 'What is the author\'s unconventional approach to writing this book, and how does it challenge the traditional academic mindset of \'publish or perish\'?',
answer: 'It is quite natural for academics who are continuously told to “publish or perish” to want to always create something from scratch that is their own fresh creation. This book is an experiment in not starting from scratch, but instead “re-mixing” the book titled Think Python: How to Think Like a Computer Scientist written by Allen B. Downey, Jeff Elkner and others.',
}} />
: 'In December of 2009, I was preparing to teach SI502 - Networked Programming at the University of Michigan for the fifth semester in a row and decided it was time to write a Python textbook that focused on exploring data instead of understanding algorithms and abstractions. My goal in SI502 is to teach people life-long data handling skills using Python. Few of my students were planning to be professional computer programmers. Instead, they planned be librarians, managers, lawyers, biologists, economists, etc. who happened to want to skillfully use technology in their chosen field.'
}
</ChunkContainer>
})
}</div>
<DividerWithLabel label='Display previews of up to 10 paragraphs' />
</PreviewContainer>
</div>
}

View File

@ -330,6 +330,7 @@ export type NotionPage = {
}
/**
 * Processing configuration for a document.
 *
 * Besides the top-level `mode` and `rules`, it carries a nested `processRule`
 * object holding the pre-processing rules and the segmentation settings.
 */
export type ProcessRule = {
  processRule: {
    pre_processing_rules: PreProcessingRule[]
    segmentation: {
      separator: string
      max_tokens: number
      chunk_overlap: number
    }
  }
  mode: string
  rules: Rules
}

223
web/service/use-datasets.ts Normal file
View File

@ -0,0 +1,223 @@
import groupBy from 'lodash-es/groupBy'
import type { MutationOptions } from '@tanstack/react-query'
import { useMutation } from '@tanstack/react-query'
import { createDocument, createFirstDocument, fetchDefaultProcessRule, fetchFileIndexingEstimate } from './datasets'
import { type IndexingType } from '@/app/components/datasets/create/step-two'
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DataSourceType, DocForm, FileIndexingEstimateResponse, IndexingEstimateParams, NotionInfo, ProcessRule, ProcessRuleResponse, createDocumentResponse } from '@/models/datasets'
import type { DataSourceProvider, NotionPage } from '@/models/common'
/**
 * Groups Notion pages by their `workspace_id` and trims every page down to
 * `page_id`, `page_name`, `page_icon` and `type`.
 *
 * @param notionPages - Pages to group.
 * @returns One `{ workspace_id, pages }` entry per distinct workspace id.
 */
export const getNotionInfo = (
  notionPages: NotionPage[],
) => {
  const pagesByWorkspace = groupBy(notionPages, 'workspace_id')
  return Object.entries(pagesByWorkspace).map(([workspace_id, pages]) => ({
    workspace_id,
    pages: pages.map(({ page_id, page_name, page_icon, type }) => ({
      page_id,
      page_name,
      page_icon,
      type,
    })),
  })) as NotionInfo[]
}
/**
 * Builds the website section of a request from a crawl result.
 *
 * @param opts - Crawl provider, crawl job id, the crawled pages and optional
 *   crawl options.
 * @returns An object with `provider`, `job_id`, the `source_url` of every page
 *   as `urls`, and `only_main_content` taken from `crawlOptions`
 *   (`undefined` when no crawl options are given).
 */
export const getWebsiteInfo = (
  opts: {
    websiteCrawlProvider: DataSourceProvider
    websiteCrawlJobId: string
    websitePages: CrawlResultItem[]
    crawlOptions?: CrawlOptions
  },
) => {
  const sourceUrls = opts.websitePages.map(({ source_url }) => source_url)
  return {
    provider: opts.websiteCrawlProvider,
    job_id: opts.websiteCrawlJobId,
    urls: sourceUrls,
    only_main_content: opts.crawlOptions?.only_main_content,
  }
}
/**
 * Options common to every indexing-estimate parameter builder in this module,
 * independent of the data source.
 */
type GetFileIndexingEstimateParamsOptionBase = {
  /** Forwarded to the request as `dataset_id`. */
  dataset_id: string
  /** Forwarded to the request as `doc_form`. */
  docForm: DocForm
  /** Forwarded to the request as `doc_language`. */
  docLanguage: string
  /** Forwarded to the request as `indexing_technique`. */
  indexingTechnique: IndexingType
  /** Forwarded to the request as `process_rule`. */
  processRule: ProcessRule
}
/**
 * Options for building an indexing-estimate request whose data source is
 * `DataSourceType.FILE`.
 */
type GetFileIndexingEstimateParamsOptionFile = GetFileIndexingEstimateParamsOptionBase & {
  /** Files whose `id`s are sent as the request's `file_ids`. */
  files: CustomFile[]
  /** Discriminator fixing the data source to files. */
  dataSourceType: DataSourceType.FILE
}
/**
 * Builds the indexing-estimate request parameters for file data sources.
 *
 * @param options - File options; the `id` of every entry in `files` is sent
 *   as `file_ids`.
 * @returns The request parameters, with the remaining options copied to their
 *   snake_case request fields.
 */
const getFileIndexingEstimateParamsForFile = (
  options: GetFileIndexingEstimateParamsOptionFile,
): IndexingEstimateParams => {
  // The ids are asserted to be strings; no entries are filtered out here.
  const fileIds = options.files.map(({ id }) => id) as string[]
  return {
    info_list: {
      data_source_type: options.dataSourceType,
      file_info_list: { file_ids: fileIds },
    },
    indexing_technique: options.indexingTechnique,
    process_rule: options.processRule,
    doc_form: options.docForm,
    doc_language: options.docLanguage,
    dataset_id: options.dataset_id,
  }
}
/**
 * Mutation hook that requests an indexing estimate for file data sources.
 *
 * The request parameters are built from `options` each time the mutation runs.
 *
 * @param options - File data-source options used to build the request.
 * @param mutationOptions - Extra mutation options. They are spread after the
 *   default `mutationFn`, so a `mutationFn` supplied here replaces it.
 * @returns The result of `useMutation`.
 */
export const useFetchFileIndexingEstimateForFile = (
  options: GetFileIndexingEstimateParamsOptionFile,
  mutationOptions: MutationOptions<FileIndexingEstimateResponse> = {},
) => {
  const requestEstimate = async () => {
    const params = getFileIndexingEstimateParamsForFile(options)
    return fetchFileIndexingEstimate(params)
  }
  return useMutation({ mutationFn: requestEstimate, ...mutationOptions })
}
/**
 * Options for building an indexing-estimate request whose data source is
 * `DataSourceType.NOTION`.
 */
type GetFileIndexingEstimateParamsOptionNotion = GetFileIndexingEstimateParamsOptionBase & {
  /** Notion pages that are converted into the request's `notion_info_list`. */
  notionPages: NotionPage[]
  /** Discriminator fixing the data source to Notion. */
  dataSourceType: DataSourceType.NOTION
}
/**
 * Builds the indexing-estimate request parameters for Notion data sources.
 *
 * @param options - Notion options; `notionPages` is converted with
 *   `getNotionInfo` and sent as `notion_info_list`.
 * @returns The request parameters, with the remaining options copied to their
 *   snake_case request fields.
 */
const getFileIndexingEstimateParamsForNotion = (
  options: GetFileIndexingEstimateParamsOptionNotion,
): IndexingEstimateParams => {
  const notionInfoList = getNotionInfo(options.notionPages)
  return {
    info_list: {
      data_source_type: options.dataSourceType,
      notion_info_list: notionInfoList,
    },
    indexing_technique: options.indexingTechnique,
    process_rule: options.processRule,
    doc_form: options.docForm,
    doc_language: options.docLanguage,
    dataset_id: options.dataset_id,
  }
}
/**
 * Mutation hook that requests an indexing estimate for Notion data sources.
 *
 * The request parameters are built from `options` each time the mutation runs.
 *
 * @param options - Notion data-source options used to build the request.
 * @param mutationOptions - Extra mutation options. They are spread after the
 *   default `mutationFn`, so a `mutationFn` supplied here replaces it.
 * @returns The result of `useMutation`.
 */
export const useFetchFileIndexingEstimateForNotion = (
  options: GetFileIndexingEstimateParamsOptionNotion,
  mutationOptions: MutationOptions<FileIndexingEstimateResponse> = {},
) => {
  const requestEstimate = async () => {
    const params = getFileIndexingEstimateParamsForNotion(options)
    return fetchFileIndexingEstimate(params)
  }
  return useMutation({ mutationFn: requestEstimate, ...mutationOptions })
}
/**
 * Options for building an indexing-estimate request whose data source is
 * `DataSourceType.WEB`.
 */
type GetFileIndexingEstimateParamsOptionWeb = GetFileIndexingEstimateParamsOptionBase & {
  /** Discriminator fixing the data source to websites. */
  dataSourceType: DataSourceType.WEB
  /** Passed to `getWebsiteInfo` as the crawl provider. */
  websiteCrawlProvider: DataSourceProvider
  /** Passed to `getWebsiteInfo` as the crawl job id. */
  websiteCrawlJobId: string
  /** Crawled pages passed to `getWebsiteInfo`. */
  websitePages: CrawlResultItem[]
  /** Optional crawl options passed to `getWebsiteInfo`. */
  crawlOptions?: CrawlOptions
}
/**
 * Builds the indexing-estimate request parameters for website data sources.
 *
 * @param options - Website options; the crawl provider, job id, pages and
 *   crawl options are converted with `getWebsiteInfo` and sent as
 *   `website_info_list`.
 * @returns The request parameters, with the remaining options copied to their
 *   snake_case request fields.
 */
const getFileIndexingEstimateParamsForWeb = (
  options: GetFileIndexingEstimateParamsOptionWeb,
): IndexingEstimateParams => {
  const { websiteCrawlProvider, websiteCrawlJobId, websitePages, crawlOptions } = options
  const websiteInfo = getWebsiteInfo({
    websiteCrawlProvider,
    websiteCrawlJobId,
    websitePages,
    crawlOptions,
  })
  return {
    info_list: {
      data_source_type: options.dataSourceType,
      website_info_list: websiteInfo,
    },
    indexing_technique: options.indexingTechnique,
    process_rule: options.processRule,
    doc_form: options.docForm,
    doc_language: options.docLanguage,
    dataset_id: options.dataset_id,
  }
}
/**
 * Mutation hook that requests an indexing estimate for website data sources.
 *
 * The request parameters are built from `options` each time the mutation runs.
 *
 * @param options - Website data-source options used to build the request.
 * @param mutationOptions - Extra mutation options. They are spread after the
 *   default `mutationFn`, so a `mutationFn` supplied here replaces it.
 * @returns The result of `useMutation`.
 */
export const useFetchFileIndexingEstimateForWeb = (
  options: GetFileIndexingEstimateParamsOptionWeb,
  mutationOptions: MutationOptions<FileIndexingEstimateResponse> = {},
) => {
  const requestEstimate = async () => {
    const params = getFileIndexingEstimateParamsForWeb(options)
    return fetchFileIndexingEstimate(params)
  }
  return useMutation({ mutationFn: requestEstimate, ...mutationOptions })
}
/**
 * Mutation hook wrapping `createFirstDocument`.
 *
 * The mutation variable is the `CreateDocumentReq`, which is sent as the
 * request `body`.
 *
 * @param mutationOptions - Extra mutation options. They are spread after the
 *   default `mutationFn`, so a `mutationFn` supplied here replaces it.
 * @returns The result of `useMutation`.
 */
export const useCreateFirstDocument = (
  mutationOptions: MutationOptions<createDocumentResponse, Error, CreateDocumentReq> = {},
) => {
  const submitFirstDocument = async (body: CreateDocumentReq) => createFirstDocument({ body })
  return useMutation({ mutationFn: submitFirstDocument, ...mutationOptions })
}
/**
 * Mutation hook wrapping `createDocument` for a given dataset.
 *
 * The mutation variable is the `CreateDocumentReq`, which is sent as the
 * request `body` together with `datasetId`.
 *
 * @param datasetId - Dataset id passed to `createDocument` on every call.
 * @param mutationOptions - Extra mutation options. They are spread after the
 *   default `mutationFn`, so a `mutationFn` supplied here replaces it.
 * @returns The result of `useMutation`.
 */
export const useCreateDocument = (
  datasetId: string,
  mutationOptions: MutationOptions<createDocumentResponse, Error, CreateDocumentReq> = {},
) => {
  const submitDocument = async (body: CreateDocumentReq) => createDocument({ datasetId, body })
  return useMutation({ mutationFn: submitDocument, ...mutationOptions })
}
/**
 * Mutation hook wrapping `fetchDefaultProcessRule`.
 *
 * The mutation variable is a string that is passed to the fetcher as `url`.
 *
 * @param mutationOptions - Extra mutation options. They are spread after the
 *   default `mutationFn`, so a `mutationFn` supplied here replaces it.
 * @returns The result of `useMutation`.
 */
export const useFetchDefaultProcessRule = (
  mutationOptions: MutationOptions<ProcessRuleResponse, Error, string> = {},
) => {
  const loadDefaultRule = async (url: string) => fetchDefaultProcessRule({ url })
  return useMutation({ mutationFn: loadDefaultRule, ...mutationOptions })
}