Merge branch 'feat/parent-child-retrieval' of https://github.com/langgenius/dify into feat/parent-child-retrieval
This commit is contained in:
commit
8d74eb4946
@ -18,7 +18,7 @@ const dividerVariants = cva(
|
||||
},
|
||||
)
|
||||
|
||||
type DividerProps = {
|
||||
export type DividerProps = {
|
||||
className?: string
|
||||
style?: CSSProperties
|
||||
} & VariantProps<typeof dividerVariants>
|
||||
|
23
web/app/components/base/divider/with-label.tsx
Normal file
23
web/app/components/base/divider/with-label.tsx
Normal file
@ -0,0 +1,23 @@
|
||||
import type { FC } from 'react'
|
||||
import type { DividerProps } from '.'
|
||||
import Divider from '.'
|
||||
import classNames from '@/utils/classnames'
|
||||
|
||||
/**
 * Props for `DividerWithLabel`: everything `Divider` accepts plus the
 * text shown between the two divider segments.
 */
export type DividerWithLabelProps = DividerProps & {
  label: string
}

/**
 * A flex row made of a `Divider`, a small label, and a second `Divider`.
 *
 * `className` is merged with `flex-1` and applied to both divider segments;
 * every remaining prop is forwarded unchanged to both of them.
 */
export const DividerWithLabel: FC<DividerWithLabelProps> = ({ label, className, ...dividerProps }) => {
  // Both segments share the same class list, so build it once.
  const segmentClassName = classNames('flex-1', className)

  return (
    <div className="flex items-center gap-2 my-2">
      <Divider {...dividerProps} className={segmentClassName} />
      <span className="text-text-tertiary text-xs">
        {label}
      </span>
      <Divider {...dividerProps} className={segmentClassName} />
    </div>
  )
}
|
||||
|
||||
export default DividerWithLabel
|
13
web/app/components/datasets/assets/selection-mod-nocolor.svg
Normal file
13
web/app/components/datasets/assets/selection-mod-nocolor.svg
Normal file
@ -0,0 +1,13 @@
|
||||
<svg width="10" height="10" viewBox="0 0 10 10" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<g id="Group">
|
||||
<path id="Vector" d="M2.5 10H0V7.5H2.5V10Z" fill="#676F83"/>
|
||||
<path id="Vector_2" d="M6.25 6.25H3.75V3.75H6.25V6.25Z" fill="#676F83"/>
|
||||
<path id="Vector_3" d="M2.5 6.25H0V3.75H2.5V6.25Z" fill="#676F83"/>
|
||||
<path id="Vector_4" d="M6.25 2.5H3.75V0H6.25V2.5Z" fill="#676F83"/>
|
||||
<path id="Vector_5" d="M2.5 2.5H0V0H2.5V2.5Z" fill="#676F83"/>
|
||||
<path id="Vector_6" d="M10 2.5H7.5V0H10V2.5Z" fill="#676F83"/>
|
||||
<path id="Vector_7" d="M9.58332 7.91663H7.91666V9.58329H9.58332V7.91663Z" fill="#676F83"/>
|
||||
<path id="Vector_8" d="M9.58332 4.16663H7.91666V5.83329H9.58332V4.16663Z" fill="#676F83"/>
|
||||
<path id="Vector_9" d="M5.83332 7.91663H4.16666V9.58329H5.83332V7.91663Z" fill="#676F83"/>
|
||||
</g>
|
||||
</svg>
|
After Width: | Height: | Size: 792 B |
55
web/app/components/datasets/chunk.tsx
Normal file
55
web/app/components/datasets/chunk.tsx
Normal file
@ -0,0 +1,55 @@
|
||||
import type { FC, PropsWithChildren } from 'react'
|
||||
import Image from 'next/image'
|
||||
import SelectionMod from './assets/selection-mod-nocolor.svg'
|
||||
import type { QA } from '@/models/datasets'
|
||||
|
||||
/** Props for `ChunkLabel`: the chunk's display label and its character count. */
export type ChunkLabelProps = {
  label: string
  characterCount: number
}

/**
 * One-line chunk caption: a 10x10 icon followed by the label, a middle-dot
 * separator, and the text "<characterCount> characters".
 */
export const ChunkLabel: FC<ChunkLabelProps> = ({ label, characterCount }) => {
  // Kept as a single string so it renders as one text node.
  const countText = `${characterCount} characters`

  return (
    <div className='flex items-center text-text-tertiary text-xs font-medium'>
      <Image src={SelectionMod} alt="Selection Mod" width={10} height={10} />
      <p className='flex gap-2 ml-0.5'>
        <span>{label}</span>
        <span>·</span>
        <span>{countText}</span>
      </p>
    </div>
  )
}
|
||||
|
||||
/** Props for `ChunkContainer`: the `ChunkLabel` props plus the content to display. */
export type ChunkContainerProps = ChunkLabelProps & PropsWithChildren

/**
 * Renders a `ChunkLabel` caption above a paragraph that holds `children`.
 */
export const ChunkContainer: FC<ChunkContainerProps> = ({ label, characterCount, children }) => {
  return (
    <div className='space-y-2'>
      <ChunkLabel label={label} characterCount={characterCount} />
      <p className='text-text-secondary text-sm tracking-[-0.0005em]'>
        {children}
      </p>
    </div>
  )
}
|
||||
|
||||
/** Props for `QAPreview`: the question/answer pair to display. */
export type QAPreviewProps = {
  qa: QA
}

/**
 * Shows one question/answer pair as two rows, each prefixed with a short
 * marker ("Q" for `qa.question`, "A" for `qa.answer`).
 */
export const QAPreview: FC<QAPreviewProps> = ({ qa }) => {
  const { question, answer } = qa

  return (
    <div className='space-y-2'>
      <div className='flex gap-1 items-start'>
        <label className='text-text-tertiary text-[13px] font-medium'>Q</label>
        <p className='text-text-secondary tracking-[-0.0005em]'>{question}</p>
      </div>
      <div className='flex gap-1 items-start'>
        <label className='text-text-tertiary text-[13px] font-medium'>A</label>
        <p className='text-text-secondary tracking-[-0.0005em]'>{answer}</p>
      </div>
    </div>
  )
}
|
@ -394,19 +394,6 @@
|
||||
max-width: 524px;
|
||||
}
|
||||
|
||||
.previewHeader {
|
||||
position: sticky;
|
||||
top: 0;
|
||||
left: 0;
|
||||
padding-top: 42px;
|
||||
background-color: #fff;
|
||||
font-weight: 600;
|
||||
font-size: 18px;
|
||||
line-height: 28px;
|
||||
color: #101828;
|
||||
z-index: 10;
|
||||
}
|
||||
|
||||
/*
|
||||
* `fixed` must be declared after `previewHeader`; otherwise its style override will not take effect
|
||||
*/
|
||||
|
@ -1,17 +1,14 @@
|
||||
'use client'
|
||||
import type { FC, PropsWithChildren, ReactNode } from 'react'
|
||||
import React, { useCallback, useEffect, useLayoutEffect, useRef, useState } from 'react'
|
||||
import type { FC, PropsWithChildren } from 'react'
|
||||
import React, { useCallback, useEffect, useState } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { useContext } from 'use-context-selector'
|
||||
import { useBoolean } from 'ahooks'
|
||||
import { XMarkIcon } from '@heroicons/react/20/solid'
|
||||
import {
|
||||
RiArrowLeftLine,
|
||||
RiCloseLine,
|
||||
RiSearchEyeLine,
|
||||
} from '@remixicon/react'
|
||||
import Link from 'next/link'
|
||||
import { groupBy } from 'lodash-es'
|
||||
import Image from 'next/image'
|
||||
import SettingCog from '../assets/setting-gear-mod.svg'
|
||||
import OrangeEffect from '../assets/option-card-effect-orange.svg'
|
||||
@ -19,7 +16,9 @@ import FamilyMod from '../assets/family-mod.svg'
|
||||
import Note from '../assets/note-mod.svg'
|
||||
import FileList from '../assets/file-list-3-fill.svg'
|
||||
import { indexMethodIcon } from '../icons'
|
||||
import PreviewItem, { PreviewType } from './preview-item'
|
||||
import { PreviewContainer } from '../../preview/container'
|
||||
import { ChunkContainer, QAPreview } from '../../chunk'
|
||||
import { PreviewHeader } from '../../preview/header'
|
||||
import s from './index.module.css'
|
||||
import unescape from './unescape'
|
||||
import escape from './escape'
|
||||
@ -27,15 +26,9 @@ import { OptionCard } from './option-card'
|
||||
import LanguageSelect from './language-select'
|
||||
import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
|
||||
import cn from '@/utils/classnames'
|
||||
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
|
||||
import {
|
||||
createDocument,
|
||||
createFirstDocument,
|
||||
fetchFileIndexingEstimate as didFetchFileIndexingEstimate,
|
||||
fetchDefaultProcessRule,
|
||||
} from '@/service/datasets'
|
||||
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FullDocumentDetail, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
|
||||
|
||||
import Button from '@/app/components/base/button'
|
||||
import Loading from '@/app/components/base/loading'
|
||||
import FloatRightContainer from '@/app/components/base/float-right-container'
|
||||
import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
|
||||
import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
|
||||
@ -60,26 +53,20 @@ import { MessageChatSquare } from '@/app/components/base/icons/src/public/common
|
||||
import { IS_CE_EDITION } from '@/config'
|
||||
import Switch from '@/app/components/base/switch'
|
||||
import Divider from '@/app/components/base/divider'
|
||||
import { getNotionInfo, getWebsiteInfo, useCreateDocument, useCreateFirstDocument, useFetchDefaultProcessRule, useFetchFileIndexingEstimateForFile, useFetchFileIndexingEstimateForNotion, useFetchFileIndexingEstimateForWeb } from '@/service/use-datasets'
|
||||
import Loading from '@/app/components/base/loading'
|
||||
|
||||
const TextLabel: FC<PropsWithChildren> = (props) => {
|
||||
return <label className='text-text-secondary text-xs font-semibold leading-none'>{props.children}</label>
|
||||
}
|
||||
|
||||
const FormField: FC<PropsWithChildren<{ label: ReactNode }>> = (props) => {
|
||||
return <div className='space-y-2 flex-1'>
|
||||
<TextLabel>{props.label}</TextLabel>
|
||||
{props.children}
|
||||
</div>
|
||||
}
|
||||
|
||||
type ValueOf<T> = T[keyof T]
|
||||
type StepTwoProps = {
|
||||
isSetting?: boolean
|
||||
documentDetail?: FullDocumentDetail
|
||||
isAPIKeySet: boolean
|
||||
onSetting: () => void
|
||||
datasetId?: string
|
||||
indexingType?: ValueOf<IndexingType>
|
||||
indexingType?: IndexingType
|
||||
retrievalMethod?: string
|
||||
dataSourceType: DataSourceType
|
||||
files: CustomFile[]
|
||||
@ -96,11 +83,11 @@ type StepTwoProps = {
|
||||
onCancel?: () => void
|
||||
}
|
||||
|
||||
enum SegmentType {
|
||||
export enum SegmentType {
|
||||
AUTO = 'automatic',
|
||||
CUSTOM = 'custom',
|
||||
}
|
||||
enum IndexingType {
|
||||
export enum IndexingType {
|
||||
QUALIFIED = 'high_quality',
|
||||
ECONOMICAL = 'economy',
|
||||
}
|
||||
@ -117,7 +104,6 @@ type ParentChildConfig = {
|
||||
delimiter: string
|
||||
maxLength: number
|
||||
}
|
||||
rules: PreProcessingRule[]
|
||||
}
|
||||
|
||||
const defaultParentChildConfig: ParentChildConfig = {
|
||||
@ -130,7 +116,6 @@ const defaultParentChildConfig: ParentChildConfig = {
|
||||
delimiter: '\\n\\n',
|
||||
maxLength: 4000,
|
||||
},
|
||||
rules: [],
|
||||
}
|
||||
|
||||
const StepTwo = ({
|
||||
@ -162,10 +147,6 @@ const StepTwo = ({
|
||||
const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
|
||||
const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
|
||||
const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
|
||||
const scrollRef = useRef<HTMLDivElement>(null)
|
||||
const [scrolled, setScrolled] = useState(false)
|
||||
const previewScrollRef = useRef<HTMLDivElement>(null)
|
||||
const [previewScrolled, setPreviewScrolled] = useState(false)
|
||||
const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
|
||||
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
|
||||
const setSegmentIdentifier = useCallback((value: string) => {
|
||||
@ -176,7 +157,7 @@ const StepTwo = ({
|
||||
const [rules, setRules] = useState<PreProcessingRule[]>([])
|
||||
const [defaultConfig, setDefaultConfig] = useState<Rules>()
|
||||
const hasSetIndexType = !!indexingType
|
||||
const [indexType, setIndexType] = useState<ValueOf<IndexingType>>(
|
||||
const [indexType, setIndexType] = useState<IndexingType>(
|
||||
(indexingType
|
||||
|| isAPIKeySet)
|
||||
? IndexingType.QUALIFIED
|
||||
@ -190,37 +171,96 @@ const StepTwo = ({
|
||||
(datasetId && documentDetail) ? documentDetail.doc_language : (locale !== LanguagesSupported[1] ? 'English' : 'Chinese'),
|
||||
)
|
||||
const [QATipHide, setQATipHide] = useState(false)
|
||||
const [previewSwitched, setPreviewSwitched] = useState(false)
|
||||
const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean()
|
||||
const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
|
||||
const [automaticFileIndexingEstimate, setAutomaticFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
|
||||
|
||||
const fileIndexingEstimate = (() => {
|
||||
return segmentationType === SegmentType.AUTO ? automaticFileIndexingEstimate : customFileIndexingEstimate
|
||||
})()
|
||||
const [isCreating, setIsCreating] = useState(false)
|
||||
const [qaPreviewSwitched, setQAPreviewSwitched] = useState(false)
|
||||
|
||||
const [parentChildConfig, setParentChildConfig] = useState<ParentChildConfig>(defaultParentChildConfig)
|
||||
|
||||
const scrollHandle = (e: Event) => {
|
||||
if ((e.target as HTMLDivElement).scrollTop > 0)
|
||||
setScrolled(true)
|
||||
const getIndexing_technique = () => indexingType || indexType
|
||||
|
||||
else
|
||||
setScrolled(false)
|
||||
const getProcessRule = () => {
|
||||
const processRule: ProcessRule = {
|
||||
rules: {} as any, // api will check this. It will be removed after api refactored.
|
||||
mode: segmentationType,
|
||||
}
|
||||
if (segmentationType === SegmentType.CUSTOM) {
|
||||
const ruleObj = {
|
||||
pre_processing_rules: rules,
|
||||
segmentation: {
|
||||
separator: unescape(segmentIdentifier),
|
||||
max_tokens: max,
|
||||
chunk_overlap: overlap,
|
||||
},
|
||||
}
|
||||
processRule.rules = ruleObj
|
||||
}
|
||||
return processRule
|
||||
}
|
||||
|
||||
const previewScrollHandle = (e: Event) => {
|
||||
if ((e.target as HTMLDivElement).scrollTop > 0)
|
||||
setPreviewScrolled(true)
|
||||
const fileIndexingEstimateQuery = useFetchFileIndexingEstimateForFile({
|
||||
docForm: docForm as DocForm,
|
||||
docLanguage,
|
||||
dataSourceType: DataSourceType.FILE,
|
||||
files,
|
||||
indexingTechnique: getIndexing_technique() as any,
|
||||
processRule: getProcessRule(),
|
||||
dataset_id: datasetId!,
|
||||
})
|
||||
const notionIndexingEstimateQuery = useFetchFileIndexingEstimateForNotion({
|
||||
docForm: docForm as DocForm,
|
||||
docLanguage,
|
||||
dataSourceType: DataSourceType.NOTION,
|
||||
notionPages,
|
||||
indexingTechnique: getIndexing_technique() as any,
|
||||
processRule: getProcessRule(),
|
||||
dataset_id: datasetId || '',
|
||||
})
|
||||
|
||||
else
|
||||
setPreviewScrolled(false)
|
||||
}
|
||||
const getFileName = (name: string) => {
|
||||
const arr = name.split('.')
|
||||
return arr.slice(0, -1).join('.')
|
||||
}
|
||||
const websiteIndexingEstimateQuery = useFetchFileIndexingEstimateForWeb({
|
||||
docForm: docForm as DocForm,
|
||||
docLanguage,
|
||||
dataSourceType: DataSourceType.WEB,
|
||||
websitePages,
|
||||
crawlOptions,
|
||||
websiteCrawlProvider,
|
||||
websiteCrawlJobId,
|
||||
indexingTechnique: getIndexing_technique() as any,
|
||||
processRule: getProcessRule(),
|
||||
dataset_id: datasetId || '',
|
||||
})
|
||||
|
||||
const fetchEstimate = useCallback(() => {
|
||||
if (dataSourceType === DataSourceType.FILE)
|
||||
fileIndexingEstimateQuery.mutate()
|
||||
|
||||
if (dataSourceType === DataSourceType.NOTION)
|
||||
notionIndexingEstimateQuery.mutate()
|
||||
|
||||
if (dataSourceType === DataSourceType.WEB)
|
||||
websiteIndexingEstimateQuery.mutate()
|
||||
}, [dataSourceType, fileIndexingEstimateQuery, notionIndexingEstimateQuery, websiteIndexingEstimateQuery])
|
||||
|
||||
const estimate
|
||||
= dataSourceType === DataSourceType.FILE
|
||||
? fileIndexingEstimateQuery.data
|
||||
: dataSourceType === DataSourceType.NOTION
|
||||
? notionIndexingEstimateQuery.data
|
||||
: websiteIndexingEstimateQuery.data
|
||||
|
||||
// const getIsEstimateReady = useCallback(() => {
|
||||
// if (dataSourceType === DataSourceType.FILE)
|
||||
// return fileIndexingEstimateQuery.isSuccess
|
||||
|
||||
// if (dataSourceType === DataSourceType.NOTION)
|
||||
// return notionIndexingEstimateQuery.isSuccess
|
||||
|
||||
// if (dataSourceType === DataSourceType.WEB)
|
||||
// return websiteIndexingEstimateQuery.isSuccess
|
||||
// }, [dataSourceType, fileIndexingEstimateQuery.isSuccess, notionIndexingEstimateQuery.isSuccess, websiteIndexingEstimateQuery.isSuccess])
|
||||
|
||||
// const getFileName = (name: string) => {
|
||||
// const arr = name.split('.')
|
||||
// return arr.slice(0, -1).join('.')
|
||||
// }
|
||||
|
||||
const getRuleName = (key: string) => {
|
||||
if (key === 'remove_extra_spaces')
|
||||
@ -248,129 +288,21 @@ const StepTwo = ({
|
||||
if (defaultConfig) {
|
||||
setSegmentIdentifier(defaultConfig.segmentation.separator)
|
||||
setMax(defaultConfig.segmentation.max_tokens)
|
||||
setOverlap(defaultConfig.segmentation.chunk_overlap)
|
||||
setOverlap(defaultConfig.segmentation.chunk_overlap!)
|
||||
setRules(defaultConfig.pre_processing_rules)
|
||||
}
|
||||
setParentChildConfig(defaultParentChildConfig)
|
||||
}
|
||||
|
||||
const fetchFileIndexingEstimate = async (docForm = DocForm.TEXT, language?: string) => {
|
||||
// eslint-disable-next-line @typescript-eslint/no-use-before-define
|
||||
const res = await didFetchFileIndexingEstimate(getFileIndexingEstimateParams(docForm, language)!)
|
||||
if (segmentationType === SegmentType.CUSTOM)
|
||||
setCustomFileIndexingEstimate(res)
|
||||
else
|
||||
setAutomaticFileIndexingEstimate(res)
|
||||
}
|
||||
|
||||
const confirmChangeCustomConfig = () => {
|
||||
const updatePreview = () => {
|
||||
if (segmentationType === SegmentType.CUSTOM && max > 4000) {
|
||||
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
|
||||
return
|
||||
}
|
||||
setCustomFileIndexingEstimate(null)
|
||||
setShowPreview()
|
||||
fetchFileIndexingEstimate()
|
||||
setPreviewSwitched(false)
|
||||
fetchEstimate()
|
||||
setQAPreviewSwitched(false)
|
||||
}
|
||||
|
||||
const getIndexing_technique = () => indexingType || indexType
|
||||
|
||||
const getProcessRule = () => {
|
||||
const processRule: ProcessRule = {
|
||||
rules: {} as any, // api will check this. It will be removed after api refactored.
|
||||
mode: segmentationType,
|
||||
}
|
||||
if (segmentationType === SegmentType.CUSTOM) {
|
||||
const ruleObj = {
|
||||
pre_processing_rules: rules,
|
||||
segmentation: {
|
||||
separator: unescape(segmentIdentifier),
|
||||
max_tokens: max,
|
||||
chunk_overlap: overlap,
|
||||
},
|
||||
}
|
||||
processRule.rules = ruleObj
|
||||
}
|
||||
return processRule
|
||||
}
|
||||
|
||||
const getNotionInfo = () => {
|
||||
const workspacesMap = groupBy(notionPages, 'workspace_id')
|
||||
const workspaces = Object.keys(workspacesMap).map((workspaceId) => {
|
||||
return {
|
||||
workspaceId,
|
||||
pages: workspacesMap[workspaceId],
|
||||
}
|
||||
})
|
||||
return workspaces.map((workspace) => {
|
||||
return {
|
||||
workspace_id: workspace.workspaceId,
|
||||
pages: workspace.pages.map((page) => {
|
||||
const { page_id, page_name, page_icon, type } = page
|
||||
return {
|
||||
page_id,
|
||||
page_name,
|
||||
page_icon,
|
||||
type,
|
||||
}
|
||||
}),
|
||||
}
|
||||
}) as NotionInfo[]
|
||||
}
|
||||
|
||||
const getWebsiteInfo = () => {
|
||||
return {
|
||||
provider: websiteCrawlProvider,
|
||||
job_id: websiteCrawlJobId,
|
||||
urls: websitePages.map(page => page.source_url),
|
||||
only_main_content: crawlOptions?.only_main_content,
|
||||
}
|
||||
}
|
||||
|
||||
const getFileIndexingEstimateParams = (docForm: DocForm, language?: string): IndexingEstimateParams | undefined => {
|
||||
if (dataSourceType === DataSourceType.FILE) {
|
||||
return {
|
||||
info_list: {
|
||||
data_source_type: dataSourceType,
|
||||
file_info_list: {
|
||||
file_ids: files.map(file => file.id) as string[],
|
||||
},
|
||||
},
|
||||
indexing_technique: getIndexing_technique() as string,
|
||||
process_rule: getProcessRule(),
|
||||
doc_form: docForm,
|
||||
doc_language: language || docLanguage,
|
||||
dataset_id: datasetId as string,
|
||||
}
|
||||
}
|
||||
if (dataSourceType === DataSourceType.NOTION) {
|
||||
return {
|
||||
info_list: {
|
||||
data_source_type: dataSourceType,
|
||||
notion_info_list: getNotionInfo(),
|
||||
},
|
||||
indexing_technique: getIndexing_technique() as string,
|
||||
process_rule: getProcessRule(),
|
||||
doc_form: docForm,
|
||||
doc_language: language || docLanguage,
|
||||
dataset_id: datasetId as string,
|
||||
}
|
||||
}
|
||||
if (dataSourceType === DataSourceType.WEB) {
|
||||
return {
|
||||
info_list: {
|
||||
data_source_type: dataSourceType,
|
||||
website_info_list: getWebsiteInfo(),
|
||||
},
|
||||
indexing_technique: getIndexing_technique() as string,
|
||||
process_rule: getProcessRule(),
|
||||
doc_form: docForm,
|
||||
doc_language: language || docLanguage,
|
||||
dataset_id: datasetId as string,
|
||||
}
|
||||
}
|
||||
}
|
||||
const {
|
||||
modelList: rerankModelList,
|
||||
defaultModel: rerankDefaultModel,
|
||||
@ -454,28 +386,35 @@ const StepTwo = ({
|
||||
}
|
||||
}
|
||||
if (dataSourceType === DataSourceType.NOTION)
|
||||
params.data_source.info_list.notion_info_list = getNotionInfo()
|
||||
params.data_source.info_list.notion_info_list = getNotionInfo(notionPages)
|
||||
|
||||
if (dataSourceType === DataSourceType.WEB)
|
||||
params.data_source.info_list.website_info_list = getWebsiteInfo()
|
||||
if (dataSourceType === DataSourceType.WEB) {
|
||||
params.data_source.info_list.website_info_list = getWebsiteInfo({
|
||||
websiteCrawlProvider,
|
||||
websiteCrawlJobId,
|
||||
websitePages,
|
||||
})
|
||||
}
|
||||
}
|
||||
return params
|
||||
}
|
||||
|
||||
const getRules = async () => {
|
||||
try {
|
||||
const res = await fetchDefaultProcessRule({ url: '/datasets/process-rule' })
|
||||
const separator = res.rules.segmentation.separator
|
||||
const fetchDefaultProcessRuleMutation = useFetchDefaultProcessRule({
|
||||
onSuccess(data) {
|
||||
const separator = data.rules.segmentation.separator
|
||||
setSegmentIdentifier(separator)
|
||||
setMax(res.rules.segmentation.max_tokens)
|
||||
setOverlap(res.rules.segmentation.chunk_overlap)
|
||||
setRules(res.rules.pre_processing_rules)
|
||||
setDefaultConfig(res.rules)
|
||||
}
|
||||
catch (err) {
|
||||
console.log(err)
|
||||
}
|
||||
}
|
||||
setMax(data.rules.segmentation.max_tokens)
|
||||
setOverlap(data.rules.segmentation.chunk_overlap!)
|
||||
setRules(data.rules.pre_processing_rules)
|
||||
setDefaultConfig(data.rules)
|
||||
},
|
||||
onError(error) {
|
||||
Toast.notify({
|
||||
type: 'error',
|
||||
message: `${error}`,
|
||||
})
|
||||
},
|
||||
})
|
||||
|
||||
const getRulesFromDetail = () => {
|
||||
if (documentDetail) {
|
||||
@ -485,7 +424,7 @@ const StepTwo = ({
|
||||
const overlap = rules.segmentation.chunk_overlap
|
||||
setSegmentIdentifier(separator)
|
||||
setMax(max)
|
||||
setOverlap(overlap)
|
||||
setOverlap(overlap as number)
|
||||
setRules(rules.pre_processing_rules)
|
||||
setDefaultConfig(rules)
|
||||
}
|
||||
@ -496,77 +435,75 @@ const StepTwo = ({
|
||||
setSegmentationType(documentDetail.dataset_process_rule.mode)
|
||||
}
|
||||
|
||||
const createFirstDocumentMutation = useCreateFirstDocument({
|
||||
onError(error) {
|
||||
Toast.notify({
|
||||
type: 'error',
|
||||
message: `${error}`,
|
||||
})
|
||||
},
|
||||
})
|
||||
const createDocumentMutation = useCreateDocument(datasetId!, {
|
||||
onError(error) {
|
||||
Toast.notify({
|
||||
type: 'error',
|
||||
message: `${error}`,
|
||||
})
|
||||
},
|
||||
})
|
||||
|
||||
const isCreating = createFirstDocumentMutation.isPending || createDocumentMutation.isPending
|
||||
|
||||
const createHandle = async () => {
|
||||
if (isCreating)
|
||||
return
|
||||
setIsCreating(true)
|
||||
try {
|
||||
let res
|
||||
const params = getCreationParams()
|
||||
if (!params)
|
||||
return false
|
||||
|
||||
setIsCreating(true)
|
||||
if (!datasetId) {
|
||||
res = await createFirstDocument({
|
||||
body: params as CreateDocumentReq,
|
||||
})
|
||||
await createFirstDocumentMutation.mutateAsync(
|
||||
params,
|
||||
{
|
||||
onSuccess(data) {
|
||||
updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
|
||||
updateResultCache && updateResultCache(res)
|
||||
updateResultCache && updateResultCache(data)
|
||||
// eslint-disable-next-line @typescript-eslint/no-use-before-define
|
||||
updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string)
|
||||
},
|
||||
},
|
||||
)
|
||||
}
|
||||
else {
|
||||
res = await createDocument({
|
||||
datasetId,
|
||||
body: params as CreateDocumentReq,
|
||||
})
|
||||
await createDocumentMutation.mutateAsync(params, {
|
||||
onSuccess(data) {
|
||||
updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
|
||||
updateResultCache && updateResultCache(res)
|
||||
updateResultCache && updateResultCache(data)
|
||||
},
|
||||
})
|
||||
}
|
||||
if (mutateDatasetRes)
|
||||
mutateDatasetRes()
|
||||
onStepChange && onStepChange(+1)
|
||||
isSetting && onSave && onSave()
|
||||
}
|
||||
catch (err) {
|
||||
Toast.notify({
|
||||
type: 'error',
|
||||
message: `${err}`,
|
||||
})
|
||||
}
|
||||
finally {
|
||||
setIsCreating(false)
|
||||
}
|
||||
}
|
||||
|
||||
const handleSwitch = (state: boolean) => {
|
||||
if (state)
|
||||
const handleDocformSwitch = (isQAMode: boolean) => {
|
||||
if (isQAMode)
|
||||
setDocForm(DocForm.QA)
|
||||
else
|
||||
setDocForm(DocForm.TEXT)
|
||||
}
|
||||
|
||||
const previewSwitch = async (language?: string) => {
|
||||
setPreviewSwitched(true)
|
||||
const previewSwitch = () => {
|
||||
setQAPreviewSwitched(true)
|
||||
setIsLanguageSelectDisabled(true)
|
||||
if (segmentationType === SegmentType.AUTO)
|
||||
setAutomaticFileIndexingEstimate(null)
|
||||
else
|
||||
setCustomFileIndexingEstimate(null)
|
||||
try {
|
||||
await fetchFileIndexingEstimate(DocForm.QA, language)
|
||||
}
|
||||
finally {
|
||||
setIsLanguageSelectDisabled(false)
|
||||
}
|
||||
fetchEstimate()
|
||||
}
|
||||
|
||||
const handleSelect = (language: string) => {
|
||||
setDocLanguage(language)
|
||||
// Switch language, re-cutter
|
||||
if (docForm === DocForm.QA && previewSwitched)
|
||||
previewSwitch(language)
|
||||
if (docForm === DocForm.QA && qaPreviewSwitched)
|
||||
previewSwitch()
|
||||
}
|
||||
|
||||
const changeToEconomicalType = () => {
|
||||
@ -579,7 +516,7 @@ const StepTwo = ({
|
||||
useEffect(() => {
|
||||
// fetch rules
|
||||
if (!isSetting) {
|
||||
getRules()
|
||||
fetchDefaultProcessRuleMutation.mutate('/datasets/process-rule')
|
||||
}
|
||||
else {
|
||||
getRulesFromDetail()
|
||||
@ -587,22 +524,6 @@ const StepTwo = ({
|
||||
}
|
||||
}, [])
|
||||
|
||||
useEffect(() => {
|
||||
scrollRef.current?.addEventListener('scroll', scrollHandle)
|
||||
return () => {
|
||||
scrollRef.current?.removeEventListener('scroll', scrollHandle)
|
||||
}
|
||||
}, [])
|
||||
|
||||
useLayoutEffect(() => {
|
||||
if (showPreview) {
|
||||
previewScrollRef.current?.addEventListener('scroll', previewScrollHandle)
|
||||
return () => {
|
||||
previewScrollRef.current?.removeEventListener('scroll', previewScrollHandle)
|
||||
}
|
||||
}
|
||||
}, [showPreview])
|
||||
|
||||
useEffect(() => {
|
||||
if (indexingType === IndexingType.ECONOMICAL && docForm === DocForm.QA)
|
||||
setDocForm(DocForm.TEXT)
|
||||
@ -617,20 +538,6 @@ const StepTwo = ({
|
||||
setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
|
||||
}, [isAPIKeySet, indexingType, datasetId])
|
||||
|
||||
useEffect(() => {
|
||||
if (segmentationType === SegmentType.AUTO) {
|
||||
setAutomaticFileIndexingEstimate(null)
|
||||
!isMobile && setShowPreview()
|
||||
fetchFileIndexingEstimate()
|
||||
setPreviewSwitched(false)
|
||||
}
|
||||
else {
|
||||
hidePreview()
|
||||
setCustomFileIndexingEstimate(null)
|
||||
setPreviewSwitched(false)
|
||||
}
|
||||
}, [segmentationType, indexType])
|
||||
|
||||
const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
|
||||
search_method: RETRIEVE_METHOD.semantic,
|
||||
reranking_enable: false,
|
||||
@ -659,7 +566,7 @@ const StepTwo = ({
|
||||
onClick={() => setSegmentationType(SegmentType.AUTO)}
|
||||
actions={
|
||||
<>
|
||||
<Button variant={'secondary-accent'}>
|
||||
<Button variant={'secondary-accent'} onClick={() => updatePreview()}>
|
||||
<RiSearchEyeLine className='h-4 w-4 mr-1.5' />
|
||||
{t('datasetCreation.stepTwo.previewChunk')}
|
||||
</Button>
|
||||
@ -714,7 +621,7 @@ const StepTwo = ({
|
||||
onClick={() => setSegmentationType(SegmentType.CUSTOM)}
|
||||
actions={
|
||||
<>
|
||||
<Button variant={'secondary-accent'}>
|
||||
<Button variant={'secondary-accent'} onClick={() => updatePreview()}>
|
||||
<RiSearchEyeLine className='h-4 w-4 mr-1.5' />
|
||||
{t('datasetCreation.stepTwo.previewChunk')}
|
||||
</Button>
|
||||
@ -910,7 +817,7 @@ const StepTwo = ({
|
||||
</div>
|
||||
<Switch
|
||||
defaultValue={docForm === DocForm.QA}
|
||||
onChange={handleSwitch}
|
||||
onChange={handleDocformSwitch}
|
||||
size='md'
|
||||
/>
|
||||
</div>
|
||||
@ -1000,70 +907,40 @@ const StepTwo = ({
|
||||
</div>
|
||||
</div>
|
||||
<FloatRightContainer isMobile={isMobile} isOpen={true} onClose={() => { }} footer={null}>
|
||||
{showPreview && <div
|
||||
ref={previewScrollRef}
|
||||
className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll border-l border-[#F2F4F7]')}
|
||||
<PreviewContainer
|
||||
header={<PreviewHeader
|
||||
title='Preview'
|
||||
>
|
||||
<div className={cn(s.previewHeader, previewScrolled && `${s.fixed} pb-3`)}>
|
||||
<div className='flex items-center justify-between px-8'>
|
||||
<div className='grow flex items-center'>
|
||||
<div>{t('datasetCreation.stepTwo.previewTitle')}</div>
|
||||
{docForm === DocForm.QA && !previewSwitched && (
|
||||
<Button className='ml-2' variant='secondary-accent' onClick={() => previewSwitch()}>{t('datasetCreation.stepTwo.previewButton')}</Button>
|
||||
</PreviewHeader>}
|
||||
className={cn(s.previewWrap, isMobile && s.isMobile, 'relative h-full overflow-y-scroll space-y-4')}
|
||||
>
|
||||
{qaPreviewSwitched && docForm === DocForm.QA && estimate?.qa_preview && (
|
||||
estimate?.qa_preview.map(item => (
|
||||
<QAPreview key={item.question} qa={item} />
|
||||
))
|
||||
)}
|
||||
</div>
|
||||
<div className='flex items-center justify-center w-6 h-6 cursor-pointer' onClick={hidePreview}>
|
||||
<XMarkIcon className='h-4 w-4'></XMarkIcon>
|
||||
</div>
|
||||
</div>
|
||||
{docForm === DocForm.QA && !previewSwitched && (
|
||||
<div className='px-8 pr-12 text-xs text-gray-500'>
|
||||
<span>{t('datasetCreation.stepTwo.previewSwitchTipStart')}</span>
|
||||
<span className='text-amber-600'>{t('datasetCreation.stepTwo.previewSwitchTipEnd')}</span>
|
||||
</div>
|
||||
{(docForm === DocForm.TEXT || !qaPreviewSwitched) && estimate?.preview && (
|
||||
estimate?.preview.map((item, index) => (
|
||||
<ChunkContainer
|
||||
key={item}
|
||||
label={`Chunk-${index + 1}`}
|
||||
characterCount={item.length}
|
||||
>
|
||||
{item}
|
||||
</ChunkContainer>
|
||||
))
|
||||
)}
|
||||
</div>
|
||||
<div className='my-4 px-8 space-y-4'>
|
||||
{previewSwitched && docForm === DocForm.QA && fileIndexingEstimate?.qa_preview && (
|
||||
<>
|
||||
{fileIndexingEstimate?.qa_preview.map((item, index) => (
|
||||
<PreviewItem type={PreviewType.QA} key={item.question} qa={item} index={index + 1} />
|
||||
))}
|
||||
</>
|
||||
)}
|
||||
{(docForm === DocForm.TEXT || !previewSwitched) && fileIndexingEstimate?.preview && (
|
||||
<>
|
||||
{fileIndexingEstimate?.preview.map((item, index) => (
|
||||
<PreviewItem type={PreviewType.TEXT} key={item} content={item} index={index + 1} />
|
||||
))}
|
||||
</>
|
||||
)}
|
||||
{previewSwitched && docForm === DocForm.QA && !fileIndexingEstimate?.qa_preview && (
|
||||
{qaPreviewSwitched && docForm === DocForm.QA && !estimate?.qa_preview && (
|
||||
<div className='flex items-center justify-center h-[200px]'>
|
||||
<Loading type='area' />
|
||||
</div>
|
||||
)}
|
||||
{!previewSwitched && !fileIndexingEstimate?.preview && (
|
||||
{!qaPreviewSwitched && !estimate?.preview && (
|
||||
<div className='flex items-center justify-center h-[200px]'>
|
||||
<Loading type='area' />
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
</div>}
|
||||
{!showPreview && (
|
||||
<div className={cn(s.sideTip)}>
|
||||
<div className={s.tipCard}>
|
||||
<span className={s.icon} />
|
||||
<div className={s.title}>{t('datasetCreation.stepTwo.sideTipTitle')}</div>
|
||||
<div className={s.content}>
|
||||
<p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP1')}</p>
|
||||
<p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP2')}</p>
|
||||
<p className='mb-3'>{t('datasetCreation.stepTwo.sideTipP3')}</p>
|
||||
<p>{t('datasetCreation.stepTwo.sideTipP4')}</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
</PreviewContainer>
|
||||
</FloatRightContainer>
|
||||
</div>
|
||||
)
|
||||
|
@ -34,7 +34,7 @@ export const SliceContent: FC<SliceContentProps> = forwardRef((props, ref) => {
|
||||
const { className, children, ...rest } = props
|
||||
return <span {...rest} ref={ref} className={classNames(
|
||||
baseStyle,
|
||||
'px-1 bg-state-base-hover group-hover:bg-state-accent-hover-alt group-hover:text-text-primary',
|
||||
'px-1 bg-state-base-hover group-hover:bg-state-accent-hover-alt group-hover:text-text-primary leading-7',
|
||||
className,
|
||||
)}>
|
||||
{children}
|
||||
|
27
web/app/components/datasets/preview/container.tsx
Normal file
27
web/app/components/datasets/preview/container.tsx
Normal file
@ -0,0 +1,27 @@
|
||||
import type { ComponentProps, FC, ReactNode } from 'react'
|
||||
import { forwardRef } from 'react'
|
||||
import classNames from '@/utils/classnames'
|
||||
|
||||
/** Props of {@link PreviewContainer}: the native `div` props plus a required `header` slot. */
export type PreviewContainerProps = ComponentProps<'div'> & {
  /** Node rendered inside the `<header>` element, above the children. */
  header: ReactNode
}

/**
 * Panel wrapper that renders `header` inside a `<header>` element and
 * `children` inside a `<main>` element.
 *
 * Remaining props and the forwarded ref are applied to the outer `div`;
 * a caller-supplied `className` is appended after the built-in panel classes.
 */
export const PreviewContainer: FC<PreviewContainerProps> = forwardRef(
  ({ header, className, children, ...divProps }, ref) => {
    const panelClassName = classNames(
      'flex flex-col rounded-xl border-t-[0.5px] border-l-[0.5px] border-components-panel-border bg-background-default-lighter shadow shadow-shadow-shadow-5',
      className,
    )

    return (
      <div {...divProps} ref={ref} className={panelClassName}>
        <header className='py-4 pl-5 pr-3 border-b border-divider-subtle'>{header}</header>
        <main className='py-5 px-6'>{children}</main>
      </div>
    )
  },
)
PreviewContainer.displayName = 'PreviewContainer'
|
23
web/app/components/datasets/preview/header.tsx
Normal file
23
web/app/components/datasets/preview/header.tsx
Normal file
@ -0,0 +1,23 @@
|
||||
import type { ComponentProps, FC } from 'react'
|
||||
import classNames from '@/utils/classnames'
|
||||
|
||||
/**
 * Props of {@link PreviewHeader}: the native `div` props, with the native
 * `title` attribute replaced by a required string.
 */
export type PreviewHeaderProps = Omit<ComponentProps<'div'>, 'title'> & {
  /** Text rendered in the caption line above the children. */
  title: string
}

/**
 * Renders `title` as a small uppercase caption, followed by `children`.
 * All other props are spread onto the wrapping `div`.
 */
export const PreviewHeader: FC<PreviewHeaderProps> = ({ title, className, children, ...divProps }) => (
  <div {...divProps} className={classNames(className)}>
    <div className='text-text-accent text-2xs font-semibold leading-3 uppercase'>{title}</div>
    {children}
  </div>
)
|
0
web/app/components/datasets/preview/index.tsx
Normal file
0
web/app/components/datasets/preview/index.tsx
Normal file
@ -1,35 +1,76 @@
|
||||
'use client'
|
||||
|
||||
import { useState } from 'react'
|
||||
import { FormattedText } from '../components/datasets/formatted-text/formatted'
|
||||
import { PreviewSlice } from '../components/datasets/formatted-text/flavours/preview-slice'
|
||||
import { EditSlice } from '../components/datasets/formatted-text/flavours/edit-slice'
|
||||
import { PreviewContainer } from '../components/datasets/preview/container'
|
||||
import { PreviewHeader } from '../components/datasets/preview/header'
|
||||
import FileIcon from '../components/base/file-icon'
|
||||
import { ChevronDown } from '../components/base/icons/src/vender/solid/arrows'
|
||||
import Badge from '../components/base/badge'
|
||||
import { DividerWithLabel } from '../components/base/divider/with-label'
|
||||
import Button from '../components/base/button'
|
||||
import { ChunkContainer, QAPreview } from '../components/datasets/chunk'
|
||||
import classNames from '@/utils/classnames'
|
||||
|
||||
export default function Page() {
|
||||
const [parentChild, setParentChild] = useState(false)
|
||||
const [vertical, setVertical] = useState(false)
|
||||
const [qa, setQa] = useState(false)
|
||||
return <div className='p-4'>
|
||||
<FormattedText>
|
||||
<PreviewSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' tooltip={'Child-chunk-2 · 268 Characters'} />
|
||||
<PreviewSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' tooltip={'Child-chunk-2 · 268 Characters'} />
|
||||
<PreviewSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' tooltip={'Child-chunk-2 · 268 Characters'} />
|
||||
<PreviewSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' tooltip={'Child-chunk-2 · 268 Characters'} />
|
||||
<PreviewSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' tooltip={'Child-chunk-2 · 268 Characters'} />
|
||||
</FormattedText>
|
||||
|
||||
<div className='mt-12 flex flex-col gap-2'>
|
||||
<EditSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' onDelete={function (): void {
|
||||
console.log('onDelete')
|
||||
} } />
|
||||
<EditSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' onDelete={function (): void {
|
||||
console.log('onDelete')
|
||||
} } />
|
||||
<EditSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' onDelete={function (): void {
|
||||
console.log('onDelete')
|
||||
} } />
|
||||
<EditSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' onDelete={function (): void {
|
||||
console.log('onDelete')
|
||||
} } />
|
||||
<EditSlice label='C-1' text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' onDelete={function (): void {
|
||||
console.log('onDelete')
|
||||
} } />
|
||||
<div className='flex gap-2 my-4'>
|
||||
<Button onClick={() => setParentChild(!parentChild)}>
|
||||
Parent-Child
|
||||
</Button>
|
||||
<Button onClick={() => setVertical(!vertical)}>Vertical</Button>
|
||||
<Button onClick={() => setQa(!qa)}>QA</Button>
|
||||
</div>
|
||||
<PreviewContainer header={
|
||||
<PreviewHeader title='Preview'>
|
||||
<div className='flex items-center'>
|
||||
<FileIcon type='pdf' className='size-4' />
|
||||
<p
|
||||
className='text-text-primary text-sm font-semibold mx-1'
|
||||
>EOS R3 Tech Sheet.pdf</p>
|
||||
<ChevronDown className='size-[18px]' />
|
||||
<Badge text='276 Estimated chunks' className='ml-1' />
|
||||
</div>
|
||||
</PreviewHeader>
|
||||
}>
|
||||
<div className='space-y-6'>{parentChild
|
||||
? Array.from({ length: 4 }, (_, i) => {
|
||||
return <ChunkContainer
|
||||
label='Parent-Chunk-01'
|
||||
characterCount={521}
|
||||
key={i}
|
||||
>
|
||||
<FormattedText className={classNames(
|
||||
'w-full',
|
||||
vertical && 'flex flex-col gap-2',
|
||||
)}>
|
||||
{Array.from({ length: 4 }, (_, i) => {
|
||||
return <PreviewSlice
|
||||
key={i}
|
||||
label='C-1'
|
||||
text='lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.' tooltip={'Child-chunk-2 · 268 Characters'} />
|
||||
})}
|
||||
</FormattedText>
|
||||
</ChunkContainer>
|
||||
})
|
||||
: Array.from({ length: 2 }, (_, i) => {
|
||||
return <ChunkContainer label='Chunk-01' characterCount={521} key={i}>
|
||||
{
|
||||
qa
|
||||
? <QAPreview qa={{
|
||||
question: 'What is the author\'s unconventional approach to writing this book, and how does it challenge the traditional academic mindset of \'publish or perish\'?',
|
||||
answer: 'It is quite natural for academics who are continuously told to “publish or perish” to want to always create something from scratch that is their own fresh creation. This book is an experiment in not starting from scratch, but instead “re-mixing” the book titled Think Python: How to Think Like a Computer Scientist written by Allen B. Downey, Jeff Elkner and others.',
|
||||
}} />
|
||||
: 'In December of 2009, I was preparing to teach SI502 - Networked Programming at the University of Michigan for the fifth semester in a row and decided it was time to write a Python textbook that focused on exploring data instead of understanding algorithms and abstractions. My goal in SI502 is to teach people life-long data handling skills using Python. Few of my students were planning to be professional computer programmers. Instead, they planned be librarians, managers, lawyers, biologists, economists, etc. who happened to want to skillfully use technology in their chosen field.'
|
||||
}
|
||||
</ChunkContainer>
|
||||
})
|
||||
}</div>
|
||||
<DividerWithLabel label='Display previews of up to 10 paragraphs' />
|
||||
</PreviewContainer>
|
||||
</div>
|
||||
}
|
||||
|
@ -330,6 +330,7 @@ export type NotionPage = {
|
||||
}
|
||||
|
||||
/** Processing-rule description attached to dataset document requests. */
export type ProcessRule = {
  processRule: {
    pre_processing_rules: PreProcessingRule[]
    segmentation: {
      separator: string
      max_tokens: number
      chunk_overlap: number
    }
  }
  mode: string
  rules: Rules
}
|
||||
|
223
web/service/use-datasets.ts
Normal file
223
web/service/use-datasets.ts
Normal file
@ -0,0 +1,223 @@
|
||||
import groupBy from 'lodash-es/groupBy'
|
||||
import type { MutationOptions } from '@tanstack/react-query'
|
||||
import { useMutation } from '@tanstack/react-query'
|
||||
import { createDocument, createFirstDocument, fetchDefaultProcessRule, fetchFileIndexingEstimate } from './datasets'
|
||||
import { type IndexingType } from '@/app/components/datasets/create/step-two'
|
||||
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DataSourceType, DocForm, FileIndexingEstimateResponse, IndexingEstimateParams, NotionInfo, ProcessRule, ProcessRuleResponse, createDocumentResponse } from '@/models/datasets'
|
||||
import type { DataSourceProvider, NotionPage } from '@/models/common'
|
||||
|
||||
/**
 * Groups Notion pages by `workspace_id` and reduces each page to the
 * `page_id`, `page_name`, `page_icon` and `type` fields.
 *
 * @param notionPages - Pages to group.
 * @returns One entry per workspace, in the key order produced by `groupBy`.
 */
export const getNotionInfo = (
  notionPages: NotionPage[],
) => {
  const pagesByWorkspace = groupBy(notionPages, 'workspace_id')

  return Object.entries(pagesByWorkspace).map(([workspaceId, pages]) => ({
    workspace_id: workspaceId,
    pages: pages.map(({ page_id, page_name, page_icon, type }) => ({
      page_id,
      page_name,
      page_icon,
      type,
    })),
  })) as NotionInfo[]
}
|
||||
|
||||
/**
 * Builds the website crawl descriptor from the crawl provider, job id and
 * crawled pages.
 *
 * @param opts - Crawl provider, crawl job id, crawled pages and optional crawl options.
 * @returns An object with `provider`, `job_id`, the pages' `source_url` values as
 * `urls`, and `only_main_content` (`undefined` when no crawl options are given).
 */
export const getWebsiteInfo = (
  opts: {
    websiteCrawlProvider: DataSourceProvider
    websiteCrawlJobId: string
    websitePages: CrawlResultItem[]
    crawlOptions?: CrawlOptions
  },
) => {
  const urls = opts.websitePages.map(({ source_url }) => source_url)

  return {
    provider: opts.websiteCrawlProvider,
    job_id: opts.websiteCrawlJobId,
    urls,
    only_main_content: opts.crawlOptions?.only_main_content,
  }
}
|
||||
|
||||
/**
 * Options common to every indexing-estimate parameter builder in this module,
 * whatever the data source.
 */
type GetFileIndexingEstimateParamsOptionBase = {
  /** Sent as `dataset_id`. */
  dataset_id: string
  /** Sent as `doc_form`. */
  docForm: DocForm
  /** Sent as `doc_language`. */
  docLanguage: string
  /** Sent as `indexing_technique`. */
  indexingTechnique: IndexingType
  /** Sent as `process_rule`. */
  processRule: ProcessRule
}
|
||||
|
||||
/** Indexing-estimate options for uploaded files. */
type GetFileIndexingEstimateParamsOptionFile = GetFileIndexingEstimateParamsOptionBase & {
  dataSourceType: DataSourceType.FILE
  files: CustomFile[]
}

/**
 * Converts file-based options into the `IndexingEstimateParams` payload.
 * The `id` of each file is collected into `info_list.file_info_list.file_ids`.
 */
const getFileIndexingEstimateParamsForFile = (
  options: GetFileIndexingEstimateParamsOptionFile,
): IndexingEstimateParams => {
  const fileIds = options.files.map(({ id }) => id) as string[]

  return {
    info_list: {
      data_source_type: options.dataSourceType,
      file_info_list: {
        file_ids: fileIds,
      },
    },
    indexing_technique: options.indexingTechnique,
    process_rule: options.processRule,
    doc_form: options.docForm,
    doc_language: options.docLanguage,
    dataset_id: options.dataset_id,
  }
}
|
||||
|
||||
/**
 * Mutation hook that requests an indexing estimate for uploaded files.
 *
 * @param options - File-based estimate options, converted to request params on each run.
 * @param mutationOptions - Extra mutation options, spread after the built-in `mutationFn`.
 */
export const useFetchFileIndexingEstimateForFile = (
  options: GetFileIndexingEstimateParamsOptionFile,
  mutationOptions: MutationOptions<FileIndexingEstimateResponse> = {},
) => {
  const requestEstimate = async () => {
    const params = getFileIndexingEstimateParamsForFile(options)
    return fetchFileIndexingEstimate(params)
  }

  return useMutation({
    mutationFn: requestEstimate,
    ...mutationOptions,
  })
}
|
||||
|
||||
/** Indexing-estimate options for Notion pages. */
type GetFileIndexingEstimateParamsOptionNotion = GetFileIndexingEstimateParamsOptionBase & {
  dataSourceType: DataSourceType.NOTION
  notionPages: NotionPage[]
}

/**
 * Converts Notion-based options into the `IndexingEstimateParams` payload.
 * `info_list.notion_info_list` is the result of `getNotionInfo(notionPages)`.
 */
const getFileIndexingEstimateParamsForNotion = (
  options: GetFileIndexingEstimateParamsOptionNotion,
): IndexingEstimateParams => {
  const notionInfoList = getNotionInfo(options.notionPages)

  return {
    info_list: {
      data_source_type: options.dataSourceType,
      notion_info_list: notionInfoList,
    },
    indexing_technique: options.indexingTechnique,
    process_rule: options.processRule,
    doc_form: options.docForm,
    doc_language: options.docLanguage,
    dataset_id: options.dataset_id,
  }
}
|
||||
|
||||
/**
 * Mutation hook that requests an indexing estimate for Notion pages.
 *
 * @param options - Notion-based estimate options, converted to request params on each run.
 * @param mutationOptions - Extra mutation options, spread after the built-in `mutationFn`.
 */
export const useFetchFileIndexingEstimateForNotion = (
  options: GetFileIndexingEstimateParamsOptionNotion,
  mutationOptions: MutationOptions<FileIndexingEstimateResponse> = {},
) => {
  const requestEstimate = async () => {
    const params = getFileIndexingEstimateParamsForNotion(options)
    return fetchFileIndexingEstimate(params)
  }

  return useMutation({
    mutationFn: requestEstimate,
    ...mutationOptions,
  })
}
|
||||
|
||||
/** Indexing-estimate options for crawled web pages. */
type GetFileIndexingEstimateParamsOptionWeb = GetFileIndexingEstimateParamsOptionBase & {
  dataSourceType: DataSourceType.WEB
  websitePages: CrawlResultItem[]
  crawlOptions?: CrawlOptions
  websiteCrawlProvider: DataSourceProvider
  websiteCrawlJobId: string
}

/**
 * Converts website-based options into the `IndexingEstimateParams` payload.
 * `info_list.website_info_list` is the result of `getWebsiteInfo` for the crawl fields.
 */
const getFileIndexingEstimateParamsForWeb = (
  options: GetFileIndexingEstimateParamsOptionWeb,
): IndexingEstimateParams => {
  const { websiteCrawlProvider, websiteCrawlJobId, websitePages, crawlOptions } = options
  const websiteInfoList = getWebsiteInfo({
    websiteCrawlProvider,
    websiteCrawlJobId,
    websitePages,
    crawlOptions,
  })

  return {
    info_list: {
      data_source_type: options.dataSourceType,
      website_info_list: websiteInfoList,
    },
    indexing_technique: options.indexingTechnique,
    process_rule: options.processRule,
    doc_form: options.docForm,
    doc_language: options.docLanguage,
    dataset_id: options.dataset_id,
  }
}
|
||||
|
||||
/**
 * Mutation hook that requests an indexing estimate for crawled web pages.
 *
 * @param options - Website-based estimate options, converted to request params on each run.
 * @param mutationOptions - Extra mutation options, spread after the built-in `mutationFn`.
 */
export const useFetchFileIndexingEstimateForWeb = (
  options: GetFileIndexingEstimateParamsOptionWeb,
  mutationOptions: MutationOptions<FileIndexingEstimateResponse> = {},
) => {
  const requestEstimate = async () => {
    const params = getFileIndexingEstimateParamsForWeb(options)
    return fetchFileIndexingEstimate(params)
  }

  return useMutation({
    mutationFn: requestEstimate,
    ...mutationOptions,
  })
}
|
||||
|
||||
/**
 * Mutation hook wrapping `createFirstDocument`; the mutation variable is
 * sent as the request body.
 *
 * @param mutationOptions - Extra mutation options, spread after the built-in `mutationFn`.
 */
export const useCreateFirstDocument = (
  mutationOptions: MutationOptions<createDocumentResponse, Error, CreateDocumentReq> = {},
) => {
  const submit = async (createDocumentReq: CreateDocumentReq) => {
    return createFirstDocument({ body: createDocumentReq })
  }

  return useMutation({
    mutationFn: submit,
    ...mutationOptions,
  })
}
|
||||
|
||||
/**
 * Mutation hook wrapping `createDocument` for a fixed dataset; the mutation
 * variable is sent as the request body.
 *
 * @param datasetId - Dataset id passed to `createDocument` on every run.
 * @param mutationOptions - Extra mutation options, spread after the built-in `mutationFn`.
 */
export const useCreateDocument = (
  datasetId: string,
  mutationOptions: MutationOptions<createDocumentResponse, Error, CreateDocumentReq> = {},
) => {
  const submit = async (req: CreateDocumentReq) => {
    return createDocument({ datasetId, body: req })
  }

  return useMutation({
    mutationFn: submit,
    ...mutationOptions,
  })
}
|
||||
|
||||
/**
 * Mutation hook wrapping `fetchDefaultProcessRule`; the mutation variable is
 * the URL to request.
 *
 * @param mutationOptions - Extra mutation options, spread after the built-in `mutationFn`.
 */
export const useFetchDefaultProcessRule = (
  mutationOptions: MutationOptions<ProcessRuleResponse, Error, string> = {},
) => {
  const loadRule = async (url: string) => {
    return fetchDefaultProcessRule({ url })
  }

  return useMutation({
    mutationFn: loadRule,
    ...mutationOptions,
  })
}
|
Loading…
Reference in New Issue
Block a user