Merge branch 'feat/parent-child-retrieval' of https://github.com/langgenius/dify into feat/parent-child-retrieval

This commit is contained in:
twwu 2024-12-11 14:00:27 +08:00
commit 4017c65c1f
8 changed files with 190 additions and 17 deletions

View File

@ -96,6 +96,8 @@ export enum IndexingType {
}
// Fallback segment identifier applied by setSegmentIdentifier when it is given an empty value.
// The literal holds backslash-n sequences in escaped form, not actual newline characters.
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
// Initial value of the maxChunkLength state.
// NOTE(review): "MAXMIMUM" is a misspelling of "MAXIMUM"; renaming requires updating every usage together.
const DEFAULT_MAXMIMUM_CHUNK_LENGTH = 500
// Initial value of the overlap state.
const DEFAULT_OVERLAP = 50
type ParentChildConfig = {
chunkForContext: ParentMode
@ -155,9 +157,9 @@ const StepTwo = ({
const setSegmentIdentifier = useCallback((value: string) => {
doSetSegmentIdentifier(value ? escape(value) : DEFAULT_SEGMENT_IDENTIFIER)
}, [])
const [maxChunkLength, setMaxChunkLength] = useState(4000) // default chunk length
const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXMIMUM_CHUNK_LENGTH) // default chunk length
const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(4000)
const [overlap, setOverlap] = useState(50)
const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
const [rules, setRules] = useState<PreProcessingRule[]>([])
const [defaultConfig, setDefaultConfig] = useState<Rules>()
const hasSetIndexType = !!indexingType

View File

@ -32,7 +32,7 @@ export const DelimiterInput: FC<InputProps> = (props) => {
<Input
type="text"
className='h-9'
placeholder={t('datasetCreation.stepTwo.separatorPlaceholder') || ''}
placeholder={t('datasetCreation.stepTwo.separatorPlaceholder')!}
{...props}
/>
</FormField>
@ -46,7 +46,7 @@ export const MaxLengthInput: FC<InputNumberProps> = (props) => {
<InputNumber
type="number"
className='h-9'
placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
placeholder={'≤ 4000'}
max={4000}
min={1}
{...props}

View File

@ -539,7 +539,6 @@ const DocumentList: FC<IDocumentListProps> = ({
}}>
<td className='text-left align-middle text-text-tertiary text-xs'>
<div className='flex items-center' onClick={e => e.stopPropagation()}>
<Checkbox
className='shrink-0 mr-2'
checked={selectedIds.includes(doc.id)}

View File

@ -43,7 +43,7 @@ export const EditSlice: FC<EditSliceProps> = (props) => {
>
<SliceLabel
className={classNames(
isDestructive && '!bg-red-500 !text-text-primary-on-surface',
isDestructive && '!bg-state-destructive-solid !text-text-primary-on-surface',
)}
>
{label}

View File

@ -0,0 +1,137 @@
/**
 * Hard-coded sample hit-testing records.
 *
 * Contains three records, listed in descending `score` order, that all point at the
 * same uploaded document ('张家界介绍.pdf'). Each record carries a `segment` object plus
 * `child_chunks`, `score` and `tsne_position`; `child_chunks` and `tsne_position` are
 * null on every record.
 *
 * NOTE(review): this looks like fixture data for exercising the result list during
 * development — confirm it is not meant to be rendered in production.
 */
export const generalResultData = [
{
segment: {
id: 'b621b153-f8a7-4e85-bd3d-07feaf61bd9e',
position: 1,
document_id: '990c1ba7-a170-42ed-a71f-579e4875eaba',
content: '张家界森林覆盖率达 90%以上,生物多样性丰富。这里是许多珍稀动植物的栖息地,例\r\n如银杉、中华秋沙鸭等。清新的空气和丰富的负氧离子让它成为“ 天然氧吧”。\r\n历史背景\r\n1. 古代历史\r\n张家界地区在古代是土家族和苗族等少数民族的居住地历史可以追溯到新石器时代。\r\n这里曾是楚国的属地后来成为武陵山地区的重要组成部分。\r\n2. 近代发展\r\n张家界介绍\r\n张家界概述\r\n张家界位于中国湖南省西北部是中国知名的旅游胜地以独特的喀斯特地貌和壮美的\r\n自然风光闻名世界。它不仅是自然景观的瑰宝还蕴含了丰富的历史与人文底蕴。\r\n地理特色\r\n1. 地貌特征\r\n张家界以其石英砂岩峰林地貌而著称峰峦如刀劈斧削形态各异被誉为“ 天然山水\r\n画卷”。\r\n• 武陵源风景名胜区\r\n被列入联合国教科文组织世界自然遗产名录其中包括张家界国家森林公园、天子山、\r\n索溪峪等景区。',
answer: null,
word_count: 387,
tokens: 471,
keywords: [
'氧吧',
'丰富',
'90%',
'天子山',
'地貌',
'历史',
'张家界',
'索溪峪',
'天然',
'负氧离子',
],
index_node_id: '483fad87-3b7e-486d-afae-75e4f0b2f3dd',
index_node_hash: '61bb7556a32e3e09ed83f2de731c2ac2d669c598de6d85708e11f78817c882bb',
hit_count: 0,
enabled: true,
disabled_at: null,
disabled_by: null,
status: 'completed',
created_by: '6d8ad01f-edf9-43a6-b863-a034b1828ac7',
created_at: 1732605173,
indexing_at: 1732605173,
completed_at: 1732605177,
error: null,
stopped_at: null,
document: {
id: '990c1ba7-a170-42ed-a71f-579e4875eaba',
data_source_type: 'upload_file',
name: '张家界介绍.pdf',
doc_type: null,
},
},
child_chunks: null,
score: 0.8771945,
tsne_position: null,
},
{
segment: {
id: '0859a14d-697e-4703-b59d-2ff69a7a9795',
position: 5,
document_id: '990c1ba7-a170-42ed-a71f-579e4875eaba',
content: '茅岩河漂流和黄石寨徒步是体验张家界山水魅力的绝佳方式。\r\n总结\r\n张家界是集自然奇观与人文风情于一体的旅游胜地。无论是其独特的地貌景观还是浓\r\n郁的土家文化都展现了人与自然的和谐之美。这里的每一座山、每一片森林似乎都\r\n在诉说着古老的故事吸引着来自世界各地的游客流连忘返。',
answer: null,
word_count: 140,
tokens: 173,
keywords: [
'绝佳',
'徒步',
'人与自然',
'流连忘返',
'河漂流',
'之美',
'张家界',
'黄石寨',
'诉说着',
'茅岩',
],
index_node_id: '1d8e46bd-27ea-47fa-b8c4-87737bf2e021',
index_node_hash: '8ac318494724ac44120b2f9db397bb02186b456fff76f9f8b86156fb8a864999',
hit_count: 0,
enabled: true,
disabled_at: null,
disabled_by: null,
status: 'completed',
created_by: '6d8ad01f-edf9-43a6-b863-a034b1828ac7',
created_at: 1732605173,
indexing_at: 1732605173,
completed_at: 1732605177,
error: null,
stopped_at: null,
document: {
id: '990c1ba7-a170-42ed-a71f-579e4875eaba',
data_source_type: 'upload_file',
name: '张家界介绍.pdf',
doc_type: null,
},
},
child_chunks: null,
score: 0.8642928,
tsne_position: null,
},
{
segment: {
id: 'f5e63d62-984f-419f-a8ec-781e1280c739',
position: 4,
document_id: '990c1ba7-a170-42ed-a71f-579e4875eaba',
content: '葛粉汤\r\n一种用当地葛根制成的食品清热解毒深受游客喜爱。\r\n3. 艺术与传说\r\n张家界的山水常与中国传统文化和神话传说相结合例如天子山据说是土家族起义领袖',
answer: null,
word_count: 80,
tokens: 94,
keywords: [
'葛根',
'清热解毒',
'葛粉',
'天子山',
'起义领袖',
'深受',
'张家界',
'神话传说',
'土家族',
'山水',
],
index_node_id: '80f71f0d-6218-4160-8575-c59d58ac15e3',
index_node_hash: '155ad96a96b984d7058fdb377f98bd50158d58574b75bea0187c9e3af5680ad5',
hit_count: 0,
enabled: true,
disabled_at: null,
disabled_by: null,
status: 'completed',
created_by: '6d8ad01f-edf9-43a6-b863-a034b1828ac7',
created_at: 1732605173,
indexing_at: 1732605173,
completed_at: 1732605177,
error: null,
stopped_at: null,
document: {
id: '990c1ba7-a170-42ed-a71f-579e4875eaba',
data_source_type: 'upload_file',
name: '张家界介绍.pdf',
doc_type: null,
},
},
child_chunks: null,
score: 0.80618876,
tsne_position: null,
},
]

View File

@ -0,0 +1,33 @@
'use client'
import type { FC } from 'react'
import React from 'react'
import { useTranslation } from 'react-i18next'
import { SegmentIndexTag } from '../../documents/detail/completed'
import type { HitTesting } from '@/models/datasets'
import cn from '@/utils/classnames'
/** Props accepted by ResultItem. */
type Props = {
// Hit-testing record to render; ResultItem reads `position` and `word_count` from its `segment`.
payload: HitTesting
}
/**
 * Header row for one hit-testing result.
 *
 * Renders a SegmentIndexTag for the segment's position, a dot separator and the
 * segment's word count followed by the translated "characters" label. The score
 * slot on the right is still an empty placeholder.
 */
const ResultItem: FC<Props> = ({ payload }) => {
  const { t } = useTranslation()
  const { position: positionId, word_count: wordCount } = payload.segment
  const tagClassName = cn('w-fit group-hover:opacity-100')
  const charactersLabel = t('datasetDocuments.segment.characters')

  return (
    <div>
      <div className='flex justify-between items-center'>
        <div className='flex items-center space-x-2'>
          <SegmentIndexTag positionId={positionId} className={tagClassName} />
          <div className='text-xs font-medium text-text-quaternary'>·</div>
          <div className='system-xs-medium text-text-tertiary'>{wordCount} {charactersLabel}</div>
        </div>
        {/* Score */}
      </div>
    </div>
  )
}

// Memoized so the row is only re-rendered when its props change.
export default React.memo(ResultItem)

View File

@ -7,11 +7,11 @@ import { omit } from 'lodash-es'
import { useBoolean } from 'ahooks'
import { useContext } from 'use-context-selector'
import SegmentCard from '../documents/detail/completed/SegmentCard'
import docStyle from '../documents/detail/completed/style.module.css'
import Textarea from './textarea'
import s from './style.module.css'
import HitDetail from './hit-detail'
import ModifyRetrievalModal from './modify-retrieval-modal'
import { generalResultData } from './assets/test-data'
import cn from '@/utils/classnames'
import type { ExternalKnowledgeBaseHitTestingResponse, ExternalKnowledgeBaseHitTesting as ExternalKnowledgeBaseHitTestingType, HitTestingResponse, HitTesting as HitTestingType } from '@/models/datasets'
import Loading from '@/app/components/base/loading'
@ -24,7 +24,6 @@ import DatasetDetailContext from '@/context/dataset-detail'
import type { RetrievalConfig } from '@/types/app'
import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
import useTimestamp from '@/hooks/use-timestamp'
const limit = 10
type Props = {
@ -49,6 +48,7 @@ const HitTesting: FC<Props> = ({ datasetId }: Props) => {
const isMobile = media === MediaType.mobile
const [hitResult, setHitResult] = useState<HitTestingResponse | undefined>() // no initial value is passed, so this starts as undefined
// console.log(hitResult?.records)
const [externalHitResult, setExternalHitResult] = useState<ExternalKnowledgeBaseHitTestingResponse | undefined>()
const [submitLoading, setSubmitLoading] = useState(false)
const [currParagraph, setCurrParagraph] = useState<{ paraInfo?: HitTestingType; showModal: boolean }>({ showModal: false })
@ -77,7 +77,6 @@ const HitTesting: FC<Props> = ({ datasetId }: Props) => {
const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict as RetrievalConfig)
const [isShowModifyRetrievalModal, setIsShowModifyRetrievalModal] = useState(false)
const [isShowRightPanel, { setTrue: showRightPanel, setFalse: hideRightPanel, set: setShowRightPanel }] = useBoolean(!isMobile)
const renderHitResults = (results: any[], onClickCard: (record: any) => void) => (
<>
<div className='text-gray-600 font-semibold mb-4'>{t('datasetHitTesting.hit.title')}</div>
@ -87,7 +86,7 @@ const HitTesting: FC<Props> = ({ datasetId }: Props) => {
<SegmentCard
key={idx}
loading={false}
refSource= {{
refSource={{
title: record.title,
uri: record.metadata ? record.metadata['x-amz-bedrock-kb-source-uri'] : '',
}}
@ -106,12 +105,14 @@ const HitTesting: FC<Props> = ({ datasetId }: Props) => {
)
const renderEmptyState = () => (
<div className='h-full flex flex-col justify-center items-center'>
<div className={cn(docStyle.commonIcon, docStyle.targetIcon, '!bg-gray-200 !h-14 !w-14')} />
<div className='text-gray-300 text-[13px] mt-3'>
{t('datasetHitTesting.hit.emptyTip')}
</div>
</div>
// for test
<div></div>
// <div className='h-full flex flex-col justify-center items-center'>
// <div className={cn(docStyle.commonIcon, docStyle.targetIcon, '!bg-gray-200 !h-14 !w-14')} />
// <div className='text-gray-300 text-[13px] mt-3'>
// {t('datasetHitTesting.hit.emptyTip')}
// </div>
// </div>
)
useEffect(() => {
@ -190,6 +191,7 @@ const HitTesting: FC<Props> = ({ datasetId }: Props) => {
</div>
<FloatRightContainer panelClassname='!justify-start !overflow-y-auto' showClose isMobile={isMobile} isOpen={isShowRightPanel} onClose={hideRightPanel} footer={null}>
<div className={cn(s.rightDiv, 'p-0 sm:px-8 sm:pt-[42px] sm:pb-[26px]')}>
{renderHitResults(generalResultData, onClickCard)}
{submitLoading
? <div className={s.cardWrapper}>
<SegmentCard

View File

@ -21,7 +21,7 @@ const translation = {
},
action: {
uploadFile: 'Upload new file',
settings: 'Segment settings',
settings: 'Chunking Settings',
addButton: 'Add chunk',
add: 'Add a chunk',
batchAdd: 'Batch add',