dify/api/core/rag/rerank/rerank.py

100 lines
3.4 KiB
Python
Raw Normal View History

2024-05-07 13:14:54 +08:00
import os
from typing import Optional
2024-05-07 13:14:54 +08:00
from flashrank import Ranker, RerankRequest
from flask import current_app
2024-06-06 17:47:14 +08:00
from rank_bm25 import BM25Okapi
2024-05-07 13:14:54 +08:00
from core.model_manager import ModelInstance
2024-06-06 17:47:14 +08:00
from core.rag.datasource.keyword.jieba.jieba_keyword_table_handler import JiebaKeywordTableHandler
from core.rag.models.document import Document
class RerankRunner:
    """Rerank retrieved documents against a query using a rerank model.

    The model instance supplied at construction time decides the returned
    ordering and scores. FlashRank and BM25 scores are computed only as
    optional debug diagnostics and never influence the returned documents.
    """

    def __init__(self, rerank_model_instance: ModelInstance) -> None:
        self.rerank_model_instance = rerank_model_instance

    def run(self, query: str, documents: list[Document], score_threshold: Optional[float] = None,
            top_n: Optional[int] = None, user: Optional[str] = None) -> list[Document]:
        """
        Run rerank model
        :param query: search query
        :param documents: documents for reranking
        :param score_threshold: score threshold
        :param top_n: top n
        :param user: unique user id if needed
        :return: one new Document per entry returned by the rerank model,
            carrying the source document's ids plus the rerank ``score``;
            an empty list when ``documents`` is empty
        """
        # Function-scope import so this block does not depend on a change to
        # the module's import section.
        import logging

        # Nothing to rerank: avoid calling the model (and the diagnostics,
        # which are not expected to handle an empty corpus).
        if not documents:
            return []

        # Drop duplicates by doc_id, keeping the first occurrence and the
        # original order. Assumes doc_id values are hashable (string ids).
        seen_doc_ids = set()
        unique_documents = []
        for document in documents:
            doc_id = document.metadata['doc_id']
            if doc_id not in seen_doc_ids:
                seen_doc_ids.add(doc_id)
                unique_documents.append(document)

        # Position i in `contents` corresponds to unique_documents[i]; the
        # rerank result indexes are resolved against that same ordering below.
        contents = [document.page_content for document in unique_documents]

        # The diagnostics load an extra model and tokenize every document, so
        # they only run when debug logging is actually enabled.
        logger = logging.getLogger(__name__)
        if logger.isEnabledFor(logging.DEBUG):
            self._log_flashrank_results(logger, query, contents)
            self._log_bm25_scores(logger, query, contents)

        rerank_result = self.rerank_model_instance.invoke_rerank(
            query=query,
            docs=contents,
            score_threshold=score_threshold,
            top_n=top_n,
            user=user
        )

        rerank_documents = []
        for result in rerank_result.docs:
            # format document
            source_metadata = unique_documents[result.index].metadata
            rerank_documents.append(
                Document(
                    page_content=result.text,
                    metadata={
                        "doc_id": source_metadata['doc_id'],
                        "doc_hash": source_metadata['doc_hash'],
                        "document_id": source_metadata['document_id'],
                        "dataset_id": source_metadata['dataset_id'],
                        'score': result.score
                    }
                )
            )

        return rerank_documents

    @staticmethod
    def _log_flashrank_results(logger, query: str, contents: list[str]) -> None:
        """Log the FlashRank "rank-T5-flan" ranking of ``contents`` at debug level."""
        folder = current_app.config.get('STORAGE_LOCAL_PATH')
        if folder is None:
            # os.path.isabs(None) would raise TypeError; a missing setting
            # should not break reranking for the sake of a diagnostic.
            logger.debug("STORAGE_LOCAL_PATH is not configured; skipping FlashRank diagnostics")
            return
        if not os.path.isabs(folder):
            folder = os.path.join(current_app.root_path, folder)

        # Passage ids are 1-based positions within `contents`.
        passages = [{'id': position, 'text': text} for position, text in enumerate(contents, start=1)]
        ranker = Ranker(model_name="rank-T5-flan", cache_dir=folder)
        results = ranker.rerank(RerankRequest(query=query, passages=passages))
        logger.debug("FlashRank results: %s", results)

    @staticmethod
    def _log_bm25_scores(logger, query: str, contents: list[str]) -> None:
        """Log the BM25 score of every entry in ``contents`` for ``query`` at debug level."""
        # Pre-processing: tokenize documents and query with the Jieba keyword handler.
        keyword_table_handler = JiebaKeywordTableHandler()
        tokenized_documents = [keyword_table_handler.extract_keywords(text, 20) for text in contents]
        tokenized_query = keyword_table_handler.extract_keywords(query, 20)

        # Build the BM25 index and score the query against each document.
        bm25 = BM25Okapi(tokenized_documents)
        doc_scores = bm25.get_scores(tokenized_query)
        for position, score in enumerate(doc_scores, start=1):
            logger.debug("Document %d: BM25 Score = %s", position, score)