text extractor tool

This commit is contained in:
jyong 2024-11-04 14:52:02 +08:00
parent 67b1190535
commit bc7cc06572
11 changed files with 107 additions and 114 deletions

View File

@ -1,4 +1,6 @@
import base64
from pathlib import Path
import tempfile
from configs import dify_config
from core.file import file_repository
@ -109,6 +111,38 @@ def _download_file_content(path: str, /):
return data
def download_to_target_path(f: File, temp_dir: str, /):
    """
    Download a file from storage into ``temp_dir`` and return the local path.

    The local copy gets a randomly generated file name and keeps the suffix of
    the storage key it was downloaded from.

    Args:
        f (File): The file to download. Only the TOOL_FILE and LOCAL_FILE
            transfer methods are supported.
        temp_dir (str): Directory in which the local copy is created.

    Returns:
        str: Path of the downloaded file inside ``temp_dir``.

    Raises:
        ValueError: If the file uses any other transfer method.
    """
    # Local import keeps this block self-contained; uuid replaces the private
    # tempfile._get_candidate_names() helper, which is not part of the public API.
    import uuid

    # Resolve the storage key first so the download logic is written only once.
    if f.transfer_method == FileTransferMethod.TOOL_FILE:
        storage_key = file_repository.get_tool_file(session=db.session(), file=f).file_key
    elif f.transfer_method == FileTransferMethod.LOCAL_FILE:
        storage_key = file_repository.get_upload_file(session=db.session(), file=f).key
    else:
        raise ValueError(f"Unsupported transfer method: {f.transfer_method}")

    suffix = Path(storage_key).suffix
    target_path = str(Path(temp_dir) / f"{uuid.uuid4().hex}{suffix}")
    _download_file_to_target_path(storage_key, target_path)
    return target_path
def _download_file_to_target_path(path: str, target_path: str, /) -> None:
    """
    Download a file from storage to a local path.

    This is a thin wrapper around ``storage.download``; nothing is returned,
    the result of the call is the file written at ``target_path``.

    Args:
        path (str): The path to the file in storage.
        target_path (str): The local path the file is downloaded to.
    """
    storage.download(path, target_path)
def _get_encoded_string(f: File, /):
match f.transfer_method:
case FileTransferMethod.REMOTE_URL:

Binary file not shown.

Before

Width:  |  Height:  |  Size: 153 KiB

View File

@ -1,20 +0,0 @@
from typing import Any
from core.tools.errors import ToolProviderCredentialValidationError
from core.tools.provider.builtin.dalle.tools.dalle2 import DallE2Tool
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
class DALLEProvider(BuiltinToolProviderController):
    """Built-in tool provider for DALL-E."""

    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
        """
        Validate the credentials by invoking the DALL-E 2 tool once with them.

        Args:
            credentials: Provider credentials handed to the tool runtime.

        Raises:
            ToolProviderCredentialValidationError: If the test invocation fails
                for any reason; the original exception is kept as the cause.
        """
        try:
            DallE2Tool().fork_tool_runtime(
                runtime={
                    "credentials": credentials,
                }
            ).invoke(
                user_id="",
                tool_parameters={"prompt": "cute girl, blue eyes, white hair, anime style", "size": "small", "n": 1},
            )
        except Exception as e:
            # Chain the cause so the underlying failure stays visible in tracebacks.
            raise ToolProviderCredentialValidationError(str(e)) from e

View File

@ -1,61 +0,0 @@
identity:
author: Dify
name: dalle
label:
en_US: DALL-E
zh_Hans: DALL-E 绘画
pt_BR: DALL-E
description:
en_US: DALL-E art
zh_Hans: DALL-E 绘画
pt_BR: DALL-E art
icon: icon.png
tags:
- image
- productivity
credentials_for_provider:
openai_api_key:
type: secret-input
required: true
label:
en_US: OpenAI API key
zh_Hans: OpenAI API key
pt_BR: OpenAI API key
help:
en_US: Please input your OpenAI API key
zh_Hans: 请输入你的 OpenAI API key
pt_BR: Please input your OpenAI API key
placeholder:
en_US: Please input your OpenAI API key
zh_Hans: 请输入你的 OpenAI API key
pt_BR: Please input your OpenAI API key
openai_organization_id:
type: text-input
required: false
label:
en_US: OpenAI organization ID
zh_Hans: OpenAI organization ID
pt_BR: OpenAI organization ID
help:
en_US: Please input your OpenAI organization ID
zh_Hans: 请输入你的 OpenAI organization ID
pt_BR: Please input your OpenAI organization ID
placeholder:
en_US: Please input your OpenAI organization ID
zh_Hans: 请输入你的 OpenAI organization ID
pt_BR: Please input your OpenAI organization ID
openai_base_url:
type: text-input
required: false
label:
en_US: OpenAI base URL
zh_Hans: OpenAI base URL
pt_BR: OpenAI base URL
help:
en_US: Please input your OpenAI base URL
zh_Hans: 请输入你的 OpenAI base URL
pt_BR: Please input your OpenAI base URL
placeholder:
en_US: Please input your OpenAI base URL
zh_Hans: 请输入你的 OpenAI base URL
pt_BR: Please input your OpenAI base URL

View File

@ -1,32 +0,0 @@
from base64 import b64decode
from typing import Any, Union
from openai import OpenAI
from yarl import URL
from core.file.enums import FileType
from core.file.file_manager import download
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.errors import ToolParameterValidationError
from core.tools.tool.builtin_tool import BuiltinTool
class FileExtractorTool(BuiltinTool):
def _invoke(
self,
user_id: str,
tool_parameters: dict[str, Any],
) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
"""
Validate the "file" tool parameter and download its content.

Raises:
ToolParameterValidationError: If no file is provided, or the provided
file is not of type FileType.DOCUMENT.
"""
# The "file" parameter must be a document; any other file type is rejected.
file = tool_parameters.get("file")
if file and file.type != FileType.DOCUMENT:
raise ToolParameterValidationError("Not a valid document")
if file:
file_binary = download(file)
else:
raise ToolParameterValidationError("Please provide either file")
# NOTE(review): `result` is never assigned in this method and `file_binary`
# is never used, so reaching this line raises NameError. The intended return
# value cannot be determined from this block - confirm before reuse.
return result

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.3 KiB

View File

@ -0,0 +1,10 @@
from typing import Any
from core.tools.errors import ToolProviderCredentialValidationError
from core.tools.provider.builtin.dalle.tools.dalle2 import DallE2Tool
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
class FileExtractorProvider(BuiltinToolProviderController):
    """Provider for the file extractor tool."""

    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
        """Accept the given credentials as-is; no validation is performed."""
        return None

View File

@ -0,0 +1,15 @@
identity:
author: Jyong
name: file_extractor
label:
en_US: File Extractor
zh_Hans: 文件提取
pt_BR: File Extractor
description:
en_US: Extract text from file
zh_Hans: 从文件中提取文本
pt_BR: Extract text from file
icon: icon.png
tags:
- utilities
- productivity

View File

@ -0,0 +1,47 @@
from base64 import b64decode
import tempfile
from typing import Any, Union
from openai import OpenAI
from yarl import URL
from core.file.enums import FileType
from core.file.file_manager import download_to_target_path
from core.rag.extractor.text_extractor import TextExtractor
from core.rag.splitter.fixed_text_splitter import FixedRecursiveCharacterTextSplitter
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.errors import ToolParameterValidationError
from core.tools.tool.builtin_tool import BuiltinTool
class FileExtractorTool(BuiltinTool):
    """Tool that extracts the text of a document file and splits it into chunks."""

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """
        Extract text from the file in ``text_file`` and return it as chunks.

        The file is downloaded into a temporary directory, read with
        ``TextExtractor`` and split with ``FixedRecursiveCharacterTextSplitter``.

        Args:
            user_id: Id of the invoking user (not used by this tool).
            tool_parameters: Reads ``text_file`` (required, must be a document),
                ``max_token`` (chunk size, defaults to 500) and ``separator``
                (fixed separator, defaults to a blank line).

        Returns:
            A JSON message holding the JSON-encoded list of chunk texts.

        Raises:
            ToolParameterValidationError: If no file is provided, or the file
                is not of type FileType.DOCUMENT.
        """
        # Imported locally: the module's import block does not bring in json,
        # so the json.dumps call below would otherwise raise NameError.
        import json

        file = tool_parameters.get("text_file")
        if not file:
            raise ToolParameterValidationError("Please provide either file")
        if file.type != FileType.DOCUMENT:
            raise ToolParameterValidationError("Not a valid document")

        # `or` also covers optional parameters that are present but None/empty.
        chunk_size = tool_parameters.get("max_token") or 500
        fixed_separator = tool_parameters.get("separator") or "\n\n"

        with tempfile.TemporaryDirectory() as temp_dir:
            file_path = download_to_target_path(file, temp_dir)
            documents = TextExtractor(file_path, autodetect_encoding=True).extract()
            character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
                chunk_size=chunk_size,
                chunk_overlap=0,
                fixed_separator=fixed_separator,
                # Coarse-to-fine fallbacks: paragraph, CJK full stop, sentence,
                # word. The empty string is the last resort and has to stay last;
                # NOTE(review): an empty separator earlier in the list presumably
                # stops the recursive fallback before ". " and " " are tried -
                # confirm against the splitter implementation.
                separators=["\n\n", "。", ". ", " ", ""],
                embedding_model_instance=None,
            )
            chunks = character_splitter.split_documents(documents)
            return self.create_json_message(json.dumps([chunk.page_content for chunk in chunks]))

View File

@ -24,7 +24,7 @@ parameters:
zh_Hans: 要提取的 text 文档。
llm_description: you should not input this parameter. just input the image_id.
form: llm
- name: separators
- name: separator
type: string
required: false
label: