text extractor tool
This commit is contained in:
parent
67b1190535
commit
bc7cc06572
@ -1,4 +1,6 @@
|
||||
import base64
|
||||
from pathlib import Path
|
||||
import tempfile
|
||||
|
||||
from configs import dify_config
|
||||
from core.file import file_repository
|
||||
@ -109,6 +111,38 @@ def _download_file_content(path: str, /):
|
||||
return data
|
||||
|
||||
|
||||
def download_to_target_path(f: File, temp_dir: str, /):
    """
    Download a stored file into ``temp_dir`` and return the local path.

    The storage key is looked up through ``file_repository`` according to the
    file's transfer method; the local file keeps the key's extension so that
    downstream consumers can still recognise the file type from its name.

    Args:
        f (File): The file to download. Only ``TOOL_FILE`` and ``LOCAL_FILE``
            transfer methods are supported.
        temp_dir (str): An existing directory in which the file is created.

    Returns:
        str: The path of the downloaded file inside ``temp_dir``.

    Raises:
        ValueError: If the file's transfer method is not supported.
    """
    # Function-scope import keeps this block self-contained; uuid replaces the
    # private, undocumented tempfile._get_candidate_names() helper.
    import uuid

    if f.transfer_method == FileTransferMethod.TOOL_FILE:
        tool_file = file_repository.get_tool_file(session=db.session(), file=f)
        storage_key = tool_file.file_key
    elif f.transfer_method == FileTransferMethod.LOCAL_FILE:
        upload_file = file_repository.get_upload_file(session=db.session(), file=f)
        storage_key = upload_file.key
    else:
        raise ValueError(f"Unsupported transfer method: {f.transfer_method}")

    # Random, collision-resistant file name that preserves the original suffix.
    suffix = Path(storage_key).suffix
    target_path = str(Path(temp_dir) / f"{uuid.uuid4().hex}{suffix}")
    _download_file_to_target_path(storage_key, target_path)
    return target_path
|
||||
|
||||
|
||||
def _download_file_to_target_path(path: str, target_path: str, /) -> None:
    """
    Download a file from storage to a local target path.

    This is a thin wrapper that forwards both arguments to
    ``storage.download``; it returns nothing and performs no validation of
    its own.

    Args:
        path (str): The path (key) of the file in storage.
        target_path (str): The local path the file should be written to.
    """
    # NOTE(review): presumably writes the stored object to target_path and
    # propagates any storage error unchanged -- confirm against storage.download.
    storage.download(path, target_path)
|
||||
|
||||
|
||||
def _get_encoded_string(f: File, /):
|
||||
match f.transfer_method:
|
||||
case FileTransferMethod.REMOTE_URL:
|
||||
|
Binary file not shown.
Before Width: | Height: | Size: 153 KiB |
@ -1,20 +0,0 @@
|
||||
from typing import Any
|
||||
|
||||
from core.tools.errors import ToolProviderCredentialValidationError
|
||||
from core.tools.provider.builtin.dalle.tools.dalle2 import DallE2Tool
|
||||
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
|
||||
|
||||
|
||||
class DALLEProvider(BuiltinToolProviderController):
    """Built-in tool provider for the DALL-E image tools."""

    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
        """
        Validate provider credentials by performing a real DALL-E 2 invocation.

        Args:
            credentials: The provider credentials to check; they are passed
                to the tool as its runtime credentials.

        Raises:
            ToolProviderCredentialValidationError: If the probe invocation
                fails for any reason. The original exception is attached as
                the cause.
        """
        probe_tool = DallE2Tool().fork_tool_runtime(
            runtime={
                "credentials": credentials,
            }
        )
        try:
            # A minimal one-image request is enough to prove the credentials work.
            probe_tool.invoke(
                user_id="",
                tool_parameters={"prompt": "cute girl, blue eyes, white hair, anime style", "size": "small", "n": 1},
            )
        except Exception as e:
            # Chain explicitly so the underlying failure stays visible in tracebacks.
            raise ToolProviderCredentialValidationError(str(e)) from e
|
@ -1,61 +0,0 @@
|
||||
identity:
|
||||
author: Dify
|
||||
name: dalle
|
||||
label:
|
||||
en_US: DALL-E
|
||||
zh_Hans: DALL-E 绘画
|
||||
pt_BR: DALL-E
|
||||
description:
|
||||
en_US: DALL-E art
|
||||
zh_Hans: DALL-E 绘画
|
||||
pt_BR: DALL-E art
|
||||
icon: icon.png
|
||||
tags:
|
||||
- image
|
||||
- productivity
|
||||
credentials_for_provider:
|
||||
openai_api_key:
|
||||
type: secret-input
|
||||
required: true
|
||||
label:
|
||||
en_US: OpenAI API key
|
||||
zh_Hans: OpenAI API key
|
||||
pt_BR: OpenAI API key
|
||||
help:
|
||||
en_US: Please input your OpenAI API key
|
||||
zh_Hans: 请输入你的 OpenAI API key
|
||||
pt_BR: Please input your OpenAI API key
|
||||
placeholder:
|
||||
en_US: Please input your OpenAI API key
|
||||
zh_Hans: 请输入你的 OpenAI API key
|
||||
pt_BR: Please input your OpenAI API key
|
||||
openai_organization_id:
|
||||
type: text-input
|
||||
required: false
|
||||
label:
|
||||
en_US: OpenAI organization ID
|
||||
zh_Hans: OpenAI organization ID
|
||||
pt_BR: OpenAI organization ID
|
||||
help:
|
||||
en_US: Please input your OpenAI organization ID
|
||||
zh_Hans: 请输入你的 OpenAI organization ID
|
||||
pt_BR: Please input your OpenAI organization ID
|
||||
placeholder:
|
||||
en_US: Please input your OpenAI organization ID
|
||||
zh_Hans: 请输入你的 OpenAI organization ID
|
||||
pt_BR: Please input your OpenAI organization ID
|
||||
openai_base_url:
|
||||
type: text-input
|
||||
required: false
|
||||
label:
|
||||
en_US: OpenAI base URL
|
||||
zh_Hans: OpenAI base URL
|
||||
pt_BR: OpenAI base URL
|
||||
help:
|
||||
en_US: Please input your OpenAI base URL
|
||||
zh_Hans: 请输入你的 OpenAI base URL
|
||||
pt_BR: Please input your OpenAI base URL
|
||||
placeholder:
|
||||
en_US: Please input your OpenAI base URL
|
||||
zh_Hans: 请输入你的 OpenAI base URL
|
||||
pt_BR: Please input your OpenAI base URL
|
@ -1,32 +0,0 @@
|
||||
from base64 import b64decode
|
||||
from typing import Any, Union
|
||||
|
||||
from openai import OpenAI
|
||||
from yarl import URL
|
||||
from core.file.enums import FileType
|
||||
|
||||
from core.file.file_manager import download
|
||||
from core.tools.entities.tool_entities import ToolInvokeMessage
|
||||
from core.tools.errors import ToolParameterValidationError
|
||||
from core.tools.tool.builtin_tool import BuiltinTool
|
||||
|
||||
|
||||
class FileExtractorTool(BuiltinTool):
    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """
        Invoke the tool on the file supplied in ``tool_parameters["file"]``.

        Raises:
            ToolParameterValidationError: If no file is provided, or if the
                provided file's type is not ``FileType.DOCUMENT``.
        """
        # The input file is read from the "file" tool parameter.
        file = tool_parameters.get("file")
        if file and file.type != FileType.DOCUMENT:
            raise ToolParameterValidationError("Not a valid document")

        if file:
            # NOTE(review): file_binary is assigned but never used afterwards.
            file_binary = download(file)
        else:
            raise ToolParameterValidationError("Please provide either file")
        # NOTE(review): `result` is not assigned anywhere in the visible code,
        # so reaching this line would raise NameError -- confirm the intended
        # return value.
        return result
|
BIN
api/core/tools/provider/builtin/file_extractor/_assets/icon.png
Normal file
BIN
api/core/tools/provider/builtin/file_extractor/_assets/icon.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 4.3 KiB |
@ -0,0 +1,10 @@
|
||||
from typing import Any
|
||||
|
||||
from core.tools.errors import ToolProviderCredentialValidationError
|
||||
from core.tools.provider.builtin.dalle.tools.dalle2 import DallE2Tool
|
||||
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
|
||||
|
||||
|
||||
class FileExtractorProvider(BuiltinToolProviderController):
    """Built-in tool provider for the file extractor tool."""

    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
        """Accept any credentials; this provider performs no credential check."""
        return None
|
@ -0,0 +1,15 @@
|
||||
identity:
|
||||
author: Jyong
|
||||
name: file_extractor
|
||||
label:
|
||||
en_US: File Extractor
|
||||
zh_Hans: 文件提取
|
||||
pt_BR: File Extractor
|
||||
description:
|
||||
en_US: Extract text from file
|
||||
zh_Hans: 从文件中提取文本
|
||||
pt_BR: Extract text from file
|
||||
icon: icon.png
|
||||
tags:
|
||||
- utilities
|
||||
- productivity
|
@ -0,0 +1,47 @@
|
||||
from base64 import b64decode
|
||||
import tempfile
|
||||
from typing import Any, Union
|
||||
|
||||
from openai import OpenAI
|
||||
from yarl import URL
|
||||
from core.file.enums import FileType
|
||||
|
||||
from core.file.file_manager import download_to_target_path
|
||||
from core.rag.extractor.text_extractor import TextExtractor
|
||||
from core.rag.splitter.fixed_text_splitter import FixedRecursiveCharacterTextSplitter
|
||||
from core.tools.entities.tool_entities import ToolInvokeMessage
|
||||
from core.tools.errors import ToolParameterValidationError
|
||||
from core.tools.tool.builtin_tool import BuiltinTool
|
||||
|
||||
|
||||
class FileExtractorTool(BuiltinTool):
    """Tool that extracts the text of a document file and splits it into chunks."""

    def _invoke(
        self,
        user_id: str,
        tool_parameters: dict[str, Any],
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """
        Extract text from the supplied document and return it as JSON chunks.

        Args:
            user_id: The id of the invoking user (not used by this tool).
            tool_parameters: Tool inputs. Recognised keys:
                ``text_file`` -- the document to extract (required);
                ``max_token`` -- chunk size, defaults to 500;
                ``separator`` -- fixed separator used for splitting,
                defaults to a blank line.

        Returns:
            A JSON message whose payload is the JSON-encoded list of chunk texts.

        Raises:
            ToolParameterValidationError: If no file is provided, or if the
                provided file's type is not ``FileType.DOCUMENT``.
        """
        # Function-scope import: json is used below but is not among the
        # module-level imports visible for this file.
        import json

        file = tool_parameters.get("text_file")
        if not file:
            raise ToolParameterValidationError("Please provide either file")
        if file.type != FileType.DOCUMENT:
            raise ToolParameterValidationError("Not a valid document")

        # The download lives in a temporary directory that is removed on exit,
        # so extraction and splitting must finish inside the ``with`` block.
        with tempfile.TemporaryDirectory() as temp_dir:
            file_path = download_to_target_path(file, temp_dir)
            extractor = TextExtractor(file_path, autodetect_encoding=True)
            documents = extractor.extract()
            character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
                chunk_size=tool_parameters.get("max_token", 500),
                chunk_overlap=0,
                fixed_separator=tool_parameters.get("separator", "\n\n"),
                separators=["\n\n", "。", ". ", " ", ""],
                embedding_model_instance=None,
            )
            chunks = character_splitter.split_documents(documents)
            # NOTE(review): create_json_message receives a JSON string here,
            # as in the original -- confirm it does not expect a plain object.
            return self.create_json_message(json.dumps([chunk.page_content for chunk in chunks]))
|
@ -24,7 +24,7 @@ parameters:
|
||||
zh_Hans: 要提取的 text 文档。
|
||||
llm_description: you should not input this parameter. just input the image_id.
|
||||
form: llm
|
||||
- name: separators
|
||||
- name: separator
|
||||
type: string
|
||||
required: false
|
||||
label:
|
Loading…
Reference in New Issue
Block a user