This commit is contained in:
Joel 2024-10-28 16:33:02 +08:00
parent 966d42a4db
commit 0cec6195a3
9 changed files with 42 additions and 84 deletions

View File

@ -10,10 +10,8 @@ class AliYuqueTool:
@staticmethod @staticmethod
def auth(token): def auth(token):
session = requests.Session() session = requests.Session()
session.headers.update( session.headers.update({"Accept": "application/json", "X-Auth-Token": token})
{"Accept": "application/json", "X-Auth-Token": token}) login = session.request("GET", AliYuqueTool.server_url + "/api/v2/user")
login = session.request(
"GET", AliYuqueTool.server_url + "/api/v2/user")
login.raise_for_status() login.raise_for_status()
resp = login.json() resp = login.json()
return resp return resp
@ -22,12 +20,10 @@ class AliYuqueTool:
if not token: if not token:
raise Exception("token is required") raise Exception("token is required")
session = requests.Session() session = requests.Session()
session.headers.update( session.headers.update({"accept": "application/json", "X-Auth-Token": token})
{"accept": "application/json", "X-Auth-Token": token})
new_params = {**tool_parameters} new_params = {**tool_parameters}
replacements = {k: v for k, v in new_params.items() replacements = {k: v for k, v in new_params.items() if f"{{{k}}}" in path}
if f"{{{k}}}" in path}
for key, value in replacements.items(): for key, value in replacements.items():
path = path.replace(f"{{{key}}}", str(value)) path = path.replace(f"{{{key}}}", str(value))
@ -39,10 +35,8 @@ class AliYuqueTool:
"Content-Type": "application/json", "Content-Type": "application/json",
} }
) )
response = session.request( response = session.request(method.upper(), self.server_url + path, json=new_params)
method.upper(), self.server_url + path, json=new_params)
else: else:
response = session.request( response = session.request(method, self.server_url + path, params=new_params)
method, self.server_url + path, params=new_params)
response.raise_for_status() response.raise_for_status()
return response.text return response.text

View File

@ -13,6 +13,5 @@ class AliYuqueDeleteDocumentTool(AliYuqueTool, BuiltinTool):
if not token: if not token:
raise Exception("token is required") raise Exception("token is required")
return self.create_text_message( return self.create_text_message(
self.request("DELETE", token, tool_parameters, self.request("DELETE", token, tool_parameters, "/api/v2/repos/{book_id}/docs/{id}")
"/api/v2/repos/{book_id}/docs/{id}")
) )

View File

@ -13,6 +13,5 @@ class AliYuqueDescribeBookIndexPageTool(AliYuqueTool, BuiltinTool):
if not token: if not token:
raise Exception("token is required") raise Exception("token is required")
return self.create_text_message( return self.create_text_message(
self.request("GET", token, tool_parameters, self.request("GET", token, tool_parameters, "/api/v2/repos/{group_login}/{book_slug}/index_page")
"/api/v2/repos/{group_login}/{book_slug}/index_page")
) )

View File

@ -33,16 +33,14 @@ class AliYuqueDescribeDocumentContentTool(AliYuqueTool, BuiltinTool):
new_params["group_login"] = group_id new_params["group_login"] = group_id
new_params["book_slug"] = book_slug new_params["book_slug"] = book_slug
index_page = json.loads( index_page = json.loads(
self.request("GET", token, new_params, self.request("GET", token, new_params, "/api/v2/repos/{group_login}/{book_slug}/index_page")
"/api/v2/repos/{group_login}/{book_slug}/index_page")
) )
book_id = index_page.get("data", {}).get("book", {}).get("id") book_id = index_page.get("data", {}).get("book", {}).get("id")
if not book_id: if not book_id:
raise Exception(f"can not parse book_id from {index_page}") raise Exception(f"can not parse book_id from {index_page}")
new_params["book_id"] = book_id new_params["book_id"] = book_id
new_params["id"] = doc_id new_params["id"] = doc_id
data = self.request("GET", token, new_params, data = self.request("GET", token, new_params, "/api/v2/repos/{book_id}/docs/{id}")
"/api/v2/repos/{book_id}/docs/{id}")
data = json.loads(data) data = json.loads(data)
body_only = tool_parameters.get("body_only") or "" body_only = tool_parameters.get("body_only") or ""
if body_only.lower() == "true": if body_only.lower() == "true":

View File

@ -13,6 +13,5 @@ class AliYuqueDescribeDocumentsTool(AliYuqueTool, BuiltinTool):
if not token: if not token:
raise Exception("token is required") raise Exception("token is required")
return self.create_text_message( return self.create_text_message(
self.request("GET", token, tool_parameters, self.request("GET", token, tool_parameters, "/api/v2/repos/{book_id}/docs/{id}")
"/api/v2/repos/{book_id}/docs/{id}")
) )

View File

@ -13,6 +13,5 @@ class AliYuqueUpdateDocumentTool(AliYuqueTool, BuiltinTool):
if not token: if not token:
raise Exception("token is required") raise Exception("token is required")
return self.create_text_message( return self.create_text_message(
self.request("PUT", token, tool_parameters, self.request("PUT", token, tool_parameters, "/api/v2/repos/{book_id}/docs/{id}")
"/api/v2/repos/{book_id}/docs/{id}")
) )

View File

@ -35,8 +35,7 @@ class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]):
def _run(self): def _run(self):
variable_selector = self.node_data.variable_selector variable_selector = self.node_data.variable_selector
variable = self.graph_runtime_state.variable_pool.get( variable = self.graph_runtime_state.variable_pool.get(variable_selector)
variable_selector)
if variable is None: if variable is None:
error_message = f"File variable not found for selector: {variable_selector}" error_message = f"File variable not found for selector: {variable_selector}"
@ -47,8 +46,7 @@ class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]):
value = variable.value value = variable.value
inputs = {"variable_selector": variable_selector} inputs = {"variable_selector": variable_selector}
process_data = {"documents": value if isinstance(value, list) else [ process_data = {"documents": value if isinstance(value, list) else [value]}
value]}
try: try:
if isinstance(value, list): if isinstance(value, list):
@ -68,8 +66,7 @@ class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]):
outputs={"text": extracted_text}, outputs={"text": extracted_text},
) )
else: else:
raise DocumentExtractorError( raise DocumentExtractorError(f"Unsupported variable type: {type(value)}")
f"Unsupported variable type: {type(value)}")
except DocumentExtractorError as e: except DocumentExtractorError as e:
return NodeRunResult( return NodeRunResult(
status=WorkflowNodeExecutionStatus.FAILED, status=WorkflowNodeExecutionStatus.FAILED,
@ -105,8 +102,7 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
case "application/json": case "application/json":
return _extract_text_from_json(file_content) return _extract_text_from_json(file_content)
case _: case _:
raise UnsupportedFileTypeError( raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
f"Unsupported MIME type: {mime_type}")
def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) -> str: def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) -> str:
@ -135,8 +131,7 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
case ".msg": case ".msg":
return _extract_text_from_msg(file_content) return _extract_text_from_msg(file_content)
case _: case _:
raise UnsupportedFileTypeError( raise UnsupportedFileTypeError(f"Unsupported Extension Type: {file_extension}")
f"Unsupported Extension Type: {file_extension}")
def _extract_text_from_plain_text(file_content: bytes) -> str: def _extract_text_from_plain_text(file_content: bytes) -> str:
@ -151,8 +146,7 @@ def _extract_text_from_json(file_content: bytes) -> str:
json_data = json.loads(file_content.decode("utf-8")) json_data = json.loads(file_content.decode("utf-8"))
return json.dumps(json_data, indent=2, ensure_ascii=False) return json.dumps(json_data, indent=2, ensure_ascii=False)
except (UnicodeDecodeError, json.JSONDecodeError) as e: except (UnicodeDecodeError, json.JSONDecodeError) as e:
raise TextExtractionError( raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
f"Failed to decode or parse JSON file: {e}") from e
def _extract_text_from_pdf(file_content: bytes) -> str: def _extract_text_from_pdf(file_content: bytes) -> str:
@ -167,8 +161,7 @@ def _extract_text_from_pdf(file_content: bytes) -> str:
page.close() page.close()
return text return text
except Exception as e: except Exception as e:
raise TextExtractionError( raise TextExtractionError(f"Failed to extract text from PDF: {str(e)}") from e
f"Failed to extract text from PDF: {str(e)}") from e
def _extract_text_from_doc(file_content: bytes) -> str: def _extract_text_from_doc(file_content: bytes) -> str:
@ -177,8 +170,7 @@ def _extract_text_from_doc(file_content: bytes) -> str:
doc = docx.Document(doc_file) doc = docx.Document(doc_file)
return "\n".join([paragraph.text for paragraph in doc.paragraphs]) return "\n".join([paragraph.text for paragraph in doc.paragraphs])
except Exception as e: except Exception as e:
raise TextExtractionError( raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e
f"Failed to extract text from DOC/DOCX: {str(e)}") from e
def _download_file_content(file: File) -> bytes: def _download_file_content(file: File) -> bytes:
@ -193,8 +185,7 @@ def _download_file_content(file: File) -> bytes:
elif file.transfer_method == FileTransferMethod.LOCAL_FILE: elif file.transfer_method == FileTransferMethod.LOCAL_FILE:
return file_manager.download(file) return file_manager.download(file)
else: else:
raise ValueError( raise ValueError(f"Unsupported transfer method: {file.transfer_method}")
f"Unsupported transfer method: {file.transfer_method}")
except Exception as e: except Exception as e:
raise FileDownloadError(f"Error downloading file: {str(e)}") from e raise FileDownloadError(f"Error downloading file: {str(e)}") from e
@ -202,14 +193,11 @@ def _download_file_content(file: File) -> bytes:
def _extract_text_from_file(file: File): def _extract_text_from_file(file: File):
file_content = _download_file_content(file) file_content = _download_file_content(file)
if file.extension: if file.extension:
extracted_text = _extract_text_by_file_extension( extracted_text = _extract_text_by_file_extension(file_content=file_content, file_extension=file.extension)
file_content=file_content, file_extension=file.extension)
elif file.mime_type: elif file.mime_type:
extracted_text = _extract_text_by_mime_type( extracted_text = _extract_text_by_mime_type(file_content=file_content, mime_type=file.mime_type)
file_content=file_content, mime_type=file.mime_type)
else: else:
raise UnsupportedFileTypeError( raise UnsupportedFileTypeError("Unable to determine file type: MIME type or file extension is missing")
"Unable to determine file type: MIME type or file extension is missing")
return extracted_text return extracted_text
@ -230,8 +218,7 @@ def _extract_text_from_csv(file_content: bytes) -> str:
return markdown_table.strip() return markdown_table.strip()
except Exception as e: except Exception as e:
raise TextExtractionError( raise TextExtractionError(f"Failed to extract text from CSV: {str(e)}") from e
f"Failed to extract text from CSV: {str(e)}") from e
def _extract_text_from_excel(file_content: bytes) -> str: def _extract_text_from_excel(file_content: bytes) -> str:
@ -247,8 +234,7 @@ def _extract_text_from_excel(file_content: bytes) -> str:
markdown_table = df.to_markdown(index=False) markdown_table = df.to_markdown(index=False)
return markdown_table return markdown_table
except Exception as e: except Exception as e:
raise TextExtractionError( raise TextExtractionError(f"Failed to extract text from Excel file: {str(e)}") from e
f"Failed to extract text from Excel file: {str(e)}") from e
def _extract_text_from_ppt(file_content: bytes) -> str: def _extract_text_from_ppt(file_content: bytes) -> str:
@ -257,8 +243,7 @@ def _extract_text_from_ppt(file_content: bytes) -> str:
elements = partition_ppt(file=file) elements = partition_ppt(file=file)
return "\n".join([getattr(element, "text", "") for element in elements]) return "\n".join([getattr(element, "text", "") for element in elements])
except Exception as e: except Exception as e:
raise TextExtractionError( raise TextExtractionError(f"Failed to extract text from PPT: {str(e)}") from e
f"Failed to extract text from PPT: {str(e)}") from e
def _extract_text_from_pptx(file_content: bytes) -> str: def _extract_text_from_pptx(file_content: bytes) -> str:
@ -267,8 +252,7 @@ def _extract_text_from_pptx(file_content: bytes) -> str:
elements = partition_pptx(file=file) elements = partition_pptx(file=file)
return "\n".join([getattr(element, "text", "") for element in elements]) return "\n".join([getattr(element, "text", "") for element in elements])
except Exception as e: except Exception as e:
raise TextExtractionError( raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e
f"Failed to extract text from PPTX: {str(e)}") from e
def _extract_text_from_epub(file_content: bytes) -> str: def _extract_text_from_epub(file_content: bytes) -> str:
@ -277,8 +261,7 @@ def _extract_text_from_epub(file_content: bytes) -> str:
elements = partition_epub(file=file) elements = partition_epub(file=file)
return "\n".join([str(element) for element in elements]) return "\n".join([str(element) for element in elements])
except Exception as e: except Exception as e:
raise TextExtractionError( raise TextExtractionError(f"Failed to extract text from EPUB: {str(e)}") from e
f"Failed to extract text from EPUB: {str(e)}") from e
def _extract_text_from_eml(file_content: bytes) -> str: def _extract_text_from_eml(file_content: bytes) -> str:
@ -287,8 +270,7 @@ def _extract_text_from_eml(file_content: bytes) -> str:
elements = partition_email(file=file) elements = partition_email(file=file)
return "\n".join([str(element) for element in elements]) return "\n".join([str(element) for element in elements])
except Exception as e: except Exception as e:
raise TextExtractionError( raise TextExtractionError(f"Failed to extract text from EML: {str(e)}") from e
f"Failed to extract text from EML: {str(e)}") from e
def _extract_text_from_msg(file_content: bytes) -> str: def _extract_text_from_msg(file_content: bytes) -> str:
@ -297,5 +279,4 @@ def _extract_text_from_msg(file_content: bytes) -> str:
elements = partition_msg(file=file) elements = partition_msg(file=file)
return "\n".join([str(element) for element in elements]) return "\n".join([str(element) for element in elements])
except Exception as e: except Exception as e:
raise TextExtractionError( raise TextExtractionError(f"Failed to extract text from MSG: {str(e)}") from e
f"Failed to extract text from MSG: {str(e)}") from e

View File

@ -65,8 +65,7 @@ def test_run_invalid_variable_type(document_extractor_node, mock_graph_runtime_s
@pytest.mark.parametrize( @pytest.mark.parametrize(
("mime_type", "file_content", "expected_text", "transfer_method", "extension"), ("mime_type", "file_content", "expected_text", "transfer_method", "extension"),
[ [
("text/plain", b"Hello, world!", ("text/plain", b"Hello, world!", ["Hello, world!"], FileTransferMethod.LOCAL_FILE, ".txt"),
["Hello, world!"], FileTransferMethod.LOCAL_FILE, ".txt"),
( (
"application/pdf", "application/pdf",
b"%PDF-1.5\n%Test PDF content", b"%PDF-1.5\n%Test PDF content",
@ -81,8 +80,7 @@ def test_run_invalid_variable_type(document_extractor_node, mock_graph_runtime_s
FileTransferMethod.REMOTE_URL, FileTransferMethod.REMOTE_URL,
"", "",
), ),
("text/plain", b"Remote content", ("text/plain", b"Remote content", ["Remote content"], FileTransferMethod.REMOTE_URL, None),
["Remote content"], FileTransferMethod.REMOTE_URL, None),
], ],
) )
def test_run_extract_text( def test_run_extract_text(
@ -119,12 +117,10 @@ def test_run_extract_text(
if mime_type == "application/pdf": if mime_type == "application/pdf":
mock_pdf_extract = Mock(return_value=expected_text[0]) mock_pdf_extract = Mock(return_value=expected_text[0])
monkeypatch.setattr( monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_pdf", mock_pdf_extract)
"core.workflow.nodes.document_extractor.node._extract_text_from_pdf", mock_pdf_extract)
elif mime_type.startswith("application/vnd.openxmlformats"): elif mime_type.startswith("application/vnd.openxmlformats"):
mock_docx_extract = Mock(return_value=expected_text[0]) mock_docx_extract = Mock(return_value=expected_text[0])
monkeypatch.setattr( monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_doc", mock_docx_extract)
"core.workflow.nodes.document_extractor.node._extract_text_from_doc", mock_docx_extract)
result = document_extractor_node._run() result = document_extractor_node._run()
@ -134,8 +130,7 @@ def test_run_extract_text(
assert result.outputs["text"] == expected_text assert result.outputs["text"] == expected_text
if transfer_method == FileTransferMethod.REMOTE_URL: if transfer_method == FileTransferMethod.REMOTE_URL:
mock_ssrf_proxy_get.assert_called_once_with( mock_ssrf_proxy_get.assert_called_once_with("https://example.com/file.txt")
"https://example.com/file.txt")
elif transfer_method == FileTransferMethod.LOCAL_FILE: elif transfer_method == FileTransferMethod.LOCAL_FILE:
mock_download.assert_called_once_with(mock_file) mock_download.assert_called_once_with(mock_file)

View File

@ -77,18 +77,12 @@ MOCK = os.getenv("MOCK_SWITCH", "false").lower() == "true"
@pytest.fixture @pytest.fixture
def setup_volcengine_tos_mock(monkeypatch: MonkeyPatch): def setup_volcengine_tos_mock(monkeypatch: MonkeyPatch):
if MOCK: if MOCK:
monkeypatch.setattr(TosClientV2, "__init__", monkeypatch.setattr(TosClientV2, "__init__", MockVolcengineTosClass.__init__)
MockVolcengineTosClass.__init__) monkeypatch.setattr(TosClientV2, "put_object", MockVolcengineTosClass.put_object)
monkeypatch.setattr(TosClientV2, "put_object", monkeypatch.setattr(TosClientV2, "get_object", MockVolcengineTosClass.get_object)
MockVolcengineTosClass.put_object) monkeypatch.setattr(TosClientV2, "get_object_to_file", MockVolcengineTosClass.get_object_to_file)
monkeypatch.setattr(TosClientV2, "get_object", monkeypatch.setattr(TosClientV2, "head_object", MockVolcengineTosClass.head_object)
MockVolcengineTosClass.get_object) monkeypatch.setattr(TosClientV2, "delete_object", MockVolcengineTosClass.delete_object)
monkeypatch.setattr(TosClientV2, "get_object_to_file",
MockVolcengineTosClass.get_object_to_file)
monkeypatch.setattr(TosClientV2, "head_object",
MockVolcengineTosClass.head_object)
monkeypatch.setattr(TosClientV2, "delete_object",
MockVolcengineTosClass.delete_object)
yield yield