commit af20411441
Eric, 2025-03-21 13:32:41 +08:00 (committed by GitHub)
2 changed files with 49 additions and 32 deletions


@@ -55,14 +55,18 @@ class PdfExtractor(BaseExtractor):
         import pypdfium2  # type: ignore
 
         with blob.as_bytes_io() as file_path:
-            pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
             try:
+                pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
                 for page_number, page in enumerate(pdf_reader):
-                    text_page = page.get_textpage()
-                    content = text_page.get_text_range()
-                    text_page.close()
-                    page.close()
-                    metadata = {"source": blob.source, "page": page_number}
-                    yield Document(page_content=content, metadata=metadata)
+                    try:
+                        text_page = page.get_textpage()
+                        try:
+                            content = text_page.get_text_range()
+                            metadata = {"source": blob.source, "page": page_number}
+                            yield Document(page_content=content, metadata=metadata)
+                        finally:
+                            text_page.close()
+                    finally:
+                        page.close()
             finally:
                 pdf_reader.close()
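The new nesting releases each pypdfium2 handle in reverse order of acquisition, even if extracting a single page fails or the generator is abandoned early. A minimal standalone sketch of the same pattern, assuming only pypdfium2 and a hypothetical local file path in place of the extractor's blob and Document handling:

import pypdfium2  # type: ignore

def iter_pdf_text(path: str):
    pdf = pypdfium2.PdfDocument(path)
    try:
        for page_number, page in enumerate(pdf):
            try:
                text_page = page.get_textpage()
                try:
                    # Yielding inside the try block means the close() calls below
                    # still run if the consumer raises or the generator is closed early.
                    yield page_number, text_page.get_text_range()
                finally:
                    text_page.close()
            finally:
                page.close()
    finally:
        pdf.close()

for number, text in iter_pdf_text("sample.pdf"):  # hypothetical path
    print(number, len(text))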


@@ -266,21 +266,26 @@ class Executor:
         return headers
 
     def _validate_and_parse_response(self, response: httpx.Response) -> Response:
-        executor_response = Response(response)
+        try:
+            executor_response = Response(response)
 
-        threshold_size = (
-            dify_config.HTTP_REQUEST_NODE_MAX_BINARY_SIZE
-            if executor_response.is_file
-            else dify_config.HTTP_REQUEST_NODE_MAX_TEXT_SIZE
-        )
-        if executor_response.size > threshold_size:
-            raise ResponseSizeError(
-                f"{'File' if executor_response.is_file else 'Text'} size is too large,"
-                f" max size is {threshold_size / 1024 / 1024:.2f} MB,"
-                f" but current size is {executor_response.readable_size}."
-            )
+            threshold_size = (
+                dify_config.HTTP_REQUEST_NODE_MAX_BINARY_SIZE
+                if executor_response.is_file
+                else dify_config.HTTP_REQUEST_NODE_MAX_TEXT_SIZE
+            )
+            if executor_response.size > threshold_size:
+                raise ResponseSizeError(
+                    f"{'File' if executor_response.is_file else 'Text'} size is too large,"
+                    f" max size is {threshold_size / 1024 / 1024:.2f} MB,"
+                    f" but current size is {executor_response.readable_size}."
+                )
 
-        return executor_response
+            return executor_response
+        except Exception as e:
+            # Ensure response is closed when an exception occurs
+            response.close()
+            raise e
 
     def _do_http_request(self, headers: dict[str, Any]) -> httpx.Response:
         """
@@ -316,21 +321,29 @@ class Executor:
             "follow_redirects": True,
             "max_retries": self.max_retries,
         }
         # request_args = {k: v for k, v in request_args.items() if v is not None}
-        try:
-            response = getattr(ssrf_proxy, self.method.lower())(**request_args)
-        except (ssrf_proxy.MaxRetriesExceededError, httpx.RequestError) as e:
-            raise HttpRequestNodeError(str(e))
-        # FIXME: fix type ignore, this maybe httpx type issue
-        return response  # type: ignore
+        # Use with statement to ensure proper resource cleanup
+        with httpx.Client() as client:
+            try:
+                response = getattr(client, self.method.lower())(**request_args)
+                # Create a new Response object and copy required data
+                # This allows safe closure of the original response
+                copied_response = response.copy()
+                response.close()
+                return copied_response
+            except (ssrf_proxy.MaxRetriesExceededError, httpx.RequestError) as e:
+                raise HttpRequestNodeError(str(e))
 
     def invoke(self) -> Response:
-        # assemble headers
-        headers = self._assembling_headers()
-        # do http request
-        response = self._do_http_request(headers)
-        # validate response
-        return self._validate_and_parse_response(response)
+        response = None
+        try:
+            headers = self._assembling_headers()
+            response = self._do_http_request(headers)
+            return self._validate_and_parse_response(response)
+        except Exception as e:
+            if response is not None:
+                response.close()
+            raise e
 
     def to_log(self):
         url_parts = urlparse(self.url)
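All three Executor changes share one goal: an httpx response that will not reach the caller gets closed, whether the failure happens while handing it out of the client, while validating its size, or part-way through invoke(). A compact sketch of that flow, not the node's actual implementation: it uses plain httpx against a hypothetical URL instead of the ssrf_proxy wrapper, a made-up size limit in place of the dify_config thresholds, and reading the body inside the client context as a stand-in for the copy step in the diff:

import httpx

MAX_BODY_BYTES = 10 * 1024 * 1024  # hypothetical limit, not the dify_config value

def do_http_request(method: str, url: str) -> httpx.Response:
    # The client is closed by the with-block whether or not the request fails.
    with httpx.Client() as client:
        response = client.request(method, url, timeout=10.0)
        response.read()  # load the body before the client is closed
        return response

def validate_response(response: httpx.Response) -> httpx.Response:
    try:
        if len(response.content) > MAX_BODY_BYTES:
            raise ValueError(f"Body is {len(response.content)} bytes, limit is {MAX_BODY_BYTES}.")
        return response
    except Exception:
        # Mirror the new except branch: close the response before propagating.
        response.close()
        raise

def invoke(url: str) -> httpx.Response:
    response = None
    try:
        response = do_http_request("GET", url)
        return validate_response(response)
    except Exception:
        # Mirror invoke(): close the response if anything failed after it was created.
        if response is not None:
            response.close()
        raise

result = invoke("https://example.com/")  # hypothetical URL
print(result.status_code, len(result.content))

The bare raise here and the commit's raise e both re-raise the same exception object; the relevant point is that close() has already run by the time it propagates.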