Merge 97f7a29fec
into a30945312a
This commit is contained in:
commit
af20411441
@ -55,14 +55,18 @@ class PdfExtractor(BaseExtractor):
|
|||||||
import pypdfium2 # type: ignore
|
import pypdfium2 # type: ignore
|
||||||
|
|
||||||
with blob.as_bytes_io() as file_path:
|
with blob.as_bytes_io() as file_path:
|
||||||
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
|
|
||||||
try:
|
try:
|
||||||
|
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
|
||||||
for page_number, page in enumerate(pdf_reader):
|
for page_number, page in enumerate(pdf_reader):
|
||||||
text_page = page.get_textpage()
|
try:
|
||||||
content = text_page.get_text_range()
|
text_page = page.get_textpage()
|
||||||
text_page.close()
|
try:
|
||||||
page.close()
|
content = text_page.get_text_range()
|
||||||
metadata = {"source": blob.source, "page": page_number}
|
metadata = {"source": blob.source, "page": page_number}
|
||||||
yield Document(page_content=content, metadata=metadata)
|
yield Document(page_content=content, metadata=metadata)
|
||||||
|
finally:
|
||||||
|
text_page.close()
|
||||||
|
finally:
|
||||||
|
page.close()
|
||||||
finally:
|
finally:
|
||||||
pdf_reader.close()
|
pdf_reader.close()
|
||||||
|
@ -266,21 +266,26 @@ class Executor:
|
|||||||
return headers
|
return headers
|
||||||
|
|
||||||
def _validate_and_parse_response(self, response: httpx.Response) -> Response:
|
def _validate_and_parse_response(self, response: httpx.Response) -> Response:
|
||||||
executor_response = Response(response)
|
try:
|
||||||
|
executor_response = Response(response)
|
||||||
|
|
||||||
threshold_size = (
|
threshold_size = (
|
||||||
dify_config.HTTP_REQUEST_NODE_MAX_BINARY_SIZE
|
dify_config.HTTP_REQUEST_NODE_MAX_BINARY_SIZE
|
||||||
if executor_response.is_file
|
if executor_response.is_file
|
||||||
else dify_config.HTTP_REQUEST_NODE_MAX_TEXT_SIZE
|
else dify_config.HTTP_REQUEST_NODE_MAX_TEXT_SIZE
|
||||||
)
|
|
||||||
if executor_response.size > threshold_size:
|
|
||||||
raise ResponseSizeError(
|
|
||||||
f"{'File' if executor_response.is_file else 'Text'} size is too large,"
|
|
||||||
f" max size is {threshold_size / 1024 / 1024:.2f} MB,"
|
|
||||||
f" but current size is {executor_response.readable_size}."
|
|
||||||
)
|
)
|
||||||
|
if executor_response.size > threshold_size:
|
||||||
|
raise ResponseSizeError(
|
||||||
|
f"{'File' if executor_response.is_file else 'Text'} size is too large,"
|
||||||
|
f" max size is {threshold_size / 1024 / 1024:.2f} MB,"
|
||||||
|
f" but current size is {executor_response.readable_size}."
|
||||||
|
)
|
||||||
|
|
||||||
return executor_response
|
return executor_response
|
||||||
|
except Exception as e:
|
||||||
|
# Ensure response is closed when an exception occurs
|
||||||
|
response.close()
|
||||||
|
raise e
|
||||||
|
|
||||||
def _do_http_request(self, headers: dict[str, Any]) -> httpx.Response:
|
def _do_http_request(self, headers: dict[str, Any]) -> httpx.Response:
|
||||||
"""
|
"""
|
||||||
@ -316,21 +321,29 @@ class Executor:
|
|||||||
"follow_redirects": True,
|
"follow_redirects": True,
|
||||||
"max_retries": self.max_retries,
|
"max_retries": self.max_retries,
|
||||||
}
|
}
|
||||||
# request_args = {k: v for k, v in request_args.items() if v is not None}
|
|
||||||
try:
|
# Use a with statement to ensure proper resource cleanup
|
||||||
response = getattr(ssrf_proxy, self.method.lower())(**request_args)
|
with httpx.Client() as client:
|
||||||
except (ssrf_proxy.MaxRetriesExceededError, httpx.RequestError) as e:
|
try:
|
||||||
raise HttpRequestNodeError(str(e))
|
response = getattr(client, self.method.lower())(**request_args)
|
||||||
# FIXME: fix type ignore, this may be an httpx type issue
|
# Create a new Response object and copy required data
|
||||||
return response # type: ignore
|
# This allows safe closure of the original response
|
||||||
|
copied_response = response.copy()
|
||||||
|
response.close()
|
||||||
|
return copied_response
|
||||||
|
except (ssrf_proxy.MaxRetriesExceededError, httpx.RequestError) as e:
|
||||||
|
raise HttpRequestNodeError(str(e))
|
||||||
|
|
||||||
def invoke(self) -> Response:
|
def invoke(self) -> Response:
|
||||||
# assemble headers
|
response = None
|
||||||
headers = self._assembling_headers()
|
try:
|
||||||
# do http request
|
headers = self._assembling_headers()
|
||||||
response = self._do_http_request(headers)
|
response = self._do_http_request(headers)
|
||||||
# validate response
|
return self._validate_and_parse_response(response)
|
||||||
return self._validate_and_parse_response(response)
|
except Exception as e:
|
||||||
|
if response is not None:
|
||||||
|
response.close()
|
||||||
|
raise e
|
||||||
|
|
||||||
def to_log(self):
|
def to_log(self):
|
||||||
url_parts = urlparse(self.url)
|
url_parts = urlparse(self.url)
|
||||||
|
Loading…
Reference in New Issue
Block a user