Merge 97f7a29fec
into a30945312a
This commit is contained in:
commit
af20411441
@ -55,14 +55,18 @@ class PdfExtractor(BaseExtractor):
|
|||||||
import pypdfium2 # type: ignore
|
import pypdfium2 # type: ignore
|
||||||
|
|
||||||
with blob.as_bytes_io() as file_path:
|
with blob.as_bytes_io() as file_path:
|
||||||
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
|
|
||||||
try:
|
try:
|
||||||
|
pdf_reader = pypdfium2.PdfDocument(file_path, autoclose=True)
|
||||||
for page_number, page in enumerate(pdf_reader):
|
for page_number, page in enumerate(pdf_reader):
|
||||||
|
try:
|
||||||
text_page = page.get_textpage()
|
text_page = page.get_textpage()
|
||||||
|
try:
|
||||||
content = text_page.get_text_range()
|
content = text_page.get_text_range()
|
||||||
text_page.close()
|
|
||||||
page.close()
|
|
||||||
metadata = {"source": blob.source, "page": page_number}
|
metadata = {"source": blob.source, "page": page_number}
|
||||||
yield Document(page_content=content, metadata=metadata)
|
yield Document(page_content=content, metadata=metadata)
|
||||||
|
finally:
|
||||||
|
text_page.close()
|
||||||
|
finally:
|
||||||
|
page.close()
|
||||||
finally:
|
finally:
|
||||||
pdf_reader.close()
|
pdf_reader.close()
|
||||||
|
@ -266,6 +266,7 @@ class Executor:
|
|||||||
return headers
|
return headers
|
||||||
|
|
||||||
def _validate_and_parse_response(self, response: httpx.Response) -> Response:
|
def _validate_and_parse_response(self, response: httpx.Response) -> Response:
|
||||||
|
try:
|
||||||
executor_response = Response(response)
|
executor_response = Response(response)
|
||||||
|
|
||||||
threshold_size = (
|
threshold_size = (
|
||||||
@ -281,6 +282,10 @@ class Executor:
|
|||||||
)
|
)
|
||||||
|
|
||||||
return executor_response
|
return executor_response
|
||||||
|
except Exception as e:
|
||||||
|
# Ensure response is closed when an exception occurs
|
||||||
|
response.close()
|
||||||
|
raise e
|
||||||
|
|
||||||
def _do_http_request(self, headers: dict[str, Any]) -> httpx.Response:
|
def _do_http_request(self, headers: dict[str, Any]) -> httpx.Response:
|
||||||
"""
|
"""
|
||||||
@ -316,21 +321,29 @@ class Executor:
|
|||||||
"follow_redirects": True,
|
"follow_redirects": True,
|
||||||
"max_retries": self.max_retries,
|
"max_retries": self.max_retries,
|
||||||
}
|
}
|
||||||
# request_args = {k: v for k, v in request_args.items() if v is not None}
|
|
||||||
|
# Use with statement to ensure proper resource cleanup
|
||||||
|
with httpx.Client() as client:
|
||||||
try:
|
try:
|
||||||
response = getattr(ssrf_proxy, self.method.lower())(**request_args)
|
response = getattr(client, self.method.lower())(**request_args)
|
||||||
|
# Create a new Response object and copy required data
|
||||||
|
# This allows safe closure of the original response
|
||||||
|
copied_response = response.copy()
|
||||||
|
response.close()
|
||||||
|
return copied_response
|
||||||
except (ssrf_proxy.MaxRetriesExceededError, httpx.RequestError) as e:
|
except (ssrf_proxy.MaxRetriesExceededError, httpx.RequestError) as e:
|
||||||
raise HttpRequestNodeError(str(e))
|
raise HttpRequestNodeError(str(e))
|
||||||
# FIXME: fix type ignore, this maybe httpx type issue
|
|
||||||
return response # type: ignore
|
|
||||||
|
|
||||||
def invoke(self) -> Response:
|
def invoke(self) -> Response:
|
||||||
# assemble headers
|
response = None
|
||||||
|
try:
|
||||||
headers = self._assembling_headers()
|
headers = self._assembling_headers()
|
||||||
# do http request
|
|
||||||
response = self._do_http_request(headers)
|
response = self._do_http_request(headers)
|
||||||
# validate response
|
|
||||||
return self._validate_and_parse_response(response)
|
return self._validate_and_parse_response(response)
|
||||||
|
except Exception as e:
|
||||||
|
if response is not None:
|
||||||
|
response.close()
|
||||||
|
raise e
|
||||||
|
|
||||||
def to_log(self):
|
def to_log(self):
|
||||||
url_parts = urlparse(self.url)
|
url_parts = urlparse(self.url)
|
||||||
|
Loading…
Reference in New Issue
Block a user