Compare commits

...

2 Commits

Author SHA1 Message Date
jyong
517bbc281a fix the ssrf of docx file extractor external images 2024-11-04 15:31:20 +08:00
jyong
c135ec4b08 fix the ssrf of docx file extractor external images 2024-11-04 14:58:50 +08:00
2 changed files with 17 additions and 16 deletions

View File

@@ -31,24 +31,25 @@ def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs):
allow_redirects = kwargs.pop("allow_redirects")
if "follow_redirects" not in kwargs:
kwargs["follow_redirects"] = allow_redirects
stream = kwargs.pop("stream", False)
retries = 0
while retries <= max_retries:
try:
if SSRF_PROXY_ALL_URL:
with httpx.Client(proxy=SSRF_PROXY_ALL_URL) as client:
response = client.request(method=method, url=url, **kwargs)
elif proxy_mounts:
with httpx.Client(mounts=proxy_mounts) as client:
response = client.request(method=method, url=url, **kwargs)
else:
with httpx.Client() as client:
response = client.request(method=method, url=url, **kwargs)
client_args = {"proxy": SSRF_PROXY_ALL_URL} if SSRF_PROXY_ALL_URL else {}
if proxy_mounts:
client_args["mounts"] = proxy_mounts
if response.status_code not in STATUS_FORCELIST:
return response
else:
logging.warning(f"Received status code {response.status_code} for URL {url} which is in the force list")
with httpx.Client(**client_args) as client:
response = client.request(method=method, url=url, **kwargs)
if response.status_code not in STATUS_FORCELIST:
if stream:
return response.iter_bytes()
return response
else:
logging.warning(
f"Received status code {response.status_code} for URL {url} which is in the force list"
)
except httpx.RequestError as e:
logging.warning(f"Request to URL {url} failed on attempt {retries + 1}: {e}")

View File

@@ -14,6 +14,7 @@ import requests
from docx import Document as DocxDocument
from configs import dify_config
from core.helper import ssrf_proxy
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from extensions.ext_database import db
@@ -80,13 +81,12 @@ class WordExtractor(BaseExtractor):
os.makedirs(image_folder, exist_ok=True)
image_count = 0
image_map = {}
for rel in doc.part.rels.values():
if "image" in rel.target_ref:
image_count += 1
if rel.is_external:
url = rel.reltype
response = requests.get(url, stream=True)
response = ssrf_proxy.get(url, stream=True)
if response.status_code == 200:
image_ext = mimetypes.guess_extension(response.headers["Content-Type"])
file_uuid = str(uuid.uuid4())