Fix the SSRF vulnerability in the docx file extractor when fetching external images

2024-11-04 15:31:20 +08:00 · 2024-11-04 14:58:50 +08:00
2 changed files with 17 additions and 16 deletions
--- a/api/core/helper/ssrf_proxy.py
+++ b/api/core/helper/ssrf_proxy.py
@@ -31,24 +31,25 @@ def make_request(method, url, max_retries=SSRF_DEFAULT_MAX_RETRIES, **kwargs):
        allow_redirects = kwargs.pop("allow_redirects")
        if "follow_redirects" not in kwargs:
            kwargs["follow_redirects"] = allow_redirects
-
+    stream = kwargs.pop("stream", False)
    retries = 0
    while retries <= max_retries:
        try:
-            if SSRF_PROXY_ALL_URL:
-                with httpx.Client(proxy=SSRF_PROXY_ALL_URL) as client:
-                    response = client.request(method=method, url=url, **kwargs)
-            elif proxy_mounts:
-                with httpx.Client(mounts=proxy_mounts) as client:
-                    response = client.request(method=method, url=url, **kwargs)
-            else:
-                with httpx.Client() as client:
-                    response = client.request(method=method, url=url, **kwargs)
+            client_args = {"proxy": SSRF_PROXY_ALL_URL} if SSRF_PROXY_ALL_URL else {}
+            if proxy_mounts:
+                client_args["mounts"] = proxy_mounts

-            if response.status_code not in STATUS_FORCELIST:
-                return response
-            else:
-                logging.warning(f"Received status code {response.status_code} for URL {url} which is in the force list")
+            with httpx.Client(**client_args) as client:
+                response = client.request(method=method, url=url, **kwargs)
+
+                if response.status_code not in STATUS_FORCELIST:
+                    if stream:
+                        return response.iter_bytes()
+                    return response
+                else:
+                    logging.warning(
+                        f"Received status code {response.status_code} for URL {url} which is in the force list"
+                    )

        except httpx.RequestError as e:
            logging.warning(f"Request to URL {url} failed on attempt {retries + 1}: {e}")
--- a/api/core/rag/extractor/word_extractor.py
+++ b/api/core/rag/extractor/word_extractor.py
@@ -14,6 +14,7 @@ import requests
 from docx import Document as DocxDocument

 from configs import dify_config
+from core.helper import ssrf_proxy
 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
 from extensions.ext_database import db
@@ -80,13 +81,12 @@ class WordExtractor(BaseExtractor):
        os.makedirs(image_folder, exist_ok=True)
        image_count = 0
        image_map = {}
-
        for rel in doc.part.rels.values():
            if "image" in rel.target_ref:
                image_count += 1
                if rel.is_external:
                    url = rel.reltype
-                    response = requests.get(url, stream=True)
+                    response = ssrf_proxy.get(url, stream=True)
                    if response.status_code == 200:
                        image_ext = mimetypes.guess_extension(response.headers["Content-Type"])
                        file_uuid = str(uuid.uuid4())
Author	SHA1	Message	Date
jyong	517bbc281a	fix the ssrf of docx file extractor external images	2024-11-04 15:31:20 +08:00
jyong	c135ec4b08	fix the ssrf of docx file extractor external images	2024-11-04 14:58:50 +08:00