From 29a4dec387f55f83167a221fe6a15585e31672ca Mon Sep 17 00:00:00 2001 From: Amir Mohsen Date: Fri, 21 Mar 2025 00:18:18 +0100 Subject: [PATCH 1/2] feat: Integrate WaterCrawl.dev as a new knowledge base provider Add WaterCrawl.dev as an alternative provider for website crawling in datasets/knowledge base alongside Firecrawl and Jina Reader. This integration enhances the data source options for knowledge bases, allowing users to configure and use WaterCrawl for their website content extraction needs. Resolved #15950 --- api/controllers/console/datasets/website.py | 6 +- api/core/rag/extractor/extract_processor.py | 12 +- api/core/rag/extractor/watercrawl/client.py | 193 ++++++++++++++++ .../rag/extractor/watercrawl/extractor.py | 57 +++++ api/core/rag/extractor/watercrawl/provider.py | 119 ++++++++++ api/services/auth/api_key_auth_factory.py | 4 + api/services/auth/auth_type.py | 1 + api/services/auth/watercrawl/__init__.py | 0 api/services/auth/watercrawl/watercrawl.py | 44 ++++ api/services/website_service.py | 32 +++ .../datasets/create/assets/watercrawl.svg | 20 ++ .../datasets/create/website/index.module.css | 7 + .../datasets/create/website/index.tsx | 87 ++++--- .../datasets/create/website/no-data.tsx | 5 + .../create/website/watercrawl/header.tsx | 43 ++++ .../create/website/watercrawl/index.tsx | 217 ++++++++++++++++++ .../create/website/watercrawl/options.tsx | 83 +++++++ .../config-watercrawl-modal.tsx | 161 +++++++++++++ .../data-source-website/index.tsx | 57 +++-- .../data-source-page/index.tsx | 1 + .../data-source-page/panel/index.tsx | 8 +- web/i18n/en-US/dataset-creation.ts | 11 + web/models/common.ts | 6 + web/service/datasets.ts | 19 ++ 24 files changed, 1137 insertions(+), 56 deletions(-) create mode 100644 api/core/rag/extractor/watercrawl/client.py create mode 100644 api/core/rag/extractor/watercrawl/extractor.py create mode 100644 api/core/rag/extractor/watercrawl/provider.py create mode 100644 api/services/auth/watercrawl/__init__.py create mode 100644 api/services/auth/watercrawl/watercrawl.py create mode 100644 web/app/components/datasets/create/assets/watercrawl.svg create mode 100644 web/app/components/datasets/create/website/watercrawl/header.tsx create mode 100644 web/app/components/datasets/create/website/watercrawl/index.tsx create mode 100644 web/app/components/datasets/create/website/watercrawl/options.tsx create mode 100644 web/app/components/header/account-setting/data-source-page/data-source-website/config-watercrawl-modal.tsx diff --git a/api/controllers/console/datasets/website.py b/api/controllers/console/datasets/website.py index da995537e7..fa8c1b37e2 100644 --- a/api/controllers/console/datasets/website.py +++ b/api/controllers/console/datasets/website.py @@ -14,7 +14,8 @@ class WebsiteCrawlApi(Resource): def post(self): parser = reqparse.RequestParser() parser.add_argument( - "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json" + "provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"], + required=True, nullable=True, location="json" ) parser.add_argument("url", type=str, required=True, nullable=True, location="json") parser.add_argument("options", type=dict, required=True, nullable=True, location="json") @@ -34,7 +35,8 @@ class WebsiteCrawlStatusApi(Resource): @account_initialization_required def get(self, job_id: str): parser = reqparse.RequestParser() - parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args") + 
parser.add_argument("provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"], + required=True, location="args") args = parser.parse_args() # get crawl status try: diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py index f9fd7f92a1..750ef1104b 100644 --- a/api/core/rag/extractor/extract_processor.py +++ b/api/core/rag/extractor/extract_processor.py @@ -25,6 +25,7 @@ from core.rag.extractor.unstructured.unstructured_msg_extractor import Unstructu from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor +from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor from core.rag.extractor.word_extractor import WordExtractor from core.rag.models.document import Document from extensions.ext_storage import storage @@ -40,7 +41,7 @@ USER_AGENT = ( class ExtractProcessor: @classmethod def load_from_upload_file( - cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False + cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False ) -> Union[list[Document], str]: extract_setting = ExtractSetting( datasource_type="upload_file", upload_file=upload_file, document_model="text_model" @@ -180,6 +181,15 @@ class ExtractProcessor: only_main_content=extract_setting.website_info.only_main_content, ) return extractor.extract() + elif extract_setting.website_info.provider == "watercrawl": + extractor = WaterCrawlWebExtractor( + url=extract_setting.website_info.url, + job_id=extract_setting.website_info.job_id, + tenant_id=extract_setting.website_info.tenant_id, + mode=extract_setting.website_info.mode, + only_main_content=extract_setting.website_info.only_main_content, + ) + return extractor.extract() elif extract_setting.website_info.provider == "jinareader": extractor = JinaReaderWebExtractor( url=extract_setting.website_info.url, diff --git a/api/core/rag/extractor/watercrawl/client.py b/api/core/rag/extractor/watercrawl/client.py new file mode 100644 index 0000000000..553d5476e2 --- /dev/null +++ b/api/core/rag/extractor/watercrawl/client.py @@ -0,0 +1,193 @@ +import json +from typing import Union, Generator +from urllib.parse import urljoin + +import requests +from requests import Response + + +class BaseAPIClient: + def __init__(self, api_key, base_url): + self.api_key = api_key + self.base_url = base_url + self.session = self.init_session() + + def init_session(self): + session = requests.Session() + session.headers.update({'X-API-Key': self.api_key}) + session.headers.update({'Content-Type': 'application/json'}) + session.headers.update({'Accept': 'application/json'}) + session.headers.update({'User-Agent': 'WaterCrawl-Plugin'}) + session.headers.update({'Accept-Language': 'en-US'}) + return session + + def _get(self, endpoint: str, query_params: dict = None, **kwargs): + return self.session.get( + urljoin(self.base_url, endpoint), + params=query_params, + **kwargs + ) + + def _post(self, endpoint: str, query_params: dict = None, data: dict = None, **kwargs): + return self.session.post( + urljoin(self.base_url, endpoint), + params=query_params, + json=data, **kwargs + ) + + def _put(self, endpoint: str, query_params: dict = None, data: dict = None, **kwargs): + return self.session.put( + urljoin(self.base_url, endpoint), + params=query_params, + 
json=data, **kwargs + ) + + def _delete(self, endpoint: str, query_params: dict = None, **kwargs): + return self.session.delete( + urljoin(self.base_url, endpoint), + params=query_params, + **kwargs + ) + + def _patch(self, endpoint: str, query_params: dict = None, data: dict = None, **kwargs): + return self.session.patch( + urljoin(self.base_url, endpoint), + params=query_params, + json=data, **kwargs + ) + + +class WaterCrawlAPIClient(BaseAPIClient): + def __init__(self, api_key, base_url: str = 'https://app.watercrawl.dev/'): + super().__init__(api_key, base_url) + + def process_eventstream(self, response: Response, download: bool = False): + for line in response.iter_lines(): + line = line.decode('utf-8') + if line.startswith('data:'): + line = line[5:].strip() + data = json.loads(line) + if data['type'] == 'result' and download: + data['data'] = self.download_result(data['data']) + yield data + + def process_response(self, response: Response) -> Union[dict, bytes, list, None, Generator]: + response.raise_for_status() + if response.status_code == 204: + return None + if response.headers.get('Content-Type') == 'application/json': + return response.json() + + if response.headers.get('Content-Type') == 'application/octet-stream': + return response.content + + if response.headers.get('Content-Type') == 'text/event-stream': + return self.process_eventstream(response) + + raise Exception(f'Unknown response type: {response.headers.get("Content-Type")}') + + def get_crawl_requests_list(self, page: int = None, page_size: int = None): + query_params = { + 'page': page or 1, + 'page_size': page_size or 10 + } + return self.process_response( + self._get( + '/api/v1/core/crawl-requests/', + query_params=query_params, + ) + ) + + def get_crawl_request(self, item_id: str): + return self.process_response( + self._get( + f'/api/v1/core/crawl-requests/{item_id}/', + ) + ) + + def create_crawl_request( + self, + url: Union[list, str] = None, + spider_options: dict = None, + page_options: dict = None, + plugin_options: dict = None + ): + data = { + # 'urls': url if isinstance(url, list) else [url], + 'url': url, + 'options': { + 'spider_options': spider_options or {}, + 'page_options': page_options or {}, + 'plugin_options': plugin_options or {}, + } + } + return self.process_response( + self._post( + '/api/v1/core/crawl-requests/', + data=data, + ) + ) + + def stop_crawl_request(self, item_id: str): + return self.process_response( + self._delete( + f'/api/v1/core/crawl-requests/{item_id}/', + ) + ) + + def download_crawl_request(self, item_id: str): + return self.process_response( + self._get( + f'/api/v1/core/crawl-requests/{item_id}/download/', + ) + ) + + def monitor_crawl_request(self, item_id: str, prefetched=False) -> Generator: + query_params = { + 'prefetched': str(prefetched).lower() + } + return self.process_response( + self._get( + f'/api/v1/core/crawl-requests/{item_id}/status/', + stream=True, + query_params=query_params + ), + ) + + def get_crawl_request_results(self, item_id: str, page: int = 1, page_size: int = 25, query_params: dict = None): + query_params = query_params or {} + query_params.update({ + 'page': page or 1, + 'page_size': page_size or 25 + }) + return self.process_response( + self._get( + f'/api/v1/core/crawl-requests/{item_id}/results/', + query_params=query_params + ) + ) + + def scrape_url(self, + url: str, + page_options: dict = None, + plugin_options: dict = None, + sync: bool = True, + prefetched: bool = True + ): + result = self.create_crawl_request( + url=url, + 
page_options=page_options, + plugin_options=plugin_options + ) + if not sync: + return result + + for result in self.monitor_crawl_request(result['uuid'], prefetched): + if result['type'] == 'result': + return result['data'] + + def download_result(self, result_object: dict): + response = requests.get(result_object['result']) + response.raise_for_status() + result_object['result'] = response.json() + return result_object diff --git a/api/core/rag/extractor/watercrawl/extractor.py b/api/core/rag/extractor/watercrawl/extractor.py new file mode 100644 index 0000000000..40d1740962 --- /dev/null +++ b/api/core/rag/extractor/watercrawl/extractor.py @@ -0,0 +1,57 @@ +from core.rag.extractor.extractor_base import BaseExtractor +from core.rag.models.document import Document +from services.website_service import WebsiteService + + +class WaterCrawlWebExtractor(BaseExtractor): + """ + Crawl and scrape websites and return content in clean llm-ready markdown. + + + Args: + url: The URL to scrape. + api_key: The API key for WaterCrawl. + base_url: The base URL for the Firecrawl API. Defaults to 'https://app.firecrawl.dev'. + mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'. + only_main_content: Only return the main content of the page excluding headers, navs, footers, etc. + """ + + def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True): + """Initialize with url, api_key, base_url and mode.""" + self._url = url + self.job_id = job_id + self.tenant_id = tenant_id + self.mode = mode + self.only_main_content = only_main_content + + def extract(self) -> list[Document]: + """Extract content from the URL.""" + documents = [] + if self.mode == "crawl": + crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "watercrawl", self._url, self.tenant_id) + if crawl_data is None: + return [] + document = Document( + page_content=crawl_data.get("markdown", ""), + metadata={ + "source_url": crawl_data.get("source_url"), + "description": crawl_data.get("description"), + "title": crawl_data.get("title"), + }, + ) + documents.append(document) + elif self.mode == "scrape": + scrape_data = WebsiteService.get_scrape_url_data( + "watercrawl", self._url, self.tenant_id, self.only_main_content + ) + + document = Document( + page_content=scrape_data.get("markdown", ""), + metadata={ + "source_url": scrape_data.get("source_url"), + "description": scrape_data.get("description"), + "title": scrape_data.get("title"), + }, + ) + documents.append(document) + return documents diff --git a/api/core/rag/extractor/watercrawl/provider.py b/api/core/rag/extractor/watercrawl/provider.py new file mode 100644 index 0000000000..31727e3986 --- /dev/null +++ b/api/core/rag/extractor/watercrawl/provider.py @@ -0,0 +1,119 @@ +from datetime import datetime, timezone + +from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient + + +class WaterCrawlProvider: + def __init__(self, api_key, base_url: str = None): + self.client = WaterCrawlAPIClient(api_key, base_url) + + def crawl_url(self, url, options): + spider_options = { + "max_depth": 1, + "page_limit": 1, + "allowed_domains": [], + "exclude_paths": [], + "include_paths": [] + } + if options.get("crawl_sub_pages", True): + spider_options["page_limit"] = options.get("limit", 1) + spider_options["max_depth"] = options.get("depth", 1) + spider_options["include_paths"] = options.get("includes", '').split(",") if options.get("includes") else [] + spider_options["exclude_paths"] 
= options.get("excludes", '').split(",") if options.get("excludes") else [] + + wait_time = options.get("wait_time", 1000) + page_options = { + "exclude_tags": options.get("exclude_tags", '').split(",") if options.get("exclude_tags") else [], + "include_tags": options.get("include_tags", '').split(",") if options.get("include_tags") else [], + "wait_time": wait_time if wait_time > 1000 else 1000, # minimum wait time is 1 second + "include_html": False, + "only_main_content": options.get("only_main_content", True), + "include_links": False, + "timeout": 15000, + "accept_cookies_selector": "#cookies-accept", + "locale": "en-US", + "actions": [] + } + result = self.client.create_crawl_request( + url=url, + spider_options=spider_options, + page_options=page_options + ) + + return {"status": "active", "job_id": result.get("uuid")} + + def get_crawl_status(self, crawl_request_id): + response = self.client.get_crawl_request(crawl_request_id) + data = [] + if response['status'] in ['new', 'running']: + status = 'active' + else: + status = 'completed' + data = list(self._get_results(crawl_request_id)) + + time_str = response.get('duration') + time_consuming = 0 + if time_str: + time_obj = datetime.strptime(time_str, "%H:%M:%S.%f") + time_consuming = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000 + + return { + "status": status, + "job_id": response.get("uuid"), + "total": response.get("options", {}).get('spider_options', {}).get("page_limit", 1), + "current": response.get("number_of_documents", 0), + "data": data, + "time_consuming": time_consuming + } + + def get_crawl_url_data(self, job_id, url): + if not job_id: + return self.scrape_url(url) + + for result in self._get_results(job_id, { + # filter by url + 'url': url + }): + return result + + return None + + def scrape_url(self, url): + response = self.client.scrape_url( + url=url, + sync=True, + prefetched=True + ) + return self._structure_data(response) + + def _structure_data(self, result_object: dict): + if isinstance(result_object.get("result", {}), str): + raise ValueError("Invalid result object. 
Expected a dictionary.") + + metadata = result_object.get("result", {}).get("metadata", {}) + return { + "title": metadata.get("og:title") or metadata.get("title"), + "description": metadata.get("description"), + "source_url": result_object.get("url"), + "markdown": result_object.get('result').get("markdown"), + } + + def _get_results(self, crawl_request_id: str, query_params: dict = None): + page = 0 + page_size = 100 + + query_params = query_params or {} + query_params.update({ + 'prefetched': "true" + }) + while True: + page += 1 + response = self.client.get_crawl_request_results(crawl_request_id, page, page_size, query_params) + if not response['results']: + break + + for result in response['results']: + yield self._structure_data(result) + + if response['next'] is None: + break diff --git a/api/services/auth/api_key_auth_factory.py b/api/services/auth/api_key_auth_factory.py index f91c448fb9..7ae31b0768 100644 --- a/api/services/auth/api_key_auth_factory.py +++ b/api/services/auth/api_key_auth_factory.py @@ -17,6 +17,10 @@ class ApiKeyAuthFactory: from services.auth.firecrawl.firecrawl import FirecrawlAuth return FirecrawlAuth + case AuthType.WATERCRAWL: + from services.auth.watercrawl.watercrawl import WatercrawlAuth + + return WatercrawlAuth case AuthType.JINA: from services.auth.jina.jina import JinaAuth diff --git a/api/services/auth/auth_type.py b/api/services/auth/auth_type.py index 2e1946841f..ec7118df27 100644 --- a/api/services/auth/auth_type.py +++ b/api/services/auth/auth_type.py @@ -3,4 +3,5 @@ from enum import StrEnum class AuthType(StrEnum): FIRECRAWL = "firecrawl" + WATERCRAWL = "watercrawl" JINA = "jinareader" diff --git a/api/services/auth/watercrawl/__init__.py b/api/services/auth/watercrawl/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/api/services/auth/watercrawl/watercrawl.py b/api/services/auth/watercrawl/watercrawl.py new file mode 100644 index 0000000000..153ab5ba75 --- /dev/null +++ b/api/services/auth/watercrawl/watercrawl.py @@ -0,0 +1,44 @@ +import json +from urllib.parse import urljoin + +import requests + +from services.auth.api_key_auth_base import ApiKeyAuthBase + + +class WatercrawlAuth(ApiKeyAuthBase): + def __init__(self, credentials: dict): + super().__init__(credentials) + auth_type = credentials.get("auth_type") + if auth_type != "x-api-key": + raise ValueError("Invalid auth type, WaterCrawl auth type must be x-api-key") + self.api_key = credentials.get("config", {}).get("api_key", None) + self.base_url = credentials.get("config", {}).get("base_url", "https://app.watercrawl.dev") + + if not self.api_key: + raise ValueError("No API key provided") + + def validate_credentials(self): + headers = self._prepare_headers() + url = urljoin(self.base_url, "/api/v1/core/crawl-requests/") + response = self._get_request(url, headers) + if response.status_code == 200: + return True + else: + self._handle_error(response) + + def _prepare_headers(self): + return {"Content-Type": "application/json", "X-API-KEY": self.api_key} + + def _get_request(self, url, headers): + return requests.get(url, headers=headers) + + def _handle_error(self, response): + if response.status_code in {402, 409, 500}: + error_message = response.json().get("error", "Unknown error occurred") + raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}") + else: + if response.text: + error_message = json.loads(response.text).get("error", "Unknown error occurred") + raise Exception(f"Failed to authorize. 
Status code: {response.status_code}. Error: {error_message}") + raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}") diff --git a/api/services/website_service.py b/api/services/website_service.py index 85d32c9e8a..9f5143562e 100644 --- a/api/services/website_service.py +++ b/api/services/website_service.py @@ -7,6 +7,7 @@ from flask_login import current_user # type: ignore from core.helper import encrypter from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp +from core.rag.extractor.watercrawl.provider import WaterCrawlProvider from extensions.ext_redis import redis_client from extensions.ext_storage import storage from services.auth.api_key_auth_service import ApiKeyAuthService @@ -59,6 +60,16 @@ class WebsiteService: time = str(datetime.datetime.now().timestamp()) redis_client.setex(website_crawl_time_cache_key, 3600, time) return {"status": "active", "job_id": job_id} + elif provider == "watercrawl": + # decrypt api_key + api_key = encrypter.decrypt_token( + tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") + ) + return WaterCrawlProvider( + api_key, + credentials.get("config").get("base_url", None) + ).crawl_url(url, options) + elif provider == "jinareader": api_key = encrypter.decrypt_token( tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") @@ -116,6 +127,15 @@ class WebsiteService: time_consuming = abs(end_time - float(start_time)) crawl_status_data["time_consuming"] = f"{time_consuming:.2f}" redis_client.delete(website_crawl_time_cache_key) + elif provider == "watercrawl": + # decrypt api_key + api_key = encrypter.decrypt_token( + tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") + ) + crawl_status_data = WaterCrawlProvider( + api_key, + credentials.get("config").get("base_url", None) + ).get_crawl_status(job_id) elif provider == "jinareader": api_key = encrypter.decrypt_token( tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") @@ -180,6 +200,12 @@ class WebsiteService: if item.get("source_url") == url: return dict(item) return None + elif provider == "watercrawl": + api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key")) + return WaterCrawlProvider( + api_key, + credentials.get("config").get("base_url", None) + ).get_crawl_url_data(job_id, url) elif provider == "jinareader": if not job_id: response = requests.get( @@ -223,5 +249,11 @@ class WebsiteService: params = {"onlyMainContent": only_main_content} result = firecrawl_app.scrape_url(url, params) return result + elif provider == "watercrawl": + api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key")) + return WaterCrawlProvider( + api_key, + credentials.get("config").get("base_url", None) + ).scrape_url(url) else: raise ValueError("Invalid provider") diff --git a/web/app/components/datasets/create/assets/watercrawl.svg b/web/app/components/datasets/create/assets/watercrawl.svg new file mode 100644 index 0000000000..bd4e6bac6f --- /dev/null +++ b/web/app/components/datasets/create/assets/watercrawl.svg @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/web/app/components/datasets/create/website/index.module.css b/web/app/components/datasets/create/website/index.module.css index abaab4bea4..cf6c36468d 100644 --- a/web/app/components/datasets/create/website/index.module.css +++ 
b/web/app/components/datasets/create/website/index.module.css @@ -4,3 +4,10 @@ background-image: url(../assets/jina.png); background-size: 16px; } + +.watercrawlLogo { + @apply w-5 h-5 bg-center bg-no-repeat inline-block; + /*background-color: #F5FAFF;*/ + background-image: url(../assets/watercrawl.svg); + background-size: 16px; +} diff --git a/web/app/components/datasets/create/website/index.tsx b/web/app/components/datasets/create/website/index.tsx index 5e9e4cee59..6e908753b8 100644 --- a/web/app/components/datasets/create/website/index.tsx +++ b/web/app/components/datasets/create/website/index.tsx @@ -5,6 +5,7 @@ import { useTranslation } from 'react-i18next' import s from './index.module.css' import NoData from './no-data' import Firecrawl from './firecrawl' +import Watercrawl from './watercrawl' import JinaReader from './jina-reader' import cn from '@/utils/classnames' import { useModalContext } from '@/context/modal-context' @@ -47,7 +48,11 @@ const Website: FC = ({ // If users have configured one of the providers, select it. const availableProviders = res.sources.filter((item: DataSourceItem) => - [DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider), + [ + DataSourceProvider.jinaReader, + DataSourceProvider.fireCrawl, + DataSourceProvider.waterCrawl, + ].includes(item.provider), ) if (availableProviders.length > 0) @@ -70,6 +75,8 @@ const Website: FC = ({ if (!isLoaded) return null + const source = sources.find(source => source.provider === selectedProvider) + return (
@@ -86,7 +93,7 @@ const Website: FC = ({ )} onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)} > - + Jina Reader +
- - { - selectedProvider === DataSourceProvider.fireCrawl - ? sources.find(source => source.provider === DataSourceProvider.fireCrawl) - ? ( - - ) - : ( - - ) - : sources.find(source => source.provider === DataSourceProvider.jinaReader) - ? ( - - ) - : ( - - ) - } + {source && selectedProvider === DataSourceProvider.fireCrawl && ( + + )} + {source && selectedProvider === DataSourceProvider.waterCrawl && ( + + )} + {source && selectedProvider === DataSourceProvider.jinaReader && ( + + )} + {!source && ( + + )} ) } diff --git a/web/app/components/datasets/create/website/no-data.tsx b/web/app/components/datasets/create/website/no-data.tsx index f1c079faf3..db7fe70d72 100644 --- a/web/app/components/datasets/create/website/no-data.tsx +++ b/web/app/components/datasets/create/website/no-data.tsx @@ -31,6 +31,11 @@ const NoData: FC = ({ title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`), description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`), }, + [DataSourceProvider.waterCrawl]: { + emoji: , + title: t(`${I18N_PREFIX}.waterCrawlNotConfigured`), + description: t(`${I18N_PREFIX}.waterCrawlNotConfiguredDescription`), + }, } const currentProvider = providerConfig[provider] diff --git a/web/app/components/datasets/create/website/watercrawl/header.tsx b/web/app/components/datasets/create/website/watercrawl/header.tsx new file mode 100644 index 0000000000..f2131fcd1a --- /dev/null +++ b/web/app/components/datasets/create/website/watercrawl/header.tsx @@ -0,0 +1,43 @@ +'use client' +import type { FC } from 'react' +import React from 'react' +import { useTranslation } from 'react-i18next' +import { RiBookOpenLine, RiEqualizer2Line } from '@remixicon/react' +import Button from '@/app/components/base/button' + +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type Props = { + onSetting: () => void +} + +const Header: FC = ({ + onSetting, +}) => { + const { t } = useTranslation() + + return ( +
+
+
{t(`${I18N_PREFIX}.watercrawlTitle`)}
+
+ +
+ + + {t(`${I18N_PREFIX}.watercrawlDoc`)} + +
+ ) +} +export default React.memo(Header) diff --git a/web/app/components/datasets/create/website/watercrawl/index.tsx b/web/app/components/datasets/create/website/watercrawl/index.tsx new file mode 100644 index 0000000000..e52f04e4ed --- /dev/null +++ b/web/app/components/datasets/create/website/watercrawl/index.tsx @@ -0,0 +1,217 @@ +'use client' +import type { FC } from 'react' +import React, { useCallback, useEffect, useState } from 'react' +import { useTranslation } from 'react-i18next' +import UrlInput from '../base/url-input' +import OptionsWrap from '../base/options-wrap' +import CrawledResult from '../base/crawled-result' +import Crawling from '../base/crawling' +import ErrorMessage from '../base/error-message' +import Header from './header' +import Options from './options' +import { useModalContext } from '@/context/modal-context' +import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' +import Toast from '@/app/components/base/toast' +import { checkWatercrawlTaskStatus, createWatercrawlTask } from '@/service/datasets' +import { sleep } from '@/utils' + +const ERROR_I18N_PREFIX = 'common.errorMsg' +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type Props = { + onPreview: (payload: CrawlResultItem) => void + checkedCrawlResult: CrawlResultItem[] + onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void + onJobIdChange: (jobId: string) => void + crawlOptions: CrawlOptions + onCrawlOptionsChange: (payload: CrawlOptions) => void +} + +enum Step { + init = 'init', + running = 'running', + finished = 'finished', +} + +const WaterCrawl: FC = ({ + onPreview, + checkedCrawlResult, + onCheckedCrawlResultChange, + onJobIdChange, + crawlOptions, + onCrawlOptionsChange, +}) => { + const { t } = useTranslation() + const [step, setStep] = useState(Step.init) + const [controlFoldOptions, setControlFoldOptions] = useState(0) + useEffect(() => { + if (step !== Step.init) + setControlFoldOptions(Date.now()) + }, [step]) + const { setShowAccountSettingModal } = useModalContext() + const handleSetting = useCallback(() => { + setShowAccountSettingModal({ + payload: 'data-source', + }) + }, [setShowAccountSettingModal]) + + const checkValid = useCallback((url: string) => { + let errorMsg = '' + if (!url) { + errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, { + field: 'url', + }) + } + + if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://')))) + errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`) + + if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) { + errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, { + field: t(`${I18N_PREFIX}.limit`), + }) + } + + return { + isValid: !errorMsg, + errorMsg, + } + }, [crawlOptions, t]) + + const isInit = step === Step.init + const isCrawlFinished = step === Step.finished + const isRunning = step === Step.running + const [crawlResult, setCrawlResult] = useState<{ + current: number + total: number + data: CrawlResultItem[] + time_consuming: number | string + } | undefined>(undefined) + const [crawlErrorMessage, setCrawlErrorMessage] = useState('') + const showError = isCrawlFinished && crawlErrorMessage + + const waitForCrawlFinished = useCallback(async (jobId: string): Promise => { + try { + const res = await checkWatercrawlTaskStatus(jobId) as any + if (res.status === 'completed') { + return { + isError: false, + data: { + ...res, + total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), + }, + } + } + if (res.status === 'error' || 
!res.status) { + // can't get the error message from the watercrawl api + return { + isError: true, + errorMessage: res.message, + data: { + data: [], + }, + } + } + // update the progress + setCrawlResult({ + ...res, + total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), + }) + onCheckedCrawlResultChange(res.data || []) // default select the crawl result + await sleep(2500) + return await waitForCrawlFinished(jobId) + } + catch (e: any) { + const errorBody = await e.json() + return { + isError: true, + errorMessage: errorBody.message, + data: { + data: [], + }, + } + } + }, [crawlOptions.limit]) + + const handleRun = useCallback(async (url: string) => { + const { isValid, errorMsg } = checkValid(url) + if (!isValid) { + Toast.notify({ + message: errorMsg!, + type: 'error', + }) + return + } + setStep(Step.running) + try { + const passToServerCrawlOptions: any = { + ...crawlOptions, + } + if (crawlOptions.max_depth === '') + delete passToServerCrawlOptions.max_depth + + const res = await createWatercrawlTask({ + url, + options: passToServerCrawlOptions, + }) as any + const jobId = res.job_id + onJobIdChange(jobId) + const { isError, data, errorMessage } = await waitForCrawlFinished(jobId) + if (isError) { + setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`)) + } + else { + setCrawlResult(data) + onCheckedCrawlResultChange(data.data || []) // default select the crawl result + setCrawlErrorMessage('') + } + } + catch (e) { + setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!) + console.log(e) + } + finally { + setStep(Step.finished) + } + }, [checkValid, crawlOptions, onJobIdChange, t, waitForCrawlFinished]) + + return ( +
+
+
+ + + + + + {!isInit && ( +
+ {isRunning + && } + {showError && ( + + )} + {isCrawlFinished && !showError + && + } +
+ )} +
+
+ ) +} +export default React.memo(WaterCrawl) diff --git a/web/app/components/datasets/create/website/watercrawl/options.tsx b/web/app/components/datasets/create/website/watercrawl/options.tsx new file mode 100644 index 0000000000..8cc2c6757c --- /dev/null +++ b/web/app/components/datasets/create/website/watercrawl/options.tsx @@ -0,0 +1,83 @@ +'use client' +import type { FC } from 'react' +import React, { useCallback } from 'react' +import { useTranslation } from 'react-i18next' +import CheckboxWithLabel from '../base/checkbox-with-label' +import Field from '../base/field' +import cn from '@/utils/classnames' +import type { CrawlOptions } from '@/models/datasets' + +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type Props = { + className?: string + payload: CrawlOptions + onChange: (payload: CrawlOptions) => void +} + +const Options: FC = ({ + className = '', + payload, + onChange, +}) => { + const { t } = useTranslation() + + const handleChange = useCallback((key: keyof CrawlOptions) => { + return (value: any) => { + onChange({ + ...payload, + [key]: value, + }) + } + }, [payload, onChange]) + return ( +
+ +
+ + +
+ +
+ + +
+ +
+ ) +} +export default React.memo(Options) diff --git a/web/app/components/header/account-setting/data-source-page/data-source-website/config-watercrawl-modal.tsx b/web/app/components/header/account-setting/data-source-page/data-source-website/config-watercrawl-modal.tsx new file mode 100644 index 0000000000..6033a562ac --- /dev/null +++ b/web/app/components/header/account-setting/data-source-page/data-source-website/config-watercrawl-modal.tsx @@ -0,0 +1,161 @@ +'use client' +import type { FC } from 'react' +import React, { useCallback, useState } from 'react' +import { useTranslation } from 'react-i18next' +import { + PortalToFollowElem, + PortalToFollowElemContent, +} from '@/app/components/base/portal-to-follow-elem' +import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security' +import Button from '@/app/components/base/button' +import type { WatercrawlConfig } from '@/models/common' +import Field from '@/app/components/datasets/create/website/base/field' +import Toast from '@/app/components/base/toast' +import { createDataSourceApiKeyBinding } from '@/service/datasets' +import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general' +type Props = { + onCancel: () => void + onSaved: () => void +} + +const I18N_PREFIX = 'datasetCreation.watercrawl' + +const DEFAULT_BASE_URL = 'https://app.watercrawl.dev' + +const ConfigWatercrawlModal: FC = ({ + onCancel, + onSaved, +}) => { + const { t } = useTranslation() + const [isSaving, setIsSaving] = useState(false) + const [config, setConfig] = useState({ + api_key: '', + base_url: '', + }) + + const handleConfigChange = useCallback((key: string) => { + return (value: string | number) => { + setConfig(prev => ({ ...prev, [key]: value as string })) + } + }, []) + + const handleSave = useCallback(async () => { + if (isSaving) + return + let errorMsg = '' + if (config.base_url && !((config.base_url.startsWith('http://') || config.base_url.startsWith('https://')))) + errorMsg = t('common.errorMsg.urlError') + if (!errorMsg) { + if (!config.api_key) { + errorMsg = t('common.errorMsg.fieldRequired', { + field: 'API Key', + }) + } + } + + if (errorMsg) { + Toast.notify({ + type: 'error', + message: errorMsg, + }) + return + } + const postData = { + category: 'website', + provider: 'watercrawl', + credentials: { + auth_type: 'x-api-key', + config: { + api_key: config.api_key, + base_url: config.base_url || DEFAULT_BASE_URL, + }, + }, + } + try { + setIsSaving(true) + await createDataSourceApiKeyBinding(postData) + Toast.notify({ + type: 'success', + message: t('common.api.success'), + }) + } + finally { + setIsSaving(false) + } + + onSaved() + }, [config.api_key, config.base_url, onSaved, t, isSaving]) + + return ( + + +
+
+
+
+
{t(`${I18N_PREFIX}.configWatercrawl`)}
+
+ +
+ + +
+
+ + {t(`${I18N_PREFIX}.getApiKeyLinkText`)} + + +
+ + +
+ +
+
+
+
+ + {t('common.modelProvider.encrypted.front')} + + PKCS1_OAEP + + {t('common.modelProvider.encrypted.back')} +
+
+
+
+
+
+ ) +} +export default React.memo(ConfigWatercrawlModal) diff --git a/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx b/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx index d87fd4396e..ae410bf7b3 100644 --- a/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx @@ -5,19 +5,15 @@ import { useTranslation } from 'react-i18next' import Panel from '../panel' import { DataSourceType } from '../panel/types' import ConfigFirecrawlModal from './config-firecrawl-modal' +import ConfigWatercrawlModal from './config-watercrawl-modal' import ConfigJinaReaderModal from './config-jina-reader-modal' import cn from '@/utils/classnames' import s from '@/app/components/datasets/create/website/index.module.css' import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets' -import type { - DataSourceItem, -} from '@/models/common' +import type { DataSourceItem } from '@/models/common' +import { DataSourceProvider } from '@/models/common' import { useAppContext } from '@/context/app-context' - -import { - DataSourceProvider, -} from '@/models/common' import Toast from '@/app/components/base/toast' type Props = { @@ -58,6 +54,16 @@ const DataSourceWebsite: FC = ({ provider }) => { return source?.id } + const getProviderName = (provider: DataSourceProvider): string => { + if (provider === DataSourceProvider.fireCrawl) + return 'Firecrawl' + + if (provider === DataSourceProvider.waterCrawl) + return 'WaterCrawl' + + return 'Jina Reader' + } + const handleRemove = useCallback((provider: DataSourceProvider) => { return async () => { const dataSourceId = getIdByProvider(provider) @@ -82,27 +88,42 @@ const DataSourceWebsite: FC = ({ provider }) => { readOnly={!isCurrentWorkspaceManager} configuredList={sources.filter(item => item.provider === provider).map(item => ({ id: item.id, - logo: ({ className }: { className: string }) => ( - item.provider === DataSourceProvider.fireCrawl - ? ( -
🔥
+ logo: ({ className }: { className: string }) => { + if (item.provider === DataSourceProvider.fireCrawl) { + return ( +
🔥
) - : ( -
- + } + + if (item.provider === DataSourceProvider.waterCrawl) { + return ( +
+
) - ), - name: item.provider === DataSourceProvider.fireCrawl ? 'Firecrawl' : 'Jina Reader', + } + return ( +
+ +
+ ) + }, + name: getProviderName(item.provider), isActive: true, }))} onRemove={handleRemove(provider)} /> {configTarget === DataSourceProvider.fireCrawl && ( - + + )} + {configTarget === DataSourceProvider.waterCrawl && ( + )} {configTarget === DataSourceProvider.jinaReader && ( - + )} diff --git a/web/app/components/header/account-setting/data-source-page/index.tsx b/web/app/components/header/account-setting/data-source-page/index.tsx index 93dc2db854..78eeeeac5b 100644 --- a/web/app/components/header/account-setting/data-source-page/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/index.tsx @@ -15,6 +15,7 @@ export default function DataSourcePage() { +
) } diff --git a/web/app/components/header/account-setting/data-source-page/panel/index.tsx b/web/app/components/header/account-setting/data-source-page/panel/index.tsx index 0cd2d26029..d0da485ebf 100644 --- a/web/app/components/header/account-setting/data-source-page/panel/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/panel/index.tsx @@ -41,6 +41,12 @@ const Panel: FC = ({ const isNotion = type === DataSourceType.notion const isWebsite = type === DataSourceType.website + const getProviderName = (): string => { + if (provider === DataSourceProvider.fireCrawl) return '🔥 Firecrawl' + if (provider === DataSourceProvider.waterCrawl) return 'WaterCrawl' + return 'Jina Reader' + } + return (
@@ -50,7 +56,7 @@ const Panel: FC = ({
{t(`common.dataSource.${type}.title`)}
{isWebsite && (
- {t('common.dataSource.website.with')} { provider === DataSourceProvider.fireCrawl ? '🔥 Firecrawl' : 'Jina Reader'} + {t('common.dataSource.website.with')} {getProviderName()}
)}
diff --git a/web/i18n/en-US/dataset-creation.ts b/web/i18n/en-US/dataset-creation.ts index 72eb44c3de..c55a939ea3 100644 --- a/web/i18n/en-US/dataset-creation.ts +++ b/web/i18n/en-US/dataset-creation.ts @@ -15,6 +15,11 @@ const translation = { apiKeyPlaceholder: 'API key from firecrawl.dev', getApiKeyLinkText: 'Get your API key from firecrawl.dev', }, + watercrawl: { + configWatercrawl: 'Configure Watercrawl', + apiKeyPlaceholder: 'API key from watercrawl.dev', + getApiKeyLinkText: 'Get your API key from watercrawl.dev', + }, jinaReader: { configJinaReader: 'Configure Jina Reader', apiKeyPlaceholder: 'API key from jina.ai', @@ -64,15 +69,21 @@ const translation = { chooseProvider: 'Select a provider', fireCrawlNotConfigured: 'Firecrawl is not configured', fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.', + watercrawlNotConfigured: 'Watercrawl is not configured', + watercrawlNotConfiguredDescription: 'Configure Watercrawl with API key to use it.', jinaReaderNotConfigured: 'Jina Reader is not configured', jinaReaderNotConfiguredDescription: 'Set up Jina Reader by entering your free API key for access.', configure: 'Configure', configureFirecrawl: 'Configure Firecrawl', + configureWatercrawl: 'Configure Watercrawl', configureJinaReader: 'Configure Jina Reader', run: 'Run', firecrawlTitle: 'Extract web content with 🔥Firecrawl', firecrawlDoc: 'Firecrawl docs', firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website', + watercrawlTitle: 'Extract web content with Watercrawl', + watercrawlDoc: 'Watercrawl docs', + watercrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website', jinaReaderTitle: 'Convert the entire site to Markdown', jinaReaderDoc: 'Learn more about Jina Reader', jinaReaderDocLink: 'https://jina.ai/reader', diff --git a/web/models/common.ts b/web/models/common.ts index 4086220e2e..0ee164aad8 100644 --- a/web/models/common.ts +++ b/web/models/common.ts @@ -178,6 +178,7 @@ export enum DataSourceCategory { export enum DataSourceProvider { fireCrawl = 'firecrawl', jinaReader = 'jinareader', + waterCrawl = 'watercrawl', } export type FirecrawlConfig = { @@ -185,6 +186,11 @@ export type FirecrawlConfig = { base_url: string } +export type WatercrawlConfig = { + api_key: string + base_url: string +} + export type DataSourceItem = { id: string category: DataSourceCategory diff --git a/web/service/datasets.ts b/web/service/datasets.ts index 53b55b375b..f9edb2eeaf 100644 --- a/web/service/datasets.ts +++ b/web/service/datasets.ts @@ -253,6 +253,25 @@ export const checkJinaReaderTaskStatus: Fetcher = (jobId }) } +export const createWatercrawlTask: Fetcher> = (body) => { + return post('website/crawl', { + body: { + ...body, + provider: DataSourceProvider.waterCrawl, + }, + }) +} + +export const checkWatercrawlTaskStatus: Fetcher = (jobId: string) => { + return get(`website/crawl/status/${jobId}`, { + params: { + provider: DataSourceProvider.waterCrawl, + }, + }, { + silent: true, + }) +} + type FileTypesRes = { allowed_extensions: string[] } From e563a4ae2098c67d36f0fb55374c83d0c0edf8d6 Mon Sep 17 00:00:00 2001 From: Amir Mohsen Date: Fri, 21 Mar 2025 00:30:34 +0100 Subject: [PATCH 2/2] feat: Integrate WaterCrawl.dev as a new knowledge base provider Add WaterCrawl.dev as an alternative provider for website crawling in datasets/knowledge base alongside Firecrawl and Jina Reader. 
This integration enhances the data source options for knowledge bases, allowing users to configure and use WaterCrawl for their website content extraction needs. Resolved #15950 --- api/core/rag/extractor/watercrawl/client.py | 47 +++++++++++-------- api/core/rag/extractor/watercrawl/provider.py | 13 +++-- 2 files changed, 35 insertions(+), 25 deletions(-) diff --git a/api/core/rag/extractor/watercrawl/client.py b/api/core/rag/extractor/watercrawl/client.py index 553d5476e2..eff9cf0672 100644 --- a/api/core/rag/extractor/watercrawl/client.py +++ b/api/core/rag/extractor/watercrawl/client.py @@ -1,5 +1,6 @@ import json -from typing import Union, Generator +from collections.abc import Generator +from typing import Union from urllib.parse import urljoin import requests @@ -21,35 +22,35 @@ class BaseAPIClient: session.headers.update({'Accept-Language': 'en-US'}) return session - def _get(self, endpoint: str, query_params: dict = None, **kwargs): + def _get(self, endpoint: str, query_params: dict | None = None, **kwargs): return self.session.get( urljoin(self.base_url, endpoint), params=query_params, **kwargs ) - def _post(self, endpoint: str, query_params: dict = None, data: dict = None, **kwargs): + def _post(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs): return self.session.post( urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs ) - def _put(self, endpoint: str, query_params: dict = None, data: dict = None, **kwargs): + def _put(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs): return self.session.put( urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs ) - def _delete(self, endpoint: str, query_params: dict = None, **kwargs): + def _delete(self, endpoint: str, query_params: dict | None = None, **kwargs): return self.session.delete( urljoin(self.base_url, endpoint), params=query_params, **kwargs ) - def _patch(self, endpoint: str, query_params: dict = None, data: dict = None, **kwargs): + def _patch(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs): return self.session.patch( urljoin(self.base_url, endpoint), params=query_params, @@ -58,7 +59,7 @@ class BaseAPIClient: class WaterCrawlAPIClient(BaseAPIClient): - def __init__(self, api_key, base_url: str = 'https://app.watercrawl.dev/'): + def __init__(self, api_key, base_url: str | None = 'https://app.watercrawl.dev/'): super().__init__(api_key, base_url) def process_eventstream(self, response: Response, download: bool = False): @@ -86,7 +87,7 @@ class WaterCrawlAPIClient(BaseAPIClient): raise Exception(f'Unknown response type: {response.headers.get("Content-Type")}') - def get_crawl_requests_list(self, page: int = None, page_size: int = None): + def get_crawl_requests_list(self, page: int | None = None, page_size: int | None = None): query_params = { 'page': page or 1, 'page_size': page_size or 10 @@ -107,10 +108,10 @@ class WaterCrawlAPIClient(BaseAPIClient): def create_crawl_request( self, - url: Union[list, str] = None, - spider_options: dict = None, - page_options: dict = None, - plugin_options: dict = None + url: Union[list, str] | None = None, + spider_options: dict | None = None, + page_options: dict | None = None, + plugin_options: dict | None = None ): data = { # 'urls': url if isinstance(url, list) else [url], @@ -154,7 +155,13 @@ class WaterCrawlAPIClient(BaseAPIClient): ), ) - def get_crawl_request_results(self, item_id: str, page: int = 1, page_size: int = 
25, query_params: dict = None): + def get_crawl_request_results( + self, + item_id: str, + page: int = 1, + page_size: int = 25, + query_params: dict | None = None + ): query_params = query_params or {} query_params.update({ 'page': page or 1, @@ -169,22 +176,22 @@ class WaterCrawlAPIClient(BaseAPIClient): def scrape_url(self, url: str, - page_options: dict = None, - plugin_options: dict = None, + page_options: dict | None = None, + plugin_options: dict | None = None, sync: bool = True, prefetched: bool = True ): - result = self.create_crawl_request( + response_result = self.create_crawl_request( url=url, page_options=page_options, plugin_options=plugin_options ) if not sync: - return result + return response_result - for result in self.monitor_crawl_request(result['uuid'], prefetched): - if result['type'] == 'result': - return result['data'] + for event_data in self.monitor_crawl_request(response_result['uuid'], prefetched): + if event_data['type'] == 'result': + return event_data['data'] def download_result(self, result_object: dict): response = requests.get(result_object['result']) diff --git a/api/core/rag/extractor/watercrawl/provider.py b/api/core/rag/extractor/watercrawl/provider.py index 31727e3986..9dce19bb9e 100644 --- a/api/core/rag/extractor/watercrawl/provider.py +++ b/api/core/rag/extractor/watercrawl/provider.py @@ -1,10 +1,10 @@ -from datetime import datetime, timezone +from datetime import datetime from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient class WaterCrawlProvider: - def __init__(self, api_key, base_url: str = None): + def __init__(self, api_key, base_url: str | None = None): self.client = WaterCrawlAPIClient(api_key, base_url) def crawl_url(self, url, options): @@ -25,7 +25,7 @@ class WaterCrawlProvider: page_options = { "exclude_tags": options.get("exclude_tags", '').split(",") if options.get("exclude_tags") else [], "include_tags": options.get("include_tags", '').split(",") if options.get("include_tags") else [], - "wait_time": wait_time if wait_time > 1000 else 1000, # minimum wait time is 1 second + "wait_time": max(1000, wait_time), # minimum wait time is 1 second "include_html": False, "only_main_content": options.get("only_main_content", True), "include_links": False, @@ -55,7 +55,10 @@ class WaterCrawlProvider: time_consuming = 0 if time_str: time_obj = datetime.strptime(time_str, "%H:%M:%S.%f") - time_consuming = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000 + time_consuming = (time_obj.hour * 3600 + + time_obj.minute * 60 + + time_obj.second + + time_obj.microsecond / 1_000_000) return { "status": status, @@ -98,7 +101,7 @@ class WaterCrawlProvider: "markdown": result_object.get('result').get("markdown"), } - def _get_results(self, crawl_request_id: str, query_params: dict = None): + def _get_results(self, crawl_request_id: str, query_params: dict | None = None): page = 0 page_size = 100
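
# Illustrative sketch only (assumptions: a valid WaterCrawl API key and the default
# cloud endpoint; the key, URL and option values below are placeholders). It shows how
# the WaterCrawlProvider introduced by this patch is meant to be driven end to end:
# start a crawl, poll its status, then read back the structured results.
import time

from core.rag.extractor.watercrawl.provider import WaterCrawlProvider

# Placeholder credentials; in Dify these come from the encrypted data-source binding.
provider = WaterCrawlProvider(api_key="wc-XXXX", base_url="https://app.watercrawl.dev")

# Start a crawl; the option keys mirror the knowledge-base crawl form fields
# consumed by WaterCrawlProvider.crawl_url().
job = provider.crawl_url(
    "https://example.com",
    {"crawl_sub_pages": True, "limit": 10, "depth": 2, "only_main_content": True},
)

# Poll until the crawl request is no longer reported as active.
status = provider.get_crawl_status(job["job_id"])
while status["status"] == "active":
    time.sleep(5)
    status = provider.get_crawl_status(job["job_id"])

# Each entry is the structured form produced by _structure_data():
# title, description, source_url and markdown.
for page in status["data"]:
    print(page["source_url"], page["title"])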