feat: Integrate WaterCrawl.dev as a new knowledge base provider

Add WaterCrawl.dev as an alternative provider for website crawling in datasets/knowledge bases, alongside Firecrawl and Jina Reader. The integration expands the data source options for knowledge bases, letting users configure WaterCrawl for their website content extraction needs. Resolves #15950
Amir Mohsen 2025-03-21 00:18:18 +01:00
parent 72191f5b13
commit 29a4dec387
24 changed files with 1137 additions and 56 deletions

View File

@@ -14,7 +14,8 @@ class WebsiteCrawlApi(Resource):
def post(self):
parser = reqparse.RequestParser()
parser.add_argument(
"provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json"
"provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"],
required=True, nullable=True, location="json"
)
parser.add_argument("url", type=str, required=True, nullable=True, location="json")
parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
@@ -34,7 +35,8 @@ class WebsiteCrawlStatusApi(Resource):
@account_initialization_required
def get(self, job_id: str):
parser = reqparse.RequestParser()
parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args")
parser.add_argument("provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"],
required=True, location="args")
args = parser.parse_args()
# get crawl status
try:
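
For reference, a crawl request against this endpoint with the new provider value might look as follows. This is a minimal sketch: the console host, access token, and option values are illustrative, and the '/console/api' prefix is assumed from typical deployments rather than shown in this diff.

import requests

# Illustrative values; the console API host and access token depend on
# your Dify deployment.
response = requests.post(
    "http://localhost:5001/console/api/website/crawl",
    headers={"Authorization": "Bearer <console-access-token>"},
    json={
        "provider": "watercrawl",
        "url": "https://example.com",
        "options": {"crawl_sub_pages": True, "limit": 10, "only_main_content": True},
    },
)
print(response.json())  # expected shape: {"status": "active", "job_id": "<uuid>"}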

View File

@@ -25,6 +25,7 @@ from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor
from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor
from core.rag.extractor.word_extractor import WordExtractor
from core.rag.models.document import Document
from extensions.ext_storage import storage
@@ -40,7 +41,7 @@ USER_AGENT = (
class ExtractProcessor:
@classmethod
def load_from_upload_file(
cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False
cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False
) -> Union[list[Document], str]:
extract_setting = ExtractSetting(
datasource_type="upload_file", upload_file=upload_file, document_model="text_model"
@@ -180,6 +181,15 @@ class ExtractProcessor:
only_main_content=extract_setting.website_info.only_main_content,
)
return extractor.extract()
elif extract_setting.website_info.provider == "watercrawl":
extractor = WaterCrawlWebExtractor(
url=extract_setting.website_info.url,
job_id=extract_setting.website_info.job_id,
tenant_id=extract_setting.website_info.tenant_id,
mode=extract_setting.website_info.mode,
only_main_content=extract_setting.website_info.only_main_content,
)
return extractor.extract()
elif extract_setting.website_info.provider == "jinareader":
extractor = JinaReaderWebExtractor(
url=extract_setting.website_info.url,

View File

@@ -0,0 +1,193 @@
import json
from typing import Union, Generator
from urllib.parse import urljoin
import requests
from requests import Response
class BaseAPIClient:
def __init__(self, api_key, base_url):
self.api_key = api_key
self.base_url = base_url
self.session = self.init_session()
def init_session(self):
session = requests.Session()
session.headers.update({'X-API-Key': self.api_key})
session.headers.update({'Content-Type': 'application/json'})
session.headers.update({'Accept': 'application/json'})
session.headers.update({'User-Agent': 'WaterCrawl-Plugin'})
session.headers.update({'Accept-Language': 'en-US'})
return session
def _get(self, endpoint: str, query_params: dict = None, **kwargs):
return self.session.get(
urljoin(self.base_url, endpoint),
params=query_params,
**kwargs
)
def _post(self, endpoint: str, query_params: dict = None, data: dict = None, **kwargs):
return self.session.post(
urljoin(self.base_url, endpoint),
params=query_params,
json=data, **kwargs
)
def _put(self, endpoint: str, query_params: dict = None, data: dict = None, **kwargs):
return self.session.put(
urljoin(self.base_url, endpoint),
params=query_params,
json=data, **kwargs
)
def _delete(self, endpoint: str, query_params: dict = None, **kwargs):
return self.session.delete(
urljoin(self.base_url, endpoint),
params=query_params,
**kwargs
)
def _patch(self, endpoint: str, query_params: dict = None, data: dict = None, **kwargs):
return self.session.patch(
urljoin(self.base_url, endpoint),
params=query_params,
json=data, **kwargs
)
class WaterCrawlAPIClient(BaseAPIClient):
def __init__(self, api_key, base_url: str = 'https://app.watercrawl.dev/'):
super().__init__(api_key, base_url)
def process_eventstream(self, response: Response, download: bool = False):
for line in response.iter_lines():
line = line.decode('utf-8')
if line.startswith('data:'):
line = line[5:].strip()
data = json.loads(line)
if data['type'] == 'result' and download:
data['data'] = self.download_result(data['data'])
yield data
def process_response(self, response: Response) -> Union[dict, bytes, list, None, Generator]:
response.raise_for_status()
if response.status_code == 204:
return None
if response.headers.get('Content-Type') == 'application/json':
return response.json()
if response.headers.get('Content-Type') == 'application/octet-stream':
return response.content
if response.headers.get('Content-Type') == 'text/event-stream':
return self.process_eventstream(response)
raise Exception(f'Unknown response type: {response.headers.get("Content-Type")}')
def get_crawl_requests_list(self, page: int = None, page_size: int = None):
query_params = {
'page': page or 1,
'page_size': page_size or 10
}
return self.process_response(
self._get(
'/api/v1/core/crawl-requests/',
query_params=query_params,
)
)
def get_crawl_request(self, item_id: str):
return self.process_response(
self._get(
f'/api/v1/core/crawl-requests/{item_id}/',
)
)
def create_crawl_request(
self,
url: Union[list, str] = None,
spider_options: dict = None,
page_options: dict = None,
plugin_options: dict = None
):
data = {
# 'urls': url if isinstance(url, list) else [url],
'url': url,
'options': {
'spider_options': spider_options or {},
'page_options': page_options or {},
'plugin_options': plugin_options or {},
}
}
return self.process_response(
self._post(
'/api/v1/core/crawl-requests/',
data=data,
)
)
def stop_crawl_request(self, item_id: str):
return self.process_response(
self._delete(
f'/api/v1/core/crawl-requests/{item_id}/',
)
)
def download_crawl_request(self, item_id: str):
return self.process_response(
self._get(
f'/api/v1/core/crawl-requests/{item_id}/download/',
)
)
def monitor_crawl_request(self, item_id: str, prefetched=False) -> Generator:
query_params = {
'prefetched': str(prefetched).lower()
}
return self.process_response(
self._get(
f'/api/v1/core/crawl-requests/{item_id}/status/',
stream=True,
query_params=query_params
),
)
def get_crawl_request_results(self, item_id: str, page: int = 1, page_size: int = 25, query_params: dict = None):
query_params = query_params or {}
query_params.update({
'page': page or 1,
'page_size': page_size or 25
})
return self.process_response(
self._get(
f'/api/v1/core/crawl-requests/{item_id}/results/',
query_params=query_params
)
)
def scrape_url(self,
url: str,
page_options: dict = None,
plugin_options: dict = None,
sync: bool = True,
prefetched: bool = True
):
result = self.create_crawl_request(
url=url,
page_options=page_options,
plugin_options=plugin_options
)
if not sync:
return result
for result in self.monitor_crawl_request(result['uuid'], prefetched):
if result['type'] == 'result':
return result['data']
def download_result(self, result_object: dict):
response = requests.get(result_object['result'])
response.raise_for_status()
result_object['result'] = response.json()
return result_object
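
The client pairs a blocking scrape with an event-stream monitor for long-running crawls. A minimal usage sketch of the classes above, assuming a valid API key (the key and URLs below are placeholders):

client = WaterCrawlAPIClient(api_key="wc-...")  # placeholder key

# Blocking scrape: creates a crawl request, then consumes the status
# event stream until a 'result' event arrives.
page = client.scrape_url("https://example.com")
# With prefetched=True the page payload is embedded in the result object.
print(page["result"]["markdown"])

# Non-blocking variant: create the request, then monitor it separately.
request = client.create_crawl_request(
    url="https://example.com",
    spider_options={"page_limit": 5, "max_depth": 1},
)
for event in client.monitor_crawl_request(request["uuid"], prefetched=True):
    if event["type"] == "result":
        print(event["data"]["url"])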

View File

@@ -0,0 +1,57 @@
from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService
class WaterCrawlWebExtractor(BaseExtractor):
"""
Crawl and scrape websites and return the content as clean, LLM-ready Markdown.
Args:
url: The URL to scrape.
api_key: The API key for WaterCrawl.
base_url: The base URL for the WaterCrawl API. Defaults to 'https://app.watercrawl.dev'.
mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
"""
def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
"""Initialize with url, api_key, base_url and mode."""
self._url = url
self.job_id = job_id
self.tenant_id = tenant_id
self.mode = mode
self.only_main_content = only_main_content
def extract(self) -> list[Document]:
"""Extract content from the URL."""
documents = []
if self.mode == "crawl":
crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "watercrawl", self._url, self.tenant_id)
if crawl_data is None:
return []
document = Document(
page_content=crawl_data.get("markdown", ""),
metadata={
"source_url": crawl_data.get("source_url"),
"description": crawl_data.get("description"),
"title": crawl_data.get("title"),
},
)
documents.append(document)
elif self.mode == "scrape":
scrape_data = WebsiteService.get_scrape_url_data(
"watercrawl", self._url, self.tenant_id, self.only_main_content
)
document = Document(
page_content=scrape_data.get("markdown", ""),
metadata={
"source_url": scrape_data.get("source_url"),
"description": scrape_data.get("description"),
"title": scrape_data.get("title"),
},
)
documents.append(document)
return documents
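
Like the other web extractors, WaterCrawlWebExtractor is driven with a job id and tenant id resolved by the dataset workflow, and in crawl mode it assumes WaterCrawl credentials are already configured for the tenant. A sketch (all identifiers are placeholders):

extractor = WaterCrawlWebExtractor(
    url="https://example.com",
    job_id="<job-uuid>",      # placeholder: id returned when the crawl was started
    tenant_id="<tenant-id>",  # placeholder: workspace the credentials belong to
    mode="crawl",
)
for document in extractor.extract():
    print(document.metadata["source_url"], len(document.page_content))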

View File

@@ -0,0 +1,119 @@
from datetime import datetime, timezone
from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient
class WaterCrawlProvider:
def __init__(self, api_key, base_url: str = None):
self.client = WaterCrawlAPIClient(api_key, base_url)
def crawl_url(self, url, options):
spider_options = {
"max_depth": 1,
"page_limit": 1,
"allowed_domains": [],
"exclude_paths": [],
"include_paths": []
}
if options.get("crawl_sub_pages", True):
spider_options["page_limit"] = options.get("limit", 1)
spider_options["max_depth"] = options.get("depth", 1)
spider_options["include_paths"] = options.get("includes", '').split(",") if options.get("includes") else []
spider_options["exclude_paths"] = options.get("excludes", '').split(",") if options.get("excludes") else []
wait_time = options.get("wait_time", 1000)
page_options = {
"exclude_tags": options.get("exclude_tags", '').split(",") if options.get("exclude_tags") else [],
"include_tags": options.get("include_tags", '').split(",") if options.get("include_tags") else [],
"wait_time": wait_time if wait_time > 1000 else 1000, # minimum wait time is 1 second
"include_html": False,
"only_main_content": options.get("only_main_content", True),
"include_links": False,
"timeout": 15000,
"accept_cookies_selector": "#cookies-accept",
"locale": "en-US",
"actions": []
}
result = self.client.create_crawl_request(
url=url,
spider_options=spider_options,
page_options=page_options
)
return {"status": "active", "job_id": result.get("uuid")}
def get_crawl_status(self, crawl_request_id):
response = self.client.get_crawl_request(crawl_request_id)
data = []
if response['status'] in ['new', 'running']:
status = 'active'
else:
status = 'completed'
data = list(self._get_results(crawl_request_id))
time_str = response.get('duration')
time_consuming = 0
if time_str:
time_obj = datetime.strptime(time_str, "%H:%M:%S.%f")
time_consuming = time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000
return {
"status": status,
"job_id": response.get("uuid"),
"total": response.get("options", {}).get('spider_options', {}).get("page_limit", 1),
"current": response.get("number_of_documents", 0),
"data": data,
"time_consuming": time_consuming
}
def get_crawl_url_data(self, job_id, url):
if not job_id:
return self.scrape_url(url)
for result in self._get_results(job_id, {
# filter by url
'url': url
}):
return result
return None
def scrape_url(self, url):
response = self.client.scrape_url(
url=url,
sync=True,
prefetched=True
)
return self._structure_data(response)
def _structure_data(self, result_object: dict):
if isinstance(result_object.get("result", {}), str):
raise ValueError("Invalid result object. Expected a dictionary.")
metadata = result_object.get("result", {}).get("metadata", {})
return {
"title": metadata.get("og:title") or metadata.get("title"),
"description": metadata.get("description"),
"source_url": result_object.get("url"),
"markdown": result_object.get('result').get("markdown"),
}
def _get_results(self, crawl_request_id: str, query_params: dict = None):
page = 0
page_size = 100
query_params = query_params or {}
query_params.update({
'prefetched': "true"
})
while True:
page += 1
response = self.client.get_crawl_request_results(crawl_request_id, page, page_size, query_params)
if not response['results']:
break
for result in response['results']:
yield self._structure_data(result)
if response['next'] is None:
break
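
Putting the provider together: start a crawl, poll until it leaves the 'active' state, then read the structured results. A sketch with placeholder values, following the response shapes defined above:

import time

provider = WaterCrawlProvider(api_key="wc-...")  # placeholder key

job = provider.crawl_url("https://example.com", {"crawl_sub_pages": True, "limit": 5, "depth": 2})
status = provider.get_crawl_status(job["job_id"])
while status["status"] == "active":
    time.sleep(2)
    status = provider.get_crawl_status(job["job_id"])

for item in status["data"]:  # each item: {title, description, source_url, markdown}
    print(item["title"], item["source_url"])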

View File

@@ -17,6 +17,10 @@ class ApiKeyAuthFactory:
from services.auth.firecrawl.firecrawl import FirecrawlAuth
return FirecrawlAuth
case AuthType.WATERCRAWL:
from services.auth.watercrawl.watercrawl import WatercrawlAuth
return WatercrawlAuth
case AuthType.JINA:
from services.auth.jina.jina import JinaAuth

View File

@@ -3,4 +3,5 @@ from enum import StrEnum
class AuthType(StrEnum):
FIRECRAWL = "firecrawl"
WATERCRAWL = "watercrawl"
JINA = "jinareader"

View File

View File

@@ -0,0 +1,44 @@
import json
from urllib.parse import urljoin
import requests
from services.auth.api_key_auth_base import ApiKeyAuthBase
class WatercrawlAuth(ApiKeyAuthBase):
def __init__(self, credentials: dict):
super().__init__(credentials)
auth_type = credentials.get("auth_type")
if auth_type != "x-api-key":
raise ValueError("Invalid auth type, WaterCrawl auth type must be x-api-key")
self.api_key = credentials.get("config", {}).get("api_key", None)
self.base_url = credentials.get("config", {}).get("base_url", "https://app.watercrawl.dev")
if not self.api_key:
raise ValueError("No API key provided")
def validate_credentials(self):
headers = self._prepare_headers()
url = urljoin(self.base_url, "/api/v1/core/crawl-requests/")
response = self._get_request(url, headers)
if response.status_code == 200:
return True
else:
self._handle_error(response)
def _prepare_headers(self):
return {"Content-Type": "application/json", "X-API-KEY": self.api_key}
def _get_request(self, url, headers):
return requests.get(url, headers=headers)
def _handle_error(self, response):
if response.status_code in {402, 409, 500}:
error_message = response.json().get("error", "Unknown error occurred")
raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
else:
if response.text:
error_message = json.loads(response.text).get("error", "Unknown error occurred")
raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}")

View File

@@ -7,6 +7,7 @@ from flask_login import current_user # type: ignore
from core.helper import encrypter
from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
from core.rag.extractor.watercrawl.provider import WaterCrawlProvider
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from services.auth.api_key_auth_service import ApiKeyAuthService
@@ -59,6 +60,16 @@ class WebsiteService:
time = str(datetime.datetime.now().timestamp())
redis_client.setex(website_crawl_time_cache_key, 3600, time)
return {"status": "active", "job_id": job_id}
elif provider == "watercrawl":
# decrypt api_key
api_key = encrypter.decrypt_token(
tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
)
return WaterCrawlProvider(
api_key,
credentials.get("config").get("base_url", None)
).crawl_url(url, options)
elif provider == "jinareader":
api_key = encrypter.decrypt_token(
tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
@@ -116,6 +127,15 @@ class WebsiteService:
time_consuming = abs(end_time - float(start_time))
crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
redis_client.delete(website_crawl_time_cache_key)
elif provider == "watercrawl":
# decrypt api_key
api_key = encrypter.decrypt_token(
tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
)
crawl_status_data = WaterCrawlProvider(
api_key,
credentials.get("config").get("base_url", None)
).get_crawl_status(job_id)
elif provider == "jinareader":
api_key = encrypter.decrypt_token(
tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
@@ -180,6 +200,12 @@ class WebsiteService:
if item.get("source_url") == url:
return dict(item)
return None
elif provider == "watercrawl":
api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
return WaterCrawlProvider(
api_key,
credentials.get("config").get("base_url", None)
).get_crawl_url_data(job_id, url)
elif provider == "jinareader":
if not job_id:
response = requests.get(
@@ -223,5 +249,11 @@ class WebsiteService:
params = {"onlyMainContent": only_main_content}
result = firecrawl_app.scrape_url(url, params)
return result
elif provider == "watercrawl":
api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
return WaterCrawlProvider(
api_key,
credentials.get("config").get("base_url", None)
).scrape_url(url)
else:
raise ValueError("Invalid provider")

View File

@@ -0,0 +1,20 @@
<?xml version="1.0" encoding="utf-8"?>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 500 500">
<path style="fill: rgb(0, 23, 87); stroke: rgb(13, 14, 52);" d="M 247.794 213.903 L 246.81 76.976 L 254.345 76.963 L 254.592 213.989 L 247.794 213.903 Z"/>
<ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="250.025" cy="43.859" rx="33.966" ry="33.906"/>
<path style="fill: rgb(30, 141, 166); stroke: rgb(30, 141, 166);" d="M 282.472 260.389 L 414.181 330.253 L 410.563 336.234 L 279.38 265.739 L 282.472 260.389 Z"/>
<path style="fill: rgb(15, 17, 57); stroke: rgb(13, 14, 52);" d="M 255.105 281.394 L 254.485 417.656 L 246.156 417.691 L 246.688 280.51 L 255.105 281.394 Z"/>
<path style="paint-order: fill; fill: rgb(30, 141, 166); stroke: rgb(30, 141, 166);" d="M 279.486 229.517 L 410.351 160.07 L 413.923 167.04 L 283.727 235.998 L 279.486 229.517 Z"/>
<path style="fill: rgb(15, 164, 161); stroke: rgb(15, 164, 161);" d="M 88.545 164.884 L 219.797 236.07 L 222.867 229.568 L 90.887 159.47 L 88.545 164.884 Z"/>
<path style="fill: rgb(15, 164, 161); stroke: rgb(15, 164, 161);" d="M 224.76 266.9 L 95.55 334.829 L 92.878 328.37 L 219.955 261.275 L 224.76 266.9 Z"/>
<ellipse style="paint-order: fill; fill: rgb(2, 181, 225); stroke: rgb(2, 181, 225);" cx="251.242" cy="247.466" rx="33.966" ry="33.906"/>
<path style="fill: rgb(13, 14, 52); stroke: rgb(13, 14, 52);" d="M 279.502 433.617 L 408.666 359.443 C 408.666 359.443 412.398 366.965 412.398 366.916 C 412.398 366.867 281.544 440.217 281.544 440.217 L 279.502 433.617 Z"/>
<path style="fill: rgb(13, 14, 52); stroke: rgb(13, 14, 52);" d="M 223.119 431.408 L 96.643 361.068 L 93.265 368.047 L 218.895 438.099 L 223.119 431.408 Z"/>
<ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="250.504" cy="451.168" rx="33.966" ry="33.906"/>
<path style="fill: rgb(90, 191, 187); stroke: rgb(90, 191, 187);" d="M 435.665 180.895 L 435.859 316.869 L 443.103 315.579 L 442.56 180.697 L 435.665 180.895 Z"/>
<ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="441.06" cy="349.665" rx="33.966" ry="33.906"/>
<ellipse style="fill: rgb(2, 181, 225); stroke: rgb(2, 181, 225);" cx="441.512" cy="147.767" rx="33.966" ry="33.906"/>
<path style="fill: rgb(84, 187, 181); stroke: rgb(84, 187, 181);" d="M 64.755 314.523 L 57.928 315.006 L 58.307 182.961 L 65.169 182.865 L 64.755 314.523 Z"/>
<ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="58.177" cy="149.757" rx="33.966" ry="33.906"/>
<ellipse style="fill: rgb(61, 224, 203); stroke: rgb(61, 224, 203);" cx="65.909" cy="348.17" rx="33.966" ry="33.906"/>
</svg>


View File

@@ -4,3 +4,10 @@
background-image: url(../assets/jina.png);
background-size: 16px;
}
.watercrawlLogo {
@apply w-5 h-5 bg-center bg-no-repeat inline-block;
/*background-color: #F5FAFF;*/
background-image: url(../assets/watercrawl.svg);
background-size: 16px;
}

View File

@@ -5,6 +5,7 @@ import { useTranslation } from 'react-i18next'
import s from './index.module.css'
import NoData from './no-data'
import Firecrawl from './firecrawl'
import Watercrawl from './watercrawl'
import JinaReader from './jina-reader'
import cn from '@/utils/classnames'
import { useModalContext } from '@/context/modal-context'
@@ -47,7 +48,11 @@ const Website: FC<Props> = ({
// If users have configured one of the providers, select it.
const availableProviders = res.sources.filter((item: DataSourceItem) =>
[DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider),
[
DataSourceProvider.jinaReader,
DataSourceProvider.fireCrawl,
DataSourceProvider.waterCrawl,
].includes(item.provider),
)
if (availableProviders.length > 0)
@@ -70,6 +75,8 @@ const Website: FC<Props> = ({
if (!isLoaded)
return null
const source = sources.find(source => source.provider === selectedProvider)
return (
<div>
<div className="mb-4">
@@ -86,7 +93,7 @@ const Website: FC<Props> = ({
)}
onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)}
>
<span className={cn(s.jinaLogo, 'mr-2')} />
<span className={cn(s.jinaLogo, 'mr-2')}/>
<span>Jina Reader</span>
</button>
<button
@@ -100,40 +107,52 @@
>
🔥 Firecrawl
</button>
<button
className={`px-4 py-2 text-sm font-medium rounded-md flex items-center justify-center ${
selectedProvider === DataSourceProvider.waterCrawl
? 'bg-primary-50 text-primary-600'
: 'bg-gray-100 text-gray-600 hover:bg-gray-200'
}`}
onClick={() => setSelectedProvider(DataSourceProvider.waterCrawl)}
>
<span className={cn(s.watercrawlLogo, 'mr-2')}/>
<span>WaterCrawl</span>
</button>
</div>
</div>
{
selectedProvider === DataSourceProvider.fireCrawl
? sources.find(source => source.provider === DataSourceProvider.fireCrawl)
? (
<Firecrawl
onPreview={onPreview}
checkedCrawlResult={checkedCrawlResult}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
)
: (
<NoData onConfig={handleOnConfig} provider={selectedProvider} />
)
: sources.find(source => source.provider === DataSourceProvider.jinaReader)
? (
<JinaReader
onPreview={onPreview}
checkedCrawlResult={checkedCrawlResult}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
)
: (
<NoData onConfig={handleOnConfig} provider={selectedProvider} />
)
}
{source && selectedProvider === DataSourceProvider.fireCrawl && (
<Firecrawl
onPreview={onPreview}
checkedCrawlResult={checkedCrawlResult}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
)}
{source && selectedProvider === DataSourceProvider.waterCrawl && (
<Watercrawl
onPreview={onPreview}
checkedCrawlResult={checkedCrawlResult}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
)}
{source && selectedProvider === DataSourceProvider.jinaReader && (
<JinaReader
onPreview={onPreview}
checkedCrawlResult={checkedCrawlResult}
onCheckedCrawlResultChange={onCheckedCrawlResultChange}
onJobIdChange={onJobIdChange}
crawlOptions={crawlOptions}
onCrawlOptionsChange={onCrawlOptionsChange}
/>
)}
{!source && (
<NoData onConfig={handleOnConfig} provider={selectedProvider}/>
)}
</div>
)
}

View File

@@ -31,6 +31,11 @@ const NoData: FC<Props> = ({
title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
},
[DataSourceProvider.waterCrawl]: {
emoji: <span className={s.watercrawlLogo} />,
title: t(`${I18N_PREFIX}.waterCrawlNotConfigured`),
description: t(`${I18N_PREFIX}.waterCrawlNotConfiguredDescription`),
},
}
const currentProvider = providerConfig[provider]

View File

@@ -0,0 +1,43 @@
'use client'
import type { FC } from 'react'
import React from 'react'
import { useTranslation } from 'react-i18next'
import { RiBookOpenLine, RiEqualizer2Line } from '@remixicon/react'
import Button from '@/app/components/base/button'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
type Props = {
onSetting: () => void
}
const Header: FC<Props> = ({
onSetting,
}) => {
const { t } = useTranslation()
return (
<div className='flex h-6 items-center justify-between'>
<div className='flex items-center'>
<div className='text-base font-medium text-text-secondary'>{t(`${I18N_PREFIX}.watercrawlTitle`)}</div>
<div className='ml-2 mr-2 w-px h-3.5 bg-divider-regular' />
<Button className='flex items-center gap-x-[1px] h-6 px-1.5' onClick={onSetting}>
<RiEqualizer2Line className='w-3.5 h-3.5 text-components-button-secondary-text' />
<span className='text-components-button-secondary-text text-xs font-medium px-[3px]'>
{t(`${I18N_PREFIX}.configureWatercrawl`)}
</span>
</Button>
</div>
<a
href='https://docs.watercrawl.dev/'
target='_blank'
rel='noopener noreferrer'
className='inline-flex items-center gap-x-1 text-xs font-medium text-text-accent'
>
<RiBookOpenLine className='w-3.5 h-3.5 text-text-accent' />
<span>{t(`${I18N_PREFIX}.watercrawlDoc`)}</span>
</a>
</div>
)
}
export default React.memo(Header)

View File

@@ -0,0 +1,217 @@
'use client'
import type { FC } from 'react'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import UrlInput from '../base/url-input'
import OptionsWrap from '../base/options-wrap'
import CrawledResult from '../base/crawled-result'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import Header from './header'
import Options from './options'
import { useModalContext } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import Toast from '@/app/components/base/toast'
import { checkWatercrawlTaskStatus, createWatercrawlTask } from '@/service/datasets'
import { sleep } from '@/utils'
const ERROR_I18N_PREFIX = 'common.errorMsg'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
type Props = {
onPreview: (payload: CrawlResultItem) => void
checkedCrawlResult: CrawlResultItem[]
onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
onJobIdChange: (jobId: string) => void
crawlOptions: CrawlOptions
onCrawlOptionsChange: (payload: CrawlOptions) => void
}
enum Step {
init = 'init',
running = 'running',
finished = 'finished',
}
const WaterCrawl: FC<Props> = ({
onPreview,
checkedCrawlResult,
onCheckedCrawlResultChange,
onJobIdChange,
crawlOptions,
onCrawlOptionsChange,
}) => {
const { t } = useTranslation()
const [step, setStep] = useState<Step>(Step.init)
const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
useEffect(() => {
if (step !== Step.init)
setControlFoldOptions(Date.now())
}, [step])
const { setShowAccountSettingModal } = useModalContext()
const handleSetting = useCallback(() => {
setShowAccountSettingModal({
payload: 'data-source',
})
}, [setShowAccountSettingModal])
const checkValid = useCallback((url: string) => {
let errorMsg = ''
if (!url) {
errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
field: 'url',
})
}
if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://'))))
errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`)
if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) {
errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
field: t(`${I18N_PREFIX}.limit`),
})
}
return {
isValid: !errorMsg,
errorMsg,
}
}, [crawlOptions, t])
const isInit = step === Step.init
const isCrawlFinished = step === Step.finished
const isRunning = step === Step.running
const [crawlResult, setCrawlResult] = useState<{
current: number
total: number
data: CrawlResultItem[]
time_consuming: number | string
} | undefined>(undefined)
const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
const showError = isCrawlFinished && crawlErrorMessage
const waitForCrawlFinished = useCallback(async (jobId: string): Promise<any> => {
try {
const res = await checkWatercrawlTaskStatus(jobId) as any
if (res.status === 'completed') {
return {
isError: false,
data: {
...res,
total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
},
}
}
if (res.status === 'error' || !res.status) {
// can't get the error message from the watercrawl api
return {
isError: true,
errorMessage: res.message,
data: {
data: [],
},
}
}
// update the progress
setCrawlResult({
...res,
total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
})
onCheckedCrawlResultChange(res.data || []) // default select the crawl result
await sleep(2500)
return await waitForCrawlFinished(jobId)
}
catch (e: any) {
const errorBody = await e.json()
return {
isError: true,
errorMessage: errorBody.message,
data: {
data: [],
},
}
}
}, [crawlOptions.limit])
const handleRun = useCallback(async (url: string) => {
const { isValid, errorMsg } = checkValid(url)
if (!isValid) {
Toast.notify({
message: errorMsg!,
type: 'error',
})
return
}
setStep(Step.running)
try {
const passToServerCrawlOptions: any = {
...crawlOptions,
}
if (crawlOptions.max_depth === '')
delete passToServerCrawlOptions.max_depth
const res = await createWatercrawlTask({
url,
options: passToServerCrawlOptions,
}) as any
const jobId = res.job_id
onJobIdChange(jobId)
const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
if (isError) {
setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
}
else {
setCrawlResult(data)
onCheckedCrawlResultChange(data.data || []) // default select the crawl result
setCrawlErrorMessage('')
}
}
catch (e) {
setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
console.log(e)
}
finally {
setStep(Step.finished)
}
}, [checkValid, crawlOptions, onJobIdChange, t, waitForCrawlFinished])
return (
<div>
<Header onSetting={handleSetting} />
<div className='mt-2 p-4 pb-0 rounded-xl border border-components-panel-border bg-background-default-subtle'>
<UrlInput onRun={handleRun} isRunning={isRunning} />
<OptionsWrap
className='mt-4'
controlFoldOptions={controlFoldOptions}
>
<Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} />
</OptionsWrap>
{!isInit && (
<div className='mt-3 relative left-[-16px] w-[calc(100%_+_32px)] rounded-b-xl'>
{isRunning
&& <Crawling
className='mt-2'
crawledNum={crawlResult?.current || 0}
totalNum={crawlResult?.total || Number.parseFloat(crawlOptions.limit as string) || 0}
/>}
{showError && (
<ErrorMessage className='rounded-b-xl' title={t(`${I18N_PREFIX}.exceptionErrorTitle`)} errorMsg={crawlErrorMessage} />
)}
{isCrawlFinished && !showError
&& <CrawledResult
className='mb-2'
list={crawlResult?.data || []}
checkedList={checkedCrawlResult}
onSelectedChange={onCheckedCrawlResultChange}
onPreview={onPreview}
usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
/>
}
</div>
)}
</div>
</div>
)
}
export default React.memo(WaterCrawl)

View File

@@ -0,0 +1,83 @@
'use client'
import type { FC } from 'react'
import React, { useCallback } from 'react'
import { useTranslation } from 'react-i18next'
import CheckboxWithLabel from '../base/checkbox-with-label'
import Field from '../base/field'
import cn from '@/utils/classnames'
import type { CrawlOptions } from '@/models/datasets'
const I18N_PREFIX = 'datasetCreation.stepOne.website'
type Props = {
className?: string
payload: CrawlOptions
onChange: (payload: CrawlOptions) => void
}
const Options: FC<Props> = ({
className = '',
payload,
onChange,
}) => {
const { t } = useTranslation()
const handleChange = useCallback((key: keyof CrawlOptions) => {
return (value: any) => {
onChange({
...payload,
[key]: value,
})
}
}, [payload, onChange])
return (
<div className={cn(className, ' space-y-2')}>
<CheckboxWithLabel
label={t(`${I18N_PREFIX}.crawlSubPage`)}
isChecked={payload.crawl_sub_pages}
onChange={handleChange('crawl_sub_pages')}
/>
<div className='flex justify-between space-x-4'>
<Field
className='grow shrink-0'
label={t(`${I18N_PREFIX}.limit`)}
value={payload.limit}
onChange={handleChange('limit')}
isNumber
isRequired
/>
<Field
className='grow shrink-0'
label={t(`${I18N_PREFIX}.maxDepth`)}
value={payload.max_depth}
onChange={handleChange('max_depth')}
isNumber
tooltip={t(`${I18N_PREFIX}.maxDepthTooltip`)!}
/>
</div>
<div className='flex justify-between space-x-4'>
<Field
className='grow shrink-0'
label={t(`${I18N_PREFIX}.excludePaths`)}
value={payload.excludes}
onChange={handleChange('excludes')}
placeholder='blog/*, /about/*'
/>
<Field
className='grow shrink-0'
label={t(`${I18N_PREFIX}.includeOnlyPaths`)}
value={payload.includes}
onChange={handleChange('includes')}
placeholder='articles/*'
/>
</div>
<CheckboxWithLabel
label={t(`${I18N_PREFIX}.extractOnlyMainContent`)}
isChecked={payload.only_main_content}
onChange={handleChange('only_main_content')}
/>
</div>
)
}
export default React.memo(Options)

View File

@@ -0,0 +1,161 @@
'use client'
import type { FC } from 'react'
import React, { useCallback, useState } from 'react'
import { useTranslation } from 'react-i18next'
import {
PortalToFollowElem,
PortalToFollowElemContent,
} from '@/app/components/base/portal-to-follow-elem'
import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security'
import Button from '@/app/components/base/button'
import type { WatercrawlConfig } from '@/models/common'
import Field from '@/app/components/datasets/create/website/base/field'
import Toast from '@/app/components/base/toast'
import { createDataSourceApiKeyBinding } from '@/service/datasets'
import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general'
type Props = {
onCancel: () => void
onSaved: () => void
}
const I18N_PREFIX = 'datasetCreation.watercrawl'
const DEFAULT_BASE_URL = 'https://app.watercrawl.dev'
const ConfigWatercrawlModal: FC<Props> = ({
onCancel,
onSaved,
}) => {
const { t } = useTranslation()
const [isSaving, setIsSaving] = useState(false)
const [config, setConfig] = useState<WatercrawlConfig>({
api_key: '',
base_url: '',
})
const handleConfigChange = useCallback((key: string) => {
return (value: string | number) => {
setConfig(prev => ({ ...prev, [key]: value as string }))
}
}, [])
const handleSave = useCallback(async () => {
if (isSaving)
return
let errorMsg = ''
if (config.base_url && !((config.base_url.startsWith('http://') || config.base_url.startsWith('https://'))))
errorMsg = t('common.errorMsg.urlError')
if (!errorMsg) {
if (!config.api_key) {
errorMsg = t('common.errorMsg.fieldRequired', {
field: 'API Key',
})
}
}
if (errorMsg) {
Toast.notify({
type: 'error',
message: errorMsg,
})
return
}
const postData = {
category: 'website',
provider: 'watercrawl',
credentials: {
auth_type: 'x-api-key',
config: {
api_key: config.api_key,
base_url: config.base_url || DEFAULT_BASE_URL,
},
},
}
try {
setIsSaving(true)
await createDataSourceApiKeyBinding(postData)
Toast.notify({
type: 'success',
message: t('common.api.success'),
})
}
finally {
setIsSaving(false)
}
onSaved()
}, [config.api_key, config.base_url, onSaved, t, isSaving])
return (
<PortalToFollowElem open>
<PortalToFollowElemContent className='w-full h-full z-[60]'>
<div className='fixed inset-0 flex items-center justify-center bg-background-overlay'>
<div className='mx-2 w-[640px] max-h-[calc(100vh-120px)] bg-components-panel-bg shadow-xl rounded-2xl overflow-y-auto'>
<div className='px-8 pt-8'>
<div className='flex justify-between items-center mb-4'>
<div className='system-xl-semibold text-text-primary'>{t(`${I18N_PREFIX}.configWatercrawl`)}</div>
</div>
<div className='space-y-4'>
<Field
label='API Key'
labelClassName='!text-sm'
isRequired
value={config.api_key}
onChange={handleConfigChange('api_key')}
placeholder={t(`${I18N_PREFIX}.apiKeyPlaceholder`)!}
/>
<Field
label='Base URL'
labelClassName='!text-sm'
value={config.base_url}
onChange={handleConfigChange('base_url')}
placeholder={DEFAULT_BASE_URL}
/>
</div>
<div className='my-8 flex justify-between items-center h-8'>
<a className='flex items-center space-x-1 leading-[18px] text-xs font-normal text-text-accent' target='_blank' href='https://app.watercrawl.dev/'>
<span>{t(`${I18N_PREFIX}.getApiKeyLinkText`)}</span>
<LinkExternal02 className='w-3 h-3' />
</a>
<div className='flex'>
<Button
size='large'
className='mr-2'
onClick={onCancel}
>
{t('common.operation.cancel')}
</Button>
<Button
variant='primary'
size='large'
onClick={handleSave}
loading={isSaving}
>
{t('common.operation.save')}
</Button>
</div>
</div>
</div>
<div className='border-t-[0.5px] border-t-divider-regular'>
<div className='flex justify-center items-center py-3 bg-background-section-burn text-xs text-text-tertiary'>
<Lock01 className='mr-1 w-3 h-3 text-text-tertiary' />
{t('common.modelProvider.encrypted.front')}
<a
className='text-text-accent mx-1'
target='_blank' rel='noopener noreferrer'
href='https://pycryptodome.readthedocs.io/en/latest/src/cipher/oaep.html'
>
PKCS1_OAEP
</a>
{t('common.modelProvider.encrypted.back')}
</div>
</div>
</div>
</div>
</PortalToFollowElemContent>
</PortalToFollowElem>
)
}
export default React.memo(ConfigWatercrawlModal)

View File

@@ -5,19 +5,15 @@ import { useTranslation } from 'react-i18next'
import Panel from '../panel'
import { DataSourceType } from '../panel/types'
import ConfigFirecrawlModal from './config-firecrawl-modal'
import ConfigWatercrawlModal from './config-watercrawl-modal'
import ConfigJinaReaderModal from './config-jina-reader-modal'
import cn from '@/utils/classnames'
import s from '@/app/components/datasets/create/website/index.module.css'
import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets'
import type {
DataSourceItem,
} from '@/models/common'
import type { DataSourceItem } from '@/models/common'
import { DataSourceProvider } from '@/models/common'
import { useAppContext } from '@/context/app-context'
import {
DataSourceProvider,
} from '@/models/common'
import Toast from '@/app/components/base/toast'
type Props = {
@@ -58,6 +54,16 @@ const DataSourceWebsite: FC<Props> = ({ provider }) => {
return source?.id
}
const getProviderName = (provider: DataSourceProvider): string => {
if (provider === DataSourceProvider.fireCrawl)
return 'Firecrawl'
if (provider === DataSourceProvider.waterCrawl)
return 'WaterCrawl'
return 'Jina Reader'
}
const handleRemove = useCallback((provider: DataSourceProvider) => {
return async () => {
const dataSourceId = getIdByProvider(provider)
@@ -82,27 +88,42 @@ const DataSourceWebsite: FC<Props> = ({ provider }) => {
readOnly={!isCurrentWorkspaceManager}
configuredList={sources.filter(item => item.provider === provider).map(item => ({
id: item.id,
logo: ({ className }: { className: string }) => (
item.provider === DataSourceProvider.fireCrawl
? (
<div className={cn(className, 'flex items-center justify-center w-5 h-5 !bg-background-default border border-divider-subtle text-xs font-medium text-text-tertiary rounded ml-3')}>🔥</div>
logo: ({ className }: { className: string }) => {
if (item.provider === DataSourceProvider.fireCrawl) {
return (
<div
className={cn(className, 'flex items-center justify-center w-5 h-5 !bg-background-default border border-divider-subtle text-xs font-medium text-text-tertiary rounded ml-3')}>🔥</div>
)
: (
<div className={cn(className, 'flex items-center justify-center w-5 h-5 !bg-background-default border border-divider-subtle text-xs font-medium text-text-tertiary rounded ml-3')}>
<span className={s.jinaLogo} />
}
if (item.provider === DataSourceProvider.waterCrawl) {
return (
<div
className={cn(className, 'flex items-center justify-center w-5 h-5 !bg-background-default border border-divider-subtle text-xs font-medium text-text-tertiary rounded ml-3')}>
<span className={s.watercrawlLogo}/>
</div>
)
),
name: item.provider === DataSourceProvider.fireCrawl ? 'Firecrawl' : 'Jina Reader',
}
return (
<div
className={cn(className, 'flex items-center justify-center w-5 h-5 !bg-background-default border border-divider-subtle text-xs font-medium text-text-tertiary rounded ml-3')}>
<span className={s.jinaLogo}/>
</div>
)
},
name: getProviderName(item.provider),
isActive: true,
}))}
onRemove={handleRemove(provider)}
/>
{configTarget === DataSourceProvider.fireCrawl && (
<ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig} />
<ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig}/>
)}
{configTarget === DataSourceProvider.waterCrawl && (
<ConfigWatercrawlModal onSaved={handleAdded} onCancel={hideConfig}/>
)}
{configTarget === DataSourceProvider.jinaReader && (
<ConfigJinaReaderModal onSaved={handleAdded} onCancel={hideConfig} />
<ConfigJinaReaderModal onSaved={handleAdded} onCancel={hideConfig}/>
)}
</>

View File

@@ -15,6 +15,7 @@ export default function DataSourcePage() {
<DataSourceNotion workspaces={notionWorkspaces} />
<DataSourceWebsite provider={DataSourceProvider.jinaReader} />
<DataSourceWebsite provider={DataSourceProvider.fireCrawl} />
<DataSourceWebsite provider={DataSourceProvider.waterCrawl} />
</div>
)
}

View File

@@ -41,6 +41,12 @@ const Panel: FC<Props> = ({
const isNotion = type === DataSourceType.notion
const isWebsite = type === DataSourceType.website
const getProviderName = (): string => {
if (provider === DataSourceProvider.fireCrawl) return '🔥 Firecrawl'
if (provider === DataSourceProvider.waterCrawl) return 'WaterCrawl'
return 'Jina Reader'
}
return (
<div className='mb-2 bg-background-section-burn rounded-xl'>
<div className='flex items-center px-3 py-[9px]'>
@@ -50,7 +56,7 @@ const Panel: FC<Props> = ({
<div className='text-sm font-medium text-text-primary'>{t(`common.dataSource.${type}.title`)}</div>
{isWebsite && (
<div className='ml-1 leading-[18px] px-1.5 rounded-md bg-components-badge-white-to-dark text-xs font-medium text-text-secondary'>
<span className='text-text-tertiary'>{t('common.dataSource.website.with')}</span> { provider === DataSourceProvider.fireCrawl ? '🔥 Firecrawl' : 'Jina Reader'}
<span className='text-text-tertiary'>{t('common.dataSource.website.with')}</span> {getProviderName()}
</div>
)}
</div>

View File

@@ -15,6 +15,11 @@ const translation = {
apiKeyPlaceholder: 'API key from firecrawl.dev',
getApiKeyLinkText: 'Get your API key from firecrawl.dev',
},
watercrawl: {
configWatercrawl: 'Configure WaterCrawl',
apiKeyPlaceholder: 'API key from watercrawl.dev',
getApiKeyLinkText: 'Get your API key from watercrawl.dev',
},
jinaReader: {
configJinaReader: 'Configure Jina Reader',
apiKeyPlaceholder: 'API key from jina.ai',
@@ -64,15 +69,21 @@ const translation = {
chooseProvider: 'Select a provider',
fireCrawlNotConfigured: 'Firecrawl is not configured',
fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.',
waterCrawlNotConfigured: 'WaterCrawl is not configured',
waterCrawlNotConfiguredDescription: 'Configure WaterCrawl with API key to use it.',
jinaReaderNotConfigured: 'Jina Reader is not configured',
jinaReaderNotConfiguredDescription: 'Set up Jina Reader by entering your free API key for access.',
configure: 'Configure',
configureFirecrawl: 'Configure Firecrawl',
configureWatercrawl: 'Configure WaterCrawl',
configureJinaReader: 'Configure Jina Reader',
run: 'Run',
firecrawlTitle: 'Extract web content with 🔥Firecrawl',
firecrawlDoc: 'Firecrawl docs',
firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
watercrawlTitle: 'Extract web content with WaterCrawl',
watercrawlDoc: 'WaterCrawl docs',
watercrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
jinaReaderTitle: 'Convert the entire site to Markdown',
jinaReaderDoc: 'Learn more about Jina Reader',
jinaReaderDocLink: 'https://jina.ai/reader',

View File

@@ -178,6 +178,7 @@ export enum DataSourceCategory {
export enum DataSourceProvider {
fireCrawl = 'firecrawl',
jinaReader = 'jinareader',
waterCrawl = 'watercrawl',
}
export type FirecrawlConfig = {
@@ -185,6 +186,11 @@ export type FirecrawlConfig = {
base_url: string
}
export type WatercrawlConfig = {
api_key: string
base_url: string
}
export type DataSourceItem = {
id: string
category: DataSourceCategory

View File

@@ -253,6 +253,25 @@ export const checkJinaReaderTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => {
})
}
export const createWatercrawlTask: Fetcher<CommonResponse, Record<string, any>> = (body) => {
return post<CommonResponse>('website/crawl', {
body: {
...body,
provider: DataSourceProvider.waterCrawl,
},
})
}
export const checkWatercrawlTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => {
return get<CommonResponse>(`website/crawl/status/${jobId}`, {
params: {
provider: DataSourceProvider.waterCrawl,
},
}, {
silent: true,
})
}
type FileTypesRes = {
allowed_extensions: string[]
}