feat: Integrate WaterCrawl.dev as a new knowledge base provider
Add WaterCrawl.dev as an alternative website-crawling provider for datasets/knowledge bases, alongside Firecrawl and Jina Reader. This broadens the data source options for knowledge bases, letting users configure WaterCrawl for website content extraction. Resolves #15950
parent: 72191f5b13
commit: 29a4dec387
@@ -14,7 +14,8 @@ class WebsiteCrawlApi(Resource):
     def post(self):
         parser = reqparse.RequestParser()
         parser.add_argument(
-            "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json"
+            "provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"],
+            required=True, nullable=True, location="json"
         )
         parser.add_argument("url", type=str, required=True, nullable=True, location="json")
         parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
@@ -34,7 +35,8 @@ class WebsiteCrawlStatusApi(Resource):
     @account_initialization_required
     def get(self, job_id: str):
         parser = reqparse.RequestParser()
-        parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args")
+        parser.add_argument("provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"],
+                            required=True, location="args")
         args = parser.parse_args()
         # get crawl status
         try:
@@ -25,6 +25,7 @@ from core.rag.extractor.unstructured.unstructured_msg_extractor import Unstructu
 from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
 from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
 from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor
+from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor
 from core.rag.extractor.word_extractor import WordExtractor
 from core.rag.models.document import Document
 from extensions.ext_storage import storage
@@ -40,7 +41,7 @@ USER_AGENT = (
 class ExtractProcessor:
     @classmethod
     def load_from_upload_file(
-        cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False
+        cls, upload_file: UploadFile, return_text: bool = False, is_automatic: bool = False
     ) -> Union[list[Document], str]:
         extract_setting = ExtractSetting(
             datasource_type="upload_file", upload_file=upload_file, document_model="text_model"
@@ -180,6 +181,15 @@ class ExtractProcessor:
                     only_main_content=extract_setting.website_info.only_main_content,
                 )
                 return extractor.extract()
+            elif extract_setting.website_info.provider == "watercrawl":
+                extractor = WaterCrawlWebExtractor(
+                    url=extract_setting.website_info.url,
+                    job_id=extract_setting.website_info.job_id,
+                    tenant_id=extract_setting.website_info.tenant_id,
+                    mode=extract_setting.website_info.mode,
+                    only_main_content=extract_setting.website_info.only_main_content,
+                )
+                return extractor.extract()
             elif extract_setting.website_info.provider == "jinareader":
                 extractor = JinaReaderWebExtractor(
                     url=extract_setting.website_info.url,
api/core/rag/extractor/watercrawl/client.py (new file, 193 lines)

import json
from typing import Union, Generator
from urllib.parse import urljoin

import requests
from requests import Response


class BaseAPIClient:
    def __init__(self, api_key, base_url):
        self.api_key = api_key
        self.base_url = base_url
        self.session = self.init_session()

    def init_session(self):
        session = requests.Session()
        session.headers.update({'X-API-Key': self.api_key})
        session.headers.update({'Content-Type': 'application/json'})
        session.headers.update({'Accept': 'application/json'})
        session.headers.update({'User-Agent': 'WaterCrawl-Plugin'})
        session.headers.update({'Accept-Language': 'en-US'})
        return session

    def _get(self, endpoint: str, query_params: dict = None, **kwargs):
        return self.session.get(
            urljoin(self.base_url, endpoint),
            params=query_params,
            **kwargs
        )

    def _post(self, endpoint: str, query_params: dict = None, data: dict = None, **kwargs):
        return self.session.post(
            urljoin(self.base_url, endpoint),
            params=query_params,
            json=data, **kwargs
        )

    def _put(self, endpoint: str, query_params: dict = None, data: dict = None, **kwargs):
        return self.session.put(
            urljoin(self.base_url, endpoint),
            params=query_params,
            json=data, **kwargs
        )

    def _delete(self, endpoint: str, query_params: dict = None, **kwargs):
        return self.session.delete(
            urljoin(self.base_url, endpoint),
            params=query_params,
            **kwargs
        )

    def _patch(self, endpoint: str, query_params: dict = None, data: dict = None, **kwargs):
        return self.session.patch(
            urljoin(self.base_url, endpoint),
            params=query_params,
            json=data, **kwargs
        )


class WaterCrawlAPIClient(BaseAPIClient):
    def __init__(self, api_key, base_url: str = 'https://app.watercrawl.dev/'):
        super().__init__(api_key, base_url)

    def process_eventstream(self, response: Response, download: bool = False):
        for line in response.iter_lines():
            line = line.decode('utf-8')
            if line.startswith('data:'):
                line = line[5:].strip()
                data = json.loads(line)
                if data['type'] == 'result' and download:
                    data['data'] = self.download_result(data['data'])
                yield data

    def process_response(self, response: Response) -> Union[dict, bytes, list, None, Generator]:
        response.raise_for_status()
        if response.status_code == 204:
            return None
        if response.headers.get('Content-Type') == 'application/json':
            return response.json()

        if response.headers.get('Content-Type') == 'application/octet-stream':
            return response.content

        if response.headers.get('Content-Type') == 'text/event-stream':
            return self.process_eventstream(response)

        raise Exception(f'Unknown response type: {response.headers.get("Content-Type")}')

    def get_crawl_requests_list(self, page: int = None, page_size: int = None):
        query_params = {
            'page': page or 1,
            'page_size': page_size or 10
        }
        return self.process_response(
            self._get(
                '/api/v1/core/crawl-requests/',
                query_params=query_params,
            )
        )

    def get_crawl_request(self, item_id: str):
        return self.process_response(
            self._get(
                f'/api/v1/core/crawl-requests/{item_id}/',
            )
        )

    def create_crawl_request(
        self,
        url: Union[list, str] = None,
        spider_options: dict = None,
        page_options: dict = None,
        plugin_options: dict = None
    ):
        data = {
            # 'urls': url if isinstance(url, list) else [url],
            'url': url,
            'options': {
                'spider_options': spider_options or {},
                'page_options': page_options or {},
                'plugin_options': plugin_options or {},
            }
        }
        return self.process_response(
            self._post(
                '/api/v1/core/crawl-requests/',
                data=data,
            )
        )

    def stop_crawl_request(self, item_id: str):
        return self.process_response(
            self._delete(
                f'/api/v1/core/crawl-requests/{item_id}/',
            )
        )

    def download_crawl_request(self, item_id: str):
        return self.process_response(
            self._get(
                f'/api/v1/core/crawl-requests/{item_id}/download/',
            )
        )

    def monitor_crawl_request(self, item_id: str, prefetched=False) -> Generator:
        query_params = {
            'prefetched': str(prefetched).lower()
        }
        return self.process_response(
            self._get(
                f'/api/v1/core/crawl-requests/{item_id}/status/',
                stream=True,
                query_params=query_params
            ),
        )

    def get_crawl_request_results(self, item_id: str, page: int = 1, page_size: int = 25, query_params: dict = None):
        query_params = query_params or {}
        query_params.update({
            'page': page or 1,
            'page_size': page_size or 25
        })
        return self.process_response(
            self._get(
                f'/api/v1/core/crawl-requests/{item_id}/results/',
                query_params=query_params
            )
        )

    def scrape_url(self,
                   url: str,
                   page_options: dict = None,
                   plugin_options: dict = None,
                   sync: bool = True,
                   prefetched: bool = True
                   ):
        result = self.create_crawl_request(
            url=url,
            page_options=page_options,
            plugin_options=plugin_options
        )
        if not sync:
            return result

        for result in self.monitor_crawl_request(result['uuid'], prefetched):
            if result['type'] == 'result':
                return result['data']

    def download_result(self, result_object: dict):
        response = requests.get(result_object['result'])
        response.raise_for_status()
        result_object['result'] = response.json()
        return result_object
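A minimal usage sketch of the client above (the API key and URLs are placeholders; response shapes follow what process_eventstream and download_result produce):

client = WaterCrawlAPIClient('wc-xxx', base_url='https://app.watercrawl.dev/')

# One-shot scrape: creates a crawl request and blocks on the event stream
page = client.scrape_url('https://example.com', sync=True, prefetched=True)
print(page['result']['markdown'])

# Or drive a crawl asynchronously and consume status events yourself
request = client.create_crawl_request(url='https://example.com', spider_options={'page_limit': 5})
for event in client.monitor_crawl_request(request['uuid'], prefetched=True):
    if event['type'] == 'result':
        print(event['data']['url'])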
api/core/rag/extractor/watercrawl/extractor.py (new file, 57 lines)

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService


class WaterCrawlWebExtractor(BaseExtractor):
    """
    Crawl and scrape websites and return content in clean llm-ready markdown.

    Args:
        url: The URL to scrape.
        api_key: The API key for WaterCrawl.
        base_url: The base URL for the WaterCrawl API. Defaults to 'https://app.watercrawl.dev'.
        mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
        only_main_content: Only return the main content of the page, excluding headers, navs, footers, etc.
    """

    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
        """Initialize with url, job_id, tenant_id and mode."""
        self._url = url
        self.job_id = job_id
        self.tenant_id = tenant_id
        self.mode = mode
        self.only_main_content = only_main_content

    def extract(self) -> list[Document]:
        """Extract content from the URL."""
        documents = []
        if self.mode == "crawl":
            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "watercrawl", self._url, self.tenant_id)
            if crawl_data is None:
                return []
            document = Document(
                page_content=crawl_data.get("markdown", ""),
                metadata={
                    "source_url": crawl_data.get("source_url"),
                    "description": crawl_data.get("description"),
                    "title": crawl_data.get("title"),
                },
            )
            documents.append(document)
        elif self.mode == "scrape":
            scrape_data = WebsiteService.get_scrape_url_data(
                "watercrawl", self._url, self.tenant_id, self.only_main_content
            )

            document = Document(
                page_content=scrape_data.get("markdown", ""),
                metadata={
                    "source_url": scrape_data.get("source_url"),
                    "description": scrape_data.get("description"),
                    "title": scrape_data.get("title"),
                },
            )
            documents.append(document)
        return documents
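A sketch of how the extractor is instantiated in scrape mode (ids are placeholders; credential lookup happens inside WebsiteService):

extractor = WaterCrawlWebExtractor(
    url="https://example.com",
    job_id="",                # unused in scrape mode; crawl mode passes the crawl request uuid
    tenant_id="tenant-uuid",  # placeholder
    mode="scrape",
    only_main_content=True,
)
for doc in extractor.extract():
    print(doc.metadata["title"], len(doc.page_content))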
api/core/rag/extractor/watercrawl/provider.py (new file, 119 lines)

from datetime import datetime, timezone

from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient


class WaterCrawlProvider:
    def __init__(self, api_key, base_url: str = None):
        self.client = WaterCrawlAPIClient(api_key, base_url)

    def crawl_url(self, url, options):
        spider_options = {
            "max_depth": 1,
            "page_limit": 1,
            "allowed_domains": [],
            "exclude_paths": [],
            "include_paths": []
        }
        if options.get("crawl_sub_pages", True):
            spider_options["page_limit"] = options.get("limit", 1)
            spider_options["max_depth"] = options.get("depth", 1)
            spider_options["include_paths"] = options.get("includes", '').split(",") if options.get("includes") else []
            spider_options["exclude_paths"] = options.get("excludes", '').split(",") if options.get("excludes") else []

        wait_time = options.get("wait_time", 1000)
        page_options = {
            "exclude_tags": options.get("exclude_tags", '').split(",") if options.get("exclude_tags") else [],
            "include_tags": options.get("include_tags", '').split(",") if options.get("include_tags") else [],
            "wait_time": wait_time if wait_time > 1000 else 1000,  # minimum wait time is 1 second
            "include_html": False,
            "only_main_content": options.get("only_main_content", True),
            "include_links": False,
            "timeout": 15000,
            "accept_cookies_selector": "#cookies-accept",
            "locale": "en-US",
            "actions": []
        }
        result = self.client.create_crawl_request(
            url=url,
            spider_options=spider_options,
            page_options=page_options
        )

        return {"status": "active", "job_id": result.get("uuid")}

    def get_crawl_status(self, crawl_request_id):
        response = self.client.get_crawl_request(crawl_request_id)
        data = []
        if response['status'] in ['new', 'running']:
            status = 'active'
        else:
            status = 'completed'
            data = list(self._get_results(crawl_request_id))

        time_str = response.get('duration')
        time_consuming = 0
        if time_str:
            time_obj = datetime.strptime(time_str, "%H:%M:%S.%f")
            time_consuming = (
                time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000
            )

        return {
            "status": status,
            "job_id": response.get("uuid"),
            "total": response.get("options", {}).get('spider_options', {}).get("page_limit", 1),
            "current": response.get("number_of_documents", 0),
            "data": data,
            "time_consuming": time_consuming
        }

    def get_crawl_url_data(self, job_id, url):
        if not job_id:
            return self.scrape_url(url)

        for result in self._get_results(job_id, {
            # filter by url
            'url': url
        }):
            return result

        return None

    def scrape_url(self, url):
        response = self.client.scrape_url(
            url=url,
            sync=True,
            prefetched=True
        )
        return self._structure_data(response)

    def _structure_data(self, result_object: dict):
        if isinstance(result_object.get("result", {}), str):
            raise ValueError("Invalid result object. Expected a dictionary.")

        metadata = result_object.get("result", {}).get("metadata", {})
        return {
            "title": metadata.get("og:title") or metadata.get("title"),
            "description": metadata.get("description"),
            "source_url": result_object.get("url"),
            "markdown": result_object.get('result').get("markdown"),
        }

    def _get_results(self, crawl_request_id: str, query_params: dict = None):
        page = 0
        page_size = 100

        query_params = query_params or {}
        query_params.update({
            'prefetched': "true"
        })
        while True:
            page += 1
            response = self.client.get_crawl_request_results(crawl_request_id, page, page_size, query_params)
            if not response['results']:
                break

            for result in response['results']:
                yield self._structure_data(result)

            if response['next'] is None:
                break
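The options dict consumed by crawl_url comes straight from the UI form; a sketch with the keys it reads (values are examples, the key is a placeholder):

provider = WaterCrawlProvider(api_key="wc-xxx", base_url="https://app.watercrawl.dev")
job = provider.crawl_url("https://example.com", {
    "crawl_sub_pages": True,
    "limit": 10,               # becomes spider_options["page_limit"]
    "depth": 2,                # becomes spider_options["max_depth"]
    "includes": "articles/*",
    "excludes": "blog/*,/about/*",
    "only_main_content": True,
})
# -> {"status": "active", "job_id": "<uuid>"}

status = provider.get_crawl_status(job["job_id"])
# status["data"] is only populated once the request leaves the new/running states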
@@ -17,6 +17,10 @@ class ApiKeyAuthFactory:
                 from services.auth.firecrawl.firecrawl import FirecrawlAuth

                 return FirecrawlAuth
+            case AuthType.WATERCRAWL:
+                from services.auth.watercrawl.watercrawl import WatercrawlAuth
+
+                return WatercrawlAuth
             case AuthType.JINA:
                 from services.auth.jina.jina import JinaAuth

@@ -3,4 +3,5 @@ from enum import StrEnum

 class AuthType(StrEnum):
     FIRECRAWL = "firecrawl"
+    WATERCRAWL = "watercrawl"
     JINA = "jinareader"
api/services/auth/watercrawl/__init__.py (new file, empty)

api/services/auth/watercrawl/watercrawl.py (new file, 44 lines)

import json
from urllib.parse import urljoin

import requests

from services.auth.api_key_auth_base import ApiKeyAuthBase


class WatercrawlAuth(ApiKeyAuthBase):
    def __init__(self, credentials: dict):
        super().__init__(credentials)
        auth_type = credentials.get("auth_type")
        if auth_type != "x-api-key":
            raise ValueError("Invalid auth type, WaterCrawl auth type must be x-api-key")
        self.api_key = credentials.get("config", {}).get("api_key", None)
        self.base_url = credentials.get("config", {}).get("base_url", "https://app.watercrawl.dev")

        if not self.api_key:
            raise ValueError("No API key provided")

    def validate_credentials(self):
        headers = self._prepare_headers()
        url = urljoin(self.base_url, "/api/v1/core/crawl-requests/")
        response = self._get_request(url, headers)
        if response.status_code == 200:
            return True
        else:
            self._handle_error(response)

    def _prepare_headers(self):
        return {"Content-Type": "application/json", "X-API-KEY": self.api_key}

    def _get_request(self, url, headers):
        return requests.get(url, headers=headers)

    def _handle_error(self, response):
        if response.status_code in {402, 409, 500}:
            error_message = response.json().get("error", "Unknown error occurred")
            raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
        else:
            if response.text:
                error_message = json.loads(response.text).get("error", "Unknown error occurred")
                raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
            raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}")
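The credentials dict mirrors what the config modal posts to the data source binding API; a validation sketch (the key is a placeholder):

auth = WatercrawlAuth({
    "auth_type": "x-api-key",
    "config": {
        "api_key": "wc-xxx",                       # placeholder
        "base_url": "https://app.watercrawl.dev",  # optional; this is the default
    },
})
auth.validate_credentials()  # True on HTTP 200, raises with the API error otherwise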
@@ -7,6 +7,7 @@ from flask_login import current_user  # type: ignore

 from core.helper import encrypter
 from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
+from core.rag.extractor.watercrawl.provider import WaterCrawlProvider
 from extensions.ext_redis import redis_client
 from extensions.ext_storage import storage
 from services.auth.api_key_auth_service import ApiKeyAuthService
@@ -59,6 +60,16 @@ class WebsiteService:
             time = str(datetime.datetime.now().timestamp())
             redis_client.setex(website_crawl_time_cache_key, 3600, time)
             return {"status": "active", "job_id": job_id}
+        elif provider == "watercrawl":
+            # decrypt api_key
+            api_key = encrypter.decrypt_token(
+                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
+            )
+            return WaterCrawlProvider(
+                api_key,
+                credentials.get("config").get("base_url", None)
+            ).crawl_url(url, options)
+
         elif provider == "jinareader":
             api_key = encrypter.decrypt_token(
                 tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
@@ -116,6 +127,15 @@ class WebsiteService:
             time_consuming = abs(end_time - float(start_time))
             crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
             redis_client.delete(website_crawl_time_cache_key)
+        elif provider == "watercrawl":
+            # decrypt api_key
+            api_key = encrypter.decrypt_token(
+                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
+            )
+            crawl_status_data = WaterCrawlProvider(
+                api_key,
+                credentials.get("config").get("base_url", None)
+            ).get_crawl_status(job_id)
         elif provider == "jinareader":
             api_key = encrypter.decrypt_token(
                 tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
@@ -180,6 +200,12 @@ class WebsiteService:
                 if item.get("source_url") == url:
                     return dict(item)
             return None
+        elif provider == "watercrawl":
+            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
+            return WaterCrawlProvider(
+                api_key,
+                credentials.get("config").get("base_url", None)
+            ).get_crawl_url_data(job_id, url)
         elif provider == "jinareader":
             if not job_id:
                 response = requests.get(
@@ -223,5 +249,11 @@ class WebsiteService:
             params = {"onlyMainContent": only_main_content}
             result = firecrawl_app.scrape_url(url, params)
             return result
+        elif provider == "watercrawl":
+            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
+            return WaterCrawlProvider(
+                api_key,
+                credentials.get("config").get("base_url", None)
+            ).scrape_url(url)
         else:
             raise ValueError("Invalid provider")
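The decrypt-then-delegate pattern above repeats at all four call sites; a possible consolidation (hypothetical helper, not part of this commit):

def _watercrawl_provider(tenant_id: str, credentials: dict) -> WaterCrawlProvider:
    # decrypt the tenant-scoped API key and build a provider in one place
    api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
    return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None))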
web/app/components/datasets/create/assets/watercrawl.svg (new file, 20 lines, 2.6 KiB)

<?xml version="1.0" encoding="utf-8"?>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 500 500">
  <path style="fill: rgb(0, 23, 87); stroke: rgb(13, 14, 52);" d="M 247.794 213.903 L 246.81 76.976 L 254.345 76.963 L 254.592 213.989 L 247.794 213.903 Z"/>
  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="250.025" cy="43.859" rx="33.966" ry="33.906"/>
  <path style="fill: rgb(30, 141, 166); stroke: rgb(30, 141, 166);" d="M 282.472 260.389 L 414.181 330.253 L 410.563 336.234 L 279.38 265.739 L 282.472 260.389 Z"/>
  <path style="fill: rgb(15, 17, 57); stroke: rgb(13, 14, 52);" d="M 255.105 281.394 L 254.485 417.656 L 246.156 417.691 L 246.688 280.51 L 255.105 281.394 Z"/>
  <path style="paint-order: fill; fill: rgb(30, 141, 166); stroke: rgb(30, 141, 166);" d="M 279.486 229.517 L 410.351 160.07 L 413.923 167.04 L 283.727 235.998 L 279.486 229.517 Z"/>
  <path style="fill: rgb(15, 164, 161); stroke: rgb(15, 164, 161);" d="M 88.545 164.884 L 219.797 236.07 L 222.867 229.568 L 90.887 159.47 L 88.545 164.884 Z"/>
  <path style="fill: rgb(15, 164, 161); stroke: rgb(15, 164, 161);" d="M 224.76 266.9 L 95.55 334.829 L 92.878 328.37 L 219.955 261.275 L 224.76 266.9 Z"/>
  <ellipse style="paint-order: fill; fill: rgb(2, 181, 225); stroke: rgb(2, 181, 225);" cx="251.242" cy="247.466" rx="33.966" ry="33.906"/>
  <path style="fill: rgb(13, 14, 52); stroke: rgb(13, 14, 52);" d="M 279.502 433.617 L 408.666 359.443 C 408.666 359.443 412.398 366.965 412.398 366.916 C 412.398 366.867 281.544 440.217 281.544 440.217 L 279.502 433.617 Z"/>
  <path style="fill: rgb(13, 14, 52); stroke: rgb(13, 14, 52);" d="M 223.119 431.408 L 96.643 361.068 L 93.265 368.047 L 218.895 438.099 L 223.119 431.408 Z"/>
  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="250.504" cy="451.168" rx="33.966" ry="33.906"/>
  <path style="fill: rgb(90, 191, 187); stroke: rgb(90, 191, 187);" d="M 435.665 180.895 L 435.859 316.869 L 443.103 315.579 L 442.56 180.697 L 435.665 180.895 Z"/>
  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="441.06" cy="349.665" rx="33.966" ry="33.906"/>
  <ellipse style="fill: rgb(2, 181, 225); stroke: rgb(2, 181, 225);" cx="441.512" cy="147.767" rx="33.966" ry="33.906"/>
  <path style="fill: rgb(84, 187, 181); stroke: rgb(84, 187, 181);" d="M 64.755 314.523 L 57.928 315.006 L 58.307 182.961 L 65.169 182.865 L 64.755 314.523 Z"/>
  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="58.177" cy="149.757" rx="33.966" ry="33.906"/>
  <ellipse style="fill: rgb(61, 224, 203); stroke: rgb(61, 224, 203);" cx="65.909" cy="348.17" rx="33.966" ry="33.906"/>
</svg>
@@ -4,3 +4,10 @@
   background-image: url(../assets/jina.png);
   background-size: 16px;
 }
+
+.watercrawlLogo {
+  @apply w-5 h-5 bg-center bg-no-repeat inline-block;
+  /*background-color: #F5FAFF;*/
+  background-image: url(../assets/watercrawl.svg);
+  background-size: 16px;
+}
@@ -5,6 +5,7 @@ import { useTranslation } from 'react-i18next'
 import s from './index.module.css'
 import NoData from './no-data'
 import Firecrawl from './firecrawl'
+import Watercrawl from './watercrawl'
 import JinaReader from './jina-reader'
 import cn from '@/utils/classnames'
 import { useModalContext } from '@/context/modal-context'
@@ -47,7 +48,11 @@ const Website: FC<Props> = ({

   // If users have configured one of the providers, select it.
   const availableProviders = res.sources.filter((item: DataSourceItem) =>
-    [DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider),
+    [
+      DataSourceProvider.jinaReader,
+      DataSourceProvider.fireCrawl,
+      DataSourceProvider.waterCrawl,
+    ].includes(item.provider),
   )

   if (availableProviders.length > 0)
@@ -70,6 +75,8 @@ const Website: FC<Props> = ({
   if (!isLoaded)
     return null

+  const source = sources.find(source => source.provider === selectedProvider)
+
   return (
     <div>
       <div className="mb-4">
@@ -86,7 +93,7 @@ const Website: FC<Props> = ({
           )}
           onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)}
         >
-          <span className={cn(s.jinaLogo, 'mr-2')} />
+          <span className={cn(s.jinaLogo, 'mr-2')}/>
           <span>Jina Reader</span>
         </button>
         <button
@@ -100,40 +107,52 @@ const Website: FC<Props> = ({
         >
           🔥 Firecrawl
         </button>
+        <button
+          className={`px-4 py-2 text-sm font-medium rounded-md flex items-center justify-center ${
+            selectedProvider === DataSourceProvider.waterCrawl
+              ? 'bg-primary-50 text-primary-600'
+              : 'bg-gray-100 text-gray-600 hover:bg-gray-200'
+          }`}
+          onClick={() => setSelectedProvider(DataSourceProvider.waterCrawl)}
+        >
+          <span className={cn(s.watercrawlLogo, 'mr-2')}/>
+          <span>WaterCrawl</span>
+        </button>
       </div>
     </div>

-      {
-        selectedProvider === DataSourceProvider.fireCrawl
-          ? sources.find(source => source.provider === DataSourceProvider.fireCrawl)
-            ? (
-              <Firecrawl
-                onPreview={onPreview}
-                checkedCrawlResult={checkedCrawlResult}
-                onCheckedCrawlResultChange={onCheckedCrawlResultChange}
-                onJobIdChange={onJobIdChange}
-                crawlOptions={crawlOptions}
-                onCrawlOptionsChange={onCrawlOptionsChange}
-              />
-            )
-            : (
-              <NoData onConfig={handleOnConfig} provider={selectedProvider} />
-            )
-          : sources.find(source => source.provider === DataSourceProvider.jinaReader)
-            ? (
-              <JinaReader
-                onPreview={onPreview}
-                checkedCrawlResult={checkedCrawlResult}
-                onCheckedCrawlResultChange={onCheckedCrawlResultChange}
-                onJobIdChange={onJobIdChange}
-                crawlOptions={crawlOptions}
-                onCrawlOptionsChange={onCrawlOptionsChange}
-              />
-            )
-            : (
-              <NoData onConfig={handleOnConfig} provider={selectedProvider} />
-            )
-      }
+      {source && selectedProvider === DataSourceProvider.fireCrawl && (
+        <Firecrawl
+          onPreview={onPreview}
+          checkedCrawlResult={checkedCrawlResult}
+          onCheckedCrawlResultChange={onCheckedCrawlResultChange}
+          onJobIdChange={onJobIdChange}
+          crawlOptions={crawlOptions}
+          onCrawlOptionsChange={onCrawlOptionsChange}
+        />
+      )}
+      {source && selectedProvider === DataSourceProvider.waterCrawl && (
+        <Watercrawl
+          onPreview={onPreview}
+          checkedCrawlResult={checkedCrawlResult}
+          onCheckedCrawlResultChange={onCheckedCrawlResultChange}
+          onJobIdChange={onJobIdChange}
+          crawlOptions={crawlOptions}
+          onCrawlOptionsChange={onCrawlOptionsChange}
+        />
+      )}
+      {source && selectedProvider === DataSourceProvider.jinaReader && (
+        <JinaReader
+          onPreview={onPreview}
+          checkedCrawlResult={checkedCrawlResult}
+          onCheckedCrawlResultChange={onCheckedCrawlResultChange}
+          onJobIdChange={onJobIdChange}
+          crawlOptions={crawlOptions}
+          onCrawlOptionsChange={onCrawlOptionsChange}
+        />
+      )}
+      {!source && (
+        <NoData onConfig={handleOnConfig} provider={selectedProvider}/>
+      )}
     </div>
   )
 }
@@ -31,6 +31,11 @@ const NoData: FC<Props> = ({
     title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
     description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
   },
+  [DataSourceProvider.waterCrawl]: {
+    emoji: <span className={s.watercrawlLogo} />,
+    title: t(`${I18N_PREFIX}.waterCrawlNotConfigured`),
+    description: t(`${I18N_PREFIX}.waterCrawlNotConfiguredDescription`),
+  },
 }

 const currentProvider = providerConfig[provider]
@@ -0,0 +1,43 @@
'use client'
import type { FC } from 'react'
import React from 'react'
import { useTranslation } from 'react-i18next'
import { RiBookOpenLine, RiEqualizer2Line } from '@remixicon/react'
import Button from '@/app/components/base/button'

const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  onSetting: () => void
}

const Header: FC<Props> = ({
  onSetting,
}) => {
  const { t } = useTranslation()

  return (
    <div className='flex h-6 items-center justify-between'>
      <div className='flex items-center'>
        <div className='text-base font-medium text-text-secondary'>{t(`${I18N_PREFIX}.watercrawlTitle`)}</div>
        <div className='ml-2 mr-2 w-px h-3.5 bg-divider-regular' />
        <Button className='flex items-center gap-x-[1px] h-6 px-1.5' onClick={onSetting}>
          <RiEqualizer2Line className='w-3.5 h-3.5 text-components-button-secondary-text' />
          <span className='text-components-button-secondary-text text-xs font-medium px-[3px]'>
            {t(`${I18N_PREFIX}.configureWatercrawl`)}
          </span>
        </Button>
      </div>
      <a
        href='https://docs.watercrawl.dev/'
        target='_blank'
        rel='noopener noreferrer'
        className='inline-flex items-center gap-x-1 text-xs font-medium text-text-accent'
      >
        <RiBookOpenLine className='w-3.5 h-3.5 text-text-accent' />
        <span>{t(`${I18N_PREFIX}.watercrawlDoc`)}</span>
      </a>
    </div>
  )
}
export default React.memo(Header)
web/app/components/datasets/create/website/watercrawl/index.tsx (new file, 217 lines)

'use client'
import type { FC } from 'react'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import UrlInput from '../base/url-input'
import OptionsWrap from '../base/options-wrap'
import CrawledResult from '../base/crawled-result'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import Header from './header'
import Options from './options'
import { useModalContext } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import Toast from '@/app/components/base/toast'
import { checkWatercrawlTaskStatus, createWatercrawlTask } from '@/service/datasets'
import { sleep } from '@/utils'

const ERROR_I18N_PREFIX = 'common.errorMsg'
const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  onPreview: (payload: CrawlResultItem) => void
  checkedCrawlResult: CrawlResultItem[]
  onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
  onJobIdChange: (jobId: string) => void
  crawlOptions: CrawlOptions
  onCrawlOptionsChange: (payload: CrawlOptions) => void
}

enum Step {
  init = 'init',
  running = 'running',
  finished = 'finished',
}

const WaterCrawl: FC<Props> = ({
  onPreview,
  checkedCrawlResult,
  onCheckedCrawlResultChange,
  onJobIdChange,
  crawlOptions,
  onCrawlOptionsChange,
}) => {
  const { t } = useTranslation()
  const [step, setStep] = useState<Step>(Step.init)
  const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
  useEffect(() => {
    if (step !== Step.init)
      setControlFoldOptions(Date.now())
  }, [step])
  const { setShowAccountSettingModal } = useModalContext()
  const handleSetting = useCallback(() => {
    setShowAccountSettingModal({
      payload: 'data-source',
    })
  }, [setShowAccountSettingModal])

  const checkValid = useCallback((url: string) => {
    let errorMsg = ''
    if (!url) {
      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
        field: 'url',
      })
    }

    if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://'))))
      errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`)

    if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) {
      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
        field: t(`${I18N_PREFIX}.limit`),
      })
    }

    return {
      isValid: !errorMsg,
      errorMsg,
    }
  }, [crawlOptions, t])

  const isInit = step === Step.init
  const isCrawlFinished = step === Step.finished
  const isRunning = step === Step.running
  const [crawlResult, setCrawlResult] = useState<{
    current: number
    total: number
    data: CrawlResultItem[]
    time_consuming: number | string
  } | undefined>(undefined)
  const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
  const showError = isCrawlFinished && crawlErrorMessage

  const waitForCrawlFinished = useCallback(async (jobId: string): Promise<any> => {
    try {
      const res = await checkWatercrawlTaskStatus(jobId) as any
      if (res.status === 'completed') {
        return {
          isError: false,
          data: {
            ...res,
            total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
          },
        }
      }
      if (res.status === 'error' || !res.status) {
        // can't get the error message from the watercrawl api
        return {
          isError: true,
          errorMessage: res.message,
          data: {
            data: [],
          },
        }
      }
      // update the progress
      setCrawlResult({
        ...res,
        total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
      })
      onCheckedCrawlResultChange(res.data || []) // default select the crawl result
      await sleep(2500)
      return await waitForCrawlFinished(jobId)
    }
    catch (e: any) {
      const errorBody = await e.json()
      return {
        isError: true,
        errorMessage: errorBody.message,
        data: {
          data: [],
        },
      }
    }
  }, [crawlOptions.limit])

  const handleRun = useCallback(async (url: string) => {
    const { isValid, errorMsg } = checkValid(url)
    if (!isValid) {
      Toast.notify({
        message: errorMsg!,
        type: 'error',
      })
      return
    }
    setStep(Step.running)
    try {
      const passToServerCrawlOptions: any = {
        ...crawlOptions,
      }
      if (crawlOptions.max_depth === '')
        delete passToServerCrawlOptions.max_depth

      const res = await createWatercrawlTask({
        url,
        options: passToServerCrawlOptions,
      }) as any
      const jobId = res.job_id
      onJobIdChange(jobId)
      const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
      if (isError) {
        setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
      }
      else {
        setCrawlResult(data)
        onCheckedCrawlResultChange(data.data || []) // default select the crawl result
        setCrawlErrorMessage('')
      }
    }
    catch (e) {
      setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
      console.log(e)
    }
    finally {
      setStep(Step.finished)
    }
  }, [checkValid, crawlOptions, onJobIdChange, t, waitForCrawlFinished])

  return (
    <div>
      <Header onSetting={handleSetting} />
      <div className='mt-2 p-4 pb-0 rounded-xl border border-components-panel-border bg-background-default-subtle'>
        <UrlInput onRun={handleRun} isRunning={isRunning} />
        <OptionsWrap
          className='mt-4'
          controlFoldOptions={controlFoldOptions}
        >
          <Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} />
        </OptionsWrap>

        {!isInit && (
          <div className='mt-3 relative left-[-16px] w-[calc(100%_+_32px)] rounded-b-xl'>
            {isRunning
              && <Crawling
                className='mt-2'
                crawledNum={crawlResult?.current || 0}
                totalNum={crawlResult?.total || Number.parseFloat(crawlOptions.limit as string) || 0}
              />}
            {showError && (
              <ErrorMessage className='rounded-b-xl' title={t(`${I18N_PREFIX}.exceptionErrorTitle`)} errorMsg={crawlErrorMessage} />
            )}
            {isCrawlFinished && !showError
              && <CrawledResult
                className='mb-2'
                list={crawlResult?.data || []}
                checkedList={checkedCrawlResult}
                onSelectedChange={onCheckedCrawlResultChange}
                onPreview={onPreview}
                usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
              />
            }
          </div>
        )}
      </div>
    </div>
  )
}
export default React.memo(WaterCrawl)
@@ -0,0 +1,83 @@
'use client'
import type { FC } from 'react'
import React, { useCallback } from 'react'
import { useTranslation } from 'react-i18next'
import CheckboxWithLabel from '../base/checkbox-with-label'
import Field from '../base/field'
import cn from '@/utils/classnames'
import type { CrawlOptions } from '@/models/datasets'

const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  className?: string
  payload: CrawlOptions
  onChange: (payload: CrawlOptions) => void
}

const Options: FC<Props> = ({
  className = '',
  payload,
  onChange,
}) => {
  const { t } = useTranslation()

  const handleChange = useCallback((key: keyof CrawlOptions) => {
    return (value: any) => {
      onChange({
        ...payload,
        [key]: value,
      })
    }
  }, [payload, onChange])
  return (
    <div className={cn(className, ' space-y-2')}>
      <CheckboxWithLabel
        label={t(`${I18N_PREFIX}.crawlSubPage`)}
        isChecked={payload.crawl_sub_pages}
        onChange={handleChange('crawl_sub_pages')}
      />
      <div className='flex justify-between space-x-4'>
        <Field
          className='grow shrink-0'
          label={t(`${I18N_PREFIX}.limit`)}
          value={payload.limit}
          onChange={handleChange('limit')}
          isNumber
          isRequired
        />
        <Field
          className='grow shrink-0'
          label={t(`${I18N_PREFIX}.maxDepth`)}
          value={payload.max_depth}
          onChange={handleChange('max_depth')}
          isNumber
          tooltip={t(`${I18N_PREFIX}.maxDepthTooltip`)!}
        />
      </div>

      <div className='flex justify-between space-x-4'>
        <Field
          className='grow shrink-0'
          label={t(`${I18N_PREFIX}.excludePaths`)}
          value={payload.excludes}
          onChange={handleChange('excludes')}
          placeholder='blog/*, /about/*'
        />
        <Field
          className='grow shrink-0'
          label={t(`${I18N_PREFIX}.includeOnlyPaths`)}
          value={payload.includes}
          onChange={handleChange('includes')}
          placeholder='articles/*'
        />
      </div>
      <CheckboxWithLabel
        label={t(`${I18N_PREFIX}.extractOnlyMainContent`)}
        isChecked={payload.only_main_content}
        onChange={handleChange('only_main_content')}
      />
    </div>
  )
}
export default React.memo(Options)
@@ -0,0 +1,161 @@
'use client'
import type { FC } from 'react'
import React, { useCallback, useState } from 'react'
import { useTranslation } from 'react-i18next'
import {
  PortalToFollowElem,
  PortalToFollowElemContent,
} from '@/app/components/base/portal-to-follow-elem'
import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security'
import Button from '@/app/components/base/button'
import type { WatercrawlConfig } from '@/models/common'
import Field from '@/app/components/datasets/create/website/base/field'
import Toast from '@/app/components/base/toast'
import { createDataSourceApiKeyBinding } from '@/service/datasets'
import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general'
type Props = {
  onCancel: () => void
  onSaved: () => void
}

const I18N_PREFIX = 'datasetCreation.watercrawl'

const DEFAULT_BASE_URL = 'https://app.watercrawl.dev'

const ConfigWatercrawlModal: FC<Props> = ({
  onCancel,
  onSaved,
}) => {
  const { t } = useTranslation()
  const [isSaving, setIsSaving] = useState(false)
  const [config, setConfig] = useState<WatercrawlConfig>({
    api_key: '',
    base_url: '',
  })

  const handleConfigChange = useCallback((key: string) => {
    return (value: string | number) => {
      setConfig(prev => ({ ...prev, [key]: value as string }))
    }
  }, [])

  const handleSave = useCallback(async () => {
    if (isSaving)
      return
    let errorMsg = ''
    if (config.base_url && !((config.base_url.startsWith('http://') || config.base_url.startsWith('https://'))))
      errorMsg = t('common.errorMsg.urlError')
    if (!errorMsg) {
      if (!config.api_key) {
        errorMsg = t('common.errorMsg.fieldRequired', {
          field: 'API Key',
        })
      }
    }

    if (errorMsg) {
      Toast.notify({
        type: 'error',
        message: errorMsg,
      })
      return
    }
    const postData = {
      category: 'website',
      provider: 'watercrawl',
      credentials: {
        auth_type: 'x-api-key',
        config: {
          api_key: config.api_key,
          base_url: config.base_url || DEFAULT_BASE_URL,
        },
      },
    }
    try {
      setIsSaving(true)
      await createDataSourceApiKeyBinding(postData)
      Toast.notify({
        type: 'success',
        message: t('common.api.success'),
      })
    }
    finally {
      setIsSaving(false)
    }

    onSaved()
  }, [config.api_key, config.base_url, onSaved, t, isSaving])

  return (
    <PortalToFollowElem open>
      <PortalToFollowElemContent className='w-full h-full z-[60]'>
        <div className='fixed inset-0 flex items-center justify-center bg-background-overlay'>
          <div className='mx-2 w-[640px] max-h-[calc(100vh-120px)] bg-components-panel-bg shadow-xl rounded-2xl overflow-y-auto'>
            <div className='px-8 pt-8'>
              <div className='flex justify-between items-center mb-4'>
                <div className='system-xl-semibold text-text-primary'>{t(`${I18N_PREFIX}.configWatercrawl`)}</div>
              </div>

              <div className='space-y-4'>
                <Field
                  label='API Key'
                  labelClassName='!text-sm'
                  isRequired
                  value={config.api_key}
                  onChange={handleConfigChange('api_key')}
                  placeholder={t(`${I18N_PREFIX}.apiKeyPlaceholder`)!}
                />
                <Field
                  label='Base URL'
                  labelClassName='!text-sm'
                  value={config.base_url}
                  onChange={handleConfigChange('base_url')}
                  placeholder={DEFAULT_BASE_URL}
                />
              </div>
              <div className='my-8 flex justify-between items-center h-8'>
                <a className='flex items-center space-x-1 leading-[18px] text-xs font-normal text-text-accent' target='_blank' href='https://app.watercrawl.dev/'>
                  <span>{t(`${I18N_PREFIX}.getApiKeyLinkText`)}</span>
                  <LinkExternal02 className='w-3 h-3' />
                </a>
                <div className='flex'>
                  <Button
                    size='large'
                    className='mr-2'
                    onClick={onCancel}
                  >
                    {t('common.operation.cancel')}
                  </Button>
                  <Button
                    variant='primary'
                    size='large'
                    onClick={handleSave}
                    loading={isSaving}
                  >
                    {t('common.operation.save')}
                  </Button>
                </div>

              </div>
            </div>
            <div className='border-t-[0.5px] border-t-divider-regular'>
              <div className='flex justify-center items-center py-3 bg-background-section-burn text-xs text-text-tertiary'>
                <Lock01 className='mr-1 w-3 h-3 text-text-tertiary' />
                {t('common.modelProvider.encrypted.front')}
                <a
                  className='text-text-accent mx-1'
                  target='_blank' rel='noopener noreferrer'
                  href='https://pycryptodome.readthedocs.io/en/latest/src/cipher/oaep.html'
                >
                  PKCS1_OAEP
                </a>
                {t('common.modelProvider.encrypted.back')}
              </div>
            </div>
          </div>
        </div>
      </PortalToFollowElemContent>
    </PortalToFollowElem>
  )
}
export default React.memo(ConfigWatercrawlModal)
@@ -5,19 +5,15 @@ import { useTranslation } from 'react-i18next'
 import Panel from '../panel'
 import { DataSourceType } from '../panel/types'
 import ConfigFirecrawlModal from './config-firecrawl-modal'
+import ConfigWatercrawlModal from './config-watercrawl-modal'
 import ConfigJinaReaderModal from './config-jina-reader-modal'
 import cn from '@/utils/classnames'
 import s from '@/app/components/datasets/create/website/index.module.css'
 import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets'
-
-import type {
-  DataSourceItem,
-} from '@/models/common'
+import type { DataSourceItem } from '@/models/common'
+import { DataSourceProvider } from '@/models/common'
 import { useAppContext } from '@/context/app-context'
-
-import {
-  DataSourceProvider,
-} from '@/models/common'
 import Toast from '@/app/components/base/toast'

 type Props = {
@@ -58,6 +54,16 @@ const DataSourceWebsite: FC<Props> = ({ provider }) => {
     return source?.id
   }

+  const getProviderName = (provider: DataSourceProvider): string => {
+    if (provider === DataSourceProvider.fireCrawl)
+      return 'Firecrawl'
+
+    if (provider === DataSourceProvider.waterCrawl)
+      return 'WaterCrawl'
+
+    return 'Jina Reader'
+  }
+
   const handleRemove = useCallback((provider: DataSourceProvider) => {
     return async () => {
       const dataSourceId = getIdByProvider(provider)
@@ -82,27 +88,42 @@ const DataSourceWebsite: FC<Props> = ({ provider }) => {
       readOnly={!isCurrentWorkspaceManager}
       configuredList={sources.filter(item => item.provider === provider).map(item => ({
         id: item.id,
-        logo: ({ className }: { className: string }) => (
-          item.provider === DataSourceProvider.fireCrawl
-            ? (
-              <div className={cn(className, 'flex items-center justify-center w-5 h-5 !bg-background-default border border-divider-subtle text-xs font-medium text-text-tertiary rounded ml-3')}>🔥</div>
-            )
-            : (
-              <div className={cn(className, 'flex items-center justify-center w-5 h-5 !bg-background-default border border-divider-subtle text-xs font-medium text-text-tertiary rounded ml-3')}>
-                <span className={s.jinaLogo} />
-              </div>
-            )
-        ),
-        name: item.provider === DataSourceProvider.fireCrawl ? 'Firecrawl' : 'Jina Reader',
+        logo: ({ className }: { className: string }) => {
+          if (item.provider === DataSourceProvider.fireCrawl) {
+            return (
+              <div
+                className={cn(className, 'flex items-center justify-center w-5 h-5 !bg-background-default border border-divider-subtle text-xs font-medium text-text-tertiary rounded ml-3')}>🔥</div>
+            )
+          }
+
+          if (item.provider === DataSourceProvider.waterCrawl) {
+            return (
+              <div
+                className={cn(className, 'flex items-center justify-center w-5 h-5 !bg-background-default border border-divider-subtle text-xs font-medium text-text-tertiary rounded ml-3')}>
+                <span className={s.watercrawlLogo}/>
+              </div>
+            )
+          }
+          return (
+            <div
+              className={cn(className, 'flex items-center justify-center w-5 h-5 !bg-background-default border border-divider-subtle text-xs font-medium text-text-tertiary rounded ml-3')}>
+              <span className={s.jinaLogo}/>
+            </div>
+          )
+        },
+        name: getProviderName(item.provider),
         isActive: true,
       }))}
       onRemove={handleRemove(provider)}
     />
     {configTarget === DataSourceProvider.fireCrawl && (
-      <ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig} />
+      <ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig}/>
     )}
+    {configTarget === DataSourceProvider.waterCrawl && (
+      <ConfigWatercrawlModal onSaved={handleAdded} onCancel={hideConfig}/>
+    )}
     {configTarget === DataSourceProvider.jinaReader && (
-      <ConfigJinaReaderModal onSaved={handleAdded} onCancel={hideConfig} />
+      <ConfigJinaReaderModal onSaved={handleAdded} onCancel={hideConfig}/>
     )}
   </>
@@ -15,6 +15,7 @@ export default function DataSourcePage() {
       <DataSourceNotion workspaces={notionWorkspaces} />
       <DataSourceWebsite provider={DataSourceProvider.jinaReader} />
       <DataSourceWebsite provider={DataSourceProvider.fireCrawl} />
+      <DataSourceWebsite provider={DataSourceProvider.waterCrawl} />
     </div>
   )
 }
@@ -41,6 +41,12 @@ const Panel: FC<Props> = ({
   const isNotion = type === DataSourceType.notion
   const isWebsite = type === DataSourceType.website

+  const getProviderName = (): string => {
+    if (provider === DataSourceProvider.fireCrawl) return '🔥 Firecrawl'
+    if (provider === DataSourceProvider.waterCrawl) return 'WaterCrawl'
+    return 'Jina Reader'
+  }
+
   return (
     <div className='mb-2 bg-background-section-burn rounded-xl'>
       <div className='flex items-center px-3 py-[9px]'>
@@ -50,7 +56,7 @@ const Panel: FC<Props> = ({
         <div className='text-sm font-medium text-text-primary'>{t(`common.dataSource.${type}.title`)}</div>
         {isWebsite && (
           <div className='ml-1 leading-[18px] px-1.5 rounded-md bg-components-badge-white-to-dark text-xs font-medium text-text-secondary'>
-            <span className='text-text-tertiary'>{t('common.dataSource.website.with')}</span> { provider === DataSourceProvider.fireCrawl ? '🔥 Firecrawl' : 'Jina Reader'}
+            <span className='text-text-tertiary'>{t('common.dataSource.website.with')}</span> {getProviderName()}
           </div>
         )}
       </div>
@@ -15,6 +15,11 @@ const translation = {
     apiKeyPlaceholder: 'API key from firecrawl.dev',
     getApiKeyLinkText: 'Get your API key from firecrawl.dev',
   },
+  watercrawl: {
+    configWatercrawl: 'Configure Watercrawl',
+    apiKeyPlaceholder: 'API key from watercrawl.dev',
+    getApiKeyLinkText: 'Get your API key from watercrawl.dev',
+  },
   jinaReader: {
     configJinaReader: 'Configure Jina Reader',
     apiKeyPlaceholder: 'API key from jina.ai',
@@ -64,15 +69,21 @@ const translation = {
     chooseProvider: 'Select a provider',
     fireCrawlNotConfigured: 'Firecrawl is not configured',
     fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.',
+    watercrawlNotConfigured: 'Watercrawl is not configured',
+    watercrawlNotConfiguredDescription: 'Configure Watercrawl with API key to use it.',
     jinaReaderNotConfigured: 'Jina Reader is not configured',
     jinaReaderNotConfiguredDescription: 'Set up Jina Reader by entering your free API key for access.',
     configure: 'Configure',
     configureFirecrawl: 'Configure Firecrawl',
+    configureWatercrawl: 'Configure Watercrawl',
     configureJinaReader: 'Configure Jina Reader',
     run: 'Run',
     firecrawlTitle: 'Extract web content with 🔥Firecrawl',
     firecrawlDoc: 'Firecrawl docs',
     firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
+    watercrawlTitle: 'Extract web content with Watercrawl',
+    watercrawlDoc: 'Watercrawl docs',
+    watercrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
     jinaReaderTitle: 'Convert the entire site to Markdown',
     jinaReaderDoc: 'Learn more about Jina Reader',
     jinaReaderDocLink: 'https://jina.ai/reader',
@@ -178,6 +178,7 @@ export enum DataSourceCategory {
 export enum DataSourceProvider {
   fireCrawl = 'firecrawl',
   jinaReader = 'jinareader',
+  waterCrawl = 'watercrawl',
 }

 export type FirecrawlConfig = {
@@ -185,6 +186,11 @@ export type FirecrawlConfig = {
   base_url: string
 }

+export type WatercrawlConfig = {
+  api_key: string
+  base_url: string
+}
+
 export type DataSourceItem = {
   id: string
   category: DataSourceCategory
@@ -253,6 +253,25 @@ export const checkJinaReaderTaskStatus: Fetcher<CommonResponse, string> = (jobId
   })
 }

+export const createWatercrawlTask: Fetcher<CommonResponse, Record<string, any>> = (body) => {
+  return post<CommonResponse>('website/crawl', {
+    body: {
+      ...body,
+      provider: DataSourceProvider.waterCrawl,
+    },
+  })
+}
+
+export const checkWatercrawlTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => {
+  return get<CommonResponse>(`website/crawl/status/${jobId}`, {
+    params: {
+      provider: DataSourceProvider.waterCrawl,
+    },
+  }, {
+    silent: true,
+  })
+}
+
 type FileTypesRes = {
   allowed_extensions: string[]
 }
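For reference, the two fetchers map onto the console endpoints touched earlier in this commit; a rough Python equivalent of the create-then-poll flow (the base URL and auth handling are assumptions and omitted here):

import time
import requests

CONSOLE_API = "http://localhost:5001/console/api"  # assumed local console API base
session = requests.Session()                       # auth headers omitted

# createWatercrawlTask equivalent
res = session.post(f"{CONSOLE_API}/website/crawl", json={
    "provider": "watercrawl",
    "url": "https://example.com",
    "options": {"crawl_sub_pages": True, "limit": 10, "only_main_content": True},
}).json()
job_id = res["job_id"]

# checkWatercrawlTaskStatus equivalent: poll until the provider reports completion
while True:
    status = session.get(f"{CONSOLE_API}/website/crawl/status/{job_id}",
                         params={"provider": "watercrawl"}).json()
    if status["status"] == "completed":
        break
    time.sleep(2.5)  # the React component polls on the same interval
print(len(status["data"]), "pages crawled")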