dify/api/controllers/console/datasets/website.py
Amir Mohsen 29a4dec387 feat: Integrate WaterCrawl.dev as a new knowledge base provider
Add WaterCrawl.dev as an alternative provider for website crawling in datasets/knowledge base alongside Firecrawl and Jina Reader. This integration enhances the data source options for knowledge bases, allowing users to configure and use WaterCrawl for their website content extraction needs. Resolved #15950
2025-03-21 00:18:18 +01:00

51 lines
1.9 KiB
Python

from flask_restful import Resource, reqparse # type: ignore
from controllers.console import api
from controllers.console.datasets.error import WebsiteCrawlError
from controllers.console.wraps import account_initialization_required, setup_required
from libs.login import login_required
from services.website_service import WebsiteService
class WebsiteCrawlApi(Resource):
@setup_required
@login_required
@account_initialization_required
def post(self):
parser = reqparse.RequestParser()
parser.add_argument(
"provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"],
required=True, nullable=True, location="json"
)
parser.add_argument("url", type=str, required=True, nullable=True, location="json")
parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
args = parser.parse_args()
WebsiteService.document_create_args_validate(args)
# crawl url
try:
result = WebsiteService.crawl_url(args)
except Exception as e:
raise WebsiteCrawlError(str(e))
return result, 200
class WebsiteCrawlStatusApi(Resource):
@setup_required
@login_required
@account_initialization_required
def get(self, job_id: str):
parser = reqparse.RequestParser()
parser.add_argument("provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"],
required=True, location="args")
args = parser.parse_args()
# get crawl status
try:
result = WebsiteService.get_crawl_status(job_id, args["provider"])
except Exception as e:
raise WebsiteCrawlError(str(e))
return result, 200
api.add_resource(WebsiteCrawlApi, "/website/crawl")
api.add_resource(WebsiteCrawlStatusApi, "/website/crawl/status/<string:job_id>")