
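"""Website crawl/scrape service wrapping the Firecrawl and Jina Reader providers."""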
import datetime
import json
from typing import Any

import requests
from flask_login import current_user  # type: ignore

from core.helper import encrypter
from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from services.auth.api_key_auth_service import ApiKeyAuthService


class WebsiteService:
    @classmethod
    def document_create_args_validate(cls, args: dict):
        if "url" not in args or not args["url"]:
            raise ValueError("url is required")
        if "options" not in args or not args["options"]:
            raise ValueError("options is required")
        if "limit" not in args["options"] or not args["options"]["limit"]:
            raise ValueError("limit is required")
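
    # Illustrative `args` payload for crawl_url below. Every key is read by the
    # method; the concrete values are hypothetical:
    #
    #   {
    #       "provider": "firecrawl",
    #       "url": "https://example.com",
    #       "options": {
    #           "crawl_sub_pages": True,
    #           "only_main_content": True,
    #           "includes": "blog/*,docs/*",
    #           "excludes": "",
    #           "limit": 10,
    #           "max_depth": 2,
    #       },
    #   }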

    @classmethod
    def crawl_url(cls, args: dict) -> dict:
        provider = args.get("provider", "")
        url = args.get("url")
        # default to an empty dict so the .get() lookups below are safe when
        # no options are supplied
        options = args.get("options", {})
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            only_main_content = options.get("only_main_content", False)
            if not crawl_sub_pages:
                # single-page crawl: restrict the job to the start URL
                params = {
                    "includePaths": [],
                    "excludePaths": [],
                    "limit": 1,
                    "scrapeOptions": {"onlyMainContent": only_main_content},
                }
            else:
                # includes/excludes arrive as comma-separated path patterns
                includes = options.get("includes").split(",") if options.get("includes") else []
                excludes = options.get("excludes").split(",") if options.get("excludes") else []
                params = {
                    "includePaths": includes,
                    "excludePaths": excludes,
                    "limit": options.get("limit", 1),
                    "scrapeOptions": {"onlyMainContent": only_main_content},
                }
                if options.get("max_depth"):
                    params["maxDepth"] = options.get("max_depth")
            job_id = firecrawl_app.crawl_url(url, params)
            # cache the start time so get_crawl_status can report elapsed time
            website_crawl_time_cache_key = f"website_crawl_{job_id}"
            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {"status": "active", "job_id": job_id}
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            if not crawl_sub_pages:
                # single page: Jina Reader returns the content synchronously
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "data": response.json().get("data")}
            else:
                # multi-page: start an asynchronous adaptive-crawl task
                response = requests.post(
                    "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
                    json={
                        "url": url,
                        "maxPages": options.get("limit", 1),
                        "useSitemap": options.get("use_sitemap", True),
                    },
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {api_key}",
                    },
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
        else:
            raise ValueError("Invalid provider")
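
    # Typical flow (illustrative): the job_id returned above is polled through
    # get_crawl_status until the job reports "completed", e.g.:
    #
    #   job = WebsiteService.crawl_url({"provider": "firecrawl", "url": url, "options": opts})
    #   status = WebsiteService.get_crawl_status(job["job_id"], provider="firecrawl")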

    @classmethod
    def get_crawl_status(cls, job_id: str, provider: str) -> dict:
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            result = firecrawl_app.check_crawl_status(job_id)
            crawl_status_data = {
                "status": result.get("status", "active"),
                "job_id": job_id,
                "total": result.get("total", 0),
                "current": result.get("current", 0),
                "data": result.get("data", []),
            }
            if crawl_status_data["status"] == "completed":
                # compute elapsed time from the start timestamp cached by crawl_url
                website_crawl_time_cache_key = f"website_crawl_{job_id}"
                start_time = redis_client.get(website_crawl_time_cache_key)
                if start_time:
                    end_time = datetime.datetime.now().timestamp()
                    time_consuming = abs(end_time - float(start_time))
                    crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
                    redis_client.delete(website_crawl_time_cache_key)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            response = requests.post(
                "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                json={"taskId": job_id},
            )
            data = response.json().get("data", {})
            crawl_status_data = {
                "status": data.get("status", "active"),
                "job_id": job_id,
                "total": len(data.get("urls", [])),
                "current": len(data.get("processed", [])) + len(data.get("failed", [])),
                "data": [],
                "time_consuming": data.get("duration", 0) / 1000,  # duration is reported in milliseconds
            }

            if crawl_status_data["status"] == "completed":
                # fetch the content of the processed URLs and normalize it to the
                # same shape the Firecrawl branch returns
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                formatted_data = [
                    {
                        "title": item.get("data", {}).get("title"),
                        "source_url": item.get("data", {}).get("url"),
                        "description": item.get("data", {}).get("description"),
                        "markdown": item.get("data", {}).get("content"),
                    }
                    for item in data.get("processed", {}).values()
                ]
                crawl_status_data["data"] = formatted_data
        else:
            raise ValueError("Invalid provider")
        return crawl_status_data
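
    # Both providers normalize to the same status payload (values illustrative):
    #
    #   {
    #       "status": "completed",
    #       "job_id": "abc123",
    #       "total": 10,
    #       "current": 10,
    #       "data": [{"title": ..., "source_url": ..., "description": ..., "markdown": ...}],
    #       "time_consuming": 12.34,
    #   }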

    @classmethod
    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict[Any, Any] | None:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        # decrypt api_key
        api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
        # FIXME data is redefined too many times here, use Any to ease the type checking, fix it later
        data: Any = None
        if provider == "firecrawl":
            # prefer the crawl result persisted to storage, if any
            file_key = "website_files/" + job_id + ".txt"
            if storage.exists(file_key):
                d = storage.load_once(file_key)
                if d:
                    data = json.loads(d.decode("utf-8"))
            else:
                firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
                result = firecrawl_app.check_crawl_status(job_id)
                if result.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                data = result.get("data")
            if data:
                for item in data:
                    if item.get("source_url") == url:
                        return dict(item)
            return None
        elif provider == "jinareader":
            if not job_id:
                # no job: scrape the single URL directly
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return dict(response.json().get("data", {}))
            else:
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id},
                )
                data = response.json().get("data", {})
                if data.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")

                # fetch the processed pages and return the one matching the URL
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                for item in data.get("processed", {}).values():
                    if item.get("data", {}).get("url") == url:
                        return dict(item.get("data", {}))
                return None
        else:
            raise ValueError("Invalid provider")
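
    # Example (illustrative): retrieve one page's content from a finished crawl
    #
    #   page = WebsiteService.get_crawl_url_data(
    #       job_id="abc123",                  # hypothetical job id
    #       provider="firecrawl",
    #       url="https://example.com/docs",
    #       tenant_id=current_user.current_tenant_id,
    #   )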

    @classmethod
    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            params = {"onlyMainContent": only_main_content}
            result = firecrawl_app.scrape_url(url, params)
            return result
        else:
            raise ValueError("Invalid provider")
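
    # Example (illustrative): one-off scrape of a single page via Firecrawl
    #
    #   result = WebsiteService.get_scrape_url_data(
    #       provider="firecrawl",
    #       url="https://example.com",
    #       tenant_id=current_user.current_tenant_id,
    #       only_main_content=True,
    #   )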