Fixes #890

This PR introduces new streaming, superuser-only API endpoints to export and import database information for an organization. New administrator deployment documentation covering how to manage the process and copy files between S3 buckets as needed is also included.

---------

Co-authored-by: Henry Wilkinson <henry@wilkinson.graphics>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
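For reviewers, here is a rough sketch of how an administrator might drive the new endpoints from Python. The endpoint paths, base URL, and token below are illustrative assumptions, not the actual routes added by this PR; the point is only the streaming pattern, which avoids buffering a large database export in memory on either side.

```python
# Illustrative sketch only: the endpoint paths ("/orgs/{oid}/export" and
# "/orgs/import"), the base URL, and the superuser token are placeholder
# assumptions, not the actual routes added in this PR.
import httpx

API_BASE = "https://btrix.example.com/api"  # placeholder deployment URL
HEADERS = {"Authorization": "Bearer <superuser-token>"}  # placeholder auth


def export_org_db(oid: str, out_path: str) -> None:
    """Stream an org database export to disk without buffering it in memory."""
    with httpx.stream(
        "GET", f"{API_BASE}/orgs/{oid}/export", headers=HEADERS, timeout=None
    ) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as out:
            for chunk in resp.iter_bytes():
                out.write(chunk)


def import_org_db(in_path: str) -> None:
    """Stream a previously exported database file back into a deployment."""
    with open(in_path, "rb") as src:

        def chunks():
            # yield the file in 1 MiB pieces so the upload is also streamed
            while True:
                buf = src.read(1024 * 1024)
                if not buf:
                    return
                yield buf

        resp = httpx.post(
            f"{API_BASE}/orgs/import", headers=HEADERS, content=chunks(), timeout=None
        )
        resp.raise_for_status()
```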
""" Crawl API """
|
|
|
|
# pylint: disable=too-many-lines
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import contextlib
|
|
import urllib.parse
|
|
from datetime import datetime
|
|
from uuid import UUID
|
|
|
|
from typing import Optional, List, Dict, Union, Any, Sequence
|
|
|
|
from fastapi import Depends, HTTPException
|
|
from fastapi.responses import StreamingResponse
|
|
from redis import asyncio as exceptions
|
|
import pymongo
|
|
|
|
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
|
|
from .utils import dt_now, parse_jsonl_error_messages, stream_dict_list_as_csv
|
|
from .basecrawls import BaseCrawlOps
|
|
from .crawlmanager import CrawlManager
|
|
from .models import (
|
|
UpdateCrawl,
|
|
DeleteCrawlList,
|
|
CrawlConfig,
|
|
UpdateCrawlConfig,
|
|
CrawlScale,
|
|
CrawlStats,
|
|
CrawlFile,
|
|
Crawl,
|
|
CrawlOut,
|
|
CrawlOutWithResources,
|
|
QARun,
|
|
QARunOut,
|
|
QARunWithResources,
|
|
QARunAggregateStatsOut,
|
|
DeleteQARunList,
|
|
Organization,
|
|
User,
|
|
PaginatedResponse,
|
|
RUNNING_AND_STARTING_STATES,
|
|
SUCCESSFUL_STATES,
|
|
NON_RUNNING_STATES,
|
|
ALL_CRAWL_STATES,
|
|
TYPE_ALL_CRAWL_STATES,
|
|
)
|
|
|
|
|
|
MAX_MATCH_SIZE = 500000
|
|
DEFAULT_RANGE_LIMIT = 50
|
|
|
|
|
|
# ============================================================================
|
|
# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods
|
|
class CrawlOps(BaseCrawlOps):
|
|
"""Crawl Ops"""
|
|
|
|
crawl_manager: CrawlManager
|
|
|
|
def __init__(self, crawl_manager: CrawlManager, *args):
|
|
super().__init__(*args)
|
|
self.crawl_manager = crawl_manager
|
|
self.crawl_configs.set_crawl_ops(self)
|
|
self.colls.set_crawl_ops(self)
|
|
self.event_webhook_ops.set_crawl_ops(self)
|
|
|
|
self.min_qa_crawler_image = os.environ.get("MIN_QA_CRAWLER_IMAGE")
|
|
|
|
async def init_index(self):
|
|
"""init index for crawls db collection"""
|
|
await self.crawls.create_index([("type", pymongo.HASHED)])
|
|
|
|
await self.crawls.create_index(
|
|
[("type", pymongo.HASHED), ("finished", pymongo.DESCENDING)]
|
|
)
|
|
await self.crawls.create_index(
|
|
[("type", pymongo.HASHED), ("oid", pymongo.DESCENDING)]
|
|
)
|
|
await self.crawls.create_index(
|
|
[("type", pymongo.HASHED), ("cid", pymongo.DESCENDING)]
|
|
)
|
|
await self.crawls.create_index(
|
|
[("type", pymongo.HASHED), ("state", pymongo.DESCENDING)]
|
|
)
|
|
await self.crawls.create_index(
|
|
[("type", pymongo.HASHED), ("fileSize", pymongo.DESCENDING)]
|
|
)
|
|
|
|
await self.crawls.create_index([("finished", pymongo.DESCENDING)])
|
|
await self.crawls.create_index([("oid", pymongo.HASHED)])
|
|
await self.crawls.create_index([("cid", pymongo.HASHED)])
|
|
await self.crawls.create_index([("state", pymongo.HASHED)])
|
|
await self.crawls.create_index([("fileSize", pymongo.DESCENDING)])
|
|
|
|
async def get_crawl(
|
|
self,
|
|
crawlid: str,
|
|
org: Optional[Organization] = None,
|
|
) -> Crawl:
|
|
"""Get crawl data for internal use"""
|
|
res = await self.get_crawl_raw(crawlid, org, "crawl")
|
|
return Crawl.from_dict(res)
|
|
|
|
@contextlib.asynccontextmanager
|
|
async def get_redis(self, crawl_id):
|
|
"""get redis url for crawl id"""
|
|
redis_url = self.crawl_manager.get_redis_url(crawl_id)
|
|
|
|
redis = await self.crawl_manager.get_redis_client(redis_url)
|
|
|
|
try:
|
|
yield redis
|
|
finally:
|
|
await redis.close()
|
|
|
|
async def list_crawls(
|
|
self,
|
|
org: Optional[Organization] = None,
|
|
cid: Optional[UUID] = None,
|
|
userid: Optional[UUID] = None,
|
|
crawl_id: str = "",
|
|
running_only=False,
|
|
state: Optional[List[str]] = None,
|
|
first_seed: Optional[str] = None,
|
|
name: Optional[str] = None,
|
|
description: Optional[str] = None,
|
|
collection_id: Optional[UUID] = None,
|
|
page_size: int = DEFAULT_PAGE_SIZE,
|
|
page: int = 1,
|
|
sort_by: Optional[str] = None,
|
|
sort_direction: int = -1,
|
|
resources: bool = False,
|
|
):
|
|
"""List all finished crawls from the db"""
|
|
# pylint: disable=too-many-locals,too-many-branches,too-many-statements
|
|
# Zero-index page for query
|
|
page = page - 1
|
|
skip = page * page_size
|
|
|
|
oid = org.id if org else None
|
|
|
|
query: dict[str, object] = {"type": {"$in": ["crawl", None]}}
|
|
if oid:
|
|
query["oid"] = oid
|
|
|
|
if cid:
|
|
query["cid"] = cid
|
|
|
|
if userid:
|
|
query["userid"] = userid
|
|
|
|
if running_only:
|
|
query["state"] = {"$in": RUNNING_AND_STARTING_STATES}
|
|
|
|
# Override running_only if state list is explicitly passed
|
|
if state:
|
|
validated_states = [value for value in state if value in ALL_CRAWL_STATES]
|
|
query["state"] = {"$in": validated_states}
|
|
|
|
if crawl_id:
|
|
query["_id"] = crawl_id
|
|
|
|
# pylint: disable=duplicate-code
|
|
aggregate = [
|
|
{"$match": query},
|
|
{"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
|
|
{"$set": {"firstSeed": "$firstSeedObject.url"}},
|
|
{"$unset": ["firstSeedObject", "errors", "config"]},
|
|
{"$set": {"activeQAStats": "$qa.stats"}},
|
|
{
|
|
"$set": {
|
|
"qaFinishedArray": {
|
|
"$map": {
|
|
"input": {"$objectToArray": "$qaFinished"},
|
|
"in": "$$this.v",
|
|
}
|
|
}
|
|
}
|
|
},
|
|
# Add active QA run to array if exists prior to sorting, taking care not to
|
|
# pass null to $concatArrays so that our result isn't null
|
|
{
|
|
"$set": {
|
|
"qaActiveArray": {"$cond": [{"$ne": ["$qa", None]}, ["$qa"], []]}
|
|
}
|
|
},
|
|
{
|
|
"$set": {
|
|
"qaArray": {"$concatArrays": ["$qaFinishedArray", "$qaActiveArray"]}
|
|
}
|
|
},
|
|
{
|
|
"$set": {
|
|
"sortedQARuns": {
|
|
"$sortArray": {
|
|
"input": "$qaArray",
|
|
"sortBy": {"started": -1},
|
|
}
|
|
}
|
|
}
|
|
},
|
|
{"$set": {"lastQARun": {"$arrayElemAt": ["$sortedQARuns", 0]}}},
|
|
{"$set": {"lastQAState": "$lastQARun.state"}},
|
|
{"$set": {"lastQAStarted": "$lastQARun.started"}},
|
|
{
|
|
"$set": {
|
|
"qaRunCount": {
|
|
"$size": {
|
|
"$cond": [
|
|
{"$isArray": "$qaArray"},
|
|
"$qaArray",
|
|
[],
|
|
]
|
|
}
|
|
}
|
|
}
|
|
},
|
|
{
|
|
"$unset": [
|
|
"lastQARun",
|
|
"qaActiveArray",
|
|
"qaFinishedArray",
|
|
"qaArray",
|
|
"sortedQARuns",
|
|
]
|
|
},
|
|
]
|
|
|
|
if not resources:
|
|
aggregate.extend([{"$unset": ["files"]}])
|
|
|
|
if name:
|
|
aggregate.extend([{"$match": {"name": name}}])
|
|
|
|
if description:
|
|
aggregate.extend([{"$match": {"description": description}}])
|
|
|
|
if first_seed:
|
|
aggregate.extend([{"$match": {"firstSeed": first_seed}}])
|
|
|
|
if collection_id:
|
|
aggregate.extend([{"$match": {"collectionIds": {"$in": [collection_id]}}}])
|
|
|
|
if sort_by:
|
|
if sort_by not in (
|
|
"started",
|
|
"finished",
|
|
"fileSize",
|
|
"firstSeed",
|
|
"reviewStatus",
|
|
"qaRunCount",
|
|
"lastQAState",
|
|
"lastQAStarted",
|
|
):
|
|
raise HTTPException(status_code=400, detail="invalid_sort_by")
|
|
if sort_direction not in (1, -1):
|
|
raise HTTPException(status_code=400, detail="invalid_sort_direction")
|
|
|
|
aggregate.extend([{"$sort": {sort_by: sort_direction}}])
|
|
|
|
aggregate.extend(
|
|
[
|
|
{
|
|
"$facet": {
|
|
"items": [
|
|
{"$skip": skip},
|
|
{"$limit": page_size},
|
|
],
|
|
"total": [{"$count": "count"}],
|
|
}
|
|
},
|
|
]
|
|
)
|
|
|
|
# Get total
|
|
cursor = self.crawls.aggregate(aggregate)
|
|
results = await cursor.to_list(length=1)
|
|
result = results[0]
|
|
items = result["items"]
|
|
|
|
try:
|
|
total = int(result["total"][0]["count"])
|
|
except (IndexError, ValueError):
|
|
total = 0
|
|
|
|
cls = CrawlOut
|
|
if resources:
|
|
cls = CrawlOutWithResources
|
|
|
|
crawls = []
|
|
for result in items:
|
|
crawl = cls.from_dict(result)
|
|
files = result.get("files") if resources else None
|
|
crawl = await self._resolve_crawl_refs(
|
|
crawl, org, files=files, add_first_seed=False
|
|
)
|
|
crawls.append(crawl)
|
|
|
|
return crawls, total
|
|
|
|
async def delete_crawls(
|
|
self,
|
|
org: Organization,
|
|
delete_list: DeleteCrawlList,
|
|
type_="crawl",
|
|
user: Optional[User] = None,
|
|
):
|
|
"""Delete a list of crawls by id for given org"""
|
|
|
|
count, cids_to_update, quota_reached = await super().delete_crawls(
|
|
org, delete_list, type_, user
|
|
)
|
|
|
|
if count < 1:
|
|
raise HTTPException(status_code=404, detail="crawl_not_found")
|
|
|
|
for cid, cid_dict in cids_to_update.items():
|
|
cid_size = cid_dict["size"]
|
|
cid_inc = cid_dict["inc"]
|
|
await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc)
|
|
|
|
return {"deleted": True, "storageQuotaReached": quota_reached}
|
|
|
|
# pylint: disable=too-many-arguments
|
|
async def add_new_crawl(
|
|
self,
|
|
crawl_id: str,
|
|
crawlconfig: CrawlConfig,
|
|
userid: UUID,
|
|
started: str,
|
|
manual: bool,
|
|
username: str = "",
|
|
):
|
|
"""initialize new crawl"""
|
|
if not username:
|
|
user = await self.user_manager.get_by_id(userid)
|
|
if user:
|
|
username = user.name
|
|
|
|
image = self.crawl_configs.get_channel_crawler_image(crawlconfig.crawlerChannel)
|
|
|
|
crawl = Crawl(
|
|
id=crawl_id,
|
|
state="starting",
|
|
userid=userid,
|
|
userName=username,
|
|
oid=crawlconfig.oid,
|
|
cid=crawlconfig.id,
|
|
cid_rev=crawlconfig.rev,
|
|
scale=crawlconfig.scale,
|
|
jobType=crawlconfig.jobType,
|
|
config=crawlconfig.config,
|
|
profileid=crawlconfig.profileid,
|
|
schedule=crawlconfig.schedule,
|
|
crawlTimeout=crawlconfig.crawlTimeout,
|
|
maxCrawlSize=crawlconfig.maxCrawlSize,
|
|
manual=manual,
|
|
started=started,
|
|
tags=crawlconfig.tags,
|
|
name=crawlconfig.name,
|
|
crawlerChannel=crawlconfig.crawlerChannel,
|
|
image=image,
|
|
)

        try:
            await self.crawls.insert_one(crawl.to_dict())
            return dt_now()

        except pymongo.errors.DuplicateKeyError:
            return None

    async def update_crawl_scale(
        self, crawl_id: str, org: Organization, crawl_scale: CrawlScale, user: User
    ):
        """Update crawl scale in the db"""
        crawl = await self.get_crawl(crawl_id, org)
        update = UpdateCrawlConfig(scale=crawl_scale.scale)
        await self.crawl_configs.update_crawl_config(crawl.cid, org, user, update)

        result = await self.crawls.find_one_and_update(
            {"_id": crawl_id, "type": "crawl", "oid": org.id},
            {"$set": {"scale": crawl_scale.scale}},
            return_document=pymongo.ReturnDocument.AFTER,
        )

        if not result:
            raise HTTPException(status_code=404, detail=f"Crawl '{crawl_id}' not found")

        return True

    async def _crawl_queue_len(self, redis, key):
        try:
            return await redis.zcard(key)
        except exceptions.ResponseError:
            # fallback to old crawler queue
            return await redis.llen(key)

    async def _crawl_queue_range(self, redis, key, offset, count):
        try:
            return await redis.zrangebyscore(key, 0, "inf", offset, count)
        except exceptions.ResponseError:
            # fallback to old crawler queue
            return reversed(await redis.lrange(key, -offset - count, -offset - 1))

    async def get_crawl_queue(self, crawl_id, offset, count, regex):
        """get crawl queue"""

        total = 0
        results = []

        try:
            async with self.get_redis(crawl_id) as redis:
                total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
                results = await self._crawl_queue_range(
                    redis, f"{crawl_id}:q", offset, count
                )
                results = [json.loads(result)["url"] for result in results]

        except exceptions.ConnectionError:
            # can't connect to redis, likely not initialized yet
            pass

        matched = []
        if regex:
            try:
                regex = re.compile(regex)
            except re.error as exc:
                raise HTTPException(status_code=400, detail="invalid_regex") from exc

            matched = [result for result in results if regex.search(result)]

        return {"total": total, "results": results, "matched": matched}

    async def match_crawl_queue(self, crawl_id, regex, offset=0):
        """Get the list of queued URLs that match the regex, starting at offset.
        The queue is scanned in steps of DEFAULT_RANGE_LIMIT until the matched
        response exceeds MAX_MATCH_SIZE; nextOffset is then set to the next step
        boundary, or to -1 if the entire remaining queue was scanned."""
        total = 0
        matched = []
        step = DEFAULT_RANGE_LIMIT

        async with self.get_redis(crawl_id) as redis:
            try:
                total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
            except exceptions.ConnectionError:
                # can't connect to redis, likely not initialized yet
                pass

            try:
                regex = re.compile(regex)
            except re.error as exc:
                raise HTTPException(status_code=400, detail="invalid_regex") from exc

            next_offset = -1
            size = 0

            for count in range(offset, total, step):
                results = await self._crawl_queue_range(
                    redis, f"{crawl_id}:q", count, step
                )
                for result in results:
                    url = json.loads(result)["url"]
                    if regex.search(url):
                        size += len(url)
                        matched.append(url)

                # if size of match response exceeds size limit, set nextOffset
                # and break
                if size > MAX_MATCH_SIZE:
                    next_offset = count + step
                    break

        return {"total": total, "matched": matched, "nextOffset": next_offset}

    async def add_or_remove_exclusion(self, crawl_id, regex, org, user, add):
        """add new exclusion to config or remove exclusion from config
        for given crawl_id, update config on crawl"""

        crawl = await self.get_crawl(crawl_id, org)

        cid = crawl.cid

        scale = crawl.scale or 1

        async with self.get_redis(crawl_id) as redis:
            query = {
                "regex": regex,
                "type": "addExclusion" if add else "removeExclusion",
            }
            query_str = json.dumps(query)

            for i in range(0, scale):
                await redis.rpush(f"crawl-{crawl_id}-{i}:msg", query_str)

        new_config = await self.crawl_configs.add_or_remove_exclusion(
            regex, cid, org, user, add
        )

        await self.crawls.find_one_and_update(
            {"_id": crawl_id, "type": "crawl", "oid": org.id},
            {"$set": {"config": new_config.dict()}},
        )

        return {"success": True}

    async def update_crawl_state_if_allowed(
        self,
        crawl_id: str,
        is_qa: bool,
        state: TYPE_ALL_CRAWL_STATES,
        allowed_from: Sequence[TYPE_ALL_CRAWL_STATES],
        finished: Optional[datetime] = None,
        stats: Optional[CrawlStats] = None,
    ):
        """update crawl state and other properties in db if state has changed"""
        prefix = "" if not is_qa else "qa."

        update: Dict[str, Any] = {f"{prefix}state": state}
        if finished:
            update[f"{prefix}finished"] = finished
        if stats:
            update[f"{prefix}stats"] = stats.dict()

        query: Dict[str, Any] = {"_id": crawl_id, "type": "crawl"}
        if allowed_from:
            query[f"{prefix}state"] = {"$in": allowed_from}

        return await self.crawls.find_one_and_update(query, {"$set": update})

    async def update_running_crawl_stats(
        self, crawl_id: str, is_qa: bool, stats: CrawlStats
    ):
        """update running crawl stats"""
        prefix = "" if not is_qa else "qa."
        query = {"_id": crawl_id, "type": "crawl", f"{prefix}state": "running"}
        return await self.crawls.find_one_and_update(
            query, {"$set": {f"{prefix}stats": stats.dict()}}
        )

    async def inc_crawl_exec_time(
        self,
        crawl_id: str,
        is_qa: bool,
        exec_time,
        last_updated_time,
    ):
        """increment exec time"""
        # update both crawl-shared qa exec seconds and per-qa run exec seconds
        if is_qa:
            inc_update = {
                "qaCrawlExecSeconds": exec_time,
                "qa.crawlExecSeconds": exec_time,
            }
        else:
            inc_update = {"crawlExecSeconds": exec_time}

        return await self.crawls.find_one_and_update(
            {
                "_id": crawl_id,
                "type": "crawl",
                "_lut": {"$ne": last_updated_time},
            },
            {
                "$inc": inc_update,
                "$set": {"_lut": last_updated_time},
            },
        )

    async def get_crawl_exec_last_update_time(self, crawl_id):
        """get crawl last updated time"""
        res = await self.crawls.find_one(
            {"_id": crawl_id, "type": "crawl"}, projection=["_lut"]
        )
        return res and res.get("_lut")

    async def get_crawl_state(self, crawl_id: str, is_qa: bool):
        """return current crawl state of a crawl"""
        prefix = "" if not is_qa else "qa."

        res = await self.crawls.find_one(
            {"_id": crawl_id},
            projection={"state": f"${prefix}state", "finished": f"${prefix}finished"},
        )
        if not res:
            return None, None
        return res.get("state"), res.get("finished")

    async def add_crawl_error(
        self,
        crawl_id: str,
        is_qa: bool,
        error: str,
    ):
        """add crawl error from redis to mongodb errors field"""
        prefix = "" if not is_qa else "qa."

        await self.crawls.find_one_and_update(
            {"_id": crawl_id}, {"$push": {f"{prefix}errors": error}}
        )

    async def add_crawl_file(
        self, crawl_id: str, is_qa: bool, crawl_file: CrawlFile, size: int
    ):
        """add new crawl file to crawl"""
        prefix = "" if not is_qa else "qa."

        await self.crawls.find_one_and_update(
            {"_id": crawl_id},
            {
                "$push": {f"{prefix}files": crawl_file.dict()},
                "$inc": {f"{prefix}fileCount": 1, f"{prefix}fileSize": size},
            },
        )

    async def get_crawl_seeds(
        self,
        crawl_id: str,
        org: Organization,
        page_size: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
    ):
        """Get paginated list of seeds from crawl"""
        skip = (page - 1) * page_size
        upper_bound = skip + page_size

        crawl = await self.get_crawl(crawl_id, org)
        if not crawl.config or not crawl.config.seeds:
            return [], 0
        try:
            return crawl.config.seeds[skip:upper_bound], len(crawl.config.seeds)
        # pylint: disable=broad-exception-caught
        except Exception:
            return [], 0

    async def get_crawl_stats(
        self, org: Optional[Organization] = None
    ) -> List[Dict[str, Union[str, int]]]:
        """Return crawl statistics"""
        # pylint: disable=too-many-locals
        org_slugs = await self.orgs.get_org_slugs_by_ids()
        user_emails = await self.user_manager.get_user_emails_by_ids()

        crawls_data: List[Dict[str, Union[str, int]]] = []

        query: Dict[str, Union[str, UUID]] = {"type": "crawl"}
        if org:
            query["oid"] = org.id

        async for crawl_raw in self.crawls.find(query):
            crawl = Crawl.from_dict(crawl_raw)
            data: Dict[str, Union[str, int]] = {}
            data["id"] = crawl.id

            data["oid"] = str(crawl.oid)
            data["org"] = org_slugs[crawl.oid]

            data["cid"] = str(crawl.cid)
            data["name"] = f'"{crawl.name}"' if crawl.name else ""
            data["state"] = crawl.state

            data["userid"] = str(crawl.userid)
            data["user"] = user_emails.get(crawl.userid)

            data["started"] = str(crawl.started)
            data["finished"] = str(crawl.finished)

            data["duration"] = 0
            duration_seconds = 0
            if crawl.started and crawl.finished:
                duration = crawl.finished - crawl.started
                duration_seconds = int(duration.total_seconds())
                if duration_seconds:
                    data["duration"] = duration_seconds

            if crawl.stats:
                data["pages"] = crawl.stats.done

            data["filesize"] = crawl.fileSize

            data["avg_page_time"] = 0
            if crawl.stats and crawl.stats.done != 0 and duration_seconds:
                data["avg_page_time"] = int(duration_seconds / crawl.stats.done)

            crawls_data.append(data)

        return crawls_data

    async def shutdown_crawl(
        self, crawl_id: str, org: Organization, graceful: bool
    ) -> Dict[str, bool]:
        """stop or cancel specified crawl"""
        crawl = await self.get_base_crawl(crawl_id, org)
        if crawl and crawl.type != "crawl":
            raise HTTPException(status_code=400, detail="not_a_crawl")

        result = None
        try:
            result = await self.crawl_manager.shutdown_crawl(
                crawl_id, graceful=graceful
            )

            if result.get("success"):
                if graceful:
                    await self.crawls.find_one_and_update(
                        {"_id": crawl_id, "type": "crawl", "oid": org.id},
                        {"$set": {"stopping": True}},
                    )
                return result

        except Exception as exc:
            # pylint: disable=raise-missing-from
            # if reached here, probably crawl doesn't exist anymore
            raise HTTPException(
                status_code=404, detail=f"crawl_not_found, (details: {exc})"
            )

        # if job no longer running, canceling is considered success,
        # but graceful stoppage is not possible, so would be a failure
        if result.get("error") == "Not Found":
            if not graceful:
                await self.update_crawl_state(crawl_id, "canceled")
                crawl = await self.get_crawl(crawl_id, org)
                if not await self.crawl_configs.stats_recompute_last(crawl.cid, 0, -1):
                    raise HTTPException(
                        status_code=404,
                        detail=f"crawl_config_not_found: {crawl.cid}",
                    )

                return {"success": True}

        # return whatever detail may be included in the response
        raise HTTPException(status_code=400, detail=result)

    async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User):
        """Start crawl QA run"""

        crawl = await self.get_crawl(crawl_id, org)

        # ensure org execution is allowed
        if org.readOnly:
            raise HTTPException(status_code=403, detail="org_set_to_read_only")

        # can only QA finished crawls
        if not crawl.finished:
            raise HTTPException(status_code=400, detail="crawl_not_finished")

        # can only QA successfully finished crawls
        if crawl.state not in SUCCESSFUL_STATES:
            raise HTTPException(status_code=400, detail="crawl_did_not_succeed")

        # if set, can only QA if crawl image is >= min_qa_crawler_image
        if (
            self.min_qa_crawler_image
            and crawl.image
            and crawl.image < self.min_qa_crawler_image
        ):
            raise HTTPException(status_code=400, detail="qa_not_supported_for_crawl")

        # can only run one QA at a time
        if crawl.qa:
            raise HTTPException(status_code=400, detail="qa_already_running")

        # not a valid crawl
        if not crawl.cid or crawl.type != "crawl":
            raise HTTPException(status_code=400, detail="invalid_crawl_for_qa")

        crawlconfig = await self.crawl_configs.prepare_for_run_crawl(crawl.cid, org)

        try:
            qa_run_id = await self.crawl_manager.create_qa_crawl_job(
                crawlconfig,
                org.storage,
                userid=str(user.id),
                qa_source=crawl_id,
                storage_filename=self.crawl_configs.default_filename_template,
            )

            image = self.crawl_configs.get_channel_crawler_image(
                crawlconfig.crawlerChannel
            )

            qa_run = QARun(
                id=qa_run_id,
                started=datetime.now(),
                userid=user.id,
                userName=user.name,
                state="starting",
                image=image,
            )

            await self.crawls.find_one_and_update(
                {"_id": crawl_id},
                {
                    "$set": {
                        "qa": qa_run.dict(),
                    }
                },
            )

            return qa_run_id

        except Exception as exc:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=500, detail=f"Error starting QA run: {exc}")

    async def stop_crawl_qa_run(
        self, crawl_id: str, org: Organization, graceful: bool = True
    ):
        """Stop crawl QA run, QA run removed when actually finished"""
        crawl = await self.get_crawl(crawl_id, org)

        if not crawl.qa:
            raise HTTPException(status_code=400, detail="qa_not_running")

        try:
            result = await self.crawl_manager.shutdown_crawl(
                crawl.qa.id, graceful=graceful
            )

            if result.get("error") == "Not Found":
                # treat as success, qa crawl no longer exists, so mark as no qa
                result = {"success": True}

            return result

        except Exception as exc:
            # pylint: disable=raise-missing-from
            # if reached here, probably crawl doesn't exist anymore
            raise HTTPException(
                status_code=404, detail=f"crawl_not_found, (details: {exc})"
            )

    async def delete_crawl_qa_runs(
        self, crawl_id: str, delete_list: DeleteQARunList, org: Organization
    ):
        """delete specified finished QA run"""
        count = 0
        for qa_run_id in delete_list.qa_run_ids:
            await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id)
            await self.delete_crawl_qa_run_files(crawl_id, qa_run_id, org)

            res = await self.crawls.find_one_and_update(
                {"_id": crawl_id, "type": "crawl"},
                {"$unset": {f"qaFinished.{qa_run_id}": ""}},
            )

            if res:
                count += 1

        return {"deleted": count}

    async def delete_crawl_qa_run_files(
        self, crawl_id: str, qa_run_id: str, org: Organization
    ):
        """delete crawl qa wacz files"""
        qa_run = await self.get_qa_run(crawl_id, qa_run_id, org)
        for file_ in qa_run.files:
            if not await self.storage_ops.delete_crawl_file_object(org, file_):
                raise HTTPException(status_code=400, detail="file_deletion_error")
            # Not replicating QA run WACZs yet
            # await self.background_job_ops.create_delete_replica_jobs(
            #     org, file_, qa_run_id, "qa"
            # )

    async def qa_run_finished(self, crawl_id: str):
        """clear active qa, add qa run to finished list, if successful"""
        crawl = await self.get_crawl(crawl_id)

        if not crawl.qa:
            return False

        query: Dict[str, Any] = {"qa": None}

        if crawl.qa.finished and crawl.qa.state in NON_RUNNING_STATES:
            query[f"qaFinished.{crawl.qa.id}"] = crawl.qa.dict()

        if await self.crawls.find_one_and_update(
            {"_id": crawl_id, "type": "crawl"}, {"$set": query}
        ):
            return True

        return False

    async def get_qa_runs(
        self,
        crawl_id: str,
        skip_failed: bool = False,
        org: Optional[Organization] = None,
    ) -> List[QARunOut]:
        """Return list of QA runs"""
        crawl_data = await self.get_crawl_raw(
            crawl_id, org, "crawl", project={"qaFinished": True, "qa": True}
        )
        qa_finished = crawl_data.get("qaFinished") or {}
        if skip_failed:
            all_qa = [
                QARunOut(**qa_run_data)
                for qa_run_data in qa_finished.values()
                if qa_run_data.get("state") in SUCCESSFUL_STATES
            ]
        else:
            all_qa = [QARunOut(**qa_run_data) for qa_run_data in qa_finished.values()]
        all_qa.sort(key=lambda x: x.finished or dt_now(), reverse=True)
        qa = crawl_data.get("qa")
        # ensure current QA run didn't just fail, just in case
        if qa and (not skip_failed or qa.get("state") in SUCCESSFUL_STATES):
            all_qa.insert(0, QARunOut(**qa))
        return all_qa

    async def get_active_qa(
        self, crawl_id: str, org: Optional[Organization] = None
    ) -> Optional[QARunOut]:
        """return just the active QA, if any"""
        crawl_data = await self.get_crawl_raw(
            crawl_id, org, "crawl", project={"qa": True}
        )
        qa = crawl_data.get("qa")
        return QARunOut(**qa) if qa else None

    async def get_qa_run(
        self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None
    ):
        """Get QARun by id"""
        crawl = await self.get_crawl(crawl_id, org)
        qa_finished = crawl.qaFinished or {}
        qa_run = qa_finished.get(qa_run_id)

        if not qa_run:
            raise HTTPException(status_code=404, detail="crawl_qa_not_found")

        return qa_run

    async def get_qa_run_for_replay(
        self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None
    ) -> QARunWithResources:
        """Fetch QA runs with resources for replay.json"""
        crawl = await self.get_crawl(crawl_id, org)
        qa_run = await self.get_qa_run(crawl_id, qa_run_id, org)

        if not org:
            org = await self.orgs.get_org_by_id(crawl.oid)
            if not org:
                raise HTTPException(status_code=400, detail="missing_org")

        resources = await self.resolve_signed_urls(
            qa_run.files, org, crawl.id, qa_run_id
        )

        qa_run.files = []

        qa_run_dict = qa_run.dict()
        qa_run_dict["resources"] = resources

        return QARunWithResources(**qa_run_dict)

    async def get_qa_run_aggregate_stats(
        self,
        crawl_id: str,
        qa_run_id: str,
        thresholds: Dict[str, List[float]],
    ) -> QARunAggregateStatsOut:
        """Get aggregate stats for QA run"""
        screenshot_results = await self.page_ops.get_qa_run_aggregate_counts(
            crawl_id, qa_run_id, thresholds, key="screenshotMatch"
        )
        text_results = await self.page_ops.get_qa_run_aggregate_counts(
            crawl_id, qa_run_id, thresholds, key="textMatch"
        )
        return QARunAggregateStatsOut(
            screenshotMatch=screenshot_results,
            textMatch=text_results,
        )


# ============================================================================
async def recompute_crawl_file_count_and_size(crawls, crawl_id):
    """Fully recompute file count and size for given crawl"""
    file_count = 0
    size = 0

    crawl_raw = await crawls.find_one({"_id": crawl_id})
    crawl = Crawl.from_dict(crawl_raw)
    for file_ in crawl.files:
        file_count += 1
        size += file_.size

    await crawls.find_one_and_update(
        {"_id": crawl_id},
        {"$set": {"fileCount": file_count, "fileSize": size}},
    )


# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, too-many-statements
def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
    """API for crawl management, including crawl done callback"""
    # pylint: disable=invalid-name, duplicate-code

    ops = CrawlOps(crawl_manager, *args)

    org_viewer_dep = ops.orgs.org_viewer_dep
    org_crawl_dep = ops.orgs.org_crawl_dep

    @app.get("/orgs/all/crawls", tags=["crawls"])
    async def list_crawls_admin(
        user: User = Depends(user_dep),
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        userid: Optional[UUID] = None,
        cid: Optional[UUID] = None,
        state: Optional[str] = None,
        firstSeed: Optional[str] = None,
        name: Optional[str] = None,
        description: Optional[str] = None,
        collectionId: Optional[UUID] = None,
        sortBy: Optional[str] = None,
        sortDirection: int = -1,
        runningOnly: Optional[bool] = True,
    ):
        if not user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")

        states = []
        if state:
            states = state.split(",")

        if firstSeed:
            firstSeed = urllib.parse.unquote(firstSeed)

        if name:
            name = urllib.parse.unquote(name)

        if description:
            description = urllib.parse.unquote(description)

        crawls, total = await ops.list_crawls(
            None,
            userid=userid,
            cid=cid,
            running_only=runningOnly,
            state=states,
            first_seed=firstSeed,
            name=name,
            description=description,
            collection_id=collectionId,
            page_size=pageSize,
            page=page,
            sort_by=sortBy,
            sort_direction=sortDirection,
        )
        return paginated_format(crawls, total, page, pageSize)

    @app.get("/orgs/{oid}/crawls", tags=["crawls"], response_model=PaginatedResponse)
    async def list_crawls(
        org: Organization = Depends(org_viewer_dep),
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        userid: Optional[UUID] = None,
        cid: Optional[UUID] = None,
        state: Optional[str] = None,
        firstSeed: Optional[str] = None,
        name: Optional[str] = None,
        description: Optional[str] = None,
        collectionId: Optional[UUID] = None,
        sortBy: Optional[str] = None,
        sortDirection: int = -1,
    ):
        # pylint: disable=duplicate-code
        states = []
        if state:
            states = state.split(",")

        if firstSeed:
            firstSeed = urllib.parse.unquote(firstSeed)

        if name:
            name = urllib.parse.unquote(name)

        if description:
            description = urllib.parse.unquote(description)

        crawls, total = await ops.list_crawls(
            org,
            userid=userid,
            cid=cid,
            running_only=False,
            state=states,
            first_seed=firstSeed,
            name=name,
            description=description,
            collection_id=collectionId,
            page_size=pageSize,
            page=page,
            sort_by=sortBy,
            sort_direction=sortDirection,
        )
        return paginated_format(crawls, total, page, pageSize)

    @app.post(
        "/orgs/{oid}/crawls/{crawl_id}/cancel",
        tags=["crawls"],
    )
    async def crawl_cancel_immediately(
        crawl_id, org: Organization = Depends(org_crawl_dep)
    ):
        return await ops.shutdown_crawl(crawl_id, org, graceful=False)

    @app.post(
        "/orgs/{oid}/crawls/{crawl_id}/stop",
        tags=["crawls"],
    )
    async def crawl_graceful_stop(crawl_id, org: Organization = Depends(org_crawl_dep)):
        return await ops.shutdown_crawl(crawl_id, org, graceful=True)

    @app.post("/orgs/{oid}/crawls/delete", tags=["crawls"])
    async def delete_crawls(
        delete_list: DeleteCrawlList,
        user: User = Depends(user_dep),
        org: Organization = Depends(org_crawl_dep),
    ):
        return await ops.delete_crawls(org, delete_list, "crawl", user)

    @app.get("/orgs/all/crawls/stats", tags=["crawls"])
    async def get_all_orgs_crawl_stats(
        user: User = Depends(user_dep),
    ):
        if not user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")

        crawl_stats = await ops.get_crawl_stats()
        return stream_dict_list_as_csv(crawl_stats, "crawling-stats.csv")

    @app.get("/orgs/{oid}/crawls/stats", tags=["crawls"])
    async def get_org_crawl_stats(
        org: Organization = Depends(org_crawl_dep),
    ):
        crawl_stats = await ops.get_crawl_stats(org)
        return stream_dict_list_as_csv(crawl_stats, f"crawling-stats-{org.id}.csv")

    @app.get(
        "/orgs/all/crawls/{crawl_id}/replay.json",
        tags=["crawls"],
        response_model=CrawlOutWithResources,
    )
    async def get_crawl_admin(crawl_id, user: User = Depends(user_dep)):
        if not user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")

        return await ops.get_crawl_out(crawl_id, None, "crawl")

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/replay.json",
        tags=["crawls"],
        response_model=CrawlOutWithResources,
    )
    async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return await ops.get_crawl_out(crawl_id, org, "crawl")

    # QA APIs
    # ---------------------
    @app.get(
        "/orgs/all/crawls/{crawl_id}/qa/{qa_run_id}/replay.json",
        tags=["qa"],
        response_model=QARunWithResources,
    )
    async def get_qa_run_admin(crawl_id, qa_run_id, user: User = Depends(user_dep)):
        if not user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")

        return await ops.get_qa_run_for_replay(crawl_id, qa_run_id)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/replay.json",
        tags=["qa"],
        response_model=QARunWithResources,
    )
    async def get_qa_run(
        crawl_id, qa_run_id, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/stats",
        tags=["qa"],
        response_model=QARunAggregateStatsOut,
    )
    async def get_qa_run_aggregate_stats(
        crawl_id,
        qa_run_id,
        screenshotThresholds: str,
        textThresholds: str,
        # pylint: disable=unused-argument
        org: Organization = Depends(org_viewer_dep),
    ):
        thresholds: Dict[str, List[float]] = {}
        try:
            thresholds["screenshotMatch"] = [
                float(threshold) for threshold in screenshotThresholds.split(",")
            ]
            thresholds["textMatch"] = [
                float(threshold) for threshold in textThresholds.split(",")
            ]
        # pylint: disable=broad-exception-caught,raise-missing-from
        except Exception:
            raise HTTPException(status_code=400, detail="invalid_thresholds")

        return await ops.get_qa_run_aggregate_stats(
            crawl_id,
            qa_run_id,
            thresholds,
        )

    @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/start", tags=["qa"])
    async def start_crawl_qa_run(
        crawl_id: str,
        org: Organization = Depends(org_crawl_dep),
        user: User = Depends(user_dep),
    ):
        qa_run_id = await ops.start_crawl_qa_run(crawl_id, org, user)
        return {"started": qa_run_id}

    @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/stop", tags=["qa"])
    async def stop_crawl_qa_run(
        crawl_id: str, org: Organization = Depends(org_crawl_dep)
    ):
        # pylint: disable=unused-argument
        return await ops.stop_crawl_qa_run(crawl_id, org)

    @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/cancel", tags=["qa"])
    async def cancel_crawl_qa_run(
        crawl_id: str, org: Organization = Depends(org_crawl_dep)
    ):
        # pylint: disable=unused-argument
        return await ops.stop_crawl_qa_run(crawl_id, org, graceful=False)

    @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/delete", tags=["qa"])
    async def delete_crawl_qa_runs(
        crawl_id: str,
        qa_run_ids: DeleteQARunList,
        org: Organization = Depends(org_crawl_dep),
    ):
        return await ops.delete_crawl_qa_runs(crawl_id, qa_run_ids, org)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa",
        tags=["qa"],
        response_model=List[QARunOut],
    )
    async def get_qa_runs(
        crawl_id, org: Organization = Depends(org_viewer_dep), skipFailed: bool = False
    ):
        return await ops.get_qa_runs(crawl_id, skip_failed=skipFailed, org=org)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa/activeQA",
        tags=["qa"],
        response_model=Dict[str, Optional[QARunOut]],
    )
    async def get_active_qa(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return {"qa": await ops.get_active_qa(crawl_id, org)}

    # ----

    @app.get(
        "/orgs/all/crawls/{crawl_id}",
        tags=["crawls"],
        response_model=CrawlOut,
    )
    async def list_single_crawl_admin(crawl_id, user: User = Depends(user_dep)):
        if not user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")

        crawls, _ = await ops.list_crawls(crawl_id=crawl_id)
        if len(crawls) < 1:
            raise HTTPException(status_code=404, detail="crawl_not_found")

        return crawls[0]

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}",
        tags=["crawls"],
        response_model=CrawlOut,
    )
    async def list_single_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)):
        crawls, _ = await ops.list_crawls(org, crawl_id=crawl_id)
        if len(crawls) < 1:
            raise HTTPException(status_code=404, detail="crawl_not_found")

        return crawls[0]

    @app.patch("/orgs/{oid}/crawls/{crawl_id}", tags=["crawls"])
    async def update_crawl_api(
        update: UpdateCrawl, crawl_id: str, org: Organization = Depends(org_crawl_dep)
    ):
        return await ops.update_crawl(crawl_id, org, update, "crawl")

    @app.post(
        "/orgs/{oid}/crawls/{crawl_id}/scale",
        tags=["crawls"],
    )
    async def scale_crawl(
        scale: CrawlScale,
        crawl_id,
        user: User = Depends(user_dep),
        org: Organization = Depends(org_crawl_dep),
    ):
        await ops.update_crawl_scale(crawl_id, org, scale, user)

        result = await ops.crawl_manager.scale_crawl(crawl_id, scale.scale)
        if not result or not result.get("success"):
            raise HTTPException(
                status_code=400, detail=result.get("error") or "unknown"
            )

        return {"scaled": scale.scale}

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/access",
        tags=["crawls"],
    )
    async def access_check(crawl_id, org: Organization = Depends(org_crawl_dep)):
        if await ops.get_crawl_raw(crawl_id, org):
            return {}

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/queue",
        tags=["crawls"],
    )
    async def get_crawl_queue(
        crawl_id,
        offset: int,
        count: int,
        regex: Optional[str] = "",
        org: Organization = Depends(org_crawl_dep),
    ):
        await ops.get_crawl_raw(crawl_id, org)

        return await ops.get_crawl_queue(crawl_id, offset, count, regex)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/queueMatchAll",
        tags=["crawls"],
    )
    async def match_crawl_queue(
        crawl_id,
        regex: str,
        offset: int = 0,
        org: Organization = Depends(org_crawl_dep),
    ):
        await ops.get_crawl_raw(crawl_id, org)

        return await ops.match_crawl_queue(crawl_id, regex, offset)

    @app.post(
        "/orgs/{oid}/crawls/{crawl_id}/exclusions",
        tags=["crawls"],
    )
    async def add_exclusion(
        crawl_id,
        regex: str,
        org: Organization = Depends(org_crawl_dep),
        user: User = Depends(user_dep),
    ):
        return await ops.add_or_remove_exclusion(crawl_id, regex, org, user, add=True)

    @app.delete(
        "/orgs/{oid}/crawls/{crawl_id}/exclusions",
        tags=["crawls"],
    )
    async def remove_exclusion(
        crawl_id,
        regex: str,
        org: Organization = Depends(org_crawl_dep),
        user: User = Depends(user_dep),
    ):
        return await ops.add_or_remove_exclusion(crawl_id, regex, org, user, add=False)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/seeds",
        tags=["crawls"],
        response_model=PaginatedResponse,
    )
    async def get_crawl_config_seeds(
        crawl_id: str,
        org: Organization = Depends(org_viewer_dep),
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
    ):
        seeds, total = await ops.get_crawl_seeds(crawl_id, org, pageSize, page)
        return paginated_format(seeds, total, page, pageSize)

    @app.get("/orgs/{oid}/crawls/{crawl_id}/logs", tags=["crawls"])
    async def stream_crawl_logs(
        crawl_id,
        org: Organization = Depends(org_viewer_dep),
        logLevel: Optional[str] = None,
        context: Optional[str] = None,
    ):
        crawl = await ops.get_crawl_out(crawl_id, org)

        log_levels = []
        contexts = []
        if logLevel:
            log_levels = logLevel.split(",")
        if context:
            contexts = context.split(",")

        # If crawl is finished, stream logs from WACZ files using presigned urls
        if crawl.finished:
            resp = await ops.storage_ops.sync_stream_wacz_logs(
                crawl.resources or [], log_levels, contexts
            )
            return StreamingResponse(
                resp,
                media_type="text/jsonl",
                headers={
                    "Content-Disposition": f'attachment; filename="{crawl_id}.log"'
                },
            )

        raise HTTPException(status_code=400, detail="crawl_not_finished")

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/errors",
        tags=["crawls"],
    )
    async def get_crawl_errors(
        crawl_id: str,
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        org: Organization = Depends(org_viewer_dep),
    ):
        crawl = await ops.get_crawl(crawl_id, org)

        skip = (page - 1) * pageSize
        upper_bound = skip + pageSize

        errors = crawl.errors[skip:upper_bound] if crawl.errors else []
        parsed_errors = parse_jsonl_error_messages(errors)
        return paginated_format(parsed_errors, len(crawl.errors or []), page, pageSize)

    return ops
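One usage note on the queue-matching API defined above: `match_crawl_queue` caps each response at roughly `MAX_MATCH_SIZE` bytes of matched URLs and reports `nextOffset` (-1 once the whole queue has been scanned), so clients are expected to page through `queueMatchAll`. A minimal client sketch, with a placeholder base URL and auth token:

```python
# Client-side paging sketch for the queueMatchAll endpoint defined above.
# The base URL and token are placeholders for an actual deployment.
import httpx

API_BASE = "https://btrix.example.com/api"  # placeholder deployment URL
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder auth


def match_all_queued_urls(oid: str, crawl_id: str, regex: str) -> list:
    """Page through queueMatchAll until nextOffset comes back as -1."""
    matched = []
    offset = 0
    while offset != -1:
        resp = httpx.get(
            f"{API_BASE}/orgs/{oid}/crawls/{crawl_id}/queueMatchAll",
            params={"regex": regex, "offset": offset},
            headers=HEADERS,
        )
        resp.raise_for_status()
        data = resp.json()
        matched.extend(data["matched"])
        offset = data["nextOffset"]
    return matched
```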