Backend work for #1672

Adds new sort options to the /crawls and /all-crawls GET list endpoints:

- `reviewStatus`
- `qaRunCount`: number of completed QA runs for the crawl (also added to `CrawlOut`)
- `qaState`: sorts by `activeQAState` first, then `lastQAState`, both of which are added to `CrawlOut`
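For illustration, a client could exercise the new sort options like this. This is a minimal sketch: the base URL, token, and org id are placeholder assumptions, while `sortBy`/`sortDirection` match the query parameters of the list endpoints below.

```python
# Minimal sketch of calling the crawl list endpoint with the new sort options.
# BASE_URL, TOKEN, and ORG_ID are hypothetical placeholders.
import requests

BASE_URL = "https://app.example.com/api"
TOKEN = "..."
ORG_ID = "..."


def list_crawls(sort_by: str, sort_direction: int = -1) -> list:
    """Return one page of crawls sorted by the given field."""
    resp = requests.get(
        f"{BASE_URL}/orgs/{ORG_ID}/crawls",
        headers={"Authorization": f"Bearer {TOKEN}"},
        params={"sortBy": sort_by, "sortDirection": sort_direction},
    )
    resp.raise_for_status()
    return resp.json()["items"]


by_qa_state = list_crawls("qaState")      # activeQAState first, then lastQAState
by_qa_runs = list_crawls("qaRunCount")    # number of completed QA runs
by_review = list_crawls("reviewStatus")
```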
""" Crawl API """
|
|
|
|
# pylint: disable=too-many-lines
|
|
|
|
import json
|
|
import re
|
|
import contextlib
|
|
import urllib.parse
|
|
from datetime import datetime
|
|
from uuid import UUID
|
|
|
|
from typing import Optional, List, Dict, Union, Any, Sequence
|
|
|
|
from fastapi import Depends, HTTPException
|
|
from fastapi.responses import StreamingResponse
|
|
from redis import asyncio as exceptions
|
|
import pymongo
|
|
|
|
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
|
|
from .utils import dt_now, parse_jsonl_error_messages, stream_dict_list_as_csv
|
|
from .basecrawls import BaseCrawlOps
|
|
from .crawlmanager import CrawlManager
|
|
from .models import (
|
|
UpdateCrawl,
|
|
DeleteCrawlList,
|
|
CrawlConfig,
|
|
UpdateCrawlConfig,
|
|
CrawlScale,
|
|
CrawlStats,
|
|
CrawlFile,
|
|
Crawl,
|
|
CrawlOut,
|
|
CrawlOutWithResources,
|
|
QARun,
|
|
QARunOut,
|
|
QARunWithResources,
|
|
DeleteQARunList,
|
|
Organization,
|
|
User,
|
|
PaginatedResponse,
|
|
RUNNING_AND_STARTING_STATES,
|
|
SUCCESSFUL_STATES,
|
|
NON_RUNNING_STATES,
|
|
ALL_CRAWL_STATES,
|
|
TYPE_ALL_CRAWL_STATES,
|
|
)
|
|
|
|
|
|
MAX_MATCH_SIZE = 500000
|
|
DEFAULT_RANGE_LIMIT = 50
|
|
|
|
|
|
# ============================================================================
|
|
# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods
|
|
class CrawlOps(BaseCrawlOps):
|
|
"""Crawl Ops"""
|
|
|
|
crawl_manager: CrawlManager
|
|
|
|
def __init__(self, crawl_manager: CrawlManager, *args):
|
|
super().__init__(*args)
|
|
self.crawl_manager = crawl_manager
|
|
self.crawl_configs.set_crawl_ops(self)
|
|
self.colls.set_crawl_ops(self)
|
|
self.event_webhook_ops.set_crawl_ops(self)
|
|
|
|
async def init_index(self):
|
|
"""init index for crawls db collection"""
|
|
await self.crawls.create_index([("type", pymongo.HASHED)])
|
|
|
|
await self.crawls.create_index(
|
|
[("type", pymongo.HASHED), ("finished", pymongo.DESCENDING)]
|
|
)
|
|
await self.crawls.create_index(
|
|
[("type", pymongo.HASHED), ("oid", pymongo.DESCENDING)]
|
|
)
|
|
await self.crawls.create_index(
|
|
[("type", pymongo.HASHED), ("cid", pymongo.DESCENDING)]
|
|
)
|
|
await self.crawls.create_index(
|
|
[("type", pymongo.HASHED), ("state", pymongo.DESCENDING)]
|
|
)
|
|
await self.crawls.create_index(
|
|
[("type", pymongo.HASHED), ("fileSize", pymongo.DESCENDING)]
|
|
)
|
|
|
|
await self.crawls.create_index([("finished", pymongo.DESCENDING)])
|
|
await self.crawls.create_index([("oid", pymongo.HASHED)])
|
|
await self.crawls.create_index([("cid", pymongo.HASHED)])
|
|
await self.crawls.create_index([("state", pymongo.HASHED)])
|
|
await self.crawls.create_index([("fileSize", pymongo.DESCENDING)])
|
|
|
|
async def get_crawl(
|
|
self,
|
|
crawlid: str,
|
|
org: Optional[Organization] = None,
|
|
project: Optional[dict[str, bool]] = None,
|
|
) -> Crawl:
|
|
"""Get crawl data for internal use"""
|
|
res = await self.get_crawl_raw(crawlid, org, "crawl", project)
|
|
return Crawl.from_dict(res)
|
|
|
|
@contextlib.asynccontextmanager
|
|
async def get_redis(self, crawl_id):
|
|
"""get redis url for crawl id"""
|
|
redis_url = self.crawl_manager.get_redis_url(crawl_id)
|
|
|
|
redis = await self.crawl_manager.get_redis_client(redis_url)
|
|
|
|
try:
|
|
yield redis
|
|
finally:
|
|
await redis.close()
|
|
|
|
async def list_crawls(
|
|
self,
|
|
org: Optional[Organization] = None,
|
|
cid: Optional[UUID] = None,
|
|
userid: Optional[UUID] = None,
|
|
crawl_id: str = "",
|
|
running_only=False,
|
|
state: Optional[List[str]] = None,
|
|
first_seed: Optional[str] = None,
|
|
name: Optional[str] = None,
|
|
description: Optional[str] = None,
|
|
collection_id: Optional[UUID] = None,
|
|
page_size: int = DEFAULT_PAGE_SIZE,
|
|
page: int = 1,
|
|
sort_by: Optional[str] = None,
|
|
sort_direction: int = -1,
|
|
resources: bool = False,
|
|
):
|
|
"""List all finished crawls from the db"""
|
|
# pylint: disable=too-many-locals,too-many-branches,too-many-statements
|
|
# Zero-index page for query
|
|
page = page - 1
|
|
skip = page * page_size
|
|
|
|
oid = org.id if org else None
|
|
|
|
query: dict[str, object] = {"type": {"$in": ["crawl", None]}}
|
|
if oid:
|
|
query["oid"] = oid
|
|
|
|
if cid:
|
|
query["cid"] = cid
|
|
|
|
if userid:
|
|
query["userid"] = userid
|
|
|
|
if running_only:
|
|
query["state"] = {"$in": RUNNING_AND_STARTING_STATES}
|
|
|
|
# Override running_only if state list is explicitly passed
|
|
if state:
|
|
validated_states = [value for value in state if value in ALL_CRAWL_STATES]
|
|
query["state"] = {"$in": validated_states}
|
|
|
|
if crawl_id:
|
|
query["_id"] = crawl_id
|
|
|
|
        # pylint: disable=duplicate-code
        aggregate = [
            {"$match": query},
            {"$set": {"firstSeedObject": {"$arrayElemAt": ["$config.seeds", 0]}}},
            {"$set": {"firstSeed": "$firstSeedObject.url"}},
            {"$unset": ["firstSeedObject", "errors", "config"]},
            # Expose QA state fields used for sorting and in CrawlOut
            {"$set": {"qaState": "$qa.state"}},
            {"$set": {"activeQAState": "$qaState"}},
            {
                "$set": {
                    "qaFinishedArray": {
                        "$map": {
                            "input": {"$objectToArray": "$qaFinished"},
                            "in": "$$this.v",
                        }
                    }
                }
            },
            # Most recently started finished QA run determines lastQAState
            {
                "$set": {
                    "sortedQARuns": {
                        "$sortArray": {
                            "input": "$qaFinishedArray",
                            "sortBy": {"started": -1},
                        }
                    }
                }
            },
            {"$set": {"lastQARun": {"$arrayElemAt": ["$sortedQARuns", 0]}}},
            {"$set": {"lastQAState": "$lastQARun.state"}},
            # Number of completed QA runs (empty array when none)
            {
                "$set": {
                    "qaRunCount": {
                        "$size": {
                            "$cond": [
                                {"$isArray": "$qaFinishedArray"},
                                "$qaFinishedArray",
                                [],
                            ]
                        }
                    }
                }
            },
            {
                "$unset": [
                    "lastQARun",
                    "qaFinishedArray",
                    "sortedQARuns",
                ]
            },
        ]

        if not resources:
            aggregate.extend([{"$unset": ["files"]}])

        if name:
            aggregate.extend([{"$match": {"name": name}}])

        if description:
            aggregate.extend([{"$match": {"description": description}}])

        if first_seed:
            aggregate.extend([{"$match": {"firstSeed": first_seed}}])

        if collection_id:
            aggregate.extend([{"$match": {"collectionIds": {"$in": [collection_id]}}}])

        if sort_by:
            if sort_by not in (
                "started",
                "finished",
                "fileSize",
                "firstSeed",
                "reviewStatus",
                "qaRunCount",
                "qaState",
            ):
                raise HTTPException(status_code=400, detail="invalid_sort_by")
            if sort_direction not in (1, -1):
                raise HTTPException(status_code=400, detail="invalid_sort_direction")

            sort_query = {sort_by: sort_direction}

            # Add secondary sort for qaState - sorted by current, then last
            if sort_by == "qaState":
                sort_query["lastQAState"] = sort_direction

            aggregate.extend([{"$sort": sort_query}])

        aggregate.extend(
            [
                {
                    "$facet": {
                        "items": [
                            {"$skip": skip},
                            {"$limit": page_size},
                        ],
                        "total": [{"$count": "count"}],
                    }
                },
            ]
        )

        # Get total
        cursor = self.crawls.aggregate(aggregate)
        results = await cursor.to_list(length=1)
        result = results[0]
        items = result["items"]

        try:
            total = int(result["total"][0]["count"])
        except (IndexError, ValueError):
            total = 0

        cls = CrawlOut
        if resources:
            cls = CrawlOutWithResources

        crawls = []
        for result in items:
            crawl = cls.from_dict(result)
            files = result.get("files") if resources else None
            crawl = await self._resolve_crawl_refs(
                crawl, org, files=files, add_first_seed=False
            )
            crawls.append(crawl)

        return crawls, total

    async def delete_crawls(
        self,
        org: Organization,
        delete_list: DeleteCrawlList,
        type_="crawl",
        user: Optional[User] = None,
    ):
        """Delete a list of crawls by id for given org"""

        count, cids_to_update, quota_reached = await super().delete_crawls(
            org, delete_list, type_, user
        )

        if count < 1:
            raise HTTPException(status_code=404, detail="crawl_not_found")

        for cid, cid_dict in cids_to_update.items():
            cid_size = cid_dict["size"]
            cid_inc = cid_dict["inc"]
            await self.crawl_configs.stats_recompute_last(cid, -cid_size, -cid_inc)

        return {"deleted": True, "storageQuotaReached": quota_reached}

    # pylint: disable=too-many-arguments
    async def add_new_crawl(
        self,
        crawl_id: str,
        crawlconfig: CrawlConfig,
        userid: UUID,
        started: str,
        manual: bool,
        username: str = "",
    ):
        """initialize new crawl"""
        if not username:
            user = await self.user_manager.get_by_id(userid)
            if user:
                username = user.name

        image = self.crawl_configs.get_channel_crawler_image(crawlconfig.crawlerChannel)

        crawl = Crawl(
            id=crawl_id,
            state="starting",
            userid=userid,
            userName=username,
            oid=crawlconfig.oid,
            cid=crawlconfig.id,
            cid_rev=crawlconfig.rev,
            scale=crawlconfig.scale,
            jobType=crawlconfig.jobType,
            config=crawlconfig.config,
            profileid=crawlconfig.profileid,
            schedule=crawlconfig.schedule,
            crawlTimeout=crawlconfig.crawlTimeout,
            maxCrawlSize=crawlconfig.maxCrawlSize,
            manual=manual,
            started=started,
            tags=crawlconfig.tags,
            name=crawlconfig.name,
            crawlerChannel=crawlconfig.crawlerChannel,
            image=image,
        )

        try:
            await self.crawls.insert_one(crawl.to_dict())
            return dt_now()

        except pymongo.errors.DuplicateKeyError:
            return None

    async def update_crawl_scale(
        self, crawl_id: str, org: Organization, crawl_scale: CrawlScale, user: User
    ):
        """Update crawl scale in the db"""
        crawl = await self.get_crawl(crawl_id, org)
        update = UpdateCrawlConfig(scale=crawl_scale.scale)
        await self.crawl_configs.update_crawl_config(crawl.cid, org, user, update)

        result = await self.crawls.find_one_and_update(
            {"_id": crawl_id, "type": "crawl", "oid": org.id},
            {"$set": {"scale": crawl_scale.scale}},
            return_document=pymongo.ReturnDocument.AFTER,
        )

        if not result:
            raise HTTPException(status_code=404, detail=f"Crawl '{crawl_id}' not found")

        return True

    async def _crawl_queue_len(self, redis, key):
        try:
            return await redis.zcard(key)
        except exceptions.ResponseError:
            # fallback to old crawler queue
            return await redis.llen(key)

    async def _crawl_queue_range(self, redis, key, offset, count):
        try:
            return await redis.zrangebyscore(key, 0, "inf", offset, count)
        except exceptions.ResponseError:
            # fallback to old crawler queue
            return reversed(await redis.lrange(key, -offset - count, -offset - 1))

    async def get_crawl_queue(self, crawl_id, offset, count, regex):
        """get crawl queue"""

        total = 0
        results = []

        try:
            async with self.get_redis(crawl_id) as redis:
                total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
                results = await self._crawl_queue_range(
                    redis, f"{crawl_id}:q", offset, count
                )
                results = [json.loads(result)["url"] for result in results]

        except exceptions.ConnectionError:
            # can't connect to redis, likely not initialized yet
            pass

        matched = []
        if regex:
            try:
                regex = re.compile(regex)
            except re.error as exc:
                raise HTTPException(status_code=400, detail="invalid_regex") from exc

            matched = [result for result in results if regex.search(result)]

        return {"total": total, "results": results, "matched": matched}

    async def match_crawl_queue(self, crawl_id, regex, offset=0):
        """get list of urls that match regex, starting at offset and at most
        around 'limit'. (limit rounded to next step boundary, so
        limit <= next_offset < limit + step"""
        total = 0
        matched = []
        step = DEFAULT_RANGE_LIMIT

        async with self.get_redis(crawl_id) as redis:
            try:
                total = await self._crawl_queue_len(redis, f"{crawl_id}:q")
            except exceptions.ConnectionError:
                # can't connect to redis, likely not initialized yet
                pass

            try:
                regex = re.compile(regex)
            except re.error as exc:
                raise HTTPException(status_code=400, detail="invalid_regex") from exc

            next_offset = -1
            size = 0

            for count in range(offset, total, step):
                results = await self._crawl_queue_range(
                    redis, f"{crawl_id}:q", count, step
                )
                for result in results:
                    url = json.loads(result)["url"]
                    if regex.search(url):
                        size += len(url)
                        matched.append(url)

                # if size of match response exceeds size limit, set nextOffset
                # and break
                if size > MAX_MATCH_SIZE:
                    next_offset = count + step
                    break

        return {"total": total, "matched": matched, "nextOffset": next_offset}

    async def add_or_remove_exclusion(self, crawl_id, regex, org, user, add):
        """add new exclusion to config or remove exclusion from config
        for given crawl_id, update config on crawl"""

        crawl = await self.get_crawl(crawl_id, org, project={"cid": True})

        cid = crawl.cid

        scale = crawl.scale or 1

        async with self.get_redis(crawl_id) as redis:
            query = {
                "regex": regex,
                "type": "addExclusion" if add else "removeExclusion",
            }
            query_str = json.dumps(query)

            for i in range(0, scale):
                await redis.rpush(f"crawl-{crawl_id}-{i}:msg", query_str)

        new_config = await self.crawl_configs.add_or_remove_exclusion(
            regex, cid, org, user, add
        )

        await self.crawls.find_one_and_update(
            {"_id": crawl_id, "type": "crawl", "oid": org.id},
            {"$set": {"config": new_config.dict()}},
        )

        return {"success": True}

    async def update_crawl_state_if_allowed(
        self,
        crawl_id: str,
        is_qa: bool,
        state: TYPE_ALL_CRAWL_STATES,
        allowed_from: Sequence[TYPE_ALL_CRAWL_STATES],
        finished: Optional[datetime] = None,
        stats: Optional[CrawlStats] = None,
    ):
        """update crawl state and other properties in db if state has changed"""
        prefix = "" if not is_qa else "qa."

        update: Dict[str, Any] = {f"{prefix}state": state}
        if finished:
            update[f"{prefix}finished"] = finished
        if stats:
            update[f"{prefix}stats"] = stats.dict()

        query: Dict[str, Any] = {"_id": crawl_id, "type": "crawl"}
        if allowed_from:
            query[f"{prefix}state"] = {"$in": allowed_from}

        return await self.crawls.find_one_and_update(query, {"$set": update})

    async def update_running_crawl_stats(
        self, crawl_id: str, is_qa: bool, stats: CrawlStats
    ):
        """update running crawl stats"""
        prefix = "" if not is_qa else "qa."
        query = {"_id": crawl_id, "type": "crawl", f"{prefix}state": "running"}
        return await self.crawls.find_one_and_update(
            query, {"$set": {f"{prefix}stats": stats.dict()}}
        )

    async def inc_crawl_exec_time(
        self,
        crawl_id: str,
        is_qa: bool,
        exec_time,
        last_updated_time,
    ):
        """increment exec time"""
        # update both crawl-shared qa exec seconds and per-qa run exec seconds
        if is_qa:
            inc_update = {
                "qaCrawlExecSeconds": exec_time,
                "qa.crawlExecSeconds": exec_time,
            }
        else:
            inc_update = {"crawlExecSeconds": exec_time}

        return await self.crawls.find_one_and_update(
            {
                "_id": crawl_id,
                "type": "crawl",
                "_lut": {"$ne": last_updated_time},
            },
            {
                "$inc": inc_update,
                "$set": {"_lut": last_updated_time},
            },
        )

    async def get_crawl_exec_last_update_time(self, crawl_id):
        """get crawl last updated time"""
        res = await self.crawls.find_one(
            {"_id": crawl_id, "type": "crawl"}, projection=["_lut"]
        )
        return res and res.get("_lut")

    async def get_crawl_state(self, crawl_id: str, is_qa: bool):
        """return current crawl state of a crawl"""
        prefix = "" if not is_qa else "qa."

        res = await self.crawls.find_one(
            {"_id": crawl_id},
            projection={"state": f"${prefix}state", "finished": f"${prefix}finished"},
        )
        if not res:
            return None, None
        return res.get("state"), res.get("finished")

    async def add_crawl_error(
        self,
        crawl_id: str,
        is_qa: bool,
        error: str,
    ):
        """add crawl error from redis to mongodb errors field"""
        prefix = "" if not is_qa else "qa."

        await self.crawls.find_one_and_update(
            {"_id": crawl_id}, {"$push": {f"{prefix}errors": error}}
        )

    async def add_crawl_file(
        self, crawl_id: str, is_qa: bool, crawl_file: CrawlFile, size: int
    ):
        """add new crawl file to crawl"""
        prefix = "" if not is_qa else "qa."

        await self.crawls.find_one_and_update(
            {"_id": crawl_id},
            {
                "$push": {f"{prefix}files": crawl_file.dict()},
                "$inc": {f"{prefix}fileCount": 1, f"{prefix}fileSize": size},
            },
        )

    async def get_crawl_seeds(
        self,
        crawl_id: str,
        org: Organization,
        page_size: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
    ):
        """Get paginated list of seeds from crawl"""
        skip = (page - 1) * page_size
        upper_bound = skip + page_size

        crawl = await self.get_crawl(crawl_id, org)
        if not crawl.config or not crawl.config.seeds:
            return [], 0
        try:
            return crawl.config.seeds[skip:upper_bound], len(crawl.config.seeds)
        # pylint: disable=broad-exception-caught
        except Exception:
            return [], 0

    async def get_crawl_stats(
        self, org: Optional[Organization] = None
    ) -> List[Dict[str, Union[str, int]]]:
        """Return crawl statistics"""
        # pylint: disable=too-many-locals
        org_slugs = await self.orgs.get_org_slugs_by_ids()
        user_emails = await self.user_manager.get_user_emails_by_ids()

        crawls_data: List[Dict[str, Union[str, int]]] = []

        query: Dict[str, Union[str, UUID]] = {"type": "crawl"}
        if org:
            query["oid"] = org.id

        async for crawl_raw in self.crawls.find(query):
            crawl = Crawl.from_dict(crawl_raw)
            data: Dict[str, Union[str, int]] = {}
            data["id"] = crawl.id

            data["oid"] = str(crawl.oid)
            data["org"] = org_slugs[crawl.oid]

data["cid"] = crawl.id
|
|
data["name"] = f'"{crawl.name}"' if crawl.name else ""
|
|
data["state"] = crawl.state
|
|
|
|
data["userid"] = str(crawl.userid)
|
|
data["user"] = user_emails.get(crawl.userid)
|
|
|
|
data["started"] = str(crawl.started)
|
|
data["finished"] = str(crawl.finished)
|
|
|
|
data["duration"] = 0
|
|
duration_seconds = 0
|
|
if crawl.started and crawl.finished:
|
|
duration = crawl.finished - crawl.started
|
|
duration_seconds = int(duration.total_seconds())
|
|
if duration_seconds:
|
|
data["duration"] = duration_seconds
|
|
|
|
if crawl.stats:
|
|
data["pages"] = crawl.stats.done
|
|
|
|
data["filesize"] = crawl.fileSize
|
|
|
|
data["avg_page_time"] = 0
|
|
if crawl.stats and crawl.stats.done != 0 and duration_seconds:
|
|
data["avg_page_time"] = int(duration_seconds / crawl.stats.done)
|
|
|
|
crawls_data.append(data)
|
|
|
|
return crawls_data
|
|
|
|
    async def shutdown_crawl(
        self, crawl_id: str, org: Organization, graceful: bool
    ) -> Dict[str, bool]:
        """stop or cancel specified crawl"""
        crawl = await self.get_base_crawl(crawl_id, org)
        if crawl and crawl.type != "crawl":
            raise HTTPException(status_code=400, detail="not_a_crawl")

        result = None
        try:
            result = await self.crawl_manager.shutdown_crawl(
                crawl_id, graceful=graceful
            )

            if result.get("success"):
                if graceful:
                    await self.crawls.find_one_and_update(
                        {"_id": crawl_id, "type": "crawl", "oid": org.id},
                        {"$set": {"stopping": True}},
                    )
                return result

        except Exception as exc:
            # pylint: disable=raise-missing-from
            # if reached here, probably crawl doesn't exist anymore
            raise HTTPException(
                status_code=404, detail=f"crawl_not_found, (details: {exc})"
            )

        # if job no longer running, canceling is considered success,
        # but graceful stoppage is not possible, so would be a failure
        if result.get("error") == "Not Found":
            if not graceful:
                await self.update_crawl_state(crawl_id, "canceled")
                crawl = await self.get_crawl(crawl_id, org)
                if not await self.crawl_configs.stats_recompute_last(crawl.cid, 0, -1):
                    raise HTTPException(
                        status_code=404,
                        detail=f"crawl_config_not_found: {crawl.cid}",
                    )

                return {"success": True}

        # return whatever detail may be included in the response
        raise HTTPException(status_code=400, detail=result)

    async def start_crawl_qa_run(self, crawl_id: str, org: Organization, user: User):
        """Start crawl QA run"""

        crawl = await self.get_crawl(crawl_id, org)

        # can only QA finished crawls
        if not crawl.finished:
            raise HTTPException(status_code=400, detail="crawl_not_finished")

        # can only QA successfully finished crawls
        if crawl.state not in SUCCESSFUL_STATES:
            raise HTTPException(status_code=400, detail="crawl_did_not_succeed")

        # can only run one QA at a time
        if crawl.qa:
            raise HTTPException(status_code=400, detail="qa_already_running")

        # not a valid crawl
        if not crawl.cid or crawl.type != "crawl":
            raise HTTPException(status_code=400, detail="invalid_crawl_for_qa")

        crawlconfig = await self.crawl_configs.prepare_for_run_crawl(crawl.cid, org)

        try:
            qa_run_id = await self.crawl_manager.create_qa_crawl_job(
                crawlconfig,
                org.storage,
                userid=str(user.id),
                qa_source=crawl_id,
            )

            image = self.crawl_configs.get_channel_crawler_image(
                crawlconfig.crawlerChannel
            )

            qa_run = QARun(
                id=qa_run_id,
                started=datetime.now(),
                userid=user.id,
                userName=user.name,
                state="starting",
                image=image,
            )

            await self.crawls.find_one_and_update(
                {"_id": crawl_id},
                {
                    "$set": {
                        "qa": qa_run.dict(),
                    }
                },
            )

            return qa_run_id

        except Exception as exc:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")

    async def stop_crawl_qa_run(
        self, crawl_id: str, org: Organization, graceful: bool = True
    ):
        """Stop crawl QA run, QA run removed when actually finished"""
        crawl = await self.get_crawl(crawl_id, org)

        if not crawl.qa:
            raise HTTPException(status_code=400, detail="qa_not_running")

        try:
            result = await self.crawl_manager.shutdown_crawl(
                crawl.qa.id, graceful=graceful
            )

            if result.get("error") == "Not Found":
                # treat as success, qa crawl no longer exists, so mark as no qa
                result = {"success": True}

            return result

        except Exception as exc:
            # pylint: disable=raise-missing-from
            # if reached here, probably crawl doesn't exist anymore
            raise HTTPException(
                status_code=404, detail=f"crawl_not_found, (details: {exc})"
            )

    async def delete_crawl_qa_runs(self, crawl_id: str, delete_list: DeleteQARunList):
        """delete specified finished QA run"""

        count = 0
        for qa_run_id in delete_list.qa_run_ids:
            res = await self.crawls.find_one_and_update(
                {"_id": crawl_id, "type": "crawl"},
                {"$unset": {f"qaFinished.{qa_run_id}": ""}},
            )

            if res:
                count += 1

            await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id)

        return {"deleted": count}

    async def qa_run_finished(self, crawl_id: str):
        """clear active qa, add qa run to finished list, if successful"""
        crawl = await self.get_crawl(crawl_id)

        if not crawl.qa:
            return False

        query: Dict[str, Any] = {"qa": None}

        if crawl.qa.finished and crawl.qa.state in NON_RUNNING_STATES:
            query[f"qaFinished.{crawl.qa.id}"] = crawl.qa.dict()

        if await self.crawls.find_one_and_update(
            {"_id": crawl_id, "type": "crawl"}, {"$set": query}
        ):
            return True

        return False

    async def get_qa_runs(
        self,
        crawl_id: str,
        skip_failed: bool = False,
        org: Optional[Organization] = None,
    ) -> List[QARunOut]:
        """Return list of QA runs"""
        crawl_data = await self.get_crawl_raw(
            crawl_id, org, "crawl", project={"qaFinished": True, "qa": True}
        )
        qa_finished = crawl_data.get("qaFinished") or {}
        if skip_failed:
            all_qa = [
                QARunOut(**qa_run_data)
                for qa_run_data in qa_finished.values()
                if qa_run_data.get("state") in SUCCESSFUL_STATES
            ]
        else:
            all_qa = [QARunOut(**qa_run_data) for qa_run_data in qa_finished.values()]
        all_qa.sort(key=lambda x: x.finished or dt_now(), reverse=True)
        qa = crawl_data.get("qa")
        # ensure current QA run didn't just fail, just in case
        if qa and (not skip_failed or qa.get("state") in SUCCESSFUL_STATES):
            all_qa.insert(0, QARunOut(**qa))
        return all_qa

    async def get_active_qa(
        self, crawl_id: str, org: Optional[Organization] = None
    ) -> Optional[QARunOut]:
        """return just the active QA, if any"""
        crawl_data = await self.get_crawl_raw(
            crawl_id, org, "crawl", project={"qa": True}
        )
        qa = crawl_data.get("qa")
        return QARunOut(**qa) if qa else None

    async def get_qa_run_for_replay(
        self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None
    ) -> QARunWithResources:
        """Fetch QA runs with resources for replay.json"""
        crawl = await self.get_crawl(crawl_id, org)
        qa_finished = crawl.qaFinished or {}
        qa_run = qa_finished.get(qa_run_id)

        if not qa_run:
            raise HTTPException(status_code=404, detail="crawl_qa_not_found")

        if not org:
            org = await self.orgs.get_org_by_id(crawl.oid)
            if not org:
                raise HTTPException(status_code=400, detail="missing_org")

        resources = await self._resolve_signed_urls(
            qa_run.files, org, crawl.id, qa_run_id
        )

        qa_run.files = []

        qa_run_dict = qa_run.dict()
        qa_run_dict["resources"] = resources

        return QARunWithResources(**qa_run_dict)


# ============================================================================
async def recompute_crawl_file_count_and_size(crawls, crawl_id):
    """Fully recompute file count and size for given crawl"""
    file_count = 0
    size = 0

    crawl_raw = await crawls.find_one({"_id": crawl_id})
    crawl = Crawl.from_dict(crawl_raw)
    for file_ in crawl.files:
        file_count += 1
        size += file_.size

    await crawls.find_one_and_update(
        {"_id": crawl_id},
        {"$set": {"fileCount": file_count, "fileSize": size}},
    )


# ============================================================================
# pylint: disable=too-many-arguments, too-many-locals, too-many-statements
def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
    """API for crawl management, including crawl done callback"""
    # pylint: disable=invalid-name, duplicate-code

    ops = CrawlOps(crawl_manager, *args)

    org_viewer_dep = ops.orgs.org_viewer_dep
    org_crawl_dep = ops.orgs.org_crawl_dep

    @app.get("/orgs/all/crawls", tags=["crawls"])
    async def list_crawls_admin(
        user: User = Depends(user_dep),
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        userid: Optional[UUID] = None,
        cid: Optional[UUID] = None,
        state: Optional[str] = None,
        firstSeed: Optional[str] = None,
        name: Optional[str] = None,
        description: Optional[str] = None,
        collectionId: Optional[UUID] = None,
        sortBy: Optional[str] = None,
        sortDirection: int = -1,
        runningOnly: Optional[bool] = True,
    ):
        if not user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")

        states = []
        if state:
            states = state.split(",")

        if firstSeed:
            firstSeed = urllib.parse.unquote(firstSeed)

        if name:
            name = urllib.parse.unquote(name)

        if description:
            description = urllib.parse.unquote(description)

        crawls, total = await ops.list_crawls(
            None,
            userid=userid,
            cid=cid,
            running_only=runningOnly,
            state=states,
            first_seed=firstSeed,
            name=name,
            description=description,
            collection_id=collectionId,
            page_size=pageSize,
            page=page,
            sort_by=sortBy,
            sort_direction=sortDirection,
        )
        return paginated_format(crawls, total, page, pageSize)

    @app.get("/orgs/{oid}/crawls", tags=["crawls"], response_model=PaginatedResponse)
    async def list_crawls(
        org: Organization = Depends(org_viewer_dep),
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        userid: Optional[UUID] = None,
        cid: Optional[UUID] = None,
        state: Optional[str] = None,
        firstSeed: Optional[str] = None,
        name: Optional[str] = None,
        description: Optional[str] = None,
        collectionId: Optional[UUID] = None,
        sortBy: Optional[str] = None,
        sortDirection: int = -1,
    ):
        # pylint: disable=duplicate-code
        states = []
        if state:
            states = state.split(",")

        if firstSeed:
            firstSeed = urllib.parse.unquote(firstSeed)

        if name:
            name = urllib.parse.unquote(name)

        if description:
            description = urllib.parse.unquote(description)

        crawls, total = await ops.list_crawls(
            org,
            userid=userid,
            cid=cid,
            running_only=False,
            state=states,
            first_seed=firstSeed,
            name=name,
            description=description,
            collection_id=collectionId,
            page_size=pageSize,
            page=page,
            sort_by=sortBy,
            sort_direction=sortDirection,
        )
        return paginated_format(crawls, total, page, pageSize)

    @app.post(
        "/orgs/{oid}/crawls/{crawl_id}/cancel",
        tags=["crawls"],
    )
    async def crawl_cancel_immediately(
        crawl_id, org: Organization = Depends(org_crawl_dep)
    ):
        return await ops.shutdown_crawl(crawl_id, org, graceful=False)

    @app.post(
        "/orgs/{oid}/crawls/{crawl_id}/stop",
        tags=["crawls"],
    )
    async def crawl_graceful_stop(crawl_id, org: Organization = Depends(org_crawl_dep)):
        return await ops.shutdown_crawl(crawl_id, org, graceful=True)

    @app.post("/orgs/{oid}/crawls/delete", tags=["crawls"])
    async def delete_crawls(
        delete_list: DeleteCrawlList,
        user: User = Depends(user_dep),
        org: Organization = Depends(org_crawl_dep),
    ):
        return await ops.delete_crawls(org, delete_list, "crawl", user)

    @app.get("/orgs/all/crawls/stats", tags=["crawls"])
    async def get_all_orgs_crawl_stats(
        user: User = Depends(user_dep),
    ):
        if not user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")

        crawl_stats = await ops.get_crawl_stats()
        return stream_dict_list_as_csv(crawl_stats, "crawling-stats.csv")

    @app.get("/orgs/{oid}/crawls/stats", tags=["crawls"])
    async def get_org_crawl_stats(
        org: Organization = Depends(org_crawl_dep),
    ):
        crawl_stats = await ops.get_crawl_stats(org)
        return stream_dict_list_as_csv(crawl_stats, f"crawling-stats-{org.id}.csv")

    @app.get(
        "/orgs/all/crawls/{crawl_id}/replay.json",
        tags=["crawls"],
        response_model=CrawlOutWithResources,
    )
    async def get_crawl_admin(crawl_id, user: User = Depends(user_dep)):
        if not user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")

        return await ops.get_crawl_out(crawl_id, None, "crawl")

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/replay.json",
        tags=["crawls"],
        response_model=CrawlOutWithResources,
    )
    async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return await ops.get_crawl_out(crawl_id, org, "crawl")

    # QA APIs
    # ---------------------
    @app.get(
        "/orgs/all/crawls/{crawl_id}/qa/{qa_run_id}/replay.json",
        tags=["qa"],
        response_model=QARunWithResources,
    )
    async def get_qa_run_admin(crawl_id, qa_run_id, user: User = Depends(user_dep)):
        if not user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")

        return await ops.get_qa_run_for_replay(crawl_id, qa_run_id)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/replay.json",
        tags=["qa"],
        response_model=QARunWithResources,
    )
    async def get_qa_run(
        crawl_id, qa_run_id, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org)

    @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/start", tags=["qa"])
    async def start_crawl_qa_run(
        crawl_id: str,
        org: Organization = Depends(org_crawl_dep),
        user: User = Depends(user_dep),
    ):
        qa_run_id = await ops.start_crawl_qa_run(crawl_id, org, user)
        return {"started": qa_run_id}

    @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/stop", tags=["qa"])
    async def stop_crawl_qa_run(
        crawl_id: str, org: Organization = Depends(org_crawl_dep)
    ):
        # pylint: disable=unused-argument
        return await ops.stop_crawl_qa_run(crawl_id, org)

    @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/cancel", tags=["qa"])
    async def cancel_crawl_qa_run(
        crawl_id: str, org: Organization = Depends(org_crawl_dep)
    ):
        # pylint: disable=unused-argument
        return await ops.stop_crawl_qa_run(crawl_id, org, graceful=False)

    @app.post("/orgs/{oid}/crawls/{crawl_id}/qa/delete", tags=["qa"])
    async def delete_crawl_qa_runs(
        crawl_id: str,
        qa_run_ids: DeleteQARunList,
        org: Organization = Depends(org_crawl_dep),
    ):
        # pylint: disable=unused-argument
        return await ops.delete_crawl_qa_runs(crawl_id, qa_run_ids)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa",
        tags=["qa"],
        response_model=List[QARunOut],
    )
    async def get_qa_runs(
        crawl_id, org: Organization = Depends(org_viewer_dep), skipFailed: bool = False
    ):
        return await ops.get_qa_runs(crawl_id, skip_failed=skipFailed, org=org)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa/activeQA",
        tags=["qa"],
        response_model=Dict[str, Optional[QARunOut]],
    )
    async def get_active_qa(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return {"qa": await ops.get_active_qa(crawl_id, org)}

    # ----

    @app.get(
        "/orgs/all/crawls/{crawl_id}",
        tags=["crawls"],
        response_model=CrawlOut,
    )
    async def list_single_crawl_admin(crawl_id, user: User = Depends(user_dep)):
        if not user.is_superuser:
            raise HTTPException(status_code=403, detail="Not Allowed")

        crawls, _ = await ops.list_crawls(crawl_id=crawl_id)
        if len(crawls) < 1:
            raise HTTPException(status_code=404, detail="crawl_not_found")

        return crawls[0]

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}",
        tags=["crawls"],
        response_model=CrawlOut,
    )
    async def list_single_crawl(crawl_id, org: Organization = Depends(org_viewer_dep)):
        crawls, _ = await ops.list_crawls(org, crawl_id=crawl_id)
        if len(crawls) < 1:
            raise HTTPException(status_code=404, detail="crawl_not_found")

        return crawls[0]

    @app.patch("/orgs/{oid}/crawls/{crawl_id}", tags=["crawls"])
    async def update_crawl_api(
        update: UpdateCrawl, crawl_id: str, org: Organization = Depends(org_crawl_dep)
    ):
        return await ops.update_crawl(crawl_id, org, update, "crawl")

    @app.post(
        "/orgs/{oid}/crawls/{crawl_id}/scale",
        tags=["crawls"],
    )
    async def scale_crawl(
        scale: CrawlScale,
        crawl_id,
        user: User = Depends(user_dep),
        org: Organization = Depends(org_crawl_dep),
    ):
        await ops.update_crawl_scale(crawl_id, org, scale, user)

        result = await ops.crawl_manager.scale_crawl(crawl_id, scale.scale)
        if not result or not result.get("success"):
            raise HTTPException(
                status_code=400, detail=result.get("error") or "unknown"
            )

        return {"scaled": scale.scale}

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/access",
        tags=["crawls"],
    )
    async def access_check(crawl_id, org: Organization = Depends(org_crawl_dep)):
        if await ops.get_crawl_raw(crawl_id, org):
            return {}

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/queue",
        tags=["crawls"],
    )
    async def get_crawl_queue(
        crawl_id,
        offset: int,
        count: int,
        regex: Optional[str] = "",
        org: Organization = Depends(org_crawl_dep),
    ):
        await ops.get_crawl_raw(crawl_id, org)

        return await ops.get_crawl_queue(crawl_id, offset, count, regex)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/queueMatchAll",
        tags=["crawls"],
    )
    async def match_crawl_queue(
        crawl_id,
        regex: str,
        offset: int = 0,
        org: Organization = Depends(org_crawl_dep),
    ):
        await ops.get_crawl_raw(crawl_id, org)

        return await ops.match_crawl_queue(crawl_id, regex, offset)

    @app.post(
        "/orgs/{oid}/crawls/{crawl_id}/exclusions",
        tags=["crawls"],
    )
    async def add_exclusion(
        crawl_id,
        regex: str,
        org: Organization = Depends(org_crawl_dep),
        user: User = Depends(user_dep),
    ):
        return await ops.add_or_remove_exclusion(crawl_id, regex, org, user, add=True)

    @app.delete(
        "/orgs/{oid}/crawls/{crawl_id}/exclusions",
        tags=["crawls"],
    )
    async def remove_exclusion(
        crawl_id,
        regex: str,
        org: Organization = Depends(org_crawl_dep),
        user: User = Depends(user_dep),
    ):
        return await ops.add_or_remove_exclusion(crawl_id, regex, org, user, add=False)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/seeds",
        tags=["crawls"],
        response_model=PaginatedResponse,
    )
    async def get_crawl_config_seeds(
        crawl_id: str,
        org: Organization = Depends(org_viewer_dep),
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
    ):
        seeds, total = await ops.get_crawl_seeds(crawl_id, org, pageSize, page)
        return paginated_format(seeds, total, page, pageSize)

    @app.get("/orgs/{oid}/crawls/{crawl_id}/logs", tags=["crawls"])
    async def stream_crawl_logs(
        crawl_id,
        org: Organization = Depends(org_viewer_dep),
        logLevel: Optional[str] = None,
        context: Optional[str] = None,
    ):
        crawl = await ops.get_crawl_out(crawl_id, org)

        log_levels = []
        contexts = []
        if logLevel:
            log_levels = logLevel.split(",")
        if context:
            contexts = context.split(",")

        # If crawl is finished, stream logs from WACZ files using presigned urls
        if crawl.finished:
            resp = await ops.storage_ops.sync_stream_wacz_logs(
                crawl.resources or [], log_levels, contexts
            )
            return StreamingResponse(
                resp,
                media_type="text/jsonl",
                headers={
                    "Content-Disposition": f'attachment; filename="{crawl_id}.log"'
                },
            )

        raise HTTPException(status_code=400, detail="crawl_not_finished")

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/errors",
        tags=["crawls"],
    )
    async def get_crawl_errors(
        crawl_id: str,
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        org: Organization = Depends(org_viewer_dep),
    ):
        crawl = await ops.get_crawl(crawl_id, org)

        skip = (page - 1) * pageSize
        upper_bound = skip + pageSize

        errors = crawl.errors[skip:upper_bound] if crawl.errors else []
        parsed_errors = parse_jsonl_error_messages(errors)
        return paginated_format(parsed_errors, len(crawl.errors or []), page, pageSize)

    return ops
|