Supports running QA Runs via the QA API! Builds on top of the `issue-1498-crawl-qa-backend-support` branch, fixes #1498 Also requires the latest Browsertrix Crawler 1.1.0+ (from webrecorder/browsertrix-crawler#469 branch) Notable changes: - QARun objects contain info about QA runs, which are crawls performed on data loaded from existing crawls. - Various crawl db operations can be performed on either the crawl or `qa.` object, and core crawl fields have been moved to CoreCrawlable. - While running,`QARun` data stored in a single `qa` object, while finished qa runs are added to `qaFinished` dictionary on the Crawl. The QA list API returns data from the finished list, sorted by most recent first. - Includes additional type fixes / type safety, especially around BaseCrawl / Crawl / UploadedCrawl functionality, also creating specific get_upload(), get_basecrawl(), get_crawl() getters for internal use and get_crawl_out() for API - Support filtering and sorting pages via `qaFilterBy` (screenshotMatch, textMatch) along with `gt`, `lt`, `gte`, `lte` params to return pages based on QA results. --------- Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
210 lines
5.8 KiB
Python
210 lines
5.8 KiB
Python
""" Operator Models """
|
|
|
|
from collections import defaultdict
|
|
from uuid import UUID
|
|
from typing import Optional, DefaultDict
|
|
from pydantic import BaseModel, Field
|
|
from kubernetes.utils import parse_quantity
|
|
from btrixcloud.models import StorageRef
|
|
|
|
|
|
BTRIX_API = "btrix.cloud/v1"
|
|
|
|
CMAP = "ConfigMap.v1"
|
|
PVC = "PersistentVolumeClaim.v1"
|
|
POD = "Pod.v1"
|
|
CJS = f"CrawlJob.{BTRIX_API}"
|
|
|
|
|
|
# ============================================================================
|
|
class MCBaseRequest(BaseModel):
|
|
"""base metacontroller model, used for customize hook"""
|
|
|
|
parent: dict
|
|
controller: dict
|
|
|
|
|
|
# ============================================================================
|
|
class MCSyncData(MCBaseRequest):
|
|
"""sync / finalize metacontroller model"""
|
|
|
|
children: dict
|
|
related: dict
|
|
finalizing: bool = False
|
|
|
|
|
|
# ============================================================================
|
|
class MCDecoratorSyncData(BaseModel):
|
|
"""sync for decoratorcontroller model"""
|
|
|
|
object: dict
|
|
controller: dict
|
|
|
|
attachments: dict
|
|
related: dict
|
|
finalizing: bool = False
|
|
|
|
|
|
# ============================================================================
|
|
class CrawlSpec(BaseModel):
|
|
"""spec from k8s CrawlJob object"""
|
|
|
|
id: str
|
|
cid: UUID
|
|
oid: UUID
|
|
scale: int = 1
|
|
storage: StorageRef
|
|
started: str
|
|
crawler_channel: str
|
|
stopping: bool = False
|
|
scheduled: bool = False
|
|
timeout: int = 0
|
|
max_crawl_size: int = 0
|
|
qa_source_crawl_id: Optional[str] = ""
|
|
|
|
@property
|
|
def db_crawl_id(self) -> str:
|
|
"""return actual crawl_id for db, if qa run"""
|
|
return self.qa_source_crawl_id or self.id
|
|
|
|
@property
|
|
def is_qa(self) -> bool:
|
|
"""return true if qa run"""
|
|
return bool(self.qa_source_crawl_id)
|
|
|
|
|
|
# ============================================================================
|
|
class PodResourcePercentage(BaseModel):
|
|
"""Resource usage percentage ratios"""
|
|
|
|
memory: float = 0
|
|
cpu: float = 0
|
|
storage: float = 0
|
|
|
|
|
|
# ============================================================================
|
|
class PodResources(BaseModel):
|
|
"""Pod Resources"""
|
|
|
|
memory: int = 0
|
|
cpu: float = 0
|
|
storage: int = 0
|
|
|
|
def __init__(self, *a, **kw):
|
|
if "memory" in kw:
|
|
kw["memory"] = int(parse_quantity(kw["memory"]))
|
|
if "cpu" in kw:
|
|
kw["cpu"] = float(parse_quantity(kw["cpu"]))
|
|
if "storage" in kw:
|
|
kw["storage"] = int(parse_quantity(kw["storage"]))
|
|
super().__init__(*a, **kw)
|
|
|
|
|
|
# ============================================================================
|
|
class PodInfo(BaseModel):
|
|
"""Aggregate pod status info held in CrawlJob"""
|
|
|
|
exitTime: Optional[str] = None
|
|
exitCode: Optional[int] = None
|
|
isNewExit: Optional[bool] = Field(default=None, exclude=True)
|
|
reason: Optional[str] = None
|
|
|
|
allocated: PodResources = PodResources()
|
|
used: PodResources = PodResources()
|
|
|
|
newCpu: Optional[int] = None
|
|
newMemory: Optional[int] = None
|
|
|
|
def dict(self, *a, **kw):
|
|
res = super().dict(*a, **kw)
|
|
percent = {
|
|
"memory": self.get_percent_memory(),
|
|
"cpu": self.get_percent_cpu(),
|
|
"storage": self.get_percent_storage(),
|
|
}
|
|
res["percent"] = percent
|
|
return res
|
|
|
|
def get_percent_memory(self) -> float:
|
|
"""compute percent memory used"""
|
|
return (
|
|
float(self.used.memory) / float(self.allocated.memory)
|
|
if self.allocated.memory
|
|
else 0
|
|
)
|
|
|
|
def get_percent_cpu(self) -> float:
|
|
"""compute percent cpu used"""
|
|
return (
|
|
float(self.used.cpu) / float(self.allocated.cpu)
|
|
if self.allocated.cpu
|
|
else 0
|
|
)
|
|
|
|
def get_percent_storage(self) -> float:
|
|
"""compute percent storage used"""
|
|
return (
|
|
float(self.used.storage) / float(self.allocated.storage)
|
|
if self.allocated.storage
|
|
else 0
|
|
)
|
|
|
|
def should_restart_pod(self):
|
|
"""return true if pod should be restarted"""
|
|
if self.newMemory and self.newMemory != self.allocated.memory:
|
|
return True
|
|
|
|
if self.newCpu and self.newCpu != self.allocated.cpu:
|
|
return True
|
|
|
|
return False
|
|
|
|
|
|
# ============================================================================
|
|
# pylint: disable=invalid-name
|
|
class CrawlStatus(BaseModel):
|
|
"""status from k8s CrawlJob object"""
|
|
|
|
state: str = "starting"
|
|
pagesFound: int = 0
|
|
pagesDone: int = 0
|
|
size: int = 0
|
|
# human readable size string
|
|
sizeHuman: str = ""
|
|
scale: int = 1
|
|
filesAdded: int = 0
|
|
filesAddedSize: int = 0
|
|
finished: Optional[str] = None
|
|
stopping: bool = False
|
|
stopReason: Optional[str] = None
|
|
initRedis: bool = False
|
|
crawlerImage: Optional[str] = None
|
|
lastActiveTime: str = ""
|
|
podStatus: Optional[DefaultDict[str, PodInfo]] = defaultdict(
|
|
lambda: PodInfo() # pylint: disable=unnecessary-lambda
|
|
)
|
|
# placeholder for pydantic 2.0 -- will require this version
|
|
# podStatus: Optional[
|
|
# DefaultDict[str, Annotated[PodInfo, Field(default_factory=PodInfo)]]
|
|
# ]
|
|
restartTime: Optional[str]
|
|
canceled: bool = False
|
|
|
|
# updated on pod exits and at regular interval
|
|
# Crawl Execution Time -- time all crawler pods have been running
|
|
# used to track resource usage and enforce execution minutes limit
|
|
crawlExecTime: int = 0
|
|
|
|
# Elapsed Exec Time -- time crawl has been running in at least one pod
|
|
# used for crawl timeouts
|
|
elapsedCrawlTime: int = 0
|
|
|
|
# last exec time update
|
|
lastUpdatedTime: str = ""
|
|
|
|
# any pods exited
|
|
anyCrawlPodNewExit: Optional[bool] = Field(default=False, exclude=True)
|
|
|
|
# don't include in status, use by metacontroller
|
|
resync_after: Optional[int] = Field(default=None, exclude=True)
|