Supports running QA Runs via the QA API! Builds on top of the `issue-1498-crawl-qa-backend-support` branch, fixes #1498. Also requires the latest Browsertrix Crawler 1.1.0+ (from the webrecorder/browsertrix-crawler#469 branch).

Notable changes:

- `QARun` objects contain info about QA runs, which are crawls performed on data loaded from existing crawls.
- Various crawl db operations can be performed on either the crawl or the `qa.` object, and core crawl fields have been moved to `CoreCrawlable`.
- While running, `QARun` data is stored in a single `qa` object, while finished QA runs are added to the `qaFinished` dictionary on the `Crawl`. The QA list API returns data from the finished list, sorted by most recent first (see the sketch below).
- Includes additional type fixes / type safety, especially around `BaseCrawl` / `Crawl` / `UploadedCrawl` functionality, also creating specific `get_upload()`, `get_basecrawl()`, and `get_crawl()` getters for internal use and `get_crawl_out()` for the API.
- Supports filtering and sorting pages via `qaFilterBy` (`screenshotMatch`, `textMatch`) along with `gt`, `lt`, `gte`, `lte` params to return pages based on QA results (see the example request below).

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
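To make the `qa` / `qaFinished` split concrete, here is a minimal sketch of the shape described above. It is illustrative only: aside from the `qa` and `qaFinished` field names, the fields, types, and helper are assumptions, not the actual Browsertrix models.

```python
# Illustrative sketch only -- not the actual Browsertrix models.
# Only the `qa` / `qaFinished` field names come from this PR; the rest is assumed.
from datetime import datetime
from typing import Dict, List, Optional

from pydantic import BaseModel


class QARunSketch(BaseModel):
    """Hypothetical stand-in for a QARun record"""

    id: str
    state: str
    started: datetime
    finished: Optional[datetime] = None


class CrawlSketch(BaseModel):
    """Hypothetical stand-in for a Crawl carrying QA data"""

    id: str
    # the currently running QA run, if any
    qa: Optional[QARunSketch] = None
    # finished QA runs, keyed by QA run id
    qaFinished: Dict[str, QARunSketch] = {}


def list_qa_runs(crawl: CrawlSketch) -> List[QARunSketch]:
    """Return finished QA runs sorted most recent first, as the QA list API does."""
    return sorted(
        crawl.qaFinished.values(),
        key=lambda run: run.finished or run.started,
        reverse=True,
    )
```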
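And a hedged example of the page filtering described in the last bullet. The `qaFilterBy`, `gte`, and `lt` query params are from this change; the URL path, host, and auth header are placeholders, not a documented endpoint.

```python
# Illustrative request only: the endpoint path is an assumption, not a documented URL.
import requests

resp = requests.get(
    "https://app.example.com/api/orgs/<oid>/crawls/<crawl_id>/qa/<qa_run_id>/pages",
    headers={"Authorization": "Bearer <access_token>"},
    params={
        "qaFilterBy": "screenshotMatch",  # or "textMatch"
        "gte": 0.5,  # only pages with a screenshot match score >= 0.5
        "lt": 0.9,   # ...and < 0.9
    },
)
pages = resp.json()
```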
""" Operator handler for crawl CronJobs """
 | 
						|
 | 
						|
from uuid import UUID
 | 
						|
import yaml
 | 
						|
 | 
						|
from btrixcloud.utils import to_k8s_date
 | 
						|
from .models import MCBaseRequest, MCDecoratorSyncData, CJS, CMAP
 | 
						|
from .baseoperator import BaseOperator
 | 
						|
 | 
						|
 | 
						|
# pylint: disable=too-many-locals
 | 
						|
# ============================================================================
 | 
						|
class CronJobOperator(BaseOperator):
 | 
						|
    """CronJob Operator"""
 | 
						|
 | 
						|
    def init_routes(self, app):
 | 
						|
        """init routes for crawl CronJob decorator"""
 | 
						|
 | 
						|
        @app.post("/op/cronjob/sync")
 | 
						|
        async def mc_sync_cronjob_crawls(data: MCDecoratorSyncData):
 | 
						|
            return await self.sync_cronjob_crawl(data)
 | 
						|
 | 
						|
        @app.post("/op/cronjob/customize")
 | 
						|
        async def mc_cronjob_related(data: MCBaseRequest):
 | 
						|
            return self.get_cronjob_crawl_related(data)
 | 
						|
 | 
						|
    def get_cronjob_crawl_related(self, data: MCBaseRequest):
 | 
						|
        """return configmap related to crawl"""
 | 
						|
        labels = data.parent.get("metadata", {}).get("labels", {})
 | 
						|
        cid = labels.get("btrix.crawlconfig")
 | 
						|
        return {
 | 
						|
            "relatedResources": [
 | 
						|
                {
 | 
						|
                    "apiVersion": "v1",
 | 
						|
                    "resource": "configmaps",
 | 
						|
                    "labelSelector": {"matchLabels": {"btrix.crawlconfig": cid}},
 | 
						|
                }
 | 
						|
            ]
 | 
						|
        }
 | 
						|
 | 
						|
    async def sync_cronjob_crawl(self, data: MCDecoratorSyncData):
 | 
						|
        """create crawljobs from a job object spawned by cronjob"""
 | 
						|
 | 
						|
        metadata = data.object["metadata"]
 | 
						|
        labels = metadata.get("labels", {})
 | 
						|
        cid = labels.get("btrix.crawlconfig")
 | 
						|
 | 
						|
        name = metadata.get("name")
 | 
						|
        crawl_id = name
 | 
						|
 | 
						|
        actual_state, finished = await self.crawl_ops.get_crawl_state(
 | 
						|
            crawl_id, is_qa=False
 | 
						|
        )
 | 
						|
        if finished:
 | 
						|
            status = None
 | 
						|
            # mark job as completed
 | 
						|
            if not data.object["status"].get("succeeded"):
 | 
						|
                print("Cron Job Complete!", finished)
 | 
						|
                status = {
 | 
						|
                    "succeeded": 1,
 | 
						|
                    "startTime": metadata.get("creationTimestamp"),
 | 
						|
                    "completionTime": to_k8s_date(finished),
 | 
						|
                }
 | 
						|
 | 
						|
            return {
 | 
						|
                "attachments": [],
 | 
						|
                "annotations": {"finished": finished},
 | 
						|
                "status": status,
 | 
						|
            }
 | 
						|
 | 
						|
        configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]
 | 
						|
 | 
						|
        oid = configmap.get("ORG_ID")
 | 
						|
        userid = configmap.get("USER_ID")
 | 
						|
 | 
						|
        crawljobs = data.attachments[CJS]
 | 
						|
 | 
						|
        org = await self.org_ops.get_org_by_id(UUID(oid))
 | 
						|
 | 
						|
        warc_prefix = None
 | 
						|
 | 
						|
        if not actual_state:
 | 
						|
            # cronjob doesn't exist yet
 | 
						|
            crawlconfig = await self.crawl_config_ops.get_crawl_config(
 | 
						|
                UUID(cid), UUID(oid)
 | 
						|
            )
 | 
						|
            if not crawlconfig:
 | 
						|
                print(
 | 
						|
                    f"error: no crawlconfig {cid}. skipping scheduled job. old cronjob left over?"
 | 
						|
                )
 | 
						|
                return {"attachments": []}
 | 
						|
 | 
						|
            # db create
 | 
						|
            user = await self.user_ops.get_by_id(UUID(userid))
 | 
						|
            if not user:
 | 
						|
                print(f"error: missing user for id {userid}")
 | 
						|
                return {"attachments": []}
 | 
						|
 | 
						|
            warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)
 | 
						|
 | 
						|
            await self.crawl_config_ops.add_new_crawl(
 | 
						|
                crawl_id,
 | 
						|
                crawlconfig,
 | 
						|
                user,
 | 
						|
                manual=False,
 | 
						|
            )
 | 
						|
            print("Scheduled Crawl Created: " + crawl_id)
 | 
						|
 | 
						|
        crawl_id, crawljob = self.k8s.new_crawl_job_yaml(
 | 
						|
            cid,
 | 
						|
            userid=userid,
 | 
						|
            oid=oid,
 | 
						|
            storage=org.storage,
 | 
						|
            crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
 | 
						|
            scale=int(configmap.get("INITIAL_SCALE", 1)),
 | 
						|
            crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
 | 
						|
            max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
 | 
						|
            manual=False,
 | 
						|
            crawl_id=crawl_id,
 | 
						|
            warc_prefix=warc_prefix,
 | 
						|
        )
 | 
						|
 | 
						|
        attachments = list(yaml.safe_load_all(crawljob))
 | 
						|
 | 
						|
        if crawl_id in crawljobs:
 | 
						|
            attachments[0]["status"] = crawljobs[CJS][crawl_id]["status"]
 | 
						|
 | 
						|
        return {
 | 
						|
            "attachments": attachments,
 | 
						|
        }
 |
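For context on the file above: the two routes are Metacontroller DecoratorController hooks, so the sync handler receives a JSON body with `object`, `attachments`, and `related` keys and replies with the desired `attachments` (plus optional `annotations` and `status`). Below is a rough sketch of that exchange; the field names follow Metacontroller's hook contract, while all concrete names and values, and the exact `CJS`/`CMAP` key strings, are assumptions.

```python
# Hypothetical sketch of what Metacontroller POSTs to /op/cronjob/sync.
# Field names follow the DecoratorController hook contract; all concrete
# values (and the exact CJS/CMAP key strings) are assumptions.
sync_request = {
    "object": {  # the Job spawned by the CronJob
        "metadata": {
            "name": "sched-example-job",
            "labels": {"btrix.crawlconfig": "11111111-2222-3333-4444-555555555555"},
            "creationTimestamp": "2024-03-14T00:00:00Z",
        },
        "status": {},
    },
    "attachments": {
        # existing CrawlJob attachments, keyed by kind (CJS), then by name
        "CrawlJob.btrix.cloud/v1": {},
    },
    "related": {
        # ConfigMaps matched by the customize hook above (CMAP)
        "ConfigMap.v1": {
            "crawl-config-11111111-2222-3333-4444-555555555555": {
                "data": {"ORG_ID": "...", "USER_ID": "...", "INITIAL_SCALE": "1"}
            }
        },
    },
}

# sync_cronjob_crawl() replies with the CrawlJob to create as "attachments",
# or, once the crawl is finished, with empty attachments plus "annotations"
# and a completed Job "status".
```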