browsertrix/backend/btrixcloud/operator/cronjobs.py
Ilya Kreymer 2ac6584942
Refactor operator class into module (#1564)
The operator class has gotten fairly large, so this is a first pass at
refactoring operator.py into a submodule instead, with multiple operator
instances that handle different types of objects.

- The main k8s interface has been split into K8sOpApi, which extends K8sApi
  and is shared across all operators.
- Each operator extends BaseOperator, which also holds an instance of K8sOpApi.
- The CrawlOperator still contains the bulk of the functionality, but will
  likely be refactored further to support QA jobs.

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-02-29 14:40:12 -08:00
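
A minimal sketch of the layout described in the commit message above,
assuming this structure for the new operator submodule (the import path,
constructor signature, and method bodies here are stand-ins, not the actual
implementation):

    from btrixcloud.k8sapi import K8sApi  # assumed location of the shared k8s helper

    class K8sOpApi(K8sApi):
        """single shared k8s interface, used by all operators"""

    class BaseOperator:
        """base class for operators; holds the shared K8sOpApi instance"""

        def __init__(self, k8s: K8sOpApi):
            self.k8s = k8s

        def init_routes(self, app):
            """each operator registers its own webhook routes"""

    class CronJobOperator(BaseOperator):
        """the operator defined in this file, shown in full below"""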


""" Operator handler for crawl CronJobs """
from uuid import UUID
import yaml
from btrixcloud.utils import to_k8s_date
from .models import MCBaseRequest, MCDecoratorSyncData, CJS, CMAP
from .baseoperator import BaseOperator
# pylint: disable=too-many-locals
# ============================================================================
class CronJobOperator(BaseOperator):
"""CronJob Operator"""
def init_routes(self, app):
"""init routes for crawl CronJob decorator"""
@app.post("/op/cronjob/sync")
async def mc_sync_cronjob_crawls(data: MCDecoratorSyncData):
return await self.sync_cronjob_crawl(data)
@app.post("/op/cronjob/customize")
async def mc_cronjob_related(data: MCBaseRequest):
return self.get_cronjob_crawl_related(data)

    def get_cronjob_crawl_related(self, data: MCBaseRequest):
        """return configmap related to crawl"""
        labels = data.parent.get("metadata", {}).get("labels", {})
        cid = labels.get("btrix.crawlconfig")
        return {
            "relatedResources": [
                {
                    "apiVersion": "v1",
                    "resource": "configmaps",
                    "labelSelector": {"matchLabels": {"btrix.crawlconfig": cid}},
                }
            ]
        }
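
    # for illustration (hypothetical value): a Job labeled
    # btrix.crawlconfig=abc123 produces a selector that matches the
    # crawl-config-abc123 ConfigMap, which sync_cronjob_crawl then reads
    # via data.related[CMAP]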

    async def sync_cronjob_crawl(self, data: MCDecoratorSyncData):
        """create crawljobs from a job object spawned by cronjob"""
        metadata = data.object["metadata"]
        labels = metadata.get("labels", {})
        cid = labels.get("btrix.crawlconfig")

        name = metadata.get("name")
        crawl_id = name

        actual_state, finished = await self.crawl_ops.get_crawl_state(crawl_id)
        if finished:
            status = None
            # mark job as completed
            if not data.object["status"].get("succeeded"):
                print("Cron Job Complete!", finished)
                status = {
                    "succeeded": 1,
                    "startTime": metadata.get("creationTimestamp"),
                    "completionTime": to_k8s_date(finished),
                }

            return {
                "attachments": [],
                "annotations": {"finished": finished},
                "status": status,
            }
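
        # crawl not yet finished: load org and user info from the related
        # ConfigMap made available by the customize hook above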
        configmap = data.related[CMAP][f"crawl-config-{cid}"]["data"]

        oid = configmap.get("ORG_ID")
        userid = configmap.get("USER_ID")

        crawljobs = data.attachments[CJS]

        org = await self.org_ops.get_org_by_id(UUID(oid))

        warc_prefix = None

        if not actual_state:
            # cronjob doesn't exist yet
            crawlconfig = await self.crawl_config_ops.get_crawl_config(
                UUID(cid), UUID(oid)
            )
            if not crawlconfig:
                print(
                    f"error: no crawlconfig {cid}. skipping scheduled job. old cronjob left over?"
                )
                return {"attachments": []}

            # db create
            user = await self.user_ops.get_by_id(UUID(userid))
            if not user:
                print(f"error: missing user for id {userid}")
                return {"attachments": []}

            warc_prefix = self.crawl_config_ops.get_warc_prefix(org, crawlconfig)

            await self.crawl_config_ops.add_new_crawl(
                crawl_id,
                crawlconfig,
                user,
                manual=False,
            )
            print("Scheduled Crawl Created: " + crawl_id)
        crawl_id, crawljob = self.k8s.new_crawl_job_yaml(
            cid,
            userid=userid,
            oid=oid,
            storage=org.storage,
            crawler_channel=configmap.get("CRAWLER_CHANNEL", "default"),
            scale=int(configmap.get("INITIAL_SCALE", 1)),
            crawl_timeout=int(configmap.get("CRAWL_TIMEOUT", 0)),
            max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
            manual=False,
            crawl_id=crawl_id,
            warc_prefix=warc_prefix,
        )

        attachments = list(yaml.safe_load_all(crawljob))
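
        # if the CrawlJob already exists, carry over its current status so
        # this update doesn't clobber it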
        if crawl_id in crawljobs:
            attachments[0]["status"] = crawljobs[crawl_id]["status"]

        return {
            "attachments": attachments,
        }