browsertrix/backend/btrixcloud/crawlmanager.py
Ilya Kreymer fb3d88291f
Background Jobs Work (#1321)
Fixes #1252 

Supports a generic background job system, with two background jobs,
CreateReplicaJob and DeleteReplicaJob.
- CreateReplicaJob runs on new crawls, uploads, and profiles, and updates the
`replicas` array with info about the replica once the job succeeds.
- DeleteReplicaJob deletes the replica.
- Both jobs are created from the new `replica_job.yaml` template. The
CreateReplicaJob sets secrets for primary storage + replica storage,
while DeleteReplicaJob only needs the replica storage.
- The job is processed in the operator when the job is finalized
(deleted), which should happen immediately when the job is done, either
because it succeeds or because the backoffLimit is reached (currently
set to 3).
- The /jobs/ API lists all jobs in a paginated response, with support for filtering and sorting (see the sketch after this list)
- /jobs/<job id> returns details for a particular job
- tests: nightly tests updated to check create + delete replica jobs for crawls as well as uploads, and the job API endpoints
- tests: also fixes timeouts in nightly tests to avoid crawls finishing too quickly.
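
As a rough client-side illustration of the new endpoints, here is a minimal sketch
(not part of this PR): the base URL, auth header, pagination/sorting parameter names,
and response field names below are assumptions for illustration only.

```python
# Hypothetical sketch of querying the new background job endpoints.
# URL, auth, query parameters, and response fields are assumed, not confirmed.
import requests

API = "https://btrix.example.com/api/orgs/<org-id>"   # hypothetical base URL
HEADERS = {"Authorization": "Bearer <access-token>"}  # hypothetical auth token

# List background jobs in a paginated response, newest first (assumed params)
resp = requests.get(
    f"{API}/jobs/",
    headers=HEADERS,
    params={"pageSize": 25, "page": 1, "sortBy": "started", "sortDirection": -1},
)
resp.raise_for_status()
for job in resp.json().get("items", []):
    print(job.get("id"), job.get("type"), job.get("success"))

# Fetch details for a single job by id
detail = requests.get(f"{API}/jobs/<job-id>", headers=HEADERS)
detail.raise_for_status()
print(detail.json())
```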

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2023-11-02 13:02:17 -07:00

""" shared crawl manager implementation """
import os
import asyncio
import secrets
import json
from typing import Optional, Dict
from datetime import timedelta
from kubernetes_asyncio.client import V1ConfigMap
from fastapi import HTTPException
from .k8sapi import K8sAPI
from .utils import dt_now, to_k8s_date
from .models import StorageRef, CrawlConfig, UpdateCrawlConfig, BgJobType
# ============================================================================
class CrawlManager(K8sAPI):
"""abstract crawl manager"""
def __init__(self):
super().__init__()
self.loop = asyncio.get_running_loop()

    # pylint: disable=too-many-arguments
    async def run_profile_browser(
self,
userid: str,
oid: str,
url: str,
storage: StorageRef,
baseprofile: str = "",
profile_filename: str = "",
) -> str:
"""run browser for profile creation"""
storage_secret = storage.get_storage_secret_name(oid)
await self.has_storage_secret(storage_secret)
browserid = f"prf-{secrets.token_hex(5)}"
params = {
"id": browserid,
"userid": str(userid),
"oid": str(oid),
"storage_name": str(storage),
"base_profile": baseprofile or "",
"profile_filename": profile_filename or "",
"idle_timeout": os.environ.get("IDLE_TIMEOUT", "60"),
"url": url,
"vnc_password": secrets.token_hex(16),
"expire_time": to_k8s_date(dt_now() + timedelta(seconds=30)),
}
data = self.templates.env.get_template("profile_job.yaml").render(params)
await self.create_from_yaml(data)
return browserid

    async def run_replica_job(
self,
oid: str,
job_type: str,
replica_storage: StorageRef,
replica_file_path: str,
replica_endpoint: str,
primary_storage: Optional[StorageRef] = None,
primary_file_path: Optional[str] = None,
primary_endpoint: Optional[str] = None,
job_id_prefix: Optional[str] = None,
):
"""run job to replicate file from primary storage to replica storage"""
if not job_id_prefix:
job_id_prefix = job_type
# ensure name is <=63 characters
job_id = f"{job_id_prefix[:52]}-{secrets.token_hex(5)}"
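        # primary storage credentials are only passed for replication (create)
        # jobs; deletion jobs only need the replica storage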
params = {
"id": job_id,
"oid": oid,
"job_type": job_type,
"replica_secret_name": replica_storage.get_storage_secret_name(oid),
"replica_file_path": replica_file_path,
"replica_endpoint": replica_endpoint,
"primary_secret_name": primary_storage.get_storage_secret_name(oid)
if primary_storage
else None,
"primary_file_path": primary_file_path if primary_file_path else None,
"primary_endpoint": primary_endpoint if primary_endpoint else None,
"BgJobType": BgJobType,
}
data = self.templates.env.get_template("replica_job.yaml").render(params)
await self.create_from_yaml(data)
return job_id

    async def add_crawl_config(
self,
crawlconfig: CrawlConfig,
storage: StorageRef,
run_now: bool,
out_filename: str,
profile_filename: str,
) -> Optional[str]:
"""add new crawl, store crawl config in configmap"""
# Create Config Map
await self._create_config_map(
crawlconfig,
USER_ID=str(crawlconfig.modifiedBy),
ORG_ID=str(crawlconfig.oid),
STORE_FILENAME=out_filename,
PROFILE_FILENAME=profile_filename,
INITIAL_SCALE=str(crawlconfig.scale),
CRAWL_TIMEOUT=str(crawlconfig.crawlTimeout or 0),
MAX_CRAWL_SIZE=str(crawlconfig.maxCrawlSize or 0),
)
crawl_id = None
if run_now:
crawl_id = await self.create_crawl_job(
crawlconfig, storage, str(crawlconfig.modifiedBy)
)
await self._update_scheduled_job(crawlconfig)
return crawl_id

    async def create_crawl_job(
self,
crawlconfig: CrawlConfig,
storage: StorageRef,
userid: str,
) -> str:
"""create new crawl job from config"""
cid = str(crawlconfig.id)
storage_secret = storage.get_storage_secret_name(str(crawlconfig.oid))
await self.has_storage_secret(storage_secret)
return await self.new_crawl_job(
cid,
userid,
crawlconfig.oid,
storage,
crawlconfig.scale,
crawlconfig.crawlTimeout,
crawlconfig.maxCrawlSize,
manual=True,
)

    async def update_crawl_config(
self, crawlconfig: CrawlConfig, update: UpdateCrawlConfig, profile_filename=None
) -> bool:
"""Update the schedule or scale for existing crawl config"""
has_sched_update = update.schedule is not None
has_scale_update = update.scale is not None
has_timeout_update = update.crawlTimeout is not None
has_max_crawl_size_update = update.maxCrawlSize is not None
has_config_update = update.config is not None
if has_sched_update:
# crawlconfig here has already been updated
await self._update_scheduled_job(crawlconfig)
if (
has_scale_update
or has_config_update
or has_timeout_update
or profile_filename is not None
or has_max_crawl_size_update
):
await self._update_config_map(
crawlconfig,
update,
profile_filename,
has_config_update,
)
return True

    async def remove_org_storage(self, storage: StorageRef, oid: str) -> bool:
"""Delete custom org storage secret"""
storage_secret = storage.get_storage_secret_name(oid)
storage_label = f"btrix.storage={storage_secret}"
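        # refuse to delete the storage secret while any crawl or profile browser
        # job still references this storage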
if await self.has_custom_jobs_with_label("crawljobs", storage_label):
raise HTTPException(status_code=400, detail="storage_in_use")
if await self.has_custom_jobs_with_label("profilejobs", storage_label):
raise HTTPException(status_code=400, detail="storage_in_use")
try:
await self.core_api.delete_namespaced_secret(
storage_secret,
namespace=self.namespace,
)
return True
# pylint: disable=bare-except
except:
return False

    async def add_org_storage(
self, storage: StorageRef, string_data: Dict[str, str], oid: str
) -> None:
"""Add custom org storage secret"""
labels = {"btrix.org": oid}
storage_secret = storage.get_storage_secret_name(oid)
crawl_secret = self.client.V1Secret(
metadata={
"name": storage_secret,
"namespace": self.namespace,
"labels": labels,
},
string_data=string_data,
)
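        # create the secret; if that fails (e.g. it already exists), patch the
        # existing secret with the new data instead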
try:
await self.core_api.create_namespaced_secret(
namespace=self.namespace, body=crawl_secret
)
# pylint: disable=bare-except
except:
await self.core_api.patch_namespaced_secret(
name=storage_secret, namespace=self.namespace, body=crawl_secret
)

    async def get_profile_browser_metadata(self, browserid: str) -> dict[str, str]:
"""get browser profile labels"""
try:
browser = await self.get_profile_browser(browserid)
# pylint: disable=bare-except
except:
return {}
return browser["metadata"]["labels"]

    async def get_configmap(self, cid: str) -> V1ConfigMap:
"""get configmap by id"""
return await self.core_api.read_namespaced_config_map(
name=f"crawl-config-{cid}", namespace=self.namespace
)

    async def ping_profile_browser(self, browserid: str) -> None:
        """ping profile browser to keep it alive, extending its expire time"""
expire_at = dt_now() + timedelta(seconds=30)
await self._patch_job(
browserid, {"expireTime": to_k8s_date(expire_at)}, "profilejobs"
)

    async def rollover_restart_crawl(self, crawl_id: str) -> dict:
"""Rolling restart of crawl by updating restartTime field"""
update = to_k8s_date(dt_now())
return await self._patch_job(crawl_id, {"restartTime": update})

    async def scale_crawl(self, crawl_id: str, scale: int = 1) -> dict:
"""Set the crawl scale (job parallelism) on the specified job"""
return await self._patch_job(crawl_id, {"scale": scale})

    async def shutdown_crawl(self, crawl_id: str, graceful=True) -> dict:
"""Request a crawl cancelation or stop by calling an API
on the job pod/container, returning the result"""
if graceful:
patch = {"stopping": True}
return await self._patch_job(crawl_id, patch)
return await self.delete_crawl_job(crawl_id)

    async def delete_crawl_configs_for_org(self, org: str) -> None:
"""Delete all crawl configs for given org"""
await self._delete_crawl_configs(f"btrix.org={org}")

    async def delete_crawl_config_by_id(self, cid: str) -> None:
        """Delete crawl config by id"""
await self._delete_crawl_configs(f"btrix.crawlconfig={cid}")

    # ========================================================================
    # Internal Methods

    async def _create_config_map(self, crawlconfig: CrawlConfig, **data) -> None:
"""Create Config Map based on CrawlConfig"""
data["crawl-config.json"] = json.dumps(crawlconfig.get_raw_config())
labels = {
"btrix.crawlconfig": str(crawlconfig.id),
"btrix.org": str(crawlconfig.oid),
}
config_map = self.client.V1ConfigMap(
metadata={
"name": f"crawl-config-{crawlconfig.id}",
"namespace": self.namespace,
"labels": labels,
},
data=data,
)
await self.core_api.create_namespaced_config_map(
namespace=self.namespace, body=config_map
)

    async def _delete_crawl_configs(self, label) -> None:
"""Delete Crawl Cron Job and all dependent resources, including configmap and secrets"""
await self.batch_api.delete_collection_namespaced_cron_job(
namespace=self.namespace,
label_selector=label,
)
await self.core_api.delete_collection_namespaced_config_map(
namespace=self.namespace,
label_selector=label,
)

    async def _update_scheduled_job(self, crawlconfig: CrawlConfig) -> Optional[str]:
"""create or remove cron job based on crawlconfig schedule"""
cid = str(crawlconfig.id)
cron_job_id = f"sched-{cid[:12]}"
cron_job = None
try:
cron_job = await self.batch_api.read_namespaced_cron_job(
name=cron_job_id,
namespace=self.namespace,
)
# pylint: disable=bare-except
except:
pass
# if no schedule, delete cron_job if exists and we're done
if not crawlconfig.schedule:
if cron_job:
await self.batch_api.delete_namespaced_cron_job(
name=cron_job.metadata.name, namespace=self.namespace
)
return None
# if cron job exists, just patch schedule
if cron_job:
if crawlconfig.schedule != cron_job.spec.schedule:
cron_job.spec.schedule = crawlconfig.schedule
await self.batch_api.patch_namespaced_cron_job(
name=cron_job.metadata.name,
namespace=self.namespace,
body=cron_job,
)
return None
params = {
"id": cron_job_id,
"cid": str(crawlconfig.id),
"schedule": crawlconfig.schedule,
}
data = self.templates.env.get_template("crawl_cron_job.yaml").render(params)
await self.create_from_yaml(data, self.namespace)
return cron_job_id

    async def _update_config_map(
self,
crawlconfig: CrawlConfig,
update: UpdateCrawlConfig,
profile_filename: Optional[str] = None,
update_config: bool = False,
) -> None:
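        """Update the crawl config map with the new values from the update"""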
config_map = await self.get_configmap(str(crawlconfig.id))
if update.scale is not None:
config_map.data["INITIAL_SCALE"] = str(update.scale)
if update.crawlTimeout is not None:
config_map.data["CRAWL_TIMEOUT"] = str(update.crawlTimeout)
if update.maxCrawlSize is not None:
config_map.data["MAX_CRAWL_SIZE"] = str(update.maxCrawlSize)
if update.crawlFilenameTemplate is not None:
config_map.data["STORE_FILENAME"] = update.crawlFilenameTemplate
if profile_filename is not None:
config_map.data["PROFILE_FILENAME"] = profile_filename
if update_config:
config_map.data["crawl-config.json"] = json.dumps(
crawlconfig.get_raw_config()
)
await self.core_api.patch_namespaced_config_map(
name=config_map.metadata.name, namespace=self.namespace, body=config_map
)