* Btrixjobs Operator - Phase 1 (#679) - add metacontroller and custom crds - add main_op entrypoint for operator * Btrix Operator Crawl Management (#767) * operator backend: - run operator api in separate container but in same pod, with WEB_CONCURRENCY=1 - operator creates statefulsets and services for CrawlJob and ProfileJob - operator: use service hook endpoint, set port in values.yaml * crawls working with CrawlJob - jobs start with 'crawljob-' prefix - update status to reflect current crawl state - set sync time to 10 seconds by default, overridable with 'operator_resync_seconds' - mark crawl as running, failed, complete when finished - store finished status when crawl is complete - support updating scale, forcing rollover, stop via patching CrawlJob - support cancel via deletion - requires hack to content-length for patching custom resources - auto-delete of CrawlJob via 'ttlSecondsAfterFinished' - also delete pvcs until autodelete supported via statefulset (k8s >1.27) - ensure filesAdded always set correctly, keep counter in redis, add to status display - optimization: attempt to reduce automerging, by reusing volumeClaimTemplates from existing children, as these may have additional props added - add add_crawl_errors_to_db() for storing crawl errors from redis '<crawl>:e' key to mongodb when crawl is finished/failed/canceled - add .status.size to display human-readable crawl size, if available (from webrecorder/browsertrix-crawler#291) - support new page size, >0.9.0 and old page size key (changed in webrecorder/browsertrix-crawler#284) * support for scheduled jobs! 
- add main_scheduled_job entrypoint to run scheduled jobs - add crawl_cron_job.yaml template for declaring CronJob - CronJobs moved to default namespace * operator manages ProfileJobs: - jobs start with 'profilejob-' - update expiry time by updating ProfileJob object 'expireTime' while profile is active * refactor/cleanup: - remove k8s package - merge k8sman and basecrawlmanager into crawlmanager - move templates, k8sapi, utils into root package - delete all *_job.py files - remove dt_now, ts_now from crawls, now in utils - all db operations happen in crawl/crawlconfig/org files - move shared crawl/crawlconfig/org functions that use the db to be importable directly, including get_crawl_config, add_new_crawl, inc_crawl_stats * role binding: more secure setup, don't allow crawler namespace any k8s permissions - move cronjobs to be created in default namespace - grant default namespace access to create cronjobs in default namespace - remove role binding from crawler namespace * additional tweaks to templates: - templates: split crawler and redis statefulset into separate yaml file (in case need to load one or other separately) * stats / redis optimization: - don't update stats in mongodb on every operator sync, only when crawl is finished - for api access, read stats directly from redis to get up-to-date stats - move get_page_stats() to utils, add get_redis_url() to k8sapi to unify access * Add migration for operator changes - Update configmap for crawl configs with scale > 1 or crawlTimeout > 0 and schedule exists to recreate CronJobs - add option to rerun last migration, enabled via env var and by running helm with --set=rerun_last_migration=1 * subcharts: move crawljob and profilejob crds to separate subchart, as this seems best way to guarantee proper install order with + update on upgrade with helm, add built btrix-crds-0.1.0.tgz subchart - metacontroller: use release from ghcr, add metacontroller-helm-v4.10.1.tgz subchart * backend api fixes - ensure changing 
scale of crawl also updates it in the db - crawlconfigs: add 'currCrawlSize' and 'lastCrawlSize' to crawlconfig api --------- Co-authored-by: D. Lee <leepro@gmail.com> Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
201 lines
6.6 KiB
Python
""" K8S API Access """
|
|
import os
|
|
import traceback
|
|
|
|
from datetime import timedelta
|
|
|
|
import yaml
|
|
|
|
from kubernetes_asyncio import client, config
|
|
from kubernetes_asyncio.stream import WsApiClient
|
|
from kubernetes_asyncio.client.api_client import ApiClient
|
|
from kubernetes_asyncio.client.api import custom_objects_api
|
|
from kubernetes_asyncio.utils import create_from_dict
|
|
|
|
|
|
from fastapi.templating import Jinja2Templates
|
|
from .utils import get_templates_dir, dt_now, to_k8s_date
|
|
|
|
|
|
# ============================================================================
|
|
# pylint: disable=too-many-instance-attributes
|
|
class K8sAPI:
    """Accessors for the Kubernetes APIs used by Browsertrix.

    Wraps the core/apps/batch typed APIs plus the custom-objects API used
    for the CrawlJob and ProfileJob custom resources, and renders job
    manifests from Jinja2 templates.
    """

    def __init__(self):
        super().__init__()
        # namespace where crawler resources (statefulsets, services, crds) run
        self.namespace = os.environ.get("CRAWLER_NAMESPACE") or "crawlers"
        # maps custom resource kind -> plural name, e.g. "CrawlJob" -> "crawljobs"
        self.custom_resources = {}

        self.templates = Jinja2Templates(directory=get_templates_dir())

        # requires running inside a k8s pod with a service account
        config.load_incluster_config()
        self.client = client

        self.api_client = ApiClient()

        self.core_api = client.CoreV1Api(self.api_client)
        # separate websocket-capable client for exec/attach-style calls
        self.core_api_ws = client.CoreV1Api(api_client=WsApiClient())
        self.batch_api = client.BatchV1Api(self.api_client)
        self.apps_api = client.AppsV1Api(self.api_client)

        # try separate api client to avoid content-type issues
        self.custom_api = custom_objects_api.CustomObjectsApi(self.api_client)

        # custom resource's client API
        self.add_custom_resource("CrawlJob", "crawljobs")
        self.add_custom_resource("ProfileJob", "profilejobs")

    def add_custom_resource(self, name, plural):
        """register a custom resource kind under its plural name"""
        self.custom_resources[name] = plural

    def get_custom_api(self, kind):
        """return plural name for custom resource kind, or None if not registered"""
        return self.custom_resources.get(kind)

    def get_redis_url(self, crawl_id):
        """return in-cluster redis url (db 0) for given crawl id"""
        redis_id = f"redis-{crawl_id}"
        # pod-0 of the redis statefulset, addressed via its headless service
        return f"redis://{redis_id}-0.{redis_id}.{self.namespace}.svc.cluster.local/0"

    # pylint: disable=too-many-arguments
    async def new_crawl_job(self, cid, userid, scale=1, crawl_timeout=0, manual=True):
        """create a new CrawlJob from the crawl_job.yaml template, return its crawl id

        :param cid: crawl config id (first 12 chars become part of the crawl id)
        :param userid: id of user who started the crawl
        :param scale: number of crawler instances
        :param crawl_timeout: seconds until the crawl expires; 0 = no expiry
        :param manual: True if started manually, False if from a schedule
        """
        if crawl_timeout:
            crawl_expire_time = to_k8s_date(dt_now() + timedelta(seconds=crawl_timeout))
        else:
            crawl_expire_time = ""

        ts_now = dt_now().strftime("%Y%m%d%H%M%S")
        prefix = "manual" if manual else "sched"
        crawl_id = f"{prefix}-{ts_now}-{cid[:12]}"

        params = {
            "id": crawl_id,
            "cid": cid,
            "userid": userid,
            "scale": scale,
            "expire_time": crawl_expire_time,
            "manual": "1" if manual else "0",
        }

        data = self.templates.env.get_template("crawl_job.yaml").render(params)

        # create job directly
        await self.create_from_yaml(data)

        return crawl_id

    async def create_from_yaml(self, doc, namespace=None):
        """init k8s objects from a (possibly multi-document) yaml string

        Documents whose kind is a registered custom resource go through the
        custom-objects API; everything else through create_from_dict.
        Returns the list of created objects.
        """
        k8s_objects = []
        for yml_document in yaml.safe_load_all(doc):
            custom = self.custom_resources.get(yml_document["kind"])
            if custom is not None:
                created = await self.create_custom_from_dict(
                    custom, yml_document, namespace
                )
            else:
                created = await create_from_dict(
                    self.api_client,
                    yml_document,
                    verbose=False,
                    namespace=namespace or self.namespace,
                )
            k8s_objects.append(created)

        return k8s_objects

    async def create_custom_from_dict(self, custom, doc, namespace):
        """create custom resource object from parsed yaml dict

        :param custom: plural name of the custom resource
        :param doc: parsed manifest; apiVersion must be "<group>/<version>"
        :param namespace: target namespace, or None for the default
        """
        group, version = doc["apiVersion"].split("/")
        created = await self.custom_api.create_namespaced_custom_object(
            group=group,
            version=version,
            plural=custom,
            body=doc,
            namespace=namespace or self.namespace,
        )
        return created

    async def _delete_custom_object(self, plural, name, label):
        """delete named btrix.cloud custom object, return True on success

        Uses Foreground propagation so dependents (statefulsets, pvcs) are
        removed before the custom object itself.
        """
        try:
            await self.custom_api.delete_namespaced_custom_object(
                group="btrix.cloud",
                version="v1",
                namespace=self.namespace,
                plural=plural,
                name=name,
                grace_period_seconds=0,
                propagation_policy="Foreground",
            )
            return True

        # pylint: disable=broad-except
        except Exception as exc:
            # best-effort delete: log and report failure instead of raising
            print(f"{label} delete failed", exc)
            return False

    async def delete_crawl_job(self, crawl_id):
        """delete custom CrawlJob object for given crawl id"""
        return await self._delete_custom_object(
            "crawljobs", f"crawljob-{crawl_id}", "CrawlJob"
        )

    async def delete_profile_browser(self, browserid):
        """delete custom ProfileJob object for given browser id"""
        return await self._delete_custom_object(
            "profilejobs", f"profilejob-{browserid}", "ProfileJob"
        )

    async def get_profile_browser(self, browserid):
        """get ProfileJob custom object for given browser id"""
        return await self.custom_api.get_namespaced_custom_object(
            group="btrix.cloud",
            version="v1",
            namespace=self.namespace,
            plural="profilejobs",
            name=f"profilejob-{browserid}",
        )

    async def _patch_job(self, crawl_id, body, pluraltype="crawljobs"):
        """merge-patch the .spec of a crawljob/profilejob custom object

        Temporarily sets the client's default Content-Type to
        application/merge-patch+json (required for patching custom
        resources), restoring or removing the header afterwards.
        Returns {"success": True} or {"error": ...}.
        """
        content_type = self.api_client.default_headers.get("Content-Type")

        try:
            self.api_client.set_default_header(
                "Content-Type", "application/merge-patch+json"
            )

            await self.custom_api.patch_namespaced_custom_object(
                group="btrix.cloud",
                version="v1",
                namespace=self.namespace,
                plural=pluraltype,
                # object names are "<singular>-<crawl_id>", e.g. "crawljob-xyz"
                name=f"{pluraltype[:-1]}-{crawl_id}",
                body={"spec": body},
            )
            return {"success": True}
        # pylint: disable=broad-except
        except Exception as exc:
            traceback.print_exc()
            return {"error": str(exc)}

        finally:
            # restore previous default header, or drop our override entirely
            if content_type:
                self.api_client.set_default_header("Content-Type", content_type)
            else:
                del self.api_client.default_headers["Content-Type"]