browsertrix/backend/btrixcloud/k8s/k8sman.py
Ilya Kreymer bf79959a5a refactoring to use statefulsets + job (#245)
- use statefulsets instead of deployments for mongo, redis, signer
- use k8s job + statefulset for running crawls
- use separate statefulset for crawl (scaled) and single-replica redis stateful set
- move crawl job update logic to crawl_updater
- remove shared redis chart

package refactor:
- move shared code to 'btrixcloud'
- move k8s to 'btrixcloud.k8s'
- move docker to 'btrixcloud.docker'
2022-06-05 10:37:17 -07:00

""" K8s support"""
import os
import datetime
import json
import base64
import yaml
import aiohttp
from kubernetes_asyncio import client, config
from kubernetes_asyncio.stream import WsApiClient
from kubernetes_asyncio.client.api_client import ApiClient
from fastapi.templating import Jinja2Templates
from ..archives import S3Storage
from .utils import create_from_yaml, send_signal_to_pods, get_templates_dir
# ============================================================================
CRAWLER_NAMESPACE = os.environ.get("CRAWLER_NAMESPACE") or "crawlers"


# pylint: disable=too-many-public-methods
# ============================================================================
class K8SManager:
    # pylint: disable=too-many-instance-attributes,too-many-locals,too-many-arguments
    """K8SManager, manages creation of k8s resources from crawl api requests"""

    def __init__(self, namespace=CRAWLER_NAMESPACE):
        config.load_incluster_config()

        self.api_client = ApiClient()

        self.core_api = client.CoreV1Api(self.api_client)
        self.core_api_ws = client.CoreV1Api(api_client=WsApiClient())
        self.batch_api = client.BatchV1Api(self.api_client)

        self.namespace = namespace
        self._default_storages = {}

        self.no_delete_jobs = os.environ.get("NO_DELETE_JOBS", "0") != "0"

        self.templates = Jinja2Templates(directory=get_templates_dir())

        self.job_image = os.environ.get("JOB_IMAGE")
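
    # Behavior is driven by a few environment variables: CRAWLER_NAMESPACE
    # (target namespace for all created resources), NO_DELETE_JOBS (keep
    # completed jobs instead of deleting them), and JOB_IMAGE (the image
    # passed to the job templates).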

    # pylint: disable=unused-argument
    async def check_storage(self, storage_name, is_default=False):
        """Check if storage is valid by trying to get the storage secret
        Will throw if not valid, otherwise return True"""
        await self._get_storage_secret(storage_name)
        return True

    async def update_archive_storage(self, aid, userid, storage):
        """Update storage by either creating a per-archive secret, if using custom storage
        or deleting per-archive secret, if using default storage"""
        archive_storage_name = f"storage-{aid}"
        if storage.type == "default":
            try:
                await self.core_api.delete_namespaced_secret(
                    archive_storage_name,
                    namespace=self.namespace,
                    propagation_policy="Foreground",
                )
            # pylint: disable=bare-except
            except:
                pass

            return

        labels = {"btrix.archive": aid, "btrix.user": userid}

        crawl_secret = client.V1Secret(
            metadata={
                "name": archive_storage_name,
                "namespace": self.namespace,
                "labels": labels,
            },
            string_data={
                "STORE_ENDPOINT_URL": storage.endpoint_url,
                "STORE_ACCESS_KEY": storage.access_key,
                "STORE_SECRET_KEY": storage.secret_key,
            },
        )

        try:
            await self.core_api.create_namespaced_secret(
                namespace=self.namespace, body=crawl_secret
            )
        # pylint: disable=bare-except
        except:
            await self.core_api.patch_namespaced_secret(
                name=archive_storage_name, namespace=self.namespace, body=crawl_secret
            )

    async def add_crawl_config(
        self,
        crawlconfig,
        storage,
        run_now,
        out_filename,
        profile_filename,
    ):
        """add new crawl as cron job, store crawl config in configmap"""

        if storage.type == "default":
            storage_name = storage.name
            storage_path = storage.path
        else:
            storage_name = str(crawlconfig.aid)
            storage_path = ""

        await self.check_storage(storage_name)

        # Create Config Map
        await self._create_config_map(
            crawlconfig,
            STORE_PATH=storage_path,
            STORE_FILENAME=out_filename,
            STORE_NAME=storage_name,
            USER_ID=str(crawlconfig.userid),
            ARCHIVE_ID=str(crawlconfig.aid),
            CRAWL_CONFIG_ID=str(crawlconfig.id),
            INITIAL_SCALE=str(crawlconfig.scale),
            PROFILE_FILENAME=profile_filename,
        )

        crawl_id = None

        if run_now:
            crawl_id = await self._create_manual_job(crawlconfig)

        await self._update_scheduled_job(crawlconfig)

        return crawl_id
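
    # Note: run_now launches a one-off Job immediately via _create_manual_job,
    # while the crawlconfig schedule (if any) is maintained as a CronJob by
    # _update_scheduled_job; both render the same job.yaml template.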

    async def update_crawl_schedule_or_scale(
        self, crawlconfig, scale=None, schedule=None
    ):
        """ Update the schedule or scale for existing crawl config """

        if schedule is not None:
            await self._update_scheduled_job(crawlconfig)

        if scale is not None:
            config_map = await self.core_api.read_namespaced_config_map(
                name=f"crawl-config-{crawlconfig.id}", namespace=self.namespace
            )

            config_map.data["INITIAL_SCALE"] = str(scale)

            await self.core_api.patch_namespaced_config_map(
                name=config_map.metadata.name, namespace=self.namespace, body=config_map
            )

        return True

    async def run_crawl_config(self, crawlconfig, userid=None):
        """Run crawl job directly for the specified crawlconfig,
        optionally as a different user"""
        return await self._create_manual_job(crawlconfig)

    async def get_default_storage_access_endpoint(self, name):
        """ Get access_endpoint for default storage """
        return (await self.get_default_storage(name)).access_endpoint_url

    async def get_default_storage(self, name):
        """ get default storage """
        if name not in self._default_storages:
            storage_secret = await self._get_storage_secret(name)

            access_endpoint_url = self._secret_data(
                storage_secret, "STORE_ACCESS_ENDPOINT_URL"
            )
            endpoint_url = self._secret_data(storage_secret, "STORE_ENDPOINT_URL")
            access_key = self._secret_data(storage_secret, "STORE_ACCESS_KEY")
            secret_key = self._secret_data(storage_secret, "STORE_SECRET_KEY")
            region = self._secret_data(storage_secret, "STORE_REGION") or ""

            self._default_storages[name] = S3Storage(
                access_key=access_key,
                secret_key=secret_key,
                endpoint_url=endpoint_url,
                access_endpoint_url=access_endpoint_url,
                region=region,
            )

        return self._default_storages[name]
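
    # Default storage credentials are read once from the corresponding
    # "storage-{name}" secret and then cached as S3Storage objects in
    # self._default_storages.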

    async def stop_crawl(self, crawl_id, aid, graceful=True):
        """Attempt to stop crawl, either gracefully by posting /stop, which
        lets the crawler finish current pages, or abruptly by posting /cancel,
        which terminates the crawl immediately"""

        return await self._post_to_job_pods(
            crawl_id, aid, "/cancel" if not graceful else "/stop"
        )

    async def scale_crawl(self, crawl_id, aid, scale=1):
        """ Set the crawl scale (job parallelism) on the specified job """

        return await self._post_to_job_pods(crawl_id, aid, f"/scale/{scale}")

    async def delete_crawl_configs_for_archive(self, archive):
        """Delete all crawl configs for given archive"""
        return await self._delete_crawl_configs(f"btrix.archive={archive}")

    async def delete_crawl_config_by_id(self, cid):
        """Delete crawl config by id"""
        return await self._delete_crawl_configs(f"btrix.crawlconfig={cid}")

    async def run_profile_browser(
        self,
        userid,
        aid,
        url,
        storage=None,
        storage_name=None,
        baseprofile=None,
        profile_path=None,
    ):
        """run browser for profile creation"""

        # if default storage, use name and path + profiles/
        if storage:
            storage_name = storage.name
            storage_path = storage.path + "profiles/"
        # otherwise, use storage name and existing path from secret
        else:
            storage_path = ""

        await self.check_storage(storage_name)

        params = {
            "userid": str(userid),
            "aid": str(aid),
            "job_image": self.job_image,
            "storage_name": storage_name,
            "storage_path": storage_path or "",
            "baseprofile": baseprofile or "",
            "profile_path": profile_path,
            "url": url,
        }

        data = self.templates.env.get_template("profile_job.yaml").render(params)

        created = await create_from_yaml(
            self.api_client, data, namespace=self.namespace
        )

        name = created[0][0].metadata.name

        # pylint: disable=no-else-return
        if name.startswith("job-"):
            return name[4:]
        else:
            return name
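
    # The id returned above is the profile Job's name with its "job-" prefix
    # stripped; the methods below re-add the prefix when looking up the Job or
    # its pods.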

    async def ping_profile_browser(self, browserid):
        """ ping profile browser by sending SIGUSR1 to its pods """
        pods = await self.core_api.list_namespaced_pod(
            namespace=self.namespace,
            label_selector=f"job-name=job-{browserid},btrix.profile=1",
        )
        if len(pods.items) == 0:
            return False

        await send_signal_to_pods(
            self.core_api_ws, self.namespace, pods.items, "SIGUSR1"
        )
        return True

    async def get_profile_browser_metadata(self, browserid):
        """ get browser profile labels """
        try:
            job = await self.batch_api.read_namespaced_job(
                name=f"job-{browserid}", namespace=self.namespace
            )
            if not job.metadata.labels.get("btrix.profile"):
                return {}

        # pylint: disable=bare-except
        except:
            return {}

        return job.metadata.labels

    async def delete_profile_browser(self, browserid):
        """ delete browser job, if it is a profile browser job """
        return await self._handle_completed_job(f"job-{browserid}")

    # ========================================================================
    # Internal Methods

    # pylint: disable=no-self-use
    def _secret_data(self, secret, name):
        """ decode secret data """
        return base64.standard_b64decode(secret.data[name]).decode()

    async def _load_job_template(self, crawlconfig, name, manual):
        """ render job.yaml template with crawlconfig-specific params """
        params = {
            "cid": str(crawlconfig.id),
            "userid": str(crawlconfig.userid),
            "aid": str(crawlconfig.aid),
            "job_image": self.job_image,
            "job_name": name,
            "manual": "1" if manual else "0",
        }

        return self.templates.env.get_template("job.yaml").render(params)

    async def _handle_completed_job(self, job_name):
        """ Handle completed job: delete """
        # until ttl controller is ready
        if self.no_delete_jobs:
            return

        try:
            await self._delete_job(job_name)
        # pylint: disable=bare-except
        except:
            pass

    async def _delete_job(self, name):
        await self.batch_api.delete_namespaced_job(
            name=name,
            namespace=self.namespace,
            grace_period_seconds=60,
            propagation_policy="Foreground",
        )

    async def _create_config_map(self, crawlconfig, **kwargs):
        """ Create Config Map based on CrawlConfig """
        data = kwargs
        data["crawl-config.json"] = json.dumps(crawlconfig.get_raw_config())

        config_map = client.V1ConfigMap(
            metadata={
                "name": f"crawl-config-{crawlconfig.id}",
                "namespace": self.namespace,
                # "labels": labels,
            },
            data=data,
        )

        return await self.core_api.create_namespaced_config_map(
            namespace=self.namespace, body=config_map
        )

    # pylint: disable=unused-argument
    async def _get_storage_secret(self, storage_name):
        """ Check if storage_name is valid by checking existing secret """
        try:
            return await self.core_api.read_namespaced_secret(
                f"storage-{storage_name}",
                namespace=self.namespace,
            )
        except Exception:
            # pylint: disable=broad-except,raise-missing-from
            raise Exception(f"Storage {storage_name} not found")

    async def _delete_crawl_configs(self, label):
        """Delete Crawl Cron Job and all dependent resources, including configmap and secrets"""

        await self.batch_api.delete_collection_namespaced_cron_job(
            namespace=self.namespace,
            label_selector=label,
            propagation_policy="Foreground",
        )

        await self.core_api.delete_collection_namespaced_config_map(
            namespace=self.namespace,
            label_selector=label,
            propagation_policy="Foreground",
        )
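
    # Crawl control commands (/stop, /cancel, /scale/{N}) are delivered by
    # POSTing directly to the crawl job pods; each crawler pod is expected to
    # expose a small control API on port 8000.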

    async def _post_to_job_pods(self, crawl_id, aid, path, data=None):
        job_name = f"job-{crawl_id}"

        pods = await self.core_api.list_namespaced_pod(
            namespace=self.namespace,
            label_selector=f"job-name={job_name},btrix.archive={aid}",
        )

        for pod in pods.items:
            async with aiohttp.ClientSession() as session:
                async with session.request(
                    "POST", f"http://{pod.status.pod_ip}:8000{path}", json=data
                ) as resp:
                    await resp.json()

    async def _create_manual_job(self, crawlconfig):
        cid = str(crawlconfig.id)
        ts_now = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
        crawl_id = f"manual-{ts_now}-{cid[:12]}"

        data = await self._load_job_template(
            crawlconfig, "job-" + crawl_id, manual=True
        )

        # create job directly
        await create_from_yaml(self.api_client, data, namespace=self.namespace)

        return crawl_id

    async def _update_scheduled_job(self, crawlconfig):
        """ create or remove cron job based on crawlconfig schedule """
        cid = str(crawlconfig.id)

        cron_job_name = f"job-sched-{cid[:12]}"
        cron_job = None
        try:
            cron_job = await self.batch_api.read_namespaced_cron_job(
                name=cron_job_name,
                namespace=self.namespace,
            )
        # pylint: disable=bare-except
        except:
            pass

        if cron_job:
            if crawlconfig.schedule and crawlconfig.schedule != cron_job.spec.schedule:
                cron_job.spec.schedule = crawlconfig.schedule

                await self.batch_api.patch_namespaced_cron_job(
                    name=cron_job.metadata.name, namespace=self.namespace, body=cron_job
                )

            if not crawlconfig.schedule:
                await self.batch_api.delete_namespaced_cron_job(
                    name=cron_job.metadata.name, namespace=self.namespace
                )

            return

        if not crawlconfig.schedule:
            return

        # create new cronjob
        data = await self._load_job_template(crawlconfig, cron_job_name, manual=False)

        job_yaml = yaml.safe_load(data)

        job_template = self.api_client.deserialize(
            FakeKubeResponse(job_yaml), "V1JobTemplateSpec"
        )

        metadata = job_yaml["metadata"]

        spec = client.V1CronJobSpec(
            schedule=crawlconfig.schedule,
            suspend=False,
            concurrency_policy="Forbid",
            successful_jobs_history_limit=2,
            failed_jobs_history_limit=3,
            job_template=job_template,
        )

        cron_job = client.V1CronJob(metadata=metadata, spec=spec)

        await self.batch_api.create_namespaced_cron_job(
            namespace=self.namespace, body=cron_job
        )
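
# ApiClient.deserialize() expects a response-like object whose .data attribute
# holds JSON; FakeKubeResponse below wraps an already-parsed dict so the
# rendered job.yaml can be converted into a typed V1JobTemplateSpec for the
# CronJob spec above.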


# ============================================================================
# pylint: disable=too-few-public-methods
class FakeKubeResponse:
    """ wrap k8s response for decoding """

    def __init__(self, obj):
        self.data = json.dumps(obj)
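

# Illustrative usage sketch (an assumption, not part of this module): the API
# layer is expected to construct a single K8SManager and call it from async
# handlers, roughly like the following; crawlconfig and storage stand in for
# the objects passed by the crawl config API, and the filename is a placeholder.
#
#   k8sman = K8SManager()
#
#   async def start_crawl(crawlconfig, storage):
#       crawl_id = await k8sman.add_crawl_config(
#           crawlconfig, storage, run_now=True,
#           out_filename="crawl.wacz", profile_filename="",
#       )
#       if crawl_id:
#           await k8sman.scale_crawl(crawl_id, str(crawlconfig.aid), scale=2)
#       return crawl_id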