""" K8s support"""
|
|
|
|
import os
|
|
import datetime
|
|
import json
|
|
|
|
from kubernetes_asyncio import client, config
|
|
from kubernetes_asyncio.stream import WsApiClient
|
|
|
|
from crawls import CrawlFinished
|
|
|
|
|
|
# ============================================================================
|
|
DEFAULT_NAMESPACE = os.environ.get("CRAWLER_NAMESPACE") or "crawlers"
|
|
|
|
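
# Sentinel cron schedule ("Feb 31st") that never fires, so configs without a
# schedule can still be stored as (suspended) CronJobs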
DEFAULT_NO_SCHEDULE = "* * 31 2 *"


# ============================================================================
class K8SManager:
    # pylint: disable=too-many-instance-attributes,too-many-locals,too-many-arguments
    """K8SManager, manages creation of k8s resources from crawl api requests"""

    def __init__(self, namespace=DEFAULT_NAMESPACE):
        config.load_incluster_config()

        self.core_api = client.CoreV1Api()
        self.core_api_ws = client.CoreV1Api(api_client=WsApiClient())
        self.batch_api = client.BatchV1Api()
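        # CronJob objects live under the batch/v1beta1 API group on the k8s
        # versions targeted here, hence the separate beta batch client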
        self.batch_beta_api = client.BatchV1beta1Api()

        self.namespace = namespace

        self.crawler_image = os.environ.get("CRAWLER_IMAGE")
        self.crawler_image_pull_policy = "IfNotPresent"

        # loop = asyncio.get_running_loop()
        # loop.create_task(self.watch_job_done())

    async def add_crawl_config(
        self,
        crawlconfig,
        storage,
        extra_crawl_params: list = None,
    ):
        """add new crawl as cron job, store crawl config in configmap"""
        cid = str(crawlconfig.id)
        userid = crawlconfig.user
        aid = crawlconfig.archive

        labels = {
            "btrix.user": userid,
            "btrix.archive": aid,
            "btrix.crawlconfig": cid,
        }

        # Create Config Map
        config_map = self._create_config_map(crawlconfig, labels)

        await self.core_api.create_namespaced_config_map(
            namespace=self.namespace, body=config_map
        )
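
        # The secret values below are exposed to the crawler container as env vars
        # via the envFrom secretRef in _get_job_template(); WEBHOOK_URL is where the
        # crawler reports a finished crawl (see validate_crawl_complete)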
        # Create Secret
        endpoint_with_coll_url = os.path.join(
            storage.endpoint_url, "collections", crawlconfig.config.collection + "/"
        )

        crawl_secret = client.V1Secret(
            metadata={
                "name": f"crawl-secret-{cid}",
                "namespace": self.namespace,
                "labels": labels,
            },
            string_data={
                "STORE_USER": userid,
                "STORE_ARCHIVE": aid,
                "STORE_ENDPOINT_URL": endpoint_with_coll_url,
                "STORE_ACCESS_KEY": storage.access_key,
                "STORE_SECRET_KEY": storage.secret_key,
                "WEBHOOK_URL": "http://browsertrix-cloud.default:8000/crawls/done",
            },
        )

        await self.core_api.create_namespaced_secret(
            namespace=self.namespace, body=crawl_secret
        )
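
        # Every config becomes a CronJob, even unscheduled ones (suspended with the
        # never-firing sentinel schedule); its job template is also reused by
        # _create_run_now_job() for on-demand runs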
        # Create Cron Job

        suspend, schedule, run_now = self._get_schedule_suspend_run_now(crawlconfig)

        extra_crawl_params = extra_crawl_params or []

        job_template = self._get_job_template(cid, labels, extra_crawl_params)

        spec = client.V1beta1CronJobSpec(
            schedule=schedule,
            suspend=suspend,
            concurrency_policy="Forbid",
            successful_jobs_history_limit=2,
            failed_jobs_history_limit=3,
            job_template=job_template,
        )

        cron_job = client.V1beta1CronJob(
            metadata={
                "name": f"crawl-scheduled-{cid}",
                "namespace": self.namespace,
                "labels": labels,
            },
            spec=spec,
        )

        cron_job = await self.batch_beta_api.create_namespaced_cron_job(
            namespace=self.namespace, body=cron_job
        )

        # Run Job Now
        if run_now:
            await self._create_run_now_job(cron_job)

        return cron_job

    async def update_crawl_config(self, crawlconfig):
        """ Update existing crawl config """

        cid = crawlconfig.id

        cron_jobs = await self.batch_beta_api.list_namespaced_cron_job(
            namespace=self.namespace, label_selector=f"btrix.crawlconfig={cid}"
        )

        if len(cron_jobs.items) != 1:
            return

        cron_job = cron_jobs.items[0]

        if crawlconfig.archive != cron_job.metadata.labels["btrix.archive"]:
            print("wrong archive")
            return

        labels = {
            "btrix.user": crawlconfig.user,
            "btrix.archive": crawlconfig.archive,
            "btrix.crawlconfig": cid,
        }

        # Update Config Map
        config_map = self._create_config_map(crawlconfig, labels)

        await self.core_api.patch_namespaced_config_map(
            name=f"crawl-config-{cid}", namespace=self.namespace, body=config_map
        )

        # Update CronJob, if needed
        suspend, schedule, run_now = self._get_schedule_suspend_run_now(crawlconfig)

        changed = False

        if schedule != cron_job.spec.schedule:
            cron_job.spec.schedule = schedule
            changed = True

        if suspend != cron_job.spec.suspend:
            cron_job.spec.suspend = suspend
            changed = True

        if changed:
            await self.batch_beta_api.patch_namespaced_cron_job(
                name=cron_job.metadata.name, namespace=self.namespace, body=cron_job
            )

        # Run Job Now
        if run_now:
            await self._create_run_now_job(cron_job)

    async def run_crawl_config(self, cid):
        """ Run crawl job for cron job based on specified crawlconfig id (cid) """
        print(f"btrix.crawlconfig={cid}")
        cron_jobs = await self.batch_beta_api.list_namespaced_cron_job(
            namespace=self.namespace, label_selector=f"btrix.crawlconfig={cid}"
        )

        if len(cron_jobs.items) != 1:
            raise Exception("Crawl Config Not Found")

        res = await self._create_run_now_job(cron_jobs.items[0])
        return res.metadata.name
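
    # Presumably invoked via the /crawls/done webhook (the WEBHOOK_URL set in the
    # crawl secret above), with the payload posted by the crawler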
    async def validate_crawl_complete(self, crawlcomplete):
        """Ensure the crawlcomplete data is valid (job exists and user matches)
        Fill in additional details about the crawl"""
        job = await self.batch_api.read_namespaced_job(
            name=crawlcomplete.id, namespace=self.namespace
        )

        if not job or job.metadata.labels["btrix.user"] != crawlcomplete.user:
            return None

        return CrawlFinished(
            id=crawlcomplete.id,
            state="complete" if crawlcomplete.completed else "partial_complete",

            user=crawlcomplete.user,
            aid=job.metadata.labels["btrix.archive"],
            cid=job.metadata.labels["btrix.crawlconfig"],
            started=job.status.start_time.replace(tzinfo=None),
            finished=datetime.datetime.utcnow().replace(microsecond=0, tzinfo=None),

            filename=crawlcomplete.filename,
            size=crawlcomplete.size,
            hash=crawlcomplete.hash,
        )

    async def stop_crawl(self, job_id, aid):
        """ Stop Crawl based on crawl job id """
        job = await self.batch_api.read_namespaced_job(
            name=job_id, namespace=self.namespace
        )

        if not job or job.metadata.labels["btrix.archive"] != aid:
            return None
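
        # Foreground propagation so the job's pods are deleted along with the job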
        await self.batch_api.delete_namespaced_job(
            name=job_id, namespace=self.namespace,
            grace_period_seconds=10,
            propagation_policy="Foreground",
        )

        return CrawlFinished(
            id=job_id,
            state="canceled",

            user=job.metadata.labels["btrix.user"],
            aid=job.metadata.labels["btrix.archive"],
            cid=job.metadata.labels["btrix.crawlconfig"],
            started=job.status.start_time.replace(tzinfo=None),
            finished=datetime.datetime.utcnow().replace(microsecond=0, tzinfo=None),
        )

    async def stop_crawl_graceful(self, job_name, aid):
        """ Attempt to gracefully stop crawl by sending a SIGINT to the pod(s) """

        pods = await self.core_api.list_namespaced_pod(
            namespace=self.namespace,
            label_selector=f"job-name={job_name},btrix.archive={aid}",
        )
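
        # Exec `kill -s SIGINT 1` in each matching pod: PID 1 is assumed to be the
        # crawler process, letting it shut down cleanly instead of being force-killed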
        command = ["kill", "-s", "SIGINT", "1"]
        interrupted = False

        for pod in pods.items:
            if pod.metadata.labels["btrix.archive"] != aid:
                print("wrong archive")
                continue

            await self.core_api_ws.connect_get_namespaced_pod_exec(
                pod.metadata.name, namespace=self.namespace, command=command,
                stdout=True
            )
            interrupted = True

        return interrupted

    async def delete_crawl_configs_for_archive(self, archive):
        """Delete all crawl configs for given archive"""
        return await self._delete_crawl_configs(f"btrix.archive={archive}")

    async def delete_crawl_config_by_id(self, cid):
        """Delete crawl config by id"""
        return await self._delete_crawl_configs(f"btrix.crawlconfig={cid}")

    # ========================================================================
    # Internal Methods

    def _create_config_map(self, crawlconfig, labels):
        """ Create Config Map based on CrawlConfig + labels """
        config_map = client.V1ConfigMap(
            metadata={
                "name": f"crawl-config-{crawlconfig.id}",
                "namespace": self.namespace,
                "labels": labels,
            },
            data={"crawl-config.json": json.dumps(crawlconfig.config.dict())},
        )

        return config_map

    # pylint: disable=no-self-use
    def _get_schedule_suspend_run_now(self, crawlconfig):
        """ get schedule/suspend/run_now data based on crawlconfig """

        suspend = False
        schedule = crawlconfig.schedule

        if not schedule:
            schedule = DEFAULT_NO_SCHEDULE
            suspend = True

        run_now = bool(crawlconfig.runNow)

        return suspend, schedule, run_now

    async def _delete_crawl_configs(self, label):
        """Delete Crawl Cron Job and all dependent resources, including configmap and secrets"""

        await self.batch_beta_api.delete_collection_namespaced_cron_job(
            namespace=self.namespace,
            label_selector=label,
            propagation_policy="Foreground",
        )

        await self.core_api.delete_collection_namespaced_secret(
            namespace=self.namespace,
            label_selector=label,
            propagation_policy="Foreground",
        )

        await self.core_api.delete_collection_namespaced_config_map(
            namespace=self.namespace,
            label_selector=label,
            propagation_policy="Foreground",
        )

    async def _create_run_now_job(self, cron_job):
        """Create new job from cron job to run instantly"""
        annotations = {"cronjob.kubernetes.io/instantiate": "manual"}

        owner_ref = client.V1OwnerReference(
            kind="CronJob",
            name=cron_job.metadata.name,
            block_owner_deletion=True,
            controller=True,
            uid=cron_job.metadata.uid,
            api_version="batch/v1beta1",
        )

        ts_now = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
        name = f"crawl-now-{ts_now}-{cron_job.metadata.labels['btrix.crawlconfig']}"

        object_meta = client.V1ObjectMeta(
            name=name,
            annotations=annotations,
            labels=cron_job.metadata.labels,
            owner_references=[owner_ref],
        )

        job = client.V1Job(
            kind="Job",
            api_version="batch/v1",
            metadata=object_meta,
            spec=cron_job.spec.job_template.spec,
        )

        return await self.batch_api.create_namespaced_job(
            body=job, namespace=self.namespace
        )

    def _get_job_template(self, uid, labels, extra_crawl_params):
        """Return job template for the crawl job, including labels and optional extra crawl params"""

        command = ["crawl", "--config", "/tmp/crawl-config.json"]

        if extra_crawl_params:
            command += extra_crawl_params
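
        # Hardcoded default resource requests/limits for each crawler pod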
        requests_memory = "256M"
        limit_memory = "1G"

        requests_cpu = "120m"
        limit_cpu = "1000m"

        resources = {
            "limits": {
                "cpu": limit_cpu,
                "memory": limit_memory,
            },
            "requests": {
                "cpu": requests_cpu,
                "memory": requests_memory,
            },
        }

        return {
            "spec": {
                "template": {
                    "metadata": {"labels": labels},
                    "spec": {
                        "containers": [
                            {
                                "name": "crawler",
                                "image": self.crawler_image,
                                "imagePullPolicy": self.crawler_image_pull_policy,
                                "command": command,
                                "volumeMounts": [
                                    {
                                        "name": "crawl-config",
                                        "mountPath": "/tmp/crawl-config.json",
                                        "subPath": "crawl-config.json",
                                        "readOnly": True,
                                    }
                                ],
                                "envFrom": [
                                    {"secretRef": {"name": f"crawl-secret-{uid}"}}
                                ],
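                                # CRAWL_ID is set to the Job name via the downward
                                # API; it presumably comes back as crawlcomplete.id
                                # for lookup in validate_crawl_complete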
                                "env": [
                                    {
                                        "name": "CRAWL_ID",
                                        "valueFrom": {
                                            "fieldRef": {
                                                "fieldPath": "metadata.labels['job-name']"
                                            }
                                        },
                                    }
                                ],
                                "resources": resources,
                            }
                        ],
                        "volumes": [
                            {
                                "name": "crawl-config",
                                "configMap": {
                                    "name": f"crawl-config-{uid}",
                                    "items": [
                                        {
                                            "key": "crawl-config.json",
                                            "path": "crawl-config.json",
                                        }
                                    ],
                                },
                            }
                        ],
                        "restartPolicy": "OnFailure",
                    },
                }
            }
        }