""" shared crawl manager implementation """
|
|
|
|
import os
|
|
import asyncio
|
|
import datetime
|
|
|
|
from abc import ABC, abstractmethod
|
|
|
|
from fastapi.templating import Jinja2Templates
|
|
|
|
from .utils import random_suffix
|
|
|
|
|
|
# ============================================================================
|
|
class BaseCrawlManager(ABC):
    """ abstract crawl manager """

    def __init__(self, templates):
        super().__init__()

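        # Environment-driven settings (descriptions inferred from the variable
        # names and usage, so treat them as assumptions):
        # - JOB_IMAGE (required): container image used to run crawl/profile jobs
        # - NO_DELETE_JOBS: any value other than "0" keeps finished job objects
        #   around, presumably for debugging
        # - CRAWLER_NODE_TYPE: optional node type, so that crawler and job
        #   containers can be scheduled together on the same node type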
        self.job_image = os.environ["JOB_IMAGE"]

        self.no_delete_jobs = os.environ.get("NO_DELETE_JOBS", "0") != "0"

        self.crawler_node_type = os.environ.get("CRAWLER_NODE_TYPE", "")

        self.templates = Jinja2Templates(directory=templates)

        self.loop = asyncio.get_running_loop()

    # pylint: disable=too-many-arguments
    async def run_profile_browser(
        self,
        userid,
        aid,
        url,
        storage=None,
        storage_name=None,
        baseprofile=None,
        profile_path=None,
    ):
        """run browser for profile creation"""

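        # Hypothetical usage from a request handler (names are illustrative,
        # not defined in this module):
        #   browserid = await mgr.run_profile_browser(
        #       user.id, archive.id, "https://example.com/",
        #       storage=archive.storage)
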
        # if a storage object is given (default storage), use its name and
        # its path + "profiles/"
        if storage:
            storage_name = storage.name
            storage_path = storage.path + "profiles/"
        # otherwise, use the passed-in storage name and the existing path
        # from the storage secret
        else:
            storage_path = ""

        await self.check_storage(storage_name)

        browserid = f"prf-{random_suffix()}"

        params = {
            "id": browserid,
            "userid": str(userid),
            "aid": str(aid),
            "job_image": self.job_image,
            "storage_name": storage_name,
            "storage_path": storage_path or "",
            "baseprofile": baseprofile or "",
            "profile_path": profile_path,
            "url": url,
        }

        data = self.templates.env.get_template("profile_job.yaml").render(params)

        await self._create_from_yaml(f"job-{browserid}", data)

        return browserid

    async def add_crawl_config(
        self,
        crawlconfig,
        storage,
        run_now,
        out_filename,
        profile_filename,
    ):
        """add new crawl as cron job, store crawl config in configmap"""

        if storage.type == "default":
            storage_name = storage.name
            storage_path = storage.path
        else:
            storage_name = str(crawlconfig.aid)
            storage_path = ""

        await self.check_storage(storage_name)

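        # The key/value pairs below are stored in the config map; presumably
        # the backend-specific _create_config_map() implementation surfaces
        # them to the crawler container as environment variables.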
        # Create Config Map
        await self._create_config_map(
            crawlconfig,
            STORE_PATH=storage_path,
            STORE_FILENAME=out_filename,
            STORAGE_NAME=storage_name,
            USER_ID=str(crawlconfig.userid),
            ARCHIVE_ID=str(crawlconfig.aid),
            CRAWL_CONFIG_ID=str(crawlconfig.id),
            PROFILE_FILENAME=profile_filename,
        )

        crawl_id = None

        if run_now:
            crawl_id = await self._create_manual_job(crawlconfig)

        await self._update_scheduled_job(crawlconfig)

        return crawl_id

    # pylint: disable=unused-argument
    async def run_crawl_config(self, crawlconfig, userid=None):
        """Run crawl job for the specified crawlconfig, as a cron job would,
        optionally on behalf of a different user"""

        return await self._create_manual_job(crawlconfig)

    async def update_crawlconfig_schedule_or_scale(
        self, crawlconfig, scale=None, schedule=None
    ):
        """ Update the schedule or scale for existing crawl config """

        if schedule is not None:
            await self._update_scheduled_job(crawlconfig)

        if scale is not None:
            await self._update_config_initial_scale(crawlconfig, scale)

        return True

    async def stop_crawl(self, crawl_id, aid, graceful=True):
        """Attempt to stop a crawl: either gracefully, by issuing a SIGTERM,
        which will attempt to finish the current pages,

        OR abruptly, by first issuing a SIGABRT, followed by a SIGTERM,
        which will terminate the crawl immediately"""

        return await self._post_to_job(
            crawl_id, aid, "/cancel" if not graceful else "/stop"
        )

    async def scale_crawl(self, crawl_id, aid, scale=1):
        """ Set the crawl scale (job parallelism) on the specified job """

        return await self._post_to_job(crawl_id, aid, f"/scale/{scale}")

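    # Note: the running crawl job container is assumed to expose HTTP
    # endpoints (/stop, /cancel, /scale/{n}) that _post_to_job() targets;
    # see the backend-specific implementations for details.
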
    async def delete_crawl_configs_for_archive(self, archive):
        """Delete all crawl configs for the given archive"""
        return await self._delete_crawl_configs(f"btrix.archive={archive}")

    async def delete_crawl_config_by_id(self, cid):
        """Delete the crawl config with the given id"""
        return await self._delete_crawl_configs(f"btrix.crawlconfig={cid}")

    async def _create_manual_job(self, crawlconfig):
        cid = str(crawlconfig.id)
        ts_now = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
        crawl_id = f"manual-{ts_now}-{cid[:12]}"
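        # e.g. "manual-20220301120000-4e7d2a1b9c3f": UTC timestamp plus the
        # first 12 chars of the config id (example value is illustrative)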

        data = await self._load_job_template(crawlconfig, crawl_id, manual=True)

        # create job directly
        await self._create_from_yaml(f"job-{crawl_id}", data)

        return crawl_id

    async def _load_job_template(self, crawlconfig, job_id, manual, schedule=None):
        params = {
            "id": job_id,
            "cid": str(crawlconfig.id),
            "userid": str(crawlconfig.userid),
            "aid": str(crawlconfig.aid),
            "job_image": self.job_image,
            "manual": "1" if manual else "0",
            "crawler_node_type": self.crawler_node_type,
            "schedule": schedule,
        }
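        # "manual" is rendered as "1"/"0" for the YAML template; "schedule"
        # is presumably a cron expression when the job is scheduled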

        self._add_extra_crawl_job_params(params)

        return self.templates.env.get_template("crawl_job.yaml").render(params)

    def _add_extra_crawl_job_params(self, params):
        """ add extra params for crawl job template, if any (swarm only) """

    async def _update_config_initial_scale(self, crawlconfig, scale):
        """ update initial scale in config, if needed (k8s only) """

    @abstractmethod
    async def check_storage(self, storage_name, is_default=False):
        """ check if given storage is valid """

    @abstractmethod
    async def _create_from_yaml(self, id_, yaml_data):
        """ create object from given yaml data """

    @abstractmethod
    async def _create_config_map(self, crawlconfig, **kwargs):
        """ create config map for config """

    @abstractmethod
    async def _update_scheduled_job(self, crawlconfig):
        """ update schedule on crawl job """

    @abstractmethod
    async def _post_to_job(self, crawl_id, aid, path, data=None):
        """ make a POST request to the container for specified crawl job """

    @abstractmethod
    async def _delete_crawl_configs(self, label):
        """ delete crawl configs by specified label """
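

# A minimal sketch of a concrete subclass (hypothetical; actual deployments
# would provide real orchestrator-backed implementations, e.g. k8s or swarm):
#
#   class MyCrawlManager(BaseCrawlManager):
#       async def check_storage(self, storage_name, is_default=False):
#           ...  # verify storage_name refers to a configured storage
#
#       async def _create_from_yaml(self, id_, yaml_data):
#           ...  # submit the rendered YAML to the orchestrator
#
#       async def _create_config_map(self, crawlconfig, **kwargs):
#           ...
#
#       async def _update_scheduled_job(self, crawlconfig):
#           ...
#
#       async def _post_to_job(self, crawl_id, aid, path, data=None):
#           ...
#
#       async def _delete_crawl_configs(self, label):
#           ...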