browsertrix/backend/btrixcloud/operator/baseoperator.py
Ilya Kreymer 61239a40ed
include workflow config in QA runs + different browser instances for QA (#1829)
Previously, the workflow crawl settings were not included in QA runs at
all.
This change mounts the crawl workflow config, as well as the QA configmap,
into QA run crawls, allowing page limits from the crawl workflow to be
applied to QA runs.

It also allows a different number of browser instances to be used for QA
runs, as QA runs may work better with fewer browsers (e.g. 2 instead of
4). This can be set with `qa_browser_instances` in the helm chart, as in
the sketch below.
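
A minimal values override illustrating the new chart setting (the value 2
is illustrative, not a default):

    qa_browser_instances: 2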

Defaults QA browser workers to 1 if unset (for now, for best results).

Fixes #1828
2024-05-29 13:32:25 -07:00

""" Base Operator class for all operators """
import asyncio
import os
from typing import TYPE_CHECKING
from kubernetes.utils import parse_quantity
import yaml
from btrixcloud.k8sapi import K8sAPI

if TYPE_CHECKING:
    from btrixcloud.crawlconfigs import CrawlConfigOps
    from btrixcloud.crawls import CrawlOps
    from btrixcloud.orgs import OrgOps
    from btrixcloud.colls import CollectionOps
    from btrixcloud.storages import StorageOps
    from btrixcloud.webhooks import EventWebhookOps
    from btrixcloud.users import UserManager
    from btrixcloud.background_jobs import BackgroundJobOps
    from btrixcloud.pages import PageOps
    from redis.asyncio.client import Redis
else:
    CrawlConfigOps = CrawlOps = OrgOps = CollectionOps = Redis = object
    StorageOps = EventWebhookOps = UserManager = BackgroundJobOps = PageOps = object


# ============================================================================
class K8sOpAPI(K8sAPI):
    """Additional k8s api for operators"""

    has_pod_metrics: bool
    max_crawler_memory_size: int

    def __init__(self):
        super().__init__()
        self.config_file = "/config/config.yaml"
        with open(self.config_file, encoding="utf-8") as fh_config:
            self.shared_params = yaml.safe_load(fh_config)

        self.has_pod_metrics = False
        self.max_crawler_memory_size = 0

        self.compute_crawler_resources()
        self.compute_profile_resources()

    def compute_crawler_resources(self):
        """compute memory / cpu resources for crawlers"""
        p = self.shared_params
        num_workers = max(int(p["crawler_browser_instances"]), 1)
        try:
            qa_num_workers = max(int(p["qa_browser_instances"]), 1)
        # pylint: disable=bare-except
        except:
            # default to 1 for now for best results (to revisit in the future)
            qa_num_workers = 1

        crawler_cpu: float = 0
        crawler_memory: int = 0
        qa_cpu: float = 0
        qa_memory: int = 0

        print("crawler resources")
        if not p.get("crawler_cpu"):
            base = parse_quantity(p["crawler_cpu_base"])
            extra = parse_quantity(p["crawler_extra_cpu_per_browser"])

            # cpu is a floating value of cpu cores
            crawler_cpu = float(base + (num_workers - 1) * extra)
            qa_cpu = float(base + (qa_num_workers - 1) * extra)

            print(f"cpu = {base} + {num_workers - 1} * {extra} = {crawler_cpu}")
            print(f"qa_cpu = {base} + {qa_num_workers - 1} * {extra} = {qa_cpu}")
        else:
            crawler_cpu = float(parse_quantity(p["crawler_cpu"]))
            qa_cpu = crawler_cpu
            print(f"cpu = {crawler_cpu}")

        if not p.get("crawler_memory"):
            base = parse_quantity(p["crawler_memory_base"])
            extra = parse_quantity(p["crawler_extra_memory_per_browser"])

            # memory is always an int
            crawler_memory = int(base + (num_workers - 1) * extra)
            qa_memory = int(base + (qa_num_workers - 1) * extra)

            print(f"memory = {base} + {num_workers - 1} * {extra} = {crawler_memory}")
            print(f"qa_memory = {base} + {qa_num_workers - 1} * {extra} = {qa_memory}")
        else:
            crawler_memory = int(parse_quantity(p["crawler_memory"]))
            qa_memory = crawler_memory
            print(f"memory = {crawler_memory}")

        max_crawler_memory_size = 0
        max_crawler_memory = os.environ.get("MAX_CRAWLER_MEMORY")
        if max_crawler_memory:
            max_crawler_memory_size = int(parse_quantity(max_crawler_memory))

        self.max_crawler_memory_size = max_crawler_memory_size or crawler_memory
        print(f"max crawler memory size: {self.max_crawler_memory_size}")

        p["crawler_cpu"] = crawler_cpu
        p["crawler_memory"] = crawler_memory
        p["crawler_workers"] = num_workers
        p["qa_cpu"] = qa_cpu
        p["qa_memory"] = qa_memory
        p["qa_workers"] = qa_num_workers

    def compute_profile_resources(self):
        """compute memory / cpu resources for a single profile browser"""
        p = self.shared_params
        # if no profile-specific options are provided, default to the
        # crawler base resources for a single browser
        profile_cpu = parse_quantity(
            p.get("profile_browser_cpu") or p["crawler_cpu_base"]
        )
        profile_memory = parse_quantity(
            p.get("profile_browser_memory") or p["crawler_memory_base"]
        )
        p["profile_cpu"] = profile_cpu
        p["profile_memory"] = profile_memory

        print("profile browser resources")
        print(f"cpu = {profile_cpu}")
        print(f"memory = {profile_memory}")

    async def async_init(self):
        """perform any async init here"""
        self.has_pod_metrics = await self.is_pod_metrics_available()
        print("Pod Metrics Available:", self.has_pod_metrics)


# pylint: disable=too-many-instance-attributes, too-many-arguments
# ============================================================================
class BaseOperator:
    """BaseOperator"""

    k8s: K8sOpAPI
    crawl_config_ops: CrawlConfigOps
    crawl_ops: CrawlOps
    org_ops: OrgOps
    coll_ops: CollectionOps
    storage_ops: StorageOps
    event_webhook_ops: EventWebhookOps
    background_job_ops: BackgroundJobOps
    user_ops: UserManager
    page_ops: PageOps

    def __init__(
        self,
        k8s,
        crawl_config_ops,
        crawl_ops,
        org_ops,
        coll_ops,
        storage_ops,
        event_webhook_ops,
        background_job_ops,
        page_ops,
    ):
        self.k8s = k8s
        self.crawl_config_ops = crawl_config_ops
        self.crawl_ops = crawl_ops
        self.org_ops = org_ops
        self.coll_ops = coll_ops
        self.storage_ops = storage_ops
        self.background_job_ops = background_job_ops
        self.event_webhook_ops = event_webhook_ops
        self.page_ops = page_ops
        self.user_ops = crawl_config_ops.user_manager

        # to avoid background tasks being garbage collected
        # see: https://stackoverflow.com/a/74059981
        self.bg_tasks = set()

    def init_routes(self, app):
        """init routes for this operator"""

    def run_task(self, func):
        """add bg tasks to set to avoid premature garbage collection"""
        task = asyncio.create_task(func)
        self.bg_tasks.add(task)
        task.add_done_callback(self.bg_tasks.discard)
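
    # Usage sketch (the coroutine name is hypothetical): fire-and-forget a
    # coroutine while the set keeps a strong reference to the Task so it is
    # not garbage collected mid-flight:
    #   self.run_task(self.some_async_update())  # some_async_update: hypothetical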

    def load_from_yaml(self, filename, params):
        """load and parse k8s template from yaml file"""
        return list(
            yaml.safe_load_all(
                self.k8s.templates.env.get_template(filename).render(params)
            )
        )
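

# Usage sketch (template name and params are hypothetical): render a Jinja2
# k8s YAML template into a list of object dicts, e.g. for an operator to
# return as children:
#   children = operator.load_from_yaml("crawl_job.yaml", {"id": crawl_id})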