include workflow config in QA runs + different browser instances for QA (#1829)

Previously, the workflow crawl settings were not included in QA runs at
all.
This change mounts the crawl workflow config, as well as the QA
configmap, into QA run crawls, allowing page limits from the crawl
workflow to be applied to QA runs.
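
For illustration, a QA run pod rendered from the updated crawler.yaml template would now mount both configs roughly as follows (sketch only; paths and names as in the diff below):

    volumeMounts:
      - name: crawl-config                  # workflow crawl config, now included in QA runs
        mountPath: /tmp/crawl-config.json
        subPath: crawl-config.json
        readOnly: True
      - name: qa-config                     # replay JSON for the crawl being QA'd
        mountPath: /tmp/qa-config.json
        subPath: qa-config.json
        readOnly: True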

It also allows a different number of browser instances to be used for
QA runs, since QA runs may work better with fewer browsers (e.g. 2
instead of 4). This can be set with `qa_browser_instances` in the Helm
chart.

QA browser workers default to 1 if unset (for now, for best results).
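
For example, a deployment could opt into two QA browsers per pod via its Helm values (illustrative snippet; only the qa_browser_instances key is new here):

    crawler_browser_instances: 4   # browsers per crawler pod for regular crawls
    qa_browser_instances: 2        # browsers per crawler pod for QA runs; falls back to 1 if unset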

Fixes #1828
Ilya Kreymer 2024-05-29 13:32:25 -07:00 committed by GitHub
parent 18e5ed94f1
commit 61239a40ed
5 changed files with 60 additions and 30 deletions

View File

@@ -47,20 +47,31 @@ class K8sOpAPI(K8sAPI):
def compute_crawler_resources(self):
"""compute memory / cpu resources for crawlers"""
p = self.shared_params
num = max(int(p["crawler_browser_instances"]) - 1, 0)
num_workers = max(int(p["crawler_browser_instances"]), 1)
try:
qa_num_workers = max(int(p["qa_browser_instances"]), 1)
# pylint: disable=bare-except
except:
# default to 1 for now for best results (to revisit in the future)
qa_num_workers = 1
crawler_cpu: float = 0
crawler_memory: int = 0
qa_cpu: float = 0
qa_memory: int = 0
print("crawler resources")
if not p.get("crawler_cpu"):
base = parse_quantity(p["crawler_cpu_base"])
extra = parse_quantity(p["crawler_extra_cpu_per_browser"])
# cpu is a floating value of cpu cores
crawler_cpu = float(base + num * extra)
crawler_cpu = float(base + (num_workers - 1) * extra)
qa_cpu = float(base + (qa_num_workers - 1) * extra)
print(f"cpu = {base} + {num} * {extra} = {crawler_cpu}")
print(f"cpu = {base} + {num_workers - 1} * {extra} = {crawler_cpu}")
print(f"qa_cpu = {base} + {qa_num_workers - 1} * {extra} = {qa_cpu}")
else:
crawler_cpu = float(parse_quantity(p["crawler_cpu"]))
qa_cpu = crawler_cpu
print(f"cpu = {crawler_cpu}")
if not p.get("crawler_memory"):
@@ -68,11 +79,14 @@ class K8sOpAPI(K8sAPI):
extra = parse_quantity(p["crawler_extra_memory_per_browser"])
# memory is always an int
crawler_memory = int(base + num * extra)
crawler_memory = int(base + (num_workers - 1) * extra)
qa_memory = int(base + (qa_num_workers - 1) * extra)
print(f"memory = {base} + {num} * {extra} = {crawler_memory}")
print(f"memory = {base} + {num_workers - 1} * {extra} = {crawler_memory}")
print(f"qa_memory = {base} + {qa_num_workers - 1} * {extra} = {qa_memory}")
else:
crawler_memory = int(parse_quantity(p["crawler_memory"]))
qa_memory = crawler_memory
print(f"memory = {crawler_memory}")
max_crawler_memory_size = 0
@@ -86,6 +100,10 @@ class K8sOpAPI(K8sAPI):
p["crawler_cpu"] = crawler_cpu
p["crawler_memory"] = crawler_memory
p["crawler_workers"] = num_workers
p["qa_cpu"] = qa_cpu
p["qa_memory"] = qa_memory
p["qa_workers"] = qa_num_workers
def compute_profile_resources(self):
"""compute memory /cpu resources for a single profile browser"""

View File

@@ -335,22 +335,30 @@ class CrawlOperator(BaseOperator):
name = f"crawl-{params['id']}-{i}"
has_pod = name in children[POD]
if params.get("qa_source_crawl_id"):
cpu_field = "qa_cpu"
mem_field = "qa_memory"
worker_field = "qa_workers"
pri_class = f"qa-crawl-pri-{i}"
else:
cpu_field = "crawler_cpu"
mem_field = "crawler_memory"
worker_field = "crawler_workers"
pri_class = f"crawl-pri-{i}"
pod_info = status.podStatus[name]
params["name"] = name
params["cpu"] = pod_info.newCpu or params.get("crawler_cpu")
params["memory"] = pod_info.newMemory or params.get("crawler_memory")
params["priorityClassName"] = pri_class
params["cpu"] = pod_info.newCpu or params.get(cpu_field)
params["memory"] = pod_info.newMemory or params.get(mem_field)
params["memory_limit"] = float(params["memory"]) * MEM_LIMIT_PADDING
params["workers"] = params.get(worker_field) or 1
params["do_restart"] = (
pod_info.should_restart_pod() or params.get("force_restart")
) and has_pod
if params.get("do_restart"):
print(f"Restart {name}")
if params.get("qa_source_crawl_id"):
params["priorityClassName"] = f"qa-crawl-pri-{i}"
else:
params["priorityClassName"] = f"crawl-pri-{i}"
return self.load_from_yaml("crawler.yaml", params)
# pylint: disable=too-many-arguments
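
With the selection above, the template parameters for a QA run pod end up roughly like this (hypothetical values, shown as the key/value pairs passed into crawler.yaml):

    cpu: "0.9"                        # from qa_cpu
    memory: "1073741824"              # from qa_memory
    workers: 1                        # from qa_workers, defaulting to 1
    priorityClassName: qa-crawl-pri-0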

View File

@@ -61,11 +61,12 @@ spec:
volumes:
- name: crawl-config
configMap:
{% if not qa_source_crawl_id %}
name: crawl-config-{{ cid }}
{% else %}
{% if qa_source_crawl_id %}
- name: qa-config
configMap:
name: qa-replay-{{ qa_source_crawl_id }}
{% endif %}
{% endif %}
- name: crawl-data
persistentVolumeClaim:
claimName: {{ name }}
@@ -114,31 +115,33 @@ spec:
image: {{ crawler_image }}
imagePullPolicy: {{ crawler_image_pull_policy }}
command:
{% if not qa_source_crawl_id %}
- crawl
- {{ "crawl" if not qa_source_crawl_id else "qa" }}
- --config
- /tmp/crawl-config.json
- --workers
- "{{ workers }}"
- --redisStoreUrl
- {{ redis_url }}
{%- if profile_filename %}
{% if qa_source_crawl_id %}
- --qaSource
- /tmp/qa-config.json
{% elif profile_filename %}
- --profile
- "@{{ profile_filename }}"
{%- endif %}
{% else %}
- qa
- --qaSource
- /tmp/crawl-config.json
- --redisStoreUrl
- {{ redis_url }}
- --writePagesToRedis
{% endif %}
{% endif %}
volumeMounts:
- name: crawl-config
mountPath: /tmp/crawl-config.json
subPath: crawl-config.json
readOnly: True
{% if qa_source_crawl_id %}
- name: qa-config
mountPath: /tmp/qa-config.json
subPath: qa-config.json
readOnly: True
{% endif %}
- name: crawl-data
mountPath: /crawls
envFrom:

View File

@@ -11,4 +11,4 @@ metadata:
role: crawler
data:
crawl-config.json: {{ qa_source_replay_json | tojson }}
qa-config.json: {{ qa_source_replay_json | tojson }}

View File

@@ -67,7 +67,7 @@ metadata:
data:
CRAWL_ARGS: >-
--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }}
--sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }}
---
apiVersion: v1
@@ -105,6 +105,7 @@ data:
crawler_extra_memory_per_browser: "{{ .Values.crawler_extra_memory_per_browser | default 0 }}"
crawler_browser_instances: "{{ .Values.crawler_browser_instances }}"
qa_browser_instances: "{{ .Values.qa_browser_instances }}"
crawler_cpu: "{{ .Values.crawler_cpu }}"
crawler_memory: "{{ .Values.crawler_memory }}"