include workflow config in QA runs + different browser instances for QA (#1829)
Currently, the workflow crawl settings were not being included at all in QA runs. This mounts the crawl workflow config, as well as the QA configmap, into QA run crawls, allowing page limits from the crawl workflow to be applied to QA runs. It also allows a different number of browser instances to be used for QA runs, as QA runs might work better with fewer browsers (e.g., 2 instead of 4). This can be set with `qa_browser_instances` in the helm chart. Default QA browser workers to 1 if unset (for now, for best results). Fixes #1828
This commit is contained in:
parent
18e5ed94f1
commit
61239a40ed
@ -47,20 +47,31 @@ class K8sOpAPI(K8sAPI):
|
||||
def compute_crawler_resources(self):
|
||||
"""compute memory / cpu resources for crawlers"""
|
||||
p = self.shared_params
|
||||
num = max(int(p["crawler_browser_instances"]) - 1, 0)
|
||||
num_workers = max(int(p["crawler_browser_instances"]), 1)
|
||||
try:
|
||||
qa_num_workers = max(int(p["qa_browser_instances"]), 1)
|
||||
# pylint: disable=bare-except
|
||||
except:
|
||||
# default to 1 for now for best results (to revisit in the future)
|
||||
qa_num_workers = 1
|
||||
crawler_cpu: float = 0
|
||||
crawler_memory: int = 0
|
||||
qa_cpu: float = 0
|
||||
qa_memory: int = 0
|
||||
print("crawler resources")
|
||||
if not p.get("crawler_cpu"):
|
||||
base = parse_quantity(p["crawler_cpu_base"])
|
||||
extra = parse_quantity(p["crawler_extra_cpu_per_browser"])
|
||||
|
||||
# cpu is a floating value of cpu cores
|
||||
crawler_cpu = float(base + num * extra)
|
||||
crawler_cpu = float(base + (num_workers - 1) * extra)
|
||||
qa_cpu = float(base + (qa_num_workers - 1) * extra)
|
||||
|
||||
print(f"cpu = {base} + {num} * {extra} = {crawler_cpu}")
|
||||
print(f"cpu = {base} + {num_workers - 1} * {extra} = {crawler_cpu}")
|
||||
print(f"qa_cpu = {base} + {qa_num_workers - 1} * {extra} = {qa_cpu}")
|
||||
else:
|
||||
crawler_cpu = float(parse_quantity(p["crawler_cpu"]))
|
||||
qa_cpu = crawler_cpu
|
||||
print(f"cpu = {crawler_cpu}")
|
||||
|
||||
if not p.get("crawler_memory"):
|
||||
@ -68,11 +79,14 @@ class K8sOpAPI(K8sAPI):
|
||||
extra = parse_quantity(p["crawler_extra_memory_per_browser"])
|
||||
|
||||
# memory is always an int
|
||||
crawler_memory = int(base + num * extra)
|
||||
crawler_memory = int(base + (num_workers - 1) * extra)
|
||||
qa_memory = int(base + (qa_num_workers - 1) * extra)
|
||||
|
||||
print(f"memory = {base} + {num} * {extra} = {crawler_memory}")
|
||||
print(f"memory = {base} + {num_workers - 1} * {extra} = {crawler_memory}")
|
||||
print(f"qa_memory = {base} + {qa_num_workers - 1} * {extra} = {qa_memory}")
|
||||
else:
|
||||
crawler_memory = int(parse_quantity(p["crawler_memory"]))
|
||||
qa_memory = crawler_memory
|
||||
print(f"memory = {crawler_memory}")
|
||||
|
||||
max_crawler_memory_size = 0
|
||||
@ -86,6 +100,10 @@ class K8sOpAPI(K8sAPI):
|
||||
|
||||
p["crawler_cpu"] = crawler_cpu
|
||||
p["crawler_memory"] = crawler_memory
|
||||
p["crawler_workers"] = num_workers
|
||||
p["qa_cpu"] = qa_cpu
|
||||
p["qa_memory"] = qa_memory
|
||||
p["qa_workers"] = qa_num_workers
|
||||
|
||||
def compute_profile_resources(self):
|
||||
"""compute memory /cpu resources for a single profile browser"""
|
||||
|
@ -335,22 +335,30 @@ class CrawlOperator(BaseOperator):
|
||||
name = f"crawl-{params['id']}-{i}"
|
||||
has_pod = name in children[POD]
|
||||
|
||||
if params.get("qa_source_crawl_id"):
|
||||
cpu_field = "qa_cpu"
|
||||
mem_field = "qa_memory"
|
||||
worker_field = "qa_workers"
|
||||
pri_class = f"qa-crawl-pri-{i}"
|
||||
else:
|
||||
cpu_field = "crawler_cpu"
|
||||
mem_field = "crawler_memory"
|
||||
worker_field = "crawler_workers"
|
||||
pri_class = f"crawl-pri-{i}"
|
||||
|
||||
pod_info = status.podStatus[name]
|
||||
params["name"] = name
|
||||
params["cpu"] = pod_info.newCpu or params.get("crawler_cpu")
|
||||
params["memory"] = pod_info.newMemory or params.get("crawler_memory")
|
||||
params["priorityClassName"] = pri_class
|
||||
params["cpu"] = pod_info.newCpu or params.get(cpu_field)
|
||||
params["memory"] = pod_info.newMemory or params.get(mem_field)
|
||||
params["memory_limit"] = float(params["memory"]) * MEM_LIMIT_PADDING
|
||||
params["workers"] = params.get(worker_field) or 1
|
||||
params["do_restart"] = (
|
||||
pod_info.should_restart_pod() or params.get("force_restart")
|
||||
) and has_pod
|
||||
if params.get("do_restart"):
|
||||
print(f"Restart {name}")
|
||||
|
||||
if params.get("qa_source_crawl_id"):
|
||||
params["priorityClassName"] = f"qa-crawl-pri-{i}"
|
||||
else:
|
||||
params["priorityClassName"] = f"crawl-pri-{i}"
|
||||
|
||||
return self.load_from_yaml("crawler.yaml", params)
|
||||
|
||||
# pylint: disable=too-many-arguments
|
||||
|
@ -61,11 +61,12 @@ spec:
|
||||
volumes:
|
||||
- name: crawl-config
|
||||
configMap:
|
||||
{% if not qa_source_crawl_id %}
|
||||
name: crawl-config-{{ cid }}
|
||||
{% else %}
|
||||
{% if qa_source_crawl_id %}
|
||||
- name: qa-config
|
||||
configMap:
|
||||
name: qa-replay-{{ qa_source_crawl_id }}
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
- name: crawl-data
|
||||
persistentVolumeClaim:
|
||||
claimName: {{ name }}
|
||||
@ -114,31 +115,33 @@ spec:
|
||||
image: {{ crawler_image }}
|
||||
imagePullPolicy: {{ crawler_image_pull_policy }}
|
||||
command:
|
||||
{% if not qa_source_crawl_id %}
|
||||
- crawl
|
||||
- {{ "crawl" if not qa_source_crawl_id else "qa" }}
|
||||
- --config
|
||||
- /tmp/crawl-config.json
|
||||
- --workers
|
||||
- "{{ workers }}"
|
||||
- --redisStoreUrl
|
||||
- {{ redis_url }}
|
||||
{%- if profile_filename %}
|
||||
{% if qa_source_crawl_id %}
|
||||
- --qaSource
|
||||
- /tmp/qa-config.json
|
||||
{% elif profile_filename %}
|
||||
- --profile
|
||||
- "@{{ profile_filename }}"
|
||||
{%- endif %}
|
||||
|
||||
{% else %}
|
||||
- qa
|
||||
- --qaSource
|
||||
- /tmp/crawl-config.json
|
||||
- --redisStoreUrl
|
||||
- {{ redis_url }}
|
||||
- --writePagesToRedis
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
volumeMounts:
|
||||
- name: crawl-config
|
||||
mountPath: /tmp/crawl-config.json
|
||||
subPath: crawl-config.json
|
||||
readOnly: True
|
||||
|
||||
{% if qa_source_crawl_id %}
|
||||
- name: qa-config
|
||||
mountPath: /tmp/qa-config.json
|
||||
subPath: qa-config.json
|
||||
readOnly: True
|
||||
{% endif %}
|
||||
|
||||
- name: crawl-data
|
||||
mountPath: /crawls
|
||||
envFrom:
|
||||
|
@ -11,4 +11,4 @@ metadata:
|
||||
role: crawler
|
||||
|
||||
data:
|
||||
crawl-config.json: {{ qa_source_replay_json | tojson }}
|
||||
qa-config.json: {{ qa_source_replay_json | tojson }}
|
||||
|
@ -67,7 +67,7 @@ metadata:
|
||||
|
||||
data:
|
||||
CRAWL_ARGS: >-
|
||||
--workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }}
|
||||
--sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }} --diskUtilization {{ .Values.disk_utilization_threshold }} --logging {{ .Values.crawler_logging_opts }} --text {{ .Values.crawler_extract_full_text }} --generateWACZ --collection thecrawl --screencastPort 9037 --logErrorsToRedis --writePagesToRedis --restartsOnError --headless --screenshot view,thumbnail {{ .Values.crawler_extra_args }}
|
||||
|
||||
---
|
||||
apiVersion: v1
|
||||
@ -105,6 +105,7 @@ data:
|
||||
crawler_extra_memory_per_browser: "{{ .Values.crawler_extra_memory_per_browser | default 0 }}"
|
||||
|
||||
crawler_browser_instances: "{{ .Values.crawler_browser_instances }}"
|
||||
qa_browser_instances: "{{ .Values.qa_browser_instances }}"
|
||||
|
||||
crawler_cpu: "{{ .Values.crawler_cpu }}"
|
||||
crawler_memory: "{{ .Values.crawler_memory }}"
|
||||
|
Loading…
Reference in New Issue
Block a user