additional scale / browser window cleanup to properly support QA: (#2663)

- follow up to #2627 
- use qa_num_browser_windows to set the exact number of QA browser windows,
falling back to qa_scale
- set num_browser_windows and num_browsers_per_pod using crawler / qa
values depending on whether it is a QA crawl
- scale_from_browser_windows() accepts an optional browsers_per_pod to
handle a possible QA override
- store 'desiredScale' in CrawlStatus to avoid recomputing for later
scale resolving
- ensure status.scale is always the actual scale observed
This commit is contained in:
Ilya Kreymer 2025-06-12 13:09:04 -04:00 committed by GitHub
parent 001277ac9d
commit d4a2a66d6d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 52 additions and 55 deletions

View File

@ -57,6 +57,7 @@ class K8sOpAPI(K8sAPI):
except: except:
# default to 1 for now for best results (to revisit in the future) # default to 1 for now for best results (to revisit in the future)
qa_num_workers = 1 qa_num_workers = 1
p["qa_browser_instances"] = 1
crawler_memory, crawler_cpu = self.compute_for_num_browsers( crawler_memory, crawler_cpu = self.compute_for_num_browsers(
num_workers, p.get("crawler_memory"), p.get("crawler_cpu") num_workers, p.get("crawler_memory"), p.get("crawler_cpu")

View File

@ -101,8 +101,6 @@ class CrawlOperator(BaseOperator):
paused_expires_delta: timedelta paused_expires_delta: timedelta
num_browsers_per_pod: int
def __init__(self, *args): def __init__(self, *args):
super().__init__(*args) super().__init__(*args)
@ -127,8 +125,6 @@ class CrawlOperator(BaseOperator):
self.paused_expires_delta = timedelta(minutes=paused_crawl_limit_minutes) self.paused_expires_delta = timedelta(minutes=paused_crawl_limit_minutes)
self.num_browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))
def init_routes(self, app): def init_routes(self, app):
"""init routes for this operator""" """init routes for this operator"""
@ -188,9 +184,6 @@ class CrawlOperator(BaseOperator):
is_single_page=spec.get("isSinglePage") == "1", is_single_page=spec.get("isSinglePage") == "1",
) )
if crawl.qa_source_crawl_id:
crawl.browser_windows = int(params.get("qa_scale", 1))
# if finalizing, crawl is being deleted # if finalizing, crawl is being deleted
if data.finalizing: if data.finalizing:
if not status.finished: if not status.finished:
@ -259,8 +252,13 @@ class CrawlOperator(BaseOperator):
"starting", status, crawl, allowed_from=["waiting_org_limit"] "starting", status, crawl, allowed_from=["waiting_org_limit"]
) )
if len(pods): status.scale = len(pods)
if status.scale:
for pod_name, pod in pods.items(): for pod_name, pod in pods.items():
# don't count redis pod
if pod_name.startswith("redis-"):
status.scale -= 1
self.sync_resources(status, pod_name, pod, data.children) self.sync_resources(status, pod_name, pod, data.children)
status = await self.sync_crawl_state(redis_url, crawl, status, pods, data) status = await self.sync_crawl_state(redis_url, crawl, status, pods, data)
@ -282,9 +280,6 @@ class CrawlOperator(BaseOperator):
pods, crawl, status, EXEC_TIME_UPDATE_SECS pods, crawl, status, EXEC_TIME_UPDATE_SECS
) )
else:
status.scale = 1
# stopping paused crawls # stopping paused crawls
if crawl.paused_at: if crawl.paused_at:
stop_reason: Optional[StopReason] = None stop_reason: Optional[StopReason] = None
@ -367,22 +362,31 @@ class CrawlOperator(BaseOperator):
if crawl.qa_source_crawl_id: if crawl.qa_source_crawl_id:
params["qa_source_crawl_id"] = crawl.qa_source_crawl_id params["qa_source_crawl_id"] = crawl.qa_source_crawl_id
children.extend(await self._load_qa_configmap(params, data.children)) children.extend(await self._load_qa_configmap(params, data.children))
num_browsers_per_pod = int(params["qa_browser_instances"])
num_browser_windows = int(params.get("qa_num_browser_windows", 1))
else:
num_browsers_per_pod = int(params["crawler_browser_instances"])
num_browser_windows = crawl.browser_windows
# desired scale is the number of pods to create
status.desiredScale = scale_from_browser_windows(
num_browser_windows, num_browsers_per_pod
)
if status.pagesFound < status.desiredScale:
status.desiredScale = max(1, status.pagesFound)
is_paused = bool(crawl.paused_at) and status.state == "paused" is_paused = bool(crawl.paused_at) and status.state == "paused"
# crawl_scale is the number of pods to create for i in range(0, status.desiredScale):
crawler_scale = scale_from_browser_windows(crawl.browser_windows) if status.pagesFound < i * num_browsers_per_pod:
for i in range(0, crawler_scale):
if status.pagesFound < i * self.num_browsers_per_pod:
break break
children.extend( children.extend(
self._load_crawler( self._load_crawler(
params, params,
i, i,
crawler_scale, num_browser_windows,
crawl.browser_windows,
status, status,
data.children, data.children,
is_paused, is_paused,
@ -498,7 +502,6 @@ class CrawlOperator(BaseOperator):
self, self,
params, params,
i: int, i: int,
total_pods: int,
total_browser_windows: int, total_browser_windows: int,
status: CrawlStatus, status: CrawlStatus,
children, children,
@ -506,6 +509,7 @@ class CrawlOperator(BaseOperator):
): ):
name = f"crawl-{params['id']}-{i}" name = f"crawl-{params['id']}-{i}"
has_pod = name in children[POD] has_pod = name in children[POD]
total_pods = status.desiredScale
if params.get("qa_source_crawl_id"): if params.get("qa_source_crawl_id"):
cpu_field = "qa_cpu" cpu_field = "qa_cpu"
@ -581,41 +585,29 @@ class CrawlOperator(BaseOperator):
return False return False
# pylint: disable=too-many-arguments async def _resolve_scale_down(
async def _resolve_scale(
self, self,
crawl: CrawlSpec, crawl: CrawlSpec,
redis: Redis, redis: Redis,
status: CrawlStatus, status: CrawlStatus,
pods: dict[str, dict], pods: dict[str, dict],
): ) -> None:
"""Resolve scale """Resolve scale down
If desired_scale >= actual scale, just set (also limit by number of pages Limit desired scale to number of pages
found). If desired_scale >= actual scale, just return
If desired scale < actual scale, attempt to shut down each crawl instance If desired scale < actual scale, attempt to shut down each crawl instance
via redis setting. If contiguous instances shutdown (successful exit), lower via redis setting. If contiguous instances shutdown (successful exit), lower
scale and clean up previous scale state. scale and clean up previous scale state.
""" """
desired_scale = status.desiredScale
actual_scale = status.scale
desired_scale = scale_from_browser_windows(crawl.browser_windows) # if not scaling down, just return
if desired_scale >= actual_scale:
if status.pagesFound < desired_scale: return
desired_scale = max(1, status.pagesFound)
if desired_scale == status.scale:
return status.scale
crawl_id = crawl.id crawl_id = crawl.id
# actual scale (minus redis pod)
actual_scale = len(pods)
if pods.get(f"redis-{crawl_id}"):
actual_scale -= 1
# if desired_scale same or scaled up, return desired_scale
if desired_scale >= actual_scale:
return desired_scale
new_scale = actual_scale new_scale = actual_scale
for i in range(actual_scale - 1, desired_scale - 1, -1): for i in range(actual_scale - 1, desired_scale - 1, -1):
name = f"crawl-{crawl_id}-{i}" name = f"crawl-{crawl_id}-{i}"
@ -641,8 +633,6 @@ class CrawlOperator(BaseOperator):
await redis.hdel(f"{crawl_id}:stopone", name) await redis.hdel(f"{crawl_id}:stopone", name)
await redis.hdel(f"{crawl_id}:status", name) await redis.hdel(f"{crawl_id}:status", name)
return new_scale
def sync_resources(self, status, name, pod, children): def sync_resources(self, status, name, pod, children):
"""set crawljob status from current resources""" """set crawljob status from current resources"""
resources = status.podStatus[name].allocated resources = status.podStatus[name].allocated
@ -1544,8 +1534,8 @@ class CrawlOperator(BaseOperator):
f"Crawl gracefully stopping: {status.stopReason}, id: {crawl.id}" f"Crawl gracefully stopping: {status.stopReason}, id: {crawl.id}"
) )
# resolve scale # resolve scale down, if needed
await self._resolve_scale(crawl, redis, status, pods) await self._resolve_scale_down(crawl, redis, status, pods)
# check if done / failed # check if done / failed
status_count: dict[str, int] = {} status_count: dict[str, int] = {}

View File

@ -209,8 +209,12 @@ class CrawlStatus(BaseModel):
size: int = 0 size: int = 0
# human readable size string # human readable size string
sizeHuman: str = "" sizeHuman: str = ""
# number of pods
scale: int = 1 # actual observed scale (number of pods active)
scale: int = 0
# desired scale as computed by crawl state (number of pods that should be active)
desiredScale: int = 0
filesAdded: int = 0 filesAdded: int = 0
filesAddedSize: int = 0 filesAddedSize: int = 0
finished: Optional[str] = None finished: Optional[str] = None

View File

@ -24,6 +24,8 @@ from slugify import slugify
default_origin = os.environ.get("APP_ORIGIN", "") default_origin = os.environ.get("APP_ORIGIN", "")
browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))
class JSONSerializer(json.JSONEncoder): class JSONSerializer(json.JSONEncoder):
"""Serializer class for json.dumps with UUID and datetime support""" """Serializer class for json.dumps with UUID and datetime support"""
@ -203,13 +205,13 @@ def validate_language_code(lang: str):
raise HTTPException(status_code=400, detail="invalid_lang") raise HTTPException(status_code=400, detail="invalid_lang")
def scale_from_browser_windows(browser_windows: int) -> int: def scale_from_browser_windows(
browser_windows: int, custom_browsers_per_pod=None
) -> int:
"""Return number of pods for given number of browser windows""" """Return number of pods for given number of browser windows"""
browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1)) return math.ceil(browser_windows / (custom_browsers_per_pod or browsers_per_pod))
return math.ceil(browser_windows / browsers_per_pod)
def browser_windows_from_scale(scale: int) -> int: def browser_windows_from_scale(scale: int) -> int:
"""Return number of browser windows from specified scale""" """Return number of browser windows from specified scale"""
browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))
return scale * browsers_per_pod return scale * browsers_per_pod

View File

@ -42,7 +42,7 @@ def qa_run_pages_ready(qa_crawl_id, crawler_auth_headers, default_org_id, qa_run
if count + 1 == MAX_ATTEMPTS: if count + 1 == MAX_ATTEMPTS:
assert False assert False
time.sleep(5) time.sleep(10)
count += 1 count += 1
# Wait until pages are ready # Wait until pages are ready

View File

@ -174,7 +174,7 @@ data:
profile_browser_workdir_size: "{{ .Values.profile_browser_workdir_size | default "4Gi" }}" profile_browser_workdir_size: "{{ .Values.profile_browser_workdir_size | default "4Gi" }}"
qa_scale: "{{ .Values.qa_scale | default 1 }}" qa_num_browser_windows: "{{ .Values.qa_num_browser_windows | default (.Values.qa_scale | default 1) }}"
crawler_node_type: "{{ .Values.crawler_node_type }}" crawler_node_type: "{{ .Values.crawler_node_type }}"
redis_node_type: "{{ .Values.redis_node_type }}" redis_node_type: "{{ .Values.redis_node_type }}"

View File

@ -281,8 +281,8 @@ crawler_browser_instances: 2
# defaults to 'crawler_browser_instances' if not set # defaults to 'crawler_browser_instances' if not set
qa_browser_instances: 1 qa_browser_instances: 1
# fixed scale (number of QA pods) to run # number of browser windows to run for QA (with 'qa_browser_instances' per pod)
qa_scale: 1 qa_num_browser_windows: 2
# this value is added to crawler_cpu_base, for each additional browser # this value is added to crawler_cpu_base, for each additional browser
# crawler_cpu = crawler_cpu_base + crawler_cpu_per_extra_browser * (crawler_browser_instances - 1) # crawler_cpu = crawler_cpu_base + crawler_cpu_per_extra_browser * (crawler_browser_instances - 1)