additional scale / browser window cleanup to properly support QA: (#2663)
Follow-up to #2627:
- use qa_num_browser_windows to set the exact number of QA browsers, falling back to qa_scale
- set num_browser_windows and num_browsers_per_pod using crawler / QA values depending on whether this is a QA crawl
- scale_from_browser_windows() accepts an optional browsers_per_pod when dealing with a possible QA override
- store 'desiredScale' in CrawlStatus to avoid recomputing it for later scale resolving
- ensure status.scale is always the actual observed scale
This commit is contained in:
parent
001277ac9d
commit
d4a2a66d6d
@ -57,6 +57,7 @@ class K8sOpAPI(K8sAPI):
|
||||
except:
|
||||
# default to 1 for now for best results (to revisit in the future)
|
||||
qa_num_workers = 1
|
||||
p["qa_browser_instances"] = 1
|
||||
|
||||
crawler_memory, crawler_cpu = self.compute_for_num_browsers(
|
||||
num_workers, p.get("crawler_memory"), p.get("crawler_cpu")
|
||||
|
@ -101,8 +101,6 @@ class CrawlOperator(BaseOperator):
|
||||
|
||||
paused_expires_delta: timedelta
|
||||
|
||||
num_browsers_per_pod: int
|
||||
|
||||
def __init__(self, *args):
|
||||
super().__init__(*args)
|
||||
|
||||
@ -127,8 +125,6 @@ class CrawlOperator(BaseOperator):
|
||||
|
||||
self.paused_expires_delta = timedelta(minutes=paused_crawl_limit_minutes)
|
||||
|
||||
self.num_browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))
|
||||
|
||||
def init_routes(self, app):
|
||||
"""init routes for this operator"""
|
||||
|
||||
@ -188,9 +184,6 @@ class CrawlOperator(BaseOperator):
|
||||
is_single_page=spec.get("isSinglePage") == "1",
|
||||
)
|
||||
|
||||
if crawl.qa_source_crawl_id:
|
||||
crawl.browser_windows = int(params.get("qa_scale", 1))
|
||||
|
||||
# if finalizing, crawl is being deleted
|
||||
if data.finalizing:
|
||||
if not status.finished:
|
||||
@ -259,8 +252,13 @@ class CrawlOperator(BaseOperator):
|
||||
"starting", status, crawl, allowed_from=["waiting_org_limit"]
|
||||
)
|
||||
|
||||
if len(pods):
|
||||
status.scale = len(pods)
|
||||
if status.scale:
|
||||
for pod_name, pod in pods.items():
|
||||
# don't count redis pod
|
||||
if pod_name.startswith("redis-"):
|
||||
status.scale -= 1
|
||||
|
||||
self.sync_resources(status, pod_name, pod, data.children)
|
||||
|
||||
status = await self.sync_crawl_state(redis_url, crawl, status, pods, data)
|
||||
@ -282,9 +280,6 @@ class CrawlOperator(BaseOperator):
|
||||
pods, crawl, status, EXEC_TIME_UPDATE_SECS
|
||||
)
|
||||
|
||||
else:
|
||||
status.scale = 1
|
||||
|
||||
# stopping paused crawls
|
||||
if crawl.paused_at:
|
||||
stop_reason: Optional[StopReason] = None
|
||||
@ -367,22 +362,31 @@ class CrawlOperator(BaseOperator):
|
||||
if crawl.qa_source_crawl_id:
|
||||
params["qa_source_crawl_id"] = crawl.qa_source_crawl_id
|
||||
children.extend(await self._load_qa_configmap(params, data.children))
|
||||
num_browsers_per_pod = int(params["qa_browser_instances"])
|
||||
num_browser_windows = int(params.get("qa_num_browser_windows", 1))
|
||||
else:
|
||||
num_browsers_per_pod = int(params["crawler_browser_instances"])
|
||||
num_browser_windows = crawl.browser_windows
|
||||
|
||||
# desired scale is the number of pods to create
|
||||
status.desiredScale = scale_from_browser_windows(
|
||||
num_browser_windows, num_browsers_per_pod
|
||||
)
|
||||
|
||||
if status.pagesFound < status.desiredScale:
|
||||
status.desiredScale = max(1, status.pagesFound)
|
||||
|
||||
is_paused = bool(crawl.paused_at) and status.state == "paused"
|
||||
|
||||
# crawl_scale is the number of pods to create
|
||||
crawler_scale = scale_from_browser_windows(crawl.browser_windows)
|
||||
|
||||
for i in range(0, crawler_scale):
|
||||
if status.pagesFound < i * self.num_browsers_per_pod:
|
||||
for i in range(0, status.desiredScale):
|
||||
if status.pagesFound < i * num_browsers_per_pod:
|
||||
break
|
||||
|
||||
children.extend(
|
||||
self._load_crawler(
|
||||
params,
|
||||
i,
|
||||
crawler_scale,
|
||||
crawl.browser_windows,
|
||||
num_browser_windows,
|
||||
status,
|
||||
data.children,
|
||||
is_paused,
|
||||
@ -498,7 +502,6 @@ class CrawlOperator(BaseOperator):
|
||||
self,
|
||||
params,
|
||||
i: int,
|
||||
total_pods: int,
|
||||
total_browser_windows: int,
|
||||
status: CrawlStatus,
|
||||
children,
|
||||
@ -506,6 +509,7 @@ class CrawlOperator(BaseOperator):
|
||||
):
|
||||
name = f"crawl-{params['id']}-{i}"
|
||||
has_pod = name in children[POD]
|
||||
total_pods = status.desiredScale
|
||||
|
||||
if params.get("qa_source_crawl_id"):
|
||||
cpu_field = "qa_cpu"
|
||||
@ -581,41 +585,29 @@ class CrawlOperator(BaseOperator):
|
||||
|
||||
return False
|
||||
|
||||
# pylint: disable=too-many-arguments
|
||||
async def _resolve_scale(
|
||||
async def _resolve_scale_down(
|
||||
self,
|
||||
crawl: CrawlSpec,
|
||||
redis: Redis,
|
||||
status: CrawlStatus,
|
||||
pods: dict[str, dict],
|
||||
):
|
||||
"""Resolve scale
|
||||
If desired_scale >= actual scale, just set (also limit by number of pages
|
||||
found).
|
||||
) -> None:
|
||||
"""Resolve scale down
|
||||
Limit desired scale to number of pages
|
||||
If desired_scale >= actual scale, just return
|
||||
If desired scale < actual scale, attempt to shut down each crawl instance
|
||||
via redis setting. If contiguous instances shutdown (successful exit), lower
|
||||
scale and clean up previous scale state.
|
||||
"""
|
||||
desired_scale = status.desiredScale
|
||||
actual_scale = status.scale
|
||||
|
||||
desired_scale = scale_from_browser_windows(crawl.browser_windows)
|
||||
|
||||
if status.pagesFound < desired_scale:
|
||||
desired_scale = max(1, status.pagesFound)
|
||||
|
||||
if desired_scale == status.scale:
|
||||
return status.scale
|
||||
# if not scaling down, just return
|
||||
if desired_scale >= actual_scale:
|
||||
return
|
||||
|
||||
crawl_id = crawl.id
|
||||
|
||||
# actual scale (minus redis pod)
|
||||
actual_scale = len(pods)
|
||||
if pods.get(f"redis-{crawl_id}"):
|
||||
actual_scale -= 1
|
||||
|
||||
# if desired_scale same or scaled up, return desired_scale
|
||||
if desired_scale >= actual_scale:
|
||||
return desired_scale
|
||||
|
||||
new_scale = actual_scale
|
||||
for i in range(actual_scale - 1, desired_scale - 1, -1):
|
||||
name = f"crawl-{crawl_id}-{i}"
|
||||
@ -641,8 +633,6 @@ class CrawlOperator(BaseOperator):
|
||||
await redis.hdel(f"{crawl_id}:stopone", name)
|
||||
await redis.hdel(f"{crawl_id}:status", name)
|
||||
|
||||
return new_scale
|
||||
|
||||
def sync_resources(self, status, name, pod, children):
|
||||
"""set crawljob status from current resources"""
|
||||
resources = status.podStatus[name].allocated
|
||||
@ -1544,8 +1534,8 @@ class CrawlOperator(BaseOperator):
|
||||
f"Crawl gracefully stopping: {status.stopReason}, id: {crawl.id}"
|
||||
)
|
||||
|
||||
# resolve scale
|
||||
await self._resolve_scale(crawl, redis, status, pods)
|
||||
# resolve scale down, if needed
|
||||
await self._resolve_scale_down(crawl, redis, status, pods)
|
||||
|
||||
# check if done / failed
|
||||
status_count: dict[str, int] = {}
|
||||
|
@ -209,8 +209,12 @@ class CrawlStatus(BaseModel):
|
||||
size: int = 0
|
||||
# human readable size string
|
||||
sizeHuman: str = ""
|
||||
# number of pods
|
||||
scale: int = 1
|
||||
|
||||
# actual observed scale (number of pods active)
|
||||
scale: int = 0
|
||||
# desired scale as computed by crawl state (number of pods that should be active)
|
||||
desiredScale: int = 0
|
||||
|
||||
filesAdded: int = 0
|
||||
filesAddedSize: int = 0
|
||||
finished: Optional[str] = None
|
||||
|
@ -24,6 +24,8 @@ from slugify import slugify
|
||||
|
||||
default_origin = os.environ.get("APP_ORIGIN", "")
|
||||
|
||||
browsers_per_pod = int(os.environ.get("NUM_BROWSERS", 1))
|
||||
|
||||
|
||||
class JSONSerializer(json.JSONEncoder):
|
||||
"""Serializer class for json.dumps with UUID and datetime support"""
|
||||
@ -203,13 +205,13 @@ def validate_language_code(lang: str):
|
||||
raise HTTPException(status_code=400, detail="invalid_lang")
|
||||
|
||||
|
||||
def scale_from_browser_windows(
    browser_windows: int, custom_browsers_per_pod=None
) -> int:
    """Return number of pods for given number of browser windows

    :param browser_windows: total number of browser windows requested
    :param custom_browsers_per_pod: optional override for browsers per pod
        (used for QA crawls); when falsy, falls back to the module-level
        ``browsers_per_pod`` derived from the NUM_BROWSERS env var
    :returns: number of pods needed, rounded up so every window fits
    """
    # ceil so a partial pod's worth of windows still gets its own pod
    return math.ceil(browser_windows / (custom_browsers_per_pod or browsers_per_pod))
||||
|
||||
|
||||
def browser_windows_from_scale(scale: int) -> int:
    """Convert a pod scale into the total browser window count.

    Each pod runs NUM_BROWSERS browsers (default 1), so the total is
    simply pods multiplied by browsers-per-pod.
    """
    per_pod = int(os.environ.get("NUM_BROWSERS", 1))
    return per_pod * scale
|
||||
|
@ -42,7 +42,7 @@ def qa_run_pages_ready(qa_crawl_id, crawler_auth_headers, default_org_id, qa_run
|
||||
if count + 1 == MAX_ATTEMPTS:
|
||||
assert False
|
||||
|
||||
time.sleep(5)
|
||||
time.sleep(10)
|
||||
count += 1
|
||||
|
||||
# Wait until pages are ready
|
||||
|
@ -174,7 +174,7 @@ data:
|
||||
|
||||
profile_browser_workdir_size: "{{ .Values.profile_browser_workdir_size | default "4Gi" }}"
|
||||
|
||||
qa_scale: "{{ .Values.qa_scale | default 1 }}"
|
||||
qa_num_browser_windows: "{{ .Values.qa_num_browser_windows | default (.Values.qa_scale | default 1) }}"
|
||||
|
||||
crawler_node_type: "{{ .Values.crawler_node_type }}"
|
||||
redis_node_type: "{{ .Values.redis_node_type }}"
|
||||
|
@ -281,8 +281,8 @@ crawler_browser_instances: 2
|
||||
# defaults to 'crawler_browser_instances' if not set
|
||||
qa_browser_instances: 1
|
||||
|
||||
# fixed scale (number of QA pods) to run
|
||||
qa_scale: 1
|
||||
# number of browser windows to run for QA (with 'qa_browser_instances' per pod)
|
||||
qa_num_browser_windows: 2
|
||||
|
||||
# this value is added to crawler_cpu_base, for each additional browser
|
||||
# crawler_cpu = crawler_cpu_base + crawler_cpu_per_extra_browser * (crawler_browser_instances - 1)
|
||||
|
Loading…
Reference in New Issue
Block a user