Improved Scale Handling (#1889)
Fixes #1888

Refactors scale handling:

- Ensures the number of scaled instances does not exceed the number of pages, but is also at minimum 1
- Checks that the finish condition is numFailed + numDone >= desired scale
- If at least one instance succeeds, the crawl is considered successful / done
- If all instances fail, the crawl is considered failed
- Ensures that the pod done count >= the redis done count

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
parent 9140dd75bc
commit 6df10d5fb0
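Condensed into a standalone function, the completion rules from the list above look roughly like this (a minimal sketch: `crawl_finish_state` is a hypothetical helper for illustration, not the operator's actual API, and the real `update_crawl_state` below also handles stopping, cancellation, and a one-page edge case):

```python
def crawl_finish_state(
    num_done: int, num_failed: int, scale: int, pod_done_count: int
) -> str | None:
    """Return a final crawl state, or None if the crawl is still running."""
    # every expected instance has reported either done or failed
    all_completed = (num_done + num_failed) >= scale
    if not all_completed:
        return None

    # at least one instance succeeded, and each redis "done" is backed by
    # a crawler pod that actually exited successfully -> crawl succeeded
    if num_done >= 1 and pod_done_count >= num_done:
        return "complete"

    # no instance succeeded and at least one failed -> crawl failed
    if num_done == 0 and num_failed > 0:
        return "failed"

    return None
```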
@@ -248,7 +248,7 @@ class CrawlOperator(BaseOperator):
             )
 
         else:
-            status.scale = crawl.scale
+            status.scale = 1
             now = dt_now()
             await self.crawl_ops.inc_crawl_exec_time(
                 crawl.db_crawl_id, crawl.is_qa, 0, now
@@ -410,8 +410,8 @@ class CrawlOperator(BaseOperator):
                 actual_scale -= 1
 
         # ensure at least enough pages for the scale
-        if status.pagesFound and status.pagesFound < desired_scale:
-            desired_scale = status.pagesFound
+        if status.pagesFound < desired_scale:
+            desired_scale = max(1, status.pagesFound)
 
         # if desired_scale same or scaled up, return desired_scale
         if desired_scale >= actual_scale:
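The clamp above is the "at minimum 1" behavior from the summary: the desired scale never exceeds the page count, but never drops below one instance. A quick sketch of the resulting values (hypothetical standalone helper; the operator applies this inline to `desired_scale`):

```python
def clamp_scale(desired_scale: int, pages_found: int) -> int:
    # never run more crawler instances than there are pages,
    # but always keep at least one instance running
    if pages_found < desired_scale:
        return max(1, pages_found)
    return desired_scale

assert clamp_scale(3, 0) == 1   # no pages found yet -> scale down to 1
assert clamp_scale(3, 2) == 2   # fewer pages than instances -> shrink to 2
assert clamp_scale(3, 10) == 3  # enough pages -> keep the desired scale
```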
@@ -514,13 +514,15 @@ class CrawlOperator(BaseOperator):
             status.finished = to_k8s_date(finished)
 
         if actual_state != state:
-            print(f"state mismatch, actual state {actual_state}, requested {state}")
+            print(
+                f"State mismatch, actual state {actual_state}, requested {state}, {crawl.id}"
+            )
             if not actual_state and state == "canceled":
                 return True
 
         if status.state != state:
             print(
-                f"Not setting state: {status.state} -> {state}, {crawl.id} not allowed"
+                f"Not setting state: {status.state} -> {state}, not allowed, {crawl.id}"
             )
             return False
 
@@ -730,7 +732,9 @@ class CrawlOperator(BaseOperator):
     ):
         """sync crawl state for running crawl"""
        # check if at least one crawler pod started running
-        crawler_running, redis_running, done = self.sync_pod_status(pods, status)
+        crawler_running, redis_running, pod_done_count = self.sync_pod_status(
+            pods, status
+        )
         redis = None
 
         try:
@@ -745,7 +749,9 @@ class CrawlOperator(BaseOperator):
 
         if not crawler_running or not redis:
             # if either crawler is not running or redis is inaccessible
-            if self.should_mark_waiting(status.state, crawl.started):
+            if not pod_done_count and self.should_mark_waiting(
+                status.state, crawl.started
+            ):
                 # mark as waiting (if already running)
                 await self.set_state(
                     "waiting_capacity",
@@ -755,8 +761,10 @@ class CrawlOperator(BaseOperator):
                 )
 
             if not crawler_running and redis:
-                # if crawler running, but no redis, stop redis instance until crawler
-                # is running
+                # if crawler is not running for REDIS_TTL seconds, also stop redis
+                # but not right away in case crawler pod is just restarting.
+                # avoids keeping redis pods around while no crawler pods are up
+                # (eg. due to resource constraints)
                 if status.lastActiveTime and (
                     (
                         dt_now() - from_k8s_date(status.lastActiveTime)
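The rewritten comments above describe a grace period: redis is stopped only after the crawler has been down for `REDIS_TTL` seconds, so a crawler pod that is merely restarting does not lose its redis state. Sketched as a standalone check (an assumed helper; the operator derives this from `status.lastActiveTime` as shown in the hunk):

```python
from datetime import datetime, timedelta, timezone

def should_stop_redis(last_active: datetime, redis_ttl_secs: int) -> bool:
    # stop redis only once the crawler has been down for the full TTL,
    # not immediately, in case the crawler pod is just restarting
    idle = datetime.now(timezone.utc) - last_active
    return idle >= timedelta(seconds=redis_ttl_secs)
```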
@@ -770,7 +778,6 @@ class CrawlOperator(BaseOperator):
             elif crawler_running and not redis:
                 # if crawler is running, but no redis, init redis
                 status.initRedis = True
-                status.lastActiveTime = to_k8s_date(dt_now())
 
             # if no crawler / no redis, resync after N seconds
             status.resync_after = self.fast_retry_secs
@@ -791,6 +798,10 @@ class CrawlOperator(BaseOperator):
                 )
             )
 
+        # update lastActiveTime if crawler is running
+        if crawler_running:
+            status.lastActiveTime = to_k8s_date(dt_now())
+
         file_done = await redis.lpop(self.done_key)
         while file_done:
             msg = json.loads(file_done)
@@ -824,7 +835,9 @@ class CrawlOperator(BaseOperator):
             status.filesAddedSize = int(await redis.get("filesAddedSize") or 0)
 
             # update stats and get status
-            return await self.update_crawl_state(redis, crawl, status, pods, done)
+            return await self.update_crawl_state(
+                redis, crawl, status, pods, pod_done_count
+            )
 
         # pylint: disable=broad-except
         except Exception as exc:
@@ -836,11 +849,13 @@ class CrawlOperator(BaseOperator):
             if redis:
                 await redis.close()
 
-    def sync_pod_status(self, pods: dict[str, dict], status: CrawlStatus):
+    def sync_pod_status(
+        self, pods: dict[str, dict], status: CrawlStatus
+    ) -> tuple[bool, bool, int]:
         """check status of pods"""
         crawler_running = False
         redis_running = False
-        done = True
+        pod_done_count = 0
 
         try:
             for name, pod in pods.items():
@@ -871,16 +886,16 @@ class CrawlOperator(BaseOperator):
 
                 if role == "crawler":
                     crawler_running = crawler_running or running
-                    done = done and phase == "Succeeded"
+                    if phase == "Succeeded":
+                        pod_done_count += 1
                 elif role == "redis":
                     redis_running = redis_running or running
 
         # pylint: disable=broad-except
         except Exception as exc:
-            done = False
             print(exc)
 
-        return crawler_running, redis_running, done
+        return crawler_running, redis_running, pod_done_count
 
     def handle_terminated_pod(self, name, role, status: CrawlStatus, terminated):
         """handle terminated pod state"""
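`sync_pod_status` now returns a count of crawler pods that reached phase `Succeeded` instead of AND-ing a single `done` boolean across all pods; that count is what later allows the `pod_done_count >= num_done` cross-check against redis. A loose sketch of the counting (the flat pod-dict shape here is an assumption for illustration):

```python
def count_done_crawler_pods(pods: dict[str, dict]) -> int:
    # count crawler pods whose k8s phase is "Succeeded"
    return sum(
        1
        for pod in pods.values()
        if pod.get("role") == "crawler" and pod.get("phase") == "Succeeded"
    )
```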
@@ -1231,7 +1246,7 @@ class CrawlOperator(BaseOperator):
         crawl: CrawlSpec,
         status: CrawlStatus,
         pods: dict[str, dict],
-        done: bool,
+        pod_done_count: int,
     ) -> CrawlStatus:
         """update crawl state and check if crawl is now done"""
         results = await redis.hgetall(f"{crawl.id}:status")
@@ -1274,13 +1289,20 @@ class CrawlOperator(BaseOperator):
 
         # check if done / failed
         status_count: dict[str, int] = {}
-        for i in range(crawl.scale):
+        for i in range(status.scale):
             res = results.get(f"crawl-{crawl.id}-{i}")
             if res:
                 status_count[res] = status_count.get(res, 0) + 1
 
-        # check if all crawlers are done
-        if done and status_count.get("done", 0) >= crawl.scale:
+        num_done = status_count.get("done", 0)
+        num_failed = status_count.get("failed", 0)
+        # all expected pods are either done or failed
+        all_completed = (num_done + num_failed) >= status.scale
+
+        # if at least one is done according to redis, consider crawl successful
+        # ensure pod successfully exited as well
+        # pylint: disable=chained-comparison
+        if all_completed and num_done >= 1 and pod_done_count >= num_done:
             # check if one-page crawls actually succeeded
             # if only one page found, and no files, assume failed
             if status.pagesFound == 1 and not status.filesAdded:
@@ -1297,8 +1319,8 @@ class CrawlOperator(BaseOperator):
 
             await self.mark_finished(crawl, status, state, stats)
 
-        # check if all crawlers failed
-        elif status_count.get("failed", 0) >= crawl.scale:
+        # check if all crawlers failed -- no crawl data was generated
+        elif all_completed and num_done == 0 and num_failed > 0:
             # if stopping, and no pages finished, mark as canceled
             if status.stopping and not status.pagesDone:
                 await self.mark_finished(crawl, status, "canceled", stats)
@@ -1318,6 +1340,7 @@ class CrawlOperator(BaseOperator):
                 new_status = "uploading-wacz"
             elif status_count.get("pending-wait"):
                 new_status = "pending-wait"
+
             if new_status:
                 await self.set_state(
                     new_status, status, crawl, allowed_from=RUNNING_STATES
@@ -12,6 +12,8 @@ default_crawl_filename_template: "@ts-testing-@hostsuffix.wacz"
 
 operator_resync_seconds: 3
 
+qa_scale: 2
+
 # for testing only
 crawler_extra_cpu_per_browser: 300m
 