Additional operator edge case fixes (#2007)

Fix a few edge-case situations:
- Restart evicted pods that have reached the terminal `Failed` state
with reason `Evicted` by recreating them. Kubernetes does not retry
these pods automatically, so they must be recreated (eviction usually
happens due to memory pressure on the node); see the sketch after this
list.
- Don't treat containers in `ContainerCreating` as running. Although
this state is usually brief, containers can get stuck there, and
excluding it improves the accuracy of exec seconds tracking.
- Consolidate the state transition for running states: the state is set
either to `running` or to `pending-wait` / `generate-wacz` /
`uploading-wacz`, and may change to any of these states from each other
or from `waiting_capacity`.
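
For reference, a minimal sketch of how an evicted pod appears in the pod
status the operator inspects (the standalone `is_evicted` helper is
illustrative only; the phase/reason fields match the diff below):

```python
# illustrative helper, not part of this commit: detect an evicted pod
# from its status dict as reported by the Kubernetes API
def is_evicted(pstatus: dict) -> bool:
    # evicted pods land in the terminal "Failed" phase with reason
    # "Evicted"; Kubernetes never restarts them, so the operator has to
    # recreate the pod itself
    return pstatus.get("phase") == "Failed" and pstatus.get("reason") == "Evicted"
```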
Ilya Kreymer 2024-08-09 13:12:25 -07:00 committed by GitHub
parent 3923996aaf
commit 4ec7cf8adc
2 changed files with 48 additions and 42 deletions


@@ -305,7 +305,7 @@ class CrawlOperator(BaseOperator):
             "resyncAfterSeconds": status.resync_after,
         }
 
-    def _load_redis(self, params, status, children):
+    def _load_redis(self, params, status: CrawlStatus, children):
         name = f"redis-{params['id']}"
         has_pod = name in children[POD]
@@ -313,11 +313,13 @@ class CrawlOperator(BaseOperator):
         params["name"] = name
         params["cpu"] = pod_info.newCpu or params.get("redis_cpu")
         params["memory"] = pod_info.newMemory or params.get("redis_memory")
-        restart = pod_info.should_restart_pod() and has_pod
-        if restart:
-            print(f"Restart {name}")
+        restart_reason = None
+        if has_pod:
+            restart_reason = pod_info.should_restart_pod()
+            if restart_reason:
+                print(f"Restarting {name}, reason: {restart_reason}")
 
-        params["init_redis"] = status.initRedis and not restart
+        params["init_redis"] = status.initRedis and not restart_reason
 
         return self.load_from_yaml("redis.yaml", params)
@@ -362,7 +364,7 @@ class CrawlOperator(BaseOperator):
         params["qa_source_replay_json"] = crawl_replay.json(include={"resources"})
 
         return self.load_from_yaml("qa_configmap.yaml", params)
 
-    def _load_crawler(self, params, i, status, children):
+    def _load_crawler(self, params, i, status: CrawlStatus, children):
         name = f"crawl-{params['id']}-{i}"
         has_pod = name in children[POD]
@@ -387,11 +389,12 @@ class CrawlOperator(BaseOperator):
         else:
             params["memory_limit"] = self.k8s.max_crawler_memory_size
         params["workers"] = params.get(worker_field) or 1
-        params["do_restart"] = (
-            pod_info.should_restart_pod() or params.get("force_restart")
-        ) and has_pod
-        if params.get("do_restart"):
-            print(f"Restart {name}")
+        params["do_restart"] = False
+        if has_pod:
+            restart_reason = pod_info.should_restart_pod(params.get("force_restart"))
+            if restart_reason:
+                print(f"Restarting {name}, reason: {restart_reason}")
+                params["do_restart"] = True
 
         return self.load_from_yaml("crawler.yaml", params)
@@ -523,7 +526,7 @@ class CrawlOperator(BaseOperator):
                 finished=finished,
                 stats=stats,
             )
-            if res:
+            if res and status.state != state:
                 print(f"Setting state: {status.state} -> {state}, {crawl.id}")
                 status.state = state
                 return True
@@ -804,14 +807,6 @@ class CrawlOperator(BaseOperator):
                 status.resync_after = self.fast_retry_secs
                 return status
 
-        # ensure running state is set
-        await self.set_state(
-            "running",
-            status,
-            crawl,
-            allowed_from=["starting", "waiting_capacity"],
-        )
-
         # update lastActiveTime if crawler is running
         if crawler_running:
             status.lastActiveTime = to_k8s_date(dt_now())
@@ -874,6 +869,7 @@ class CrawlOperator(BaseOperator):
         try:
             for name, pod in pods.items():
                 running = False
+                evicted = False
                 pstatus = pod["status"]
                 phase = pstatus["phase"]
@@ -881,18 +877,24 @@ class CrawlOperator(BaseOperator):
                 if phase in ("Running", "Succeeded"):
                     running = True
+                elif phase == "Failed" and pstatus.get("reason") == "Evicted":
+                    evicted = True
+
+                status.podStatus[name].evicted = evicted
 
                 if "containerStatuses" in pstatus:
                     cstatus = pstatus["containerStatuses"][0]
 
-                    # consider 'ContainerCreating' as running
-                    waiting = cstatus["state"].get("waiting")
-                    if (
-                        phase == "Pending"
-                        and waiting
-                        and waiting.get("reason") == "ContainerCreating"
-                    ):
-                        running = True
+                    # don't consider 'ContainerCreating' as running for now
+                    # may be stuck in this state for other reasons
+                    #
+                    # waiting = cstatus["state"].get("waiting")
+                    # if (
+                    #     phase == "Pending"
+                    #     and waiting
+                    #     and waiting.get("reason") == "ContainerCreating"
+                    # ):
+                    #     running = True
 
                     self.handle_terminated_pod(
                         name, role, status, cstatus["state"].get("terminated")
@ -1388,24 +1390,20 @@ class CrawlOperator(BaseOperator):
else: else:
await self.fail_crawl(crawl, status, pods, stats) await self.fail_crawl(crawl, status, pods, stats)
# check for other statuses # check for other statuses, default to "running"
else: else:
new_status: Optional[TYPE_RUNNING_STATES] = None new_status: TYPE_RUNNING_STATES = "running"
if status_count.get("running"):
if status.state in ("generate-wacz", "uploading-wacz", "pending-wacz"):
new_status = "running"
elif status_count.get("generate-wacz"): if status_count.get("generate-wacz"):
new_status = "generate-wacz" new_status = "generate-wacz"
elif status_count.get("uploading-wacz"): elif status_count.get("uploading-wacz"):
new_status = "uploading-wacz" new_status = "uploading-wacz"
elif status_count.get("pending-wait"): elif status_count.get("pending-wait"):
new_status = "pending-wait" new_status = "pending-wait"
if new_status: await self.set_state(
await self.set_state( new_status, status, crawl, allowed_from=RUNNING_AND_WAITING_STATES
new_status, status, crawl, allowed_from=RUNNING_STATES )
)
return status return status
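
The consolidated transition above relies on `set_state` only applying a
new state when the current state is in `allowed_from`. A rough sketch of
that gate, under that assumption (the standalone function below is
illustrative, not the operator's implementation):

```python
from typing import Iterable

# illustrative sketch of the allowed_from gate assumed above: the new
# running-family state is applied only when the current state is one of
# the allowed source states, so crawls can move freely between running,
# pending-wait, generate-wacz and uploading-wacz, or out of
# waiting_capacity, without clobbering a terminal state
def maybe_transition(current: str, new: str, allowed_from: Iterable[str]) -> str:
    return new if current in allowed_from else current

# e.g. "waiting_capacity" -> "running" is applied, while a crawl already
# marked "complete" is left unchanged
```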


@@ -134,6 +134,8 @@ class PodInfo(BaseModel):
     newMemory: Optional[int] = None
     signalAtMem: Optional[int] = None
 
+    evicted: Optional[bool] = False
+
     def dict(self, *a, **kw):
         res = super().dict(*a, **kw)
         percent = {
@@ -168,15 +170,21 @@ class PodInfo(BaseModel):
             else 0
         )
 
-    def should_restart_pod(self):
+    def should_restart_pod(self, forced: bool = False) -> Optional[str]:
         """return true if pod should be restarted"""
         if self.newMemory and self.newMemory != self.allocated.memory:
-            return True
+            return "newMemory"
 
         if self.newCpu and self.newCpu != self.allocated.cpu:
-            return True
+            return "newCpu"
 
-        return False
+        if self.evicted:
+            return "evicted"
+
+        if forced:
+            return "forced"
+
+        return None
 
 
 # ============================================================================
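
A small self-contained sketch of the new reason-string contract (the
`PodInfoStub` class below is a stand-in for illustration, not the real
pydantic `PodInfo` model):

```python
from typing import Optional

# stand-in for PodInfo showing the contract: any non-None return value
# means "restart this pod", and the string names the reason for logging
class PodInfoStub:
    def __init__(self, evicted: bool = False,
                 new_memory: Optional[int] = None,
                 allocated_memory: Optional[int] = None):
        self.evicted = evicted
        self.new_memory = new_memory
        self.allocated_memory = allocated_memory

    def should_restart_pod(self, forced: bool = False) -> Optional[str]:
        if self.new_memory and self.new_memory != self.allocated_memory:
            return "newMemory"
        if self.evicted:
            return "evicted"
        if forced:
            return "forced"
        return None

# an evicted pod is restarted even when its resources are unchanged
assert PodInfoStub(evicted=True).should_restart_pod() == "evicted"
# a forced restart still produces a reason when nothing else changed
assert PodInfoStub().should_restart_pod(forced=True) == "forced"
```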