better handling of failed redis connection + exec time updates (#1520)
This PR addresses a possible failure when the Redis pod was inaccessible from the Crawler pod. - Ensure the crawl is set to 'waiting_for_capacity' if either no crawler pods are available or no Redis pod is reachable. Previously, a missing/inaccessible Redis would not result in 'waiting_for_capacity' if crawler pods were available. - Rework the logic: if the crawler is not running but Redis is, shut down Redis after >60 seconds of inactivity; if the crawler is running but Redis is not, init (or reinit) Redis. - Track 'lastUpdatedTime' in the db when incrementing exec time, and skip the increment if lastUpdatedTime has not changed (e.g. if an operator sync fails), to avoid double counting. - Add a Redis socket timeout of 20 seconds to avoid timing out operator responses; if the Redis connection takes too long, assume it is unavailable.
This commit is contained in:
parent
d1156b0145
commit
0653657637
@ -451,11 +451,14 @@ class CrawlOps(BaseCrawlOps):
|
||||
query = {"_id": crawl_id, "type": "crawl", "state": "running"}
|
||||
return await self.crawls.find_one_and_update(query, {"$set": {"stats": stats}})
|
||||
|
||||
async def inc_crawl_exec_time(self, crawl_id, exec_time):
|
||||
async def inc_crawl_exec_time(self, crawl_id, exec_time, last_updated_time):
|
||||
"""increment exec time"""
|
||||
return await self.crawls.find_one_and_update(
|
||||
{"_id": crawl_id, "type": "crawl"},
|
||||
{"$inc": {"crawlExecSeconds": exec_time}},
|
||||
{"_id": crawl_id, "type": "crawl", "_lut": {"$ne": last_updated_time}},
|
||||
{
|
||||
"$inc": {"crawlExecSeconds": exec_time},
|
||||
"$set": {"_lut": last_updated_time},
|
||||
},
|
||||
)
|
||||
|
||||
async def get_crawl_state(self, crawl_id):
|
||||
|
@ -67,7 +67,10 @@ class K8sAPI:
|
||||
async def get_redis_client(self, redis_url):
|
||||
"""return redis client with correct params for one-time use"""
|
||||
return aioredis.from_url(
|
||||
redis_url, decode_responses=True, auto_close_connection_pool=True
|
||||
redis_url,
|
||||
decode_responses=True,
|
||||
auto_close_connection_pool=True,
|
||||
socket_timeout=20,
|
||||
)
|
||||
|
||||
# pylint: disable=too-many-arguments, too-many-locals
|
||||
|
@ -942,8 +942,10 @@ class BtrixOperator(K8sAPI):
|
||||
if status.anyCrawlPodNewExit:
|
||||
await self.log_crashes(crawl.id, status.podStatus, redis)
|
||||
|
||||
if not crawler_running:
|
||||
if not crawler_running or not redis:
|
||||
# if either crawler is not running or redis is inaccessible
|
||||
if self.should_mark_waiting(status.state, crawl.started):
|
||||
# mark as waiting (if already running)
|
||||
await self.set_state(
|
||||
"waiting_capacity",
|
||||
status,
|
||||
@ -951,25 +953,25 @@ class BtrixOperator(K8sAPI):
|
||||
allowed_from=RUNNING_AND_STARTING_ONLY,
|
||||
)
|
||||
|
||||
# for now, don't reset redis once inited
|
||||
if status.lastActiveTime and (
|
||||
(dt_now() - from_k8s_date(status.lastActiveTime)).total_seconds()
|
||||
> REDIS_TTL
|
||||
):
|
||||
print(
|
||||
f"Pausing redis, no running crawler pods for >{REDIS_TTL} secs"
|
||||
)
|
||||
status.initRedis = False
|
||||
if not crawler_running and redis:
|
||||
# if crawler running, but no redis, stop redis instance until crawler
|
||||
# is running
|
||||
if status.lastActiveTime and (
|
||||
(
|
||||
dt_now() - from_k8s_date(status.lastActiveTime)
|
||||
).total_seconds()
|
||||
> REDIS_TTL
|
||||
):
|
||||
print(
|
||||
f"Pausing redis, no running crawler pods for >{REDIS_TTL} secs"
|
||||
)
|
||||
status.initRedis = False
|
||||
elif crawler_running and not redis:
|
||||
# if crawler is running, but no redis, init redis
|
||||
status.initRedis = True
|
||||
status.lastActiveTime = to_k8s_date(dt_now())
|
||||
|
||||
# if still running, resync after N seconds
|
||||
status.resync_after = self.fast_retry_secs
|
||||
return status
|
||||
|
||||
status.initRedis = True
|
||||
status.lastActiveTime = to_k8s_date(dt_now())
|
||||
|
||||
if not redis:
|
||||
# if still running, resync after N seconds
|
||||
# if no crawler / no redis, resync after N seconds
|
||||
status.resync_after = self.fast_retry_secs
|
||||
return status
|
||||
|
||||
@ -1182,7 +1184,16 @@ class BtrixOperator(K8sAPI):
|
||||
max_duration = max(duration, max_duration)
|
||||
|
||||
if exec_time:
|
||||
await self.crawl_ops.inc_crawl_exec_time(crawl_id, exec_time)
|
||||
if not await self.crawl_ops.inc_crawl_exec_time(
|
||||
crawl_id, exec_time, status.lastUpdatedTime
|
||||
):
|
||||
# if lastUpdatedTime is same as previous, something is wrong, don't update!
|
||||
print(
|
||||
"Already updated for lastUpdatedTime, skipping execTime update!",
|
||||
flush=True,
|
||||
)
|
||||
return
|
||||
|
||||
await self.org_ops.inc_org_time_stats(oid, exec_time, True)
|
||||
status.crawlExecTime += exec_time
|
||||
status.elapsedCrawlTime += max_duration
|
||||
|
Loading…
Reference in New Issue
Block a user