better handling of failed redis connection + exec time updates (#1520)
This PR addresses a possible failure when the Redis pod is inaccessible from the crawler pod.

- Ensure the crawl is set to 'waiting_capacity' if either no crawler pods are available or there is no Redis pod. Previously, a missing/inaccessible Redis did not result in 'waiting_capacity' as long as crawler pods were available.
- Rework the logic: if no crawler is running but Redis is, shut down Redis after >60 seconds; if the crawler is running but Redis is not, init (or reinit) Redis.
- Track 'lastUpdatedTime' in the db when incrementing exec time, to avoid double counting when 'lastUpdatedTime' has not changed, e.g. if an operator sync fails.
- Add a Redis timeout of 20 seconds to avoid timing out operator responses; if the Redis connection takes too long, assume Redis is unavailable.
parent: d1156b0145
commit: 0653657637
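As a rough illustration of the reworked logic in the first two bullets, here is a hedged, standalone sketch that condenses the crawler/redis decision into a pure function. The `plan` helper and the action names are made up for this example; the 60-second threshold comes from the description above.

```python
# Sketch only: condenses the behavior described above into a pure function.
# The helper and action names are illustrative, not part of the PR.
def plan(crawler_running: bool, redis_available: bool,
         idle_secs: float, redis_ttl: int = 60) -> list[str]:
    """Return the operator actions implied by the current pod state."""
    actions = []
    if not crawler_running or not redis_available:
        actions.append("mark_waiting_capacity")  # crawl shows as waiting (if it was running)
        if not crawler_running and redis_available and idle_secs > redis_ttl:
            actions.append("pause_redis")        # no crawler pods for > TTL, stop redis
        elif crawler_running and not redis_available:
            actions.append("init_redis")         # crawler is up, (re)start redis
        actions.append("resync_soon")            # retry after a short interval
    return actions


print(plan(crawler_running=False, redis_available=True, idle_secs=90))
# ['mark_waiting_capacity', 'pause_redis', 'resync_soon']
print(plan(crawler_running=True, redis_available=False, idle_secs=0))
# ['mark_waiting_capacity', 'init_redis', 'resync_soon']
```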
@@ -451,11 +451,14 @@ class CrawlOps(BaseCrawlOps):
         query = {"_id": crawl_id, "type": "crawl", "state": "running"}
         return await self.crawls.find_one_and_update(query, {"$set": {"stats": stats}})
 
-    async def inc_crawl_exec_time(self, crawl_id, exec_time):
+    async def inc_crawl_exec_time(self, crawl_id, exec_time, last_updated_time):
         """increment exec time"""
         return await self.crawls.find_one_and_update(
-            {"_id": crawl_id, "type": "crawl"},
-            {"$inc": {"crawlExecSeconds": exec_time}},
+            {"_id": crawl_id, "type": "crawl", "_lut": {"$ne": last_updated_time}},
+            {
+                "$inc": {"crawlExecSeconds": exec_time},
+                "$set": {"_lut": last_updated_time},
+            },
         )
 
     async def get_crawl_state(self, crawl_id):
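For illustration, a minimal standalone sketch of the guarded update above, assuming motor (the async MongoDB driver) and a local MongoDB. The `demo` database, the `c1` crawl id, and the timestamp are made up; the point is that replaying the same `last_updated_time` matches nothing and returns None, so the caller can skip double counting.

```python
import asyncio

from motor.motor_asyncio import AsyncIOMotorClient


async def inc_exec_time(crawls, crawl_id, exec_time, last_updated_time):
    # Matches only while _lut differs from last_updated_time, so replaying the
    # same sync interval is a no-op and returns None instead of the document.
    return await crawls.find_one_and_update(
        {"_id": crawl_id, "type": "crawl", "_lut": {"$ne": last_updated_time}},
        {
            "$inc": {"crawlExecSeconds": exec_time},
            "$set": {"_lut": last_updated_time},
        },
    )


async def main():
    crawls = AsyncIOMotorClient("mongodb://localhost:27017")["demo"]["crawls"]
    await crawls.insert_one({"_id": "c1", "type": "crawl", "crawlExecSeconds": 0})

    ts = "2024-01-01T00:00:30Z"
    print(await inc_exec_time(crawls, "c1", 30, ts))  # document -> 30s counted
    print(await inc_exec_time(crawls, "c1", 30, ts))  # None -> duplicate skipped


asyncio.run(main())
```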
@@ -67,7 +67,10 @@ class K8sAPI:
     async def get_redis_client(self, redis_url):
         """return redis client with correct params for one-time use"""
         return aioredis.from_url(
-            redis_url, decode_responses=True, auto_close_connection_pool=True
+            redis_url,
+            decode_responses=True,
+            auto_close_connection_pool=True,
+            socket_timeout=20,
         )
 
     # pylint: disable=too-many-arguments, too-many-locals
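On the caller side, the 20-second socket timeout means a slow or unreachable Redis fails fast instead of stalling the operator's sync response. A hedged sketch of that pattern using redis-py's asyncio client (imported as `aioredis`, as in the diff); `try_get_redis`, the URL handling, and the broad exception catch are illustrative, not the operator's actual helper.

```python
# Sketch only: treat any connection error or timeout as "no redis".
# Assumes redis-py >= 4.2, which provides the redis.asyncio client.
from redis import asyncio as aioredis


async def try_get_redis(redis_url: str):
    redis = aioredis.from_url(
        redis_url,
        decode_responses=True,
        socket_timeout=20,  # give up within 20s rather than hanging the sync
    )
    try:
        await redis.ping()
        return redis
    except Exception as exc:  # pylint: disable=broad-except
        print(f"Redis at {redis_url} unavailable, skipping: {exc}", flush=True)
        await redis.close()
        return None
```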
@@ -942,8 +942,10 @@ class BtrixOperator(K8sAPI):
         if status.anyCrawlPodNewExit:
             await self.log_crashes(crawl.id, status.podStatus, redis)
 
-        if not crawler_running:
+        if not crawler_running or not redis:
+            # if either crawler is not running or redis is inaccessible
             if self.should_mark_waiting(status.state, crawl.started):
+                # mark as waiting (if already running)
                 await self.set_state(
                     "waiting_capacity",
                     status,
@@ -951,25 +953,25 @@ class BtrixOperator(K8sAPI):
                     allowed_from=RUNNING_AND_STARTING_ONLY,
                 )
 
-            # for now, don't reset redis once inited
-            if status.lastActiveTime and (
-                (dt_now() - from_k8s_date(status.lastActiveTime)).total_seconds()
-                > REDIS_TTL
-            ):
-                print(
-                    f"Pausing redis, no running crawler pods for >{REDIS_TTL} secs"
-                )
-                status.initRedis = False
-
-            # if still running, resync after N seconds
-            status.resync_after = self.fast_retry_secs
-            return status
-
-        status.initRedis = True
-        status.lastActiveTime = to_k8s_date(dt_now())
-
-        if not redis:
-            # if still running, resync after N seconds
+            if not crawler_running and redis:
+                # if crawler is not running, but redis is, pause redis until the
+                # crawler is running again
+                if status.lastActiveTime and (
+                    (
+                        dt_now() - from_k8s_date(status.lastActiveTime)
+                    ).total_seconds()
+                    > REDIS_TTL
+                ):
+                    print(
+                        f"Pausing redis, no running crawler pods for >{REDIS_TTL} secs"
+                    )
+                    status.initRedis = False
+            elif crawler_running and not redis:
+                # if crawler is running, but no redis, init redis
+                status.initRedis = True
+                status.lastActiveTime = to_k8s_date(dt_now())
+
+            # if no crawler / no redis, resync after N seconds
             status.resync_after = self.fast_retry_secs
             return status
 
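The pause decision above hinges on how long `lastActiveTime` has been stale. A small standalone sketch of that idle check using plain datetimes (the codebase's `from_k8s_date`/`to_k8s_date` helpers convert RFC 3339 strings; `REDIS_TTL = 60` here is taken from the >60-second threshold in the description):

```python
# Sketch only: illustrates the "pause redis after idle > TTL" check with
# plain datetime objects instead of the codebase's k8s date helpers.
from datetime import datetime, timedelta, timezone

REDIS_TTL = 60  # assumed value, per the >60s threshold in the description


def redis_idle_too_long(last_active: datetime, now: datetime) -> bool:
    return (now - last_active).total_seconds() > REDIS_TTL


now = datetime.now(timezone.utc)
print(redis_idle_too_long(now - timedelta(seconds=30), now))   # False: keep redis
print(redis_idle_too_long(now - timedelta(seconds=120), now))  # True: pause redis
```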
@@ -1182,7 +1184,16 @@ class BtrixOperator(K8sAPI):
             max_duration = max(duration, max_duration)
 
         if exec_time:
-            await self.crawl_ops.inc_crawl_exec_time(crawl_id, exec_time)
+            if not await self.crawl_ops.inc_crawl_exec_time(
+                crawl_id, exec_time, status.lastUpdatedTime
+            ):
+                # if lastUpdatedTime is same as previous, something is wrong, don't update!
+                print(
+                    "Already updated for lastUpdatedTime, skipping execTime update!",
+                    flush=True,
+                )
+                return
+
             await self.org_ops.inc_org_time_stats(oid, exec_time, True)
             status.crawlExecTime += exec_time
             status.elapsedCrawlTime += max_duration