better handling of failed redis connection + exec time updates (#1520)

This PR addresses a possible failure when the Redis pod is inaccessible
from the crawler pod.
- Ensure the crawl is set to 'waiting_capacity' if either no crawler
pods are available or the Redis pod is missing. Previously, a missing or
inaccessible Redis pod would not result in 'waiting_capacity' if crawler
pods were available.
- Rework the logic: if there is no crawler but Redis is still running
after >60 seconds, shut down Redis; if the crawler is running but Redis
is not, init (or reinit) Redis.
- Track 'lastUpdatedTime' in the db when incrementing exec time to avoid
double counting if 'lastUpdatedTime' has not changed, e.g. if an operator
sync fails (see the sketch after this list).
- Add a Redis socket timeout of 20 seconds to avoid timing out operator
responses; if the Redis connection takes too long, assume it is unavailable.
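
The exec-time change hinges on a conditional update: the increment only applies when the stored marker differs from the current sync's lastUpdatedTime. Below is a minimal, standalone sketch of that guard, assuming a MongoDB instance reachable via pymongo; the database and collection names are illustrative, not the exact Browsertrix setup.

from pymongo import MongoClient

# Minimal sketch (not the Browsertrix code): a find_one_and_update with a "$ne"
# guard makes the exec-time increment idempotent per lastUpdatedTime.
# Assumes a local MongoDB instance; db/collection names are illustrative.
client = MongoClient("mongodb://localhost:27017")
crawls = client.example_db.crawls

crawls.insert_one({"_id": "crawl-1", "type": "crawl", "crawlExecSeconds": 0})

def inc_crawl_exec_time(crawl_id, exec_time, last_updated_time):
    """Increment exec time only if this last_updated_time was not already applied."""
    return crawls.find_one_and_update(
        # matches nothing if "_lut" already equals last_updated_time
        {"_id": crawl_id, "type": "crawl", "_lut": {"$ne": last_updated_time}},
        {
            "$inc": {"crawlExecSeconds": exec_time},
            "$set": {"_lut": last_updated_time},
        },
    )

# first sync at a given lastUpdatedTime: the increment applies
assert inc_crawl_exec_time("crawl-1", 30, "2024-02-09T16:00:00Z") is not None
# repeated sync with the same lastUpdatedTime: no match, returns None,
# so the caller can skip the update and avoid double counting
assert inc_crawl_exec_time("crawl-1", 30, "2024-02-09T16:00:00Z") is None
assert crawls.find_one({"_id": "crawl-1"})["crawlExecSeconds"] == 30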
Commit 0653657637 (parent d1156b0145), authored by Ilya Kreymer on 2024-02-09 16:14:29 -08:00, committed by GitHub.
3 changed files with 41 additions and 24 deletions


@@ -451,11 +451,14 @@ class CrawlOps(BaseCrawlOps):
         query = {"_id": crawl_id, "type": "crawl", "state": "running"}
         return await self.crawls.find_one_and_update(query, {"$set": {"stats": stats}})

-    async def inc_crawl_exec_time(self, crawl_id, exec_time):
+    async def inc_crawl_exec_time(self, crawl_id, exec_time, last_updated_time):
         """increment exec time"""
         return await self.crawls.find_one_and_update(
-            {"_id": crawl_id, "type": "crawl"},
-            {"$inc": {"crawlExecSeconds": exec_time}},
+            {"_id": crawl_id, "type": "crawl", "_lut": {"$ne": last_updated_time}},
+            {
+                "$inc": {"crawlExecSeconds": exec_time},
+                "$set": {"_lut": last_updated_time},
+            },
         )

     async def get_crawl_state(self, crawl_id):
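
Note: with the added '_lut' guard, find_one_and_update matches no document when '_lut' already equals last_updated_time, so it returns None; the operator change further below checks this return value to detect and skip a duplicate update.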


@@ -67,7 +67,10 @@ class K8sAPI:
     async def get_redis_client(self, redis_url):
         """return redis client with correct params for one-time use"""
         return aioredis.from_url(
-            redis_url, decode_responses=True, auto_close_connection_pool=True
+            redis_url,
+            decode_responses=True,
+            auto_close_connection_pool=True,
+            socket_timeout=20,
         )

     # pylint: disable=too-many-arguments, too-many-locals
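
With the socket timeout set, a slow or unreachable Redis pod surfaces as an exception rather than a hung operator response. Below is a minimal sketch, assuming redis-py's asyncio API, of how a caller might treat that as "Redis unavailable"; the helper name and the ping-based check are illustrative assumptions, not the PR's code.

from redis import asyncio as aioredis

async def get_redis_or_none(redis_url):
    """Return a connected Redis client, or None if Redis cannot be reached in time."""
    client = aioredis.from_url(
        redis_url,
        decode_responses=True,
        socket_timeout=20,  # fail instead of hanging indefinitely on a dead pod
    )
    try:
        # cheap round trip; raises (e.g. TimeoutError, ConnectionError) if unreachable
        await client.ping()
        return client
    except Exception:
        await client.close()
        return None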


@@ -942,8 +942,10 @@ class BtrixOperator(K8sAPI):
         if status.anyCrawlPodNewExit:
             await self.log_crashes(crawl.id, status.podStatus, redis)

-        if not crawler_running:
+        if not crawler_running or not redis:
+            # if either crawler is not running or redis is inaccessible
             if self.should_mark_waiting(status.state, crawl.started):
+                # mark as waiting (if already running)
                 await self.set_state(
                     "waiting_capacity",
                     status,
@@ -951,25 +953,25 @@ class BtrixOperator(K8sAPI):
                     allowed_from=RUNNING_AND_STARTING_ONLY,
                 )

-            # for now, don't reset redis once inited
-            if status.lastActiveTime and (
-                (dt_now() - from_k8s_date(status.lastActiveTime)).total_seconds()
-                > REDIS_TTL
-            ):
-                print(
-                    f"Pausing redis, no running crawler pods for >{REDIS_TTL} secs"
-                )
-                status.initRedis = False
+            if not crawler_running and redis:
+                # if no crawler pods, but redis is still running, stop redis
+                # until a crawler pod is running again
+                if status.lastActiveTime and (
+                    (
+                        dt_now() - from_k8s_date(status.lastActiveTime)
+                    ).total_seconds()
+                    > REDIS_TTL
+                ):
+                    print(
+                        f"Pausing redis, no running crawler pods for >{REDIS_TTL} secs"
+                    )
+                    status.initRedis = False

-            # if still running, resync after N seconds
-            status.resync_after = self.fast_retry_secs
-            return status
-
-        status.initRedis = True
-        status.lastActiveTime = to_k8s_date(dt_now())
+            elif crawler_running and not redis:
+                # if crawler is running, but no redis, init redis
+                status.initRedis = True
+                status.lastActiveTime = to_k8s_date(dt_now())

-        if not redis:
-            # if still running, resync after N seconds
+            # if no crawler / no redis, resync after N seconds
             status.resync_after = self.fast_retry_secs
             return status
@@ -1182,7 +1184,16 @@ class BtrixOperator(K8sAPI):
                 max_duration = max(duration, max_duration)

         if exec_time:
-            await self.crawl_ops.inc_crawl_exec_time(crawl_id, exec_time)
+            if not await self.crawl_ops.inc_crawl_exec_time(
+                crawl_id, exec_time, status.lastUpdatedTime
+            ):
+                # if lastUpdatedTime is same as previous, something is wrong, don't update!
+                print(
+                    "Already updated for lastUpdatedTime, skipping execTime update!",
+                    flush=True,
+                )
+                return
+
             await self.org_ops.inc_org_time_stats(oid, exec_time, True)
             status.crawlExecTime += exec_time
             status.elapsedCrawlTime += max_duration