load handling: scale up redis only when crawler pods running (#1009)
Operator: modified init behavior to only load redis when at least one crawler pod is available:
- Waits for at least one crawler pod to be available before starting the redis pod, to avoid the situation where many crawler pods are pending but redis pods are still running.
- The redis statefulset starts at a scale of 0.
- Once a crawler pod becomes available, the redis statefulset is scaled to 1 (via the `initRedis==true` status).
- The crawl remains in the 'starting' or 'waiting_capacity' state, with no redis pod running, until a crawler pod becomes available.
- Set to the 'running' state only after redis and at least one crawler pod are available.
- If no crawler pods are available after running, or if stuck in 'starting' for more than 60 seconds, switch to the 'waiting_capacity' state.
- When switching to 'waiting_capacity', also scale redis down to 0, wait for a crawler pod to become available, and only then scale redis back up to 1 and return to 'running'.

Other tweaks:
- Add new status field 'initRedis', defaulting to false, not displayed.
- Crawler pod: consider the 'ContainerCreating' state as available, since the container will not be blocked by resource limits.
- Add a resync after 3 seconds when waiting for a crawler or redis pod to become available, configurable via 'operator_fast_resync_secs'.
- set_state: if not updating the state, ensure the state reflects the actual value in the db.
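A minimal sketch of the new scale-up decision, condensed from the operator diff below; the Status dataclass and the pod_available / sync_redis_scale helpers are illustrative stand-ins for this summary, not the actual operator classes or methods:

from dataclasses import dataclass
from typing import Optional


@dataclass
class Status:
    # mirrors the new CrawlStatus fields this change relies on
    initRedis: bool = False
    resync_after: Optional[int] = None


def pod_available(pod: dict) -> bool:
    """A crawler pod counts as available if Running, or Pending with ContainerCreating."""
    status = pod["status"]
    if status["phase"] == "Running":
        return True
    if status["phase"] == "Pending":
        containers = status.get("containerStatuses") or []
        # ContainerCreating means the pod was scheduled and is no longer blocked
        # by resource limits, so treat it as available
        return bool(
            containers
            and containers[0]["state"].get("waiting", {}).get("reason")
            == "ContainerCreating"
        )
    return False


def sync_redis_scale(status: Status, pods: dict, fast_retry_secs: int = 3) -> int:
    """Return the redis statefulset scale (0 or 1) for this reconcile pass."""
    if not any(pod_available(p) for p in pods.values()):
        # no crawler pod yet: keep redis scaled down and resync shortly
        status.initRedis = False
        status.resync_after = fast_retry_secs
    else:
        # at least one crawler pod is running (or creating): bring redis up
        status.initRedis = True
        status.resync_after = None
    return 1 if status.initRedis else 0

In the actual operator this same check also drives the 'starting' / 'waiting_capacity' / 'running' transitions, as shown in sync_crawl_state in the diff below.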
parent 608a744aaf
commit 4bea7565bc
@@ -2,6 +2,7 @@

 import asyncio
 import traceback
+import os
 from typing import Optional

 from datetime import datetime
@@ -44,6 +45,9 @@ CJS = "CrawlJob.btrix.cloud/v1"

 DEFAULT_TTL = 30

+# time in seconds before a crawl is deemed 'waiting' instead of 'starting'
+STARTING_TIME_SECS = 60
+

 # ============================================================================
 class DeleteCrawlException(Exception):
@@ -95,7 +99,10 @@ class CrawlStatus(BaseModel):
     filesAddedSize: int = 0
     finished: Optional[str] = None
     stopping: bool = False
-    # forceRestart: Optional[str]
+    initRedis: bool = False
+
+    # don't include in status, use by metacontroller
+    resync_after: Optional[int] = None


 # ============================================================================
@@ -117,6 +124,8 @@ class BtrixOperator(K8sAPI):

         self.done_key = "crawls-done"

+        self.fast_retry_secs = int(os.environ.get("FAST_RETRY_SECS") or 0)
+
         with open(self.config_file, encoding="utf-8") as fh_config:
             self.shared_params = yaml.safe_load(fh_config)

@@ -232,6 +241,7 @@ class BtrixOperator(K8sAPI):
         params["force_restart"] = spec.get("forceRestart")

         params["redis_url"] = redis_url
+        params["redis_scale"] = 1 if status.initRedis else 0

         children = self.load_from_yaml("crawler.yaml", params)
         children.extend(self.load_from_yaml("redis.yaml", params))
@@ -249,7 +259,11 @@ class BtrixOperator(K8sAPI):
                 "spec"
             ]["volumeClaimTemplates"]

-        return {"status": status.dict(exclude_none=True), "children": children}
+        return {
+            "status": status.dict(exclude_none=True, exclude={"resync_after": True}),
+            "children": children,
+            "resyncAfterSeconds": status.resync_after,
+        }

     async def set_state(self, state, status, crawl_id, allowed_from, **kwargs):
         """set status state and update db, if changed
@@ -286,12 +300,15 @@
             return True

         # get actual crawl state
-        new_state, finished = await get_crawl_state(self.crawls, crawl_id)
-        if new_state:
-            status.state = state
+        actual_state, finished = await get_crawl_state(self.crawls, crawl_id)
+        if actual_state:
+            status.state = actual_state
             if finished:
                 status.finished = to_k8s_date(finished)

+        if actual_state != state:
+            print(f"state mismatch, actual state {actual_state}, requested {state}")
+
         if status.state != state:
             print(
                 f"Not setting state: {status.state} -> {state}, {crawl_id} not allowed"
@@ -416,7 +433,7 @@
     def _done_response(self, status, finalized=False):
         """done response for removing crawl"""
         return {
-            "status": status.dict(exclude_none=True),
+            "status": status.dict(exclude_none=True, exclude={"resync_after": True}),
             "children": [],
             "finalized": finalized,
         }
@@ -450,12 +467,34 @@

     async def sync_crawl_state(self, redis_url, crawl, status, pods):
         """sync crawl state for running crawl"""
-        redis = await self._get_redis(redis_url)
-        if not redis:
+        # check if at least one pod started running
+        if not await self.check_if_pods_running(pods):
+            if self.should_mark_waiting(status.state, crawl.started):
+                await self.set_state(
+                    "waiting_capacity",
+                    status,
+                    crawl.id,
+                    allowed_from=["starting", "running"],
+                )
+
+            status.initRedis = False
+
+            # resync after N seconds
+            status.resync_after = self.fast_retry_secs
             return status

         # if not prev_start_time:
         #     await redis.set("start_time", str(self.started))
+        status.initRedis = True
+
+        redis = await self._get_redis(redis_url)
+        if not redis:
+            # resync after N seconds, until redis is inited
+            status.resync_after = self.fast_retry_secs
+            return status
+
+        # set state to running (if not already)
+        await self.set_state(
+            "running", status, crawl.id, allowed_from=["starting", "waiting_capacity"]
+        )

         try:
             file_done = await redis.lpop(self.done_key)
@@ -475,7 +514,7 @@
             status.filesAddedSize = int(await redis.get("filesAddedSize") or 0)

             # update stats and get status
-            return await self.update_crawl_state(redis, crawl, status, pods)
+            return await self.update_crawl_state(redis, crawl, status)

         # pylint: disable=broad-except
         except Exception as exc:
@@ -487,8 +526,21 @@
         """check if at least one crawler pod has started"""
         try:
             for pod in pods.values():
-                if pod["status"]["phase"] == "Running":
+                status = pod["status"]
+                if status["phase"] == "Running":
                     return True

+                # consider 'ContainerCreating' as running
+                if status["phase"] == "Pending":
+                    if (
+                        "containerStatuses" in status
+                        and status["containerStatuses"][0]["state"]["waiting"]["reason"]
+                        == "ContainerCreating"
+                    ):
+                        return True
+
+                # print("non-running pod status", pod["status"], flush=True)
+
         # pylint: disable=bare-except
         except:
             # assume no valid pod found
@@ -496,6 +548,17 @@

         return False

+    def should_mark_waiting(self, state, started):
+        """Should the crawl be marked as waiting for capacity?"""
+        if state == "running":
+            return True
+
+        if state == "starting":
+            started = from_k8s_date(started)
+            return (datetime.utcnow() - started).total_seconds() > STARTING_TIME_SECS
+
+        return False
+
     async def add_file_to_crawl(self, cc_data, crawl, redis):
         """Handle finished CrawlFile to db"""

@@ -523,7 +586,7 @@
         return True

     # pylint: disable=too-many-branches
-    async def update_crawl_state(self, redis, crawl, status, pods):
+    async def update_crawl_state(self, redis, crawl, status):
         """update crawl state and check if crawl is now done"""
         results = await redis.hvals(f"{crawl.id}:status")
         stats = await get_redis_crawl_stats(redis, crawl.id)
@@ -542,23 +605,6 @@
             # backwards compatibility with older crawler
             await redis.set("crawl-stop", "1")

-        # check if at least one pod started running
-        # otherwise, mark as 'waiting' and return
-        if not await self.check_if_pods_running(pods):
-            await self.set_state(
-                "waiting_capacity",
-                status,
-                crawl.id,
-                allowed_from=["starting", "running"],
-            )
-
-            return status
-
-        # set state to running (if not already)
-        await self.set_state(
-            "running", status, crawl.id, allowed_from=["starting", "waiting_capacity"]
-        )
-
         # update status
         status.pagesDone = stats["done"]
         status.pagesFound = stats["found"]
@@ -18,7 +18,7 @@ spec:
       role: redis

   serviceName: redis-{{ id }}
-  replicas: 1
+  replicas: {{ redis_scale }}
   podManagementPolicy: Parallel

   # not yet supported
@@ -64,6 +64,8 @@ data:

   PRESIGN_DURATION_MINUTES: "{{ .Values.storage_presign_duration_minutes | default 60 }}"

+  FAST_RETRY_SECS: "{{ .Values.operator_fast_resync_secs | default 3 }}"
+

 ---
 apiVersion: v1
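For reference, the new resync interval can be overridden from the chart values; a sketch, where the key name comes from the configmap change above and the 5-second value is just an example override of the default 3:

# values.yaml (sketch): how quickly the operator resyncs while waiting for a
# crawler or redis pod; surfaced to the operator as FAST_RETRY_SECS
operator_fast_resync_secs: 5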