Attempt to auto-adjust storage if disk space is running out while a crawl is running (#2023)
Attempt to auto-adjust PVC storage when used storage (as reported in Redis by the crawler) * 2.5 > total allocated storage:
- causes the PVC to resize, if possible (not supported by all drivers)
- uses multiples of 1Gi, rounding up to the next GB
- AVAIL_STORAGE_RATIO is hard-coded to 2.5 for now, to account for 2x the space for the WACZ plus some extra headroom for fast-updating crawls

Some caveats:
- only works if the storageClass used for PVCs has `allowVolumeExpansion: true`; if not, it has no effect
- designed as a last-resort option: `crawl_storage` in the chart values and the crawler's `--sizeLimit` and `--diskUtilization` limits should generally mean this is never needed
- can be useful when a crawl is rapidly capturing a lot of content on a single page and there's no time to interrupt / restart, since the other limits apply only at page end
- may want to have the crawler update disk usage more frequently, not just at page end, to make this more effective
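The resize check itself is small; the sketch below restates the threshold and rounding logic described above as a standalone function. This is a minimal illustration, not the operator code: the function name `needed_pvc_size` and the example byte counts are hypothetical, while `AVAIL_STORAGE_RATIO` and the `math.ceil` rounding come from this change.

```python
import math
from typing import Optional

# ratio from this change: available storage should stay at least this many
# times the storage the crawler reports as used
AVAIL_STORAGE_RATIO = 2.5


def needed_pvc_size(used_bytes: int, allocated_bytes: int) -> Optional[str]:
    """Return a new PVC size string if the current allocation is too small,
    otherwise None. Mirrors the check added here: resize only when
    used * AVAIL_STORAGE_RATIO exceeds what is currently allocated."""
    if not allocated_bytes or used_bytes * AVAIL_STORAGE_RATIO <= allocated_bytes:
        return None
    # round up to the next whole gigabyte, as the operator does with math.ceil()
    new_gb = math.ceil(used_bytes * AVAIL_STORAGE_RATIO / 1_000_000_000)
    return f"{new_gb}Gi"


# Example: 3 GB used on a 5 GB volume -> needs 3 * 2.5 = 7.5 GB -> "8Gi"
print(needed_pvc_size(3_000_000_000, 5_000_000_000))  # "8Gi"
print(needed_pvc_size(1_000_000_000, 5_000_000_000))  # None, still enough headroom
```

The resulting size string is written back as the PVC's storage request; as noted in the caveats, it only takes effect if the storageClass allows volume expansion, otherwise the change has no effect.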
parent a1df689729
commit 95969ec747
@@ -2,6 +2,7 @@
 import traceback
 import os
+import math
 from pprint import pprint
 from typing import Optional, Any, Sequence
 from datetime import datetime
@@ -75,6 +76,9 @@ MEM_SOFT_OOM_THRESHOLD = 1.0
 # set memory limit to this much of request for extra padding
 MEM_LIMIT_PADDING = 1.2
 
+# ensure available storage is at least this much times used storage
+AVAIL_STORAGE_RATIO = 2.5
+
 
 # pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements
 # pylint: disable=invalid-name, too-many-lines, too-many-return-statements
@@ -388,6 +392,7 @@ class CrawlOperator(BaseOperator):
             params["memory_limit"] = float(params["memory"]) * MEM_LIMIT_PADDING
         else:
             params["memory_limit"] = self.k8s.max_crawler_memory_size
+        params["storage"] = pod_info.newStorage or params.get("crawler_storage")
         params["workers"] = params.get(worker_field) or 1
         params["do_restart"] = False
         if has_pod:
@@ -481,8 +486,12 @@ class CrawlOperator(BaseOperator):
 
         pvc = children[PVC].get(name)
         if pvc:
-            src = pvc["spec"]["resources"]["requests"]
-            resources.storage = int(parse_quantity(src.get("storage")))
+            try:
+                src = pvc["status"]["capacity"]
+                resources.storage = int(parse_quantity(src.get("storage")))
+            # pylint: disable=bare-except
+            except:
+                pass
 
     async def set_state(
         self,
@@ -1325,6 +1334,20 @@ class CrawlOperator(BaseOperator):
                 pod_info = status.podStatus[key]
                 pod_info.used.storage = value
+
+                if (
+                    status.state == "running"
+                    and pod_info.allocated.storage
+                    and pod_info.used.storage * AVAIL_STORAGE_RATIO
+                    > pod_info.allocated.storage
+                ):
+                    new_storage = math.ceil(
+                        pod_info.used.storage * AVAIL_STORAGE_RATIO / 1_000_000_000
+                    )
+                    pod_info.newStorage = f"{new_storage}Gi"
+                    print(
+                        f"Attempting to adjust storage to {pod_info.newStorage} for {key}"
+                    )
 
         if not status.stopReason:
             status.stopReason = await self.is_crawl_stopping(crawl, status, data)
             status.stopping = status.stopReason is not None
@@ -132,6 +132,7 @@ class PodInfo(BaseModel):
 
     newCpu: Optional[int] = None
     newMemory: Optional[int] = None
+    newStorage: Optional[str] = None
     signalAtMem: Optional[int] = None
 
     evicted: Optional[bool] = False
@@ -17,7 +17,7 @@ spec:
 
   resources:
     requests:
-      storage: {{ crawler_storage }}
+      storage: {{ storage }}
 
   {% if volume_storage_class %}
   storageClassName: {{ volume_storage_class }}