Attempt to auto-adjust storage if space is running out while a crawl is running (#2023)

Attempt to auto-adjust PVC storage if:
- used storage (as reported in redis by the crawler) * 2.5 > total_storage
- will cause the PVC to resize, if possible (not supported by all storage drivers)
- uses multiples of 1Gi, rounding up to the next GiB
- AVAIL_STORAGE_RATIO is hard-coded to 2.5 for now, to account for 2x the space
needed for the WACZ plus some headroom for fast-updating crawls (see the sketch below)
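
The threshold math amounts to the following standalone sketch; the helper name
and example numbers are illustrative only, the real logic lives in the operator
diff further down:

import math
from typing import Optional

AVAIL_STORAGE_RATIO = 2.5  # keep allocated storage at least 2.5x used storage

def needed_pvc_size(used_bytes: int, allocated_bytes: int) -> Optional[str]:
    """Return a new PVC size (e.g. "8Gi") if the pod has outgrown its volume,
    or None if the current allocation is still large enough."""
    if used_bytes * AVAIL_STORAGE_RATIO <= allocated_bytes:
        return None
    # round up to a whole number of gigabytes, mirroring the
    # 1_000_000_000 divisor used in the operator code below
    new_gb = math.ceil(used_bytes * AVAIL_STORAGE_RATIO / 1_000_000_000)
    return f"{new_gb}Gi"

# example: 3.2 GB used on a 4Gi volume -> needs 8 GB -> resize request of "8Gi"
print(needed_pvc_size(3_200_000_000, 4 * 1024**3))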

Some caveats:
- only works if the storageClass used for the PVCs has
`allowVolumeExpansion: true`; if not, it will have no effect (see the
snippet after this list for a quick way to check)
- designed as a last-resort option: the `crawler_storage` value in the chart
and the crawler's `--sizeLimit` and `--diskUtilization` limits should
generally mean this is not needed.
- can be useful in cases where a crawl is rapidly capturing a lot of
content on a single page and there's no time to interrupt / restart, since
the other limits apply only at page end.
- may want to have the crawler update its disk usage more frequently, not
just at page end, to make this more effective.
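
For the first caveat, a quick way to see whether a cluster's storage classes
allow expansion, assuming the official `kubernetes` Python client is installed
(an admin-side check, not part of the operator or this commit):

from kubernetes import client, config

# load credentials from the local kubeconfig
# (use config.load_incluster_config() when running inside the cluster)
config.load_kube_config()

storage_api = client.StorageV1Api()
for sc in storage_api.list_storage_class().items:
    print(f"{sc.metadata.name}: allowVolumeExpansion={bool(sc.allow_volume_expansion)}")

This mirrors what `kubectl get storageclass` reports; the resize attempted here
only succeeds on classes where expansion is allowed and the underlying driver
supports it.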
Ilya Kreymer 2024-08-26 14:19:20 -07:00 committed by GitHub
parent a1df689729
commit 95969ec747
3 changed files with 27 additions and 3 deletions


@@ -2,6 +2,7 @@
 import traceback
 import os
+import math
 from pprint import pprint
 from typing import Optional, Any, Sequence
 from datetime import datetime
@@ -75,6 +76,9 @@ MEM_SOFT_OOM_THRESHOLD = 1.0
 # set memory limit to this much of request for extra padding
 MEM_LIMIT_PADDING = 1.2
 
+# ensure available storage is at least this much times used storage
+AVAIL_STORAGE_RATIO = 2.5
+
 # pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements
 # pylint: disable=invalid-name, too-many-lines, too-many-return-statements
@@ -388,6 +392,7 @@ class CrawlOperator(BaseOperator):
             params["memory_limit"] = float(params["memory"]) * MEM_LIMIT_PADDING
         else:
             params["memory_limit"] = self.k8s.max_crawler_memory_size
+        params["storage"] = pod_info.newStorage or params.get("crawler_storage")
         params["workers"] = params.get(worker_field) or 1
         params["do_restart"] = False
         if has_pod:
@@ -481,8 +486,12 @@
         pvc = children[PVC].get(name)
         if pvc:
-            src = pvc["spec"]["resources"]["requests"]
-            resources.storage = int(parse_quantity(src.get("storage")))
+            try:
+                src = pvc["status"]["capacity"]
+                resources.storage = int(parse_quantity(src.get("storage")))
+            # pylint: disable=bare-except
+            except:
+                pass
 
     async def set_state(
         self,
@@ -1325,6 +1334,20 @@ class CrawlOperator(BaseOperator):
             pod_info = status.podStatus[key]
             pod_info.used.storage = value
 
+            if (
+                status.state == "running"
+                and pod_info.allocated.storage
+                and pod_info.used.storage * AVAIL_STORAGE_RATIO
+                > pod_info.allocated.storage
+            ):
+                new_storage = math.ceil(
+                    pod_info.used.storage * AVAIL_STORAGE_RATIO / 1_000_000_000
+                )
+                pod_info.newStorage = f"{new_storage}Gi"
+                print(
+                    f"Attempting to adjust storage to {pod_info.newStorage} for {key}"
+                )
+
         if not status.stopReason:
             status.stopReason = await self.is_crawl_stopping(crawl, status, data)
             status.stopping = status.stopReason is not None


@@ -132,6 +132,7 @@ class PodInfo(BaseModel):
     newCpu: Optional[int] = None
     newMemory: Optional[int] = None
+    newStorage: Optional[str] = None
 
     signalAtMem: Optional[int] = None
 
    evicted: Optional[bool] = False


@@ -17,7 +17,7 @@ spec:
   resources:
     requests:
-      storage: {{ crawler_storage }}
+      storage: {{ storage }}
 
   {% if volume_storage_class %}
   storageClassName: {{ volume_storage_class }}