Attempt to auto-adjust storage if usage is running out while crawl is running (#2023)

Attempt to auto-adjust PVC storage while a crawl is running:
- triggered when used storage (as reported in redis by the crawler) * 2.5 >
total_storage
- will cause the PVC to resize, if possible (not supported by all drivers)
- the new size is a whole number of Gi: used storage * 2.5 (in bytes) is
divided by 1,000,000,000 and rounded up
- AVAIL_STORAGE_RATIO is hard-coded to 2.5 for now, to account for 2x space
for the WACZ plus a little extra for fast-updating crawls

Some caveats:
- only works if the storageClass used for PVCs has
`allowVolumeExpansion: true`; if not, it will have no effect
- designed as a last-resort option: the `crawler_storage` setting in values,
together with `--sizeLimit` and `--diskUtilization`, should generally make
this unnecessary.
- can be useful in cases where a crawl is rapidly capturing a lot of
content in one page, and there's no time to interrupt / restart, since
the other limits apply only at page end.
- may want to have the crawler update the disk usage more frequently, not
just at page end, to make this more effective.
This commit is contained in:
Ilya Kreymer 2024-08-26 14:19:20 -07:00 committed by GitHub
parent a1df689729
commit 95969ec747
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 27 additions and 3 deletions

View File

@ -2,6 +2,7 @@
import traceback import traceback
import os import os
import math
from pprint import pprint from pprint import pprint
from typing import Optional, Any, Sequence from typing import Optional, Any, Sequence
from datetime import datetime from datetime import datetime
@ -75,6 +76,9 @@ MEM_SOFT_OOM_THRESHOLD = 1.0
# set memory limit to this much of request for extra padding # set memory limit to this much of request for extra padding
MEM_LIMIT_PADDING = 1.2 MEM_LIMIT_PADDING = 1.2
# ensure available storage is at least this much times used storage
AVAIL_STORAGE_RATIO = 2.5
# pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements # pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements
# pylint: disable=invalid-name, too-many-lines, too-many-return-statements # pylint: disable=invalid-name, too-many-lines, too-many-return-statements
@ -388,6 +392,7 @@ class CrawlOperator(BaseOperator):
params["memory_limit"] = float(params["memory"]) * MEM_LIMIT_PADDING params["memory_limit"] = float(params["memory"]) * MEM_LIMIT_PADDING
else: else:
params["memory_limit"] = self.k8s.max_crawler_memory_size params["memory_limit"] = self.k8s.max_crawler_memory_size
params["storage"] = pod_info.newStorage or params.get("crawler_storage")
params["workers"] = params.get(worker_field) or 1 params["workers"] = params.get(worker_field) or 1
params["do_restart"] = False params["do_restart"] = False
if has_pod: if has_pod:
@ -481,8 +486,12 @@ class CrawlOperator(BaseOperator):
pvc = children[PVC].get(name) pvc = children[PVC].get(name)
if pvc: if pvc:
src = pvc["spec"]["resources"]["requests"] try:
resources.storage = int(parse_quantity(src.get("storage"))) src = pvc["status"]["capacity"]
resources.storage = int(parse_quantity(src.get("storage")))
# pylint: disable=bare-except
except:
pass
async def set_state( async def set_state(
self, self,
@ -1325,6 +1334,20 @@ class CrawlOperator(BaseOperator):
pod_info = status.podStatus[key] pod_info = status.podStatus[key]
pod_info.used.storage = value pod_info.used.storage = value
if (
status.state == "running"
and pod_info.allocated.storage
and pod_info.used.storage * AVAIL_STORAGE_RATIO
> pod_info.allocated.storage
):
new_storage = math.ceil(
pod_info.used.storage * AVAIL_STORAGE_RATIO / 1_000_000_000
)
pod_info.newStorage = f"{new_storage}Gi"
print(
f"Attempting to adjust storage to {pod_info.newStorage} for {key}"
)
if not status.stopReason: if not status.stopReason:
status.stopReason = await self.is_crawl_stopping(crawl, status, data) status.stopReason = await self.is_crawl_stopping(crawl, status, data)
status.stopping = status.stopReason is not None status.stopping = status.stopReason is not None

View File

@ -132,6 +132,7 @@ class PodInfo(BaseModel):
newCpu: Optional[int] = None newCpu: Optional[int] = None
newMemory: Optional[int] = None newMemory: Optional[int] = None
newStorage: Optional[str] = None
signalAtMem: Optional[int] = None signalAtMem: Optional[int] = None
evicted: Optional[bool] = False evicted: Optional[bool] = False

View File

@ -17,7 +17,7 @@ spec:
resources: resources:
requests: requests:
storage: {{ crawler_storage }} storage: {{ storage }}
{% if volume_storage_class %} {% if volume_storage_class %}
storageClassName: {{ volume_storage_class }} storageClassName: {{ volume_storage_class }}