Attempt to auto-adjust storage if available space is running out while crawl is running (#2023)
Attempt to auto-adjust PVC storage if:
- used storage (as reported in redis by the crawler) * 2.5 > total_storage
- this will cause the PVC to resize, if possible (not supported by all drivers)
- uses multiples of 1Gi, rounding up to the next GB
- AVAIL_STORAGE_RATIO is hard-coded to 2.5 for now, to account for 2x space for the WACZ plus some headroom for fast-updating crawls

Some caveats:
- only works if the storageClass used for PVCs has `allowVolumeExpansion: true`; otherwise, it has no effect
- designed as a last-resort option: the `crawler_storage` setting in values, together with `--sizeLimit` and `--diskUtilization`, should generally make this unnecessary
- can be useful when a crawl is rapidly capturing a lot of content on one page and there's no time to interrupt / restart, since the other limits apply only at page end
- may want to have the crawler update the disk usage more frequently, not just at page end, to make this more effective
This commit is contained in:
parent
a1df689729
commit
95969ec747
@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
import traceback
|
import traceback
|
||||||
import os
|
import os
|
||||||
|
import math
|
||||||
from pprint import pprint
|
from pprint import pprint
|
||||||
from typing import Optional, Any, Sequence
|
from typing import Optional, Any, Sequence
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@ -75,6 +76,9 @@ MEM_SOFT_OOM_THRESHOLD = 1.0
|
|||||||
# set memory limit to this much of request for extra padding
|
# set memory limit to this much of request for extra padding
|
||||||
MEM_LIMIT_PADDING = 1.2
|
MEM_LIMIT_PADDING = 1.2
|
||||||
|
|
||||||
|
# ensure available storage is at least this much times used storage
|
||||||
|
AVAIL_STORAGE_RATIO = 2.5
|
||||||
|
|
||||||
|
|
||||||
# pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements
|
# pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements
|
||||||
# pylint: disable=invalid-name, too-many-lines, too-many-return-statements
|
# pylint: disable=invalid-name, too-many-lines, too-many-return-statements
|
||||||
@ -388,6 +392,7 @@ class CrawlOperator(BaseOperator):
|
|||||||
params["memory_limit"] = float(params["memory"]) * MEM_LIMIT_PADDING
|
params["memory_limit"] = float(params["memory"]) * MEM_LIMIT_PADDING
|
||||||
else:
|
else:
|
||||||
params["memory_limit"] = self.k8s.max_crawler_memory_size
|
params["memory_limit"] = self.k8s.max_crawler_memory_size
|
||||||
|
params["storage"] = pod_info.newStorage or params.get("crawler_storage")
|
||||||
params["workers"] = params.get(worker_field) or 1
|
params["workers"] = params.get(worker_field) or 1
|
||||||
params["do_restart"] = False
|
params["do_restart"] = False
|
||||||
if has_pod:
|
if has_pod:
|
||||||
@ -481,8 +486,12 @@ class CrawlOperator(BaseOperator):
|
|||||||
|
|
||||||
pvc = children[PVC].get(name)
|
pvc = children[PVC].get(name)
|
||||||
if pvc:
|
if pvc:
|
||||||
src = pvc["spec"]["resources"]["requests"]
|
try:
|
||||||
resources.storage = int(parse_quantity(src.get("storage")))
|
src = pvc["status"]["capacity"]
|
||||||
|
resources.storage = int(parse_quantity(src.get("storage")))
|
||||||
|
# pylint: disable=bare-except
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
async def set_state(
|
async def set_state(
|
||||||
self,
|
self,
|
||||||
@ -1325,6 +1334,20 @@ class CrawlOperator(BaseOperator):
|
|||||||
pod_info = status.podStatus[key]
|
pod_info = status.podStatus[key]
|
||||||
pod_info.used.storage = value
|
pod_info.used.storage = value
|
||||||
|
|
||||||
|
if (
|
||||||
|
status.state == "running"
|
||||||
|
and pod_info.allocated.storage
|
||||||
|
and pod_info.used.storage * AVAIL_STORAGE_RATIO
|
||||||
|
> pod_info.allocated.storage
|
||||||
|
):
|
||||||
|
new_storage = math.ceil(
|
||||||
|
pod_info.used.storage * AVAIL_STORAGE_RATIO / 1_000_000_000
|
||||||
|
)
|
||||||
|
pod_info.newStorage = f"{new_storage}Gi"
|
||||||
|
print(
|
||||||
|
f"Attempting to adjust storage to {pod_info.newStorage} for {key}"
|
||||||
|
)
|
||||||
|
|
||||||
if not status.stopReason:
|
if not status.stopReason:
|
||||||
status.stopReason = await self.is_crawl_stopping(crawl, status, data)
|
status.stopReason = await self.is_crawl_stopping(crawl, status, data)
|
||||||
status.stopping = status.stopReason is not None
|
status.stopping = status.stopReason is not None
|
||||||
|
|||||||
@ -132,6 +132,7 @@ class PodInfo(BaseModel):
|
|||||||
|
|
||||||
newCpu: Optional[int] = None
|
newCpu: Optional[int] = None
|
||||||
newMemory: Optional[int] = None
|
newMemory: Optional[int] = None
|
||||||
|
newStorage: Optional[str] = None
|
||||||
signalAtMem: Optional[int] = None
|
signalAtMem: Optional[int] = None
|
||||||
|
|
||||||
evicted: Optional[bool] = False
|
evicted: Optional[bool] = False
|
||||||
|
|||||||
@ -17,7 +17,7 @@ spec:
|
|||||||
|
|
||||||
resources:
|
resources:
|
||||||
requests:
|
requests:
|
||||||
storage: {{ crawler_storage }}
|
storage: {{ storage }}
|
||||||
|
|
||||||
{% if volume_storage_class %}
|
{% if volume_storage_class %}
|
||||||
storageClassName: {{ volume_storage_class }}
|
storageClassName: {{ volume_storage_class }}
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user