From 95969ec747e9af9173968c0c4278faf0e14bde33 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Mon, 26 Aug 2024 14:19:20 -0700 Subject: [PATCH] Attempt to auto-adjust storage if usage is running out while crawl is running (#2023) Attempt to auto-adjust PVC storage if: - used storage (as reported in Redis by the crawler) * 2.5 > total_storage - will cause PVC to resize, if possible (not supported by all drivers) - requests a whole number of Gi, computed by rounding used storage * 2.5 up to the next GB (10^9 bytes) - AVAIL_STORAGE_RATIO hard-coded to 2.5 for now, to account for 2x space for WACZ plus change for fast-updating crawls Some caveats: - only works if the storageClass used for PVCs has `allowVolumeExpansion: true`; if not, it will have no effect - designed as a last-resort option: the `crawler_storage` in values and `--sizeLimit` and `--diskUtilization` should generally result in this not being needed. - can be useful in cases where a crawl is rapidly capturing a lot of content in one page, and there's no time to interrupt / restart, since the other limits apply only at page end. - May want to have crawler update the disk usage more frequently, not just at page end, to make this more effective. 
--- backend/btrixcloud/operator/crawls.py | 27 +++++++++++++++++++++++++-- backend/btrixcloud/operator/models.py | 1 + chart/app-templates/crawler.yaml | 2 +- 3 files changed, 27 insertions(+), 3 deletions(-) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index d970a563..115acb24 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -2,6 +2,7 @@ import traceback import os +import math from pprint import pprint from typing import Optional, Any, Sequence from datetime import datetime @@ -75,6 +76,9 @@ MEM_SOFT_OOM_THRESHOLD = 1.0 # set memory limit to this much of request for extra padding MEM_LIMIT_PADDING = 1.2 +# ensure available storage is at least this much times used storage +AVAIL_STORAGE_RATIO = 2.5 + # pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements # pylint: disable=invalid-name, too-many-lines, too-many-return-statements @@ -388,6 +392,7 @@ class CrawlOperator(BaseOperator): params["memory_limit"] = float(params["memory"]) * MEM_LIMIT_PADDING else: params["memory_limit"] = self.k8s.max_crawler_memory_size + params["storage"] = pod_info.newStorage or params.get("crawler_storage") params["workers"] = params.get(worker_field) or 1 params["do_restart"] = False if has_pod: @@ -481,8 +486,12 @@ class CrawlOperator(BaseOperator): pvc = children[PVC].get(name) if pvc: - src = pvc["spec"]["resources"]["requests"] - resources.storage = int(parse_quantity(src.get("storage"))) + try: + src = pvc["status"]["capacity"] + resources.storage = int(parse_quantity(src.get("storage"))) + # pylint: disable=bare-except + except: + pass async def set_state( self, @@ -1325,6 +1334,20 @@ class CrawlOperator(BaseOperator): pod_info = status.podStatus[key] pod_info.used.storage = value + if ( + status.state == "running" + and pod_info.allocated.storage + and pod_info.used.storage * AVAIL_STORAGE_RATIO + > pod_info.allocated.storage + ): + 
new_storage = math.ceil( + pod_info.used.storage * AVAIL_STORAGE_RATIO / 1_000_000_000 + ) + pod_info.newStorage = f"{new_storage}Gi" + print( + f"Attempting to adjust storage to {pod_info.newStorage} for {key}" + ) + if not status.stopReason: status.stopReason = await self.is_crawl_stopping(crawl, status, data) status.stopping = status.stopReason is not None diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index d01cb630..0a31fc79 100644 --- a/backend/btrixcloud/operator/models.py +++ b/backend/btrixcloud/operator/models.py @@ -132,6 +132,7 @@ class PodInfo(BaseModel): newCpu: Optional[int] = None newMemory: Optional[int] = None + newStorage: Optional[str] = None signalAtMem: Optional[int] = None evicted: Optional[bool] = False diff --git a/chart/app-templates/crawler.yaml b/chart/app-templates/crawler.yaml index dead7f3b..81836be5 100644 --- a/chart/app-templates/crawler.yaml +++ b/chart/app-templates/crawler.yaml @@ -17,7 +17,7 @@ spec: resources: requests: - storage: {{ crawler_storage }} + storage: {{ storage }} {% if volume_storage_class %} storageClassName: {{ volume_storage_class }}