Make the auto-resize crawler volume ratio adjustable (#2076)
Make the available/used storage ratio (for crawler volumes) adjustable. Auto-resize is disabled if the ratio is set to 0. Follow-up to #2023.
This commit is contained in:
		
							parent
							
								
									49ce894353
								
							
						
					
					
						commit
						1f919de294
					
				| @ -76,9 +76,6 @@ MEM_SOFT_OOM_THRESHOLD = 1.0 | ||||
| # set memory limit to this much of request for extra padding | ||||
| MEM_LIMIT_PADDING = 1.2 | ||||
| 
 | ||||
| # ensure available storage is at least this much times used storage | ||||
| AVAIL_STORAGE_RATIO = 2.5 | ||||
| 
 | ||||
| 
 | ||||
| # pylint: disable=too-many-public-methods, too-many-locals, too-many-branches, too-many-statements | ||||
| # pylint: disable=invalid-name, too-many-lines, too-many-return-statements | ||||
| @ -93,6 +90,8 @@ class CrawlOperator(BaseOperator): | ||||
|     fast_retry_secs: int | ||||
|     log_failed_crawl_lines: int | ||||
| 
 | ||||
|     min_avail_storage_ratio: float | ||||
| 
 | ||||
|     def __init__(self, *args): | ||||
|         super().__init__(*args) | ||||
| 
 | ||||
| @ -104,6 +103,11 @@ class CrawlOperator(BaseOperator): | ||||
| 
 | ||||
|         self.log_failed_crawl_lines = int(os.environ.get("LOG_FAILED_CRAWL_LINES") or 0) | ||||
| 
 | ||||
|         # ensure available storage is at least this many times the used storage | ||||
|         self.min_avail_storage_ratio = float( | ||||
|             os.environ.get("CRAWLER_MIN_AVAIL_STORAGE_RATIO") or 0 | ||||
|         ) | ||||
| 
 | ||||
|     def init_routes(self, app): | ||||
|         """init routes for this operator""" | ||||
| 
 | ||||
| @ -1336,12 +1340,15 @@ class CrawlOperator(BaseOperator): | ||||
| 
 | ||||
|                 if ( | ||||
|                     status.state == "running" | ||||
|                     and self.min_avail_storage_ratio | ||||
|                     and pod_info.allocated.storage | ||||
|                     and pod_info.used.storage * AVAIL_STORAGE_RATIO | ||||
|                     and pod_info.used.storage * self.min_avail_storage_ratio | ||||
|                     > pod_info.allocated.storage | ||||
|                 ): | ||||
|                     new_storage = math.ceil( | ||||
|                         pod_info.used.storage * AVAIL_STORAGE_RATIO / 1_000_000_000 | ||||
|                         pod_info.used.storage | ||||
|                         * self.min_avail_storage_ratio | ||||
|                         / 1_000_000_000 | ||||
|                     ) | ||||
|                     pod_info.newStorage = f"{new_storage}Gi" | ||||
|                     print( | ||||
|  | ||||
| @ -60,6 +60,8 @@ data: | ||||
| 
 | ||||
|   MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}" | ||||
| 
 | ||||
|   CRAWLER_MIN_AVAIL_STORAGE_RATIO: "{{ .Values.crawler_min_avail_storage_ratio }}" | ||||
| 
 | ||||
|   ENABLE_AUTO_RESIZE_CRAWLERS: "{{ .Values.enable_auto_resize_crawlers }}" | ||||
| 
 | ||||
|   BILLING_ENABLED: "{{ .Values.billing_enabled }}" | ||||
|  | ||||
| @ -75,7 +75,7 @@ allow_dupe_invites: "0" | ||||
| invite_expire_seconds: 604800 | ||||
| 
 | ||||
| # base url for replayweb.page | ||||
| rwp_base_url: "https://cdn.jsdelivr.net/npm/replaywebpage@1.8.15/" | ||||
| rwp_base_url: "https://cdn.jsdelivr.net/npm/replaywebpage@2.1.4/" | ||||
| 
 | ||||
| superuser: | ||||
|   # set this to enable a superuser admin | ||||
| @ -288,12 +288,19 @@ enable_auto_resize_crawlers: false | ||||
| # the workdir is used to store the browser profile data and other temporary files | ||||
| # profile_browser_workdir_size: 4Gi | ||||
| 
 | ||||
| 
 | ||||
| # Other Crawler Settings | ||||
| # ---------------------- | ||||
| 
 | ||||
| # minimum size allocated to each crawler | ||||
| # should be at least double crawl session size to ensure space for WACZ and browser profile data | ||||
| crawler_storage: "26Gi" | ||||
| crawler_storage: "25Gi" | ||||
| 
 | ||||
| 
 | ||||
| # if set, will ensure 'crawler_storage' is at least this many times the used storage | ||||
| # e.g. if the crawler session reaches 10GB and this value is 2.5, will attempt | ||||
| # to resize to at least 25GB. | ||||
| crawler_min_avail_storage_ratio: 2.5 | ||||
| 
 | ||||
| # max size at which crawler will commit current crawl session | ||||
| crawler_session_size_limit_bytes: "10000000000" | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user