From 001839a52196526ea13693bccf82acbf6ef84a74 Mon Sep 17 00:00:00 2001
From: Ilya Kreymer
Date: Mon, 10 Feb 2025 16:15:21 -0800
Subject: [PATCH] Fix max pages quota setting and display (#2370)

- add ensure_quota_page_limit() which caps the config page limit at the org
  max pages quota, if any
- set the page limit on the config when creating a new crawl and when
  creating the configmap
- don't store the quota page limit on new or existing crawl workflows (remove
  setting it on new workflows) so that updated quotas take effect for the
  next crawl
- frontend: correctly display the page limit on the workflow settings page
  from org quotas, if any
- operator: fetch the org once per sync in one place
- fixes #2369

---------

Co-authored-by: sua yoo
---
 backend/btrixcloud/crawlconfigs.py           | 26 ++++++---
 backend/btrixcloud/operator/crawls.py        | 26 ++++-----
 backend/btrixcloud/operator/cronjobs.py      |  1 +
 backend/btrixcloud/operator/models.py        |  3 +-
 frontend/src/components/ui/config-details.ts | 40 +++++++------
 .../crawl-workflows/workflow-editor.ts       | 58 ++++++++++---------
 6 files changed, 86 insertions(+), 68 deletions(-)

diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 86e0e0c2..e65dbe29 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -251,11 +251,6 @@ class CrawlConfigOps:
         crawlconfig.lastStartedBy = user.id
         crawlconfig.lastStartedByName = user.name
 
-        # Ensure page limit is below org maxPagesPerCall if set
-        max_pages = org.quotas.maxPagesPerCrawl or 0
-        if max_pages > 0:
-            crawlconfig.config.limit = max_pages
-
         # add CrawlConfig to DB here
         result = await self.crawl_configs.insert_one(crawlconfig.to_dict())
 
@@ -286,13 +281,30 @@ class CrawlConfigOps:
             execMinutesQuotaReached=exec_mins_quota_reached,
         )
 
+    def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization):
+        """ensure page limit is set to no greater than quota page limit, if any"""
+        if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0:
+            if crawlconfig.config.limit and crawlconfig.config.limit > 0:
+                crawlconfig.config.limit = min(
+                    org.quotas.maxPagesPerCrawl, crawlconfig.config.limit
+                )
+            else:
+                crawlconfig.config.limit = org.quotas.maxPagesPerCrawl
+
     async def add_new_crawl(
-        self, crawl_id: str, crawlconfig: CrawlConfig, user: User, manual: bool
+        self,
+        crawl_id: str,
+        crawlconfig: CrawlConfig,
+        user: User,
+        org: Organization,
+        manual: bool,
     ) -> None:
         """increments crawl count for this config and adds new crawl"""
         started = dt_now()
 
+        self.ensure_quota_page_limit(crawlconfig, org)
+
         inc = self.inc_crawl_count(crawlconfig.id)
         add = self.crawl_ops.add_new_crawl(
             crawl_id, crawlconfig, user.id, started, manual
         )
@@ -892,7 +904,7 @@ class CrawlConfigOps:
                 storage_filename=storage_filename,
                 profile_filename=profile_filename or "",
             )
-            await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
+            await self.add_new_crawl(crawl_id, crawlconfig, user, org, manual=True)
             return crawl_id
 
         except Exception as exc:
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
index 41e8f53e..91976cad 100644
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -6,6 +6,7 @@ import math
 from pprint import pprint
 from typing import Optional, Any, Sequence
 from datetime import datetime
+from uuid import UUID
 
 import json
 
@@ -29,7 +30,6 @@ from btrixcloud.models import (
     CrawlFile,
     CrawlCompleteIn,
     StorageRef,
-    Organization,
 )
 
 from btrixcloud.utils import str_to_date, date_to_str, dt_now
 
@@ -145,11 
+145,13 @@ class CrawlOperator(BaseOperator): params["userid"] = spec.get("userid", "") pods = data.children[POD] + org = await self.org_ops.get_org_by_id(UUID(oid)) crawl = CrawlSpec( id=crawl_id, cid=cid, oid=oid, + org=org, storage=StorageRef(spec["storageName"]), crawler_channel=spec.get("crawlerChannel"), proxy_id=spec.get("proxyId"), @@ -204,8 +206,6 @@ class CrawlOperator(BaseOperator): await self.k8s.delete_crawl_job(crawl.id) return {"status": status.dict(exclude_none=True), "children": []} - org = None - # first, check storage quota, and fail immediately if quota reached if status.state in ( "starting", @@ -215,7 +215,6 @@ class CrawlOperator(BaseOperator): # only check on very first run, before any pods/pvcs created # for now, allow if crawl has already started (pods/pvcs created) if not pods and not data.children[PVC]: - org = await self.org_ops.get_org_by_id(crawl.oid) if self.org_ops.storage_quota_reached(org): await self.mark_finished( crawl, status, "skipped_storage_quota_reached" @@ -229,7 +228,7 @@ class CrawlOperator(BaseOperator): return self._empty_response(status) if status.state in ("starting", "waiting_org_limit"): - if not await self.can_start_new(crawl, data, status, org): + if not await self.can_start_new(crawl, data, status): return self._empty_response(status) await self.set_state( @@ -382,8 +381,9 @@ class CrawlOperator(BaseOperator): crawlconfig = await self.crawl_config_ops.get_crawl_config(crawl.cid, crawl.oid) - raw_config = crawlconfig.get_raw_config() + self.crawl_config_ops.ensure_quota_page_limit(crawlconfig, crawl.org) + raw_config = crawlconfig.get_raw_config() raw_config["behaviors"] = self._filter_autoclick_behavior( raw_config["behaviors"], params["crawler_image"] ) @@ -637,14 +637,10 @@ class CrawlOperator(BaseOperator): crawl: CrawlSpec, data: MCSyncData, status: CrawlStatus, - org: Optional[Organization] = None, ): """return true if crawl can start, otherwise set crawl to 'queued' state until more crawls for org finish""" - if not org: - org = await self.org_ops.get_org_by_id(crawl.oid) - - max_crawls = org.quotas.maxConcurrentCrawls or 0 + max_crawls = crawl.org.quotas.maxConcurrentCrawls or 0 if not max_crawls: return True @@ -1238,15 +1234,13 @@ class CrawlOperator(BaseOperator): } return json.dumps(err) - async def add_file_to_crawl(self, cc_data, crawl, redis): + async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis): """Handle finished CrawlFile to db""" filecomplete = CrawlCompleteIn(**cc_data) - org = await self.org_ops.get_org_by_id(crawl.oid) - filename = self.storage_ops.get_org_relative_path( - org, crawl.storage, filecomplete.filename + crawl.org, crawl.storage, filecomplete.filename ) crawl_file = CrawlFile( @@ -1299,7 +1293,7 @@ class CrawlOperator(BaseOperator): return "size-limit" # gracefully stop crawl if current running crawl sizes reach storage quota - org = await self.org_ops.get_org_by_id(crawl.oid) + org = crawl.org if org.readOnly: return "stopped_org_readonly" diff --git a/backend/btrixcloud/operator/cronjobs.py b/backend/btrixcloud/operator/cronjobs.py index 018fae9a..929d7920 100644 --- a/backend/btrixcloud/operator/cronjobs.py +++ b/backend/btrixcloud/operator/cronjobs.py @@ -112,6 +112,7 @@ class CronJobOperator(BaseOperator): crawl_id, crawlconfig, user, + org, manual=False, ) print("Scheduled Crawl Created: " + crawl_id) diff --git a/backend/btrixcloud/operator/models.py b/backend/btrixcloud/operator/models.py index 439eeb26..9f511ee7 100644 --- a/backend/btrixcloud/operator/models.py +++ 
b/backend/btrixcloud/operator/models.py @@ -5,7 +5,7 @@ from uuid import UUID from typing import Optional, DefaultDict, Literal, Annotated, Any from pydantic import BaseModel, Field from kubernetes.utils import parse_quantity -from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES +from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES, Organization BTRIX_API = "btrix.cloud/v1" @@ -70,6 +70,7 @@ class CrawlSpec(BaseModel): id: str cid: UUID oid: UUID + org: Organization scale: int = 1 storage: StorageRef started: str diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts index 5eb2d7f6..8a230641 100644 --- a/frontend/src/components/ui/config-details.ts +++ b/frontend/src/components/ui/config-details.ts @@ -14,10 +14,10 @@ import sectionStrings from "@/strings/crawl-workflows/section"; import type { Collection } from "@/types/collection"; import { WorkflowScopeType } from "@/types/workflow"; import { isApiError } from "@/utils/api"; -import { getAppSettings } from "@/utils/app"; import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler"; import { humanizeSchedule } from "@/utils/cron"; import { pluralOf } from "@/utils/pluralize"; +import { getServerDefaults } from "@/utils/workflow"; /** * Usage: @@ -55,7 +55,7 @@ export class ConfigDetails extends BtrixElement { async connectedCallback() { super.connectedCallback(); - void this.fetchAPIDefaults(); + void this.fetchOrgDefaults(); await this.fetchCollections(); } @@ -137,7 +137,9 @@ export class ConfigDetails extends BtrixElement { if (this.orgDefaults?.maxPagesPerCrawl) { return html` - ${this.localize.number(this.orgDefaults.maxPagesPerCrawl)} + ${this.orgDefaults.maxPagesPerCrawl === Infinity + ? msg("Unlimited") + : this.localize.number(this.orgDefaults.maxPagesPerCrawl)} ${pluralOf("pages", this.orgDefaults.maxPagesPerCrawl)} ${msg("(default)")}`; @@ -510,25 +512,29 @@ export class ConfigDetails extends BtrixElement { this.requestUpdate(); } - private async fetchAPIDefaults() { + // TODO Consolidate with workflow-editor + private async fetchOrgDefaults() { try { - const settings = await getAppSettings(); - const orgDefaults = { + const [serverDefaults, { quotas }] = await Promise.all([ + getServerDefaults(), + this.api.fetch<{ + quotas: { maxPagesPerCrawl?: number }; + }>(`/orgs/${this.orgId}`), + ]); + + const defaults = { ...this.orgDefaults, + ...serverDefaults, }; - if (settings.defaultBehaviorTimeSeconds > 0) { - orgDefaults.behaviorTimeoutSeconds = - settings.defaultBehaviorTimeSeconds; + if (defaults.maxPagesPerCrawl && quotas.maxPagesPerCrawl) { + defaults.maxPagesPerCrawl = Math.min( + defaults.maxPagesPerCrawl, + quotas.maxPagesPerCrawl, + ); } - if (settings.defaultPageLoadTimeSeconds > 0) { - orgDefaults.pageLoadTimeoutSeconds = - settings.defaultPageLoadTimeSeconds; - } - if (settings.maxPagesPerCrawl > 0) { - orgDefaults.maxPagesPerCrawl = settings.maxPagesPerCrawl; - } - this.orgDefaults = orgDefaults; + + this.orgDefaults = defaults; } catch (e) { console.debug(e); } diff --git a/frontend/src/features/crawl-workflows/workflow-editor.ts b/frontend/src/features/crawl-workflows/workflow-editor.ts index 631e8e34..5c4d8bfa 100644 --- a/frontend/src/features/crawl-workflows/workflow-editor.ts +++ b/frontend/src/features/crawl-workflows/workflow-editor.ts @@ -214,7 +214,7 @@ export class WorkflowEditor extends BtrixElement { private progressState?: ProgressState; @state() - private defaults: WorkflowDefaults = appDefaults; + private orgDefaults: 
WorkflowDefaults = appDefaults; @state() private formState = getDefaultFormState(); @@ -304,7 +304,9 @@ export class WorkflowEditor extends BtrixElement { connectedCallback(): void { this.initializeEditor(); super.connectedCallback(); - void this.fetchServerDefaults(); + + void this.fetchOrgDefaults(); + void this.fetchTags(); this.addEventListener( "btrix-intersect", @@ -350,15 +352,6 @@ export class WorkflowEditor extends BtrixElement { if (this.progressState?.activeTab !== STEPS[0]) { void this.scrollToActivePanel(); } - - if (this.orgId) { - void this.fetchTags(); - void this.fetchOrgQuotaDefaults(); - } - } - - private async fetchServerDefaults() { - this.defaults = await getServerDefaults(); } private initializeEditor() { @@ -1104,12 +1097,12 @@ https://archiveweb.page/images/${"logo.svg"}`} value=${this.formState.pageLimit || ""} min=${minPages} max=${ifDefined( - this.defaults.maxPagesPerCrawl && - this.defaults.maxPagesPerCrawl < Infinity - ? this.defaults.maxPagesPerCrawl + this.orgDefaults.maxPagesPerCrawl && + this.orgDefaults.maxPagesPerCrawl < Infinity + ? this.orgDefaults.maxPagesPerCrawl : undefined, )} - placeholder=${defaultLabel(this.defaults.maxPagesPerCrawl)} + placeholder=${defaultLabel(this.orgDefaults.maxPagesPerCrawl)} @sl-input=${onInputMinMax} > ${msg("pages")} @@ -1152,7 +1145,7 @@ https://archiveweb.page/images/${"logo.svg"}`} type="number" inputmode="numeric" label=${msg("Page Load Timeout")} - placeholder=${defaultLabel(this.defaults.pageLoadTimeoutSeconds)} + placeholder=${defaultLabel(this.orgDefaults.pageLoadTimeoutSeconds)} value=${ifDefined(this.formState.pageLoadTimeoutSeconds ?? undefined)} min="0" @sl-input=${onInputMinMax} @@ -1181,7 +1174,7 @@ https://archiveweb.page/images/${"logo.svg"}`} type="number" inputmode="numeric" label=${msg("Behavior Timeout")} - placeholder=${defaultLabel(this.defaults.behaviorTimeoutSeconds)} + placeholder=${defaultLabel(this.orgDefaults.behaviorTimeoutSeconds)} value=${ifDefined(this.formState.behaviorTimeoutSeconds ?? undefined)} min="0" @sl-input=${onInputMinMax} @@ -1278,7 +1271,7 @@ https://archiveweb.page/images/${"logo.svg"}`} > ${when(this.appState.settings?.numBrowsers, (numBrowsers) => map( - range(this.defaults.maxScale), + range(this.orgDefaults.maxScale), (i: number) => html` ${(i + 1) * numBrowsers}(`/orgs/${this.orgId}`); - const orgDefaults = { - ...this.defaults, + const [serverDefaults, { quotas }] = await Promise.all([ + getServerDefaults(), + this.api.fetch<{ + quotas: { maxPagesPerCrawl?: number }; + }>(`/orgs/${this.orgId}`), + ]); + + const defaults = { + ...this.orgDefaults, + ...serverDefaults, }; - if (data.quotas.maxPagesPerCrawl && data.quotas.maxPagesPerCrawl > 0) { - orgDefaults.maxPagesPerCrawl = data.quotas.maxPagesPerCrawl; + + if (defaults.maxPagesPerCrawl && quotas.maxPagesPerCrawl) { + defaults.maxPagesPerCrawl = Math.min( + defaults.maxPagesPerCrawl, + quotas.maxPagesPerCrawl, + ); } - this.defaults = orgDefaults; + + this.orgDefaults = defaults; } catch (e) { console.debug(e); }
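
A minimal, standalone sketch of the clamping rule that ensure_quota_page_limit() applies when a crawl starts and when the configmap is created (illustrative only, not part of the applied diff; the helper name effective_page_limit and the sample numbers are assumptions, but the branching mirrors the Python added to crawlconfigs.py above):

from typing import Optional


def effective_page_limit(
    config_limit: Optional[int], max_pages_quota: Optional[int]
) -> Optional[int]:
    # Mirror of ensure_quota_page_limit(): if the org defines a max-pages quota,
    # the effective limit is the quota or the workflow's own limit, whichever is
    # smaller; with no quota, the workflow limit is left untouched.
    if max_pages_quota and max_pages_quota > 0:
        if config_limit and config_limit > 0:
            return min(max_pages_quota, config_limit)
        return max_pages_quota
    return config_limit


# Expected behavior for a few representative inputs:
assert effective_page_limit(None, 500) == 500   # no workflow limit: quota applies
assert effective_page_limit(1000, 500) == 500   # workflow limit clamped to quota
assert effective_page_limit(100, 500) == 100    # smaller workflow limit is kept
assert effective_page_limit(100, None) == 100   # no quota: limit unchanged

Because the clamp is now applied at crawl start (add_new_crawl) and at configmap creation rather than when the workflow is saved, raising or lowering the org quota takes effect on the next crawl without editing the stored workflow, which is what the frontend change reflects by displaying the quota-capped limit from org quotas.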