Fix max pages quota setting and display (#2370)

- add ensure_quota_page_limit(), which caps the config page limit at the org's
max pages per crawl quota, if any (see the sketch after this list)
- apply the quota page limit to the config when: creating a new crawl, creating
the configmap
- don't store the quota page limit on new or existing crawl workflows
(remove setting it on new workflows) so that updated quotas take
effect on the next crawl
- frontend: correctly display page limit on workflow settings page from
org quotas, if any
- operator: get the org once per sync, in one place
- fixes #2369
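
A minimal standalone sketch of the clamping rule that ensure_quota_page_limit()
applies (simplified from the backend diff below; the real method mutates
crawlconfig.config.limit in place, and effective_page_limit is a hypothetical
helper name used only for illustration):

    def effective_page_limit(config_limit, max_pages_per_crawl):
        """Return the page limit to use for a crawl after applying the org quota."""
        # no quota set (None or 0): keep whatever limit the workflow specifies
        if not max_pages_per_crawl or max_pages_per_crawl <= 0:
            return config_limit
        # both a workflow limit and a quota are set: never exceed the quota
        if config_limit and config_limit > 0:
            return min(config_limit, max_pages_per_crawl)
        # no workflow limit: the quota becomes the effective limit
        return max_pages_per_crawl

Because the clamp is applied when each crawl starts rather than stored on the
workflow, raising or lowering the quota later changes the limit used by the next
crawl without editing the workflow.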

---------

Co-authored-by: sua yoo <sua@webrecorder.org>
Ilya Kreymer 2025-02-10 16:15:21 -08:00 committed by GitHub
parent aae1c02b3a
commit 001839a521
6 changed files with 86 additions and 68 deletions

View File

@@ -251,11 +251,6 @@ class CrawlConfigOps:
         crawlconfig.lastStartedBy = user.id
         crawlconfig.lastStartedByName = user.name

-        # Ensure page limit is below org maxPagesPerCall if set
-        max_pages = org.quotas.maxPagesPerCrawl or 0
-        if max_pages > 0:
-            crawlconfig.config.limit = max_pages
-
         # add CrawlConfig to DB here
         result = await self.crawl_configs.insert_one(crawlconfig.to_dict())
@@ -286,13 +281,30 @@ class CrawlConfigOps:
             execMinutesQuotaReached=exec_mins_quota_reached,
         )

+    def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization):
+        """ensure page limit is set to no greater than quota page limit, if any"""
+        if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0:
+            if crawlconfig.config.limit and crawlconfig.config.limit > 0:
+                crawlconfig.config.limit = min(
+                    org.quotas.maxPagesPerCrawl, crawlconfig.config.limit
+                )
+            else:
+                crawlconfig.config.limit = org.quotas.maxPagesPerCrawl
+
     async def add_new_crawl(
-        self, crawl_id: str, crawlconfig: CrawlConfig, user: User, manual: bool
+        self,
+        crawl_id: str,
+        crawlconfig: CrawlConfig,
+        user: User,
+        org: Organization,
+        manual: bool,
     ) -> None:
         """increments crawl count for this config and adds new crawl"""
         started = dt_now()

+        self.ensure_quota_page_limit(crawlconfig, org)
+
         inc = self.inc_crawl_count(crawlconfig.id)
         add = self.crawl_ops.add_new_crawl(
             crawl_id, crawlconfig, user.id, started, manual
@@ -892,7 +904,7 @@ class CrawlConfigOps:
                 storage_filename=storage_filename,
                 profile_filename=profile_filename or "",
             )

-            await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
+            await self.add_new_crawl(crawl_id, crawlconfig, user, org, manual=True)
             return crawl_id

         except Exception as exc:

View File

@@ -6,6 +6,7 @@ import math
 from pprint import pprint
 from typing import Optional, Any, Sequence
 from datetime import datetime
+from uuid import UUID

 import json
@@ -29,7 +30,6 @@ from btrixcloud.models import (
     CrawlFile,
     CrawlCompleteIn,
     StorageRef,
-    Organization,
 )

 from btrixcloud.utils import str_to_date, date_to_str, dt_now
@@ -145,11 +145,13 @@ class CrawlOperator(BaseOperator):
         params["userid"] = spec.get("userid", "")

         pods = data.children[POD]
+        org = await self.org_ops.get_org_by_id(UUID(oid))

         crawl = CrawlSpec(
             id=crawl_id,
             cid=cid,
             oid=oid,
+            org=org,
             storage=StorageRef(spec["storageName"]),
             crawler_channel=spec.get("crawlerChannel"),
             proxy_id=spec.get("proxyId"),
@@ -204,8 +206,6 @@ class CrawlOperator(BaseOperator):
             await self.k8s.delete_crawl_job(crawl.id)
             return {"status": status.dict(exclude_none=True), "children": []}

-        org = None
-
         # first, check storage quota, and fail immediately if quota reached
         if status.state in (
             "starting",
@@ -215,7 +215,6 @@ class CrawlOperator(BaseOperator):
                 # only check on very first run, before any pods/pvcs created
                 # for now, allow if crawl has already started (pods/pvcs created)
                 if not pods and not data.children[PVC]:
-                    org = await self.org_ops.get_org_by_id(crawl.oid)
                     if self.org_ops.storage_quota_reached(org):
                         await self.mark_finished(
                             crawl, status, "skipped_storage_quota_reached"
@@ -229,7 +228,7 @@ class CrawlOperator(BaseOperator):
                     return self._empty_response(status)

         if status.state in ("starting", "waiting_org_limit"):
-            if not await self.can_start_new(crawl, data, status, org):
+            if not await self.can_start_new(crawl, data, status):
                 return self._empty_response(status)

             await self.set_state(
@@ -382,8 +381,9 @@ class CrawlOperator(BaseOperator):
         crawlconfig = await self.crawl_config_ops.get_crawl_config(crawl.cid, crawl.oid)

-        raw_config = crawlconfig.get_raw_config()
+        self.crawl_config_ops.ensure_quota_page_limit(crawlconfig, crawl.org)
+        raw_config = crawlconfig.get_raw_config()
         raw_config["behaviors"] = self._filter_autoclick_behavior(
             raw_config["behaviors"], params["crawler_image"]
         )
@@ -637,14 +637,10 @@ class CrawlOperator(BaseOperator):
         crawl: CrawlSpec,
         data: MCSyncData,
         status: CrawlStatus,
-        org: Optional[Organization] = None,
     ):
         """return true if crawl can start, otherwise set crawl to 'queued' state
         until more crawls for org finish"""
-        if not org:
-            org = await self.org_ops.get_org_by_id(crawl.oid)
-
-        max_crawls = org.quotas.maxConcurrentCrawls or 0
+        max_crawls = crawl.org.quotas.maxConcurrentCrawls or 0

         if not max_crawls:
             return True
@@ -1238,15 +1234,13 @@ class CrawlOperator(BaseOperator):
             }
         return json.dumps(err)

-    async def add_file_to_crawl(self, cc_data, crawl, redis):
+    async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis):
         """Handle finished CrawlFile to db"""

         filecomplete = CrawlCompleteIn(**cc_data)

-        org = await self.org_ops.get_org_by_id(crawl.oid)
-
         filename = self.storage_ops.get_org_relative_path(
-            org, crawl.storage, filecomplete.filename
+            crawl.org, crawl.storage, filecomplete.filename
         )

         crawl_file = CrawlFile(
@@ -1299,7 +1293,7 @@ class CrawlOperator(BaseOperator):
                 return "size-limit"

         # gracefully stop crawl if current running crawl sizes reach storage quota
-        org = await self.org_ops.get_org_by_id(crawl.oid)
+        org = crawl.org
         if org.readOnly:
             return "stopped_org_readonly"

View File

@@ -112,6 +112,7 @@ class CronJobOperator(BaseOperator):
                 crawl_id,
                 crawlconfig,
                 user,
+                org,
                 manual=False,
             )
             print("Scheduled Crawl Created: " + crawl_id)

View File

@@ -5,7 +5,7 @@ from uuid import UUID
 from typing import Optional, DefaultDict, Literal, Annotated, Any
 from pydantic import BaseModel, Field
 from kubernetes.utils import parse_quantity

-from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES
+from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES, Organization

 BTRIX_API = "btrix.cloud/v1"
@@ -70,6 +70,7 @@ class CrawlSpec(BaseModel):
     id: str
     cid: UUID
     oid: UUID
+    org: Organization
     scale: int = 1
     storage: StorageRef
     started: str

View File

@@ -14,10 +14,10 @@ import sectionStrings from "@/strings/crawl-workflows/section";
 import type { Collection } from "@/types/collection";
 import { WorkflowScopeType } from "@/types/workflow";
 import { isApiError } from "@/utils/api";
-import { getAppSettings } from "@/utils/app";
 import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler";
 import { humanizeSchedule } from "@/utils/cron";
 import { pluralOf } from "@/utils/pluralize";
+import { getServerDefaults } from "@/utils/workflow";

 /**
  * Usage:
@@ -55,7 +55,7 @@ export class ConfigDetails extends BtrixElement {
   async connectedCallback() {
     super.connectedCallback();
-    void this.fetchAPIDefaults();
+    void this.fetchOrgDefaults();
     await this.fetchCollections();
   }
@@ -137,7 +137,9 @@ export class ConfigDetails extends BtrixElement {
     if (this.orgDefaults?.maxPagesPerCrawl) {
       return html`<span class="text-neutral-400">
-        ${this.localize.number(this.orgDefaults.maxPagesPerCrawl)}
+        ${this.orgDefaults.maxPagesPerCrawl === Infinity
+          ? msg("Unlimited")
+          : this.localize.number(this.orgDefaults.maxPagesPerCrawl)}
         ${pluralOf("pages", this.orgDefaults.maxPagesPerCrawl)}
         ${msg("(default)")}</span
       >`;
@@ -510,25 +512,29 @@ export class ConfigDetails extends BtrixElement {
     this.requestUpdate();
   }

-  private async fetchAPIDefaults() {
+  // TODO Consolidate with workflow-editor
+  private async fetchOrgDefaults() {
     try {
-      const settings = await getAppSettings();
-      const orgDefaults = {
+      const [serverDefaults, { quotas }] = await Promise.all([
+        getServerDefaults(),
+        this.api.fetch<{
+          quotas: { maxPagesPerCrawl?: number };
+        }>(`/orgs/${this.orgId}`),
+      ]);
+
+      const defaults = {
         ...this.orgDefaults,
+        ...serverDefaults,
       };
-      if (settings.defaultBehaviorTimeSeconds > 0) {
-        orgDefaults.behaviorTimeoutSeconds =
-          settings.defaultBehaviorTimeSeconds;
+
+      if (defaults.maxPagesPerCrawl && quotas.maxPagesPerCrawl) {
+        defaults.maxPagesPerCrawl = Math.min(
+          defaults.maxPagesPerCrawl,
+          quotas.maxPagesPerCrawl,
+        );
       }
-      if (settings.defaultPageLoadTimeSeconds > 0) {
-        orgDefaults.pageLoadTimeoutSeconds =
-          settings.defaultPageLoadTimeSeconds;
-      }
-      if (settings.maxPagesPerCrawl > 0) {
-        orgDefaults.maxPagesPerCrawl = settings.maxPagesPerCrawl;
-      }
-      this.orgDefaults = orgDefaults;
+
+      this.orgDefaults = defaults;
     } catch (e) {
       console.debug(e);
     }

View File

@@ -214,7 +214,7 @@ export class WorkflowEditor extends BtrixElement {
   private progressState?: ProgressState;

   @state()
-  private defaults: WorkflowDefaults = appDefaults;
+  private orgDefaults: WorkflowDefaults = appDefaults;

   @state()
   private formState = getDefaultFormState();
@@ -304,7 +304,9 @@ export class WorkflowEditor extends BtrixElement {
   connectedCallback(): void {
     this.initializeEditor();
     super.connectedCallback();
-    void this.fetchServerDefaults();
+
+    void this.fetchOrgDefaults();
+    void this.fetchTags();

     this.addEventListener(
       "btrix-intersect",
@@ -350,15 +352,6 @@ export class WorkflowEditor extends BtrixElement {
     if (this.progressState?.activeTab !== STEPS[0]) {
       void this.scrollToActivePanel();
     }
-
-    if (this.orgId) {
-      void this.fetchTags();
-      void this.fetchOrgQuotaDefaults();
-    }
-  }
-
-  private async fetchServerDefaults() {
-    this.defaults = await getServerDefaults();
   }

   private initializeEditor() {
@@ -1104,12 +1097,12 @@ https://archiveweb.page/images/${"logo.svg"}`}
               value=${this.formState.pageLimit || ""}
               min=${minPages}
               max=${ifDefined(
-                this.defaults.maxPagesPerCrawl &&
-                  this.defaults.maxPagesPerCrawl < Infinity
-                  ? this.defaults.maxPagesPerCrawl
+                this.orgDefaults.maxPagesPerCrawl &&
+                  this.orgDefaults.maxPagesPerCrawl < Infinity
+                  ? this.orgDefaults.maxPagesPerCrawl
                   : undefined,
               )}
-              placeholder=${defaultLabel(this.defaults.maxPagesPerCrawl)}
+              placeholder=${defaultLabel(this.orgDefaults.maxPagesPerCrawl)}
               @sl-input=${onInputMinMax}
             >
               <span slot="suffix">${msg("pages")}</span>
@@ -1152,7 +1145,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
               type="number"
               inputmode="numeric"
               label=${msg("Page Load Timeout")}
-              placeholder=${defaultLabel(this.defaults.pageLoadTimeoutSeconds)}
+              placeholder=${defaultLabel(this.orgDefaults.pageLoadTimeoutSeconds)}
               value=${ifDefined(this.formState.pageLoadTimeoutSeconds ?? undefined)}
               min="0"
               @sl-input=${onInputMinMax}
@@ -1181,7 +1174,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
               type="number"
               inputmode="numeric"
               label=${msg("Behavior Timeout")}
-              placeholder=${defaultLabel(this.defaults.behaviorTimeoutSeconds)}
+              placeholder=${defaultLabel(this.orgDefaults.behaviorTimeoutSeconds)}
               value=${ifDefined(this.formState.behaviorTimeoutSeconds ?? undefined)}
               min="0"
               @sl-input=${onInputMinMax}
@@ -1278,7 +1271,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
           >
             ${when(this.appState.settings?.numBrowsers, (numBrowsers) =>
               map(
-                range(this.defaults.maxScale),
+                range(this.orgDefaults.maxScale),
                 (i: number) =>
                   html` <sl-radio-button value="${i + 1}" size="small"
                     >${(i + 1) * numBrowsers}</sl-radio-button
@@ -2229,18 +2222,29 @@ https://archiveweb.page/images/${"logo.svg"}`}
     }
   }

-  private async fetchOrgQuotaDefaults() {
+  // TODO Consolidate with config-details
+  private async fetchOrgDefaults() {
     try {
-      const data = await this.api.fetch<{
-        quotas: { maxPagesPerCrawl?: number };
-      }>(`/orgs/${this.orgId}`);
-      const orgDefaults = {
-        ...this.defaults,
+      const [serverDefaults, { quotas }] = await Promise.all([
+        getServerDefaults(),
+        this.api.fetch<{
+          quotas: { maxPagesPerCrawl?: number };
+        }>(`/orgs/${this.orgId}`),
+      ]);
+
+      const defaults = {
+        ...this.orgDefaults,
+        ...serverDefaults,
      };
-      if (data.quotas.maxPagesPerCrawl && data.quotas.maxPagesPerCrawl > 0) {
-        orgDefaults.maxPagesPerCrawl = data.quotas.maxPagesPerCrawl;
+
+      if (defaults.maxPagesPerCrawl && quotas.maxPagesPerCrawl) {
+        defaults.maxPagesPerCrawl = Math.min(
+          defaults.maxPagesPerCrawl,
+          quotas.maxPagesPerCrawl,
+        );
       }
-      this.defaults = orgDefaults;
+
+      this.orgDefaults = defaults;
     } catch (e) {
       console.debug(e);
     }