Fix max pages quota setting and display (#2370)

- add ensure_quota_page_limit(), which caps the config's page limit at the org's
max pages quota, if one is set (see the sketch below)
- apply the page limit to the config when creating a new crawl and when
creating the configmap
- don't persist the quota page limit on new or existing crawl workflows
(remove setting it on new workflows) so that updated quotas take effect on
the next crawl
- frontend: correctly display the page limit on the workflow settings page
from org quotas, if any
- operator: fetch the org once per sync, in one place
- fixes #2369
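
The clamp itself is small. A minimal, self-contained sketch of the behavior
follows; the dataclass stand-ins are simplified assumptions for illustration
only, while the real CrawlConfig and Organization models in btrixcloud.models
carry many more fields:

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class Quotas:
    maxPagesPerCrawl: Optional[int] = None

@dataclass
class RawConfig:
    limit: Optional[int] = None

@dataclass
class Organization:
    quotas: Quotas = field(default_factory=Quotas)

@dataclass
class CrawlConfig:
    config: RawConfig = field(default_factory=RawConfig)

def ensure_quota_page_limit(crawlconfig: CrawlConfig, org: Organization) -> None:
    """Cap the workflow's page limit at the org quota, if a quota is set."""
    quota = org.quotas.maxPagesPerCrawl
    if quota and quota > 0:
        if crawlconfig.config.limit and crawlconfig.config.limit > 0:
            # both are set: keep the smaller of the two
            crawlconfig.config.limit = min(quota, crawlconfig.config.limit)
        else:
            # the workflow has no explicit limit: fall back to the quota
            crawlconfig.config.limit = quota

# a workflow limit above the quota is clamped down; a missing limit becomes the quota
org = Organization(quotas=Quotas(maxPagesPerCrawl=100))
cfg = CrawlConfig(config=RawConfig(limit=500))
ensure_quota_page_limit(cfg, org)
assert cfg.config.limit == 100

Because the clamp runs at launch time (when the new crawl is added and when the
configmap is built) instead of being written back to the stored workflow, a
later change to the org quota takes effect on the next crawl without rewriting
existing workflow documents.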

---------

Co-authored-by: sua yoo <sua@webrecorder.org>
Ilya Kreymer 2025-02-10 16:15:21 -08:00 committed by GitHub
parent aae1c02b3a
commit 001839a521
6 changed files with 86 additions and 68 deletions


@ -251,11 +251,6 @@ class CrawlConfigOps:
crawlconfig.lastStartedBy = user.id
crawlconfig.lastStartedByName = user.name
# Ensure page limit is below org maxPagesPerCall if set
max_pages = org.quotas.maxPagesPerCrawl or 0
if max_pages > 0:
crawlconfig.config.limit = max_pages
# add CrawlConfig to DB here
result = await self.crawl_configs.insert_one(crawlconfig.to_dict())
@ -286,13 +281,30 @@ class CrawlConfigOps:
execMinutesQuotaReached=exec_mins_quota_reached,
)
def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization):
    """ensure page limit is set to no greater than quota page limit, if any"""
    if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0:
        if crawlconfig.config.limit and crawlconfig.config.limit > 0:
            crawlconfig.config.limit = min(
                org.quotas.maxPagesPerCrawl, crawlconfig.config.limit
            )
        else:
            crawlconfig.config.limit = org.quotas.maxPagesPerCrawl
async def add_new_crawl(
self, crawl_id: str, crawlconfig: CrawlConfig, user: User, manual: bool
self,
crawl_id: str,
crawlconfig: CrawlConfig,
user: User,
org: Organization,
manual: bool,
) -> None:
"""increments crawl count for this config and adds new crawl"""
started = dt_now()
self.ensure_quota_page_limit(crawlconfig, org)
inc = self.inc_crawl_count(crawlconfig.id)
add = self.crawl_ops.add_new_crawl(
crawl_id, crawlconfig, user.id, started, manual
@ -892,7 +904,7 @@ class CrawlConfigOps:
storage_filename=storage_filename,
profile_filename=profile_filename or "",
)
await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
await self.add_new_crawl(crawl_id, crawlconfig, user, org, manual=True)
return crawl_id
except Exception as exc:


@ -6,6 +6,7 @@ import math
from pprint import pprint
from typing import Optional, Any, Sequence
from datetime import datetime
from uuid import UUID
import json
@ -29,7 +30,6 @@ from btrixcloud.models import (
CrawlFile,
CrawlCompleteIn,
StorageRef,
Organization,
)
from btrixcloud.utils import str_to_date, date_to_str, dt_now
@ -145,11 +145,13 @@ class CrawlOperator(BaseOperator):
params["userid"] = spec.get("userid", "")
pods = data.children[POD]
org = await self.org_ops.get_org_by_id(UUID(oid))
crawl = CrawlSpec(
id=crawl_id,
cid=cid,
oid=oid,
org=org,
storage=StorageRef(spec["storageName"]),
crawler_channel=spec.get("crawlerChannel"),
proxy_id=spec.get("proxyId"),
@ -204,8 +206,6 @@ class CrawlOperator(BaseOperator):
await self.k8s.delete_crawl_job(crawl.id)
return {"status": status.dict(exclude_none=True), "children": []}
org = None
# first, check storage quota, and fail immediately if quota reached
if status.state in (
"starting",
@ -215,7 +215,6 @@ class CrawlOperator(BaseOperator):
# only check on very first run, before any pods/pvcs created
# for now, allow if crawl has already started (pods/pvcs created)
if not pods and not data.children[PVC]:
org = await self.org_ops.get_org_by_id(crawl.oid)
if self.org_ops.storage_quota_reached(org):
await self.mark_finished(
crawl, status, "skipped_storage_quota_reached"
@ -229,7 +228,7 @@ class CrawlOperator(BaseOperator):
return self._empty_response(status)
if status.state in ("starting", "waiting_org_limit"):
if not await self.can_start_new(crawl, data, status, org):
if not await self.can_start_new(crawl, data, status):
return self._empty_response(status)
await self.set_state(
@ -382,8 +381,9 @@ class CrawlOperator(BaseOperator):
crawlconfig = await self.crawl_config_ops.get_crawl_config(crawl.cid, crawl.oid)
raw_config = crawlconfig.get_raw_config()
self.crawl_config_ops.ensure_quota_page_limit(crawlconfig, crawl.org)
raw_config = crawlconfig.get_raw_config()
raw_config["behaviors"] = self._filter_autoclick_behavior(
raw_config["behaviors"], params["crawler_image"]
)
@ -637,14 +637,10 @@ class CrawlOperator(BaseOperator):
crawl: CrawlSpec,
data: MCSyncData,
status: CrawlStatus,
org: Optional[Organization] = None,
):
"""return true if crawl can start, otherwise set crawl to 'queued' state
until more crawls for org finish"""
if not org:
org = await self.org_ops.get_org_by_id(crawl.oid)
max_crawls = org.quotas.maxConcurrentCrawls or 0
max_crawls = crawl.org.quotas.maxConcurrentCrawls or 0
if not max_crawls:
return True
@ -1238,15 +1234,13 @@ class CrawlOperator(BaseOperator):
}
return json.dumps(err)
async def add_file_to_crawl(self, cc_data, crawl, redis):
async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis):
"""Handle finished CrawlFile to db"""
filecomplete = CrawlCompleteIn(**cc_data)
org = await self.org_ops.get_org_by_id(crawl.oid)
filename = self.storage_ops.get_org_relative_path(
org, crawl.storage, filecomplete.filename
crawl.org, crawl.storage, filecomplete.filename
)
crawl_file = CrawlFile(
@ -1299,7 +1293,7 @@ class CrawlOperator(BaseOperator):
return "size-limit"
# gracefully stop crawl if current running crawl sizes reach storage quota
org = await self.org_ops.get_org_by_id(crawl.oid)
org = crawl.org
if org.readOnly:
return "stopped_org_readonly"


@ -112,6 +112,7 @@ class CronJobOperator(BaseOperator):
crawl_id,
crawlconfig,
user,
org,
manual=False,
)
print("Scheduled Crawl Created: " + crawl_id)


@ -5,7 +5,7 @@ from uuid import UUID
from typing import Optional, DefaultDict, Literal, Annotated, Any
from pydantic import BaseModel, Field
from kubernetes.utils import parse_quantity
from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES
from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES, Organization
BTRIX_API = "btrix.cloud/v1"
@ -70,6 +70,7 @@ class CrawlSpec(BaseModel):
id: str
cid: UUID
oid: UUID
org: Organization
scale: int = 1
storage: StorageRef
started: str


@ -14,10 +14,10 @@ import sectionStrings from "@/strings/crawl-workflows/section";
import type { Collection } from "@/types/collection";
import { WorkflowScopeType } from "@/types/workflow";
import { isApiError } from "@/utils/api";
import { getAppSettings } from "@/utils/app";
import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler";
import { humanizeSchedule } from "@/utils/cron";
import { pluralOf } from "@/utils/pluralize";
import { getServerDefaults } from "@/utils/workflow";
/**
* Usage:
@ -55,7 +55,7 @@ export class ConfigDetails extends BtrixElement {
async connectedCallback() {
super.connectedCallback();
void this.fetchAPIDefaults();
void this.fetchOrgDefaults();
await this.fetchCollections();
}
@ -137,7 +137,9 @@ export class ConfigDetails extends BtrixElement {
if (this.orgDefaults?.maxPagesPerCrawl) {
return html`<span class="text-neutral-400">
${this.localize.number(this.orgDefaults.maxPagesPerCrawl)}
${this.orgDefaults.maxPagesPerCrawl === Infinity
? msg("Unlimited")
: this.localize.number(this.orgDefaults.maxPagesPerCrawl)}
${pluralOf("pages", this.orgDefaults.maxPagesPerCrawl)}
${msg("(default)")}</span
>`;
@ -510,25 +512,29 @@ export class ConfigDetails extends BtrixElement {
this.requestUpdate();
}
private async fetchAPIDefaults() {
// TODO Consolidate with workflow-editor
private async fetchOrgDefaults() {
try {
const settings = await getAppSettings();
const orgDefaults = {
const [serverDefaults, { quotas }] = await Promise.all([
getServerDefaults(),
this.api.fetch<{
quotas: { maxPagesPerCrawl?: number };
}>(`/orgs/${this.orgId}`),
]);
const defaults = {
...this.orgDefaults,
...serverDefaults,
};
if (settings.defaultBehaviorTimeSeconds > 0) {
orgDefaults.behaviorTimeoutSeconds =
settings.defaultBehaviorTimeSeconds;
if (defaults.maxPagesPerCrawl && quotas.maxPagesPerCrawl) {
defaults.maxPagesPerCrawl = Math.min(
defaults.maxPagesPerCrawl,
quotas.maxPagesPerCrawl,
);
}
if (settings.defaultPageLoadTimeSeconds > 0) {
orgDefaults.pageLoadTimeoutSeconds =
settings.defaultPageLoadTimeSeconds;
}
if (settings.maxPagesPerCrawl > 0) {
orgDefaults.maxPagesPerCrawl = settings.maxPagesPerCrawl;
}
this.orgDefaults = orgDefaults;
this.orgDefaults = defaults;
} catch (e) {
console.debug(e);
}


@ -214,7 +214,7 @@ export class WorkflowEditor extends BtrixElement {
private progressState?: ProgressState;
@state()
private defaults: WorkflowDefaults = appDefaults;
private orgDefaults: WorkflowDefaults = appDefaults;
@state()
private formState = getDefaultFormState();
@ -304,7 +304,9 @@ export class WorkflowEditor extends BtrixElement {
connectedCallback(): void {
this.initializeEditor();
super.connectedCallback();
void this.fetchServerDefaults();
void this.fetchOrgDefaults();
void this.fetchTags();
this.addEventListener(
"btrix-intersect",
@ -350,15 +352,6 @@ export class WorkflowEditor extends BtrixElement {
if (this.progressState?.activeTab !== STEPS[0]) {
void this.scrollToActivePanel();
}
if (this.orgId) {
void this.fetchTags();
void this.fetchOrgQuotaDefaults();
}
}
private async fetchServerDefaults() {
this.defaults = await getServerDefaults();
}
private initializeEditor() {
@ -1104,12 +1097,12 @@ https://archiveweb.page/images/${"logo.svg"}`}
value=${this.formState.pageLimit || ""}
min=${minPages}
max=${ifDefined(
this.defaults.maxPagesPerCrawl &&
this.defaults.maxPagesPerCrawl < Infinity
? this.defaults.maxPagesPerCrawl
this.orgDefaults.maxPagesPerCrawl &&
this.orgDefaults.maxPagesPerCrawl < Infinity
? this.orgDefaults.maxPagesPerCrawl
: undefined,
)}
placeholder=${defaultLabel(this.defaults.maxPagesPerCrawl)}
placeholder=${defaultLabel(this.orgDefaults.maxPagesPerCrawl)}
@sl-input=${onInputMinMax}
>
<span slot="suffix">${msg("pages")}</span>
@ -1152,7 +1145,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
type="number"
inputmode="numeric"
label=${msg("Page Load Timeout")}
placeholder=${defaultLabel(this.defaults.pageLoadTimeoutSeconds)}
placeholder=${defaultLabel(this.orgDefaults.pageLoadTimeoutSeconds)}
value=${ifDefined(this.formState.pageLoadTimeoutSeconds ?? undefined)}
min="0"
@sl-input=${onInputMinMax}
@ -1181,7 +1174,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
type="number"
inputmode="numeric"
label=${msg("Behavior Timeout")}
placeholder=${defaultLabel(this.defaults.behaviorTimeoutSeconds)}
placeholder=${defaultLabel(this.orgDefaults.behaviorTimeoutSeconds)}
value=${ifDefined(this.formState.behaviorTimeoutSeconds ?? undefined)}
min="0"
@sl-input=${onInputMinMax}
@ -1278,7 +1271,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
>
${when(this.appState.settings?.numBrowsers, (numBrowsers) =>
map(
range(this.defaults.maxScale),
range(this.orgDefaults.maxScale),
(i: number) =>
html` <sl-radio-button value="${i + 1}" size="small"
>${(i + 1) * numBrowsers}</sl-radio-button
@ -2229,18 +2222,29 @@ https://archiveweb.page/images/${"logo.svg"}`}
}
}
private async fetchOrgQuotaDefaults() {
// TODO Consolidate with config-details
private async fetchOrgDefaults() {
try {
const data = await this.api.fetch<{
quotas: { maxPagesPerCrawl?: number };
}>(`/orgs/${this.orgId}`);
const orgDefaults = {
...this.defaults,
const [serverDefaults, { quotas }] = await Promise.all([
getServerDefaults(),
this.api.fetch<{
quotas: { maxPagesPerCrawl?: number };
}>(`/orgs/${this.orgId}`),
]);
const defaults = {
...this.orgDefaults,
...serverDefaults,
};
if (data.quotas.maxPagesPerCrawl && data.quotas.maxPagesPerCrawl > 0) {
orgDefaults.maxPagesPerCrawl = data.quotas.maxPagesPerCrawl;
if (defaults.maxPagesPerCrawl && quotas.maxPagesPerCrawl) {
defaults.maxPagesPerCrawl = Math.min(
defaults.maxPagesPerCrawl,
quotas.maxPagesPerCrawl,
);
}
this.defaults = orgDefaults;
this.orgDefaults = defaults;
} catch (e) {
console.debug(e);
}