Fix max pages quota setting and display (#2370)
- add ensure_quota_page_limit(), which caps the config page limit at the org's max pages per crawl quota, if any
- set the page limit on the config when: creating a new crawl, creating the configmap
- don't store the quota page limit on new or existing crawl workflows (remove setting it on new workflows), so that updated quotas take effect for the next crawl
- frontend: correctly display the page limit on the workflow settings page from org quotas, if any
- operator: get the org on each sync in one place
- fixes #2369

---------

Co-authored-by: sua yoo <sua@webrecorder.org>
parent aae1c02b3a
commit 001839a521
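For reference, the clamping rule described above (and mirrored by the frontend with `Math.min`) can be sketched as a small standalone function. This is a minimal illustration, not the actual Browsertrix code: the helper name `effective_page_limit` and its standalone form are assumptions for this example.

```python
# Minimal sketch of the quota clamping rule: the effective page limit is the
# smaller of the workflow's own limit and the org quota, and the quota alone
# applies when the workflow sets no limit. Hypothetical helper, for illustration.
from typing import Optional


def effective_page_limit(
    config_limit: Optional[int], quota_limit: Optional[int]
) -> Optional[int]:
    """Return the page limit a crawl should run with, given an optional
    per-workflow limit and an optional org-wide maxPagesPerCrawl quota."""
    if quota_limit and quota_limit > 0:
        if config_limit and config_limit > 0:
            return min(config_limit, quota_limit)
        return quota_limit
    return config_limit


assert effective_page_limit(500, 100) == 100   # quota caps a larger workflow limit
assert effective_page_limit(50, 100) == 50     # smaller workflow limit is kept
assert effective_page_limit(None, 100) == 100  # quota applies when no limit is set
assert effective_page_limit(50, None) == 50    # no quota: workflow limit unchanged
```

Because the clamp is applied per crawl rather than persisted on the workflow, raising or lowering the org quota later changes the limit of the next crawl without editing the workflow.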
@@ -251,11 +251,6 @@ class CrawlConfigOps:
         crawlconfig.lastStartedBy = user.id
         crawlconfig.lastStartedByName = user.name
 
-        # Ensure page limit is below org maxPagesPerCall if set
-        max_pages = org.quotas.maxPagesPerCrawl or 0
-        if max_pages > 0:
-            crawlconfig.config.limit = max_pages
-
         # add CrawlConfig to DB here
         result = await self.crawl_configs.insert_one(crawlconfig.to_dict())
 
@@ -286,13 +281,30 @@ class CrawlConfigOps:
             execMinutesQuotaReached=exec_mins_quota_reached,
         )
 
+    def ensure_quota_page_limit(self, crawlconfig: CrawlConfig, org: Organization):
+        """ensure page limit is set to no greater than quota page limit, if any"""
+        if org.quotas.maxPagesPerCrawl and org.quotas.maxPagesPerCrawl > 0:
+            if crawlconfig.config.limit and crawlconfig.config.limit > 0:
+                crawlconfig.config.limit = min(
+                    org.quotas.maxPagesPerCrawl, crawlconfig.config.limit
+                )
+            else:
+                crawlconfig.config.limit = org.quotas.maxPagesPerCrawl
+
     async def add_new_crawl(
-        self, crawl_id: str, crawlconfig: CrawlConfig, user: User, manual: bool
+        self,
+        crawl_id: str,
+        crawlconfig: CrawlConfig,
+        user: User,
+        org: Organization,
+        manual: bool,
     ) -> None:
         """increments crawl count for this config and adds new crawl"""
 
         started = dt_now()
 
+        self.ensure_quota_page_limit(crawlconfig, org)
+
         inc = self.inc_crawl_count(crawlconfig.id)
         add = self.crawl_ops.add_new_crawl(
             crawl_id, crawlconfig, user.id, started, manual
@@ -892,7 +904,7 @@ class CrawlConfigOps:
                 storage_filename=storage_filename,
                 profile_filename=profile_filename or "",
             )
-            await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
+            await self.add_new_crawl(crawl_id, crawlconfig, user, org, manual=True)
             return crawl_id
 
         except Exception as exc:
@@ -6,6 +6,7 @@ import math
 from pprint import pprint
 from typing import Optional, Any, Sequence
 from datetime import datetime
+from uuid import UUID
 
 import json
 
@@ -29,7 +30,6 @@ from btrixcloud.models import (
     CrawlFile,
     CrawlCompleteIn,
     StorageRef,
-    Organization,
 )
 
 from btrixcloud.utils import str_to_date, date_to_str, dt_now
@@ -145,11 +145,13 @@ class CrawlOperator(BaseOperator):
         params["userid"] = spec.get("userid", "")
 
         pods = data.children[POD]
+        org = await self.org_ops.get_org_by_id(UUID(oid))
 
         crawl = CrawlSpec(
             id=crawl_id,
             cid=cid,
             oid=oid,
+            org=org,
             storage=StorageRef(spec["storageName"]),
             crawler_channel=spec.get("crawlerChannel"),
             proxy_id=spec.get("proxyId"),
@@ -204,8 +206,6 @@ class CrawlOperator(BaseOperator):
             await self.k8s.delete_crawl_job(crawl.id)
             return {"status": status.dict(exclude_none=True), "children": []}
 
-        org = None
-
         # first, check storage quota, and fail immediately if quota reached
         if status.state in (
             "starting",
@@ -215,7 +215,6 @@ class CrawlOperator(BaseOperator):
             # only check on very first run, before any pods/pvcs created
             # for now, allow if crawl has already started (pods/pvcs created)
             if not pods and not data.children[PVC]:
-                org = await self.org_ops.get_org_by_id(crawl.oid)
                 if self.org_ops.storage_quota_reached(org):
                     await self.mark_finished(
                         crawl, status, "skipped_storage_quota_reached"
@@ -229,7 +228,7 @@ class CrawlOperator(BaseOperator):
                 return self._empty_response(status)
 
         if status.state in ("starting", "waiting_org_limit"):
-            if not await self.can_start_new(crawl, data, status, org):
+            if not await self.can_start_new(crawl, data, status):
                 return self._empty_response(status)
 
             await self.set_state(
@@ -382,8 +381,9 @@ class CrawlOperator(BaseOperator):
 
         crawlconfig = await self.crawl_config_ops.get_crawl_config(crawl.cid, crawl.oid)
 
-        raw_config = crawlconfig.get_raw_config()
+        self.crawl_config_ops.ensure_quota_page_limit(crawlconfig, crawl.org)
 
+        raw_config = crawlconfig.get_raw_config()
         raw_config["behaviors"] = self._filter_autoclick_behavior(
             raw_config["behaviors"], params["crawler_image"]
         )
@@ -637,14 +637,10 @@ class CrawlOperator(BaseOperator):
         crawl: CrawlSpec,
         data: MCSyncData,
         status: CrawlStatus,
-        org: Optional[Organization] = None,
     ):
         """return true if crawl can start, otherwise set crawl to 'queued' state
         until more crawls for org finish"""
-        if not org:
-            org = await self.org_ops.get_org_by_id(crawl.oid)
-
-        max_crawls = org.quotas.maxConcurrentCrawls or 0
+        max_crawls = crawl.org.quotas.maxConcurrentCrawls or 0
         if not max_crawls:
             return True
 
@@ -1238,15 +1234,13 @@
         }
         return json.dumps(err)
 
-    async def add_file_to_crawl(self, cc_data, crawl, redis):
+    async def add_file_to_crawl(self, cc_data, crawl: CrawlSpec, redis):
         """Handle finished CrawlFile to db"""
 
         filecomplete = CrawlCompleteIn(**cc_data)
 
-        org = await self.org_ops.get_org_by_id(crawl.oid)
-
         filename = self.storage_ops.get_org_relative_path(
-            org, crawl.storage, filecomplete.filename
+            crawl.org, crawl.storage, filecomplete.filename
         )
 
         crawl_file = CrawlFile(
@@ -1299,7 +1293,7 @@
             return "size-limit"
 
         # gracefully stop crawl if current running crawl sizes reach storage quota
-        org = await self.org_ops.get_org_by_id(crawl.oid)
+        org = crawl.org
 
         if org.readOnly:
             return "stopped_org_readonly"
@@ -112,6 +112,7 @@ class CronJobOperator(BaseOperator):
                     crawl_id,
                     crawlconfig,
                     user,
+                    org,
                     manual=False,
                 )
                 print("Scheduled Crawl Created: " + crawl_id)
@@ -5,7 +5,7 @@ from uuid import UUID
 from typing import Optional, DefaultDict, Literal, Annotated, Any
 from pydantic import BaseModel, Field
 from kubernetes.utils import parse_quantity
-from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES
+from btrixcloud.models import StorageRef, TYPE_ALL_CRAWL_STATES, Organization
 
 
 BTRIX_API = "btrix.cloud/v1"
@@ -70,6 +70,7 @@ class CrawlSpec(BaseModel):
     id: str
     cid: UUID
     oid: UUID
+    org: Organization
     scale: int = 1
     storage: StorageRef
     started: str
@@ -14,10 +14,10 @@ import sectionStrings from "@/strings/crawl-workflows/section";
 import type { Collection } from "@/types/collection";
 import { WorkflowScopeType } from "@/types/workflow";
 import { isApiError } from "@/utils/api";
-import { getAppSettings } from "@/utils/app";
 import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler";
 import { humanizeSchedule } from "@/utils/cron";
 import { pluralOf } from "@/utils/pluralize";
+import { getServerDefaults } from "@/utils/workflow";
 
 /**
  * Usage:
@@ -55,7 +55,7 @@ export class ConfigDetails extends BtrixElement {
 
   async connectedCallback() {
     super.connectedCallback();
-    void this.fetchAPIDefaults();
+    void this.fetchOrgDefaults();
     await this.fetchCollections();
   }
 
@@ -137,7 +137,9 @@ export class ConfigDetails extends BtrixElement {
 
     if (this.orgDefaults?.maxPagesPerCrawl) {
       return html`<span class="text-neutral-400">
-        ${this.localize.number(this.orgDefaults.maxPagesPerCrawl)}
+        ${this.orgDefaults.maxPagesPerCrawl === Infinity
+          ? msg("Unlimited")
+          : this.localize.number(this.orgDefaults.maxPagesPerCrawl)}
         ${pluralOf("pages", this.orgDefaults.maxPagesPerCrawl)}
         ${msg("(default)")}</span
       >`;
@@ -510,25 +512,29 @@ export class ConfigDetails extends BtrixElement {
     this.requestUpdate();
   }
 
-  private async fetchAPIDefaults() {
+  // TODO Consolidate with workflow-editor
+  private async fetchOrgDefaults() {
     try {
-      const settings = await getAppSettings();
-      const orgDefaults = {
+      const [serverDefaults, { quotas }] = await Promise.all([
+        getServerDefaults(),
+        this.api.fetch<{
+          quotas: { maxPagesPerCrawl?: number };
+        }>(`/orgs/${this.orgId}`),
+      ]);
+
+      const defaults = {
         ...this.orgDefaults,
+        ...serverDefaults,
       };
 
-      if (settings.defaultBehaviorTimeSeconds > 0) {
-        orgDefaults.behaviorTimeoutSeconds =
-          settings.defaultBehaviorTimeSeconds;
+      if (defaults.maxPagesPerCrawl && quotas.maxPagesPerCrawl) {
+        defaults.maxPagesPerCrawl = Math.min(
+          defaults.maxPagesPerCrawl,
+          quotas.maxPagesPerCrawl,
+        );
       }
-      if (settings.defaultPageLoadTimeSeconds > 0) {
-        orgDefaults.pageLoadTimeoutSeconds =
-          settings.defaultPageLoadTimeSeconds;
-      }
-      if (settings.maxPagesPerCrawl > 0) {
-        orgDefaults.maxPagesPerCrawl = settings.maxPagesPerCrawl;
-      }
-      this.orgDefaults = orgDefaults;
+
+      this.orgDefaults = defaults;
     } catch (e) {
       console.debug(e);
     }
@@ -214,7 +214,7 @@ export class WorkflowEditor extends BtrixElement {
   private progressState?: ProgressState;
 
   @state()
-  private defaults: WorkflowDefaults = appDefaults;
+  private orgDefaults: WorkflowDefaults = appDefaults;
 
   @state()
   private formState = getDefaultFormState();
@@ -304,7 +304,9 @@
   connectedCallback(): void {
     this.initializeEditor();
     super.connectedCallback();
-    void this.fetchServerDefaults();
+
+    void this.fetchOrgDefaults();
+    void this.fetchTags();
 
     this.addEventListener(
       "btrix-intersect",
@@ -350,15 +352,6 @@
     if (this.progressState?.activeTab !== STEPS[0]) {
       void this.scrollToActivePanel();
     }
-
-    if (this.orgId) {
-      void this.fetchTags();
-      void this.fetchOrgQuotaDefaults();
-    }
   }
 
-  private async fetchServerDefaults() {
-    this.defaults = await getServerDefaults();
-  }
-
   private initializeEditor() {
@@ -1104,12 +1097,12 @@ https://archiveweb.page/images/${"logo.svg"}`}
             value=${this.formState.pageLimit || ""}
             min=${minPages}
             max=${ifDefined(
-              this.defaults.maxPagesPerCrawl &&
-                this.defaults.maxPagesPerCrawl < Infinity
-                ? this.defaults.maxPagesPerCrawl
+              this.orgDefaults.maxPagesPerCrawl &&
+                this.orgDefaults.maxPagesPerCrawl < Infinity
+                ? this.orgDefaults.maxPagesPerCrawl
                 : undefined,
             )}
-            placeholder=${defaultLabel(this.defaults.maxPagesPerCrawl)}
+            placeholder=${defaultLabel(this.orgDefaults.maxPagesPerCrawl)}
             @sl-input=${onInputMinMax}
           >
             <span slot="suffix">${msg("pages")}</span>
@@ -1152,7 +1145,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
             type="number"
             inputmode="numeric"
             label=${msg("Page Load Timeout")}
-            placeholder=${defaultLabel(this.defaults.pageLoadTimeoutSeconds)}
+            placeholder=${defaultLabel(this.orgDefaults.pageLoadTimeoutSeconds)}
             value=${ifDefined(this.formState.pageLoadTimeoutSeconds ?? undefined)}
             min="0"
             @sl-input=${onInputMinMax}
@@ -1181,7 +1174,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
             type="number"
             inputmode="numeric"
             label=${msg("Behavior Timeout")}
-            placeholder=${defaultLabel(this.defaults.behaviorTimeoutSeconds)}
+            placeholder=${defaultLabel(this.orgDefaults.behaviorTimeoutSeconds)}
             value=${ifDefined(this.formState.behaviorTimeoutSeconds ?? undefined)}
             min="0"
             @sl-input=${onInputMinMax}
@@ -1278,7 +1271,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
           >
             ${when(this.appState.settings?.numBrowsers, (numBrowsers) =>
               map(
-                range(this.defaults.maxScale),
+                range(this.orgDefaults.maxScale),
                 (i: number) =>
                   html` <sl-radio-button value="${i + 1}" size="small"
                     >${(i + 1) * numBrowsers}</sl-radio-button
@@ -2229,18 +2222,29 @@ https://archiveweb.page/images/${"logo.svg"}`}
     }
   }
 
-  private async fetchOrgQuotaDefaults() {
+  // TODO Consolidate with config-details
+  private async fetchOrgDefaults() {
     try {
-      const data = await this.api.fetch<{
-        quotas: { maxPagesPerCrawl?: number };
-      }>(`/orgs/${this.orgId}`);
-      const orgDefaults = {
-        ...this.defaults,
+      const [serverDefaults, { quotas }] = await Promise.all([
+        getServerDefaults(),
+        this.api.fetch<{
+          quotas: { maxPagesPerCrawl?: number };
+        }>(`/orgs/${this.orgId}`),
+      ]);
+
+      const defaults = {
+        ...this.orgDefaults,
+        ...serverDefaults,
       };
-      if (data.quotas.maxPagesPerCrawl && data.quotas.maxPagesPerCrawl > 0) {
-        orgDefaults.maxPagesPerCrawl = data.quotas.maxPagesPerCrawl;
+
+      if (defaults.maxPagesPerCrawl && quotas.maxPagesPerCrawl) {
+        defaults.maxPagesPerCrawl = Math.min(
+          defaults.maxPagesPerCrawl,
+          quotas.maxPagesPerCrawl,
+        );
       }
-      this.defaults = orgDefaults;
+
+      this.orgDefaults = defaults;
     } catch (e) {
       console.debug(e);
     }