Add max crawl size option to backend and frontend (#1045)

Backend:
- add 'maxCrawlSize' to models and crawljob spec
- add 'MAX_CRAWL_SIZE' to configmap
- add maxCrawlSize to new crawlconfig + update APIs
- operator: gracefully stop crawl if current size (from stats) exceeds maxCrawlSize (see the sketch after this list)
- tests: add max crawl size tests
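A condensed sketch of the operator's new stop decision (the full is_crawl_stopping() helper appears in the operator diff further down; crawl is the CrawlSpec and size is the current crawl size in bytes from Redis stats):

    from datetime import datetime

    def is_crawl_stopping(crawl, size: int) -> bool:
        # user requested stop
        if crawl.stopping:
            return True
        # time limit (crawlTimeout) reached
        if crawl.expire_time and datetime.utcnow() > crawl.expire_time:
            return True
        # size limit (maxCrawlSize) exceeded
        if crawl.max_crawl_size and size > crawl.max_crawl_size:
            return True
        return False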

Frontend:
- Add Max Crawl Size text box to the Limits tab
- Users enter the max crawl size in GB; the value is converted to bytes before saving (conversion sketched below)
- Add BYTES_PER_GB constant for converting GB to bytes
- docs: add Crawl Size Limit to the user guide workflow setup section
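The user-facing GB value maps to the bytes stored in maxCrawlSize via BYTES_PER_GB = 1073741824 (2**30, so the limit is effectively GiB). The frontend does this conversion inline in TypeScript; the helper names below are illustrative only, shown in Python for brevity:

    BYTES_PER_GB = 1_073_741_824  # matches the frontend constant (2**30)

    def gb_to_bytes(size_gb: int) -> int:
        # user-entered GB limit -> bytes stored in maxCrawlSize
        return size_gb * BYTES_PER_GB

    def bytes_to_gb(size_bytes: int) -> int:
        # stored bytes -> whole GB for display (floored, as in the UI)
        return size_bytes // BYTES_PER_GB

    assert gb_to_bytes(5) == 5_368_709_120  # a 5 GB limit is stored as 5368709120 bytes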

Operator Refactor:
- use 'status.stopping' instead of 'crawl.stopping' to indicate the crawl is being stopped, as changing it later has no effect in the operator
- add is_crawl_stopping() to determine whether the crawl should be stopped, based on crawl.stopping or the size or time limit being reached
- crawlerjob status: store the raw byte size under 'size' and the human-readable size under 'sizeHuman' for clarity (sketched below)
- the size stat always exists (defaults to 0), so remove the unneeded conditional
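A minimal sketch of the refactored size handling in update_crawl_state() (update_size_status is an illustrative name; the real code does this inline, using the humanize package already imported by the operator):

    import humanize

    def update_size_status(status, stats: dict) -> None:
        # raw byte count (the size stat always exists, defaulting to 0)
        status.size = stats["size"]
        # human-readable string for the CRD status column, e.g. "5.2 MB"
        status.sizeHuman = humanize.naturalsize(status.size)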

Charts:
- subchart: update crawlerjob crd in btrix-crds to show status.stopping instead of spec.stopping
- subchart: show 'sizeHuman' property instead of 'size'
- bump subchart version to 0.1.1

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Tessa Walsh 2023-08-27 01:00:37 -04:00 committed by GitHub
parent 2da6c1c905
commit e667fe2e97
22 changed files with 225 additions and 36 deletions

View File

@ -200,6 +200,9 @@ class CrawlConfigOps:
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "crawlTimeout")
)
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "maxCrawlSize")
)
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "crawlFilenameTemplate")
)
@ -267,7 +270,7 @@ class CrawlConfigOps:
status_code=404, detail=f"Crawl Config '{cid}' not found"
)
# update in crawl manager if config, schedule, scale or crawlTimeout changed
# update in crawl manager if config, schedule, scale, maxCrawlSize or crawlTimeout changed
if changed:
crawlconfig = CrawlConfig.from_dict(result)
try:

View File

@ -103,7 +103,8 @@ class CrawlManager(K8sAPI):
STORAGE_NAME=storage_name,
PROFILE_FILENAME=profile_filename,
INITIAL_SCALE=str(crawlconfig.scale),
CRAWL_TIMEOUT=str(crawlconfig.crawlTimeout or 0)
CRAWL_TIMEOUT=str(crawlconfig.crawlTimeout or 0),
MAX_CRAWL_SIZE=str(crawlconfig.maxCrawlSize or 0)
# REV=str(crawlconfig.rev),
)
@ -128,6 +129,7 @@ class CrawlManager(K8sAPI):
crawlconfig.oid,
crawlconfig.scale,
crawlconfig.crawlTimeout,
crawlconfig.maxCrawlSize,
manual=True,
)
@ -137,6 +139,7 @@ class CrawlManager(K8sAPI):
has_sched_update = update.schedule is not None
has_scale_update = update.scale is not None
has_timeout_update = update.crawlTimeout is not None
has_max_crawl_size_update = update.maxCrawlSize is not None
has_config_update = update.config is not None
if has_sched_update:
@ -147,6 +150,7 @@ class CrawlManager(K8sAPI):
or has_config_update
or has_timeout_update
or profile_filename
or has_max_crawl_size_update
):
await self._update_config_map(
crawlconfig,
@ -409,6 +413,9 @@ class CrawlManager(K8sAPI):
if update.crawlTimeout is not None:
config_map.data["CRAWL_TIMEOUT"] = str(update.crawlTimeout)
if update.maxCrawlSize is not None:
config_map.data["MAX_CRAWL_SIZE"] = str(update.maxCrawlSize)
if update.crawlFilenameTemplate is not None:
config_map.data["STORE_FILENAME"] = update.crawlFilenameTemplate

View File

@ -538,6 +538,7 @@ async def add_new_crawl(
profileid=crawlconfig.profileid,
schedule=crawlconfig.schedule,
crawlTimeout=crawlconfig.crawlTimeout,
maxCrawlSize=crawlconfig.maxCrawlSize,
manual=manual,
started=started,
tags=crawlconfig.tags,

View File

@ -71,7 +71,7 @@ class K8sAPI:
# pylint: disable=too-many-arguments
async def new_crawl_job(
self, cid, userid, oid, scale=1, crawl_timeout=0, manual=True
self, cid, userid, oid, scale=1, crawl_timeout=0, max_crawl_size=0, manual=True
):
"""load job template from yaml"""
if crawl_timeout:
@ -90,6 +90,7 @@ class K8sAPI:
"userid": userid,
"scale": scale,
"expire_time": crawl_expire_time,
"max_crawl_size": max_crawl_size,
"manual": "1" if manual else "0",
}

View File

@ -128,6 +128,7 @@ class CrawlConfigIn(BaseModel):
tags: Optional[List[str]] = []
crawlTimeout: Optional[int] = 0
maxCrawlSize: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
crawlFilenameTemplate: Optional[str]
@ -146,6 +147,7 @@ class ConfigRevision(BaseMongoModel):
profileid: Optional[UUID4]
crawlTimeout: Optional[int] = 0
maxCrawlSize: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
modified: datetime
@ -166,6 +168,7 @@ class CrawlConfigCore(BaseMongoModel):
tags: Optional[List[str]] = []
crawlTimeout: Optional[int] = 0
maxCrawlSize: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1
oid: UUID4
@ -250,6 +253,7 @@ class UpdateCrawlConfig(BaseModel):
schedule: Optional[str]
profileid: Optional[str]
crawlTimeout: Optional[int]
maxCrawlSize: Optional[int]
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)]
crawlFilenameTemplate: Optional[str]
config: Optional[RawCrawlConfig]

View File

@ -88,6 +88,7 @@ class CrawlSpec(BaseModel):
started: str
stopping: bool = False
expire_time: Optional[datetime] = None
max_crawl_size: Optional[int] = None
# ============================================================================
@ -97,7 +98,9 @@ class CrawlStatus(BaseModel):
state: str = "starting"
pagesFound: int = 0
pagesDone: int = 0
size: str = ""
size: int = 0
# human readable size string
sizeHuman: str = ""
scale: int = 1
filesAdded: int = 0
filesAddedSize: int = 0
@ -110,12 +113,11 @@ class CrawlStatus(BaseModel):
# ============================================================================
# pylint: disable=too-many-statements
# pylint: disable=too-many-statements, too-many-public-methods, too-many-branches
# pylint: disable=too-many-instance-attributes,too-many-locals
class BtrixOperator(K8sAPI):
"""BtrixOperator Handler"""
# pylint: disable=too-many-instance-attributes,too-many-locals
def __init__(self):
super().__init__()
self.config_file = "/config/config.yaml"
@ -209,6 +211,7 @@ class BtrixOperator(K8sAPI):
started=data.parent["metadata"]["creationTimestamp"],
stopping=spec.get("stopping", False),
expire_time=from_k8s_date(spec.get("expireTime")),
max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
)
if status.state in ("starting", "waiting_org_limit"):
@ -226,7 +229,7 @@ class BtrixOperator(K8sAPI):
if has_crawl_children:
pods = data.related[POD]
status = await self.sync_crawl_state(redis_url, crawl, status, pods)
if crawl.stopping:
if status.stopping:
await self.check_if_finished(crawl, status)
if status.finished:
@ -618,31 +621,42 @@ class BtrixOperator(K8sAPI):
return True
# pylint: disable=too-many-branches
def is_crawl_stopping(self, crawl, size):
"""return true if crawl should begin graceful stopping phase"""
# if user requested stop, then enter stopping phase
if crawl.stopping:
print("Graceful Stop: User requested stop")
return True
# check crawl expiry
if crawl.expire_time and datetime.utcnow() > crawl.expire_time:
print(f"Graceful Stop: Job duration expired at {crawl.expire_time}")
return True
if crawl.max_crawl_size and size > crawl.max_crawl_size:
print(f"Graceful Stop: Maximum crawl size {crawl.max_crawl_size} hit")
return True
return False
async def update_crawl_state(self, redis, crawl, status):
"""update crawl state and check if crawl is now done"""
results = await redis.hvals(f"{crawl.id}:status")
stats = await get_redis_crawl_stats(redis, crawl.id)
# check crawl expiry
if crawl.expire_time and datetime.utcnow() > crawl.expire_time:
crawl.stopping = True
print(
"Job duration expired at {crawl.expire_time}, "
+ "gracefully stopping crawl"
)
if crawl.stopping:
print("Graceful Stop")
await redis.set(f"{crawl.id}:stopping", "1")
# backwards compatibility with older crawler
await redis.set("crawl-stop", "1")
# update status
status.pagesDone = stats["done"]
status.pagesFound = stats["found"]
if stats["size"] is not None:
status.size = humanize.naturalsize(stats["size"])
status.size = stats["size"]
status.sizeHuman = humanize.naturalsize(status.size)
status.stopping = self.is_crawl_stopping(crawl, status.size)
if status.stopping:
await redis.set(f"{crawl.id}:stopping", "1")
# backwards compatibility with older crawler
await redis.set("crawl-stop", "1")
# check if done / failed
status_count = {}
@ -669,7 +683,7 @@ class BtrixOperator(K8sAPI):
# check if all crawlers failed
elif status_count.get("failed", 0) >= crawl.scale:
# if stopping, and no pages finished, mark as canceled
if crawl.stopping and not status.pagesDone:
if status.stopping and not status.pagesDone:
state = "canceled"
else:
state = "failed"

View File

@ -18,6 +18,7 @@ spec:
cid: "{{ cid }}"
oid: "{{ oid }}"
scale: {{ scale }}
maxCrawlSize: {{ max_crawl_size }}
ttlSecondsAfterFinished: 30
{% if expire_time %}

View File

@ -196,6 +196,30 @@ def test_update_crawl_timeout(crawler_auth_headers, default_org_id, sample_crawl
assert data["crawlTimeout"] == 60
def test_update_max_crawl_size(crawler_auth_headers, default_org_id, sample_crawl_data):
# Verify that updating max crawl size works
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"maxCrawlSize": 4096},
)
assert r.status_code == 200
data = r.json()
assert data["settings_changed"] == True
assert data["metadata_changed"] == False
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["maxCrawlSize"] == 4096
def test_verify_delete_tags(crawler_auth_headers, default_org_id):
# Verify that deleting tags and name works as well
r = requests.patch(
@ -224,9 +248,9 @@ def test_verify_revs_history(crawler_auth_headers, default_org_id):
assert r.status_code == 200
data = r.json()
assert data["total"] == 2
assert data["total"] == 3
items = data["items"]
assert len(items) == 2
assert len(items) == 3
sorted_data = sorted(items, key=lambda revision: revision["rev"])
assert sorted_data[0]["config"]["scopeType"] == "prefix"

View File

@ -227,6 +227,30 @@ def timeout_crawl(admin_auth_headers, default_org_id):
return data["run_now_job"]
@pytest.fixture(scope="session")
def max_crawl_size_crawl_id(admin_auth_headers, default_org_id):
# Start crawl
crawl_data = {
"runNow": True,
"name": "Crawl with 5 MB max crawl size limit",
# Note crawl will exceed this size, as crawl begins to gracefully
# shut down when operator notices this value has been exceeded.
"maxCrawlSize": 5242880,
"config": {
"seeds": [{"url": "https://webrecorder.net/"}],
"scopeType": "domain",
"limit": 100,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()
return data["run_now_job"]
@pytest.fixture(scope="session")
def error_crawl_id(admin_auth_headers, default_org_id):
crawl_data = {

View File

@ -0,0 +1,33 @@
import requests
import time
from .conftest import API_PREFIX
def test_max_crawl_size(admin_auth_headers, default_org_id, max_crawl_size_crawl_id):
# Verify that crawl has started
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{max_crawl_size_crawl_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["state"] in (
"starting",
"running",
"generate-wacz",
"uploading-wacz",
"pending-wait",
)
# Wait some time to let crawl start, hit max size limit, and gracefully stop
time.sleep(240)
# Verify crawl was stopped
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{max_crawl_size_crawl_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["state"] == "partial_complete"

View File

@ -4,9 +4,9 @@ dependencies:
version: 0.1.0
- name: btrix-crds
repository: file://./btrix-crds
version: 0.1.0
version: 0.1.1
- name: metacontroller-helm
repository: oci://ghcr.io/metacontroller
version: v4.10.1
digest: sha256:e40073e42a13c1765a9ddaf91c5cd93ccc4804bedf6954e3ba1ade8fb26cca7c
generated: "2023-04-22T01:08:16.572747-07:00"
digest: sha256:4b95cff1974baeaec17b87a0f2c41787bf58437d6235dbacc121195390f4910e
generated: "2023-08-24T21:19:12.893947-07:00"

View File

@ -20,7 +20,7 @@ dependencies:
condition: addons.admin.logging
repository: file://./admin/logging
- name: btrix-crds
version: 0.1.0
version: 0.1.1
repository: file://./btrix-crds
- name: metacontroller-helm
version: v4.10.1

View File

@ -7,9 +7,9 @@ icon: https://webrecorder.net/assets/icon.png
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
version: 0.1.1
# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
appVersion: 0.1.0
appVersion: 0.1.1

View File

@ -53,7 +53,7 @@ spec:
- name: Size
type: string
jsonPath: .status.size
jsonPath: .status.sizeHuman
description: Crawl Size
- name: Time Started
@ -68,7 +68,7 @@ spec:
- name: Stopping
type: boolean
jsonPath: .spec.stopping
jsonPath: .status.stopping
description: "if set, crawl is being stopped"
- name: Files Added

Binary file not shown.

Binary file not shown.

View File

@ -116,6 +116,10 @@ Adds a hard limit on the number of pages that will be crawled. The crawl will be
The crawl will be gracefully stopped after this set period of time.
### Crawl Size Limit
The crawl will be gracefully stopped after reaching this set size in GB.
### Crawler Instances
Increasing the amount of crawler instances will speed up crawls by using additional browser windows to capture more pages in parallel. This will also increase the amount of traffic sent to the website and may result in a higher chance of getting rate limited.

View File

@ -96,6 +96,41 @@ export class ConfigDetails extends LiteElement {
}
};
const renderSize = (valueBytes?: number | null, fallbackValue?: number) => {
const bytesPerGB = 1073741824;
// Eventually we will want to set this to the selected locale
const formatter = new Intl.NumberFormat(undefined, {
style: "unit",
unit: "gigabyte",
unitDisplay: "narrow",
});
if (valueBytes) {
const sizeGB = Math.floor(valueBytes / bytesPerGB);
return formatter.format(sizeGB);
}
if (typeof fallbackValue === "number") {
let value = "";
if (fallbackValue === Infinity) {
value = msg("Unlimited");
} else if (fallbackValue === 0) {
value = formatter.format(0);
} else {
const sizeGB = Math.floor(fallbackValue / bytesPerGB);
value = formatter.format(sizeGB);
}
return html`<span class="text-neutral-400"
>${value} ${msg("(default)")}</span
>`;
}
return html`<span class="text-neutral-400"
>${msg("Unlimited")} ${msg("(default)")}</span
>`;
};
return html`
<section id="crawler-settings" class="mb-8">
<btrix-section-heading style="--margin: var(--sl-spacing-medium)">
@ -168,6 +203,10 @@ export class ConfigDetails extends LiteElement {
msg("Crawl Time Limit"),
renderTimeLimit(crawlConfig?.crawlTimeout, Infinity)
)}
${this.renderSetting(
msg("Crawl Size Limit"),
renderSize(crawlConfig?.maxCrawlSize, Infinity)
)}
${this.renderSetting(msg("Crawler Instances"), crawlConfig?.scale)}
</btrix-desc-list>
</section>

View File

@ -82,6 +82,7 @@ type FormState = {
behaviorTimeoutSeconds: number | null;
pageLoadTimeoutSeconds: number | null;
pageExtraDelaySeconds: number | null;
maxCrawlSizeGB: number | null;
maxScopeDepth: number | null;
scopeType: WorkflowParams["config"]["scopeType"];
exclusions: WorkflowParams["config"]["exclude"];
@ -153,6 +154,7 @@ const getDefaultFormState = (): FormState => ({
useSitemap: true,
customIncludeUrlList: "",
crawlTimeoutMinutes: null,
maxCrawlSizeGB: 0,
behaviorTimeoutSeconds: null,
pageLoadTimeoutSeconds: null,
pageExtraDelaySeconds: null,
@ -213,6 +215,7 @@ const DEFAULT_BEHAVIORS = [
"autofetch",
"siteSpecific",
];
const BYTES_PER_GB = 1073741824;
@localized()
export class CrawlConfigEditor extends LiteElement {
@ -490,6 +493,12 @@ export class CrawlConfigEditor extends LiteElement {
return fallback;
};
const bytesToGB = (value: any, fallback: number | null) => {
if (typeof value === "number" && value > 0)
return Math.floor(value / BYTES_PER_GB);
return fallback;
};
return {
primarySeedUrl: defaultFormState.primarySeedUrl,
urlList: defaultFormState.urlList,
@ -498,6 +507,10 @@ export class CrawlConfigEditor extends LiteElement {
this.initialWorkflow.crawlTimeout,
defaultFormState.crawlTimeoutMinutes
),
maxCrawlSizeGB: bytesToGB(
this.initialWorkflow.maxCrawlSize,
defaultFormState.maxCrawlSizeGB
),
behaviorTimeoutSeconds:
seedsConfig.behaviorTimeout ?? defaultFormState.behaviorTimeoutSeconds,
pageLoadTimeoutSeconds:
@ -1314,6 +1327,22 @@ https://archiveweb.page/images/${"logo.svg"}`}
${this.renderHelpTextCol(
msg(`Gracefully stop the crawler after a specified time limit.`)
)}
${this.renderFormCol(html`
<sl-input
name="maxCrawlSizeGB"
label=${msg("Crawl Size Limit")}
value=${this.formState.maxCrawlSizeGB || ""}
placeholder=${msg("Default: Unlimited")}
min="0"
type="number"
inputmode="numeric"
>
<span slot="suffix">${msg("GB")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(
msg(`Gracefully stop the crawler after a specified size limit.`)
)}
${this.renderFormCol(html`
<sl-radio-group
name="scale"
@ -2109,6 +2138,9 @@ https://archiveweb.page/images/${"logo.svg"}`}
crawlTimeout: this.formState.crawlTimeoutMinutes
? this.formState.crawlTimeoutMinutes * 60
: null,
maxCrawlSize: this.formState.maxCrawlSizeGB
? this.formState.maxCrawlSizeGB * BYTES_PER_GB
: null,
tags: this.formState.tags,
autoAddCollections: this.formState.autoAddCollections,
config: {

View File

@ -27,6 +27,7 @@ const defaultValue = {
},
tags: [],
crawlTimeout: null,
maxCrawlSize: null,
jobType: undefined,
scale: 1,
autoAddCollections: [],

View File

@ -44,6 +44,7 @@ export type WorkflowParams = {
config: SeedConfig;
tags: string[];
crawlTimeout: number | null;
maxCrawlSize: number | null;
description: string | null;
autoAddCollections: string[];
};