Add max crawl size option to backend and frontend (#1045)
Backend:
- add 'maxCrawlSize' to models and crawljob spec
- add 'MAX_CRAWL_SIZE' to configmap
- add maxCrawlSize to new crawlconfig + update APIs
- operator: gracefully stop crawl if current size (from stats) exceeds maxCrawlSize
- tests: add max crawl size tests

Frontend:
- add Max Crawl Size text box to the Limits tab
- users enter the max crawl size in GB; the value is converted to bytes
- add BYTES_PER_GB as a constant for the conversion
- docs: add Crawl Size Limit to the workflow setup section of the user guide

Operator Refactor:
- use 'status.stopping' instead of 'crawl.stopping' to indicate the crawl is being stopped, as changing the latter has no effect in the operator
- add is_crawl_stopping() to return whether the crawl is being stopped, based on crawl.stopping or the size or time limit being reached
- crawljob status: store the raw byte size under 'size' and the human-readable size under 'sizeHuman' for clarity
- the size stat always exists (defaults to 0), so remove the unneeded conditional

Charts:
- subchart: update the crawljob CRD in btrix-crds to show status.stopping instead of spec.stopping
- subchart: show the 'sizeHuman' property instead of 'size'
- bump subchart version to 0.1.1

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
parent 2da6c1c905
commit e667fe2e97
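For context on the API surface touched below: maxCrawlSize is set and updated through the existing crawlconfig endpoints and is always expressed in bytes. A minimal sketch of updating the limit on an existing workflow (the API prefix, IDs, and token are placeholders, not values from this commit):

```python
import requests

API_PREFIX = "https://btrix.example.com/api"   # placeholder deployment URL
ORG_ID = "<org-uuid>"                          # placeholder org ID
CID = "<crawlconfig-uuid>"                     # placeholder workflow (crawlconfig) ID
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder auth token

# maxCrawlSize is stored in bytes; e.g. 2 GB = 2 * 1073741824 bytes
r = requests.patch(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawlconfigs/{CID}/",
    headers=HEADERS,
    json={"maxCrawlSize": 2 * 1073741824},
)
r.raise_for_status()
print(r.json())  # expected to report "settings_changed": true
```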
@@ -200,6 +200,9 @@ class CrawlConfigOps:
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "crawlTimeout")
)
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "maxCrawlSize")
)
changed = changed or (
self.check_attr_changed(orig_crawl_config, update, "crawlFilenameTemplate")
)

@@ -267,7 +270,7 @@ class CrawlConfigOps:
status_code=404, detail=f"Crawl Config '{cid}' not found"
)

# update in crawl manager if config, schedule, scale or crawlTimeout changed
# update in crawl manager if config, schedule, scale, maxCrawlSize or crawlTimeout changed
if changed:
crawlconfig = CrawlConfig.from_dict(result)
try:

@@ -103,7 +103,8 @@ class CrawlManager(K8sAPI):
STORAGE_NAME=storage_name,
PROFILE_FILENAME=profile_filename,
INITIAL_SCALE=str(crawlconfig.scale),
CRAWL_TIMEOUT=str(crawlconfig.crawlTimeout or 0)
CRAWL_TIMEOUT=str(crawlconfig.crawlTimeout or 0),
MAX_CRAWL_SIZE=str(crawlconfig.maxCrawlSize or 0)
# REV=str(crawlconfig.rev),
)

@@ -128,6 +129,7 @@ class CrawlManager(K8sAPI):
crawlconfig.oid,
crawlconfig.scale,
crawlconfig.crawlTimeout,
crawlconfig.maxCrawlSize,
manual=True,
)

@@ -137,6 +139,7 @@ class CrawlManager(K8sAPI):
has_sched_update = update.schedule is not None
has_scale_update = update.scale is not None
has_timeout_update = update.crawlTimeout is not None
has_max_crawl_size_update = update.maxCrawlSize is not None
has_config_update = update.config is not None

if has_sched_update:

@@ -147,6 +150,7 @@ class CrawlManager(K8sAPI):
or has_config_update
or has_timeout_update
or profile_filename
or has_max_crawl_size_update
):
await self._update_config_map(
crawlconfig,

@@ -409,6 +413,9 @@ class CrawlManager(K8sAPI):
if update.crawlTimeout is not None:
config_map.data["CRAWL_TIMEOUT"] = str(update.crawlTimeout)

if update.maxCrawlSize is not None:
config_map.data["MAX_CRAWL_SIZE"] = str(update.maxCrawlSize)

if update.crawlFilenameTemplate is not None:
config_map.data["STORE_FILENAME"] = update.crawlFilenameTemplate

@@ -538,6 +538,7 @@ async def add_new_crawl(
profileid=crawlconfig.profileid,
schedule=crawlconfig.schedule,
crawlTimeout=crawlconfig.crawlTimeout,
maxCrawlSize=crawlconfig.maxCrawlSize,
manual=manual,
started=started,
tags=crawlconfig.tags,

@@ -71,7 +71,7 @@ class K8sAPI:

# pylint: disable=too-many-arguments
async def new_crawl_job(
self, cid, userid, oid, scale=1, crawl_timeout=0, manual=True
self, cid, userid, oid, scale=1, crawl_timeout=0, max_crawl_size=0, manual=True
):
"""load job template from yaml"""
if crawl_timeout:

@@ -90,6 +90,7 @@ class K8sAPI:
"userid": userid,
"scale": scale,
"expire_time": crawl_expire_time,
"max_crawl_size": max_crawl_size,
"manual": "1" if manual else "0",
}

@@ -128,6 +128,7 @@ class CrawlConfigIn(BaseModel):
tags: Optional[List[str]] = []

crawlTimeout: Optional[int] = 0
maxCrawlSize: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1

crawlFilenameTemplate: Optional[str]

@@ -146,6 +147,7 @@ class ConfigRevision(BaseMongoModel):
profileid: Optional[UUID4]

crawlTimeout: Optional[int] = 0
maxCrawlSize: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1

modified: datetime

@@ -166,6 +168,7 @@ class CrawlConfigCore(BaseMongoModel):
tags: Optional[List[str]] = []

crawlTimeout: Optional[int] = 0
maxCrawlSize: Optional[int] = 0
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)] = 1

oid: UUID4

@@ -250,6 +253,7 @@ class UpdateCrawlConfig(BaseModel):
schedule: Optional[str]
profileid: Optional[str]
crawlTimeout: Optional[int]
maxCrawlSize: Optional[int]
scale: Optional[conint(ge=1, le=MAX_CRAWL_SCALE)]
crawlFilenameTemplate: Optional[str]
config: Optional[RawCrawlConfig]
@@ -88,6 +88,7 @@ class CrawlSpec(BaseModel):
started: str
stopping: bool = False
expire_time: Optional[datetime] = None
max_crawl_size: Optional[int] = None


# ============================================================================

@@ -97,7 +98,9 @@ class CrawlStatus(BaseModel):
state: str = "starting"
pagesFound: int = 0
pagesDone: int = 0
size: str = ""
size: int = 0
# human readable size string
sizeHuman: str = ""
scale: int = 1
filesAdded: int = 0
filesAddedSize: int = 0

@@ -110,12 +113,11 @@ class CrawlStatus(BaseModel):


# ============================================================================
# pylint: disable=too-many-statements
# pylint: disable=too-many-statements, too-many-public-methods, too-many-branches
# pylint: disable=too-many-instance-attributes,too-many-locals
class BtrixOperator(K8sAPI):
"""BtrixOperator Handler"""

# pylint: disable=too-many-instance-attributes,too-many-locals

def __init__(self):
super().__init__()
self.config_file = "/config/config.yaml"

@@ -209,6 +211,7 @@ class BtrixOperator(K8sAPI):
started=data.parent["metadata"]["creationTimestamp"],
stopping=spec.get("stopping", False),
expire_time=from_k8s_date(spec.get("expireTime")),
max_crawl_size=int(configmap.get("MAX_CRAWL_SIZE", "0")),
)

if status.state in ("starting", "waiting_org_limit"):

@@ -226,7 +229,7 @@ class BtrixOperator(K8sAPI):
if has_crawl_children:
pods = data.related[POD]
status = await self.sync_crawl_state(redis_url, crawl, status, pods)
if crawl.stopping:
if status.stopping:
await self.check_if_finished(crawl, status)

if status.finished:

@@ -618,31 +621,42 @@ class BtrixOperator(K8sAPI):

return True

# pylint: disable=too-many-branches
def is_crawl_stopping(self, crawl, size):
"""return true if crawl should begin graceful stopping phase"""

# if user requested stop, then enter stopping phase
if crawl.stopping:
print("Graceful Stop: User requested stop")
return True

# check crawl expiry
if crawl.expire_time and datetime.utcnow() > crawl.expire_time:
print(f"Graceful Stop: Job duration expired at {crawl.expire_time}")
return True

if crawl.max_crawl_size and size > crawl.max_crawl_size:
print(f"Graceful Stop: Maximum crawl size {crawl.max_crawl_size} hit")
return True

return False

async def update_crawl_state(self, redis, crawl, status):
"""update crawl state and check if crawl is now done"""
results = await redis.hvals(f"{crawl.id}:status")
stats = await get_redis_crawl_stats(redis, crawl.id)

# check crawl expiry
if crawl.expire_time and datetime.utcnow() > crawl.expire_time:
crawl.stopping = True
print(
"Job duration expired at {crawl.expire_time}, "
+ "gracefully stopping crawl"
)

if crawl.stopping:
print("Graceful Stop")
await redis.set(f"{crawl.id}:stopping", "1")
# backwards compatibility with older crawler
await redis.set("crawl-stop", "1")

# update status
status.pagesDone = stats["done"]
status.pagesFound = stats["found"]
if stats["size"] is not None:
status.size = humanize.naturalsize(stats["size"])
status.size = stats["size"]
status.sizeHuman = humanize.naturalsize(status.size)

status.stopping = self.is_crawl_stopping(crawl, status.size)

if status.stopping:
await redis.set(f"{crawl.id}:stopping", "1")
# backwards compatibility with older crawler
await redis.set("crawl-stop", "1")

# check if done / failed
status_count = {}

@@ -669,7 +683,7 @@ class BtrixOperator(K8sAPI):
# check if all crawlers failed
elif status_count.get("failed", 0) >= crawl.scale:
# if stopping, and no pages finished, mark as canceled
if crawl.stopping and not status.pagesDone:
if status.stopping and not status.pagesDone:
state = "canceled"
else:
state = "failed"
@@ -18,6 +18,7 @@ spec:
cid: "{{ cid }}"
oid: "{{ oid }}"
scale: {{ scale }}
maxCrawlSize: {{ max_crawl_size }}
ttlSecondsAfterFinished: 30

{% if expire_time %}

@@ -196,6 +196,30 @@ def test_update_crawl_timeout(crawler_auth_headers, default_org_id, sample_crawl
assert data["crawlTimeout"] == 60


def test_update_max_crawl_size(crawler_auth_headers, default_org_id, sample_crawl_data):
# Verify that updating max crawl size works
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={"maxCrawlSize": 4096},
)
assert r.status_code == 200
data = r.json()

assert data["settings_changed"] == True
assert data["metadata_changed"] == False

r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
)
assert r.status_code == 200

data = r.json()

assert data["maxCrawlSize"] == 4096


def test_verify_delete_tags(crawler_auth_headers, default_org_id):
# Verify that deleting tags and name works as well
r = requests.patch(

@@ -224,9 +248,9 @@ def test_verify_revs_history(crawler_auth_headers, default_org_id):
assert r.status_code == 200

data = r.json()
assert data["total"] == 2
assert data["total"] == 3
items = data["items"]
assert len(items) == 2
assert len(items) == 3
sorted_data = sorted(items, key=lambda revision: revision["rev"])
assert sorted_data[0]["config"]["scopeType"] == "prefix"


@@ -227,6 +227,30 @@ def timeout_crawl(admin_auth_headers, default_org_id):
return data["run_now_job"]


@pytest.fixture(scope="session")
def max_crawl_size_crawl_id(admin_auth_headers, default_org_id):
# Start crawl
crawl_data = {
"runNow": True,
"name": "Crawl with 5 MB max crawl size limit",
# Note crawl will exceed this size, as crawl begins to gracefully
# shut down when operator notices this value has been exceeded.
"maxCrawlSize": 5242880,
"config": {
"seeds": [{"url": "https://webrecorder.net/"}],
"scopeType": "domain",
"limit": 100,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()
return data["run_now_job"]


@pytest.fixture(scope="session")
def error_crawl_id(admin_auth_headers, default_org_id):
crawl_data = {
backend/test_nightly/test_max_crawl_size_limit.py (new file, 33 lines)
@@ -0,0 +1,33 @@
import requests
import time

from .conftest import API_PREFIX


def test_max_crawl_size(admin_auth_headers, default_org_id, max_crawl_size_crawl_id):
# Verify that crawl has started
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{max_crawl_size_crawl_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["state"] in (
"starting",
"running",
"generate-wacz",
"uploading-wacz",
"pending-wait",
)

# Wait some time to let crawl start, hit max size limit, and gracefully stop
time.sleep(240)

# Verify crawl was stopped
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{max_crawl_size_crawl_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["state"] == "partial_complete"
@@ -4,9 +4,9 @@ dependencies:
version: 0.1.0
- name: btrix-crds
repository: file://./btrix-crds
version: 0.1.0
version: 0.1.1
- name: metacontroller-helm
repository: oci://ghcr.io/metacontroller
version: v4.10.1
digest: sha256:e40073e42a13c1765a9ddaf91c5cd93ccc4804bedf6954e3ba1ade8fb26cca7c
generated: "2023-04-22T01:08:16.572747-07:00"
digest: sha256:4b95cff1974baeaec17b87a0f2c41787bf58437d6235dbacc121195390f4910e
generated: "2023-08-24T21:19:12.893947-07:00"

@@ -20,7 +20,7 @@ dependencies:
condition: addons.admin.logging
repository: file://./admin/logging
- name: btrix-crds
version: 0.1.0
version: 0.1.1
repository: file://./btrix-crds
- name: metacontroller-helm
version: v4.10.1

@@ -7,9 +7,9 @@ icon: https://webrecorder.net/assets/icon.png
# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0
version: 0.1.1

# This is the version number of the application being deployed. This version number should be
# incremented each time you make changes to the application. Versions are not expected to
# follow Semantic Versioning. They should reflect the version the application is using.
appVersion: 0.1.0
appVersion: 0.1.1

@@ -53,7 +53,7 @@ spec:

- name: Size
type: string
jsonPath: .status.size
jsonPath: .status.sizeHuman
description: Crawl Size

- name: Time Started

@@ -68,7 +68,7 @@ spec:

- name: Stopping
type: boolean
jsonPath: .spec.stopping
jsonPath: .status.stopping
description: "if set, crawl is being stopped"

- name: Files Added
Binary file not shown.
chart/charts/btrix-crds-0.1.1.tgz (new binary file, not shown)
@@ -116,6 +116,10 @@ Adds a hard limit on the number of pages that will be crawled. The crawl will be

The crawl will be gracefully stopped after this set period of time.

### Crawl Size Limit

The crawl will be gracefully stopped after reaching this set size in GB.

### Crawler Instances

Increasing the amount of crawler instances will speed up crawls by using additional browser windows to capture more pages in parallel. This will also increase the amount of traffic sent to the website and may result in a higher chance of getting rate limited.
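The user guide talks in GB while the API, configmap, and operator all work in bytes; the frontend converts with a BYTES_PER_GB constant of 1073741824 (1 GiB). A small sketch of that conversion, with hypothetical helper names for illustration:

```python
BYTES_PER_GB = 1_073_741_824  # mirrors the frontend constant (1 GiB)

def gb_to_bytes(size_gb: int) -> int:
    # value typed into the Limits tab -> maxCrawlSize sent to the API
    return size_gb * BYTES_PER_GB

def bytes_to_gb(max_crawl_size: int) -> int:
    # stored maxCrawlSize -> whole GB shown in the workflow details (rounded down)
    return max_crawl_size // BYTES_PER_GB

assert gb_to_bytes(5) == 5_368_709_120
assert bytes_to_gb(5_368_709_120) == 5
```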
@@ -96,6 +96,41 @@ export class ConfigDetails extends LiteElement {
}
};

const renderSize = (valueBytes?: number | null, fallbackValue?: number) => {
const bytesPerGB = 1073741824;

// Eventually we will want to set this to the selected locale
const formatter = new Intl.NumberFormat(undefined, {
style: "unit",
unit: "gigabyte",
unitDisplay: "narrow",
});

if (valueBytes) {
const sizeGB = Math.floor(valueBytes / bytesPerGB);
return formatter.format(sizeGB);
}

if (typeof fallbackValue === "number") {
let value = "";
if (fallbackValue === Infinity) {
value = msg("Unlimited");
} else if (fallbackValue === 0) {
value = formatter.format(0);
} else {
const sizeGB = Math.floor(fallbackValue / bytesPerGB);
value = formatter.format(sizeGB);
}
return html`<span class="text-neutral-400"
>${value} ${msg("(default)")}</span
>`;
}

return html`<span class="text-neutral-400"
>${msg("Unlimited")} ${msg("(default)")}</span
>`;
};

return html`
<section id="crawler-settings" class="mb-8">
<btrix-section-heading style="--margin: var(--sl-spacing-medium)">

@@ -168,6 +203,10 @@ export class ConfigDetails extends LiteElement {
msg("Crawl Time Limit"),
renderTimeLimit(crawlConfig?.crawlTimeout, Infinity)
)}
${this.renderSetting(
msg("Crawl Size Limit"),
renderSize(crawlConfig?.maxCrawlSize, Infinity)
)}
${this.renderSetting(msg("Crawler Instances"), crawlConfig?.scale)}
</btrix-desc-list>
</section>
@@ -82,6 +82,7 @@ type FormState = {
behaviorTimeoutSeconds: number | null;
pageLoadTimeoutSeconds: number | null;
pageExtraDelaySeconds: number | null;
maxCrawlSizeGB: number | null;
maxScopeDepth: number | null;
scopeType: WorkflowParams["config"]["scopeType"];
exclusions: WorkflowParams["config"]["exclude"];

@@ -153,6 +154,7 @@ const getDefaultFormState = (): FormState => ({
useSitemap: true,
customIncludeUrlList: "",
crawlTimeoutMinutes: null,
maxCrawlSizeGB: 0,
behaviorTimeoutSeconds: null,
pageLoadTimeoutSeconds: null,
pageExtraDelaySeconds: null,

@@ -213,6 +215,7 @@ const DEFAULT_BEHAVIORS = [
"autofetch",
"siteSpecific",
];
const BYTES_PER_GB = 1073741824;

@localized()
export class CrawlConfigEditor extends LiteElement {

@@ -490,6 +493,12 @@ export class CrawlConfigEditor extends LiteElement {
return fallback;
};

const bytesToGB = (value: any, fallback: number | null) => {
if (typeof value === "number" && value > 0)
return Math.floor(value / BYTES_PER_GB);
return fallback;
};

return {
primarySeedUrl: defaultFormState.primarySeedUrl,
urlList: defaultFormState.urlList,

@@ -498,6 +507,10 @@ export class CrawlConfigEditor extends LiteElement {
this.initialWorkflow.crawlTimeout,
defaultFormState.crawlTimeoutMinutes
),
maxCrawlSizeGB: bytesToGB(
this.initialWorkflow.maxCrawlSize,
defaultFormState.maxCrawlSizeGB
),
behaviorTimeoutSeconds:
seedsConfig.behaviorTimeout ?? defaultFormState.behaviorTimeoutSeconds,
pageLoadTimeoutSeconds:

@@ -1314,6 +1327,22 @@ https://archiveweb.page/images/${"logo.svg"}`}
${this.renderHelpTextCol(
msg(`Gracefully stop the crawler after a specified time limit.`)
)}
${this.renderFormCol(html`
<sl-input
name="maxCrawlSizeGB"
label=${msg("Crawl Size Limit")}
value=${this.formState.maxCrawlSizeGB || ""}
placeholder=${msg("Default: Unlimited")}
min="0"
type="number"
inputmode="numeric"
>
<span slot="suffix">${msg("GB")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(
msg(`Gracefully stop the crawler after a specified size limit.`)
)}
${this.renderFormCol(html`
<sl-radio-group
name="scale"

@@ -2109,6 +2138,9 @@ https://archiveweb.page/images/${"logo.svg"}`}
crawlTimeout: this.formState.crawlTimeoutMinutes
? this.formState.crawlTimeoutMinutes * 60
: null,
maxCrawlSize: this.formState.maxCrawlSizeGB
? this.formState.maxCrawlSizeGB * BYTES_PER_GB
: null,
tags: this.formState.tags,
autoAddCollections: this.formState.autoAddCollections,
config: {

@@ -27,6 +27,7 @@ const defaultValue = {
},
tags: [],
crawlTimeout: null,
maxCrawlSize: null,
jobType: undefined,
scale: 1,
autoAddCollections: [],

@@ -44,6 +44,7 @@ export type WorkflowParams = {
config: SeedConfig;
tags: string[];
crawlTimeout: number | null;
maxCrawlSize: number | null;
description: string | null;
autoAddCollections: string[];
};