feat: Display scale as number of browser windows (#2057)

Resolves https://github.com/webrecorder/browsertrix/issues/2048

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: Henry Wilkinson <henry@wilkinson.graphics>
This commit is contained in:
sua yoo 2024-09-05 17:32:40 -07:00 committed by GitHub
parent b3c1195878
commit 4c36c80351
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 120 additions and 92 deletions

View File

@ -111,6 +111,7 @@ class SettingsResponse(BaseModel):
defaultPageLoadTimeSeconds: int
maxPagesPerCrawl: int
numBrowsers: int
maxScale: int
billingEnabled: bool
@ -143,6 +144,7 @@ def main() -> None:
os.environ.get("DEFAULT_PAGE_LOAD_TIME_SECONDS", 120)
),
maxPagesPerCrawl=int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)),
numBrowsers=int(os.environ.get("NUM_BROWSERS", 1)),
maxScale=int(os.environ.get("MAX_CRAWL_SCALE", 3)),
billingEnabled=is_bool(os.environ.get("BILLING_ENABLED")),
signUpUrl=os.environ.get("SIGN_UP_URL", ""),

View File

@ -43,6 +43,7 @@ def test_api_settings():
"jwtTokenLifetime": 86400,
"defaultBehaviorTimeSeconds": 300,
"maxPagesPerCrawl": 4,
"numBrowsers": 2,
"maxScale": 3,
"defaultPageLoadTimeSeconds": 120,
"billingEnabled": True,

View File

@ -56,6 +56,8 @@ data:
MIN_QA_CRAWLER_IMAGE: "{{ .Values.min_qa_crawler_image }}"
NUM_BROWSERS: "{{ .Values.crawler_browser_instances }}"
MAX_CRAWLER_MEMORY: "{{ .Values.max_crawler_memory }}"
ENABLE_AUTO_RESIZE_CRAWLERS: "{{ .Values.enable_auto_resize_crawlers }}"

View File

@ -56,7 +56,7 @@ For more details on navigating web archives within ReplayWeb.page, see the [Repl
### Exporting Files
While crawling, Browsertrix will output one or more WACZ files — the crawler aims to output files in consistently sized chunks, and each [crawler instance](workflow-setup.md#crawler-instances) will output separate WACZ files.
While crawling, Browsertrix will output one or more WACZ files — the crawler aims to output files in consistently sized chunks, and each crawler will output separate WACZ files.
The **WACZ Files** tab lists the individually downloadable WACZ files that make up the archived item as well as their file sizes and backup status.

View File

@ -34,7 +34,7 @@ Run a crawl workflow by clicking _Run Crawl_ in the actions menu of the workflow
While crawling, the **Watch Crawl** section displays a list of queued URLs that will be visited, and streams the current state of the browser windows as they visit pages from the queue. You can [modify the crawl live](./running-crawl.md) by adding URL exclusions or changing the number of browser windows.
Re-running a crawl workflow can be useful to capture a website as it changes over time, or to run with an updated [crawl scope](workflow-setup.md#scope).
Re-running a crawl workflow can be useful to capture a website as it changes over time, or to run with an updated [crawl scope](workflow-setup.md#crawl-scope).
## Status

View File

@ -21,7 +21,7 @@ The crawling panel lists the number of currently running and waiting crawls, as
For organizations with a set execution minute limit, the crawling panel displays a graph of how much execution time has been used and how much is currently remaining. Monthly execution time limits reset on the first of each month at 12:00 AM GMT.
??? Question "How is execution time calculated?"
Execution time is the total runtime of all [_Crawler Instances_](workflow-setup.md/#crawler-instances) during a crawl. For instance, if _Crawler Instances_ scale is set to 2× and each crawler instance uses 2 minutes of active crawling time, execution time for the crawl will be 4 minutes. Like elapsed time, this is tracked as the crawl runs so changing the _Crawler Instances_ scale while a crawl is running may change the amount of execution time used in a given time period.
Execution time is the total runtime of a crawl, scaled by the [_Browser Windows_](workflow-setup.md#browser-windows) setting increment value. Like elapsed time, this is tracked as the crawl runs, so changing the number of _Browser Windows_ while a crawl is running may change the amount of execution time used in a given time period.
## Collections

View File

@ -23,9 +23,9 @@ If the crawl queue is filled with URLs that should not be crawled, use the _Edit
Exclusions added while crawling are applied to the same exclusion table saved in the workflow's settings and will be used the next time the crawl workflow is run unless they are manually removed.
## Changing the Number of Crawler Instances
## Changing the Number of Browser Windows
Like exclusions, the [crawler instance](workflow-setup.md#crawler-instances) scale can also be adjusted while crawling. On the Watch Crawl page, press the _Edit Crawler Instances_ button, and set the desired value.
Like exclusions, the number of [browser windows](workflow-setup.md#browser-windows) can also be adjusted while crawling. On the **Watch Crawl** tab, press the _Edit Browser Windows_ button, and set the desired value.
Unlike exclusions, this change will not be applied to future workflow runs.

View File

@ -6,7 +6,7 @@ Changes to a setting will only apply to subsequent crawls.
Crawl settings are shown in the crawl workflow detail **Settings** tab and in the archived item **Crawl Settings** tab.
## Scope
## Crawl Scope
Specify the range and depth of your crawl. Different settings will be shown depending on whether you chose _Known URLs_ (crawl type of **URL List**) or _Automated Discovery_ (crawl type of **Seeded Crawl**) when creating a new workflow.
@ -114,10 +114,6 @@ The crawl will be gracefully stopped after this set period of elapsed time.
The crawl will be gracefully stopped after reaching this set size in GB.
### Crawler Instances
Increasing the amount of crawler instances will speed up crawls by using additional browser windows to capture more pages in parallel. This will also increase the amount of traffic sent to the website and may result in a higher chance of getting rate limited.
### Page Load Timeout
Limits the amount of elapsed time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.
@ -146,6 +142,15 @@ Configure the browser used to visit URLs during the crawl.
Sets the [_Browser Profile_](browser-profiles.md) to be used for this crawl.
### Browser Windows
Sets the number of browser windows that are used to visit webpages while crawling. Increasing the number of browser windows will speed up crawls by capturing more pages in parallel.
There are some trade-offs:
- This may result in a higher chance of getting rate limited due to the increase in traffic sent to the website.
- More execution minutes will be used per crawl.
### Crawler Release Channel
Sets the release channel of [Browsertrix Crawler](https://github.com/webrecorder/browsertrix-crawler) to be used for this crawl. Crawls started by this workflow will use the latest crawler version from the selected release channel. Generally "Default" will be the most stable, however others may have newer features (or bugs)!

View File

@ -166,10 +166,6 @@ export class ConfigDetails extends LiteElement {
msg("Crawl Size Limit"),
renderSize(crawlConfig?.maxCrawlSize),
)}
${this.renderSetting(
msg("Crawler Instances"),
crawlConfig?.scale ? `${crawlConfig.scale}×` : "",
)}
<btrix-section-heading style="--margin: var(--sl-spacing-medium)">
<h4>${sectionStrings.perPageLimits}</h4>
</btrix-section-heading>
@ -232,6 +228,12 @@ export class ConfigDetails extends LiteElement {
>`,
),
)}
${this.renderSetting(
msg("Browser Windows"),
crawlConfig?.scale && this.appState.settings
? `${crawlConfig.scale * this.appState.settings.numBrowsers}`
: "",
)}
${this.renderSetting(
msg("Crawler Channel (Exact Crawler Version)"),
capitalize(crawlConfig?.crawlerChannel || "default") +

View File

@ -1254,29 +1254,6 @@ https://archiveweb.page/images/${"logo.svg"}`}
</sl-input>
`)}
${this.renderHelpTextCol(infoTextStrings["maxCrawlSizeGB"])}
${inputCol(html`
<sl-radio-group
name="scale"
label=${msg("Crawler Instances")}
value=${this.formState.scale}
@sl-change=${(e: Event) =>
this.updateFormState({
scale: +(e.target as SlCheckbox).value,
})}
>
${map(
range(this.defaults.maxScale),
(i: number) =>
html` <sl-radio-button value="${i + 1}" size="small"
>${i + 1}×</sl-radio-button
>`,
)}
</sl-radio-group>
`)}
${this.renderHelpTextCol(
msg(`Increasing parallel crawler instances can speed up crawls, but may
increase the chances of getting rate limited.`),
)}
${this.renderSectionHeading(sectionStrings.perPageLimits)}
${inputCol(html`
<sl-input
@ -1366,6 +1343,38 @@ https://archiveweb.page/images/${"logo.svg"}`}
></btrix-select-browser-profile>
`)}
${this.renderHelpTextCol(infoTextStrings["browserProfile"])}
${inputCol(html`
<sl-radio-group
name="scale"
label=${msg("Browser Windows")}
value=${this.formState.scale}
@sl-change=${(e: Event) =>
this.updateFormState({
scale: +(e.target as SlCheckbox).value,
})}
>
${when(this.appState.settings?.numBrowsers, (numBrowsers) =>
map(
range(this.defaults.maxScale),
(i: number) =>
html` <sl-radio-button value="${i + 1}" size="small"
>${(i + 1) * numBrowsers}</sl-radio-button
>`,
),
)}
</sl-radio-group>
`)}
${this.renderHelpTextCol(
html`${msg(
`Increase the number of open browser windows during a crawl. This will speed up your crawl by effectively running more crawlers at the same time.`,
)}
<a
href="https://docs.browsertrix.com/user-guide/workflow-setup/#browser-windows"
class="text-blue-600 hover:text-blue-500"
target="_blank"
>${msg("See caveats")}</a
>.`,
)}
${inputCol(html`
<btrix-select-crawler
.crawlerChannel=${this.formState.crawlerChannel}

View File

@ -51,7 +51,7 @@ export class UsageHistoryTable extends BtrixElement {
<sl-tooltip>
<div slot="content" style="text-transform: initial">
${msg(
"Aggregated time across all crawler instances that the crawler was actively executing a crawl or QA analysis run, i.e. not in a waiting state",
"Aggregated time across all browser windows that the crawler was actively executing a crawl or QA analysis run, i.e. not in a waiting state",
)}
</div>
<sl-icon name="info-circle" style="vertical-align: -.175em"></sl-icon>

View File

@ -493,11 +493,15 @@ export class WorkflowDetail extends LiteElement {
return html` <h3>${this.tabLabels[this.activePanel]}</h3>
<sl-button
size="small"
?disabled=${!this.workflow?.isCrawlRunning}
?disabled=${this.workflow?.lastCrawlState !== "running"}
@click=${() => (this.openDialogName = "scale")}
>
<sl-icon name="plus-slash-minus" slot="prefix"></sl-icon>
<span> ${msg("Edit Crawler Instances")} </span>
<sl-icon
name="plus-slash-minus"
slot="prefix"
label=${msg("Increase or decrease")}
></sl-icon>
<span>${msg("Edit Browser Windows")}</span>
</sl-button>`;
}
if (this.activePanel === "logs") {
@ -558,17 +562,15 @@ export class WorkflowDetail extends LiteElement {
</header>
${when(
!this.isLoading && this.seeds,
() => html`
!this.isLoading && this.seeds && this.workflow,
(workflow) => html`
<btrix-workflow-editor
.initialWorkflow=${this.workflow}
.initialWorkflow=${workflow}
.initialSeeds=${this.seeds!.items}
jobType=${this.workflow!.jobType!}
configId=${this.workflow!.id}
jobType=${workflow.jobType!}
configId=${workflow.id}
@reset=${() =>
this.navTo(
`${this.orgBasePath}/workflows/crawl/${this.workflow!.id}`,
)}
this.navTo(`${this.orgBasePath}/workflows/crawl/${workflow.id}`)}
></btrix-workflow-editor>
`,
this.renderLoading,
@ -675,7 +677,7 @@ export class WorkflowDetail extends LiteElement {
<sl-divider></sl-divider>
<sl-menu-item @click=${() => (this.openDialogName = "scale")}>
<sl-icon name="plus-slash-minus" slot="prefix"></sl-icon>
${msg("Edit Crawler Instances")}
${msg("Edit Browser Windows")}
</sl-menu-item>
<sl-menu-item
@click=${() => (this.openDialogName = "exclusions")}
@ -732,36 +734,36 @@ export class WorkflowDetail extends LiteElement {
<btrix-desc-list horizontal>
${this.renderDetailItem(
msg("Status"),
() => html`
(workflow) => html`
<btrix-crawl-status
state=${this.workflow!.lastCrawlState || msg("No Crawls Yet")}
?stopping=${this.workflow?.lastCrawlStopping}
state=${workflow.lastCrawlState || msg("No Crawls Yet")}
?stopping=${workflow.lastCrawlStopping}
></btrix-crawl-status>
`,
)}
${this.renderDetailItem(
msg("Total Size"),
() =>
(workflow) =>
html` <sl-format-bytes
value=${Number(this.workflow!.totalSize)}
value=${Number(workflow.totalSize)}
display="narrow"
></sl-format-bytes>`,
)}
${this.renderDetailItem(msg("Schedule"), () =>
this.workflow!.schedule
${this.renderDetailItem(msg("Schedule"), (workflow) =>
workflow.schedule
? html`
<div>
${humanizeSchedule(this.workflow!.schedule, {
${humanizeSchedule(workflow.schedule, {
length: "short",
})}
</div>
`
: html`<span class="text-neutral-400">${msg("No Schedule")}</span>`,
)}
${this.renderDetailItem(msg("Created By"), () =>
${this.renderDetailItem(msg("Created By"), (workflow) =>
msg(
str`${this.workflow!.createdByName} on ${this.dateFormatter.format(
new Date(`${this.workflow!.created}Z`),
str`${workflow.createdByName} on ${this.dateFormatter.format(
new Date(`${workflow.created}Z`),
)}`,
),
)}
@ -771,7 +773,7 @@ export class WorkflowDetail extends LiteElement {
private renderDetailItem(
label: string | TemplateResult,
renderContent: () => TemplateResult | string | number,
renderContent: (workflow: Workflow) => TemplateResult | string | number,
) {
return html`
<btrix-desc-list-item label=${label}>
@ -947,8 +949,10 @@ export class WorkflowDetail extends LiteElement {
></sl-format-bytes>`
: skeleton,
)}
${this.renderDetailItem(msg("Crawler Instances"), () =>
this.workflow ? this.workflow.scale : skeleton,
${this.renderDetailItem(msg("Browser Windows"), () =>
this.workflow && this.appState.settings
? this.workflow.scale * this.appState.settings.numBrowsers
: skeleton,
)}
</btrix-desc-list>
`;
@ -1002,13 +1006,13 @@ export class WorkflowDetail extends LiteElement {
`
: this.renderInactiveCrawlMessage()}
${when(
isRunning,
() => html`
isRunning && this.workflow,
(workflow) => html`
<div id="screencast-crawl">
<btrix-screencast
authToken=${authToken}
.crawlId=${this.lastCrawlId ?? undefined}
scale=${this.workflow!.scale}
scale=${workflow.scale}
></btrix-screencast>
</div>
@ -1016,7 +1020,7 @@ export class WorkflowDetail extends LiteElement {
<section class="mt-8">${this.renderExclusions()}</section>
<btrix-dialog
.label=${msg("Edit Crawler Instances")}
.label=${msg("Edit Browser Windows")}
.open=${this.openDialogName === "scale"}
@sl-request-close=${() => (this.openDialogName = undefined)}
@sl-show=${this.showDialog}
@ -1039,12 +1043,10 @@ export class WorkflowDetail extends LiteElement {
</p>
<div class="mt-4">
${when(
this.workflow?.lastCrawlId,
() => html`
this.workflow?.lastCrawlId && this.workflow,
(workflow) => html`
<sl-button
href=${`${this.orgBasePath}/items/crawl/${
this.workflow!.lastCrawlId
}#replay`}
href=${`${this.orgBasePath}/items/crawl/${workflow.lastCrawlId}#replay`}
variant="primary"
size="small"
@click=${this.navLink}
@ -1059,12 +1061,10 @@ export class WorkflowDetail extends LiteElement {
`,
)}
${when(
this.isCrawler,
() =>
this.isCrawler && this.workflow,
(workflow) =>
html` <sl-button
href=${`${this.orgBasePath}/items/crawl/${
this.workflow!.lastCrawlId
}#qa`}
href=${`${this.orgBasePath}/items/crawl/${workflow.lastCrawlId}#qa`}
size="small"
@click=${this.navLink}
>
@ -1254,21 +1254,24 @@ export class WorkflowDetail extends LiteElement {
if (!this.workflow) return;
const scaleOptions = [];
for (let value = 1; value <= this.maxScale; value++) {
scaleOptions.push({
value,
label: `${value}×`,
});
if (this.appState.settings) {
for (let value = 1; value <= this.maxScale; value++) {
scaleOptions.push({
value,
label: value * this.appState.settings.numBrowsers,
});
}
}
return html`
<div>
<sl-radio-group
value=${this.workflow.scale}
help-text=${msg(
"This change will only apply to the currently running crawl.",
<p class="mb-4 text-neutral-600">
${msg(
"Change the number of browser windows crawling in parallel. This change will take effect immediately on the currently running crawl and update crawl workflow settings.",
)}
>
</p>
<sl-radio-group value=${this.workflow.scale}>
${scaleOptions.map(
({ value, label }) => html`
<sl-radio-button
@ -1340,7 +1343,7 @@ export class WorkflowDetail extends LiteElement {
if (data.scaled) {
void this.fetchWorkflow();
this.notify({
message: msg("Updated crawl scale."),
message: msg("Updated number of browser windows."),
variant: "success",
icon: "check2-circle",
});
@ -1349,7 +1352,9 @@ export class WorkflowDetail extends LiteElement {
}
} catch {
this.notify({
message: msg("Sorry, couldn't change crawl scale at this time."),
message: msg(
"Sorry, couldn't change number of browser windows at this time.",
),
variant: "danger",
icon: "exclamation-octagon",
});
@ -1567,7 +1572,7 @@ export class WorkflowDetail extends LiteElement {
private async runNow(): Promise<void> {
try {
const data = await this.apiFetch<{ started: string | null }>(
`/orgs/${this.orgId}/crawlconfigs/${this.workflow!.id}/run`,
`/orgs/${this.orgId}/crawlconfigs/${this.workflowId}/run`,
{
method: "POST",
},

View File

@ -475,7 +475,7 @@ export class WorkflowsList extends LiteElement {
)}
>
<sl-icon name="plus-slash-minus" slot="prefix"></sl-icon>
${msg("Edit Crawler Instances")}
${msg("Edit Browser Windows")}
</sl-menu-item>
<sl-menu-item
@click=${() =>

View File

@ -6,6 +6,7 @@ export type AppSettings = {
defaultBehaviorTimeSeconds: number;
defaultPageLoadTimeSeconds: number;
maxPagesPerCrawl: number;
numBrowsers: number;
maxScale: number;
billingEnabled: boolean;
signUpUrl: string;
@ -33,6 +34,7 @@ export async function getAppSettings(): Promise<AppSettings> {
defaultBehaviorTimeSeconds: 0,
defaultPageLoadTimeSeconds: 0,
maxPagesPerCrawl: 0,
numBrowsers: 1,
maxScale: 0,
billingEnabled: false,
signUpUrl: "",