Add post load delay to Browsertrix (#1700)

Fixes #1699 

Adds post load delay to:
- Backend `RawCrawlConfig` model
- Frontend (workflow editor and config details component)
- Workflow setup docs
This commit is contained in:
Tessa Walsh 2024-04-18 23:03:47 -04:00 committed by GitHub
parent 9609ff4194
commit 80008a2853
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 34 additions and 0 deletions

View File

@ -252,6 +252,7 @@ class RawCrawlConfig(BaseModel):
behaviorTimeout: Optional[int]
pageLoadTimeout: Optional[int]
pageExtraDelay: Optional[int] = 0
postLoadDelay: Optional[int] = 0
workers: Optional[int] = None

View File

@ -134,6 +134,10 @@ Increasing the amount of crawler instances will speed up crawls by using additio
Limits the amount of elapsed time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.
### Delay After Page Load
Waits on the page for a set number of seconds after the initial HTML page load before moving on to next steps such as link extraction and behaviors. This can be useful for pages that are slow to load their contents.
### Behavior Timeout
Limits the amount of elapsed time behaviors have to complete.

View File

@ -164,6 +164,10 @@ export class ConfigDetails extends LiteElement {
this.orgDefaults?.pageLoadTimeoutSeconds ?? Infinity,
),
)}
${this.renderSetting(
msg("Delay After Page Load"),
renderTimeLimit(crawlConfig?.config.postLoadDelay, 0),
)}
${this.renderSetting(
msg("Page Behavior Timeout"),
renderTimeLimit(

View File

@ -105,6 +105,7 @@ type FormState = {
behaviorTimeoutSeconds: number | null;
pageLoadTimeoutSeconds: number | null;
pageExtraDelaySeconds: number | null;
postLoadDelaySeconds: number | null;
maxCrawlSizeGB: number;
maxScopeDepth: number | null;
scopeType: WorkflowParams["config"]["scopeType"];
@ -184,6 +185,7 @@ const getDefaultFormState = (): FormState => ({
behaviorTimeoutSeconds: null,
pageLoadTimeoutSeconds: null,
pageExtraDelaySeconds: null,
postLoadDelaySeconds: null,
maxScopeDepth: null,
scopeType: "host",
exclusions: [],
@ -580,6 +582,8 @@ export class CrawlConfigEditor extends LiteElement {
seedsConfig.pageLoadTimeout ?? defaultFormState.pageLoadTimeoutSeconds,
pageExtraDelaySeconds:
seedsConfig.pageExtraDelay ?? defaultFormState.pageExtraDelaySeconds,
postLoadDelaySeconds:
seedsConfig.postLoadDelay ?? defaultFormState.postLoadDelaySeconds,
maxScopeDepth: primarySeedConfig.depth ?? defaultFormState.maxScopeDepth,
scale: this.initialWorkflow.scale,
blockAds: this.initialWorkflow.config.blockAds,
@ -1585,6 +1589,24 @@ https://archiveweb.page/images/${"logo.svg"}`}
`Limits amount of time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.`,
),
)}
${this.renderFormCol(html`
<sl-input
name="postLoadDelaySeconds"
type="number"
inputmode="numeric"
label=${msg("Delay After Page Load")}
placeholder=${"Default: 0"}
value=${ifDefined(this.formState.postLoadDelaySeconds ?? undefined)}
min="0"
>
<span slot="suffix">${msg("seconds")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(
msg(
`Waits on the page after initial HTML page load prior to moving on to next steps such as link extraction and behaviors. Can be useful with pages that are slow to load page contents.`,
),
)}
${this.renderFormCol(html`
<sl-input
name="behaviorTimeoutSeconds"
@ -2445,6 +2467,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
behaviorTimeout: this.formState.behaviorTimeoutSeconds,
pageLoadTimeout: this.formState.pageLoadTimeoutSeconds,
pageExtraDelay: this.formState.pageExtraDelaySeconds,
postLoadDelay: this.formState.postLoadDelaySeconds,
userAgent: this.formState.userAgent,
limit: this.formState.pageLimit,
lang: this.formState.lang || "",

View File

@ -24,6 +24,7 @@ const defaultValue = {
behaviorTimeout: null,
pageLoadTimeout: null,
pageExtraDelay: null,
postLoadDelay: null,
useSitemap: false,
failOnFailedSeed: false,
userAgent: null,

View File

@ -26,6 +26,7 @@ export type SeedConfig = Expand<
behaviorTimeout: number | null;
pageLoadTimeout: number | null;
pageExtraDelay: number | null;
postLoadDelay: number | null;
behaviors?: string | null;
extraHops?: number | null;
useSitemap?: boolean;