diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 06336884..7287e5f1 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -252,6 +252,7 @@ class RawCrawlConfig(BaseModel): behaviorTimeout: Optional[int] pageLoadTimeout: Optional[int] pageExtraDelay: Optional[int] = 0 + postLoadDelay: Optional[int] = 0 workers: Optional[int] = None diff --git a/docs/user-guide/workflow-setup.md b/docs/user-guide/workflow-setup.md index 0b51fa1c..917189c7 100644 --- a/docs/user-guide/workflow-setup.md +++ b/docs/user-guide/workflow-setup.md @@ -134,6 +134,10 @@ Increasing the amount of crawler instances will speed up crawls by using additio Limits amount of elapsed time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded. +### Delay After Page Load + +Waits on the page for a set number of seconds after the initial HTML page load before moving on to next steps such as link extraction and behaviors. Can be useful for pages whose content is slow to load. + ### Behavior Timeout Limits amount of elapsed time behaviors have to complete. diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts index 2d405b3b..35158b4e 100644 --- a/frontend/src/components/ui/config-details.ts +++ b/frontend/src/components/ui/config-details.ts @@ -164,6 +164,10 @@ export class ConfigDetails extends LiteElement { this.orgDefaults?.pageLoadTimeoutSeconds ?? 
Infinity, ), )} + ${this.renderSetting( + msg("Delay After Page Load"), + renderTimeLimit(crawlConfig?.config.postLoadDelay, 0), + )} ${this.renderSetting( msg("Page Behavior Timeout"), renderTimeLimit( diff --git a/frontend/src/pages/org/workflow-editor.ts b/frontend/src/pages/org/workflow-editor.ts index 5a5358a2..d9aa7e9e 100644 --- a/frontend/src/pages/org/workflow-editor.ts +++ b/frontend/src/pages/org/workflow-editor.ts @@ -105,6 +105,7 @@ type FormState = { behaviorTimeoutSeconds: number | null; pageLoadTimeoutSeconds: number | null; pageExtraDelaySeconds: number | null; + postLoadDelaySeconds: number | null; maxCrawlSizeGB: number; maxScopeDepth: number | null; scopeType: WorkflowParams["config"]["scopeType"]; @@ -184,6 +185,7 @@ const getDefaultFormState = (): FormState => ({ behaviorTimeoutSeconds: null, pageLoadTimeoutSeconds: null, pageExtraDelaySeconds: null, + postLoadDelaySeconds: null, maxScopeDepth: null, scopeType: "host", exclusions: [], @@ -580,6 +582,8 @@ export class CrawlConfigEditor extends LiteElement { seedsConfig.pageLoadTimeout ?? defaultFormState.pageLoadTimeoutSeconds, pageExtraDelaySeconds: seedsConfig.pageExtraDelay ?? defaultFormState.pageExtraDelaySeconds, + postLoadDelaySeconds: + seedsConfig.postLoadDelay ?? defaultFormState.postLoadDelaySeconds, maxScopeDepth: primarySeedConfig.depth ?? defaultFormState.maxScopeDepth, scale: this.initialWorkflow.scale, blockAds: this.initialWorkflow.config.blockAds, @@ -1585,6 +1589,24 @@ https://archiveweb.page/images/${"logo.svg"}`} `Limits amount of time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.`, ), )} + ${this.renderFormCol(html` + + ${msg("seconds")} + + `)} + ${this.renderHelpTextCol( + msg( + `Waits on the page after initial HTML page load prior to moving on to next steps such as link extraction and behaviors. Can be useful with pages that are slow to load page contents.`, + ), + )} ${this.renderFormCol(html`