From 80008a2853a13d4af64524fd0317d51f399d5cf8 Mon Sep 17 00:00:00 2001 From: Tessa Walsh Date: Thu, 18 Apr 2024 23:03:47 -0400 Subject: [PATCH] Add post load delay to Browsertrix (#1700) Fixes #1699 Adds post load delay to: - Backend `RawCrawlConfig` model - Frontend (workflow editor and config details component) - Workflow setup docs --- backend/btrixcloud/models.py | 1 + docs/user-guide/workflow-setup.md | 4 ++++ frontend/src/components/ui/config-details.ts | 4 ++++ frontend/src/pages/org/workflow-editor.ts | 23 ++++++++++++++++++++ frontend/src/pages/org/workflows-new.ts | 1 + frontend/src/types/crawler.ts | 1 + 6 files changed, 34 insertions(+) diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 06336884..7287e5f1 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -252,6 +252,7 @@ class RawCrawlConfig(BaseModel): behaviorTimeout: Optional[int] pageLoadTimeout: Optional[int] pageExtraDelay: Optional[int] = 0 + postLoadDelay: Optional[int] = 0 workers: Optional[int] = None diff --git a/docs/user-guide/workflow-setup.md b/docs/user-guide/workflow-setup.md index 0b51fa1c..917189c7 100644 --- a/docs/user-guide/workflow-setup.md +++ b/docs/user-guide/workflow-setup.md @@ -134,6 +134,10 @@ Increasing the amount of crawler instances will speed up crawls by using additio Limits amount of elapsed time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded. +### Delay After Page Load + +Waits on the page after initial HTML page load for a set number of seconds prior to moving on to next steps such as link extraction and behaviors. Can be useful with pages that are slow to load page contents. + ### Behavior Timeout Limits amount of elapsed time behaviors have to complete. diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts index 2d405b3b..35158b4e 100644 --- a/frontend/src/components/ui/config-details.ts +++ b/frontend/src/components/ui/config-details.ts @@ -164,6 +164,10 @@ export class ConfigDetails extends LiteElement { this.orgDefaults?.pageLoadTimeoutSeconds ?? Infinity, ), )} + ${this.renderSetting( + msg("Delay After Page Load"), + renderTimeLimit(crawlConfig?.config.postLoadDelay, 0), + )} ${this.renderSetting( msg("Page Behavior Timeout"), renderTimeLimit( diff --git a/frontend/src/pages/org/workflow-editor.ts b/frontend/src/pages/org/workflow-editor.ts index 5a5358a2..d9aa7e9e 100644 --- a/frontend/src/pages/org/workflow-editor.ts +++ b/frontend/src/pages/org/workflow-editor.ts @@ -105,6 +105,7 @@ type FormState = { behaviorTimeoutSeconds: number | null; pageLoadTimeoutSeconds: number | null; pageExtraDelaySeconds: number | null; + postLoadDelaySeconds: number | null; maxCrawlSizeGB: number; maxScopeDepth: number | null; scopeType: WorkflowParams["config"]["scopeType"]; @@ -184,6 +185,7 @@ const getDefaultFormState = (): FormState => ({ behaviorTimeoutSeconds: null, pageLoadTimeoutSeconds: null, pageExtraDelaySeconds: null, + postLoadDelaySeconds: null, maxScopeDepth: null, scopeType: "host", exclusions: [], @@ -580,6 +582,8 @@ export class CrawlConfigEditor extends LiteElement { seedsConfig.pageLoadTimeout ?? defaultFormState.pageLoadTimeoutSeconds, pageExtraDelaySeconds: seedsConfig.pageExtraDelay ?? defaultFormState.pageExtraDelaySeconds, + postLoadDelaySeconds: + seedsConfig.postLoadDelay ?? defaultFormState.postLoadDelaySeconds, maxScopeDepth: primarySeedConfig.depth ?? defaultFormState.maxScopeDepth, scale: this.initialWorkflow.scale, blockAds: this.initialWorkflow.config.blockAds, @@ -1585,6 +1589,24 @@ https://archiveweb.page/images/${"logo.svg"}`} `Limits amount of time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.`, ), )} + ${this.renderFormCol(html` + + ${msg("seconds")} + + `)} + ${this.renderHelpTextCol( + msg( + `Waits on the page after initial HTML page load prior to moving on to next steps such as link extraction and behaviors. Can be useful with pages that are slow to load page contents.`, + ), + )} ${this.renderFormCol(html`