From 91c2c1ad625456095d7806e542aed4990e7c9fb5 Mon Sep 17 00:00:00 2001 From: sua yoo Date: Wed, 5 Apr 2023 20:06:46 -0700 Subject: [PATCH] Allow users to set additional page time limits (#744) --- frontend/src/components/config-details.ts | 111 ++++++--- frontend/src/pages/org/workflow-editor.ts | 280 ++++++++++++++-------- frontend/src/pages/org/workflows-new.ts | 3 + frontend/src/types/crawler.ts | 6 +- 4 files changed, 270 insertions(+), 130 deletions(-) diff --git a/frontend/src/components/config-details.ts b/frontend/src/components/config-details.ts index 33119c30..92754da0 100644 --- a/frontend/src/components/config-details.ts +++ b/frontend/src/components/config-details.ts @@ -8,6 +8,7 @@ import ISO6391 from "iso-639-1"; import LiteElement, { html } from "../utils/LiteElement"; import type { CrawlConfig, Seed, SeedConfig } from "../pages/org/types"; import { humanizeSchedule } from "../utils/cron"; +import { RelativeDuration } from "./relative-duration"; /** * Usage: @@ -30,9 +31,10 @@ export class ConfigDetails extends LiteElement { hideTags = false; @state() - private orgDefaults = { - behaviorTimeoutMinutes: Infinity, - maxPagesPerCrawl: Infinity, + private orgDefaults?: { + pageLoadTimeoutSeconds?: number; + behaviorTimeoutSeconds?: number; + maxPagesPerCrawl?: number; }; private readonly scopeTypeLabels: Record< @@ -55,7 +57,25 @@ export class ConfigDetails extends LiteElement { render() { const crawlConfig = this.crawlConfig; - const exclusions = crawlConfig?.config.exclude || []; + const seedsConfig = crawlConfig?.config; + const exclusions = seedsConfig?.exclude || []; + const maxPages = seedsConfig?.seeds[0]?.limit ?? seedsConfig?.limit; + const renderTimeLimit = ( + valueSeconds?: number | null, + fallbackValue?: number + ) => + valueSeconds + ? RelativeDuration.humanize(valueSeconds * 1000, { verbose: true }) + : typeof fallbackValue === "number" + ? html`${fallbackValue === Infinity + ? msg("Unlimited") + : RelativeDuration.humanize(fallbackValue * 1000, { + verbose: true, + })} + ${msg("(default)")}` + : undefined; return html`
@@ -85,16 +105,51 @@ export class ConfigDetails extends LiteElement { () => this.renderSetting(msg("Exclusions"), msg("None")) )} ${this.renderSetting( - msg("Page Time Limit"), - crawlConfig?.config.behaviorTimeout - ? msg(str`${crawlConfig?.config.behaviorTimeout / 60} minute(s)`) - : msg("None") + msg("Max Pages"), + when( + maxPages, + () => msg(str`${maxPages!.toLocaleString()} pages`), + () => + this.orgDefaults?.maxPagesPerCrawl + ? html`${msg( + str`${this.orgDefaults.maxPagesPerCrawl.toLocaleString()} pages` + )} + ${msg("(default)")}` + : undefined + ) + )} + ${this.renderSetting( + msg("Page Load Timeout"), + renderTimeLimit( + crawlConfig?.config.pageLoadTimeout, + this.orgDefaults?.pageLoadTimeoutSeconds ?? Infinity + ) + )} + ${this.renderSetting( + msg("Page Behavior Timeout"), + renderTimeLimit( + crawlConfig?.config.behaviorTimeout, + this.orgDefaults?.behaviorTimeoutSeconds ?? Infinity + ) + )} + ${this.renderSetting( + msg("Auto-Scroll Behavior"), + crawlConfig?.config.behaviors && + !crawlConfig.config.behaviors.includes("autoscroll") + ? msg("Disabled") + : html`${msg("Enabled (default)")}` + )} + ${this.renderSetting( + msg("Delay Before Next Page"), + renderTimeLimit(crawlConfig?.config.pageExtraDelay, 0) )} ${this.renderSetting( msg("Crawl Time Limit"), - crawlConfig?.crawlTimeout - ? msg(str`${crawlConfig?.crawlTimeout / 60} minute(s)`) - : msg("None") + renderTimeLimit(crawlConfig?.crawlTimeout, Infinity) )} ${this.renderSetting(msg("Crawler Instances"), crawlConfig?.scale)} @@ -212,10 +267,9 @@ export class ConfigDetails extends LiteElement { const crawlConfig = this.crawlConfig!; const seedsConfig = crawlConfig.config; const additionalUrlList = seedsConfig.seeds.slice(1); - let primarySeedConfig: SeedConfig | Seed = seedsConfig; - let primarySeedUrl = seedsConfig.seeds[0].url; + const primarySeedConfig: SeedConfig | Seed = seedsConfig; + const primarySeedUrl = seedsConfig.seeds[0].url; const includeUrlList = primarySeedConfig.include || seedsConfig.include; - const maxPages = primarySeedConfig.limit ?? seedsConfig.limit; return html` ${this.renderSetting(msg("Primary Seed URL"), primarySeedUrl, true)} ${this.renderSetting( @@ -255,19 +309,6 @@ export class ConfigDetails extends LiteElement { : msg("None"), true )} - ${this.renderSetting( - msg("Max Pages"), - when( - maxPages, - () => msg(str`${maxPages} page(s)`), - () => - this.orgDefaults.maxPagesPerCrawl < Infinity - ? msg( - str`Maximum Allowed (${this.orgDefaults.maxPagesPerCrawl.toLocaleString()} pages)` - ) - : undefined - ) - )} `; }; @@ -292,7 +333,7 @@ export class ConfigDetails extends LiteElement { } else if (typeof value === "boolean") { content = value ? msg("Yes") : msg("No"); } else if (typeof value !== "number" && !value) { - content = html`${msg("Not specified")}`; } @@ -304,7 +345,6 @@ export class ConfigDetails extends LiteElement { } private async fetchAPIDefaults() { - const orgDefaults = { ...this.orgDefaults }; try { const resp = await fetch("/api/settings", { headers: { "Content-Type": "application/json" }, @@ -312,17 +352,22 @@ export class ConfigDetails extends LiteElement { if (!resp.ok) { throw new Error(resp.statusText); } + const orgDefaults = { + ...this.orgDefaults, + }; const data = await resp.json(); - if (data.defaultBehaviorTimeSeconds) { - orgDefaults.behaviorTimeoutMinutes = - data.defaultBehaviorTimeSeconds / 60; + if (data.defaultBehaviorTimeSeconds > 0) { + orgDefaults.behaviorTimeoutSeconds = data.defaultBehaviorTimeSeconds; + } + if (data.defaultPageLoadTimeSeconds > 0) { + orgDefaults.pageLoadTimeoutSeconds = data.defaultPageLoadTimeSeconds; } if (data.maxPagesPerCrawl > 0) { orgDefaults.maxPagesPerCrawl = data.maxPagesPerCrawl; } + this.orgDefaults = orgDefaults; } catch (e: any) { console.debug(e); } - this.orgDefaults = orgDefaults; } } diff --git a/frontend/src/pages/org/workflow-editor.ts b/frontend/src/pages/org/workflow-editor.ts index 688666d6..3b688169 100644 --- a/frontend/src/pages/org/workflow-editor.ts +++ b/frontend/src/pages/org/workflow-editor.ts @@ -16,8 +16,6 @@ import compact from "lodash/fp/compact"; import { mergeDeep } from "immutable"; import flow from "lodash/fp/flow"; import uniq from "lodash/fp/uniq"; -import RegexColorize from "regex-colorize"; -import ISO6391 from "iso-639-1"; import Fuse from "fuse.js"; import LiteElement, { html } from "../../utils/LiteElement"; @@ -78,7 +76,9 @@ type FormState = { includeLinkedPages: boolean; customIncludeUrlList: string; crawlTimeoutMinutes: number | null; - pageTimeoutMinutes: number | null; + behaviorTimeoutSeconds: number | null; + pageLoadTimeoutSeconds: number | null; + pageExtraDelaySeconds: number | null; scopeType: WorkflowParams["config"]["scopeType"]; exclusions: WorkflowParams["config"]["exclude"]; pageLimit: WorkflowParams["config"]["limit"]; @@ -99,6 +99,7 @@ type FormState = { browserProfile: Profile | null; tags: Tags; description: WorkflowParams["description"]; + disableAutoscrollBehavior: boolean; }; const getDefaultProgressState = (hasConfigId = false): ProgressState => { @@ -144,7 +145,9 @@ const getDefaultFormState = (): FormState => ({ includeLinkedPages: false, customIncludeUrlList: "", crawlTimeoutMinutes: null, - pageTimeoutMinutes: null, + behaviorTimeoutSeconds: null, + pageLoadTimeoutSeconds: null, + pageExtraDelaySeconds: null, scopeType: "host", exclusions: [], pageLimit: undefined, @@ -165,6 +168,7 @@ const getDefaultFormState = (): FormState => ({ browserProfile: null, tags: [], description: null, + disableAutoscrollBehavior: false, }); const defaultProgressState = getDefaultProgressState(); const orderedTabNames = STEPS.filter( @@ -191,8 +195,12 @@ const urlListToArray = flow( (str: string) => (str.length ? str.trim().split(/\s+/g) : []), trimArray ); -const DEFAULT_BEHAVIOR_TIMEOUT_MINUTES = 5; -const DEFAULT_MAX_PAGES_PER_CRAWL = Infinity; +const DEFAULT_BEHAVIORS = [ + "autoscroll", + "autoplay", + "autofetch", + "siteSpecific", +]; @localized() export class CrawlConfigEditor extends LiteElement { @@ -221,9 +229,10 @@ export class CrawlConfigEditor extends LiteElement { private progressState!: ProgressState; @state() - private orgDefaults = { - behaviorTimeoutMinutes: DEFAULT_BEHAVIOR_TIMEOUT_MINUTES, - maxPagesPerCrawl: DEFAULT_MAX_PAGES_PER_CRAWL, + private orgDefaults?: { + behaviorTimeoutSeconds?: number; + pageLoadTimeoutSeconds?: number; + maxPagesPerCrawl?: number; }; @state() @@ -394,8 +403,9 @@ export class CrawlConfigEditor extends LiteElement { return null; } - private getInitialFormState(): FormState | {} { - if (!this.initialWorkflow) return {}; + private getInitialFormState(): FormState { + const defaultFormState = getDefaultFormState(); + if (!this.initialWorkflow) return defaultFormState; const formState: Partial = {}; const seedsConfig = this.initialWorkflow.config; const { seeds } = seedsConfig; @@ -456,36 +466,46 @@ export class CrawlConfigEditor extends LiteElement { if (this.initialWorkflow.tags?.length) { formState.tags = this.initialWorkflow.tags; } - if (typeof this.initialWorkflow.crawlTimeout === "number") { - formState.crawlTimeoutMinutes = this.initialWorkflow.crawlTimeout / 60; - } - if (typeof seedsConfig.behaviorTimeout === "number") { - formState.pageTimeoutMinutes = seedsConfig.behaviorTimeout / 60; - } + const secondsToMinutes = (value: any, fallback: number | null) => { + if (typeof value === "number" && value > 0) return value / 60; + return fallback; + }; return { - primarySeedUrl: "", - urlList: "", - customIncludeUrlList: "", - crawlTimeoutMinutes: null, - pageTimeoutMinutes: null, + primarySeedUrl: defaultFormState.primarySeedUrl, + urlList: defaultFormState.urlList, + customIncludeUrlList: defaultFormState.customIncludeUrlList, + crawlTimeoutMinutes: secondsToMinutes( + this.initialWorkflow.crawlTimeout, + defaultFormState.crawlTimeoutMinutes + ), + behaviorTimeoutSeconds: + seedsConfig.behaviorTimeout ?? defaultFormState.behaviorTimeoutSeconds, + pageLoadTimeoutSeconds: + seedsConfig.pageLoadTimeout ?? defaultFormState.pageLoadTimeoutSeconds, + pageExtraDelaySeconds: + seedsConfig.pageExtraDelay ?? defaultFormState.pageExtraDelaySeconds, scale: this.initialWorkflow.scale, blockAds: this.initialWorkflow.config.blockAds, lang: this.initialWorkflow.config.lang, - scheduleType: "none", - runNow: false, + scheduleType: defaultFormState.scheduleType, + scheduleFrequency: defaultFormState.scheduleFrequency, + runNow: defaultFormState.runNow, tags: this.initialWorkflow.tags, - jobName: this.initialWorkflow.name || "", + jobName: this.initialWorkflow.name || defaultFormState.jobName, description: this.initialWorkflow.description, browserProfile: this.initialWorkflow.profileid ? ({ id: this.initialWorkflow.profileid } as Profile) - : null, + : defaultFormState.browserProfile, scopeType: primarySeedConfig.scopeType as FormState["scopeType"], exclusions: seedsConfig.exclude, - includeLinkedPages: Boolean( - primarySeedConfig.extraHops || seedsConfig.extraHops - ), - pageLimit: this.initialWorkflow.config.limit ?? undefined, + includeLinkedPages: + Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true, + pageLimit: + this.initialWorkflow.config.limit ?? defaultFormState.pageLimit, + disableAutoscrollBehavior: this.initialWorkflow.config.behaviors + ? !this.initialWorkflow.config.behaviors.includes("autoscroll") + : defaultFormState.disableAutoscrollBehavior, ...formState, }; } @@ -1154,7 +1174,100 @@ https://archiveweb.page/images/${"logo.svg"}`} urlListToArray(this.formState.urlList).length + (this.jobType === "seed-crawl" ? 1 : 0) ); + const onInputMinMax = async (e: CustomEvent) => { + const inputEl = e.target as SlInput; + await inputEl.updateComplete; + let helpText = ""; + if (inputEl.invalid) { + const value = +inputEl.value; + const min = inputEl.min; + const max = inputEl.max; + if (min && value < +min) { + helpText = msg( + str`Must be more than minimum of ${(+min).toLocaleString()}` + ); + } else if (max && value > +max) { + helpText = msg( + str`Must be less than maximum of ${(+max).toLocaleString()}` + ); + } + } + inputEl.helpText = helpText; + }; return html` + ${this.renderSectionHeading(msg("Limit Per Page"))} + ${this.renderFormCol(html` + + ${msg("seconds")} + + `)} + ${this.renderHelpTextCol( + msg( + `Limits amount of time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.` + ) + )} + ${this.renderFormCol(html` + + ${msg("seconds")} + + `)} + ${this.renderHelpTextCol( + msg(`Limits how long behaviors can run on each page.`) + )} + ${this.renderFormCol(html` + ${msg("Disable Auto-Scroll Behavior")} + `)} + ${this.renderHelpTextCol( + msg( + `Prevents browser from automatically scrolling until the end of the page.` + ), + false + )} + ${this.renderFormCol(html` + + ${msg("seconds")} + + `)} + ${this.renderHelpTextCol( + msg( + `Waits on the page after behaviors are complete before moving onto the next page. Can be helpful for rate limiting.` + ) + )} + ${this.renderSectionHeading(msg("Limit Per Crawl"))} ${this.renderFormCol(html` { - const inputEl = e.target as SlInput; - await inputEl.updateComplete; - let helpText = ""; - if (inputEl.invalid) { - const value = +inputEl.value; - if (value < minPages) { - helpText = - minPages === 1 - ? msg( - str`Minimum ${minPages.toLocaleString()} page per crawl` - ) - : msg( - str`Minimum ${minPages.toLocaleString()} pages per crawl` - ); - } else if (value > this.orgDefaults.maxPagesPerCrawl) { - helpText = msg( - str`Maximum ${this.orgDefaults.maxPagesPerCrawl.toLocaleString()} pages per crawl` - ); - } - } - inputEl.helpText = helpText; - }} + max=${ifDefined( + this.orgDefaults?.maxPagesPerCrawl && + this.orgDefaults.maxPagesPerCrawl < Infinity + ? this.orgDefaults.maxPagesPerCrawl + : undefined + )} + placeholder=${this.orgDefaults?.maxPagesPerCrawl + ? this.orgDefaults.maxPagesPerCrawl === Infinity + ? msg("Default: Unlimited") + : msg( + str`Default: ${this.orgDefaults.maxPagesPerCrawl.toLocaleString()}` + ) + : ""} + @sl-input=${onInputMinMax} > ${msg("pages")} @@ -1214,33 +1312,12 @@ https://archiveweb.page/images/${"logo.svg"}`} msg(`Adds a hard limit on the number of pages that will be crawled.`) )} - ${this.renderFormCol(html` - - ${msg("minutes")} - - `)} - ${this.renderHelpTextCol( - msg(`Adds a hard time limit for how long the crawler can spend on a - single webpage.`) - )} ${this.renderFormCol(html` @@ -1549,8 +1626,8 @@ https://archiveweb.page/images/${"logo.svg"}`} ? msg( "There are issues with this Workflow. Please go through previous steps and fix all issues to continue." ) - : msg(html`There is an issue with this Workflow:

Crawl - URL(s) required in + : msg(html`There is an issue with this Workflow:

Crawl URL(s) + required in Crawl Setup.

@@ -1953,21 +2030,27 @@ https://archiveweb.page/images/${"logo.svg"}`} ...(this.jobType === "seed-crawl" ? this.parseSeededConfig() : this.parseUrlListConfig()), - behaviorTimeout: - (this.formState.pageTimeoutMinutes ?? - this.orgDefaults.behaviorTimeoutMinutes ?? - DEFAULT_BEHAVIOR_TIMEOUT_MINUTES) * 60, + behaviorTimeout: +(this.formState.behaviorTimeoutSeconds || ""), + pageLoadTimeout: +(this.formState.pageLoadTimeoutSeconds || ""), + pageExtraDelay: +(this.formState.pageExtraDelaySeconds || ""), limit: this.formState.pageLimit ? +this.formState.pageLimit : undefined, lang: this.formState.lang || "", blockAds: this.formState.blockAds, exclude: trimArray(this.formState.exclusions), + behaviors: (this.formState.disableAutoscrollBehavior + ? DEFAULT_BEHAVIORS.slice(1) + : DEFAULT_BEHAVIORS + ).join(","), }, }; return config; } - private parseUrlListConfig(): NewCrawlConfigParams["config"] { + private parseUrlListConfig(): Pick< + NewCrawlConfigParams["config"], + "seeds" | "scopeType" | "extraHops" + > { const config = { seeds: urlListToArray(this.formState.urlList).map((seedUrl) => { const newSeed: Seed = { url: seedUrl, scopeType: "page" }; @@ -1980,7 +2063,10 @@ https://archiveweb.page/images/${"logo.svg"}`} return config; } - private parseSeededConfig(): NewCrawlConfigParams["config"] { + private parseSeededConfig(): Pick< + NewCrawlConfigParams["config"], + "seeds" | "scopeType" + > { const primarySeedUrl = this.formState.primarySeedUrl; const includeUrlList = this.formState.customIncludeUrlList ? urlListToArray(this.formState.customIncludeUrlList) @@ -2003,7 +2089,7 @@ https://archiveweb.page/images/${"logo.svg"}`} : [], extraHops: this.formState.includeLinkedPages ? 1 : 0, }; - const config: SeedConfig = { + const config = { seeds: [primarySeed, ...additionalSeedUrlList], scopeType: additionalSeedUrlList.length ? "page" @@ -2043,7 +2129,6 @@ https://archiveweb.page/images/${"logo.svg"}`} } private async fetchAPIDefaults() { - const orgDefaults = { ...this.orgDefaults }; try { const resp = await fetch("/api/settings", { headers: { "Content-Type": "application/json" }, @@ -2051,18 +2136,23 @@ https://archiveweb.page/images/${"logo.svg"}`} if (!resp.ok) { throw new Error(resp.statusText); } + const orgDefaults = { + ...this.orgDefaults, + }; const data = await resp.json(); - if (data.defaultBehaviorTimeSeconds) { - orgDefaults.behaviorTimeoutMinutes = - data.defaultBehaviorTimeSeconds / 60; + if (data.defaultBehaviorTimeSeconds > 0) { + orgDefaults.behaviorTimeoutSeconds = data.defaultBehaviorTimeSeconds; + } + if (data.defaultPageLoadTimeSeconds > 0) { + orgDefaults.pageLoadTimeoutSeconds = data.defaultPageLoadTimeSeconds; } if (data.maxPagesPerCrawl > 0) { orgDefaults.maxPagesPerCrawl = data.maxPagesPerCrawl; } + this.orgDefaults = orgDefaults; } catch (e: any) { console.debug(e); } - this.orgDefaults = orgDefaults; } } diff --git a/frontend/src/pages/org/workflows-new.ts b/frontend/src/pages/org/workflows-new.ts index 4fd46d42..420a6600 100644 --- a/frontend/src/pages/org/workflows-new.ts +++ b/frontend/src/pages/org/workflows-new.ts @@ -20,6 +20,9 @@ const defaultValue = { seeds: [], scopeType: "prefix", exclude: [""], + behaviorTimeout: null, + pageLoadTimeout: null, + pageExtraDelay: null, }, tags: [], crawlTimeout: null, diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts index c3d6f08c..a5303c62 100644 --- a/frontend/src/types/crawler.ts +++ b/frontend/src/types/crawler.ts @@ -20,10 +20,12 @@ export type SeedConfig = Pick< Seed, "scopeType" | "include" | "exclude" | "limit" | "extraHops" > & { - seeds: (Seed)[]; + seeds: Seed[]; lang?: string | null; blockAds?: boolean; - behaviorTimeout?: number | null; + behaviorTimeout: number | null; + pageLoadTimeout: number | null; + pageExtraDelay: number | null; behaviors?: string | null; extraHops?: number | null; };