Allow users to set additional page time limits (#744)

This commit is contained in:
sua yoo 2023-04-05 20:06:46 -07:00 committed by GitHub
parent 72967a0381
commit 91c2c1ad62
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 270 additions and 130 deletions

View File

@ -8,6 +8,7 @@ import ISO6391 from "iso-639-1";
import LiteElement, { html } from "../utils/LiteElement";
import type { CrawlConfig, Seed, SeedConfig } from "../pages/org/types";
import { humanizeSchedule } from "../utils/cron";
import { RelativeDuration } from "./relative-duration";
/**
* Usage:
@ -30,9 +31,10 @@ export class ConfigDetails extends LiteElement {
hideTags = false;
@state()
private orgDefaults = {
behaviorTimeoutMinutes: Infinity,
maxPagesPerCrawl: Infinity,
private orgDefaults?: {
pageLoadTimeoutSeconds?: number;
behaviorTimeoutSeconds?: number;
maxPagesPerCrawl?: number;
};
private readonly scopeTypeLabels: Record<
@ -55,7 +57,25 @@ export class ConfigDetails extends LiteElement {
render() {
const crawlConfig = this.crawlConfig;
const exclusions = crawlConfig?.config.exclude || [];
const seedsConfig = crawlConfig?.config;
const exclusions = seedsConfig?.exclude || [];
const maxPages = seedsConfig?.seeds[0]?.limit ?? seedsConfig?.limit;
const renderTimeLimit = (
valueSeconds?: number | null,
fallbackValue?: number
) =>
valueSeconds
? RelativeDuration.humanize(valueSeconds * 1000, { verbose: true })
: typeof fallbackValue === "number"
? html`<span class="text-neutral-400"
>${fallbackValue === Infinity
? msg("Unlimited")
: RelativeDuration.humanize(fallbackValue * 1000, {
verbose: true,
})}
${msg("(default)")}</span
>`
: undefined;
return html`
<section id="crawler-settings" class="mb-8">
@ -85,16 +105,51 @@ export class ConfigDetails extends LiteElement {
() => this.renderSetting(msg("Exclusions"), msg("None"))
)}
${this.renderSetting(
msg("Page Time Limit"),
crawlConfig?.config.behaviorTimeout
? msg(str`${crawlConfig?.config.behaviorTimeout / 60} minute(s)`)
: msg("None")
msg("Max Pages"),
when(
maxPages,
() => msg(str`${maxPages!.toLocaleString()} pages`),
() =>
this.orgDefaults?.maxPagesPerCrawl
? html`<span class="text-neutral-400"
>${msg(
str`${this.orgDefaults.maxPagesPerCrawl.toLocaleString()} pages`
)}
${msg("(default)")}</span
>`
: undefined
)
)}
${this.renderSetting(
msg("Page Load Timeout"),
renderTimeLimit(
crawlConfig?.config.pageLoadTimeout,
this.orgDefaults?.pageLoadTimeoutSeconds ?? Infinity
)
)}
${this.renderSetting(
msg("Page Behavior Timeout"),
renderTimeLimit(
crawlConfig?.config.behaviorTimeout,
this.orgDefaults?.behaviorTimeoutSeconds ?? Infinity
)
)}
${this.renderSetting(
msg("Auto-Scroll Behavior"),
crawlConfig?.config.behaviors &&
!crawlConfig.config.behaviors.includes("autoscroll")
? msg("Disabled")
: html`<span class="text-neutral-400"
>${msg("Enabled (default)")}</span
>`
)}
${this.renderSetting(
msg("Delay Before Next Page"),
renderTimeLimit(crawlConfig?.config.pageExtraDelay, 0)
)}
${this.renderSetting(
msg("Crawl Time Limit"),
crawlConfig?.crawlTimeout
? msg(str`${crawlConfig?.crawlTimeout / 60} minute(s)`)
: msg("None")
renderTimeLimit(crawlConfig?.crawlTimeout, Infinity)
)}
${this.renderSetting(msg("Crawler Instances"), crawlConfig?.scale)}
</btrix-desc-list>
@ -212,10 +267,9 @@ export class ConfigDetails extends LiteElement {
const crawlConfig = this.crawlConfig!;
const seedsConfig = crawlConfig.config;
const additionalUrlList = seedsConfig.seeds.slice(1);
let primarySeedConfig: SeedConfig | Seed = seedsConfig;
let primarySeedUrl = seedsConfig.seeds[0].url;
const primarySeedConfig: SeedConfig | Seed = seedsConfig;
const primarySeedUrl = seedsConfig.seeds[0].url;
const includeUrlList = primarySeedConfig.include || seedsConfig.include;
const maxPages = primarySeedConfig.limit ?? seedsConfig.limit;
return html`
${this.renderSetting(msg("Primary Seed URL"), primarySeedUrl, true)}
${this.renderSetting(
@ -255,19 +309,6 @@ export class ConfigDetails extends LiteElement {
: msg("None"),
true
)}
${this.renderSetting(
msg("Max Pages"),
when(
maxPages,
() => msg(str`${maxPages} page(s)`),
() =>
this.orgDefaults.maxPagesPerCrawl < Infinity
? msg(
str`Maximum Allowed (${this.orgDefaults.maxPagesPerCrawl.toLocaleString()} pages)`
)
: undefined
)
)}
`;
};
@ -292,7 +333,7 @@ export class ConfigDetails extends LiteElement {
} else if (typeof value === "boolean") {
content = value ? msg("Yes") : msg("No");
} else if (typeof value !== "number" && !value) {
content = html`<span class="text-neutral-300"
content = html`<span class="text-neutral-400"
>${msg("Not specified")}</span
>`;
}
@ -304,7 +345,6 @@ export class ConfigDetails extends LiteElement {
}
private async fetchAPIDefaults() {
const orgDefaults = { ...this.orgDefaults };
try {
const resp = await fetch("/api/settings", {
headers: { "Content-Type": "application/json" },
@ -312,17 +352,22 @@ export class ConfigDetails extends LiteElement {
if (!resp.ok) {
throw new Error(resp.statusText);
}
const orgDefaults = {
...this.orgDefaults,
};
const data = await resp.json();
if (data.defaultBehaviorTimeSeconds) {
orgDefaults.behaviorTimeoutMinutes =
data.defaultBehaviorTimeSeconds / 60;
if (data.defaultBehaviorTimeSeconds > 0) {
orgDefaults.behaviorTimeoutSeconds = data.defaultBehaviorTimeSeconds;
}
if (data.defaultPageLoadTimeSeconds > 0) {
orgDefaults.pageLoadTimeoutSeconds = data.defaultPageLoadTimeSeconds;
}
if (data.maxPagesPerCrawl > 0) {
orgDefaults.maxPagesPerCrawl = data.maxPagesPerCrawl;
}
this.orgDefaults = orgDefaults;
} catch (e: any) {
console.debug(e);
}
this.orgDefaults = orgDefaults;
}
}

View File

@ -16,8 +16,6 @@ import compact from "lodash/fp/compact";
import { mergeDeep } from "immutable";
import flow from "lodash/fp/flow";
import uniq from "lodash/fp/uniq";
import RegexColorize from "regex-colorize";
import ISO6391 from "iso-639-1";
import Fuse from "fuse.js";
import LiteElement, { html } from "../../utils/LiteElement";
@ -78,7 +76,9 @@ type FormState = {
includeLinkedPages: boolean;
customIncludeUrlList: string;
crawlTimeoutMinutes: number | null;
pageTimeoutMinutes: number | null;
behaviorTimeoutSeconds: number | null;
pageLoadTimeoutSeconds: number | null;
pageExtraDelaySeconds: number | null;
scopeType: WorkflowParams["config"]["scopeType"];
exclusions: WorkflowParams["config"]["exclude"];
pageLimit: WorkflowParams["config"]["limit"];
@ -99,6 +99,7 @@ type FormState = {
browserProfile: Profile | null;
tags: Tags;
description: WorkflowParams["description"];
disableAutoscrollBehavior: boolean;
};
const getDefaultProgressState = (hasConfigId = false): ProgressState => {
@ -144,7 +145,9 @@ const getDefaultFormState = (): FormState => ({
includeLinkedPages: false,
customIncludeUrlList: "",
crawlTimeoutMinutes: null,
pageTimeoutMinutes: null,
behaviorTimeoutSeconds: null,
pageLoadTimeoutSeconds: null,
pageExtraDelaySeconds: null,
scopeType: "host",
exclusions: [],
pageLimit: undefined,
@ -165,6 +168,7 @@ const getDefaultFormState = (): FormState => ({
browserProfile: null,
tags: [],
description: null,
disableAutoscrollBehavior: false,
});
const defaultProgressState = getDefaultProgressState();
const orderedTabNames = STEPS.filter(
@ -191,8 +195,12 @@ const urlListToArray = flow(
(str: string) => (str.length ? str.trim().split(/\s+/g) : []),
trimArray
);
const DEFAULT_BEHAVIOR_TIMEOUT_MINUTES = 5;
const DEFAULT_MAX_PAGES_PER_CRAWL = Infinity;
const DEFAULT_BEHAVIORS = [
"autoscroll",
"autoplay",
"autofetch",
"siteSpecific",
];
@localized()
export class CrawlConfigEditor extends LiteElement {
@ -221,9 +229,10 @@ export class CrawlConfigEditor extends LiteElement {
private progressState!: ProgressState;
@state()
private orgDefaults = {
behaviorTimeoutMinutes: DEFAULT_BEHAVIOR_TIMEOUT_MINUTES,
maxPagesPerCrawl: DEFAULT_MAX_PAGES_PER_CRAWL,
private orgDefaults?: {
behaviorTimeoutSeconds?: number;
pageLoadTimeoutSeconds?: number;
maxPagesPerCrawl?: number;
};
@state()
@ -394,8 +403,9 @@ export class CrawlConfigEditor extends LiteElement {
return null;
}
private getInitialFormState(): FormState | {} {
if (!this.initialWorkflow) return {};
private getInitialFormState(): FormState {
const defaultFormState = getDefaultFormState();
if (!this.initialWorkflow) return defaultFormState;
const formState: Partial<FormState> = {};
const seedsConfig = this.initialWorkflow.config;
const { seeds } = seedsConfig;
@ -456,36 +466,46 @@ export class CrawlConfigEditor extends LiteElement {
if (this.initialWorkflow.tags?.length) {
formState.tags = this.initialWorkflow.tags;
}
if (typeof this.initialWorkflow.crawlTimeout === "number") {
formState.crawlTimeoutMinutes = this.initialWorkflow.crawlTimeout / 60;
}
if (typeof seedsConfig.behaviorTimeout === "number") {
formState.pageTimeoutMinutes = seedsConfig.behaviorTimeout / 60;
}
const secondsToMinutes = (value: any, fallback: number | null) => {
if (typeof value === "number" && value > 0) return value / 60;
return fallback;
};
return {
primarySeedUrl: "",
urlList: "",
customIncludeUrlList: "",
crawlTimeoutMinutes: null,
pageTimeoutMinutes: null,
primarySeedUrl: defaultFormState.primarySeedUrl,
urlList: defaultFormState.urlList,
customIncludeUrlList: defaultFormState.customIncludeUrlList,
crawlTimeoutMinutes: secondsToMinutes(
this.initialWorkflow.crawlTimeout,
defaultFormState.crawlTimeoutMinutes
),
behaviorTimeoutSeconds:
seedsConfig.behaviorTimeout ?? defaultFormState.behaviorTimeoutSeconds,
pageLoadTimeoutSeconds:
seedsConfig.pageLoadTimeout ?? defaultFormState.pageLoadTimeoutSeconds,
pageExtraDelaySeconds:
seedsConfig.pageExtraDelay ?? defaultFormState.pageExtraDelaySeconds,
scale: this.initialWorkflow.scale,
blockAds: this.initialWorkflow.config.blockAds,
lang: this.initialWorkflow.config.lang,
scheduleType: "none",
runNow: false,
scheduleType: defaultFormState.scheduleType,
scheduleFrequency: defaultFormState.scheduleFrequency,
runNow: defaultFormState.runNow,
tags: this.initialWorkflow.tags,
jobName: this.initialWorkflow.name || "",
jobName: this.initialWorkflow.name || defaultFormState.jobName,
description: this.initialWorkflow.description,
browserProfile: this.initialWorkflow.profileid
? ({ id: this.initialWorkflow.profileid } as Profile)
: null,
: defaultFormState.browserProfile,
scopeType: primarySeedConfig.scopeType as FormState["scopeType"],
exclusions: seedsConfig.exclude,
includeLinkedPages: Boolean(
primarySeedConfig.extraHops || seedsConfig.extraHops
),
pageLimit: this.initialWorkflow.config.limit ?? undefined,
includeLinkedPages:
Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true,
pageLimit:
this.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
disableAutoscrollBehavior: this.initialWorkflow.config.behaviors
? !this.initialWorkflow.config.behaviors.includes("autoscroll")
: defaultFormState.disableAutoscrollBehavior,
...formState,
};
}
@ -1154,7 +1174,100 @@ https://archiveweb.page/images/${"logo.svg"}`}
urlListToArray(this.formState.urlList).length +
(this.jobType === "seed-crawl" ? 1 : 0)
);
const onInputMinMax = async (e: CustomEvent) => {
const inputEl = e.target as SlInput;
await inputEl.updateComplete;
let helpText = "";
if (inputEl.invalid) {
const value = +inputEl.value;
const min = inputEl.min;
const max = inputEl.max;
if (min && value < +min) {
helpText = msg(
str`Must be more than minimum of ${(+min).toLocaleString()}`
);
} else if (max && value > +max) {
helpText = msg(
str`Must be less than maximum of ${(+max).toLocaleString()}`
);
}
}
inputEl.helpText = helpText;
};
return html`
${this.renderSectionHeading(msg("Limit Per Page"))}
${this.renderFormCol(html`
<sl-input
name="pageLoadTimeoutSeconds"
type="number"
label=${msg("Page Load Timeout")}
placeholder=${this.orgDefaults?.pageLoadTimeoutSeconds
? msg(
str`Default: ${this.orgDefaults.pageLoadTimeoutSeconds.toLocaleString()}`
)
: "Default: Unlimited"}
value=${ifDefined(this.formState.pageLoadTimeoutSeconds ?? undefined)}
min="0"
@sl-input=${onInputMinMax}
>
<span slot="suffix">${msg("seconds")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(
msg(
`Limits amount of time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.`
)
)}
${this.renderFormCol(html`
<sl-input
name="behaviorTimeoutSeconds"
type="number"
label=${msg("Behavior Timeout")}
placeholder=${this.orgDefaults?.behaviorTimeoutSeconds
? msg(
str`Default: ${this.orgDefaults.behaviorTimeoutSeconds.toLocaleString()}`
)
: msg("Unlimited")}
value=${ifDefined(this.formState.behaviorTimeoutSeconds ?? undefined)}
min="0"
@sl-input=${onInputMinMax}
>
<span slot="suffix">${msg("seconds")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(
msg(`Limits how long behaviors can run on each page.`)
)}
${this.renderFormCol(html`<sl-checkbox
name="disableAutoscrollBehavior"
?checked=${this.formState.disableAutoscrollBehavior}
>
${msg("Disable Auto-Scroll Behavior")}
</sl-checkbox>`)}
${this.renderHelpTextCol(
msg(
`Prevents browser from automatically scrolling until the end of the page.`
),
false
)}
${this.renderFormCol(html`
<sl-input
name="pageExtraDelaySeconds"
type="number"
label=${msg("Delay Before Next Page")}
placeholder=${"Default: 0"}
value=${ifDefined(this.formState.pageExtraDelaySeconds ?? undefined)}
min="0"
>
<span slot="suffix">${msg("seconds")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(
msg(
`Waits on the page after behaviors are complete before moving onto the next page. Can be helpful for rate limiting.`
)
)}
${this.renderSectionHeading(msg("Limit Per Crawl"))}
${this.renderFormCol(html`
<sl-mutation-observer
attr="min"
@ -1176,35 +1289,20 @@ https://archiveweb.page/images/${"logo.svg"}`}
type="number"
value=${this.formState.pageLimit || ""}
min=${minPages}
max=${this.orgDefaults.maxPagesPerCrawl}
placeholder=${this.orgDefaults.maxPagesPerCrawl === Infinity
? msg("Unlimited")
: msg(
str`Maximum Allowed (${this.orgDefaults.maxPagesPerCrawl.toLocaleString()})`
)}
@sl-input=${async (e: CustomEvent) => {
const inputEl = e.target as SlInput;
await inputEl.updateComplete;
let helpText = "";
if (inputEl.invalid) {
const value = +inputEl.value;
if (value < minPages) {
helpText =
minPages === 1
? msg(
str`Minimum ${minPages.toLocaleString()} page per crawl`
)
: msg(
str`Minimum ${minPages.toLocaleString()} pages per crawl`
);
} else if (value > this.orgDefaults.maxPagesPerCrawl) {
helpText = msg(
str`Maximum ${this.orgDefaults.maxPagesPerCrawl.toLocaleString()} pages per crawl`
);
}
}
inputEl.helpText = helpText;
}}
max=${ifDefined(
this.orgDefaults?.maxPagesPerCrawl &&
this.orgDefaults.maxPagesPerCrawl < Infinity
? this.orgDefaults.maxPagesPerCrawl
: undefined
)}
placeholder=${this.orgDefaults?.maxPagesPerCrawl
? this.orgDefaults.maxPagesPerCrawl === Infinity
? msg("Default: Unlimited")
: msg(
str`Default: ${this.orgDefaults.maxPagesPerCrawl.toLocaleString()}`
)
: ""}
@sl-input=${onInputMinMax}
>
<span slot="suffix">${msg("pages")}</span>
</sl-input>
@ -1214,33 +1312,12 @@ https://archiveweb.page/images/${"logo.svg"}`}
msg(`Adds a hard limit on the number of pages
that will be crawled.`)
)}
${this.renderFormCol(html`
<sl-input
name="pageTimeoutMinutes"
type="number"
label=${msg("Page Time Limit")}
placeholder=${msg("Unlimited")}
value=${ifDefined(
this.formState.pageTimeoutMinutes ??
this.orgDefaults.behaviorTimeoutMinutes
)}
?disabled=${this.orgDefaults.behaviorTimeoutMinutes === undefined}
min="1"
required
>
<span slot="suffix">${msg("minutes")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(
msg(`Adds a hard time limit for how long the crawler can spend on a
single webpage.`)
)}
${this.renderFormCol(html`
<sl-input
name="crawlTimeoutMinutes"
label=${msg("Crawl Time Limit")}
value=${this.formState.crawlTimeoutMinutes || ""}
placeholder=${msg("Unlimited")}
placeholder=${msg("Default: Unlimited")}
min="0"
type="number"
>
@ -1549,8 +1626,8 @@ https://archiveweb.page/images/${"logo.svg"}`}
? msg(
"There are issues with this Workflow. Please go through previous steps and fix all issues to continue."
)
: msg(html`There is an issue with this Workflow:<br /><br />Crawl
URL(s) required in
: msg(html`There is an issue with this Workflow:<br /><br />Crawl URL(s)
required in
<a href="${crawlSetupUrl}" class="bold underline hover:no-underline"
>Crawl Setup</a
>. <br /><br />
@ -1953,21 +2030,27 @@ https://archiveweb.page/images/${"logo.svg"}`}
...(this.jobType === "seed-crawl"
? this.parseSeededConfig()
: this.parseUrlListConfig()),
behaviorTimeout:
(this.formState.pageTimeoutMinutes ??
this.orgDefaults.behaviorTimeoutMinutes ??
DEFAULT_BEHAVIOR_TIMEOUT_MINUTES) * 60,
behaviorTimeout: +(this.formState.behaviorTimeoutSeconds || ""),
pageLoadTimeout: +(this.formState.pageLoadTimeoutSeconds || ""),
pageExtraDelay: +(this.formState.pageExtraDelaySeconds || ""),
limit: this.formState.pageLimit ? +this.formState.pageLimit : undefined,
lang: this.formState.lang || "",
blockAds: this.formState.blockAds,
exclude: trimArray(this.formState.exclusions),
behaviors: (this.formState.disableAutoscrollBehavior
? DEFAULT_BEHAVIORS.slice(1)
: DEFAULT_BEHAVIORS
).join(","),
},
};
return config;
}
private parseUrlListConfig(): NewCrawlConfigParams["config"] {
private parseUrlListConfig(): Pick<
NewCrawlConfigParams["config"],
"seeds" | "scopeType" | "extraHops"
> {
const config = {
seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
const newSeed: Seed = { url: seedUrl, scopeType: "page" };
@ -1980,7 +2063,10 @@ https://archiveweb.page/images/${"logo.svg"}`}
return config;
}
private parseSeededConfig(): NewCrawlConfigParams["config"] {
private parseSeededConfig(): Pick<
NewCrawlConfigParams["config"],
"seeds" | "scopeType"
> {
const primarySeedUrl = this.formState.primarySeedUrl;
const includeUrlList = this.formState.customIncludeUrlList
? urlListToArray(this.formState.customIncludeUrlList)
@ -2003,7 +2089,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
: [],
extraHops: this.formState.includeLinkedPages ? 1 : 0,
};
const config: SeedConfig = {
const config = {
seeds: [primarySeed, ...additionalSeedUrlList],
scopeType: additionalSeedUrlList.length
? "page"
@ -2043,7 +2129,6 @@ https://archiveweb.page/images/${"logo.svg"}`}
}
private async fetchAPIDefaults() {
const orgDefaults = { ...this.orgDefaults };
try {
const resp = await fetch("/api/settings", {
headers: { "Content-Type": "application/json" },
@ -2051,18 +2136,23 @@ https://archiveweb.page/images/${"logo.svg"}`}
if (!resp.ok) {
throw new Error(resp.statusText);
}
const orgDefaults = {
...this.orgDefaults,
};
const data = await resp.json();
if (data.defaultBehaviorTimeSeconds) {
orgDefaults.behaviorTimeoutMinutes =
data.defaultBehaviorTimeSeconds / 60;
if (data.defaultBehaviorTimeSeconds > 0) {
orgDefaults.behaviorTimeoutSeconds = data.defaultBehaviorTimeSeconds;
}
if (data.defaultPageLoadTimeSeconds > 0) {
orgDefaults.pageLoadTimeoutSeconds = data.defaultPageLoadTimeSeconds;
}
if (data.maxPagesPerCrawl > 0) {
orgDefaults.maxPagesPerCrawl = data.maxPagesPerCrawl;
}
this.orgDefaults = orgDefaults;
} catch (e: any) {
console.debug(e);
}
this.orgDefaults = orgDefaults;
}
}

View File

@ -20,6 +20,9 @@ const defaultValue = {
seeds: [],
scopeType: "prefix",
exclude: [""],
behaviorTimeout: null,
pageLoadTimeout: null,
pageExtraDelay: null,
},
tags: [],
crawlTimeout: null,

View File

@ -20,10 +20,12 @@ export type SeedConfig = Pick<
Seed,
"scopeType" | "include" | "exclude" | "limit" | "extraHops"
> & {
seeds: (Seed)[];
seeds: Seed[];
lang?: string | null;
blockAds?: boolean;
behaviorTimeout?: number | null;
behaviorTimeout: number | null;
pageLoadTimeout: number | null;
pageExtraDelay: number | null;
behaviors?: string | null;
extraHops?: number | null;
};