Allow users to set additional page time limits (#744)
This commit is contained in:
parent
72967a0381
commit
91c2c1ad62
@ -8,6 +8,7 @@ import ISO6391 from "iso-639-1";
|
||||
import LiteElement, { html } from "../utils/LiteElement";
|
||||
import type { CrawlConfig, Seed, SeedConfig } from "../pages/org/types";
|
||||
import { humanizeSchedule } from "../utils/cron";
|
||||
import { RelativeDuration } from "./relative-duration";
|
||||
|
||||
/**
|
||||
* Usage:
|
||||
@ -30,9 +31,10 @@ export class ConfigDetails extends LiteElement {
|
||||
hideTags = false;
|
||||
|
||||
@state()
|
||||
private orgDefaults = {
|
||||
behaviorTimeoutMinutes: Infinity,
|
||||
maxPagesPerCrawl: Infinity,
|
||||
private orgDefaults?: {
|
||||
pageLoadTimeoutSeconds?: number;
|
||||
behaviorTimeoutSeconds?: number;
|
||||
maxPagesPerCrawl?: number;
|
||||
};
|
||||
|
||||
private readonly scopeTypeLabels: Record<
|
||||
@ -55,7 +57,25 @@ export class ConfigDetails extends LiteElement {
|
||||
|
||||
render() {
|
||||
const crawlConfig = this.crawlConfig;
|
||||
const exclusions = crawlConfig?.config.exclude || [];
|
||||
const seedsConfig = crawlConfig?.config;
|
||||
const exclusions = seedsConfig?.exclude || [];
|
||||
const maxPages = seedsConfig?.seeds[0]?.limit ?? seedsConfig?.limit;
|
||||
const renderTimeLimit = (
|
||||
valueSeconds?: number | null,
|
||||
fallbackValue?: number
|
||||
) =>
|
||||
valueSeconds
|
||||
? RelativeDuration.humanize(valueSeconds * 1000, { verbose: true })
|
||||
: typeof fallbackValue === "number"
|
||||
? html`<span class="text-neutral-400"
|
||||
>${fallbackValue === Infinity
|
||||
? msg("Unlimited")
|
||||
: RelativeDuration.humanize(fallbackValue * 1000, {
|
||||
verbose: true,
|
||||
})}
|
||||
${msg("(default)")}</span
|
||||
>`
|
||||
: undefined;
|
||||
|
||||
return html`
|
||||
<section id="crawler-settings" class="mb-8">
|
||||
@ -85,16 +105,51 @@ export class ConfigDetails extends LiteElement {
|
||||
() => this.renderSetting(msg("Exclusions"), msg("None"))
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Page Time Limit"),
|
||||
crawlConfig?.config.behaviorTimeout
|
||||
? msg(str`${crawlConfig?.config.behaviorTimeout / 60} minute(s)`)
|
||||
: msg("None")
|
||||
msg("Max Pages"),
|
||||
when(
|
||||
maxPages,
|
||||
() => msg(str`${maxPages!.toLocaleString()} pages`),
|
||||
() =>
|
||||
this.orgDefaults?.maxPagesPerCrawl
|
||||
? html`<span class="text-neutral-400"
|
||||
>${msg(
|
||||
str`${this.orgDefaults.maxPagesPerCrawl.toLocaleString()} pages`
|
||||
)}
|
||||
${msg("(default)")}</span
|
||||
>`
|
||||
: undefined
|
||||
)
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Page Load Timeout"),
|
||||
renderTimeLimit(
|
||||
crawlConfig?.config.pageLoadTimeout,
|
||||
this.orgDefaults?.pageLoadTimeoutSeconds ?? Infinity
|
||||
)
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Page Behavior Timeout"),
|
||||
renderTimeLimit(
|
||||
crawlConfig?.config.behaviorTimeout,
|
||||
this.orgDefaults?.behaviorTimeoutSeconds ?? Infinity
|
||||
)
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Auto-Scroll Behavior"),
|
||||
crawlConfig?.config.behaviors &&
|
||||
!crawlConfig.config.behaviors.includes("autoscroll")
|
||||
? msg("Disabled")
|
||||
: html`<span class="text-neutral-400"
|
||||
>${msg("Enabled (default)")}</span
|
||||
>`
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Delay Before Next Page"),
|
||||
renderTimeLimit(crawlConfig?.config.pageExtraDelay, 0)
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Crawl Time Limit"),
|
||||
crawlConfig?.crawlTimeout
|
||||
? msg(str`${crawlConfig?.crawlTimeout / 60} minute(s)`)
|
||||
: msg("None")
|
||||
renderTimeLimit(crawlConfig?.crawlTimeout, Infinity)
|
||||
)}
|
||||
${this.renderSetting(msg("Crawler Instances"), crawlConfig?.scale)}
|
||||
</btrix-desc-list>
|
||||
@ -212,10 +267,9 @@ export class ConfigDetails extends LiteElement {
|
||||
const crawlConfig = this.crawlConfig!;
|
||||
const seedsConfig = crawlConfig.config;
|
||||
const additionalUrlList = seedsConfig.seeds.slice(1);
|
||||
let primarySeedConfig: SeedConfig | Seed = seedsConfig;
|
||||
let primarySeedUrl = seedsConfig.seeds[0].url;
|
||||
const primarySeedConfig: SeedConfig | Seed = seedsConfig;
|
||||
const primarySeedUrl = seedsConfig.seeds[0].url;
|
||||
const includeUrlList = primarySeedConfig.include || seedsConfig.include;
|
||||
const maxPages = primarySeedConfig.limit ?? seedsConfig.limit;
|
||||
return html`
|
||||
${this.renderSetting(msg("Primary Seed URL"), primarySeedUrl, true)}
|
||||
${this.renderSetting(
|
||||
@ -255,19 +309,6 @@ export class ConfigDetails extends LiteElement {
|
||||
: msg("None"),
|
||||
true
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Max Pages"),
|
||||
when(
|
||||
maxPages,
|
||||
() => msg(str`${maxPages} page(s)`),
|
||||
() =>
|
||||
this.orgDefaults.maxPagesPerCrawl < Infinity
|
||||
? msg(
|
||||
str`Maximum Allowed (${this.orgDefaults.maxPagesPerCrawl.toLocaleString()} pages)`
|
||||
)
|
||||
: undefined
|
||||
)
|
||||
)}
|
||||
`;
|
||||
};
|
||||
|
||||
@ -292,7 +333,7 @@ export class ConfigDetails extends LiteElement {
|
||||
} else if (typeof value === "boolean") {
|
||||
content = value ? msg("Yes") : msg("No");
|
||||
} else if (typeof value !== "number" && !value) {
|
||||
content = html`<span class="text-neutral-300"
|
||||
content = html`<span class="text-neutral-400"
|
||||
>${msg("Not specified")}</span
|
||||
>`;
|
||||
}
|
||||
@ -304,7 +345,6 @@ export class ConfigDetails extends LiteElement {
|
||||
}
|
||||
|
||||
private async fetchAPIDefaults() {
|
||||
const orgDefaults = { ...this.orgDefaults };
|
||||
try {
|
||||
const resp = await fetch("/api/settings", {
|
||||
headers: { "Content-Type": "application/json" },
|
||||
@ -312,17 +352,22 @@ export class ConfigDetails extends LiteElement {
|
||||
if (!resp.ok) {
|
||||
throw new Error(resp.statusText);
|
||||
}
|
||||
const orgDefaults = {
|
||||
...this.orgDefaults,
|
||||
};
|
||||
const data = await resp.json();
|
||||
if (data.defaultBehaviorTimeSeconds) {
|
||||
orgDefaults.behaviorTimeoutMinutes =
|
||||
data.defaultBehaviorTimeSeconds / 60;
|
||||
if (data.defaultBehaviorTimeSeconds > 0) {
|
||||
orgDefaults.behaviorTimeoutSeconds = data.defaultBehaviorTimeSeconds;
|
||||
}
|
||||
if (data.defaultPageLoadTimeSeconds > 0) {
|
||||
orgDefaults.pageLoadTimeoutSeconds = data.defaultPageLoadTimeSeconds;
|
||||
}
|
||||
if (data.maxPagesPerCrawl > 0) {
|
||||
orgDefaults.maxPagesPerCrawl = data.maxPagesPerCrawl;
|
||||
}
|
||||
this.orgDefaults = orgDefaults;
|
||||
} catch (e: any) {
|
||||
console.debug(e);
|
||||
}
|
||||
this.orgDefaults = orgDefaults;
|
||||
}
|
||||
}
|
||||
|
@ -16,8 +16,6 @@ import compact from "lodash/fp/compact";
|
||||
import { mergeDeep } from "immutable";
|
||||
import flow from "lodash/fp/flow";
|
||||
import uniq from "lodash/fp/uniq";
|
||||
import RegexColorize from "regex-colorize";
|
||||
import ISO6391 from "iso-639-1";
|
||||
import Fuse from "fuse.js";
|
||||
|
||||
import LiteElement, { html } from "../../utils/LiteElement";
|
||||
@ -78,7 +76,9 @@ type FormState = {
|
||||
includeLinkedPages: boolean;
|
||||
customIncludeUrlList: string;
|
||||
crawlTimeoutMinutes: number | null;
|
||||
pageTimeoutMinutes: number | null;
|
||||
behaviorTimeoutSeconds: number | null;
|
||||
pageLoadTimeoutSeconds: number | null;
|
||||
pageExtraDelaySeconds: number | null;
|
||||
scopeType: WorkflowParams["config"]["scopeType"];
|
||||
exclusions: WorkflowParams["config"]["exclude"];
|
||||
pageLimit: WorkflowParams["config"]["limit"];
|
||||
@ -99,6 +99,7 @@ type FormState = {
|
||||
browserProfile: Profile | null;
|
||||
tags: Tags;
|
||||
description: WorkflowParams["description"];
|
||||
disableAutoscrollBehavior: boolean;
|
||||
};
|
||||
|
||||
const getDefaultProgressState = (hasConfigId = false): ProgressState => {
|
||||
@ -144,7 +145,9 @@ const getDefaultFormState = (): FormState => ({
|
||||
includeLinkedPages: false,
|
||||
customIncludeUrlList: "",
|
||||
crawlTimeoutMinutes: null,
|
||||
pageTimeoutMinutes: null,
|
||||
behaviorTimeoutSeconds: null,
|
||||
pageLoadTimeoutSeconds: null,
|
||||
pageExtraDelaySeconds: null,
|
||||
scopeType: "host",
|
||||
exclusions: [],
|
||||
pageLimit: undefined,
|
||||
@ -165,6 +168,7 @@ const getDefaultFormState = (): FormState => ({
|
||||
browserProfile: null,
|
||||
tags: [],
|
||||
description: null,
|
||||
disableAutoscrollBehavior: false,
|
||||
});
|
||||
const defaultProgressState = getDefaultProgressState();
|
||||
const orderedTabNames = STEPS.filter(
|
||||
@ -191,8 +195,12 @@ const urlListToArray = flow(
|
||||
(str: string) => (str.length ? str.trim().split(/\s+/g) : []),
|
||||
trimArray
|
||||
);
|
||||
const DEFAULT_BEHAVIOR_TIMEOUT_MINUTES = 5;
|
||||
const DEFAULT_MAX_PAGES_PER_CRAWL = Infinity;
|
||||
const DEFAULT_BEHAVIORS = [
|
||||
"autoscroll",
|
||||
"autoplay",
|
||||
"autofetch",
|
||||
"siteSpecific",
|
||||
];
|
||||
|
||||
@localized()
|
||||
export class CrawlConfigEditor extends LiteElement {
|
||||
@ -221,9 +229,10 @@ export class CrawlConfigEditor extends LiteElement {
|
||||
private progressState!: ProgressState;
|
||||
|
||||
@state()
|
||||
private orgDefaults = {
|
||||
behaviorTimeoutMinutes: DEFAULT_BEHAVIOR_TIMEOUT_MINUTES,
|
||||
maxPagesPerCrawl: DEFAULT_MAX_PAGES_PER_CRAWL,
|
||||
private orgDefaults?: {
|
||||
behaviorTimeoutSeconds?: number;
|
||||
pageLoadTimeoutSeconds?: number;
|
||||
maxPagesPerCrawl?: number;
|
||||
};
|
||||
|
||||
@state()
|
||||
@ -394,8 +403,9 @@ export class CrawlConfigEditor extends LiteElement {
|
||||
return null;
|
||||
}
|
||||
|
||||
private getInitialFormState(): FormState | {} {
|
||||
if (!this.initialWorkflow) return {};
|
||||
private getInitialFormState(): FormState {
|
||||
const defaultFormState = getDefaultFormState();
|
||||
if (!this.initialWorkflow) return defaultFormState;
|
||||
const formState: Partial<FormState> = {};
|
||||
const seedsConfig = this.initialWorkflow.config;
|
||||
const { seeds } = seedsConfig;
|
||||
@ -456,36 +466,46 @@ export class CrawlConfigEditor extends LiteElement {
|
||||
if (this.initialWorkflow.tags?.length) {
|
||||
formState.tags = this.initialWorkflow.tags;
|
||||
}
|
||||
if (typeof this.initialWorkflow.crawlTimeout === "number") {
|
||||
formState.crawlTimeoutMinutes = this.initialWorkflow.crawlTimeout / 60;
|
||||
}
|
||||
if (typeof seedsConfig.behaviorTimeout === "number") {
|
||||
formState.pageTimeoutMinutes = seedsConfig.behaviorTimeout / 60;
|
||||
}
|
||||
const secondsToMinutes = (value: any, fallback: number | null) => {
|
||||
if (typeof value === "number" && value > 0) return value / 60;
|
||||
return fallback;
|
||||
};
|
||||
|
||||
return {
|
||||
primarySeedUrl: "",
|
||||
urlList: "",
|
||||
customIncludeUrlList: "",
|
||||
crawlTimeoutMinutes: null,
|
||||
pageTimeoutMinutes: null,
|
||||
primarySeedUrl: defaultFormState.primarySeedUrl,
|
||||
urlList: defaultFormState.urlList,
|
||||
customIncludeUrlList: defaultFormState.customIncludeUrlList,
|
||||
crawlTimeoutMinutes: secondsToMinutes(
|
||||
this.initialWorkflow.crawlTimeout,
|
||||
defaultFormState.crawlTimeoutMinutes
|
||||
),
|
||||
behaviorTimeoutSeconds:
|
||||
seedsConfig.behaviorTimeout ?? defaultFormState.behaviorTimeoutSeconds,
|
||||
pageLoadTimeoutSeconds:
|
||||
seedsConfig.pageLoadTimeout ?? defaultFormState.pageLoadTimeoutSeconds,
|
||||
pageExtraDelaySeconds:
|
||||
seedsConfig.pageExtraDelay ?? defaultFormState.pageExtraDelaySeconds,
|
||||
scale: this.initialWorkflow.scale,
|
||||
blockAds: this.initialWorkflow.config.blockAds,
|
||||
lang: this.initialWorkflow.config.lang,
|
||||
scheduleType: "none",
|
||||
runNow: false,
|
||||
scheduleType: defaultFormState.scheduleType,
|
||||
scheduleFrequency: defaultFormState.scheduleFrequency,
|
||||
runNow: defaultFormState.runNow,
|
||||
tags: this.initialWorkflow.tags,
|
||||
jobName: this.initialWorkflow.name || "",
|
||||
jobName: this.initialWorkflow.name || defaultFormState.jobName,
|
||||
description: this.initialWorkflow.description,
|
||||
browserProfile: this.initialWorkflow.profileid
|
||||
? ({ id: this.initialWorkflow.profileid } as Profile)
|
||||
: null,
|
||||
: defaultFormState.browserProfile,
|
||||
scopeType: primarySeedConfig.scopeType as FormState["scopeType"],
|
||||
exclusions: seedsConfig.exclude,
|
||||
includeLinkedPages: Boolean(
|
||||
primarySeedConfig.extraHops || seedsConfig.extraHops
|
||||
),
|
||||
pageLimit: this.initialWorkflow.config.limit ?? undefined,
|
||||
includeLinkedPages:
|
||||
Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true,
|
||||
pageLimit:
|
||||
this.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
|
||||
disableAutoscrollBehavior: this.initialWorkflow.config.behaviors
|
||||
? !this.initialWorkflow.config.behaviors.includes("autoscroll")
|
||||
: defaultFormState.disableAutoscrollBehavior,
|
||||
...formState,
|
||||
};
|
||||
}
|
||||
@ -1154,7 +1174,100 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
urlListToArray(this.formState.urlList).length +
|
||||
(this.jobType === "seed-crawl" ? 1 : 0)
|
||||
);
|
||||
const onInputMinMax = async (e: CustomEvent) => {
|
||||
const inputEl = e.target as SlInput;
|
||||
await inputEl.updateComplete;
|
||||
let helpText = "";
|
||||
if (inputEl.invalid) {
|
||||
const value = +inputEl.value;
|
||||
const min = inputEl.min;
|
||||
const max = inputEl.max;
|
||||
if (min && value < +min) {
|
||||
helpText = msg(
|
||||
str`Must be more than minimum of ${(+min).toLocaleString()}`
|
||||
);
|
||||
} else if (max && value > +max) {
|
||||
helpText = msg(
|
||||
str`Must be less than maximum of ${(+max).toLocaleString()}`
|
||||
);
|
||||
}
|
||||
}
|
||||
inputEl.helpText = helpText;
|
||||
};
|
||||
return html`
|
||||
${this.renderSectionHeading(msg("Limit Per Page"))}
|
||||
${this.renderFormCol(html`
|
||||
<sl-input
|
||||
name="pageLoadTimeoutSeconds"
|
||||
type="number"
|
||||
label=${msg("Page Load Timeout")}
|
||||
placeholder=${this.orgDefaults?.pageLoadTimeoutSeconds
|
||||
? msg(
|
||||
str`Default: ${this.orgDefaults.pageLoadTimeoutSeconds.toLocaleString()}`
|
||||
)
|
||||
: "Default: Unlimited"}
|
||||
value=${ifDefined(this.formState.pageLoadTimeoutSeconds ?? undefined)}
|
||||
min="0"
|
||||
@sl-input=${onInputMinMax}
|
||||
>
|
||||
<span slot="suffix">${msg("seconds")}</span>
|
||||
</sl-input>
|
||||
`)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(
|
||||
`Limits amount of time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.`
|
||||
)
|
||||
)}
|
||||
${this.renderFormCol(html`
|
||||
<sl-input
|
||||
name="behaviorTimeoutSeconds"
|
||||
type="number"
|
||||
label=${msg("Behavior Timeout")}
|
||||
placeholder=${this.orgDefaults?.behaviorTimeoutSeconds
|
||||
? msg(
|
||||
str`Default: ${this.orgDefaults.behaviorTimeoutSeconds.toLocaleString()}`
|
||||
)
|
||||
: msg("Unlimited")}
|
||||
value=${ifDefined(this.formState.behaviorTimeoutSeconds ?? undefined)}
|
||||
min="0"
|
||||
@sl-input=${onInputMinMax}
|
||||
>
|
||||
<span slot="suffix">${msg("seconds")}</span>
|
||||
</sl-input>
|
||||
`)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(`Limits how long behaviors can run on each page.`)
|
||||
)}
|
||||
${this.renderFormCol(html`<sl-checkbox
|
||||
name="disableAutoscrollBehavior"
|
||||
?checked=${this.formState.disableAutoscrollBehavior}
|
||||
>
|
||||
${msg("Disable Auto-Scroll Behavior")}
|
||||
</sl-checkbox>`)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(
|
||||
`Prevents browser from automatically scrolling until the end of the page.`
|
||||
),
|
||||
false
|
||||
)}
|
||||
${this.renderFormCol(html`
|
||||
<sl-input
|
||||
name="pageExtraDelaySeconds"
|
||||
type="number"
|
||||
label=${msg("Delay Before Next Page")}
|
||||
placeholder=${"Default: 0"}
|
||||
value=${ifDefined(this.formState.pageExtraDelaySeconds ?? undefined)}
|
||||
min="0"
|
||||
>
|
||||
<span slot="suffix">${msg("seconds")}</span>
|
||||
</sl-input>
|
||||
`)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(
|
||||
`Waits on the page after behaviors are complete before moving onto the next page. Can be helpful for rate limiting.`
|
||||
)
|
||||
)}
|
||||
${this.renderSectionHeading(msg("Limit Per Crawl"))}
|
||||
${this.renderFormCol(html`
|
||||
<sl-mutation-observer
|
||||
attr="min"
|
||||
@ -1176,35 +1289,20 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
type="number"
|
||||
value=${this.formState.pageLimit || ""}
|
||||
min=${minPages}
|
||||
max=${this.orgDefaults.maxPagesPerCrawl}
|
||||
placeholder=${this.orgDefaults.maxPagesPerCrawl === Infinity
|
||||
? msg("Unlimited")
|
||||
: msg(
|
||||
str`Maximum Allowed (${this.orgDefaults.maxPagesPerCrawl.toLocaleString()})`
|
||||
)}
|
||||
@sl-input=${async (e: CustomEvent) => {
|
||||
const inputEl = e.target as SlInput;
|
||||
await inputEl.updateComplete;
|
||||
let helpText = "";
|
||||
if (inputEl.invalid) {
|
||||
const value = +inputEl.value;
|
||||
if (value < minPages) {
|
||||
helpText =
|
||||
minPages === 1
|
||||
? msg(
|
||||
str`Minimum ${minPages.toLocaleString()} page per crawl`
|
||||
)
|
||||
: msg(
|
||||
str`Minimum ${minPages.toLocaleString()} pages per crawl`
|
||||
);
|
||||
} else if (value > this.orgDefaults.maxPagesPerCrawl) {
|
||||
helpText = msg(
|
||||
str`Maximum ${this.orgDefaults.maxPagesPerCrawl.toLocaleString()} pages per crawl`
|
||||
);
|
||||
}
|
||||
}
|
||||
inputEl.helpText = helpText;
|
||||
}}
|
||||
max=${ifDefined(
|
||||
this.orgDefaults?.maxPagesPerCrawl &&
|
||||
this.orgDefaults.maxPagesPerCrawl < Infinity
|
||||
? this.orgDefaults.maxPagesPerCrawl
|
||||
: undefined
|
||||
)}
|
||||
placeholder=${this.orgDefaults?.maxPagesPerCrawl
|
||||
? this.orgDefaults.maxPagesPerCrawl === Infinity
|
||||
? msg("Default: Unlimited")
|
||||
: msg(
|
||||
str`Default: ${this.orgDefaults.maxPagesPerCrawl.toLocaleString()}`
|
||||
)
|
||||
: ""}
|
||||
@sl-input=${onInputMinMax}
|
||||
>
|
||||
<span slot="suffix">${msg("pages")}</span>
|
||||
</sl-input>
|
||||
@ -1214,33 +1312,12 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
msg(`Adds a hard limit on the number of pages
|
||||
that will be crawled.`)
|
||||
)}
|
||||
${this.renderFormCol(html`
|
||||
<sl-input
|
||||
name="pageTimeoutMinutes"
|
||||
type="number"
|
||||
label=${msg("Page Time Limit")}
|
||||
placeholder=${msg("Unlimited")}
|
||||
value=${ifDefined(
|
||||
this.formState.pageTimeoutMinutes ??
|
||||
this.orgDefaults.behaviorTimeoutMinutes
|
||||
)}
|
||||
?disabled=${this.orgDefaults.behaviorTimeoutMinutes === undefined}
|
||||
min="1"
|
||||
required
|
||||
>
|
||||
<span slot="suffix">${msg("minutes")}</span>
|
||||
</sl-input>
|
||||
`)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(`Adds a hard time limit for how long the crawler can spend on a
|
||||
single webpage.`)
|
||||
)}
|
||||
${this.renderFormCol(html`
|
||||
<sl-input
|
||||
name="crawlTimeoutMinutes"
|
||||
label=${msg("Crawl Time Limit")}
|
||||
value=${this.formState.crawlTimeoutMinutes || ""}
|
||||
placeholder=${msg("Unlimited")}
|
||||
placeholder=${msg("Default: Unlimited")}
|
||||
min="0"
|
||||
type="number"
|
||||
>
|
||||
@ -1549,8 +1626,8 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
? msg(
|
||||
"There are issues with this Workflow. Please go through previous steps and fix all issues to continue."
|
||||
)
|
||||
: msg(html`There is an issue with this Workflow:<br /><br />Crawl
|
||||
URL(s) required in
|
||||
: msg(html`There is an issue with this Workflow:<br /><br />Crawl URL(s)
|
||||
required in
|
||||
<a href="${crawlSetupUrl}" class="bold underline hover:no-underline"
|
||||
>Crawl Setup</a
|
||||
>. <br /><br />
|
||||
@ -1953,21 +2030,27 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
...(this.jobType === "seed-crawl"
|
||||
? this.parseSeededConfig()
|
||||
: this.parseUrlListConfig()),
|
||||
behaviorTimeout:
|
||||
(this.formState.pageTimeoutMinutes ??
|
||||
this.orgDefaults.behaviorTimeoutMinutes ??
|
||||
DEFAULT_BEHAVIOR_TIMEOUT_MINUTES) * 60,
|
||||
behaviorTimeout: +(this.formState.behaviorTimeoutSeconds || ""),
|
||||
pageLoadTimeout: +(this.formState.pageLoadTimeoutSeconds || ""),
|
||||
pageExtraDelay: +(this.formState.pageExtraDelaySeconds || ""),
|
||||
limit: this.formState.pageLimit ? +this.formState.pageLimit : undefined,
|
||||
lang: this.formState.lang || "",
|
||||
blockAds: this.formState.blockAds,
|
||||
exclude: trimArray(this.formState.exclusions),
|
||||
behaviors: (this.formState.disableAutoscrollBehavior
|
||||
? DEFAULT_BEHAVIORS.slice(1)
|
||||
: DEFAULT_BEHAVIORS
|
||||
).join(","),
|
||||
},
|
||||
};
|
||||
|
||||
return config;
|
||||
}
|
||||
|
||||
private parseUrlListConfig(): NewCrawlConfigParams["config"] {
|
||||
private parseUrlListConfig(): Pick<
|
||||
NewCrawlConfigParams["config"],
|
||||
"seeds" | "scopeType" | "extraHops"
|
||||
> {
|
||||
const config = {
|
||||
seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
|
||||
const newSeed: Seed = { url: seedUrl, scopeType: "page" };
|
||||
@ -1980,7 +2063,10 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
return config;
|
||||
}
|
||||
|
||||
private parseSeededConfig(): NewCrawlConfigParams["config"] {
|
||||
private parseSeededConfig(): Pick<
|
||||
NewCrawlConfigParams["config"],
|
||||
"seeds" | "scopeType"
|
||||
> {
|
||||
const primarySeedUrl = this.formState.primarySeedUrl;
|
||||
const includeUrlList = this.formState.customIncludeUrlList
|
||||
? urlListToArray(this.formState.customIncludeUrlList)
|
||||
@ -2003,7 +2089,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
: [],
|
||||
extraHops: this.formState.includeLinkedPages ? 1 : 0,
|
||||
};
|
||||
const config: SeedConfig = {
|
||||
const config = {
|
||||
seeds: [primarySeed, ...additionalSeedUrlList],
|
||||
scopeType: additionalSeedUrlList.length
|
||||
? "page"
|
||||
@ -2043,7 +2129,6 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
}
|
||||
|
||||
private async fetchAPIDefaults() {
|
||||
const orgDefaults = { ...this.orgDefaults };
|
||||
try {
|
||||
const resp = await fetch("/api/settings", {
|
||||
headers: { "Content-Type": "application/json" },
|
||||
@ -2051,18 +2136,23 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
if (!resp.ok) {
|
||||
throw new Error(resp.statusText);
|
||||
}
|
||||
const orgDefaults = {
|
||||
...this.orgDefaults,
|
||||
};
|
||||
const data = await resp.json();
|
||||
if (data.defaultBehaviorTimeSeconds) {
|
||||
orgDefaults.behaviorTimeoutMinutes =
|
||||
data.defaultBehaviorTimeSeconds / 60;
|
||||
if (data.defaultBehaviorTimeSeconds > 0) {
|
||||
orgDefaults.behaviorTimeoutSeconds = data.defaultBehaviorTimeSeconds;
|
||||
}
|
||||
if (data.defaultPageLoadTimeSeconds > 0) {
|
||||
orgDefaults.pageLoadTimeoutSeconds = data.defaultPageLoadTimeSeconds;
|
||||
}
|
||||
if (data.maxPagesPerCrawl > 0) {
|
||||
orgDefaults.maxPagesPerCrawl = data.maxPagesPerCrawl;
|
||||
}
|
||||
this.orgDefaults = orgDefaults;
|
||||
} catch (e: any) {
|
||||
console.debug(e);
|
||||
}
|
||||
this.orgDefaults = orgDefaults;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -20,6 +20,9 @@ const defaultValue = {
|
||||
seeds: [],
|
||||
scopeType: "prefix",
|
||||
exclude: [""],
|
||||
behaviorTimeout: null,
|
||||
pageLoadTimeout: null,
|
||||
pageExtraDelay: null,
|
||||
},
|
||||
tags: [],
|
||||
crawlTimeout: null,
|
||||
|
@ -20,10 +20,12 @@ export type SeedConfig = Pick<
|
||||
Seed,
|
||||
"scopeType" | "include" | "exclude" | "limit" | "extraHops"
|
||||
> & {
|
||||
seeds: (Seed)[];
|
||||
seeds: Seed[];
|
||||
lang?: string | null;
|
||||
blockAds?: boolean;
|
||||
behaviorTimeout?: number | null;
|
||||
behaviorTimeout: number | null;
|
||||
pageLoadTimeout: number | null;
|
||||
pageExtraDelay: number | null;
|
||||
behaviors?: string | null;
|
||||
extraHops?: number | null;
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user