Allow users to set additional page time limits (#744)
This commit is contained in:
parent
72967a0381
commit
91c2c1ad62
@ -8,6 +8,7 @@ import ISO6391 from "iso-639-1";
|
|||||||
import LiteElement, { html } from "../utils/LiteElement";
|
import LiteElement, { html } from "../utils/LiteElement";
|
||||||
import type { CrawlConfig, Seed, SeedConfig } from "../pages/org/types";
|
import type { CrawlConfig, Seed, SeedConfig } from "../pages/org/types";
|
||||||
import { humanizeSchedule } from "../utils/cron";
|
import { humanizeSchedule } from "../utils/cron";
|
||||||
|
import { RelativeDuration } from "./relative-duration";
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Usage:
|
* Usage:
|
||||||
@ -30,9 +31,10 @@ export class ConfigDetails extends LiteElement {
|
|||||||
hideTags = false;
|
hideTags = false;
|
||||||
|
|
||||||
@state()
|
@state()
|
||||||
private orgDefaults = {
|
private orgDefaults?: {
|
||||||
behaviorTimeoutMinutes: Infinity,
|
pageLoadTimeoutSeconds?: number;
|
||||||
maxPagesPerCrawl: Infinity,
|
behaviorTimeoutSeconds?: number;
|
||||||
|
maxPagesPerCrawl?: number;
|
||||||
};
|
};
|
||||||
|
|
||||||
private readonly scopeTypeLabels: Record<
|
private readonly scopeTypeLabels: Record<
|
||||||
@ -55,7 +57,25 @@ export class ConfigDetails extends LiteElement {
|
|||||||
|
|
||||||
render() {
|
render() {
|
||||||
const crawlConfig = this.crawlConfig;
|
const crawlConfig = this.crawlConfig;
|
||||||
const exclusions = crawlConfig?.config.exclude || [];
|
const seedsConfig = crawlConfig?.config;
|
||||||
|
const exclusions = seedsConfig?.exclude || [];
|
||||||
|
const maxPages = seedsConfig?.seeds[0]?.limit ?? seedsConfig?.limit;
|
||||||
|
const renderTimeLimit = (
|
||||||
|
valueSeconds?: number | null,
|
||||||
|
fallbackValue?: number
|
||||||
|
) =>
|
||||||
|
valueSeconds
|
||||||
|
? RelativeDuration.humanize(valueSeconds * 1000, { verbose: true })
|
||||||
|
: typeof fallbackValue === "number"
|
||||||
|
? html`<span class="text-neutral-400"
|
||||||
|
>${fallbackValue === Infinity
|
||||||
|
? msg("Unlimited")
|
||||||
|
: RelativeDuration.humanize(fallbackValue * 1000, {
|
||||||
|
verbose: true,
|
||||||
|
})}
|
||||||
|
${msg("(default)")}</span
|
||||||
|
>`
|
||||||
|
: undefined;
|
||||||
|
|
||||||
return html`
|
return html`
|
||||||
<section id="crawler-settings" class="mb-8">
|
<section id="crawler-settings" class="mb-8">
|
||||||
@ -85,16 +105,51 @@ export class ConfigDetails extends LiteElement {
|
|||||||
() => this.renderSetting(msg("Exclusions"), msg("None"))
|
() => this.renderSetting(msg("Exclusions"), msg("None"))
|
||||||
)}
|
)}
|
||||||
${this.renderSetting(
|
${this.renderSetting(
|
||||||
msg("Page Time Limit"),
|
msg("Max Pages"),
|
||||||
crawlConfig?.config.behaviorTimeout
|
when(
|
||||||
? msg(str`${crawlConfig?.config.behaviorTimeout / 60} minute(s)`)
|
maxPages,
|
||||||
: msg("None")
|
() => msg(str`${maxPages!.toLocaleString()} pages`),
|
||||||
|
() =>
|
||||||
|
this.orgDefaults?.maxPagesPerCrawl
|
||||||
|
? html`<span class="text-neutral-400"
|
||||||
|
>${msg(
|
||||||
|
str`${this.orgDefaults.maxPagesPerCrawl.toLocaleString()} pages`
|
||||||
|
)}
|
||||||
|
${msg("(default)")}</span
|
||||||
|
>`
|
||||||
|
: undefined
|
||||||
|
)
|
||||||
|
)}
|
||||||
|
${this.renderSetting(
|
||||||
|
msg("Page Load Timeout"),
|
||||||
|
renderTimeLimit(
|
||||||
|
crawlConfig?.config.pageLoadTimeout,
|
||||||
|
this.orgDefaults?.pageLoadTimeoutSeconds ?? Infinity
|
||||||
|
)
|
||||||
|
)}
|
||||||
|
${this.renderSetting(
|
||||||
|
msg("Page Behavior Timeout"),
|
||||||
|
renderTimeLimit(
|
||||||
|
crawlConfig?.config.behaviorTimeout,
|
||||||
|
this.orgDefaults?.behaviorTimeoutSeconds ?? Infinity
|
||||||
|
)
|
||||||
|
)}
|
||||||
|
${this.renderSetting(
|
||||||
|
msg("Auto-Scroll Behavior"),
|
||||||
|
crawlConfig?.config.behaviors &&
|
||||||
|
!crawlConfig.config.behaviors.includes("autoscroll")
|
||||||
|
? msg("Disabled")
|
||||||
|
: html`<span class="text-neutral-400"
|
||||||
|
>${msg("Enabled (default)")}</span
|
||||||
|
>`
|
||||||
|
)}
|
||||||
|
${this.renderSetting(
|
||||||
|
msg("Delay Before Next Page"),
|
||||||
|
renderTimeLimit(crawlConfig?.config.pageExtraDelay, 0)
|
||||||
)}
|
)}
|
||||||
${this.renderSetting(
|
${this.renderSetting(
|
||||||
msg("Crawl Time Limit"),
|
msg("Crawl Time Limit"),
|
||||||
crawlConfig?.crawlTimeout
|
renderTimeLimit(crawlConfig?.crawlTimeout, Infinity)
|
||||||
? msg(str`${crawlConfig?.crawlTimeout / 60} minute(s)`)
|
|
||||||
: msg("None")
|
|
||||||
)}
|
)}
|
||||||
${this.renderSetting(msg("Crawler Instances"), crawlConfig?.scale)}
|
${this.renderSetting(msg("Crawler Instances"), crawlConfig?.scale)}
|
||||||
</btrix-desc-list>
|
</btrix-desc-list>
|
||||||
@ -212,10 +267,9 @@ export class ConfigDetails extends LiteElement {
|
|||||||
const crawlConfig = this.crawlConfig!;
|
const crawlConfig = this.crawlConfig!;
|
||||||
const seedsConfig = crawlConfig.config;
|
const seedsConfig = crawlConfig.config;
|
||||||
const additionalUrlList = seedsConfig.seeds.slice(1);
|
const additionalUrlList = seedsConfig.seeds.slice(1);
|
||||||
let primarySeedConfig: SeedConfig | Seed = seedsConfig;
|
const primarySeedConfig: SeedConfig | Seed = seedsConfig;
|
||||||
let primarySeedUrl = seedsConfig.seeds[0].url;
|
const primarySeedUrl = seedsConfig.seeds[0].url;
|
||||||
const includeUrlList = primarySeedConfig.include || seedsConfig.include;
|
const includeUrlList = primarySeedConfig.include || seedsConfig.include;
|
||||||
const maxPages = primarySeedConfig.limit ?? seedsConfig.limit;
|
|
||||||
return html`
|
return html`
|
||||||
${this.renderSetting(msg("Primary Seed URL"), primarySeedUrl, true)}
|
${this.renderSetting(msg("Primary Seed URL"), primarySeedUrl, true)}
|
||||||
${this.renderSetting(
|
${this.renderSetting(
|
||||||
@ -255,19 +309,6 @@ export class ConfigDetails extends LiteElement {
|
|||||||
: msg("None"),
|
: msg("None"),
|
||||||
true
|
true
|
||||||
)}
|
)}
|
||||||
${this.renderSetting(
|
|
||||||
msg("Max Pages"),
|
|
||||||
when(
|
|
||||||
maxPages,
|
|
||||||
() => msg(str`${maxPages} page(s)`),
|
|
||||||
() =>
|
|
||||||
this.orgDefaults.maxPagesPerCrawl < Infinity
|
|
||||||
? msg(
|
|
||||||
str`Maximum Allowed (${this.orgDefaults.maxPagesPerCrawl.toLocaleString()} pages)`
|
|
||||||
)
|
|
||||||
: undefined
|
|
||||||
)
|
|
||||||
)}
|
|
||||||
`;
|
`;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -292,7 +333,7 @@ export class ConfigDetails extends LiteElement {
|
|||||||
} else if (typeof value === "boolean") {
|
} else if (typeof value === "boolean") {
|
||||||
content = value ? msg("Yes") : msg("No");
|
content = value ? msg("Yes") : msg("No");
|
||||||
} else if (typeof value !== "number" && !value) {
|
} else if (typeof value !== "number" && !value) {
|
||||||
content = html`<span class="text-neutral-300"
|
content = html`<span class="text-neutral-400"
|
||||||
>${msg("Not specified")}</span
|
>${msg("Not specified")}</span
|
||||||
>`;
|
>`;
|
||||||
}
|
}
|
||||||
@ -304,7 +345,6 @@ export class ConfigDetails extends LiteElement {
|
|||||||
}
|
}
|
||||||
|
|
||||||
private async fetchAPIDefaults() {
|
private async fetchAPIDefaults() {
|
||||||
const orgDefaults = { ...this.orgDefaults };
|
|
||||||
try {
|
try {
|
||||||
const resp = await fetch("/api/settings", {
|
const resp = await fetch("/api/settings", {
|
||||||
headers: { "Content-Type": "application/json" },
|
headers: { "Content-Type": "application/json" },
|
||||||
@ -312,17 +352,22 @@ export class ConfigDetails extends LiteElement {
|
|||||||
if (!resp.ok) {
|
if (!resp.ok) {
|
||||||
throw new Error(resp.statusText);
|
throw new Error(resp.statusText);
|
||||||
}
|
}
|
||||||
|
const orgDefaults = {
|
||||||
|
...this.orgDefaults,
|
||||||
|
};
|
||||||
const data = await resp.json();
|
const data = await resp.json();
|
||||||
if (data.defaultBehaviorTimeSeconds) {
|
if (data.defaultBehaviorTimeSeconds > 0) {
|
||||||
orgDefaults.behaviorTimeoutMinutes =
|
orgDefaults.behaviorTimeoutSeconds = data.defaultBehaviorTimeSeconds;
|
||||||
data.defaultBehaviorTimeSeconds / 60;
|
}
|
||||||
|
if (data.defaultPageLoadTimeSeconds > 0) {
|
||||||
|
orgDefaults.pageLoadTimeoutSeconds = data.defaultPageLoadTimeSeconds;
|
||||||
}
|
}
|
||||||
if (data.maxPagesPerCrawl > 0) {
|
if (data.maxPagesPerCrawl > 0) {
|
||||||
orgDefaults.maxPagesPerCrawl = data.maxPagesPerCrawl;
|
orgDefaults.maxPagesPerCrawl = data.maxPagesPerCrawl;
|
||||||
}
|
}
|
||||||
|
this.orgDefaults = orgDefaults;
|
||||||
} catch (e: any) {
|
} catch (e: any) {
|
||||||
console.debug(e);
|
console.debug(e);
|
||||||
}
|
}
|
||||||
this.orgDefaults = orgDefaults;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -16,8 +16,6 @@ import compact from "lodash/fp/compact";
|
|||||||
import { mergeDeep } from "immutable";
|
import { mergeDeep } from "immutable";
|
||||||
import flow from "lodash/fp/flow";
|
import flow from "lodash/fp/flow";
|
||||||
import uniq from "lodash/fp/uniq";
|
import uniq from "lodash/fp/uniq";
|
||||||
import RegexColorize from "regex-colorize";
|
|
||||||
import ISO6391 from "iso-639-1";
|
|
||||||
import Fuse from "fuse.js";
|
import Fuse from "fuse.js";
|
||||||
|
|
||||||
import LiteElement, { html } from "../../utils/LiteElement";
|
import LiteElement, { html } from "../../utils/LiteElement";
|
||||||
@ -78,7 +76,9 @@ type FormState = {
|
|||||||
includeLinkedPages: boolean;
|
includeLinkedPages: boolean;
|
||||||
customIncludeUrlList: string;
|
customIncludeUrlList: string;
|
||||||
crawlTimeoutMinutes: number | null;
|
crawlTimeoutMinutes: number | null;
|
||||||
pageTimeoutMinutes: number | null;
|
behaviorTimeoutSeconds: number | null;
|
||||||
|
pageLoadTimeoutSeconds: number | null;
|
||||||
|
pageExtraDelaySeconds: number | null;
|
||||||
scopeType: WorkflowParams["config"]["scopeType"];
|
scopeType: WorkflowParams["config"]["scopeType"];
|
||||||
exclusions: WorkflowParams["config"]["exclude"];
|
exclusions: WorkflowParams["config"]["exclude"];
|
||||||
pageLimit: WorkflowParams["config"]["limit"];
|
pageLimit: WorkflowParams["config"]["limit"];
|
||||||
@ -99,6 +99,7 @@ type FormState = {
|
|||||||
browserProfile: Profile | null;
|
browserProfile: Profile | null;
|
||||||
tags: Tags;
|
tags: Tags;
|
||||||
description: WorkflowParams["description"];
|
description: WorkflowParams["description"];
|
||||||
|
disableAutoscrollBehavior: boolean;
|
||||||
};
|
};
|
||||||
|
|
||||||
const getDefaultProgressState = (hasConfigId = false): ProgressState => {
|
const getDefaultProgressState = (hasConfigId = false): ProgressState => {
|
||||||
@ -144,7 +145,9 @@ const getDefaultFormState = (): FormState => ({
|
|||||||
includeLinkedPages: false,
|
includeLinkedPages: false,
|
||||||
customIncludeUrlList: "",
|
customIncludeUrlList: "",
|
||||||
crawlTimeoutMinutes: null,
|
crawlTimeoutMinutes: null,
|
||||||
pageTimeoutMinutes: null,
|
behaviorTimeoutSeconds: null,
|
||||||
|
pageLoadTimeoutSeconds: null,
|
||||||
|
pageExtraDelaySeconds: null,
|
||||||
scopeType: "host",
|
scopeType: "host",
|
||||||
exclusions: [],
|
exclusions: [],
|
||||||
pageLimit: undefined,
|
pageLimit: undefined,
|
||||||
@ -165,6 +168,7 @@ const getDefaultFormState = (): FormState => ({
|
|||||||
browserProfile: null,
|
browserProfile: null,
|
||||||
tags: [],
|
tags: [],
|
||||||
description: null,
|
description: null,
|
||||||
|
disableAutoscrollBehavior: false,
|
||||||
});
|
});
|
||||||
const defaultProgressState = getDefaultProgressState();
|
const defaultProgressState = getDefaultProgressState();
|
||||||
const orderedTabNames = STEPS.filter(
|
const orderedTabNames = STEPS.filter(
|
||||||
@ -191,8 +195,12 @@ const urlListToArray = flow(
|
|||||||
(str: string) => (str.length ? str.trim().split(/\s+/g) : []),
|
(str: string) => (str.length ? str.trim().split(/\s+/g) : []),
|
||||||
trimArray
|
trimArray
|
||||||
);
|
);
|
||||||
const DEFAULT_BEHAVIOR_TIMEOUT_MINUTES = 5;
|
const DEFAULT_BEHAVIORS = [
|
||||||
const DEFAULT_MAX_PAGES_PER_CRAWL = Infinity;
|
"autoscroll",
|
||||||
|
"autoplay",
|
||||||
|
"autofetch",
|
||||||
|
"siteSpecific",
|
||||||
|
];
|
||||||
|
|
||||||
@localized()
|
@localized()
|
||||||
export class CrawlConfigEditor extends LiteElement {
|
export class CrawlConfigEditor extends LiteElement {
|
||||||
@ -221,9 +229,10 @@ export class CrawlConfigEditor extends LiteElement {
|
|||||||
private progressState!: ProgressState;
|
private progressState!: ProgressState;
|
||||||
|
|
||||||
@state()
|
@state()
|
||||||
private orgDefaults = {
|
private orgDefaults?: {
|
||||||
behaviorTimeoutMinutes: DEFAULT_BEHAVIOR_TIMEOUT_MINUTES,
|
behaviorTimeoutSeconds?: number;
|
||||||
maxPagesPerCrawl: DEFAULT_MAX_PAGES_PER_CRAWL,
|
pageLoadTimeoutSeconds?: number;
|
||||||
|
maxPagesPerCrawl?: number;
|
||||||
};
|
};
|
||||||
|
|
||||||
@state()
|
@state()
|
||||||
@ -394,8 +403,9 @@ export class CrawlConfigEditor extends LiteElement {
|
|||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
private getInitialFormState(): FormState | {} {
|
private getInitialFormState(): FormState {
|
||||||
if (!this.initialWorkflow) return {};
|
const defaultFormState = getDefaultFormState();
|
||||||
|
if (!this.initialWorkflow) return defaultFormState;
|
||||||
const formState: Partial<FormState> = {};
|
const formState: Partial<FormState> = {};
|
||||||
const seedsConfig = this.initialWorkflow.config;
|
const seedsConfig = this.initialWorkflow.config;
|
||||||
const { seeds } = seedsConfig;
|
const { seeds } = seedsConfig;
|
||||||
@ -456,36 +466,46 @@ export class CrawlConfigEditor extends LiteElement {
|
|||||||
if (this.initialWorkflow.tags?.length) {
|
if (this.initialWorkflow.tags?.length) {
|
||||||
formState.tags = this.initialWorkflow.tags;
|
formState.tags = this.initialWorkflow.tags;
|
||||||
}
|
}
|
||||||
if (typeof this.initialWorkflow.crawlTimeout === "number") {
|
const secondsToMinutes = (value: any, fallback: number | null) => {
|
||||||
formState.crawlTimeoutMinutes = this.initialWorkflow.crawlTimeout / 60;
|
if (typeof value === "number" && value > 0) return value / 60;
|
||||||
}
|
return fallback;
|
||||||
if (typeof seedsConfig.behaviorTimeout === "number") {
|
};
|
||||||
formState.pageTimeoutMinutes = seedsConfig.behaviorTimeout / 60;
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
primarySeedUrl: "",
|
primarySeedUrl: defaultFormState.primarySeedUrl,
|
||||||
urlList: "",
|
urlList: defaultFormState.urlList,
|
||||||
customIncludeUrlList: "",
|
customIncludeUrlList: defaultFormState.customIncludeUrlList,
|
||||||
crawlTimeoutMinutes: null,
|
crawlTimeoutMinutes: secondsToMinutes(
|
||||||
pageTimeoutMinutes: null,
|
this.initialWorkflow.crawlTimeout,
|
||||||
|
defaultFormState.crawlTimeoutMinutes
|
||||||
|
),
|
||||||
|
behaviorTimeoutSeconds:
|
||||||
|
seedsConfig.behaviorTimeout ?? defaultFormState.behaviorTimeoutSeconds,
|
||||||
|
pageLoadTimeoutSeconds:
|
||||||
|
seedsConfig.pageLoadTimeout ?? defaultFormState.pageLoadTimeoutSeconds,
|
||||||
|
pageExtraDelaySeconds:
|
||||||
|
seedsConfig.pageExtraDelay ?? defaultFormState.pageExtraDelaySeconds,
|
||||||
scale: this.initialWorkflow.scale,
|
scale: this.initialWorkflow.scale,
|
||||||
blockAds: this.initialWorkflow.config.blockAds,
|
blockAds: this.initialWorkflow.config.blockAds,
|
||||||
lang: this.initialWorkflow.config.lang,
|
lang: this.initialWorkflow.config.lang,
|
||||||
scheduleType: "none",
|
scheduleType: defaultFormState.scheduleType,
|
||||||
runNow: false,
|
scheduleFrequency: defaultFormState.scheduleFrequency,
|
||||||
|
runNow: defaultFormState.runNow,
|
||||||
tags: this.initialWorkflow.tags,
|
tags: this.initialWorkflow.tags,
|
||||||
jobName: this.initialWorkflow.name || "",
|
jobName: this.initialWorkflow.name || defaultFormState.jobName,
|
||||||
description: this.initialWorkflow.description,
|
description: this.initialWorkflow.description,
|
||||||
browserProfile: this.initialWorkflow.profileid
|
browserProfile: this.initialWorkflow.profileid
|
||||||
? ({ id: this.initialWorkflow.profileid } as Profile)
|
? ({ id: this.initialWorkflow.profileid } as Profile)
|
||||||
: null,
|
: defaultFormState.browserProfile,
|
||||||
scopeType: primarySeedConfig.scopeType as FormState["scopeType"],
|
scopeType: primarySeedConfig.scopeType as FormState["scopeType"],
|
||||||
exclusions: seedsConfig.exclude,
|
exclusions: seedsConfig.exclude,
|
||||||
includeLinkedPages: Boolean(
|
includeLinkedPages:
|
||||||
primarySeedConfig.extraHops || seedsConfig.extraHops
|
Boolean(primarySeedConfig.extraHops || seedsConfig.extraHops) ?? true,
|
||||||
),
|
pageLimit:
|
||||||
pageLimit: this.initialWorkflow.config.limit ?? undefined,
|
this.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
|
||||||
|
disableAutoscrollBehavior: this.initialWorkflow.config.behaviors
|
||||||
|
? !this.initialWorkflow.config.behaviors.includes("autoscroll")
|
||||||
|
: defaultFormState.disableAutoscrollBehavior,
|
||||||
...formState,
|
...formState,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@ -1154,7 +1174,100 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
urlListToArray(this.formState.urlList).length +
|
urlListToArray(this.formState.urlList).length +
|
||||||
(this.jobType === "seed-crawl" ? 1 : 0)
|
(this.jobType === "seed-crawl" ? 1 : 0)
|
||||||
);
|
);
|
||||||
|
const onInputMinMax = async (e: CustomEvent) => {
|
||||||
|
const inputEl = e.target as SlInput;
|
||||||
|
await inputEl.updateComplete;
|
||||||
|
let helpText = "";
|
||||||
|
if (inputEl.invalid) {
|
||||||
|
const value = +inputEl.value;
|
||||||
|
const min = inputEl.min;
|
||||||
|
const max = inputEl.max;
|
||||||
|
if (min && value < +min) {
|
||||||
|
helpText = msg(
|
||||||
|
str`Must be more than minimum of ${(+min).toLocaleString()}`
|
||||||
|
);
|
||||||
|
} else if (max && value > +max) {
|
||||||
|
helpText = msg(
|
||||||
|
str`Must be less than maximum of ${(+max).toLocaleString()}`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
inputEl.helpText = helpText;
|
||||||
|
};
|
||||||
return html`
|
return html`
|
||||||
|
${this.renderSectionHeading(msg("Limit Per Page"))}
|
||||||
|
${this.renderFormCol(html`
|
||||||
|
<sl-input
|
||||||
|
name="pageLoadTimeoutSeconds"
|
||||||
|
type="number"
|
||||||
|
label=${msg("Page Load Timeout")}
|
||||||
|
placeholder=${this.orgDefaults?.pageLoadTimeoutSeconds
|
||||||
|
? msg(
|
||||||
|
str`Default: ${this.orgDefaults.pageLoadTimeoutSeconds.toLocaleString()}`
|
||||||
|
)
|
||||||
|
: "Default: Unlimited"}
|
||||||
|
value=${ifDefined(this.formState.pageLoadTimeoutSeconds ?? undefined)}
|
||||||
|
min="0"
|
||||||
|
@sl-input=${onInputMinMax}
|
||||||
|
>
|
||||||
|
<span slot="suffix">${msg("seconds")}</span>
|
||||||
|
</sl-input>
|
||||||
|
`)}
|
||||||
|
${this.renderHelpTextCol(
|
||||||
|
msg(
|
||||||
|
`Limits amount of time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.`
|
||||||
|
)
|
||||||
|
)}
|
||||||
|
${this.renderFormCol(html`
|
||||||
|
<sl-input
|
||||||
|
name="behaviorTimeoutSeconds"
|
||||||
|
type="number"
|
||||||
|
label=${msg("Behavior Timeout")}
|
||||||
|
placeholder=${this.orgDefaults?.behaviorTimeoutSeconds
|
||||||
|
? msg(
|
||||||
|
str`Default: ${this.orgDefaults.behaviorTimeoutSeconds.toLocaleString()}`
|
||||||
|
)
|
||||||
|
: msg("Unlimited")}
|
||||||
|
value=${ifDefined(this.formState.behaviorTimeoutSeconds ?? undefined)}
|
||||||
|
min="0"
|
||||||
|
@sl-input=${onInputMinMax}
|
||||||
|
>
|
||||||
|
<span slot="suffix">${msg("seconds")}</span>
|
||||||
|
</sl-input>
|
||||||
|
`)}
|
||||||
|
${this.renderHelpTextCol(
|
||||||
|
msg(`Limits how long behaviors can run on each page.`)
|
||||||
|
)}
|
||||||
|
${this.renderFormCol(html`<sl-checkbox
|
||||||
|
name="disableAutoscrollBehavior"
|
||||||
|
?checked=${this.formState.disableAutoscrollBehavior}
|
||||||
|
>
|
||||||
|
${msg("Disable Auto-Scroll Behavior")}
|
||||||
|
</sl-checkbox>`)}
|
||||||
|
${this.renderHelpTextCol(
|
||||||
|
msg(
|
||||||
|
`Prevents browser from automatically scrolling until the end of the page.`
|
||||||
|
),
|
||||||
|
false
|
||||||
|
)}
|
||||||
|
${this.renderFormCol(html`
|
||||||
|
<sl-input
|
||||||
|
name="pageExtraDelaySeconds"
|
||||||
|
type="number"
|
||||||
|
label=${msg("Delay Before Next Page")}
|
||||||
|
placeholder=${"Default: 0"}
|
||||||
|
value=${ifDefined(this.formState.pageExtraDelaySeconds ?? undefined)}
|
||||||
|
min="0"
|
||||||
|
>
|
||||||
|
<span slot="suffix">${msg("seconds")}</span>
|
||||||
|
</sl-input>
|
||||||
|
`)}
|
||||||
|
${this.renderHelpTextCol(
|
||||||
|
msg(
|
||||||
|
`Waits on the page after behaviors are complete before moving onto the next page. Can be helpful for rate limiting.`
|
||||||
|
)
|
||||||
|
)}
|
||||||
|
${this.renderSectionHeading(msg("Limit Per Crawl"))}
|
||||||
${this.renderFormCol(html`
|
${this.renderFormCol(html`
|
||||||
<sl-mutation-observer
|
<sl-mutation-observer
|
||||||
attr="min"
|
attr="min"
|
||||||
@ -1176,35 +1289,20 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
type="number"
|
type="number"
|
||||||
value=${this.formState.pageLimit || ""}
|
value=${this.formState.pageLimit || ""}
|
||||||
min=${minPages}
|
min=${minPages}
|
||||||
max=${this.orgDefaults.maxPagesPerCrawl}
|
max=${ifDefined(
|
||||||
placeholder=${this.orgDefaults.maxPagesPerCrawl === Infinity
|
this.orgDefaults?.maxPagesPerCrawl &&
|
||||||
? msg("Unlimited")
|
this.orgDefaults.maxPagesPerCrawl < Infinity
|
||||||
: msg(
|
? this.orgDefaults.maxPagesPerCrawl
|
||||||
str`Maximum Allowed (${this.orgDefaults.maxPagesPerCrawl.toLocaleString()})`
|
: undefined
|
||||||
)}
|
)}
|
||||||
@sl-input=${async (e: CustomEvent) => {
|
placeholder=${this.orgDefaults?.maxPagesPerCrawl
|
||||||
const inputEl = e.target as SlInput;
|
? this.orgDefaults.maxPagesPerCrawl === Infinity
|
||||||
await inputEl.updateComplete;
|
? msg("Default: Unlimited")
|
||||||
let helpText = "";
|
: msg(
|
||||||
if (inputEl.invalid) {
|
str`Default: ${this.orgDefaults.maxPagesPerCrawl.toLocaleString()}`
|
||||||
const value = +inputEl.value;
|
)
|
||||||
if (value < minPages) {
|
: ""}
|
||||||
helpText =
|
@sl-input=${onInputMinMax}
|
||||||
minPages === 1
|
|
||||||
? msg(
|
|
||||||
str`Minimum ${minPages.toLocaleString()} page per crawl`
|
|
||||||
)
|
|
||||||
: msg(
|
|
||||||
str`Minimum ${minPages.toLocaleString()} pages per crawl`
|
|
||||||
);
|
|
||||||
} else if (value > this.orgDefaults.maxPagesPerCrawl) {
|
|
||||||
helpText = msg(
|
|
||||||
str`Maximum ${this.orgDefaults.maxPagesPerCrawl.toLocaleString()} pages per crawl`
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
inputEl.helpText = helpText;
|
|
||||||
}}
|
|
||||||
>
|
>
|
||||||
<span slot="suffix">${msg("pages")}</span>
|
<span slot="suffix">${msg("pages")}</span>
|
||||||
</sl-input>
|
</sl-input>
|
||||||
@ -1214,33 +1312,12 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
msg(`Adds a hard limit on the number of pages
|
msg(`Adds a hard limit on the number of pages
|
||||||
that will be crawled.`)
|
that will be crawled.`)
|
||||||
)}
|
)}
|
||||||
${this.renderFormCol(html`
|
|
||||||
<sl-input
|
|
||||||
name="pageTimeoutMinutes"
|
|
||||||
type="number"
|
|
||||||
label=${msg("Page Time Limit")}
|
|
||||||
placeholder=${msg("Unlimited")}
|
|
||||||
value=${ifDefined(
|
|
||||||
this.formState.pageTimeoutMinutes ??
|
|
||||||
this.orgDefaults.behaviorTimeoutMinutes
|
|
||||||
)}
|
|
||||||
?disabled=${this.orgDefaults.behaviorTimeoutMinutes === undefined}
|
|
||||||
min="1"
|
|
||||||
required
|
|
||||||
>
|
|
||||||
<span slot="suffix">${msg("minutes")}</span>
|
|
||||||
</sl-input>
|
|
||||||
`)}
|
|
||||||
${this.renderHelpTextCol(
|
|
||||||
msg(`Adds a hard time limit for how long the crawler can spend on a
|
|
||||||
single webpage.`)
|
|
||||||
)}
|
|
||||||
${this.renderFormCol(html`
|
${this.renderFormCol(html`
|
||||||
<sl-input
|
<sl-input
|
||||||
name="crawlTimeoutMinutes"
|
name="crawlTimeoutMinutes"
|
||||||
label=${msg("Crawl Time Limit")}
|
label=${msg("Crawl Time Limit")}
|
||||||
value=${this.formState.crawlTimeoutMinutes || ""}
|
value=${this.formState.crawlTimeoutMinutes || ""}
|
||||||
placeholder=${msg("Unlimited")}
|
placeholder=${msg("Default: Unlimited")}
|
||||||
min="0"
|
min="0"
|
||||||
type="number"
|
type="number"
|
||||||
>
|
>
|
||||||
@ -1549,8 +1626,8 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
? msg(
|
? msg(
|
||||||
"There are issues with this Workflow. Please go through previous steps and fix all issues to continue."
|
"There are issues with this Workflow. Please go through previous steps and fix all issues to continue."
|
||||||
)
|
)
|
||||||
: msg(html`There is an issue with this Workflow:<br /><br />Crawl
|
: msg(html`There is an issue with this Workflow:<br /><br />Crawl URL(s)
|
||||||
URL(s) required in
|
required in
|
||||||
<a href="${crawlSetupUrl}" class="bold underline hover:no-underline"
|
<a href="${crawlSetupUrl}" class="bold underline hover:no-underline"
|
||||||
>Crawl Setup</a
|
>Crawl Setup</a
|
||||||
>. <br /><br />
|
>. <br /><br />
|
||||||
@ -1953,21 +2030,27 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
...(this.jobType === "seed-crawl"
|
...(this.jobType === "seed-crawl"
|
||||||
? this.parseSeededConfig()
|
? this.parseSeededConfig()
|
||||||
: this.parseUrlListConfig()),
|
: this.parseUrlListConfig()),
|
||||||
behaviorTimeout:
|
behaviorTimeout: +(this.formState.behaviorTimeoutSeconds || ""),
|
||||||
(this.formState.pageTimeoutMinutes ??
|
pageLoadTimeout: +(this.formState.pageLoadTimeoutSeconds || ""),
|
||||||
this.orgDefaults.behaviorTimeoutMinutes ??
|
pageExtraDelay: +(this.formState.pageExtraDelaySeconds || ""),
|
||||||
DEFAULT_BEHAVIOR_TIMEOUT_MINUTES) * 60,
|
|
||||||
limit: this.formState.pageLimit ? +this.formState.pageLimit : undefined,
|
limit: this.formState.pageLimit ? +this.formState.pageLimit : undefined,
|
||||||
lang: this.formState.lang || "",
|
lang: this.formState.lang || "",
|
||||||
blockAds: this.formState.blockAds,
|
blockAds: this.formState.blockAds,
|
||||||
exclude: trimArray(this.formState.exclusions),
|
exclude: trimArray(this.formState.exclusions),
|
||||||
|
behaviors: (this.formState.disableAutoscrollBehavior
|
||||||
|
? DEFAULT_BEHAVIORS.slice(1)
|
||||||
|
: DEFAULT_BEHAVIORS
|
||||||
|
).join(","),
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
|
|
||||||
private parseUrlListConfig(): NewCrawlConfigParams["config"] {
|
private parseUrlListConfig(): Pick<
|
||||||
|
NewCrawlConfigParams["config"],
|
||||||
|
"seeds" | "scopeType" | "extraHops"
|
||||||
|
> {
|
||||||
const config = {
|
const config = {
|
||||||
seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
|
seeds: urlListToArray(this.formState.urlList).map((seedUrl) => {
|
||||||
const newSeed: Seed = { url: seedUrl, scopeType: "page" };
|
const newSeed: Seed = { url: seedUrl, scopeType: "page" };
|
||||||
@ -1980,7 +2063,10 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
return config;
|
return config;
|
||||||
}
|
}
|
||||||
|
|
||||||
private parseSeededConfig(): NewCrawlConfigParams["config"] {
|
private parseSeededConfig(): Pick<
|
||||||
|
NewCrawlConfigParams["config"],
|
||||||
|
"seeds" | "scopeType"
|
||||||
|
> {
|
||||||
const primarySeedUrl = this.formState.primarySeedUrl;
|
const primarySeedUrl = this.formState.primarySeedUrl;
|
||||||
const includeUrlList = this.formState.customIncludeUrlList
|
const includeUrlList = this.formState.customIncludeUrlList
|
||||||
? urlListToArray(this.formState.customIncludeUrlList)
|
? urlListToArray(this.formState.customIncludeUrlList)
|
||||||
@ -2003,7 +2089,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
: [],
|
: [],
|
||||||
extraHops: this.formState.includeLinkedPages ? 1 : 0,
|
extraHops: this.formState.includeLinkedPages ? 1 : 0,
|
||||||
};
|
};
|
||||||
const config: SeedConfig = {
|
const config = {
|
||||||
seeds: [primarySeed, ...additionalSeedUrlList],
|
seeds: [primarySeed, ...additionalSeedUrlList],
|
||||||
scopeType: additionalSeedUrlList.length
|
scopeType: additionalSeedUrlList.length
|
||||||
? "page"
|
? "page"
|
||||||
@ -2043,7 +2129,6 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
}
|
}
|
||||||
|
|
||||||
private async fetchAPIDefaults() {
|
private async fetchAPIDefaults() {
|
||||||
const orgDefaults = { ...this.orgDefaults };
|
|
||||||
try {
|
try {
|
||||||
const resp = await fetch("/api/settings", {
|
const resp = await fetch("/api/settings", {
|
||||||
headers: { "Content-Type": "application/json" },
|
headers: { "Content-Type": "application/json" },
|
||||||
@ -2051,18 +2136,23 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
if (!resp.ok) {
|
if (!resp.ok) {
|
||||||
throw new Error(resp.statusText);
|
throw new Error(resp.statusText);
|
||||||
}
|
}
|
||||||
|
const orgDefaults = {
|
||||||
|
...this.orgDefaults,
|
||||||
|
};
|
||||||
const data = await resp.json();
|
const data = await resp.json();
|
||||||
if (data.defaultBehaviorTimeSeconds) {
|
if (data.defaultBehaviorTimeSeconds > 0) {
|
||||||
orgDefaults.behaviorTimeoutMinutes =
|
orgDefaults.behaviorTimeoutSeconds = data.defaultBehaviorTimeSeconds;
|
||||||
data.defaultBehaviorTimeSeconds / 60;
|
}
|
||||||
|
if (data.defaultPageLoadTimeSeconds > 0) {
|
||||||
|
orgDefaults.pageLoadTimeoutSeconds = data.defaultPageLoadTimeSeconds;
|
||||||
}
|
}
|
||||||
if (data.maxPagesPerCrawl > 0) {
|
if (data.maxPagesPerCrawl > 0) {
|
||||||
orgDefaults.maxPagesPerCrawl = data.maxPagesPerCrawl;
|
orgDefaults.maxPagesPerCrawl = data.maxPagesPerCrawl;
|
||||||
}
|
}
|
||||||
|
this.orgDefaults = orgDefaults;
|
||||||
} catch (e: any) {
|
} catch (e: any) {
|
||||||
console.debug(e);
|
console.debug(e);
|
||||||
}
|
}
|
||||||
this.orgDefaults = orgDefaults;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -20,6 +20,9 @@ const defaultValue = {
|
|||||||
seeds: [],
|
seeds: [],
|
||||||
scopeType: "prefix",
|
scopeType: "prefix",
|
||||||
exclude: [""],
|
exclude: [""],
|
||||||
|
behaviorTimeout: null,
|
||||||
|
pageLoadTimeout: null,
|
||||||
|
pageExtraDelay: null,
|
||||||
},
|
},
|
||||||
tags: [],
|
tags: [],
|
||||||
crawlTimeout: null,
|
crawlTimeout: null,
|
||||||
|
@ -20,10 +20,12 @@ export type SeedConfig = Pick<
|
|||||||
Seed,
|
Seed,
|
||||||
"scopeType" | "include" | "exclude" | "limit" | "extraHops"
|
"scopeType" | "include" | "exclude" | "limit" | "extraHops"
|
||||||
> & {
|
> & {
|
||||||
seeds: (Seed)[];
|
seeds: Seed[];
|
||||||
lang?: string | null;
|
lang?: string | null;
|
||||||
blockAds?: boolean;
|
blockAds?: boolean;
|
||||||
behaviorTimeout?: number | null;
|
behaviorTimeout: number | null;
|
||||||
|
pageLoadTimeout: number | null;
|
||||||
|
pageExtraDelay: number | null;
|
||||||
behaviors?: string | null;
|
behaviors?: string | null;
|
||||||
extraHops?: number | null;
|
extraHops?: number | null;
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user