browsertrix/frontend/src/pages/archive/crawl-config-editor.ts
2023-01-16 10:08:45 -08:00

1762 lines
53 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import type { LitElement, TemplateResult } from "lit";
import { html as staticHtml, unsafeStatic } from "lit/static-html.js";
import type {
SlCheckbox,
SlInput,
SlRadio,
SlRadioGroup,
SlSelect,
SlTextarea,
} from "@shoelace-style/shoelace";
import { state, property, query, queryAsync } from "lit/decorators.js";
import { when } from "lit/directives/when.js";
import { msg, localized, str } from "@lit/localize";
import { ifDefined } from "lit/directives/if-defined.js";
import compact from "lodash/fp/compact";
import { mergeDeep } from "immutable";
import flow from "lodash/fp/flow";
import uniq from "lodash/fp/uniq";
import RegexColorize from "regex-colorize";
import ISO6391 from "iso-639-1";
import LiteElement, { html } from "../../utils/LiteElement";
import { regexEscape } from "../../utils/string";
import type { AuthState } from "../../utils/AuthService";
import {
getUTCSchedule,
humanizeSchedule,
humanizeNextDate,
getScheduleInterval,
getNextDate,
} from "../../utils/cron";
import type { Tab } from "../../components/tab-list";
import type {
ExclusionRemoveEvent,
ExclusionChangeEvent,
} from "../../components/queue-exclusion-table";
import type { TimeInputChangeEvent } from "../../components/time-input";
import type { Tags, TagsChangeEvent } from "../../components/tag-input";
import type {
CrawlConfigParams,
Profile,
InitialCrawlConfig,
JobType,
} from "./types";
/**
 * Crawl config parameters extended with a `runNow` flag and an optional
 * `oldId`.
 * NOTE(review): presumably the payload sent when the form is saved, with
 * `oldId` pointing at the config being replaced; confirm against the submit
 * handler, which is outside this view.
 */
type NewCrawlConfigParams = CrawlConfigParams & {
runNow: boolean;
oldId?: string;
};
/**
 * Identifiers of the editor steps. The tab navigation is built by iterating
 * this tuple, so its order is the display order of the tabs.
 */
const STEPS = [
"crawlSetup",
"browserSettings",
"crawlScheduling",
"crawlInformation",
"confirmSettings",
] as const;
/** Union of the step identifiers listed in `STEPS`. */
type StepName = typeof STEPS[number];
/** Per-tab flags that drive the tab's disabled state and status icon. */
type TabState = {
// Whether the tab can be selected (rendered as `?disabled=${!isEnabled}`).
enabled: boolean;
// Whether the step counts as finished (shown with a check icon).
completed: boolean;
// Whether the step currently has a validation problem.
error: boolean;
};
/** One `TabState` per editor step. */
type Tabs = Record<StepName, TabState>;
/** Navigation state of the editor. */
type ProgressState = {
// Bound to the tab list's `progressPanel` attribute.
currentStep: StepName;
// The tab whose panel is currently shown (bound to `activePanel`).
activeTab: StepName;
tabs: Tabs;
};
/**
 * Values held by the editor form. Most field names match the `name`
 * attribute of the corresponding form control.
 */
type FormState = {
// "Crawl Start URL" of a seeded crawl.
primarySeedUrl: string;
// Raw text of the "List of URLs" textarea (split with `urlListToArray`).
urlList: string;
// "Include Linked Pages" checkbox; initialised from `config.extraHops`.
includeLinkedPages: boolean;
// Raw text of the "Extra URLs in Scope" textarea.
allowedExternalUrlList: string;
// "Crawl Time Limit" in minutes; `null` by default.
crawlTimeoutMinutes: number | null;
// "Page Time Limit" in minutes; `null` by default.
pageTimeoutMinutes: number | null;
scopeType: CrawlConfigParams["config"]["scopeType"];
// Exclusion entries edited through the queue exclusion table.
exclusions: CrawlConfigParams["config"]["exclude"];
// "Max Pages" limit.
pageLimit: CrawlConfigParams["config"]["limit"];
// Number of crawler instances (radio buttons offer 1 to 3).
scale: CrawlConfigParams["scale"];
blockAds: CrawlConfigParams["config"]["blockAds"];
lang: CrawlConfigParams["config"]["lang"];
// Note: the scheduling panel only renders radios for "now", "cron" and "none".
scheduleType: "now" | "date" | "cron" | "none";
scheduleFrequency: "daily" | "weekly" | "monthly";
// Day of month as returned by `Date#getDate()` (input is limited to 1-31).
scheduleDayOfMonth: number;
// Day of week as returned by `Date#getDay()` (0-based index into the weekday labels).
scheduleDayOfWeek: number;
// 12-hour clock time passed to `btrix-time-input`.
scheduleTime: {
hour: number;
minute: number;
period: "AM" | "PM";
};
// Whether a crawl should also start when the config is saved.
runNow: boolean;
jobName: CrawlConfigParams["name"];
browserProfile: Profile | null;
tags: Tags;
};
/**
 * Builds the initial navigation state of the editor.
 *
 * @param hasConfigId - `true` when an existing config is being edited. In that
 *   case every step starts enabled and completed, `currentStep` is
 *   "confirmSettings", and the URL hash (if it names a known step) selects the
 *   initially active tab. Otherwise only "crawlSetup" is enabled and nothing
 *   is completed.
 * @returns A fresh `ProgressState` object.
 */
const getDefaultProgressState = (hasConfigId = false): ProgressState => {
  let activeTab: StepName = "crawlSetup";
  if (hasConfigId && window.location.hash) {
    // Drop the leading "#" and only accept values that are known step names.
    const hashValue = window.location.hash.slice(1);
    const requestedStep = STEPS.find((stepName) => stepName === hashValue);
    if (requestedStep) {
      activeTab = requestedStep;
    }
  }
  // Every step after "crawlSetup" starts with the same flags.
  const followUpTab = () => ({
    enabled: hasConfigId,
    error: false,
    completed: hasConfigId,
  });
  return {
    activeTab,
    currentStep: hasConfigId ? "confirmSettings" : "crawlSetup",
    tabs: {
      // The first step is always reachable.
      crawlSetup: { enabled: true, error: false, completed: hasConfigId },
      browserSettings: followUpTab(),
      crawlScheduling: followUpTab(),
      crawlInformation: followUpTab(),
      confirmSettings: followUpTab(),
    },
  };
};
/**
 * Creates the form values used for a brand-new crawl config.
 *
 * A new object is built on every call. The schedule day fields default to
 * today's date and weekday, and the schedule time defaults to 12:00 AM.
 */
const getDefaultFormState = (): FormState => {
  const defaults: FormState = {
    // Crawl setup
    primarySeedUrl: "",
    urlList: "",
    includeLinkedPages: false,
    allowedExternalUrlList: "",
    crawlTimeoutMinutes: null,
    pageTimeoutMinutes: null,
    scopeType: "host",
    exclusions: [],
    pageLimit: undefined,
    scale: 1,
    // Browser settings
    blockAds: true,
    lang: undefined,
    // Scheduling
    scheduleType: "now",
    scheduleFrequency: "weekly",
    scheduleDayOfMonth: new Date().getDate(),
    scheduleDayOfWeek: new Date().getDay(),
    scheduleTime: { hour: 12, minute: 0, period: "AM" },
    runNow: false,
    // Crawl information
    jobName: "",
    browserProfile: null,
    tags: [],
  };
  return defaults;
};
// Progress state for a new (not yet saved) config, used as the reference set of tabs.
const defaultProgressState = getDefaultProgressState();
// Step names in `STEPS` order, keeping only steps that have a tab entry.
const orderedTabNames: StepName[] = [];
for (const stepName of STEPS) {
  if (defaultProgressState.tabs[stepName as StepName]) {
    orderedTabNames.push(stepName);
  }
}
/**
 * Returns the short weekday names in the runtime's default locale, indexed
 * like `Date#getDay()` (index 0 is Sunday).
 *
 * The seven dates are built with calendar arithmetic at local noon rather
 * than by offsetting a timestamp by multiples of 24 hours, so a daylight
 * saving transition inside the current week cannot shift a date onto the
 * neighbouring day.
 */
function getLocalizedWeekDays() {
  const now = new Date();
  // TODO accept locale from locale-picker
  const { format } = new Intl.DateTimeFormat(undefined, { weekday: "short" });
  // Day-of-month of this week's Sunday; the Date constructor normalises
  // values that fall outside the current month.
  const sundayDate = now.getDate() - now.getDay();
  return Array.from({ length: 7 }, (_, day) =>
    format(new Date(now.getFullYear(), now.getMonth(), sundayDate + day, 12))
  );
}
/**
 * Loose URL check used by the form validators.
 *
 * The pattern is not anchored, so this returns `true` when any part of `url`
 * looks like a scheme-prefixed address, a `www.` host, or a `user@host`
 * address.
 */
function validURL(url: string) {
  const urlLikePattern =
    /((([A-Za-z]{3,9}:(?:\/\/)?)(?:[\-;:&=\+\$,\w]+@)?[A-Za-z0-9\.\-]+|(?:www\.|[\-;:&=\+\$,\w]+@)[A-Za-z0-9\.\-]+)((?:\/[\+~%\/\.\w\-_]*)?\??(?:[\-\+=&;%@\.\w_]*)#?(?:[\.\!\/\\\w]*))?)/;
  return urlLikePattern.test(url);
}
// Composes lodash/fp `uniq` and `compact`: duplicates are removed first, then falsy entries.
const trimExclusions = flow(uniq, compact);
/**
 * Splits a free-form URL list into individual entries. Commas and any run of
 * whitespace (including newlines) act as separators.
 *
 * Commas are turned into spaces before trimming, so a leading or trailing
 * comma does not produce an empty entry. An empty or separator-only input
 * yields `[""]`, so the result always has at least one element.
 */
const urlListToArray = (str: string) =>
  str.replace(/,/g, " ").trim().split(/\s+/);
@localized()
export class CrawlConfigEditor extends LiteElement {
// Auth state; passed through to `btrix-select-browser-profile`.
@property({ type: Object })
authState!: AuthState;
// Archive ID; passed through to `btrix-select-browser-profile`.
@property({ type: String })
archiveId!: string;
// ID of an existing crawl config. When set, the editor starts with every step
// enabled and completed and the footer offers "Save Changes".
@property({ type: String })
configId?: string;
// Selects which crawl setup form is rendered ("url-list", "seed-crawl" or "custom").
@property({ type: String })
jobType?: JobType;
// Existing config used to pre-fill the form (see `getInitialFormState`).
@property({ type: Object })
initialCrawlConfig?: InitialCrawlConfig;
// Disables the submit buttons and shows their loading state while true.
@state()
private isSubmitting = false;
// Tab navigation state; assigned in `initializeEditor`.
@state()
private progressState!: ProgressState;
// Current form values; assigned in `initializeEditor`.
@state()
private formState!: FormState;
// When set, rendered as a danger alert at the bottom of every panel.
@state()
private serverError?: TemplateResult | string;
/** `true` when at least one tab is currently flagged with an error. */
private get formHasError() {
  for (const tabState of Object.values(this.progressState.tabs)) {
    if (tabState.error) {
      return true;
    }
  }
  return false;
}
/**
 * Result of `getUTCSchedule` for the schedule currently entered in the form
 * (frequency, day of month, day of week and time of day).
 */
private get utcSchedule() {
  const {
    scheduleFrequency,
    scheduleDayOfMonth,
    scheduleDayOfWeek,
    scheduleTime,
  } = this.formState;
  return getUTCSchedule({
    interval: scheduleFrequency,
    dayOfMonth: scheduleDayOfMonth,
    dayOfWeek: scheduleDayOfWeek,
    ...scheduleTime,
  });
}
// Short weekday labels, indexed by `Date#getDay()` value; computed once per instance.
private readonly daysOfWeek = getLocalizedWeekDays();
// Localized display label for every crawl scope type.
private readonly scopeTypeLabels: Record<FormState["scopeType"], string> = {
prefix: msg("Path Begins with This URL"),
host: msg("Pages on This Domain"),
domain: msg("Pages on This Domain & Subdomains"),
"page-spa": msg("Single Page App (In-Page Links Only)"),
page: msg("Page"),
custom: msg("Custom"),
any: msg("Any"),
};
// Localized display label for every schedule type.
private readonly scheduleTypeLabels: Record<
FormState["scheduleType"],
string
> = {
now: msg("Run Immediately on Save"),
date: msg("Run on a Specific Date & Time"),
cron: msg("Run on a Recurring Basis"),
none: msg("No Schedule"),
};
// Localized display label for every schedule frequency.
private readonly scheduleFrequencyLabels: Record<
FormState["scheduleFrequency"],
string
> = {
daily: msg("Daily"),
weekly: msg("Weekly"),
monthly: msg("Monthly"),
};
// The editor's <form> element rendered by `render()`.
@query('form[name="newJobConfig"]')
formElem!: HTMLFormElement;
// Resolves to the tab panel whose `aria-hidden` attribute is "false", or null if none matches.
@queryAsync("btrix-tab-panel[aria-hidden=false]")
activeTabPanel!: Promise<HTMLElement | null>;
/**
 * Sets up `progressState` and `formState` before delegating to the base
 * class, so both are assigned by the time `super.connectedCallback()` runs.
 */
connectedCallback(): void {
this.initializeEditor();
super.connectedCallback();
}
/**
 * Reacts to property changes before rendering.
 *
 * `changedProperties.get(...)` yields the previous value, so each branch only
 * runs when the property already had a truthy value before this update:
 * - a replaced `initialCrawlConfig` re-initializes the editor;
 * - a replaced `formState` triggers `handleFormStateChange`.
 */
willUpdate(changedProperties: Map<string, any>) {
  const previousInitialConfig = changedProperties.get("initialCrawlConfig");
  if (previousInitialConfig && this.initialCrawlConfig) {
    this.initializeEditor();
  }
  // Looked up only after the re-initialization above, matching the original
  // evaluation order.
  const previousFormState = changedProperties.get("formState");
  if (previousFormState && this.formState) {
    this.handleFormStateChange();
  }
}
/**
 * After rendering, forwards the previous `progressState` to
 * `handleProgressStateChange` when that state was replaced during this update.
 */
updated(changedProperties: Map<string, any>) {
  const previousProgressState = changedProperties.get("progressState");
  if (!previousProgressState || !this.progressState) {
    return;
  }
  this.handleProgressStateChange(previousProgressState as ProgressState);
}
/**
 * Resets `progressState` and `formState`.
 *
 * The form state is the defaults overlaid with values derived from
 * `initialCrawlConfig`. A missing language falls back to the browser
 * language, and an empty exclusion list gets a single blank entry.
 */
private initializeEditor() {
  this.progressState = getDefaultProgressState(Boolean(this.configId));
  const nextFormState = {
    ...getDefaultFormState(),
    ...this.getInitialFormState(),
  };
  if (!nextFormState.lang) {
    nextFormState.lang = this.getInitialLang();
  }
  if (!nextFormState.exclusions?.length) {
    nextFormState.exclusions = [""]; // Add empty slot
  }
  this.formState = nextFormState;
}
/**
 * Returns the primary language subtag of the browser language (e.g. "en" for
 * "en-US"), or `null` when the browser reports no language.
 */
private getInitialLang() {
  // Default to current user browser language
  const browserLanguage = window.navigator.language;
  if (browserLanguage) {
    // Tags without a region (e.g. "en") contain no "-". indexOf then returns
    // -1, and slicing to -1 would drop the last character ("en" -> "e").
    const separatorIndex = browserLanguage.indexOf("-");
    return separatorIndex === -1
      ? browserLanguage
      : browserLanguage.slice(0, separatorIndex);
  }
  return null;
}
/**
 * Derives form values from `initialCrawlConfig`.
 *
 * @returns An empty object when no initial config is set. Otherwise a partial
 *   form state containing the name, browser profile, scope type, exclusions
 *   and linked-pages flag from the config, overlaid with the seed, schedule
 *   and tag values computed below.
 */
private getInitialFormState(): Partial<FormState> {
if (!this.initialCrawlConfig) return {};
// Values collected here are spread last in the returned object, so they
// override the base values read directly from the config.
const formState: Partial<FormState> = {};
const { seeds, scopeType } = this.initialCrawlConfig.config;
if (this.initialCrawlConfig.jobType === "seed-crawl") {
// A seed is either a plain URL string or an object with a `url` field.
// NOTE(review): assumes `seeds` has at least one entry; confirm.
formState.primarySeedUrl =
typeof seeds[0] === "string" ? seeds[0] : seeds[0].url;
} else {
// Treat "custom" like URL list
formState.urlList = seeds
.map((seed) => (typeof seed === "string" ? seed : seed.url))
.join("\n");
if (this.initialCrawlConfig.jobType === "custom") {
formState.scopeType = scopeType || "page";
}
}
if (this.initialCrawlConfig.schedule) {
formState.scheduleType = "cron";
formState.scheduleFrequency = getScheduleInterval(
this.initialCrawlConfig.schedule
);
// NOTE(review): the non-null assertion assumes `getNextDate` always yields
// a date for a stored schedule; confirm.
const nextDate = getNextDate(this.initialCrawlConfig.schedule)!;
formState.scheduleDayOfMonth = nextDate.getDate();
formState.scheduleDayOfWeek = nextDate.getDay();
const hours = nextDate.getHours();
// Convert the 24-hour value to a 12-hour clock (0 and 12 both display as 12).
formState.scheduleTime = {
hour: hours % 12 || 12,
minute: nextDate.getMinutes(),
period: hours > 11 ? "PM" : "AM",
};
} else {
// Without a stored schedule: "none" when editing an existing config,
// "now" otherwise.
if (this.configId) {
formState.scheduleType = "none";
} else {
formState.scheduleType = "now";
}
}
if (this.initialCrawlConfig.tags?.length) {
formState.tags = this.initialCrawlConfig.tags;
}
return {
jobName: this.initialCrawlConfig.name,
// Only the profile ID is known here, so a partial Profile is created.
browserProfile: this.initialCrawlConfig.profileid
? ({ id: this.initialCrawlConfig.profileid } as Profile)
: undefined,
scopeType: this.initialCrawlConfig.config
.scopeType as FormState["scopeType"],
exclusions: this.initialCrawlConfig.config.exclude,
includeLinkedPages: Boolean(this.initialCrawlConfig.config.extraHops),
...formState,
};
}
/**
 * Renders the editor: one `<form>` wrapping a tab list with a navigation item
 * and a panel for each step. The crawl setup panel's content depends on
 * `jobType`.
 */
render() {
// Header text for each step; also used as the navigation item labels.
const tabLabels: Record<StepName, string> = {
crawlSetup: msg("Crawl Setup"),
browserSettings: msg("Browser Settings"),
crawlScheduling: msg("Crawl Scheduling"),
crawlInformation: msg("Crawl Information"),
confirmSettings: msg("Confirm Settings"),
};
return html`
<form
name="newJobConfig"
@reset=${this.onReset}
@submit=${this.onSubmit}
@keydown=${this.onKeyDown}
@sl-blur=${this.validateOnBlur}
@sl-change=${this.updateFormStateOnChange}
>
<btrix-tab-list
activePanel="newJobConfig-${this.progressState.activeTab}"
progressPanel="newJobConfig-${this.progressState.currentStep}"
>
<header slot="header" class="flex justify-between items-baseline">
<h3>${tabLabels[this.progressState.activeTab]}</h3>
<p class="text-xs text-neutral-500 font-normal">
${msg(
html`Fields marked with
<span style="color:var(--sl-input-required-content-color)"
>*</span
>
are required`
)}
</p>
</header>
${orderedTabNames.map((tabName) =>
this.renderNavItem(tabName, tabLabels[tabName])
)}
<btrix-tab-panel name="newJobConfig-crawlSetup" class="scroll-m-3">
${this.renderPanelContent(
html`
${when(this.jobType === "url-list", this.renderUrlListSetup)}
${when(
this.jobType === "seed-crawl",
this.renderSeededCrawlSetup
)}
${when(this.jobType === "custom", () =>
this.renderUrlListSetup(true)
)}
`,
{ isFirst: true }
)}
</btrix-tab-panel>
<btrix-tab-panel
name="newJobConfig-browserSettings"
class="scroll-m-3"
>
${this.renderPanelContent(this.renderCrawlBehaviors())}
</btrix-tab-panel>
<btrix-tab-panel
name="newJobConfig-crawlScheduling"
class="scroll-m-3"
>
${this.renderPanelContent(this.renderJobScheduling())}
</btrix-tab-panel>
<btrix-tab-panel
name="newJobConfig-crawlInformation"
class="scroll-m-3"
>
${this.renderPanelContent(this.renderJobInformation())}
</btrix-tab-panel>
<btrix-tab-panel
name="newJobConfig-confirmSettings"
class="scroll-m-3"
>
${this.renderPanelContent(this.renderConfirmSettings(), {
isLast: true,
})}
</btrix-tab-panel>
</btrix-tab-list>
</form>
`;
}
/**
 * Renders one navigation tab with a status icon.
 *
 * Icon precedence for regular steps is error, then active, then completed;
 * otherwise a neutral circle is shown. The "confirmSettings" tab always shows
 * an info icon, and is enabled as soon as its own `enabled` flag is set or
 * the crawl setup step is completed.
 *
 * @param tabName - Step this tab navigates to.
 * @param content - Label rendered next to the icon.
 */
private renderNavItem(tabName: StepName, content: TemplateResult | string) {
const isActive = tabName === this.progressState.activeTab;
const isConfirmSettings = tabName === "confirmSettings";
const { error: isInvalid, completed } = this.progressState.tabs[tabName];
// Default icon: neutral circle.
const iconProps = {
name: "circle",
library: "default",
class: "text-neutral-300",
};
if (isConfirmSettings) {
iconProps.name = "info-circle";
iconProps.class = "text-base";
} else {
if (isInvalid) {
iconProps.name = "exclamation-circle";
iconProps.class = "text-danger";
} else if (isActive) {
iconProps.name = "pencil-circle-dashed";
iconProps.library = "app";
iconProps.class = "text-base";
} else if (completed) {
iconProps.name = "check-circle";
iconProps.class = "text-success";
}
}
const { enabled } = this.progressState.tabs[tabName];
const isEnabled = isConfirmSettings
? this.progressState.tabs.confirmSettings.enabled ||
this.progressState.tabs.crawlSetup.completed
: enabled;
return html`
<btrix-tab
slot="nav"
name="newJobConfig-${tabName}"
class="whitespace-nowrap"
?disabled=${!isEnabled}
@click=${this.tabClickHandler(tabName)}
>
<sl-icon
name=${iconProps.name}
library=${iconProps.library}
class="inline-block align-middle mr-1 text-base ${iconProps.class}"
></sl-icon>
<span class="inline-block align-middle whitespace-normal">
${content}
</span>
</btrix-tab>
`;
}
/**
 * Wraps a step's content in the bordered panel layout, appends the server
 * error alert when `serverError` is set, and adds the footer buttons.
 *
 * @param content - Template for the step's form fields.
 * @param isFirst - Forwarded to `renderFooter`; marks the first step.
 * @param isLast - Forwarded to `renderFooter`; marks the last step.
 */
private renderPanelContent(
content: TemplateResult,
{ isFirst = false, isLast = false } = {}
) {
return html`
<div class="border rounded-lg flex flex-col h-full">
<div class="flex-1 p-6 grid grid-cols-1 md:grid-cols-5 gap-6">
${content}
${when(this.serverError, () =>
this.renderErrorAlert(this.serverError!)
)}
</div>
${this.renderFooter({ isFirst, isLast })}
</div>
`;
}
/**
 * Renders the button row at the bottom of a panel.
 *
 * Left side: a reset button on the first step ("Cancel" when editing an
 * existing config, otherwise "Start Over"), or "Previous Step" on later steps.
 *
 * Right side:
 * - editing an existing config: "Next" (except on the last step) plus
 *   "Save Changes";
 * - new config, last step: a single submit button whose label depends on
 *   `formState.runNow`;
 * - new config, other steps: "Next Step" plus "Confirm & Save", which jumps
 *   to the confirm step when the required fields are filled and otherwise
 *   behaves like "Next Step".
 *
 * Submit buttons are disabled while submitting or while any tab has an error.
 */
private renderFooter({ isFirst = false, isLast = false }) {
return html`
<div class="px-6 py-4 border-t flex justify-between">
${isFirst
? html`
<sl-button size="small" type="reset">
<sl-icon slot="prefix" name="chevron-left"></sl-icon>
${this.configId ? msg("Cancel") : msg("Start Over")}
</sl-button>
`
: html`
<sl-button size="small" @click=${this.backStep}>
<sl-icon slot="prefix" name="chevron-left"></sl-icon>
${msg("Previous Step")}
</sl-button>
`}
${when(
this.configId,
() => html`
<div>
${when(
!isLast,
() => html`
<sl-button class="mr-1" size="small" @click=${this.nextStep}>
<sl-icon slot="suffix" name="chevron-right"></sl-icon>
${msg("Next")}
</sl-button>
`
)}
<sl-button
type="submit"
size="small"
variant="primary"
?disabled=${this.isSubmitting || this.formHasError}
?loading=${this.isSubmitting}
>
${msg("Save Changes")}
</sl-button>
</div>
`,
() =>
isLast
? html`<sl-button
type="submit"
size="small"
variant="primary"
?disabled=${this.isSubmitting || this.formHasError}
?loading=${this.isSubmitting}
>
${this.formState.runNow
? msg("Save & Run Crawl")
: msg("Save & Schedule Crawl")}
</sl-button>`
: html`
<div>
<sl-button
class="mr-1"
size="small"
variant="primary"
@click=${this.nextStep}
>
<sl-icon slot="suffix" name="chevron-right"></sl-icon>
${msg("Next Step")}
</sl-button>
<sl-button
size="small"
@click=${() => {
if (this.hasRequiredFields()) {
this.updateProgressState({
activeTab: "confirmSettings",
currentStep: "confirmSettings",
tabs: { crawlSetup: { completed: true } },
});
} else {
this.nextStep();
}
}}
>
<sl-icon
slot="suffix"
name="chevron-double-right"
></sl-icon>
${msg("Confirm & Save")}
</sl-button>
</div>
`
)}
</div>
`;
}
/** Renders a heading that spans the full width of the panel grid. */
private renderSectionHeading(content: TemplateResult | string) {
return html`
<btrix-section-heading class="col-span-1 md:col-span-5">
<h4>${content}</h4>
</btrix-section-heading>
`;
}
/** Wraps a form control in the wider (3 of 5 columns at `md`) grid column. */
private renderFormCol = (content: TemplateResult) => {
return html`<div class="col-span-1 md:col-span-3">${content}</div> `;
};
/**
 * Renders help text with an info icon in the narrower (2 of 5 columns at
 * `md`) grid column.
 *
 * @param content - Help text template.
 * @param padTop - When true (default) adds the `pt-6` top padding class.
 */
private renderHelpTextCol(content: TemplateResult, padTop = true) {
return html`
<div class="col-span-1 md:col-span-2 flex${padTop ? " pt-6" : ""}">
<div class="text-base mr-2">
<sl-icon name="info-circle"></sl-icon>
</div>
<div class="mt-0.5 text-xs text-neutral-500">${content}</div>
</div>
`;
}
/**
 * Renders the crawl setup fields for URL-list and custom jobs.
 *
 * The URL textarea is validated on blur (any entry failing `validURL` marks
 * the field invalid) and the error is cleared on input once every entry is
 * valid again. The exclusion table is shown only when linked pages are
 * included or the job type is "custom".
 *
 * @param isCustom - When true, also renders the crawl scope select with all
 *   scope types.
 */
private renderUrlListSetup = (isCustom = false) => {
return html`
${this.renderFormCol(html`
<sl-textarea
name="urlList"
label=${msg("List of URLs")}
rows="10"
autocomplete="off"
value=${this.formState.urlList}
placeholder=${`https://example.com
https://example.com/path`}
required
@sl-input=${async (e: Event) => {
const inputEl = e.target as SlInput;
await inputEl.updateComplete;
if (
inputEl.invalid &&
!urlListToArray(inputEl.value).some((url) => !validURL(url))
) {
inputEl.setCustomValidity("");
inputEl.helpText = "";
}
}}
@sl-blur=${async (e: Event) => {
const inputEl = e.target as SlInput;
await inputEl.updateComplete;
if (
inputEl.value &&
urlListToArray(inputEl.value).some((url) => !validURL(url))
) {
const text = msg("Please fix invalid URL in list.");
inputEl.invalid = true;
inputEl.helpText = text;
inputEl.setCustomValidity(text);
}
}}
></sl-textarea>
`)}
${this.renderHelpTextCol(
html`The crawler will visit and record each URL listed in the order
defined here.`
)}
${when(
isCustom,
() => html`
${this.renderFormCol(html`
<sl-select
name="scopeType"
label=${msg("Crawl Scope")}
defaultValue=${this.formState.scopeType}
value=${this.formState.scopeType}
@sl-select=${(e: Event) =>
this.updateFormState({
scopeType: (e.target as HTMLSelectElement)
.value as FormState["scopeType"],
})}
>
<sl-menu-item value="prefix">
${this.scopeTypeLabels["prefix"]}
</sl-menu-item>
<sl-menu-item value="host">
${this.scopeTypeLabels["host"]}
</sl-menu-item>
<sl-menu-item value="domain">
${this.scopeTypeLabels["domain"]}
</sl-menu-item>
<sl-divider></sl-divider>
<sl-menu-label>${msg("Advanced Options")}</sl-menu-label>
<sl-menu-item value="page-spa">
${this.scopeTypeLabels["page-spa"]}
</sl-menu-item>
<sl-menu-item value="page">
${this.scopeTypeLabels["page"]}
</sl-menu-item>
<sl-menu-item value="custom">
${this.scopeTypeLabels["custom"]}
</sl-menu-item>
<sl-menu-item value="any">
${this.scopeTypeLabels["any"]}
</sl-menu-item>
</sl-select>
`)}
${this.renderHelpTextCol(
html`Tells the crawler which pages it can visit.`
)}
`
)}
${this.renderFormCol(html`<sl-checkbox
name="includeLinkedPages"
?checked=${this.formState.includeLinkedPages}
>
${msg("Include Linked Pages")}
</sl-checkbox>`)}
${this.renderHelpTextCol(
html`If checked, the crawler will visit pages one link away from a Crawl
URL.`,
false
)}
${when(
this.formState.includeLinkedPages || this.jobType === "custom",
() => html`
${this.renderSectionHeading(msg("Page Limits"))}
${this.renderFormCol(html`
<btrix-queue-exclusion-table
.exclusions=${this.formState.exclusions}
pageSize="10"
editable
removable
@on-remove=${this.handleRemoveRegex}
@on-change=${this.handleChangeRegex}
></btrix-queue-exclusion-table>
<sl-button
class="w-full mt-1"
@click=${() =>
this.updateFormState({
exclusions: [""],
})}
>
<sl-icon slot="prefix" name="plus-lg"></sl-icon>
<span class="text-neutral-600">${msg("Add More")}</span>
</sl-button>
`)}
${this.renderHelpTextCol(
html`Specify exclusion rules for what pages should not be visited.
Exclusions apply to all URLs.`
)}
`
)}
${this.renderCrawlScale()}
`;
};
/**
 * Renders the crawl setup fields for seeded crawls: start URL, crawl scope,
 * extra in-scope URLs, linked-pages checkbox, page limit, exclusions and the
 * shared crawl limit fields.
 *
 * The scope help text shows example URLs built from the entered start URL,
 * falling back to the placeholder URL while the input is empty or cannot be
 * parsed.
 */
private renderSeededCrawlSetup = () => {
const urlPlaceholder = "https://example.com";
let exampleUrl = new URL(urlPlaceholder);
if (this.formState.primarySeedUrl) {
// Parsing failures are ignored on purpose: the placeholder URL stays in use.
try {
exampleUrl = new URL(this.formState.primarySeedUrl);
} catch {}
}
const exampleHost = exampleUrl.host;
const exampleProtocol = exampleUrl.protocol;
// Strip a trailing slash from the path.
const examplePathname = exampleUrl.pathname.replace(/\/$/, "");
const exampleDomain = `${exampleProtocol}//${exampleHost}`;
// Explanation shown under the scope select; empty for scope types without one.
let helpText: TemplateResult | string;
switch (this.formState.scopeType) {
case "prefix":
helpText = msg(
html`Will crawl all page URLs that begin with
<span class="text-blue-500 break-word"
>${exampleDomain}${examplePathname}</span
>, e.g.
<span class="text-blue-500 break-word break-word"
>${exampleDomain}${examplePathname}</span
><span class="text-blue-500 font-medium break-word"
>/path/page.html</span
>`
);
break;
case "host":
helpText = msg(
html`Will crawl all pages on
<span class="text-blue-500">${exampleHost}</span> and ignore pages
on any subdomains.`
);
break;
case "domain":
helpText = msg(
html`Will crawl all pages on
<span class="text-blue-500">${exampleHost}</span> and
<span class="text-blue-500">subdomain.${exampleHost}</span>.`
);
break;
case "page-spa":
helpText = msg(
html`Will only visit
<span class="text-blue-500 break-word"
>${exampleDomain}${examplePathname}</span
>
and links that stay within the same URL, e.g. hash anchor links:
<span class="text-blue-500 break-word"
>${exampleDomain}${examplePathname}</span
><span class="text-blue-500 font-medium break-word"
>#example-page</span
>`
);
break;
default:
helpText = "";
break;
}
return html`
${this.renderFormCol(html`
<sl-input
name="primarySeedUrl"
label=${msg("Crawl Start URL")}
autocomplete="off"
placeholder=${urlPlaceholder}
value=${this.formState.primarySeedUrl}
required
@sl-input=${async (e: Event) => {
const inputEl = e.target as SlInput;
await inputEl.updateComplete;
if (inputEl.invalid && validURL(inputEl.value)) {
inputEl.setCustomValidity("");
inputEl.helpText = "";
}
}}
@sl-blur=${async (e: Event) => {
const inputEl = e.target as SlInput;
await inputEl.updateComplete;
if (inputEl.value && !validURL(inputEl.value)) {
const text = msg("Please enter a valid URL.");
inputEl.invalid = true;
inputEl.helpText = text;
inputEl.setCustomValidity(text);
}
}}
></sl-input>
`)}
${this.renderHelpTextCol(html`The starting point of your crawl.`)}
${this.renderFormCol(html`
<sl-select
name="scopeType"
label=${msg("Crawl Scope")}
defaultValue=${this.formState.scopeType}
value=${this.formState.scopeType}
@sl-select=${(e: Event) =>
this.updateFormState({
scopeType: (e.target as HTMLSelectElement)
.value as FormState["scopeType"],
})}
>
<div slot="help-text">${helpText}</div>
<sl-menu-item value="prefix">
${this.scopeTypeLabels["prefix"]}
</sl-menu-item>
<sl-menu-item value="host">
${this.scopeTypeLabels["host"]}
</sl-menu-item>
<sl-menu-item value="domain">
${this.scopeTypeLabels["domain"]}
</sl-menu-item>
<sl-divider></sl-divider>
<sl-menu-label>${msg("Advanced Options")}</sl-menu-label>
<sl-menu-item value="page-spa">
${this.scopeTypeLabels["page-spa"]}
</sl-menu-item>
</sl-select>
`)}
${this.renderHelpTextCol(
html`Tells the crawler which pages it can visit.`
)}
${this.renderSectionHeading(msg("Additional Pages"))}
${this.renderFormCol(html`
<sl-textarea
name="allowedExternalUrlList"
label=${msg("Extra URLs in Scope")}
rows="3"
autocomplete="off"
value=${this.formState.allowedExternalUrlList}
placeholder=${`https://example.org/page/
https://example.net`}
?disabled=${this.formState.scopeType === "page-spa"}
></sl-textarea>
`)}
${this.renderHelpTextCol(
html`Crawl pages outside of Crawl Scope that begin with these URLs.`
)}
${this.renderFormCol(html`
<sl-checkbox
name="includeLinkedPages"
?checked=${this.formState.includeLinkedPages}
>
${msg("Include Any Linked Page (“one hop out”)")}
</sl-checkbox>
`)}
${this.renderHelpTextCol(
html`If checked, the crawler will visit pages one link away outside of
Crawl Scope.`,
false
)}
${this.renderSectionHeading(msg("Page Limits"))}
${this.renderFormCol(html`
<sl-input
name="pageLimit"
label=${msg("Max Pages")}
type="number"
defaultValue=${this.formState.pageLimit || ""}
placeholder=${msg("Unlimited")}
>
<span slot="suffix">${msg("pages")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(html`Adds a hard limit on the number of pages
that will be crawled.`)}
${this.renderFormCol(html`
<btrix-queue-exclusion-table
.exclusions=${this.formState.exclusions}
pageSize="10"
editable
removable
@on-remove=${this.handleRemoveRegex}
@on-change=${this.handleChangeRegex}
></btrix-queue-exclusion-table>
<sl-button
class="w-full mt-1"
@click=${() =>
this.updateFormState({
exclusions: [""],
})}
>
<sl-icon slot="prefix" name="plus-lg"></sl-icon>
<span class="text-neutral-600">${msg("Add More")}</span>
</sl-button>
`)}
${this.renderHelpTextCol(
html`Specify exclusion rules for what pages should not be visited.`
)}
${this.renderCrawlScale()}
`;
};
/**
 * Renders the "Crawl Limits" section shared by both crawl setup forms: the
 * overall crawl time limit (minutes) and the number of crawler instances.
 *
 * The time limit input is bound to `formState.crawlTimeoutMinutes` in the
 * same way the page time limit input is bound to its form state field, so a
 * stored value is shown when the section renders.
 */
private renderCrawlScale() {
return html`
${this.renderSectionHeading(msg("Crawl Limits"))}
${this.renderFormCol(html`
<sl-input
name="crawlTimeoutMinutes"
label=${msg("Crawl Time Limit")}
placeholder=${msg("Unlimited")}
type="number"
value=${ifDefined(this.formState.crawlTimeoutMinutes || undefined)}
>
<span slot="suffix">${msg("minutes")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(
html`Gracefully stop the crawler after a specified time limit.`
)}
${this.renderFormCol(html`
<sl-radio-group
name="scale"
label=${msg("Crawler Instances")}
value=${this.formState.scale}
@sl-change=${(e: Event) =>
this.updateFormState({
scale: +(e.target as SlRadioGroup).value,
})}
>
<sl-radio-button value="1" size="small">1</sl-radio-button>
<sl-radio-button value="2" size="small">2</sl-radio-button>
<sl-radio-button value="3" size="small">3</sl-radio-button>
</sl-radio-group>
`)}
${this.renderHelpTextCol(
html`Increasing parallel crawler instances will speed up crawls, but
take up more system resources.`
)}
`;
}
/**
 * Renders the browser settings step: browser profile picker, ad blocking
 * checkbox, language select and the per-page time limit (minutes).
 */
private renderCrawlBehaviors() {
return html`
${this.renderFormCol(html`
<btrix-select-browser-profile
archiveId=${this.archiveId}
.profileId=${this.formState.browserProfile?.id}
.authState=${this.authState}
@on-change=${(e: CustomEvent) =>
this.updateFormState({
browserProfile: e.detail.value,
})}
></btrix-select-browser-profile>
`)}
${this.renderHelpTextCol(
html`Choose a custom profile to make use of saved cookies and logged-in
accounts.`
)}
${this.renderFormCol(html`
<sl-checkbox name="blockAds" ?checked=${this.formState.blockAds}>
${msg("Block Ads by Domain")}
</sl-checkbox>
`)}
${this.renderHelpTextCol(
html`Blocks advertising content from being loaded. Uses
<a
href="https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts"
class="text-blue-600 hover:text-blue-500"
target="_blank"
rel="noopener noreferrer nofollow"
>Steven Black's Hosts file</a
>.`,
false
)}
${this.renderFormCol(html`
<btrix-language-select
.value=${this.formState.lang}
@sl-select=${(e: CustomEvent) =>
this.updateFormState({
lang: e.detail.item.value,
})}
@sl-clear=${() => {
this.updateFormState({
lang: null,
});
}}
>
<span slot="label">${msg("Language")}</span>
</btrix-language-select>
`)}
${this.renderHelpTextCol(
html`Websites that observe the browser's language setting may serve
content in that language if available.`
)}
${this.renderSectionHeading(msg("On-Page Behavior"))}
${this.renderFormCol(html`
<sl-input
name="pageTimeoutMinutes"
type="number"
label=${msg("Page Time Limit")}
placeholder=${msg("Unlimited")}
value=${ifDefined(this.formState.pageTimeoutMinutes || undefined)}
>
<span slot="suffix">${msg("minutes")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(
html`Adds a hard time limit for how long the crawler can spend on a
single webpage.`
)}
`;
}
/**
 * Renders the scheduling step: a radio group for the schedule type ("now",
 * "cron" or "none") and, for "cron", the recurring schedule fields.
 *
 * Changing the schedule type also sets `runNow`, which becomes true only for
 * "now".
 */
private renderJobScheduling() {
return html`
${this.renderFormCol(html`
<sl-radio-group
label=${msg("Crawl Schedule Type")}
name="scheduleType"
value=${this.formState.scheduleType}
@sl-change=${(e: Event) =>
this.updateFormState({
scheduleType: (e.target as SlRadio)
.value as FormState["scheduleType"],
runNow: (e.target as SlRadio).value === "now",
})}
>
<sl-radio value="now">${this.scheduleTypeLabels["now"]}</sl-radio>
<sl-radio value="cron">${this.scheduleTypeLabels["cron"]}</sl-radio>
<sl-radio value="none">${this.scheduleTypeLabels["none"]}</sl-radio>
</sl-radio-group>
`)}
${this.renderHelpTextCol(
html`Should a crawl run immediately when setup is complete, on a set
day, or on a recurring schedule?`
)}
${when(this.formState.scheduleType === "cron", this.renderScheduleCron)}
`;
}
/**
 * Renders the recurring schedule fields: frequency, day of week (weekly
 * only), day of month (monthly only), start time, a humanized summary of the
 * schedule and its next run, and the "also run on save" checkbox.
 *
 * The summary shows "Invalid date" when `utcSchedule` is falsy.
 */
private renderScheduleCron = () => {
// Read the getter once; the value is used several times in the template.
const utcSchedule = this.utcSchedule;
return html`
${this.renderSectionHeading(msg("Set Schedule"))}
${this.renderFormCol(html`
<sl-select
name="scheduleFrequency"
label=${msg("Frequency")}
value=${this.formState.scheduleFrequency}
@sl-select=${(e: Event) =>
this.updateFormState({
scheduleFrequency: (e.target as HTMLSelectElement)
.value as FormState["scheduleFrequency"],
})}
>
<sl-menu-item value="daily"
>${this.scheduleFrequencyLabels["daily"]}</sl-menu-item
>
<sl-menu-item value="weekly"
>${this.scheduleFrequencyLabels["weekly"]}</sl-menu-item
>
<sl-menu-item value="monthly"
>${this.scheduleFrequencyLabels["monthly"]}</sl-menu-item
>
</sl-select>
`)}
${this.renderHelpTextCol(
html`Limit the frequency for how often a crawl will run.`
)}
${when(
this.formState.scheduleFrequency === "weekly",
() => html`
${this.renderFormCol(html`
<sl-radio-group
name="scheduleDayOfWeek"
label=${msg("Day")}
value=${this.formState.scheduleDayOfWeek}
@sl-change=${(e: Event) =>
this.updateFormState({
scheduleDayOfWeek: +(e.target as SlRadioGroup).value,
})}
>
${this.daysOfWeek.map(
(label, day) =>
html`<sl-radio-button value=${day}>${label}</sl-radio-button>`
)}
</sl-radio-group>
`)}
${this.renderHelpTextCol(
html`What day of the week should a crawl run on?`
)}
`
)}
${when(
this.formState.scheduleFrequency === "monthly",
() => html`
${this.renderFormCol(html`
<sl-input
name="scheduleDayOfMonth"
label=${msg("Date")}
type="number"
min="1"
max="31"
value=${this.formState.scheduleDayOfMonth}
required
>
</sl-input>
`)}
${this.renderHelpTextCol(
html`What day of the month should a crawl run on?`
)}
`
)}
${this.renderFormCol(html`
<btrix-time-input
hour=${ifDefined(this.formState.scheduleTime.hour)}
minute=${ifDefined(this.formState.scheduleTime.minute)}
period=${ifDefined(this.formState.scheduleTime.period)}
@time-change=${(e: TimeInputChangeEvent) => {
this.updateFormState({
scheduleTime: e.detail,
});
}}
>
<span slot="label">${msg("Start Time")}</span>
</btrix-time-input>
<div class="text-xs text-neutral-500 mt-3">
<p class="mb-1">
${msg(
html`Schedule:
<span class="text-blue-500"
>${utcSchedule
? humanizeSchedule(utcSchedule)
: msg("Invalid date")}</span
>.`
)}
</p>
<p>
${msg(
html`Next scheduled run:
<span
>${utcSchedule
? humanizeNextDate(utcSchedule)
: msg("Invalid date")}</span
>.`
)}
</p>
</div>
`)}
${this.renderHelpTextCol(
html`A crawl will run at this time in your current timezone.`
)}
${this.renderFormCol(html`<sl-checkbox
name="runNow"
?checked=${this.formState.runNow}
>
${msg("Also run a crawl immediately on save")}
</sl-checkbox>`)}
${this.renderHelpTextCol(
html`If checked, a crawl will run at the time specified above and also
once when setup is complete.`,
false
)}
`;
};
/**
 * Renders the crawl information step: the required name input and the tag
 * input.
 *
 * The name input shows `formState.jobName`; for seeded crawls it falls back
 * to the start URL while no name is set.
 */
private renderJobInformation() {
const jobNameValue =
this.formState.jobName ||
(this.jobType === "seed-crawl" && this.formState.primarySeedUrl) ||
"";
return html`
${this.renderFormCol(html`
<sl-input
name="jobName"
label=${msg("Name")}
autocomplete="off"
placeholder=${msg("Example (example.com) Weekly Crawl", {
desc: "Example crawl config name",
})}
value=${jobNameValue}
required
></sl-input>
`)}
${this.renderHelpTextCol(
html`Try to create a unique name to help keep things organized!`
)}
${this.renderFormCol(
html`
<btrix-tag-input
.initialTags=${this.formState.tags}
@tags-change=${(e: TagsChangeEvent) =>
this.updateFormState(
{
tags: e.detail.tags,
},
true
)}
></btrix-tag-input>
`
)}
${this.renderHelpTextCol(
html`Create or assign this crawl (and its outputs) to one or more tags
to help organize your archived data.`
)}
`;
}
/** Renders `errorMessage` in a full-width danger alert. */
private renderErrorAlert(errorMessage: string | TemplateResult) {
return html`
<div class="col-span-1 md:col-span-5">
<btrix-alert variant="danger">${errorMessage}</btrix-alert>
</div>
`;
}
/**
 * Renders the confirm step: the config produced by `parseConfig()` shown in
 * `btrix-config-details`, plus an error alert while any tab has an error.
 */
private renderConfirmSettings = () => {
const crawlConfig = this.parseConfig();
return html`
<div class="col-span-1 md:col-span-5">
<btrix-config-details .crawlConfig=${crawlConfig}>
</btrix-config-details>
</div>
${when(this.formHasError, () =>
this.renderErrorAlert(
msg(
"There are issues with this crawl configuration. Please go through previous steps and fix all issues to continue."
)
)
)}
`;
};
/**
 * Reports whether the minimum required values are present: a job name plus
 * the start URL (seeded crawls) or the URL list (all other job types).
 *
 * @returns A primitive `boolean` (previously annotated with the boxed
 *   `Boolean` wrapper type).
 */
private hasRequiredFields(): boolean {
  const { jobName, primarySeedUrl, urlList } = this.formState;
  const seedValue = this.jobType === "seed-crawl" ? primarySeedUrl : urlList;
  return Boolean(jobName && seedValue);
}
/**
 * Runs after the form state was replaced: fills in a default job name when
 * none is set, then marks the crawl setup step as completed once the required
 * fields are present and that step has no error.
 */
private handleFormStateChange() {
  if (!this.formState.jobName) {
    this.setDefaultJobName();
  }
  if (!this.hasRequiredFields()) {
    return;
  }
  if (this.progressState.tabs.crawlSetup.error) {
    return;
  }
  this.updateProgressState({
    tabs: {
      crawlSetup: { completed: true },
    },
  });
}
/**
 * When the active tab changed, smoothly scrolls the newly active panel into
 * view if its top edge is above the viewport.
 *
 * @param oldState - Progress state before the change.
 */
private async handleProgressStateChange(oldState: ProgressState) {
  const { activeTab } = this.progressState;
  if (oldState.activeTab === activeTab) {
    return;
  }
  const panel = (await this.activeTabPanel) as HTMLElement;
  if (!panel) {
    return;
  }
  if (panel.getBoundingClientRect().top < 0) {
    panel.scrollIntoView({
      behavior: "smooth",
    });
  }
}
/**
 * Set a default crawl name derived from the seed URL(s).
 *
 * Does nothing when neither a primary seed URL nor a URL list is set.
 * For a "seed-crawl" job the name is the primary seed URL. Otherwise the
 * name is the first URL of the list, suffixed with a localized
 * "+ N more URL(s)" when the list has more than one entry.
 */
private setDefaultJobName() {
  // Set default crawl name based on seed URLs
  if (!this.formState.primarySeedUrl && !this.formState.urlList) {
    return;
  }
  let jobName = "";
  if (this.jobType === "seed-crawl") {
    jobName = this.formState.primarySeedUrl;
  } else {
    const urlList = urlListToArray(this.formState.urlList);
    // The guard above also passes when only `primarySeedUrl` is set, so the
    // parsed list may be empty here; fall back to "" instead of calling
    // `.trim()` on `undefined`.
    const firstUrl = (urlList[0] || "").trim();
    if (urlList.length > 1) {
      const remainder = urlList.length - 1;
      if (remainder === 1) {
        jobName = msg(str`${firstUrl} + ${remainder} more URL`);
      } else {
        jobName = msg(str`${firstUrl} + ${remainder} more URLs`);
      }
    } else {
      jobName = firstUrl;
    }
  }
  this.updateFormState({ jobName });
}
/**
 * Handle removal of an exclusion row.
 *
 * Drops the entry at the event's `index` from `formState.exclusions`
 * (shallow-merging the result into the form state), then, once this
 * component and the originating table have finished updating, re-syncs
 * the tab error flag in case the removed input was the invalid one.
 *
 * @param e Removal event carrying the index of the exclusion to drop.
 */
private async handleRemoveRegex(e: ExclusionRemoveEvent) {
  const { index } = e.detail;
  const current = this.formState.exclusions;

  // A falsy exclusions value is written back unchanged
  const remaining = current
    ? current.slice(0, index).concat(current.slice(index + 1))
    : current;
  this.updateFormState({ exclusions: remaining }, true);

  // Check if we removed an erroring input
  const tableEl = e.target as LitElement;
  await this.updateComplete;
  await tableEl.updateComplete;
  this.syncTabErrorState(tableEl);
}
/**
 * Handle an edit to a single exclusion row.
 *
 * Copies the current exclusions, replaces the entry at the event's
 * `index` with the new `regex`, and shallow-merges the copy into the form
 * state.
 *
 * @param e Change event carrying the new regex and the row index.
 */
private handleChangeRegex(e: ExclusionChangeEvent) {
  const { regex, index } = e.detail;
  // `handleRemoveRegex` treats `exclusions` as possibly unset, so start from
  // an empty list here rather than asserting non-null and throwing on spread.
  const nextExclusions = [...(this.formState.exclusions || [])];
  nextExclusions[index] = regex;
  this.updateFormState(
    {
      exclusions: nextExclusions,
    },
    true
  );
}
/**
 * Blur handler that keeps the active tab's error flag in sync with the
 * validity of the field that just lost focus.
 *
 * Only `sl-input`, `sl-textarea`, `sl-select` and `sl-checkbox` targets are
 * considered. After the field and this component have finished updating:
 * - if the field carries `data-user-invalid`, the active tab is flagged
 *   with an error;
 * - otherwise, if the tab is currently flagged, the flag is re-evaluated
 *   via `syncTabErrorState`.
 *
 * @param e Event whose target is the field that lost focus.
 */
private validateOnBlur = async (e: Event) => {
  const el = e.target as SlInput | SlTextarea | SlSelect | SlCheckbox;
  const tagName = el.tagName.toLowerCase();
  if (
    !["sl-input", "sl-textarea", "sl-select", "sl-checkbox"].includes(tagName)
  ) {
    return;
  }
  await el.updateComplete;
  await this.updateComplete;
  const currentTab = this.progressState.activeTab as StepName;
  const currentTabState = this.progressState.tabs[currentTab];
  // Check [data-user-invalid] instead of .invalid property
  // to validate only touched inputs
  if ("userInvalid" in el.dataset) {
    // Build fresh objects instead of assigning to the nested tab object of
    // the current state, so the existing state is never mutated in place.
    // NOTE(review): with in-place mutation the deep merge in
    // `updateProgressState` may see no difference and keep the same state
    // reference, which would skip a re-render; confirm against the
    // immutable version in use.
    this.updateProgressState({
      tabs: {
        ...this.progressState.tabs,
        [currentTab]: { ...currentTabState, error: true },
      },
    });
  } else if (currentTabState.error) {
    this.syncTabErrorState(el);
  }
};
/**
 * Clear the active tab's error flag when the tab panel containing `el` no
 * longer has any element marked `data-user-invalid`.
 *
 * @param el Element inside (or equal to) a `btrix-tab-panel`. When no
 *   enclosing panel is found, nothing is changed.
 */
private syncTabErrorState(el: HTMLElement) {
  const currentTab = this.progressState.activeTab as StepName;
  const panelEl = el.closest("btrix-tab-panel");
  if (!panelEl) {
    // Element is not inside a tab panel; nothing to inspect
    return;
  }
  const hasInvalid = panelEl.querySelector("[data-user-invalid]");
  if (!hasInvalid) {
    // Build fresh objects rather than assigning to the nested tab object of
    // the current state, so the existing state is never mutated in place.
    this.updateProgressState({
      tabs: {
        ...this.progressState.tabs,
        [currentTab]: { ...this.progressState.tabs[currentTab], error: false },
      },
    });
  }
}
/**
 * Generic change handler that copies a field's value into the form state.
 *
 * Supported targets and the value taken from each:
 * - `sl-checkbox`: its `checked` flag
 * - `sl-textarea`: its `value`
 * - `sl-input`: its `value`, coerced with unary `+` when `type` is "number"
 *
 * Other targets are ignored, as are fields whose `name` is not a key of
 * the current form state.
 *
 * @param e Change event whose target is the edited field.
 */
private updateFormStateOnChange(e: Event) {
  const field = e.target as SlTextarea | SlInput | SlCheckbox;
  const fieldName = field.name;
  const fieldTag = field.tagName.toLowerCase();

  let nextValue: any;
  if (fieldTag === "sl-checkbox") {
    nextValue = (field as SlCheckbox).checked;
  } else if (fieldTag === "sl-textarea") {
    nextValue = field.value;
  } else if (fieldTag === "sl-input") {
    const isNumeric = (field as SlInput).type === "number";
    nextValue = isNumeric ? +field.value : field.value;
  } else {
    return;
  }

  if (!(fieldName in this.formState)) {
    return;
  }
  this.updateFormState({
    [fieldName]: nextValue,
  });
}
/**
 * Create a click handler for the tab belonging to `step`.
 *
 * Clicks on a disabled or already-active tab are cancelled (default
 * prevented and propagation stopped). Any other click sets the location
 * hash to the step name and makes that step the active tab.
 *
 * @param step Step the tab represents.
 * @returns Mouse event handler bound to that step.
 */
private tabClickHandler = (step: StepName) => (e: MouseEvent) => {
  const clickedTab = e.currentTarget as Tab;
  const isSelectable = !clickedTab.disabled && !clickedTab.active;

  if (isSelectable) {
    window.location.hash = step;
    this.updateProgressState({ activeTab: step });
    return;
  }

  e.preventDefault();
  e.stopPropagation();
};
/**
 * Move to the step before the active one.
 *
 * Does nothing when the active tab is the first step or is not found in
 * `STEPS`.
 */
private backStep() {
  const targetTabIdx = STEPS.indexOf(this.progressState.activeTab!);
  // `indexOf` yields -1 for an unknown tab, which is truthy; compare
  // explicitly so only indexes with a predecessor are accepted.
  if (targetTabIdx > 0) {
    this.updateProgressState({
      activeTab: STEPS[targetTabIdx - 1] as StepName,
    });
  }
}
/**
 * Advance to the step after the active one, provided the current panel
 * has no invalid fields.
 *
 * The active tab is marked completed. If the next tab was not enabled
 * yet, it is enabled and becomes the current step. Does nothing when
 * there is no active tab or when the active tab is the last step.
 */
private nextStep() {
  const isValid = this.checkCurrentPanelValidity();
  if (!isValid) {
    return;
  }

  const { activeTab, tabs, currentStep } = this.progressState;
  if (!activeTab) {
    return;
  }
  const nextTab = STEPS[STEPS.indexOf(activeTab) + 1] as StepName | undefined;
  if (!nextTab) {
    // Already on the last step; there is nothing to advance to
    return;
  }

  const isFirstTimeEnabled = !tabs[nextTab].enabled;
  // Copy the tab entries that change instead of assigning to the nested
  // objects of the current state, so the existing state is never mutated.
  const nextTabs = { ...tabs };
  let nextCurrentStep = currentStep;
  if (isFirstTimeEnabled) {
    nextTabs[nextTab] = { ...tabs[nextTab], enabled: true };
    nextCurrentStep = nextTab;
  }
  nextTabs[activeTab] = { ...tabs[activeTab], completed: true };

  this.updateProgressState({
    activeTab: nextTab,
    currentStep: nextCurrentStep,
    tabs: nextTabs,
  });
}
/**
 * Check the active tab panel for elements marked `data-invalid`.
 *
 * Every such element has `reportValidity()` called on it.
 *
 * @returns `false` when the form element is unavailable or at least one
 *   invalid element was found; `true` otherwise.
 */
private checkCurrentPanelValidity = (): boolean => {
  if (!this.formElem) {
    return false;
  }

  const tabName = this.progressState.activeTab as StepName;
  const panel = this.formElem.querySelector(
    `btrix-tab-panel[name="newJobConfig-${tabName}"]`
  );
  const invalidFields = Array.from(panel!.querySelectorAll("[data-invalid]"));

  for (const field of invalidFields) {
    (field as HTMLInputElement).reportValidity();
  }

  return invalidFields.length === 0;
};
/**
 * Keydown handler for `sl-input` fields.
 *
 * - In inputs of type "number", single-character keys that are not digits
 *   are blocked, unless Ctrl or Meta is held, so keyboard shortcuts are
 *   left alone.
 * - "Enter" is blocked on every tab except the last one, to prevent form
 *   submission by keypress.
 *
 * Events from any other element are ignored.
 *
 * @param event Keyboard event bubbling from a form field.
 */
private onKeyDown(event: KeyboardEvent) {
  const el = event.target as HTMLElement;
  const tagName = el.tagName.toLowerCase();
  if (tagName !== "sl-input") return;
  const { key } = event;
  if ((el as SlInput).type === "number") {
    // Ctrl/Cmd combinations (copy, paste, select all, ...) report a
    // single-character `key` too; they must not be swallowed.
    const isShortcut = event.ctrlKey || event.metaKey;
    // Prevent typing non-numeric keys
    if (!isShortcut && key.length === 1 && /\D/.test(key)) {
      event.preventDefault();
      return;
    }
  }
  if (
    key === "Enter" &&
    this.progressState.activeTab !== STEPS[STEPS.length - 1]
  ) {
    // Prevent submission by "Enter" keypress if not on last tab
    event.preventDefault();
  }
}
/**
 * Form submit handler: validate, POST the crawl config, then notify and
 * navigate.
 *
 * Submission is skipped when the current panel has invalid fields or
 * `formHasError` is set. On success a notification is shown and the user
 * is taken to the started crawl (when the response has `run_now_job`) or
 * to the config page for `data.added`. On failure `serverError` is set to
 * a formatted config error, the API error message, or a generic message.
 *
 * @param event Submit event; its default action is always prevented.
 */
private async onSubmit(event: SubmitEvent) {
  event.preventDefault();

  const panelIsValid = this.checkCurrentPanelValidity();
  await this.updateComplete;
  if (!panelIsValid || this.formHasError) {
    return;
  }

  const payload = this.parseConfig();
  this.isSubmitting = true;

  try {
    const data = await this.apiFetch(
      `/archives/${this.archiveId}/crawlconfigs/`,
      this.authState!,
      {
        method: "POST",
        body: JSON.stringify(payload),
      }
    );
    const startedCrawlId = data.run_now_job;

    let successMessage: string;
    if (startedCrawlId) {
      successMessage = msg("Crawl started with new template.");
    } else if (this.configId) {
      successMessage = msg("Crawl config updated.");
    } else {
      successMessage = msg("Crawl config created.");
    }

    this.notify({
      message: successMessage,
      variant: "success",
      icon: "check2-circle",
      duration: 8000,
    });

    const destination = startedCrawlId
      ? `/archives/${this.archiveId}/crawls/crawl/${startedCrawlId}`
      : `/archives/${this.archiveId}/crawl-templates/config/${data.added}`;
    this.navTo(destination);
  } catch (e: any) {
    if (!e?.isApiError) {
      this.serverError = msg("Something unexpected went wrong");
    } else {
      // A detail concerns the crawl config when "config" is in its location path
      const mentionsConfig = (detail: any) =>
        detail.loc.some((part: string) => part === "config");
      const hasConfigError = e.details && e.details.some(mentionsConfig);
      this.serverError = hasConfigError
        ? this.formatConfigServerError(e.details)
        : e.message;
    }
  }

  this.isSubmitting = false;
}
/**
 * Form reset handler: re-runs `initializeEditor()`.
 */
private async onReset(): Promise<void> {
  this.initializeEditor();
}
/**
 * Format `config` related API error returned from server
 *
 * Details whose `type` is "type_error.dict" are dropped. Each remaining
 * detail becomes a list item made of a label followed by the detail's
 * `msg`. The label is the localized "Seed URL N: " (N being the last `loc`
 * entry plus one) when `loc` contains "seeds" and ends with a number;
 * otherwise it is the last `loc` entry followed by ": ".
 *
 * @param details Error details from the API response.
 *   NOTE(review): typed `any`; the code assumes an array of objects with
 *   `type`, `loc` (array) and `msg`. Confirm against the API schema.
 * @returns Lit template with an intro sentence and a bulleted list.
 */
private formatConfigServerError(details: any): TemplateResult {
// Skip dict type errors; only the remaining details are listed
const detailsWithoutDictError = details.filter(
({ type }: any) => type !== "type_error.dict"
);
// `msg` is renamed to `detailMsg` so it does not shadow the localize `msg` helper
const renderDetail = ({ loc, msg: detailMsg }: any) => html`
<li>
${loc.some((v: string) => v === "seeds") &&
typeof loc[loc.length - 1] === "number"
? msg(str`Seed URL ${loc[loc.length - 1] + 1}: `)
: `${loc[loc.length - 1]}: `}
${detailMsg}
</li>
`;
return html`
${msg(
"Couldn't save crawl config. Please fix the following crawl configuration issues:"
)}
<ul class="list-disc w-fit pl-4">
${detailsWithoutDictError.map(renderDetail)}
</ul>
`;
}
/**
 * Build the crawl config request parameters from the current form state.
 *
 * Timeouts entered in minutes are converted to seconds (0 when unset).
 * The seed/scope part of `config` comes from `parseSeededConfig()` for
 * "seed-crawl" jobs and from `parseUrlListConfig()` otherwise. When
 * `configId` is set it is included as `oldId`.
 *
 * @returns Parameters in the shape sent by `onSubmit`.
 */
private parseConfig(): NewCrawlConfigParams {
  const form = this.formState;
  const minutesToSeconds = (minutes: number | null | undefined): number =>
    minutes ? minutes * 60 : 0;

  const seedConfig =
    this.jobType === "seed-crawl"
      ? this.parseSeededConfig()
      : this.parseUrlListConfig();

  // Key order is kept stable since this object is serialized as the request body
  const crawlerConfig: NewCrawlConfigParams["config"] = {
    ...seedConfig,
    behaviorTimeout: minutesToSeconds(form.pageTimeoutMinutes),
    limit: form.pageLimit ? +form.pageLimit : null,
    extraHops: form.includeLinkedPages ? 1 : 0,
    lang: form.lang || null,
    blockAds: form.blockAds,
    exclude: trimExclusions(form.exclusions),
  };

  const params: NewCrawlConfigParams = {
    jobType: this.jobType || "custom",
    name: form.jobName || form.primarySeedUrl,
    scale: form.scale,
    profileid: form.browserProfile?.id || null,
    runNow: form.runNow || form.scheduleType === "now",
    schedule: form.scheduleType === "cron" ? this.utcSchedule : "",
    crawlTimeout: minutesToSeconds(form.crawlTimeoutMinutes),
    tags: form.tags,
    config: crawlerConfig,
  };

  if (this.configId) {
    params.oldId = this.configId;
  }
  return params;
}
/**
 * Build the seed portion of the config for a URL-list job: every URL in
 * the list becomes a seed and the scope type is "page".
 */
private parseUrlListConfig(): NewCrawlConfigParams["config"] {
  return {
    seeds: urlListToArray(this.formState.urlList),
    scopeType: "page" as FormState["scopeType"],
  };
}
/**
 * Build the seed portion of the config for a seeded crawl.
 *
 * The primary seed URL (trailing slash removed) is the only seed. When
 * allowed external URLs are given, the scope type becomes "custom" and the
 * originally selected scope ("prefix", "host" or "domain") is replicated
 * as include regexes, followed by one include regex per external URL.
 * Without external URLs the selected scope type is kept and `include` is
 * empty.
 *
 * NOTE(review): `new URL(primarySeedUrl)` throws on an unparsable URL;
 * this relies on the seed URL having been validated before external URLs
 * are processed. Confirm against the form validation.
 */
private parseSeededConfig(): NewCrawlConfigParams["config"] {
  const stripTrailingSlash = (url: string) => url.replace(/\/$/, "");

  const primarySeedUrl = stripTrailingSlash(this.formState.primarySeedUrl);
  const externalUrlList = this.formState.allowedExternalUrlList
    ? urlListToArray(this.formState.allowedExternalUrlList).map(
        stripTrailingSlash
      )
    : [];
  let scopeType = this.formState.scopeType;
  const include: string[] = [];

  if (externalUrlList.length) {
    const { host, origin } = new URL(primarySeedUrl);
    scopeType = "custom";

    // Replicate scope type with regex
    switch (this.formState.scopeType) {
      case "prefix":
        include.push(`${regexEscape(primarySeedUrl)}/.*`);
        break;
      case "host":
        include.push(`${regexEscape(origin)}/.*`);
        break;
      case "domain":
        include.push(
          `${regexEscape(origin)}/.*`,
          // The backslash must be doubled: in a template literal `\.` is
          // just ".", which as a regex would match any character before the
          // host (e.g. "fooexample.com") instead of only a subdomain dot.
          `.*\\.${regexEscape(host)}/.*`
        );
        break;
      default:
        break;
    }

    for (const url of externalUrlList) {
      include.push(`${regexEscape(url)}/.*`);
    }
  }

  return {
    seeds: [primarySeedUrl],
    scopeType,
    include,
  };
}
/**
 * Replace `progressState` with the result of merging `nextState` into it.
 *
 * @param nextState Partial progress state; `tabs` may contain partial
 *   per-step entries.
 * @param shallowMerge When `true`, top-level keys are overwritten with an
 *   object spread; otherwise (default) `mergeDeep` is used.
 */
private updateProgressState(
  nextState: {
    activeTab?: ProgressState["activeTab"];
    currentStep?: ProgressState["currentStep"];
    tabs?: {
      [K in StepName]?: Partial<TabState>;
    };
  },
  shallowMerge = false
) {
  if (!shallowMerge) {
    this.progressState = mergeDeep(this.progressState, nextState);
    return;
  }

  const patch = nextState as Partial<ProgressState>;
  this.progressState = { ...this.progressState, ...patch };
}
/**
 * Replace `formState` with the result of merging `nextState` into it.
 *
 * @param nextState Partial form state to apply.
 * @param shallowMerge When `true`, top-level keys are overwritten with an
 *   object spread; otherwise (default) `mergeDeep` is used.
 */
private updateFormState(nextState: Partial<FormState>, shallowMerge = false) {
  if (!shallowMerge) {
    this.formState = mergeDeep(this.formState, nextState);
    return;
  }

  this.formState = { ...this.formState, ...nextState };
}
}
// Register the editor class as the `<btrix-crawl-config-editor>` custom element.
customElements.define("btrix-crawl-config-editor", CrawlConfigEditor);