feat: Specify custom link selectors (#2487)

- Allows users to specify page link selectors in workflow "Scope"
section
- Adds new `<btrix-syntax-input>` component for syntax-highlighted
inputs
- Refactors highlight.js implementation to prevent unnecessary language
loading
- Updates exclusion table header styles

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Co-authored-by: Henry Wilkinson <henry@wilkinson.graphics>
This commit is contained in:
sua yoo 2025-04-02 00:32:34 -07:00 committed by GitHub
parent b5b4c4da15
commit f6481272f4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
21 changed files with 745 additions and 43 deletions

View File

@ -109,6 +109,23 @@ When enabled, the crawler will check for a sitemap at /sitemap.xml and use it to
This can be useful for discovering and capturing pages on a website that aren't linked to from the seed and which might not otherwise be captured.
### Link Selectors
Specifies which HTML elements the crawler treats as links when extracting URLs from a page. By default, the crawler checks the `href` value of all anchor (`<a>`) elements on a page.
Specifying a custom link selector can be useful for websites that hyperlink to pages using an element other than the standard `<a>` tag, or use an attribute other than `href` to specify the URL.
For example, for a page with the given HTML markup:
```html
<button class="link" data-href="/blog">Blog</button>
<button class="link" data-href="/about">About</button>
```
The _CSS Selector_ for a custom link selector could be `button.link` and its _Link Attribute_ would be `data-href`.
See [Basic CSS selectors (MDN)](https://developer.mozilla.org/en-US/docs/Learn_web_development/Core/Styling_basics/Basic_selectors) for examples of valid CSS selectors.
### Additional Pages
A list of page URLs outside of the _Crawl Scope_ to include in the crawl.

View File

@ -36,6 +36,7 @@
"construct-style-sheets-polyfill": "^3.1.0",
"copy-webpack-plugin": "^12.0.2",
"css-loader": "^6.3.0",
"css-selector-parser": "^3.0.5",
"date-fns": "^3.6.0",
"del-cli": "^4.0.1",
"diff": "^5.2.0",
@ -62,6 +63,7 @@
"lodash": "^4.17.21",
"micromark": "^4.0.0",
"micromark-extension-gfm-strikethrough": "^2.0.0",
"nanoid": "^5.1.5",
"node-fetch": "^3.1.0",
"parse-ms": "^4.0.0",
"patch-package": "^8.0.0",

View File

@ -1,24 +1,46 @@
import clsx from "clsx";
import type { LanguageFn } from "highlight.js";
import hljs from "highlight.js/lib/core";
import javascript from "highlight.js/lib/languages/javascript";
import xml from "highlight.js/lib/languages/xml";
import { css, html } from "lit";
import { customElement, property } from "lit/decorators.js";
import { html as staticHtml, unsafeStatic } from "lit/static-html.js";
import { TailwindElement } from "@/classes/TailwindElement";
import { tw } from "@/utils/tailwind";
/**
 * Languages that `<btrix-code>` can highlight.
 * Each value is used as the key into `langaugeFiles` and as the language name
 * passed to `hljs.registerLanguage` and `hljs.highlight`.
 */
enum Language {
Javascript = "javascript",
XML = "xml",
CSS = "css",
}
/**
 * Pending dynamic imports of the highlight.js grammar for each `Language`.
 *
 * Every `import()` below is invoked when this object literal is evaluated, so
 * the record holds promises rather than loader functions. All three imports
 * carry the same `webpackChunkName` ("highlight.js").
 *
 * NOTE(review): the identifier is misspelled (`langauge`); renaming it also
 * requires updating its use in `Code.connectedCallback`.
 */
const langaugeFiles: Record<Language, Promise<{ default: LanguageFn }>> = {
[Language.Javascript]: import(
/* webpackChunkName: "highlight.js" */ "highlight.js/lib/languages/javascript"
),
[Language.XML]: import(
/* webpackChunkName: "highlight.js" */ "highlight.js/lib/languages/xml"
),
[Language.CSS]: import(
/* webpackChunkName: "highlight.js" */ "highlight.js/lib/languages/css"
),
};
/**
* Syntax highlighting for javascript and HTML (XML)
* Syntax highlighting for javascript, HTML (XML), and CSS
*/
@customElement("btrix-code")
export class Code extends TailwindElement {
static styles = css`
.hljs-name {
color: #22863a;
.hljs-name,
.hljs-selector-tag {
color: var(--sl-color-lime-600);
}
.hljs-attr {
color: #6f42c1;
.hljs-attr,
.hljs-selector-attr,
.hljs-selector-class {
color: var(--sl-color-violet-500);
}
.hljs-string {
@ -30,20 +52,33 @@ export class Code extends TailwindElement {
value = "";
@property({ type: String })
language: "javascript" | "xml" = "xml";
language = Language.XML;
constructor() {
super();
hljs.registerLanguage("javascript", javascript);
hljs.registerLanguage("xml", xml);
@property({ type: Boolean })
wrap = true;
async connectedCallback() {
const languageFn = (await langaugeFiles[this.language]).default;
const registeredLanguages = hljs.listLanguages();
if (!registeredLanguages.includes(this.language)) {
hljs.registerLanguage(this.language, languageFn);
}
super.connectedCallback();
}
render() {
const htmlStr = hljs.highlight(this.value, {
language: this.language,
}).value;
return html`<pre
class="font-monospace m-0 whitespace-pre-wrap text-neutral-800"
class=${clsx(
tw`font-monospace m-0 text-neutral-600`,
this.wrap ? tw`whitespace-pre-wrap` : tw`whitespace-nowrap`,
)}
><code>${staticHtml`${unsafeStatic(htmlStr)}`}</code></pre>`;
}
}

View File

@ -381,6 +381,10 @@ export class ConfigDetails extends BtrixElement {
msg("Include Any Linked Page (“one hop out”)"),
Boolean(config.extraHops),
)}
${when(
config.extraHops,
() => html`${this.renderLinkSelectors()}${this.renderExclusions()}`,
)}
`;
};
@ -392,7 +396,6 @@ export class ConfigDetails extends BtrixElement {
const primarySeedConfig = this.seeds[0] as SeedConfig | Seed | undefined;
const primarySeedUrl = (primarySeedConfig as Seed | undefined)?.url;
const includeUrlList = primarySeedConfig?.include || config.include || [];
const exclusions = config.exclude || [];
const scopeType = config.scopeType!;
return html`
@ -443,6 +446,7 @@ export class ConfigDetails extends BtrixElement {
msg("Check For Sitemap"),
Boolean(config.useSitemap),
)}
${this.renderLinkSelectors()}
${this.renderSetting(
msg("Additional Page URLs"),
additionalUrlList.length
@ -465,22 +469,44 @@ export class ConfigDetails extends BtrixElement {
: none,
true,
)}
${when(
exclusions.length,
() => html`
<div class="mb-2">
<btrix-queue-exclusion-table
.exclusions=${exclusions}
labelClassName="text-xs text-neutral-500"
>
</btrix-queue-exclusion-table>
</div>
`,
() => this.renderSetting(msg("Exclusions"), none),
)}
${this.renderExclusions()}
`;
};
/**
 * Renders the "Link Selectors" setting for the current crawl config:
 * a read-only selector table when selectors exist, otherwise "None".
 */
private renderLinkSelectors() {
const linkSelectors = this.crawlConfig?.config.selectLinks || [];
// Nothing configured: show the placeholder text instead of an empty table
if (!linkSelectors.length) {
return this.renderSetting(labelFor.selectLink, msg("None"));
}
const table = html`
<div class="mb-2">
<btrix-link-selector-table .selectors=${linkSelectors}>
</btrix-link-selector-table>
</div>
`;
return this.renderSetting(labelFor.selectLink, table);
}
/**
 * Renders the exclusions of the current crawl config: the exclusion table
 * when at least one exclusion exists, otherwise an "Exclusions" setting row
 * showing the `none` placeholder.
 */
private renderExclusions() {
const excludeList = this.crawlConfig?.config.exclude || [];
if (!excludeList.length) {
return this.renderSetting(msg("Exclusions"), none);
}
return html`
<div class="mb-2">
<btrix-queue-exclusion-table
.exclusions=${excludeList}
labelClassName="text-xs text-neutral-500"
>
</btrix-queue-exclusion-table>
</div>
`;
}
private renderSetting(label: string, value: unknown, breakAll?: boolean) {
let content = value;

View File

@ -27,6 +27,7 @@ export class DataTable extends TailwindElement {
static styles = css`
btrix-table {
--btrix-cell-gap: var(--sl-spacing-x-small);
/* TODO Refactor padding config https://github.com/webrecorder/browsertrix/issues/2497 */
--btrix-cell-padding-top: var(--sl-spacing-x-small);
--btrix-cell-padding-bottom: var(--sl-spacing-x-small);
--btrix-cell-padding-left: var(--sl-spacing-x-small);

View File

@ -34,6 +34,7 @@ import("./search-combobox");
import("./section-heading");
import("./select-crawler-proxy");
import("./select-crawler");
import("./syntax-input");
import("./table");
import("./tag-input");
import("./tag");

View File

@ -0,0 +1,189 @@
import { localized } from "@lit/localize";
import type {
SlInput,
SlInputEvent,
SlTooltip,
} from "@shoelace-style/shoelace";
import clsx from "clsx";
import { html } from "lit";
import { customElement, property, query, state } from "lit/decorators.js";
import { ifDefined } from "lit/directives/if-defined.js";
import { TailwindElement } from "@/classes/TailwindElement";
import type { Code } from "@/components/ui/code";
import { tw } from "@/utils/tailwind";
/**
 * Basic single-line text input with code syntax highlighting.
 *
 * Renders an `<sl-input>` together with a `<btrix-code>` element inside an
 * `<sl-tooltip>`. The tooltip content is the current custom validation
 * message and the tooltip is disabled while there is no message.
 *
 * @fires btrix-change NOTE(review): no `btrix-change` event is dispatched
 * anywhere in this class as written -- confirm whether this tag is accurate.
 */
@customElement("btrix-syntax-input")
@localized()
export class SyntaxInput extends TailwindElement {
// Text passed to both the input and the highlighted code element on render
@property({ type: String })
value = "";
// Forwarded to the `minlength` attribute of the input
@property({ type: Number })
minlength = 1;
// Forwarded to the `maxlength` attribute of the input when defined
@property({ type: Number })
maxlength?: number;
// Forwarded to the `required` attribute of the input
@property({ type: Boolean })
required?: boolean;
// Forwarded to the `placeholder` attribute of the input when defined
@property({ type: String })
placeholder?: string;
// Highlighting language forwarded to `<btrix-code>` when defined
@property({ type: String })
language?: Code["language"];
// Current custom validation message; also shown as the tooltip content
@state()
private error = "";
@query("sl-input")
public readonly input?: SlInput | null;
@query("sl-tooltip")
public readonly tooltip?: SlTooltip | null;
@query("btrix-code")
private readonly code?: Code | null;
/**
 * Sets a custom validation message on the input and mirrors it into
 * `error` so that it is displayed in the tooltip. Pass an empty string to
 * clear the message.
 */
public setCustomValidity(message: string) {
this.input?.setCustomValidity(message);
this.error = message;
}
/**
 * Checks validity and asks the input to report it.
 * The tooltip is disabled before reporting and re-enabled (only if there is
 * an error message) by a one-time `focus` listener on the input.
 *
 * @returns the result of `checkValidity()`
 */
public reportValidity() {
const valid = this.checkValidity();
if (this.input && this.tooltip) {
this.tooltip.disabled = true;
// Suppress tooltip validation from showing on focus
this.input.addEventListener(
"focus",
async () => {
await this.updateComplete;
await this.input!.updateComplete;
this.tooltip!.disabled = !this.error;
},
{ once: true },
);
this.input.reportValidity();
}
return valid;
}
/**
 * @returns the validity reported by the input. If the inner input element
 * is not available yet, a required field is treated as invalid and an
 * optional field as valid.
 */
public checkValidity() {
if (!this.input?.input) {
if (this.required) {
return false;
}
return true;
}
return this.input.checkValidity();
}
// Remove the document-level listener that may have been added on focus
disconnectedCallback(): void {
super.disconnectedCallback();
document.removeEventListener("selectionchange", this.onSelectionChange);
}
// The input is given transparent text and a higher z-index (see its classes)
// while `<btrix-code>` is absolutely positioned within the same wrapper and
// shows the highlighted value. Input events push the new text into the code
// element; focus/blur add and remove the `selectionchange` listeners used to
// keep the horizontal scroll positions in sync.
render() {
return html`<sl-tooltip
content=${this.error}
?disabled=${!this.error}
hoist
placement="bottom"
>
<div class=${clsx(tw`relative overflow-hidden p-px`)}>
<sl-input
class=${clsx(
tw`relative z-10 block`,
tw`[--sl-input-border-color:transparent] [--sl-input-border-radius-medium:0] [--sl-input-font-family:var(--sl-font-mono)] [--sl-input-spacing-medium:var(--sl-spacing-small)]`,
tw`caret-black part-[base]:bg-transparent part-[input]:text-transparent`,
)}
spellcheck="false"
value=${this.value}
minlength=${ifDefined(this.minlength)}
maxlength=${ifDefined(this.maxlength)}
placeholder=${ifDefined(this.placeholder)}
?required=${this.required}
@sl-input=${async (e: SlInputEvent) => {
const value = (e.target as SlInput).value;
this.setCustomValidity("");
if (this.code) {
this.code.value = value;
await this.code.updateComplete;
void this.scrollSync({ pad: true });
}
}}
@sl-focus=${() => {
if (!this.input?.input) return;
// For Firefox
this.input.input.addEventListener(
"selectionchange",
this.onSelectionChange,
);
// Non-Firefox
document.addEventListener(
"selectionchange",
this.onSelectionChange,
);
}}
@sl-blur=${() => {
this.input?.input.removeEventListener(
"selectionchange",
this.onSelectionChange,
);
document.removeEventListener(
"selectionchange",
this.onSelectionChange,
);
}}
></sl-input>
<btrix-code
class=${clsx(
tw`absolute inset-0.5 flex items-center overflow-auto px-3 [scrollbar-width:none]`,
)}
value=${this.value}
language=${ifDefined(this.language)}
.wrap=${false}
aria-hidden="true"
></btrix-code>
</div>
</sl-tooltip>`;
}
// Stable handler reference so the same function can be added and removed
private readonly onSelectionChange = () => {
void this.scrollSync();
};
/**
 * Copies the inner input's horizontal scroll offset onto the code element
 * after the input has finished updating.
 *
 * @param opts.pad when true, adds a fixed 8px offset (see TODO below)
 */
private readonly scrollSync = async (opts?: { pad: boolean }) => {
await this.input?.updateComplete;
const innerInput = this.input?.input;
if (!innerInput || !this.code) return;
// TODO Calculate single character width from actual font
const ch = 8;
// Pad scroll left when moving forward to prevent
// delay in cursor moving to the correct position
this.code.scrollLeft = innerInput.scrollLeft + (opts?.pad ? ch : 0);
};
}

View File

@ -2,7 +2,7 @@ import { msg } from "@lit/localize";
import type { ReactiveController, ReactiveControllerHost } from "lit";
import throttle from "lodash/fp/throttle";
import { APIError, type Detail } from "@/utils/api";
import { APIError } from "@/utils/api";
import AuthService from "@/utils/AuthService";
import appState from "@/utils/state";
@ -101,6 +101,7 @@ export class APIController implements ReactiveController {
}
let errorDetail;
let errorDetails = null;
try {
errorDetail = (await resp.json()).detail;
} catch {
@ -151,6 +152,8 @@ export class APIController implements ReactiveController {
if (typeof errorDetail === "string") {
errorMessage = errorDetail;
} else if (Array.isArray(errorDetail) && errorDetail.length) {
errorDetails = errorDetail;
const fieldDetail = errorDetail[0] || {};
const { loc, msg } = fieldDetail;
@ -166,7 +169,7 @@ export class APIController implements ReactiveController {
throw new APIError({
message: errorMessage,
status: resp.status,
details: errorDetail as Detail[],
details: errorDetails,
});
}

View File

@ -1,5 +1,6 @@
import("./exclusion-editor");
import("./live-workflow-status");
import("./link-selector-table");
import("./new-workflow-dialog");
import("./queue-exclusion-form");
import("./queue-exclusion-table");

View File

@ -0,0 +1,285 @@
import { localized, msg } from "@lit/localize";
import clsx from "clsx";
import { createParser } from "css-selector-parser";
import { html, type PropertyValues } from "lit";
import { customElement, property, queryAll, state } from "lit/decorators.js";
import { repeat } from "lit/directives/repeat.js";
import { when } from "lit/directives/when.js";
import { nanoid } from "nanoid";
import { BtrixElement } from "@/classes/BtrixElement";
import type { SyntaxInput } from "@/components/ui/syntax-input";
import type { SeedConfig } from "@/types/crawler";
import { tw } from "@/utils/tailwind";
const SELECTOR_DELIMITER = "->" as const;
const emptyCells = ["", ""];
/**
 * Table of page link selectors, one selector per row.
 *
 * Each entry of `selectors` is a string made of a CSS selector and a link
 * attribute joined by `SELECTOR_DELIMITER`, e.g. `a[href]->href`. Rows are
 * shown as highlighted read-only code, or as syntax-highlighted inputs with
 * add/remove controls when `editable` is set.
 *
 * @fires btrix-change Dispatched after a row is added, edited or removed.
 * `detail.value` contains the current list of selector strings.
 */
@customElement("btrix-link-selector-table")
@localized()
export class LinkSelectorTable extends BtrixElement {
// Selector strings to display; rows are rebuilt whenever this changes
@property({ type: Array })
selectors: SeedConfig["selectLinks"] = [];
// When true, cells render as inputs and row actions are shown
@property({ type: Boolean })
editable = false;
// Internal row model; `id` is a stable key for the `repeat` directive
@state()
private rows: {
id: string;
cells: string[];
}[] = [];
@queryAll("btrix-syntax-input")
private readonly syntaxInputs!: NodeListOf<SyntaxInput>;
// CSS parser should ideally match the parser used in browsertrix-crawler.
// https://github.com/webrecorder/browsertrix-crawler/blob/v1.5.8/package.json#L23
private readonly cssParser = createParser();
/**
 * Current selector strings, built by joining each row's cells with the
 * delimiter. Rows whose first two cells are both empty are omitted.
 */
public get value(): SeedConfig["selectLinks"] {
return this.rows
.filter(({ cells }) => cells[0] || cells[1])
.map(({ cells }) => cells.join(SELECTOR_DELIMITER));
}
/**
 * Reports validity on every input in the table.
 * @returns false if any input is invalid
 */
public reportValidity() {
return this.validateInputs((input) => input.reportValidity());
}
/**
 * Checks validity of every input in the table.
 * @returns false if any input is invalid
 */
public checkValidity() {
return this.validateInputs((input) => input.checkValidity());
}
// Runs `check` on every input (never short-circuits, so each input gets
// checked/reported) and returns whether all of them passed.
private validateInputs(check: (input: SyntaxInput) => boolean) {
let tableValid = true;
this.syntaxInputs.forEach((input) => {
if (!check(input)) {
tableValid = false;
}
});
return tableValid;
}
// Rebuild the row model when a new `selectors` value is passed in
protected willUpdate(changedProperties: PropertyValues): void {
if (changedProperties.has("selectors")) {
this.rows = this.selectors.map((str) => ({
id: nanoid(),
cells: str.split(SELECTOR_DELIMITER),
}));
}
}
render() {
return html`
<btrix-table
class="relative h-full w-full grid-cols-[1fr_1fr_min-content] rounded border"
>
<btrix-table-head
class=${clsx(
tw`rounded-t-[0.1875rem] border-b bg-slate-50 font-medium`,
// TODO Refactor padding config https://github.com/webrecorder/browsertrix/issues/2497
tw`[--btrix-cell-padding-bottom:var(--sl-spacing-x-small)] [--btrix-cell-padding-left:var(--sl-spacing-x-small)] [--btrix-cell-padding-right:var(--sl-spacing-x-small)] [--btrix-cell-padding-top:var(--sl-spacing-x-small)]`,
)}
>
<btrix-table-header-cell>
${msg("CSS Selector")}
</btrix-table-header-cell>
<btrix-table-header-cell class="border-l">
${msg("Link Attribute")}
</btrix-table-header-cell>
${when(
this.editable,
() => html`
<btrix-table-header-cell class="border-l">
<span class="sr-only">${msg("Row actions")}</span>
</btrix-table-header-cell>
`,
)}
</btrix-table-head>
<btrix-table-body class="overflow-auto">
${repeat(this.rows, (row) => row.id, this.row)}
</btrix-table-body>
</btrix-table>
${when(
this.editable,
() => html`
<sl-button
class="mt-1 w-full"
@click=${() =>
void this.updateRows(
{
id: nanoid(),
cells: emptyCells,
},
this.rows.length,
)}
>
<sl-icon slot="prefix" name="plus-lg"></sl-icon>
<span class="text-neutral-600">${msg("Add More")}</span>
</sl-button>
`,
)}
`;
}
// Renders one table row. In editable mode each cell is a syntax input whose
// change handler stores the trimmed value in the row model and then validates
// it (CSS selector via the parser, attribute name via `setAttribute`).
private readonly row = (
{ id, cells }: LinkSelectorTable["rows"][0],
i: number,
) => {
const [sel, attr] = cells;
return html`
<btrix-table-row class=${i > 0 ? "border-t" : ""}>
<btrix-table-cell>
${when(
this.editable,
() => html`
<btrix-syntax-input
class="flex-1"
value=${sel}
language="css"
placeholder=${msg("Enter selector")}
required
@sl-change=${(e: CustomEvent) => {
const el = e.currentTarget as SyntaxInput;
const value = el.input?.value.trim() || "";
void this.updateRows(
{
id,
cells: [value, attr],
},
i,
);
if (value) {
try {
// Validate selector
this.cssParser(value);
} catch {
el.setCustomValidity(
msg("Please enter a valid CSS selector"),
);
}
}
}}
>
</btrix-syntax-input>
`,
() =>
html`<btrix-code
class="m-2"
value=${sel}
language="css"
></btrix-code>`,
)}
</btrix-table-cell>
<btrix-table-cell class="border-l">
${when(
this.editable,
() => html`
<btrix-syntax-input
class="flex-1"
value=${attr}
language="xml"
placeholder=${msg("Enter attribute")}
required
@sl-change=${(e: CustomEvent) => {
const el = e.currentTarget as SyntaxInput;
const value = el.input?.value.trim() || "";
void this.updateRows(
{
id,
cells: [sel, value],
},
i,
);
if (value) {
try {
// Validate attribute
document.createElement("a").setAttribute(value, "x-test");
} catch {
el.setCustomValidity(
msg("Please enter a valid HTML attribute"),
);
}
}
}}
>
</btrix-syntax-input>
`,
() =>
html`<btrix-code
class="m-2"
value=${attr}
language="xml"
></btrix-code>`,
)}
</btrix-table-cell>
${when(
this.editable,
() => html`
<btrix-table-cell class="border-l">
<sl-tooltip content=${msg("Remove")} hoist placement="bottom">
<sl-icon-button
label=${msg("Remove link selector")}
class="text-base hover:text-danger"
name="trash3"
@click=${() => void this.updateRows(undefined, i)}
></sl-icon-button>
</sl-tooltip>
</btrix-table-cell>
`,
)}
</btrix-table-row>
`;
};
/**
 * Replaces the row at `idx` with `row`, or removes it when `row` is
 * undefined. Passing `idx === rows.length` appends. The table never becomes
 * empty: removing the last row leaves a single blank row. Dispatches
 * `btrix-change` once the update has rendered.
 */
private async updateRows(
row: LinkSelectorTable["rows"][0] | undefined,
idx: number,
) {
const pre = this.rows.slice(0, idx);
const ap = this.rows.slice(idx + 1);
const rows = row ? [...pre, row, ...ap] : [...pre, ...ap];
if (rows.length) {
this.rows = rows;
} else {
this.rows = [
{
id: nanoid(),
cells: emptyCells,
},
];
}
await this.updateComplete;
this.dispatchEvent(
new CustomEvent("btrix-change", {
detail: {
value: this.value,
},
}),
);
}
}

View File

@ -186,7 +186,7 @@ export class QueueExclusionTable extends TailwindElement {
class="w-full border-separate leading-none"
style="border-spacing: 0;"
>
<thead class="font-mono text-xs uppercase text-neutral-600">
<thead class="text-xs text-neutral-600">
<tr class="h-10 text-left">
<th class="${typeColClass} w-40 bg-slate-50 px-2 font-normal">
${msg("Exclusion Type")}

View File

@ -29,9 +29,12 @@ import { range } from "lit/directives/range.js";
import { when } from "lit/directives/when.js";
import compact from "lodash/fp/compact";
import flow from "lodash/fp/flow";
import isEqual from "lodash/fp/isEqual";
import throttle from "lodash/fp/throttle";
import uniq from "lodash/fp/uniq";
import type { LinkSelectorTable } from "./link-selector-table";
import { BtrixElement } from "@/classes/BtrixElement";
import type {
SelectCrawlerChangeEvent,
@ -62,6 +65,7 @@ import { labelFor } from "@/strings/crawl-workflows/labels";
import scopeTypeLabels from "@/strings/crawl-workflows/scopeType";
import sectionStrings from "@/strings/crawl-workflows/section";
import { AnalyticsTrackEvent } from "@/trackEvents";
import { APIErrorDetail } from "@/types/api";
import {
Behavior,
ScopeType,
@ -71,7 +75,7 @@ import {
import type { UnderlyingFunction } from "@/types/utils";
import { NewWorkflowOnlyScopeType } from "@/types/workflow";
import { track } from "@/utils/analytics";
import { isApiError } from "@/utils/api";
import { isApiError, isApiErrorDetail } from "@/utils/api";
import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler";
import {
getUTCSchedule,
@ -88,6 +92,7 @@ import { tw } from "@/utils/tailwind";
import {
appDefaults,
BYTES_PER_GB,
DEFAULT_SELECT_LINKS,
defaultLabel,
getDefaultFormState,
getInitialFormState,
@ -122,6 +127,7 @@ const DEFAULT_BEHAVIORS = [
] as const;
const formName = "newJobConfig" as const;
const panelSuffix = "--panel" as const;
const defaultFormState = getDefaultFormState();
const getDefaultProgressState = (hasConfigId = false): ProgressState => {
let activeTab: StepName = "scope";
@ -225,7 +231,7 @@ export class WorkflowEditor extends BtrixElement {
private orgDefaults: WorkflowDefaults = appDefaults;
@state()
private formState = getDefaultFormState();
private formState = defaultFormState;
@state()
private serverError?: TemplateResult | string;
@ -307,6 +313,9 @@ export class WorkflowEditor extends BtrixElement {
@query("btrix-queue-exclusion-table")
private readonly exclusionTable?: QueueExclusionTable | null;
@query("btrix-link-selector-table")
private readonly linkSelectorTable?: LinkSelectorTable | null;
connectedCallback(): void {
this.initializeEditor();
super.connectedCallback();
@ -731,7 +740,10 @@ export class WorkflowEditor extends BtrixElement {
@btrix-change=${this.handleChangeRegex}
></btrix-queue-exclusion-table>
`)}
${this.renderHelpTextCol(infoTextStrings["exclusions"])}
${this.renderHelpTextCol(
infoTextStrings["exclusions"],
false,
)}
</div>
</btrix-details>
</div>
@ -844,6 +856,9 @@ https://archiveweb.page/guide`}
msg(`If checked, the crawler will visit pages one link away.`),
false,
)}
${when(this.formState.includeLinkedPages, () =>
this.renderLinkSelectors(),
)}
`;
};
@ -1042,6 +1057,7 @@ https://example.net`}
),
false,
)}
${this.renderLinkSelectors()}
<div class="col-span-5">
<btrix-details>
@ -1108,6 +1124,36 @@ https://archiveweb.page/images/${"logo.svg"}`}
}
}
/**
 * Renders the collapsible "Link Selectors" editor section.
 * The section starts open and shows a count badge only when the form's
 * selectors differ from the default form state.
 */
private renderLinkSelectors() {
const linkSelectors = this.formState.selectLinks;
const hasCustomSelectors = !isEqual(
defaultFormState.selectLinks,
linkSelectors,
);
const countBadge = hasCustomSelectors
? html`<btrix-badge>${linkSelectors.length}</btrix-badge>`
: "";
// Re-sync the table's invalid markers whenever its rows change
const onTableChange = () => {
this.updateSelectorsValidity();
};
return html`
<div class="col-span-5">
<btrix-details ?open=${hasCustomSelectors}>
<span slot="title">
${labelFor.selectLink}
${countBadge}
</span>
<div class="grid grid-cols-5 gap-5 py-2">
${inputCol(
html`<btrix-link-selector-table
.selectors=${linkSelectors}
editable
@btrix-change=${onTableChange}
></btrix-link-selector-table>`,
)}
${this.renderHelpTextCol(infoTextStrings["selectLinks"], false)}
</div>
</btrix-details>
</div>
`;
}
private renderCrawlLimits() {
// Max Pages minimum value cannot be lower than seed count
const minPages = Math.max(
@ -1858,6 +1904,20 @@ https://archiveweb.page/images/${"logo.svg"}`}
}
}
/**
 * HACK Mirror the link selector table's validity onto its
 * `data-invalid` / `data-user-invalid` attributes by hand so that
 * the table works with `syncTabErrorState`
 */
private updateSelectorsValidity() {
const table = this.linkSelectorTable;
// Nothing to mark when the table is not rendered
if (!table) return;
const markerAttrs = ["data-invalid", "data-user-invalid"];
const isInvalid = table.checkValidity() === false;
for (const attrName of markerAttrs) {
if (isInvalid) {
table.setAttribute(attrName, "true");
} else {
table.removeAttribute(attrName);
}
}
}
private readonly validateOnBlur = async (e: Event) => {
const el = e.target as SlInput | SlTextarea | SlSelect | SlCheckbox;
const tagName = el.tagName.toLowerCase();
@ -2071,6 +2131,8 @@ https://archiveweb.page/images/${"logo.svg"}`}
id: "workflow-created-status",
});
} else {
// TODO Handle field errors more consistently
// https://github.com/webrecorder/browsertrix/issues/2512
this.notify.toast({
message: msg("Please fix all errors and try again."),
variant: "danger",
@ -2083,11 +2145,26 @@ https://archiveweb.page/images/${"logo.svg"}`}
: e.details;
if (typeof errorDetail === "string") {
this.serverError = `${msg("Please fix the following issue: ")} ${
errorDetail === "invalid_regex"
? msg("Page exclusion contains invalid regex")
: errorDetail.replace(/_/, " ")
}`;
let errorDetailMessage = errorDetail.replace(/_/, " ");
if (isApiErrorDetail(errorDetail)) {
switch (errorDetail) {
case APIErrorDetail.WorkflowInvalidLinkSelector:
errorDetailMessage = msg(
"Page link selectors contain invalid selector or attribute",
);
break;
case APIErrorDetail.WorkflowInvalidRegex:
errorDetailMessage = msg(
"Page exclusion contains invalid regex",
);
break;
default:
break;
}
}
this.serverError = `${msg("Please fix the following issue: ")} ${errorDetailMessage}`;
}
}
} else {
@ -2200,7 +2277,9 @@ https://archiveweb.page/images/${"logo.svg"}`}
blockAds: this.formState.blockAds,
exclude: trimArray(this.formState.exclusions),
behaviors: this.setBehaviors(),
selectLinks: ["a[href]->href"],
selectLinks: this.linkSelectorTable?.value.length
? this.linkSelectorTable.value
: DEFAULT_SELECT_LINKS,
},
crawlerChannel: this.formState.crawlerChannel || "default",
proxyId: this.formState.proxyId,

View File

@ -10,7 +10,10 @@ import type { UserGuideEventMap } from "@/index";
import { pageNav, type Breadcrumb } from "@/layouts/pageHeader";
import { WorkflowScopeType } from "@/types/workflow";
import LiteElement, { html } from "@/utils/LiteElement";
import type { FormState as WorkflowFormState } from "@/utils/workflow";
import {
DEFAULT_SELECT_LINKS,
type FormState as WorkflowFormState,
} from "@/utils/workflow";
type GuideHash =
| "scope"
@ -84,7 +87,7 @@ export class WorkflowsNew extends LiteElement {
useSitemap: false,
failOnFailedSeed: false,
userAgent: null,
selectLinks: ["a[href]->href"],
selectLinks: DEFAULT_SELECT_LINKS,
},
tags: [],
crawlTimeout: null,

View File

@ -0,0 +1,3 @@
// Placeholder for error strings; currently an empty object.
// TODO Add all error codes
// https://github.com/webrecorder/browsertrix/issues/2512
export const errorFor = {};

View File

@ -59,7 +59,19 @@ const infoText: Partial<Record<Field, string | TemplateResult>> = {
),
lang: msg(`Websites that observe the browsers language setting may serve
content in that language if available.`),
proxyId: msg(`Choose a proxy to crawl through`),
proxyId: msg(`Choose a proxy to crawl through.`),
selectLinks: msg(
html`Customize how URLs are extracted from a page. The crawler will use the
specified
<a
href="https://developer.mozilla.org/en-US/docs/Learn_web_development/Core/Styling_basics/Basic_selectors"
class="text-blue-600 hover:text-blue-500"
target="_blank"
rel="noopener noreferrer nofollow"
>CSS selectors</a
>
to find URLs that are defined in custom HTML attributes.`,
),
};
export default infoText;

View File

@ -8,4 +8,5 @@ export const labelFor = {
postLoadDelaySeconds: msg("Delay After Page Load"),
behaviorTimeoutSeconds: "Behavior Limit",
pageExtraDelaySeconds: msg("Delay Before Next Page"),
selectLink: msg("Link Selectors"),
};

View File

@ -77,6 +77,8 @@
--sl-input-font-size-medium: var(--sl-font-size-small);
--sl-input-font-size-large: var(--sl-font-size-medium);
--sl-input-placeholder-color: var(--sl-color-neutral-400);
/* From GitHub Primer https://github.com/primer/primitives/blob/8b767947e35a79db17b9d7970836f03c904c8afe/data/colors/vars/global_light.ts#L47 */
/* TODO replace hardcoded color */
--sl-input-required-content-color: #9a6700;
@ -385,6 +387,11 @@
border-radius: var(--sl-input-border-radius-small);
}
sl-textarea::part(textarea)::placeholder,
sl-input::part(input)::placeholder {
font-weight: var(--sl-font-weight-light);
}
sl-drawer::part(header) {
--header-spacing: var(--sl-spacing-small);
}

View File

@ -34,3 +34,12 @@ export type APISortQuery<T = Record<string, unknown>> = {
sortBy?: keyof T;
sortDirection?: SortDirection;
};
// TODO Add all error codes
// https://github.com/webrecorder/browsertrix/issues/2512
/**
 * Known API error detail codes (incomplete, see TODO above).
 * The string values are compared against error details in the workflow editor.
 */
export enum APIErrorDetail {
WorkflowInvalidLinkSelector = "invalid_link_selector",
WorkflowInvalidRegex = "invalid_regex",
}
// Schema that accepts only `APIErrorDetail` values, and its inferred type.
// NOTE(review): `z` is presumably zod; its import is not visible in this hunk.
export const APIErrorDetailEnum = z.nativeEnum(APIErrorDetail);
export type APIErrorDetailEnum = z.infer<typeof APIErrorDetailEnum>;

View File

@ -1,3 +1,5 @@
import { APIErrorDetailEnum, type APIErrorDetail } from "@/types/api";
type StatusCode = number;
export type Detail = {
loc: string[];
@ -7,7 +9,10 @@ export type Detail = {
export class APIError extends Error {
statusCode: StatusCode;
// TODO Refactor so that details is always the array returned from API
// https://github.com/webrecorder/browsertrix/issues/2512
details: Detail[] | string | null;
errorCode: APIErrorDetail | string | null;
get isApiError(): true {
return true;
@ -17,18 +22,27 @@ export class APIError extends Error {
message,
status,
details,
errorCode,
}: {
message: string;
status: StatusCode;
details?: Detail[];
details?: APIError["details"];
errorCode?: APIError["errorCode"];
}) {
super(message);
this.statusCode = status;
this.details = details || null;
this.errorCode = errorCode || null;
}
}
/**
 * Type guard for `APIError`.
 * Relies on the truthy `isApiError` marker property instead of `instanceof`.
 */
export function isApiError(error: unknown): error is APIError {
const candidate = error as APIError | undefined;
return Boolean(candidate?.isApiError);
}
/**
 * Type guard that checks whether `detail` is a non-empty string matching
 * one of the `APIErrorDetail` codes.
 */
export function isApiErrorDetail(detail: unknown): detail is APIErrorDetail {
return (
typeof detail === "string" &&
detail !== "" &&
APIErrorDetailEnum.safeParse(detail).success
);
}

View File

@ -23,6 +23,7 @@ import localize, { getDefaultLang } from "@/utils/localize";
import { regexUnescape } from "@/utils/string";
export const BYTES_PER_GB = 1e9;
export const DEFAULT_SELECT_LINKS = ["a[href]->href" as const];
export const SECTIONS = [
"scope",
@ -90,6 +91,7 @@ export type FormState = {
userAgent: string | null;
crawlerChannel: string;
proxyId: string | null;
selectLinks: string[];
};
export type FormStateField = keyof FormState;
@ -145,6 +147,7 @@ export const getDefaultFormState = (): FormState => ({
userAgent: null,
crawlerChannel: "default",
proxyId: null,
selectLinks: DEFAULT_SELECT_LINKS,
});
export const mapSeedToUrl = (arr: Seed[]) =>
@ -290,6 +293,7 @@ export function getInitialFormState(params: {
autoclickBehavior: params.initialWorkflow.config.behaviors
? params.initialWorkflow.config.behaviors.includes(Behavior.AutoClick)
: defaultFormState.autoclickBehavior,
selectLinks: params.initialWorkflow.config.selectLinks,
userAgent:
params.initialWorkflow.config.userAgent ?? defaultFormState.userAgent,
crawlerChannel:

10
frontend/yarn.lock generated
View File

@ -4142,6 +4142,11 @@ css-select@^4.1.3:
domutils "^2.8.0"
nth-check "^2.0.1"
css-selector-parser@^3.0.5:
version "3.1.1"
resolved "https://registry.yarnpkg.com/css-selector-parser/-/css-selector-parser-3.1.1.tgz#d6635dad07e54d0985884c148d8c35e404b0a2a6"
integrity sha512-Y+DuvJ7JAjpL1f4DeILe5sXCC3kRXMl0DxM4lAWbS8/jEZ29o3V0L5TL6zIifj4Csmj6c+jiF2ENjida2OVOGA==
css-what@^6.0.1:
version "6.1.0"
resolved "https://registry.yarnpkg.com/css-what/-/css-what-6.1.0.tgz#fb5effcf76f1ddea2c81bdfaa4de44e79bac70f4"
@ -7044,6 +7049,11 @@ nanoid@^3.3.7:
resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.7.tgz#d0c301a691bc8d54efa0a2226ccf3fe2fd656bd8"
integrity sha512-eSRppjcPIatRIMC1U6UngP8XFcz8MQWGQdt1MTBQ7NaAmvXDfvNxbvWV3x2y6CdEUciCSsDHDQZbhYaB8QEo2g==
nanoid@^5.1.5:
version "5.1.5"
resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-5.1.5.tgz#f7597f9d9054eb4da9548cdd53ca70f1790e87de"
integrity sha512-Ir/+ZpE9fDsNH0hQ3C68uyThDXzYcim2EqcZ8zn8Chtt1iylPT9xXJB0kPCnqzgcEGikO9RxSrh63MsmVCU7Fw==
natural-compare@^1.4.0:
version "1.4.0"
resolved "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz"