feat: Add custom behaviors to org crawling defaults (#2546)

Resolves https://github.com/webrecorder/browsertrix/issues/2513

## Changes

- Allows org admins to set custom behaviors as crawling defaults
- Shows warning text on the autoscroll/autoclick options when custom
behaviors are also enabled, since custom behaviors may override them
- Refactors `infoTextStrings` -> `infoTextFor` (now also a named export)
to match other string lookups such as `labelFor`

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
sua yoo 2025-04-09 01:10:30 -07:00 committed by GitHub
parent 0a0d2d04d3
commit 7c6bae8d61
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 108 additions and 48 deletions

View File

@ -67,7 +67,7 @@ import type {
import { infoCol, inputCol } from "@/layouts/columns"; import { infoCol, inputCol } from "@/layouts/columns";
import { pageSectionsWithNav } from "@/layouts/pageSectionsWithNav"; import { pageSectionsWithNav } from "@/layouts/pageSectionsWithNav";
import { panel } from "@/layouts/panel"; import { panel } from "@/layouts/panel";
import infoTextStrings from "@/strings/crawl-workflows/infoText"; import { infoTextFor } from "@/strings/crawl-workflows/infoText";
import { labelFor } from "@/strings/crawl-workflows/labels"; import { labelFor } from "@/strings/crawl-workflows/labels";
import scopeTypeLabels from "@/strings/crawl-workflows/scopeType"; import scopeTypeLabels from "@/strings/crawl-workflows/scopeType";
import sectionStrings from "@/strings/crawl-workflows/section"; import sectionStrings from "@/strings/crawl-workflows/section";
@ -758,10 +758,7 @@ export class WorkflowEditor extends BtrixElement {
@btrix-change=${this.handleChangeRegex} @btrix-change=${this.handleChangeRegex}
></btrix-queue-exclusion-table> ></btrix-queue-exclusion-table>
`)} `)}
${this.renderHelpTextCol( ${this.renderHelpTextCol(infoTextFor["exclusions"], false)}
infoTextStrings["exclusions"],
false,
)}
</div> </div>
</btrix-details> </btrix-details>
</div> </div>
@ -1176,7 +1173,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
)} )}
${this.renderHelpTextCol( ${this.renderHelpTextCol(
html` html`
${infoTextStrings["selectLinks"]} ${infoTextFor["selectLinks"]}
<br /><br /> <br /><br />
${msg( ${msg(
html`If none are specified, the crawler will default to html`If none are specified, the crawler will default to
@ -1235,7 +1232,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
</sl-input> </sl-input>
</sl-mutation-observer> </sl-mutation-observer>
`)} `)}
${this.renderHelpTextCol(infoTextStrings["pageLimit"])} ${this.renderHelpTextCol(infoTextFor["pageLimit"])}
${inputCol(html` ${inputCol(html`
<sl-input <sl-input
name="crawlTimeoutMinutes" name="crawlTimeoutMinutes"
@ -1249,7 +1246,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="suffix">${msg("minutes")}</span> <span slot="suffix">${msg("minutes")}</span>
</sl-input> </sl-input>
`)} `)}
${this.renderHelpTextCol(infoTextStrings["crawlTimeoutMinutes"])} ${this.renderHelpTextCol(infoTextFor["crawlTimeoutMinutes"])}
${inputCol(html` ${inputCol(html`
<sl-input <sl-input
name="maxCrawlSizeGB" name="maxCrawlSizeGB"
@ -1263,19 +1260,34 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="suffix">${msg("GB")}</span> <span slot="suffix">${msg("GB")}</span>
</sl-input> </sl-input>
`)} `)}
${this.renderHelpTextCol(infoTextStrings["maxCrawlSizeGB"])} ${this.renderHelpTextCol(infoTextFor["maxCrawlSizeGB"])}
`; `;
} }
private renderPageBehavior() { private renderPageBehavior() {
const behaviorOverrideWarning = html`
<span slot="help-text" class="text-warning-600">
<sl-icon
name="exclamation-triangle"
class="align-[-.175em] text-sm"
></sl-icon>
${msg("May be overridden by custom behaviors.")}
</span>
`;
return html` return html`
${this.renderSectionHeading(labelFor.behaviors)} ${this.renderSectionHeading(labelFor.behaviors)}
${inputCol( ${inputCol(
html`<sl-checkbox html`<sl-checkbox
name="autoscrollBehavior" name="autoscrollBehavior"
class="part-[form-control-help-text]:mt-1.5"
?checked=${this.formState.autoscrollBehavior} ?checked=${this.formState.autoscrollBehavior}
> >
${labelFor.autoscrollBehavior} ${labelFor.autoscrollBehavior}
${when(
this.formState.autoscrollBehavior && this.formState.customBehavior,
() => behaviorOverrideWarning,
)}
</sl-checkbox>`, </sl-checkbox>`,
)} )}
${this.renderHelpTextCol( ${this.renderHelpTextCol(
@ -1285,9 +1297,14 @@ https://archiveweb.page/images/${"logo.svg"}`}
${inputCol( ${inputCol(
html`<sl-checkbox html`<sl-checkbox
name="autoclickBehavior" name="autoclickBehavior"
class="part-[form-control-help-text]:mt-1.5"
?checked=${this.formState.autoclickBehavior} ?checked=${this.formState.autoclickBehavior}
> >
${labelFor.autoclickBehavior} ${labelFor.autoclickBehavior}
${when(
this.formState.autoclickBehavior && this.formState.customBehavior,
() => behaviorOverrideWarning,
)}
</sl-checkbox> </sl-checkbox>
${when( ${when(
@ -1360,7 +1377,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="suffix">${msg("seconds")}</span> <span slot="suffix">${msg("seconds")}</span>
</sl-input> </sl-input>
`)} `)}
${this.renderHelpTextCol(infoTextStrings["pageLoadTimeoutSeconds"])} ${this.renderHelpTextCol(infoTextFor["pageLoadTimeoutSeconds"])}
${inputCol(html` ${inputCol(html`
<sl-input <sl-input
name="postLoadDelaySeconds" name="postLoadDelaySeconds"
@ -1374,7 +1391,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="suffix">${msg("seconds")}</span> <span slot="suffix">${msg("seconds")}</span>
</sl-input> </sl-input>
`)} `)}
${this.renderHelpTextCol(infoTextStrings["postLoadDelaySeconds"])} ${this.renderHelpTextCol(infoTextFor["postLoadDelaySeconds"])}
${inputCol(html` ${inputCol(html`
<sl-input <sl-input
name="behaviorTimeoutSeconds" name="behaviorTimeoutSeconds"
@ -1389,7 +1406,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="suffix">${msg("seconds")}</span> <span slot="suffix">${msg("seconds")}</span>
</sl-input> </sl-input>
`)} `)}
${this.renderHelpTextCol(infoTextStrings["behaviorTimeoutSeconds"])} ${this.renderHelpTextCol(infoTextFor["behaviorTimeoutSeconds"])}
${inputCol(html` ${inputCol(html`
<sl-input <sl-input
name="pageExtraDelaySeconds" name="pageExtraDelaySeconds"
@ -1403,7 +1420,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="suffix">${msg("seconds")}</span> <span slot="suffix">${msg("seconds")}</span>
</sl-input> </sl-input>
`)} `)}
${this.renderHelpTextCol(infoTextStrings["pageExtraDelaySeconds"])} ${this.renderHelpTextCol(infoTextFor["pageExtraDelaySeconds"])}
`; `;
} }
@ -1433,12 +1450,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
`, `,
)} `, )} `,
)} )}
${this.renderHelpTextCol( ${this.renderHelpTextCol(infoTextFor.customBehavior, false)}
msg(
`Enable custom page actions with behavior scripts. You can specify any publicly accessible URL or public Git repository.`,
),
false,
)}
`; `;
} }
@ -1454,7 +1466,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
})} })}
></btrix-select-browser-profile> ></btrix-select-browser-profile>
`)} `)}
${this.renderHelpTextCol(infoTextStrings["browserProfile"])} ${this.renderHelpTextCol(infoTextFor["browserProfile"])}
${this.proxies?.servers.length ${this.proxies?.servers.length
? [ ? [
inputCol(html` inputCol(html`
@ -1470,7 +1482,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
})} })}
></btrix-select-crawler-proxy> ></btrix-select-crawler-proxy>
`), `),
this.renderHelpTextCol(infoTextStrings["proxyId"]), this.renderHelpTextCol(infoTextFor["proxyId"]),
] ]
: nothing} : nothing}
${inputCol(html` ${inputCol(html`
@ -1517,14 +1529,14 @@ https://archiveweb.page/images/${"logo.svg"}`}
></btrix-select-crawler> ></btrix-select-crawler>
`)} `)}
${this.showCrawlerChannels ${this.showCrawlerChannels
? this.renderHelpTextCol(infoTextStrings["crawlerChannel"]) ? this.renderHelpTextCol(infoTextFor["crawlerChannel"])
: html``} : html``}
${inputCol(html` ${inputCol(html`
<sl-checkbox name="blockAds" ?checked=${this.formState.blockAds}> <sl-checkbox name="blockAds" ?checked=${this.formState.blockAds}>
${msg("Block ads by domain")} ${msg("Block ads by domain")}
</sl-checkbox> </sl-checkbox>
`)} `)}
${this.renderHelpTextCol(infoTextStrings["blockAds"], false)} ${this.renderHelpTextCol(infoTextFor["blockAds"], false)}
${inputCol(html` ${inputCol(html`
<sl-input <sl-input
name="userAgent" name="userAgent"
@ -1535,7 +1547,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
> >
</sl-input> </sl-input>
`)} `)}
${this.renderHelpTextCol(infoTextStrings["userAgent"])} ${this.renderHelpTextCol(infoTextFor["userAgent"])}
${inputCol(html` ${inputCol(html`
<btrix-language-select <btrix-language-select
.value=${this.formState.lang as LanguageCode} .value=${this.formState.lang as LanguageCode}
@ -1548,7 +1560,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="label">${msg("Language")}</span> <span slot="label">${msg("Language")}</span>
</btrix-language-select> </btrix-language-select>
`)} `)}
${this.renderHelpTextCol(infoTextStrings["lang"])} ${this.renderHelpTextCol(infoTextFor["lang"])}
`; `;
} }
@ -2191,10 +2203,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
// TODO Move away from manual validation check // TODO Move away from manual validation check
// See https://github.com/webrecorder/browsertrix/issues/2536 // See https://github.com/webrecorder/browsertrix/issues/2536
if ( if (this.formState.autoclickBehavior && this.clickSelector) {
this.formState.autoclickBehavior &&
this.clickSelector
) {
if (!this.clickSelector.checkValidity()) { if (!this.clickSelector.checkValidity()) {
this.clickSelector.reportValidity(); this.clickSelector.reportValidity();
return; return;
@ -2434,7 +2443,9 @@ https://archiveweb.page/images/${"logo.svg"}`}
selectLinks: this.linkSelectorTable?.value.length selectLinks: this.linkSelectorTable?.value.length
? this.linkSelectorTable.value ? this.linkSelectorTable.value
: DEFAULT_SELECT_LINKS, : DEFAULT_SELECT_LINKS,
customBehaviors: this.customBehaviorsTable?.value || [], customBehaviors:
(this.formState.customBehavior && this.customBehaviorsTable?.value) ||
[],
clickSelector: clickSelector:
this.formState.clickSelector || DEFAULT_AUTOCLICK_SELECTOR, this.formState.clickSelector || DEFAULT_AUTOCLICK_SELECTOR,
}, },

View File

@ -12,11 +12,14 @@ import { BtrixElement } from "@/classes/BtrixElement";
import type { LanguageSelect } from "@/components/ui/language-select"; import type { LanguageSelect } from "@/components/ui/language-select";
import type { SelectCrawlerProxy } from "@/components/ui/select-crawler-proxy"; import type { SelectCrawlerProxy } from "@/components/ui/select-crawler-proxy";
import { proxiesContext, type ProxiesContext } from "@/context/org"; import { proxiesContext, type ProxiesContext } from "@/context/org";
import type { CustomBehaviorsTable } from "@/features/crawl-workflows/custom-behaviors-table";
import type { QueueExclusionTable } from "@/features/crawl-workflows/queue-exclusion-table"; import type { QueueExclusionTable } from "@/features/crawl-workflows/queue-exclusion-table";
import { columns, type Cols } from "@/layouts/columns"; import { columns, type Cols } from "@/layouts/columns";
import infoTextStrings from "@/strings/crawl-workflows/infoText"; import { infoTextFor } from "@/strings/crawl-workflows/infoText";
import { labelFor } from "@/strings/crawl-workflows/labels";
import sectionStrings from "@/strings/crawl-workflows/section"; import sectionStrings from "@/strings/crawl-workflows/section";
import { crawlingDefaultsSchema, type CrawlingDefaults } from "@/types/org"; import { crawlingDefaultsSchema, type CrawlingDefaults } from "@/types/org";
import { formValidator } from "@/utils/form";
import { import {
appDefaults, appDefaults,
BYTES_PER_GB, BYTES_PER_GB,
@ -32,14 +35,10 @@ type Field = Record<FieldName, TemplateResult<1> | undefined>;
const PLACEHOLDER_EXCLUSIONS = [""]; // Add empty slot const PLACEHOLDER_EXCLUSIONS = [""]; // Add empty slot
function section(section: SectionsEnum | "exclusions", cols: Cols) { function section(section: SectionsEnum, cols: Cols) {
return html` return html`
<section class="p-5"> <section class="p-5">
<btrix-section-heading <btrix-section-heading>${sectionStrings[section]}</btrix-section-heading>
>${section === "exclusions"
? msg("Exclusions")
: sectionStrings[section]}</btrix-section-heading
>
${columns(cols)} ${columns(cols)}
</section> </section>
`; `;
@ -63,6 +62,9 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
@query("btrix-queue-exclusion-table") @query("btrix-queue-exclusion-table")
exclusionTable?: QueueExclusionTable | null; exclusionTable?: QueueExclusionTable | null;
@query("btrix-custom-behaviors-table")
customBehaviorsTable?: CustomBehaviorsTable | null;
@query("btrix-language-select") @query("btrix-language-select")
languageSelect?: LanguageSelect | null; languageSelect?: LanguageSelect | null;
@ -72,6 +74,8 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
@query('sl-button[type="submit"]') @query('sl-button[type="submit"]')
submitButton?: SlButton | null; submitButton?: SlButton | null;
private readonly checkFormValidity = formValidator(this);
connectedCallback() { connectedCallback() {
super.connectedCallback(); super.connectedCallback();
@ -140,6 +144,13 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
`, `,
}; };
const behaviors = { const behaviors = {
customBehavior: html`
<label class="form-label text-xs">${labelFor.customBehaviors}</label>
<btrix-custom-behaviors-table
.customBehaviors=${orgDefaults.customBehaviors || []}
editable
></btrix-custom-behaviors-table>
`,
pageLoadTimeoutSeconds: html` pageLoadTimeoutSeconds: html`
<sl-input <sl-input
size="small" size="small"
@ -258,7 +269,7 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
limits, limits,
behaviors, behaviors,
browserSettings, browserSettings,
}; } as const;
} }
private renderWorkflowDefaults() { private renderWorkflowDefaults() {
@ -270,10 +281,11 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
section( section(
sectionName as SectionsEnum, sectionName as SectionsEnum,
Object.entries(fields) Object.entries(fields)
.filter(([, field]) => field as unknown) // eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
.filter(([, field]) => field)
.map(([fieldName, field]) => [ .map(([fieldName, field]) => [
field, field,
infoTextStrings[fieldName as FieldName], infoTextFor[fieldName as keyof typeof infoTextFor],
]), ]),
), ),
), ),
@ -292,6 +304,31 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
e.preventDefault(); e.preventDefault();
const form = e.target as HTMLFormElement; const form = e.target as HTMLFormElement;
// Wait for custom behaviors validation to finish
// TODO Move away from manual validation check
// See https://github.com/webrecorder/browsertrix/issues/2536
if (this.customBehaviorsTable) {
if (!this.customBehaviorsTable.checkValidity()) {
this.customBehaviorsTable.reportValidity();
return;
}
try {
await this.customBehaviorsTable.taskComplete;
} catch {
this.customBehaviorsTable.reportValidity();
return;
}
}
const isValid = await this.checkFormValidity(form);
if (!isValid) {
form.reportValidity();
return;
}
const values = serialize(form) as Record<string, string>; const values = serialize(form) as Record<string, string>;
const parseNumber = (value: string) => (value ? Number(value) : undefined); const parseNumber = (value: string) => (value ? Number(value) : undefined);
const parsedValues: CrawlingDefaults = { const parsedValues: CrawlingDefaults = {
@ -312,6 +349,7 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
userAgent: values.userAgent, userAgent: values.userAgent,
lang: this.languageSelect?.value || undefined, lang: this.languageSelect?.value || undefined,
exclude: this.exclusionTable?.exclusions?.filter((v) => v) || [], exclude: this.exclusionTable?.exclusions?.filter((v) => v) || [],
customBehaviors: this.customBehaviorsTable?.value || [],
}; };
// Set null or empty strings to undefined // Set null or empty strings to undefined

View File

@ -160,6 +160,7 @@ export class WorkflowsNew extends LiteElement {
userAgent: org.crawlingDefaults?.userAgent, userAgent: org.crawlingDefaults?.userAgent,
blockAds: org.crawlingDefaults?.blockAds, blockAds: org.crawlingDefaults?.blockAds,
lang: org.crawlingDefaults?.lang, lang: org.crawlingDefaults?.lang,
customBehaviors: org.crawlingDefaults?.customBehaviors,
}, },
crawlTimeout: org.crawlingDefaults?.crawlTimeout, crawlTimeout: org.crawlingDefaults?.crawlTimeout,
maxCrawlSize: org.crawlingDefaults?.maxCrawlSize, maxCrawlSize: org.crawlingDefaults?.maxCrawlSize,

View File

@ -5,7 +5,7 @@ import { type FormState } from "@/utils/workflow";
type Field = keyof FormState; type Field = keyof FormState;
const infoText: Partial<Record<Field, string | TemplateResult>> = { export const infoTextFor = {
exclusions: msg( exclusions: msg(
"Specify exclusion rules for what pages should not be visited.", "Specify exclusion rules for what pages should not be visited.",
), ),
@ -72,6 +72,9 @@ const infoText: Partial<Record<Field, string | TemplateResult>> = {
> >
to find URLs that are defined in custom HTML attributes.`, to find URLs that are defined in custom HTML attributes.`,
), ),
}; customBehavior: msg(
`Enable custom page actions with behavior scripts. You can specify any publicly accessible URL or public Git repository.`,
),
} as const satisfies Partial<Record<Field, string | TemplateResult>>;
export default infoText; export default infoTextFor;

View File

@ -47,6 +47,7 @@ export const crawlingDefaultsSchema = z.object({
lang: z.string().optional(), lang: z.string().optional(),
userAgent: z.string().optional(), userAgent: z.string().optional(),
exclude: z.array(z.string()), exclude: z.array(z.string()),
customBehaviors: z.array(z.string()),
}); });
export type CrawlingDefaults = z.infer<typeof crawlingDefaultsSchema>; export type CrawlingDefaults = z.infer<typeof crawlingDefaultsSchema>;

View File

@ -244,6 +244,10 @@ export function getInitialFormState(params: {
return fallback; return fallback;
}; };
const enableCustomBehaviors = Boolean(
params.initialWorkflow.config.customBehaviors.length,
);
return { return {
...defaultFormState, ...defaultFormState,
primarySeedUrl: defaultFormState.primarySeedUrl, primarySeedUrl: defaultFormState.primarySeedUrl,
@ -294,13 +298,15 @@ export function getInitialFormState(params: {
params.initialWorkflow.config.limit ?? defaultFormState.pageLimit, params.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
autoscrollBehavior: params.initialWorkflow.config.behaviors autoscrollBehavior: params.initialWorkflow.config.behaviors
? params.initialWorkflow.config.behaviors.includes(Behavior.AutoScroll) ? params.initialWorkflow.config.behaviors.includes(Behavior.AutoScroll)
: defaultFormState.autoscrollBehavior, : enableCustomBehaviors
? false
: defaultFormState.autoscrollBehavior,
autoclickBehavior: params.initialWorkflow.config.behaviors autoclickBehavior: params.initialWorkflow.config.behaviors
? params.initialWorkflow.config.behaviors.includes(Behavior.AutoClick) ? params.initialWorkflow.config.behaviors.includes(Behavior.AutoClick)
: defaultFormState.autoclickBehavior, : enableCustomBehaviors
customBehavior: Boolean( ? false
params.initialWorkflow.config.customBehaviors.length, : defaultFormState.autoclickBehavior,
), customBehavior: enableCustomBehaviors,
selectLinks: params.initialWorkflow.config.selectLinks, selectLinks: params.initialWorkflow.config.selectLinks,
clickSelector: params.initialWorkflow.config.clickSelector, clickSelector: params.initialWorkflow.config.clickSelector,
userAgent: userAgent: