feat: Add custom behaviors to org crawling defaults (#2546)

Resolves https://github.com/webrecorder/browsertrix/issues/2513

## Changes

- Allows org admins to set custom behaviors as crawling defaults
- Shows warning text on the autoscroll/autoclick checkboxes when custom
behaviors are also enabled, since custom behaviors may override them
- Refactors `infoTextStrings` -> `infoTextFor` to match the naming of other
label/string lookups (e.g. `labelFor`)

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
sua yoo 2025-04-09 01:10:30 -07:00 committed by GitHub
parent 0a0d2d04d3
commit 7c6bae8d61
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 108 additions and 48 deletions

View File

@ -67,7 +67,7 @@ import type {
import { infoCol, inputCol } from "@/layouts/columns";
import { pageSectionsWithNav } from "@/layouts/pageSectionsWithNav";
import { panel } from "@/layouts/panel";
import infoTextStrings from "@/strings/crawl-workflows/infoText";
import { infoTextFor } from "@/strings/crawl-workflows/infoText";
import { labelFor } from "@/strings/crawl-workflows/labels";
import scopeTypeLabels from "@/strings/crawl-workflows/scopeType";
import sectionStrings from "@/strings/crawl-workflows/section";
@ -758,10 +758,7 @@ export class WorkflowEditor extends BtrixElement {
@btrix-change=${this.handleChangeRegex}
></btrix-queue-exclusion-table>
`)}
${this.renderHelpTextCol(
infoTextStrings["exclusions"],
false,
)}
${this.renderHelpTextCol(infoTextFor["exclusions"], false)}
</div>
</btrix-details>
</div>
@ -1176,7 +1173,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
)}
${this.renderHelpTextCol(
html`
${infoTextStrings["selectLinks"]}
${infoTextFor["selectLinks"]}
<br /><br />
${msg(
html`If none are specified, the crawler will default to
@ -1235,7 +1232,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
</sl-input>
</sl-mutation-observer>
`)}
${this.renderHelpTextCol(infoTextStrings["pageLimit"])}
${this.renderHelpTextCol(infoTextFor["pageLimit"])}
${inputCol(html`
<sl-input
name="crawlTimeoutMinutes"
@ -1249,7 +1246,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="suffix">${msg("minutes")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(infoTextStrings["crawlTimeoutMinutes"])}
${this.renderHelpTextCol(infoTextFor["crawlTimeoutMinutes"])}
${inputCol(html`
<sl-input
name="maxCrawlSizeGB"
@ -1263,19 +1260,34 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="suffix">${msg("GB")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(infoTextStrings["maxCrawlSizeGB"])}
${this.renderHelpTextCol(infoTextFor["maxCrawlSizeGB"])}
`;
}
private renderPageBehavior() {
const behaviorOverrideWarning = html`
<span slot="help-text" class="text-warning-600">
<sl-icon
name="exclamation-triangle"
class="align-[-.175em] text-sm"
></sl-icon>
${msg("May be overridden by custom behaviors.")}
</span>
`;
return html`
${this.renderSectionHeading(labelFor.behaviors)}
${inputCol(
html`<sl-checkbox
name="autoscrollBehavior"
class="part-[form-control-help-text]:mt-1.5"
?checked=${this.formState.autoscrollBehavior}
>
${labelFor.autoscrollBehavior}
${when(
this.formState.autoscrollBehavior && this.formState.customBehavior,
() => behaviorOverrideWarning,
)}
</sl-checkbox>`,
)}
${this.renderHelpTextCol(
@ -1285,9 +1297,14 @@ https://archiveweb.page/images/${"logo.svg"}`}
${inputCol(
html`<sl-checkbox
name="autoclickBehavior"
class="part-[form-control-help-text]:mt-1.5"
?checked=${this.formState.autoclickBehavior}
>
${labelFor.autoclickBehavior}
${when(
this.formState.autoclickBehavior && this.formState.customBehavior,
() => behaviorOverrideWarning,
)}
</sl-checkbox>
${when(
@ -1360,7 +1377,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="suffix">${msg("seconds")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(infoTextStrings["pageLoadTimeoutSeconds"])}
${this.renderHelpTextCol(infoTextFor["pageLoadTimeoutSeconds"])}
${inputCol(html`
<sl-input
name="postLoadDelaySeconds"
@ -1374,7 +1391,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="suffix">${msg("seconds")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(infoTextStrings["postLoadDelaySeconds"])}
${this.renderHelpTextCol(infoTextFor["postLoadDelaySeconds"])}
${inputCol(html`
<sl-input
name="behaviorTimeoutSeconds"
@ -1389,7 +1406,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="suffix">${msg("seconds")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(infoTextStrings["behaviorTimeoutSeconds"])}
${this.renderHelpTextCol(infoTextFor["behaviorTimeoutSeconds"])}
${inputCol(html`
<sl-input
name="pageExtraDelaySeconds"
@ -1403,7 +1420,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="suffix">${msg("seconds")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(infoTextStrings["pageExtraDelaySeconds"])}
${this.renderHelpTextCol(infoTextFor["pageExtraDelaySeconds"])}
`;
}
@ -1433,12 +1450,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
`,
)} `,
)}
${this.renderHelpTextCol(
msg(
`Enable custom page actions with behavior scripts. You can specify any publicly accessible URL or public Git repository.`,
),
false,
)}
${this.renderHelpTextCol(infoTextFor.customBehavior, false)}
`;
}
@ -1454,7 +1466,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
})}
></btrix-select-browser-profile>
`)}
${this.renderHelpTextCol(infoTextStrings["browserProfile"])}
${this.renderHelpTextCol(infoTextFor["browserProfile"])}
${this.proxies?.servers.length
? [
inputCol(html`
@ -1470,7 +1482,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
})}
></btrix-select-crawler-proxy>
`),
this.renderHelpTextCol(infoTextStrings["proxyId"]),
this.renderHelpTextCol(infoTextFor["proxyId"]),
]
: nothing}
${inputCol(html`
@ -1517,14 +1529,14 @@ https://archiveweb.page/images/${"logo.svg"}`}
></btrix-select-crawler>
`)}
${this.showCrawlerChannels
? this.renderHelpTextCol(infoTextStrings["crawlerChannel"])
? this.renderHelpTextCol(infoTextFor["crawlerChannel"])
: html``}
${inputCol(html`
<sl-checkbox name="blockAds" ?checked=${this.formState.blockAds}>
${msg("Block ads by domain")}
</sl-checkbox>
`)}
${this.renderHelpTextCol(infoTextStrings["blockAds"], false)}
${this.renderHelpTextCol(infoTextFor["blockAds"], false)}
${inputCol(html`
<sl-input
name="userAgent"
@ -1535,7 +1547,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
>
</sl-input>
`)}
${this.renderHelpTextCol(infoTextStrings["userAgent"])}
${this.renderHelpTextCol(infoTextFor["userAgent"])}
${inputCol(html`
<btrix-language-select
.value=${this.formState.lang as LanguageCode}
@ -1548,7 +1560,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
<span slot="label">${msg("Language")}</span>
</btrix-language-select>
`)}
${this.renderHelpTextCol(infoTextStrings["lang"])}
${this.renderHelpTextCol(infoTextFor["lang"])}
`;
}
@ -2191,10 +2203,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
// TODO Move away from manual validation check
// See https://github.com/webrecorder/browsertrix/issues/2536
if (
this.formState.autoclickBehavior &&
this.clickSelector
) {
if (this.formState.autoclickBehavior && this.clickSelector) {
if (!this.clickSelector.checkValidity()) {
this.clickSelector.reportValidity();
return;
@ -2434,7 +2443,9 @@ https://archiveweb.page/images/${"logo.svg"}`}
selectLinks: this.linkSelectorTable?.value.length
? this.linkSelectorTable.value
: DEFAULT_SELECT_LINKS,
customBehaviors: this.customBehaviorsTable?.value || [],
customBehaviors:
(this.formState.customBehavior && this.customBehaviorsTable?.value) ||
[],
clickSelector:
this.formState.clickSelector || DEFAULT_AUTOCLICK_SELECTOR,
},

View File

@ -12,11 +12,14 @@ import { BtrixElement } from "@/classes/BtrixElement";
import type { LanguageSelect } from "@/components/ui/language-select";
import type { SelectCrawlerProxy } from "@/components/ui/select-crawler-proxy";
import { proxiesContext, type ProxiesContext } from "@/context/org";
import type { CustomBehaviorsTable } from "@/features/crawl-workflows/custom-behaviors-table";
import type { QueueExclusionTable } from "@/features/crawl-workflows/queue-exclusion-table";
import { columns, type Cols } from "@/layouts/columns";
import infoTextStrings from "@/strings/crawl-workflows/infoText";
import { infoTextFor } from "@/strings/crawl-workflows/infoText";
import { labelFor } from "@/strings/crawl-workflows/labels";
import sectionStrings from "@/strings/crawl-workflows/section";
import { crawlingDefaultsSchema, type CrawlingDefaults } from "@/types/org";
import { formValidator } from "@/utils/form";
import {
appDefaults,
BYTES_PER_GB,
@ -32,14 +35,10 @@ type Field = Record<FieldName, TemplateResult<1> | undefined>;
const PLACEHOLDER_EXCLUSIONS = [""]; // Add empty slot
function section(section: SectionsEnum | "exclusions", cols: Cols) {
function section(section: SectionsEnum, cols: Cols) {
return html`
<section class="p-5">
<btrix-section-heading
>${section === "exclusions"
? msg("Exclusions")
: sectionStrings[section]}</btrix-section-heading
>
<btrix-section-heading>${sectionStrings[section]}</btrix-section-heading>
${columns(cols)}
</section>
`;
@ -63,6 +62,9 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
@query("btrix-queue-exclusion-table")
exclusionTable?: QueueExclusionTable | null;
@query("btrix-custom-behaviors-table")
customBehaviorsTable?: CustomBehaviorsTable | null;
@query("btrix-language-select")
languageSelect?: LanguageSelect | null;
@ -72,6 +74,8 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
@query('sl-button[type="submit"]')
submitButton?: SlButton | null;
private readonly checkFormValidity = formValidator(this);
connectedCallback() {
super.connectedCallback();
@ -140,6 +144,13 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
`,
};
const behaviors = {
customBehavior: html`
<label class="form-label text-xs">${labelFor.customBehaviors}</label>
<btrix-custom-behaviors-table
.customBehaviors=${orgDefaults.customBehaviors || []}
editable
></btrix-custom-behaviors-table>
`,
pageLoadTimeoutSeconds: html`
<sl-input
size="small"
@ -258,7 +269,7 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
limits,
behaviors,
browserSettings,
};
} as const;
}
private renderWorkflowDefaults() {
@ -270,10 +281,11 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
section(
sectionName as SectionsEnum,
Object.entries(fields)
.filter(([, field]) => field as unknown)
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
.filter(([, field]) => field)
.map(([fieldName, field]) => [
field,
infoTextStrings[fieldName as FieldName],
infoTextFor[fieldName as keyof typeof infoTextFor],
]),
),
),
@ -292,6 +304,31 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
e.preventDefault();
const form = e.target as HTMLFormElement;
// Wait for custom behaviors validation to finish
// TODO Move away from manual validation check
// See https://github.com/webrecorder/browsertrix/issues/2536
if (this.customBehaviorsTable) {
if (!this.customBehaviorsTable.checkValidity()) {
this.customBehaviorsTable.reportValidity();
return;
}
try {
await this.customBehaviorsTable.taskComplete;
} catch {
this.customBehaviorsTable.reportValidity();
return;
}
}
const isValid = await this.checkFormValidity(form);
if (!isValid) {
form.reportValidity();
return;
}
const values = serialize(form) as Record<string, string>;
const parseNumber = (value: string) => (value ? Number(value) : undefined);
const parsedValues: CrawlingDefaults = {
@ -312,6 +349,7 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
userAgent: values.userAgent,
lang: this.languageSelect?.value || undefined,
exclude: this.exclusionTable?.exclusions?.filter((v) => v) || [],
customBehaviors: this.customBehaviorsTable?.value || [],
};
// Set null or empty strings to undefined

View File

@ -160,6 +160,7 @@ export class WorkflowsNew extends LiteElement {
userAgent: org.crawlingDefaults?.userAgent,
blockAds: org.crawlingDefaults?.blockAds,
lang: org.crawlingDefaults?.lang,
customBehaviors: org.crawlingDefaults?.customBehaviors,
},
crawlTimeout: org.crawlingDefaults?.crawlTimeout,
maxCrawlSize: org.crawlingDefaults?.maxCrawlSize,

View File

@ -5,7 +5,7 @@ import { type FormState } from "@/utils/workflow";
type Field = keyof FormState;
const infoText: Partial<Record<Field, string | TemplateResult>> = {
export const infoTextFor = {
exclusions: msg(
"Specify exclusion rules for what pages should not be visited.",
),
@ -72,6 +72,9 @@ const infoText: Partial<Record<Field, string | TemplateResult>> = {
>
to find URLs that are defined in custom HTML attributes.`,
),
};
customBehavior: msg(
`Enable custom page actions with behavior scripts. You can specify any publicly accessible URL or public Git repository.`,
),
} as const satisfies Partial<Record<Field, string | TemplateResult>>;
export default infoText;
export default infoTextFor;

View File

@ -47,6 +47,7 @@ export const crawlingDefaultsSchema = z.object({
lang: z.string().optional(),
userAgent: z.string().optional(),
exclude: z.array(z.string()),
customBehaviors: z.array(z.string()),
});
export type CrawlingDefaults = z.infer<typeof crawlingDefaultsSchema>;

View File

@ -244,6 +244,10 @@ export function getInitialFormState(params: {
return fallback;
};
const enableCustomBehaviors = Boolean(
params.initialWorkflow.config.customBehaviors.length,
);
return {
...defaultFormState,
primarySeedUrl: defaultFormState.primarySeedUrl,
@ -294,13 +298,15 @@ export function getInitialFormState(params: {
params.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
autoscrollBehavior: params.initialWorkflow.config.behaviors
? params.initialWorkflow.config.behaviors.includes(Behavior.AutoScroll)
: defaultFormState.autoscrollBehavior,
: enableCustomBehaviors
? false
: defaultFormState.autoscrollBehavior,
autoclickBehavior: params.initialWorkflow.config.behaviors
? params.initialWorkflow.config.behaviors.includes(Behavior.AutoClick)
: defaultFormState.autoclickBehavior,
customBehavior: Boolean(
params.initialWorkflow.config.customBehaviors.length,
),
: enableCustomBehaviors
? false
: defaultFormState.autoclickBehavior,
customBehavior: enableCustomBehaviors,
selectLinks: params.initialWorkflow.config.selectLinks,
clickSelector: params.initialWorkflow.config.clickSelector,
userAgent: