feat: Add behaviors section to workflow form (#2464)
- Moves "Per-Page Limits" fields to new "Page Behavior" section - Fixes workflow settings closing tags with refactor to how sections are rendered - Updates user guide with behaviors documentation --------- Co-authored-by: Henry Wilkinson <henry@wilkinson.graphics>
This commit is contained in:
parent
a42d83c9f6
commit
ac1236f15b
@ -12,7 +12,7 @@ The status of an archived item depends on its type. Uploads will always have the
|
||||
|
||||
| Status | Description |
|
||||
| ---- | ---- |
|
||||
| <span class="status-success">:bootstrap-check-circle-fill: Complete</span> | The crawl completed according to the workflow's settings. Workflows with [limits](workflow-setup.md#limits) set may stop running before they capture every queued page, but the resulting archived item will still be marked as "Complete". |
|
||||
| <span class="status-success">:bootstrap-check-circle-fill: Complete</span> | The crawl completed according to the workflow's settings. Workflows with [crawl limits](workflow-setup.md#crawl-limits) set may stop running before they capture every queued page, but the resulting archived item will still be marked as "Complete". |
|
||||
| <span class="status-neutral">:bootstrap-dash-square-fill: Stopped</span> | The crawl workflow was _stopped_ gracefully by a user and data is saved. |
|
||||
| <span class="status-neutral">:bootstrap-exclamation-square-fill: Stopped: Reason</span> | A workflow limit (listed as the reason) was reached and data is saved. |
|
||||
| <span class="status-warning">:bootstrap-x-octagon-fill: Canceled</span> | The crawl workflow was _canceled_ by a user; no data is saved. |
|
||||
|
||||
@ -129,7 +129,7 @@ This can be useful for avoiding crawler traps — sites that may automatically g
|
||||
|
||||
For example, if `#!regex \babout\/?\b` is entered, `example.com/about/` will not be crawled; however, `example.com/aboutme/` will be crawled.
|
||||
|
||||
## Limits
|
||||
## Crawl Limits
|
||||
|
||||
Enforce maximum limits on your crawl.
|
||||
|
||||
@ -145,7 +145,36 @@ The crawl will be gracefully stopped after this set period of elapsed time.
|
||||
|
||||
The crawl will be gracefully stopped after reaching this set size in GB.
|
||||
|
||||
### Page Load Timeout
|
||||
## Page Behavior
|
||||
|
||||
Customize how and when the browser performs specific operations on a page.
|
||||
|
||||
**Built-in Behaviors**
|
||||
|
||||
Behaviors are browser operations that can be enabled for additional page interactivity.
|
||||
|
||||
### Autoscroll
|
||||
|
||||
When enabled, the browser will automatically scroll to the end of the page.
|
||||
|
||||
### Autoclick
|
||||
|
||||
When enabled, the browser will automatically click on all link-like elements.
|
||||
|
||||
When clicking a link-like element that would normally result in navigation, autoclick will only record the click and prevent navigation away from the current page.
|
||||
|
||||
??? Info "Autoclick use cases"
|
||||
This behavior can be helpful for:
|
||||
|
||||
- Websites that use anchor links (`<a>`) in non-standard ways, such as by using JavaScript in place of the standard `href` attribute to create a hyperlink.
|
||||
|
||||
- Websites that use `<a>` in place of a `<button>` to reveal in-page content.
|
||||
|
||||
**Page Timing**
|
||||
|
||||
Page timing gives you more granular control over how long the browser should stay on a page and when behaviors should run on a page. Add limits to decrease the amount of time the browser spends on a page, and add delays to increase the amount of time the browser waits on a page. Adding delays will increase the total amount of time spent on a crawl and may impact your overall crawl minutes.
|
||||
|
||||
### Page Load Limit
|
||||
|
||||
Limits the amount of elapsed time to wait for a page to load. Behaviors will run after this timeout only if the page is partially or fully loaded.
|
||||
|
||||
@ -153,23 +182,13 @@ Limits amount of elapsed time to wait for a page to load. Behaviors will run aft
|
||||
|
||||
Waits on the page after initial HTML page load for a set number of seconds prior to moving on to next steps such as link extraction and behaviors. Can be useful with pages that are slow to load page contents.
|
||||
|
||||
### Behavior Timeout
|
||||
### Behavior Limit
|
||||
|
||||
Limits the amount of elapsed time that behaviors have to complete.
|
||||
|
||||
### Autoscroll Behavior
|
||||
|
||||
When enabled, the browser will automatically scroll to the end of the page.
|
||||
|
||||
### Autoclick Behavior
|
||||
|
||||
When enabled, the browser will automatically click on all links, even if they're empty or don't navigate to another page.
|
||||
|
||||
This can be helpful for web applications that use JavaScript to handle navigation and don't link to things properly with `href=""` attributes.
|
||||
|
||||
### Delay Before Next Page
|
||||
|
||||
Waits on the page for a set period of elapsed time after any behaviors have finished running. This can be helpful to avoid rate limiting however it will slow down your crawl.
|
||||
Waits on the page for a set number of seconds before unloading the current page. If any [behaviors](#autoscroll) are enabled, this delay will take place after all behaviors have finished running. This can be helpful to avoid rate limiting.
|
||||
|
||||
## Browser Settings
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
import { localized, msg, str } from "@lit/localize";
|
||||
import ISO6391 from "iso-639-1";
|
||||
import { html, nothing } from "lit";
|
||||
import { html, nothing, type TemplateResult } from "lit";
|
||||
import { customElement, property, state } from "lit/decorators.js";
|
||||
import { when } from "lit/directives/when.js";
|
||||
import { html as staticHtml, unsafeStatic } from "lit/static-html.js";
|
||||
@ -9,6 +9,7 @@ import RegexColorize from "regex-colorize";
|
||||
|
||||
import { BtrixElement } from "@/classes/BtrixElement";
|
||||
import type { CrawlConfig, Seed, SeedConfig } from "@/pages/org/types";
|
||||
import { labelFor } from "@/strings/crawl-workflows/labels";
|
||||
import scopeTypeLabel from "@/strings/crawl-workflows/scopeType";
|
||||
import sectionStrings from "@/strings/crawl-workflows/section";
|
||||
import type { Collection } from "@/types/collection";
|
||||
@ -61,7 +62,6 @@ export class ConfigDetails extends BtrixElement {
|
||||
|
||||
render() {
|
||||
const crawlConfig = this.crawlConfig;
|
||||
const seedsConfig = crawlConfig?.config;
|
||||
const renderTimeLimit = (
|
||||
valueSeconds?: number | null,
|
||||
fallbackValue?: number,
|
||||
@ -99,12 +99,11 @@ export class ConfigDetails extends BtrixElement {
|
||||
};
|
||||
|
||||
return html`
|
||||
<section id="crawler-settings" class="mb-8">
|
||||
<btrix-section-heading style="--margin: var(--sl-spacing-medium)">
|
||||
<h4>${sectionStrings.scope}</h4>
|
||||
</btrix-section-heading>
|
||||
<btrix-desc-list>
|
||||
${when(
|
||||
${this.renderSection({
|
||||
id: "crawler-settings",
|
||||
heading: sectionStrings.scope,
|
||||
renderDescItems: (seedsConfig) =>
|
||||
when(
|
||||
seedsConfig,
|
||||
(config) => html`
|
||||
${this.renderSetting(
|
||||
@ -121,10 +120,12 @@ export class ConfigDetails extends BtrixElement {
|
||||
? this.renderConfirmUrlListSettings(config)
|
||||
: this.renderConfirmSeededSettings(config)}
|
||||
`,
|
||||
)}
|
||||
<btrix-section-heading style="--margin: var(--sl-spacing-medium)">
|
||||
<h4>${sectionStrings.perCrawlLimits}</h4>
|
||||
</btrix-section-heading>
|
||||
),
|
||||
})}
|
||||
${this.renderSection({
|
||||
id: "crawl-limits",
|
||||
heading: sectionStrings.limits,
|
||||
renderDescItems: (seedsConfig) => html`
|
||||
${this.renderSetting(
|
||||
msg("Max Pages"),
|
||||
when(seedsConfig && this.seeds, (seeds) => {
|
||||
@ -148,62 +149,64 @@ export class ConfigDetails extends BtrixElement {
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Crawl Time Limit"),
|
||||
renderTimeLimit(crawlConfig?.crawlTimeout, Infinity),
|
||||
renderTimeLimit(this.crawlConfig?.crawlTimeout, Infinity),
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Crawl Size Limit"),
|
||||
renderSize(crawlConfig?.maxCrawlSize),
|
||||
renderSize(this.crawlConfig?.maxCrawlSize),
|
||||
)}
|
||||
<btrix-section-heading style="--margin: var(--sl-spacing-medium)">
|
||||
<h4>${sectionStrings.perPageLimits}</h4>
|
||||
</btrix-section-heading>
|
||||
`,
|
||||
})}
|
||||
${this.renderSection({
|
||||
id: "browser-behaviors",
|
||||
heading: sectionStrings.behaviors,
|
||||
renderDescItems: (seedsConfig) => html`
|
||||
${this.renderSetting(
|
||||
msg("Page Load Timeout"),
|
||||
renderTimeLimit(
|
||||
crawlConfig?.config.pageLoadTimeout,
|
||||
this.orgDefaults?.pageLoadTimeoutSeconds ?? Infinity,
|
||||
),
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Delay After Page Load"),
|
||||
renderTimeLimit(crawlConfig?.config.postLoadDelay, 0),
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Behavior Timeout"),
|
||||
renderTimeLimit(
|
||||
crawlConfig?.config.behaviorTimeout,
|
||||
this.orgDefaults?.behaviorTimeoutSeconds ?? Infinity,
|
||||
),
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Autoscroll Behavior"),
|
||||
crawlConfig?.config.behaviors &&
|
||||
!crawlConfig.config.behaviors.includes("autoscroll")
|
||||
labelFor.autoscrollBehavior,
|
||||
seedsConfig?.behaviors &&
|
||||
!seedsConfig.behaviors.includes("autoscroll")
|
||||
? msg("Disabled")
|
||||
: html`<span class="text-neutral-400"
|
||||
>${msg("Enabled (default)")}</span
|
||||
>`,
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Autoclick Behavior"),
|
||||
crawlConfig?.config.behaviors &&
|
||||
crawlConfig.config.behaviors.includes("autoclick")
|
||||
labelFor.autoclickBehavior,
|
||||
seedsConfig?.behaviors &&
|
||||
seedsConfig.behaviors.includes("autoclick")
|
||||
? msg("Enabled")
|
||||
: html`<span class="text-neutral-400"
|
||||
>${msg("Disabled (default)")}</span
|
||||
>`,
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Delay Before Next Page"),
|
||||
renderTimeLimit(crawlConfig?.config.pageExtraDelay, 0),
|
||||
labelFor.pageLoadTimeoutSeconds,
|
||||
renderTimeLimit(
|
||||
seedsConfig?.pageLoadTimeout,
|
||||
this.orgDefaults?.pageLoadTimeoutSeconds ?? Infinity,
|
||||
),
|
||||
)}
|
||||
</btrix-desc-list>
|
||||
</section>
|
||||
<section id="browser-settings" class="mb-8">
|
||||
<btrix-section-heading style="--margin: var(--sl-spacing-medium)">
|
||||
<h4>${sectionStrings.browserSettings}</h4>
|
||||
</btrix-section-heading>
|
||||
<btrix-desc-list>
|
||||
${this.renderSetting(
  // This row displays the post-load delay, so it must carry the
  // "Delay After Page Load" label rather than repeating the page
  // load limit label used by the preceding row.
  labelFor.postLoadDelaySeconds,
  renderTimeLimit(seedsConfig?.postLoadDelay, 0),
)}
|
||||
${this.renderSetting(
|
||||
labelFor.behaviorTimeoutSeconds,
|
||||
renderTimeLimit(
|
||||
seedsConfig?.behaviorTimeout,
|
||||
this.orgDefaults?.behaviorTimeoutSeconds ?? Infinity,
|
||||
),
|
||||
)}
|
||||
${this.renderSetting(
|
||||
labelFor.pageExtraDelaySeconds,
|
||||
renderTimeLimit(seedsConfig?.pageExtraDelay, 0),
|
||||
)}
|
||||
`,
|
||||
})}
|
||||
${this.renderSection({
|
||||
id: "browser-settings",
|
||||
heading: sectionStrings.browserSettings,
|
||||
renderDescItems: (seedsConfig) => html`
|
||||
${this.renderSetting(
|
||||
msg("Browser Profile"),
|
||||
when(
|
||||
@ -238,32 +241,31 @@ export class ConfigDetails extends BtrixElement {
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Block Ads by Domain"),
|
||||
crawlConfig?.config.blockAds,
|
||||
seedsConfig?.blockAds,
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("User Agent"),
|
||||
crawlConfig?.config.userAgent
|
||||
? crawlConfig.config.userAgent
|
||||
seedsConfig?.userAgent
|
||||
? seedsConfig.userAgent
|
||||
: html`<span class="text-neutral-400"
|
||||
>${msg("Browser User Agent (default)")}</span
|
||||
>`,
|
||||
)}
|
||||
${crawlConfig?.config.lang
|
||||
${seedsConfig?.lang
|
||||
? this.renderSetting(
|
||||
msg("Language"),
|
||||
ISO6391.getName(crawlConfig.config.lang),
|
||||
ISO6391.getName(seedsConfig.lang),
|
||||
)
|
||||
: nothing}
|
||||
${crawlConfig?.proxyId
|
||||
? this.renderSetting(msg("Proxy"), capitalize(crawlConfig.proxyId))
|
||||
: nothing}
|
||||
</btrix-desc-list>
|
||||
</section>
|
||||
<section id="crawl-scheduling" class="mb-8">
|
||||
<btrix-section-heading style="--margin: var(--sl-spacing-medium)">
|
||||
<h4>${sectionStrings.scheduling}</h4>
|
||||
</btrix-section-heading>
|
||||
<btrix-desc-list>
|
||||
`,
|
||||
})}
|
||||
${this.renderSection({
|
||||
id: "crawl-scheduling",
|
||||
heading: sectionStrings.scheduling,
|
||||
renderDescItems: () => html`
|
||||
${this.renderSetting(
|
||||
msg("Crawl Schedule Type"),
|
||||
crawlConfig?.schedule
|
||||
@ -278,54 +280,72 @@ export class ConfigDetails extends BtrixElement {
|
||||
: undefined,
|
||||
),
|
||||
)}
|
||||
`,
|
||||
})}
|
||||
${when(!this.hideMetadata, () =>
|
||||
this.renderSection({
|
||||
id: "crawl-metadata",
|
||||
heading: sectionStrings.metadata,
|
||||
renderDescItems: () => html`
|
||||
${this.renderSetting(msg("Name"), crawlConfig?.name)}
|
||||
${this.renderSetting(
|
||||
msg("Description"),
|
||||
crawlConfig?.description
|
||||
? html`
|
||||
<p class="max-w-prose font-sans">
|
||||
${crawlConfig.description}
|
||||
</p>
|
||||
`
|
||||
: undefined,
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Tags"),
|
||||
crawlConfig?.tags.length
|
||||
? crawlConfig.tags.map(
|
||||
(tag) =>
|
||||
html`<btrix-tag class="mr-2 mt-1">${tag}</btrix-tag>`,
|
||||
)
|
||||
: [],
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Collections"),
|
||||
this.collections.length
|
||||
? this.collections.map(
|
||||
(coll) =>
|
||||
html`<sl-tag class="mr-2 mt-1" variant="neutral">
|
||||
${coll.name}
|
||||
<span class="font-monostyle pl-1 text-xs">
|
||||
(${this.localize.number(coll.crawlCount)}
|
||||
${pluralOf("items", coll.crawlCount)})
|
||||
</span>
|
||||
</sl-tag>`,
|
||||
)
|
||||
: undefined,
|
||||
)}
|
||||
`,
|
||||
}),
|
||||
)}
|
||||
`;
|
||||
}
|
||||
|
||||
private renderSection({
|
||||
id,
|
||||
heading,
|
||||
renderDescItems,
|
||||
}: {
|
||||
id: string;
|
||||
heading: string;
|
||||
renderDescItems: (seedsConfig?: CrawlConfig["config"]) => TemplateResult;
|
||||
}) {
|
||||
return html`
|
||||
<section id=${id} class="mb-8">
|
||||
<btrix-section-heading style="--margin: var(--sl-spacing-medium)">
|
||||
<h4>${heading}</h4>
|
||||
</btrix-section-heading>
|
||||
<btrix-desc-list>
|
||||
${renderDescItems(this.crawlConfig?.config)}
|
||||
</btrix-desc-list>
|
||||
</section>
|
||||
${this.hideMetadata
|
||||
? nothing
|
||||
: html`
|
||||
<section id="crawl-metadata" class="mb-8">
|
||||
<btrix-section-heading style="--margin: var(--sl-spacing-medium)">
|
||||
<h4>${msg("Metadata")}</h4>
|
||||
</btrix-section-heading>
|
||||
<btrix-desc-list>
|
||||
${this.renderSetting(msg("Name"), crawlConfig?.name)}
|
||||
${this.renderSetting(
|
||||
msg("Description"),
|
||||
crawlConfig?.description
|
||||
? html`
|
||||
<p class="max-w-prose font-sans">
|
||||
${crawlConfig.description}
|
||||
</p>
|
||||
`
|
||||
: undefined,
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Tags"),
|
||||
crawlConfig?.tags.length
|
||||
? crawlConfig.tags.map(
|
||||
(tag) =>
|
||||
html`<btrix-tag class="mr-2 mt-1">${tag}</btrix-tag>`,
|
||||
)
|
||||
: [],
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Collections"),
|
||||
this.collections.length
|
||||
? this.collections.map(
|
||||
(coll) =>
|
||||
html`<sl-tag class="mr-2 mt-1" variant="neutral">
|
||||
${coll.name}
|
||||
<span class="font-monostyle pl-1 text-xs">
|
||||
(${this.localize.number(coll.crawlCount)}
|
||||
${pluralOf("items", coll.crawlCount)})
|
||||
</span>
|
||||
</sl-tag>`,
|
||||
)
|
||||
: undefined,
|
||||
)}
|
||||
</btrix-desc-list>
|
||||
</section>
|
||||
`}
|
||||
`;
|
||||
}
|
||||
|
||||
|
||||
@ -57,6 +57,7 @@ import { infoCol, inputCol } from "@/layouts/columns";
|
||||
import { pageSectionsWithNav } from "@/layouts/pageSectionsWithNav";
|
||||
import { panel } from "@/layouts/panel";
|
||||
import infoTextStrings from "@/strings/crawl-workflows/infoText";
|
||||
import { labelFor } from "@/strings/crawl-workflows/labels";
|
||||
import scopeTypeLabels from "@/strings/crawl-workflows/scopeType";
|
||||
import sectionStrings from "@/strings/crawl-workflows/section";
|
||||
import { AnalyticsTrackEvent } from "@/trackEvents";
|
||||
@ -85,6 +86,7 @@ import {
|
||||
getDefaultFormState,
|
||||
getInitialFormState,
|
||||
getServerDefaults,
|
||||
SECTIONS,
|
||||
type FormState,
|
||||
type WorkflowDefaults,
|
||||
} from "@/utils/workflow";
|
||||
@ -96,13 +98,7 @@ type NewCrawlConfigParams = WorkflowParams & {
|
||||
};
|
||||
};
|
||||
|
||||
const STEPS = [
|
||||
"crawlSetup",
|
||||
"crawlLimits",
|
||||
"browserSettings",
|
||||
"crawlScheduling",
|
||||
"crawlMetadata",
|
||||
] as const;
|
||||
const STEPS = SECTIONS;
|
||||
type StepName = (typeof STEPS)[number];
|
||||
type TabState = {
|
||||
completed: boolean;
|
||||
@ -123,7 +119,7 @@ const formName = "newJobConfig" as const;
|
||||
const panelSuffix = "--panel" as const;
|
||||
|
||||
const getDefaultProgressState = (hasConfigId = false): ProgressState => {
|
||||
let activeTab: StepName = "crawlSetup";
|
||||
let activeTab: StepName = "scope";
|
||||
if (window.location.hash) {
|
||||
const hashValue = window.location.hash.slice(1);
|
||||
|
||||
@ -136,8 +132,12 @@ const getDefaultProgressState = (hasConfigId = false): ProgressState => {
|
||||
activeTab,
|
||||
// TODO Mark as completed only if form section has data
|
||||
tabs: {
|
||||
crawlSetup: { error: false, completed: hasConfigId },
|
||||
crawlLimits: {
|
||||
scope: { error: false, completed: hasConfigId },
|
||||
limits: {
|
||||
error: false,
|
||||
completed: hasConfigId,
|
||||
},
|
||||
behaviors: {
|
||||
error: false,
|
||||
completed: hasConfigId,
|
||||
},
|
||||
@ -145,11 +145,11 @@ const getDefaultProgressState = (hasConfigId = false): ProgressState => {
|
||||
error: false,
|
||||
completed: hasConfigId,
|
||||
},
|
||||
crawlScheduling: {
|
||||
scheduling: {
|
||||
error: false,
|
||||
completed: hasConfigId,
|
||||
},
|
||||
crawlMetadata: {
|
||||
metadata: {
|
||||
error: false,
|
||||
completed: hasConfigId,
|
||||
},
|
||||
@ -242,13 +242,7 @@ export class WorkflowEditor extends BtrixElement {
|
||||
private readonly validateNameMax = maxLengthValidator(50);
|
||||
private readonly validateDescriptionMax = maxLengthValidator(350);
|
||||
|
||||
private readonly tabLabels: Record<StepName, string> = {
|
||||
crawlSetup: sectionStrings.scope,
|
||||
crawlLimits: msg("Limits"),
|
||||
browserSettings: sectionStrings.browserSettings,
|
||||
crawlScheduling: sectionStrings.scheduling,
|
||||
crawlMetadata: msg("Metadata"),
|
||||
};
|
||||
private readonly tabLabels = sectionStrings;
|
||||
|
||||
private get formHasError() {
|
||||
return (
|
||||
@ -1086,28 +1080,8 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
urlListToArray(this.formState.urlList).length +
|
||||
(isPageScopeType(this.formState.scopeType) ? 0 : 1),
|
||||
);
|
||||
const onInputMinMax = async (e: CustomEvent) => {
|
||||
const inputEl = e.target as SlInput;
|
||||
await inputEl.updateComplete;
|
||||
let helpText = "";
|
||||
if (!inputEl.checkValidity()) {
|
||||
const value = +inputEl.value;
|
||||
const min = inputEl.min;
|
||||
const max = inputEl.max;
|
||||
if (min && value < +min) {
|
||||
helpText = msg(
|
||||
str`Must be more than minimum of ${this.localize.number(+min)}`,
|
||||
);
|
||||
} else if (max && value > +max) {
|
||||
helpText = msg(
|
||||
str`Must be less than maximum of ${this.localize.number(+max)}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
inputEl.helpText = helpText;
|
||||
};
|
||||
|
||||
return html`
|
||||
${this.renderSectionHeading(sectionStrings.perCrawlLimits)}
|
||||
${inputCol(html`
|
||||
<sl-mutation-observer
|
||||
attr="min"
|
||||
@ -1137,7 +1111,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
: undefined,
|
||||
)}
|
||||
placeholder=${defaultLabel(this.orgDefaults.maxPagesPerCrawl)}
|
||||
@sl-input=${onInputMinMax}
|
||||
@sl-input=${this.onInputMinMax}
|
||||
>
|
||||
<span slot="suffix">${msg("pages")}</span>
|
||||
</sl-input>
|
||||
@ -1172,17 +1146,49 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
</sl-input>
|
||||
`)}
|
||||
${this.renderHelpTextCol(infoTextStrings["maxCrawlSizeGB"])}
|
||||
${this.renderSectionHeading(sectionStrings.perPageLimits)}
|
||||
`;
|
||||
}
|
||||
|
||||
private renderBehaviors() {
|
||||
return html`
|
||||
${this.renderSectionHeading(msg("Built-in Behaviors"))}
|
||||
${inputCol(
|
||||
html`<sl-checkbox
|
||||
name="autoscrollBehavior"
|
||||
?checked=${this.formState.autoscrollBehavior}
|
||||
>
|
||||
${labelFor.autoscrollBehavior}
|
||||
</sl-checkbox>`,
|
||||
)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(`Automatically scroll to the end of the page.`),
|
||||
false,
|
||||
)}
|
||||
${inputCol(
|
||||
html`<sl-checkbox
|
||||
name="autoclickBehavior"
|
||||
?checked=${this.formState.autoclickBehavior}
|
||||
>
|
||||
${labelFor.autoclickBehavior}
|
||||
</sl-checkbox>`,
|
||||
)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(
|
||||
`Automatically click on all link-like elements. Useful for capturing in-page interactions or for clicking links without navigating away from the page.`,
|
||||
),
|
||||
false,
|
||||
)}
|
||||
${this.renderSectionHeading(msg("Page Timing"))}
|
||||
${inputCol(html`
|
||||
<sl-input
|
||||
name="pageLoadTimeoutSeconds"
|
||||
type="number"
|
||||
inputmode="numeric"
|
||||
label=${msg("Page Load Timeout")}
|
||||
label=${labelFor.pageLoadTimeoutSeconds}
|
||||
placeholder=${defaultLabel(this.orgDefaults.pageLoadTimeoutSeconds)}
|
||||
value=${ifDefined(this.formState.pageLoadTimeoutSeconds ?? undefined)}
|
||||
min="0"
|
||||
@sl-input=${onInputMinMax}
|
||||
@sl-input=${this.onInputMinMax}
|
||||
>
|
||||
<span slot="suffix">${msg("seconds")}</span>
|
||||
</sl-input>
|
||||
@ -1193,7 +1199,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
name="postLoadDelaySeconds"
|
||||
type="number"
|
||||
inputmode="numeric"
|
||||
label=${msg("Delay After Page Load")}
|
||||
label=${labelFor.postLoadDelaySeconds}
|
||||
placeholder=${defaultLabel(0)}
|
||||
value=${ifDefined(this.formState.postLoadDelaySeconds ?? undefined)}
|
||||
min="0"
|
||||
@ -1207,50 +1213,22 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
name="behaviorTimeoutSeconds"
|
||||
type="number"
|
||||
inputmode="numeric"
|
||||
label=${msg("Behavior Timeout")}
|
||||
label=${labelFor.behaviorTimeoutSeconds}
|
||||
placeholder=${defaultLabel(this.orgDefaults.behaviorTimeoutSeconds)}
|
||||
value=${ifDefined(this.formState.behaviorTimeoutSeconds ?? undefined)}
|
||||
min="0"
|
||||
@sl-input=${onInputMinMax}
|
||||
@sl-input=${this.onInputMinMax}
|
||||
>
|
||||
<span slot="suffix">${msg("seconds")}</span>
|
||||
</sl-input>
|
||||
`)}
|
||||
${this.renderHelpTextCol(infoTextStrings["behaviorTimeoutSeconds"])}
|
||||
${inputCol(
|
||||
html`<sl-checkbox
|
||||
name="autoscrollBehavior"
|
||||
?checked=${this.formState.autoscrollBehavior}
|
||||
>
|
||||
${msg("Autoscroll behavior")}
|
||||
</sl-checkbox>`,
|
||||
)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(
|
||||
`When enabled the browser will automatically scroll to the end of the page.`,
|
||||
),
|
||||
false,
|
||||
)}
|
||||
${inputCol(
|
||||
html`<sl-checkbox
|
||||
name="autoclickBehavior"
|
||||
?checked=${this.formState.autoclickBehavior}
|
||||
>
|
||||
${msg("Autoclick behavior")}
|
||||
</sl-checkbox>`,
|
||||
)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(
|
||||
`When enabled the browser will automatically click on links that don't navigate to other pages.`,
|
||||
),
|
||||
false,
|
||||
)}
|
||||
${inputCol(html`
|
||||
<sl-input
|
||||
name="pageExtraDelaySeconds"
|
||||
type="number"
|
||||
inputmode="numeric"
|
||||
label=${msg("Delay Before Next Page")}
|
||||
label=${labelFor.pageExtraDelaySeconds}
|
||||
placeholder=${defaultLabel(0)}
|
||||
value=${ifDefined(this.formState.pageExtraDelaySeconds ?? undefined)}
|
||||
min="0"
|
||||
@ -1262,7 +1240,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
`;
|
||||
}
|
||||
|
||||
private renderCrawlBehaviors() {
|
||||
private renderBrowserSettings() {
|
||||
if (!this.formState.lang) throw new Error("missing formstate.lang");
|
||||
return html`
|
||||
${inputCol(html`
|
||||
@ -1601,35 +1579,59 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
required?: boolean;
|
||||
}[] = [
|
||||
{
|
||||
name: "crawlSetup",
|
||||
name: "scope",
|
||||
desc: msg("Specify the range and depth of your crawl."),
|
||||
render: this.renderScope,
|
||||
required: true,
|
||||
},
|
||||
{
|
||||
name: "crawlLimits",
|
||||
desc: msg("Enforce maximum limits on your crawl."),
|
||||
name: "limits",
|
||||
desc: msg("Limit the size and duration of the crawl."),
|
||||
render: this.renderCrawlLimits,
|
||||
},
|
||||
{
|
||||
name: "browserSettings",
|
||||
desc: msg(
|
||||
"Configure the browser that's used to visit URLs during the crawl.",
|
||||
),
|
||||
render: this.renderCrawlBehaviors,
|
||||
name: "behaviors",
|
||||
desc: msg("Customize how the browser loads and interacts with a page."),
|
||||
render: this.renderBehaviors,
|
||||
},
|
||||
{
|
||||
name: "crawlScheduling",
|
||||
name: "browserSettings",
|
||||
desc: msg("Configure the browser used to crawl."),
|
||||
render: this.renderBrowserSettings,
|
||||
},
|
||||
{
|
||||
name: "scheduling",
|
||||
desc: msg("Schedule recurring crawls."),
|
||||
render: this.renderJobScheduling,
|
||||
},
|
||||
{
|
||||
name: "crawlMetadata",
|
||||
name: "metadata",
|
||||
desc: msg("Describe and organize crawls from this workflow."),
|
||||
render: this.renderJobMetadata,
|
||||
},
|
||||
];
|
||||
|
||||
/**
 * `sl-input` handler for numeric fields that declare `min` and/or `max`.
 *
 * Waits for the input to finish updating, then sets the input's help text
 * to a message describing which bound was violated. The help text is
 * cleared when the input passes validation or when neither bound explains
 * the failure.
 *
 * @param e - Event whose target is the `SlInput` being edited.
 */
private readonly onInputMinMax = async (e: CustomEvent) => {
  const field = e.target as SlInput;
  await field.updateComplete;

  // Resolve the message first so the help text is assigned exactly once.
  const describeViolation = (): string => {
    if (field.checkValidity()) {
      return "";
    }

    const entered = +field.value;
    const { min, max } = field;

    if (min && entered < +min) {
      return msg(
        str`Must be more than minimum of ${this.localize.number(+min)}`,
      );
    }
    if (max && entered > +max) {
      return msg(
        str`Must be less than maximum of ${this.localize.number(+max)}`,
      );
    }
    // Invalid for a reason other than the min/max bounds.
    return "";
  };

  field.helpText = describeViolation();
};
|
||||
|
||||
private changeScopeType(value: FormState["scopeType"]) {
|
||||
const prevScopeType = this.formState.scopeType;
|
||||
const formState: Partial<FormState> = {
|
||||
@ -1799,7 +1801,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
if (e.detail.valid === false || !table.checkValidity()) {
|
||||
this.updateProgressState({
|
||||
tabs: {
|
||||
crawlSetup: { error: true },
|
||||
scope: { error: true },
|
||||
},
|
||||
});
|
||||
} else {
|
||||
|
||||
@ -101,7 +101,7 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
|
||||
></btrix-queue-exclusion-table>
|
||||
`,
|
||||
};
|
||||
const perCrawlLimits = {
|
||||
const limits = {
|
||||
crawlTimeoutMinutes: html`
|
||||
<sl-input
|
||||
size="small"
|
||||
@ -139,7 +139,7 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
|
||||
</sl-input>
|
||||
`,
|
||||
};
|
||||
const perPageLimits = {
|
||||
const behaviors = {
|
||||
pageLoadTimeoutSeconds: html`
|
||||
<sl-input
|
||||
size="small"
|
||||
@ -255,8 +255,8 @@ export class OrgSettingsCrawlWorkflows extends BtrixElement {
|
||||
|
||||
return {
|
||||
scope,
|
||||
perCrawlLimits,
|
||||
perPageLimits,
|
||||
limits,
|
||||
behaviors,
|
||||
browserSettings,
|
||||
};
|
||||
}
|
||||
|
||||
10
frontend/src/strings/crawl-workflows/labels.ts
Normal file
10
frontend/src/strings/crawl-workflows/labels.ts
Normal file
@ -0,0 +1,10 @@
|
||||
import { msg } from "@lit/localize";
|
||||
|
||||
/**
 * Localized display labels for workflow "Page Behavior" settings, keyed by
 * the corresponding form field name.
 *
 * Shared between the workflow editor and the read-only config details view
 * so both show the same wording for each setting.
 */
export const labelFor = {
  autoscrollBehavior: msg("Autoscroll"),
  autoclickBehavior: msg("Autoclick"),
  pageLoadTimeoutSeconds: msg("Page Load Limit"),
  postLoadDelaySeconds: msg("Delay After Page Load"),
  // Wrapped in msg() like every other entry so this label is localized too.
  behaviorTimeoutSeconds: msg("Behavior Limit"),
  pageExtraDelaySeconds: msg("Delay Before Next Page"),
};
|
||||
@ -4,10 +4,11 @@ import { type SectionsEnum } from "@/utils/workflow";
|
||||
|
||||
const section: Record<SectionsEnum, string> = {
|
||||
scope: msg("Scope"),
|
||||
perCrawlLimits: msg("Per-Crawl Limits"),
|
||||
perPageLimits: msg("Per-Page Limits"),
|
||||
limits: msg("Crawl Limits"),
|
||||
behaviors: msg("Page Behavior"),
|
||||
browserSettings: msg("Browser Settings"),
|
||||
scheduling: msg("Scheduling"),
|
||||
metadata: msg("Metadata"),
|
||||
};
|
||||
|
||||
export default section;
|
||||
|
||||
@ -25,10 +25,11 @@ export const BYTES_PER_GB = 1e9;
|
||||
|
||||
export const SECTIONS = [
|
||||
"scope",
|
||||
"perCrawlLimits",
|
||||
"perPageLimits",
|
||||
"limits",
|
||||
"behaviors",
|
||||
"browserSettings",
|
||||
"scheduling",
|
||||
"metadata",
|
||||
] as const;
|
||||
export const sectionsEnum = z.enum(SECTIONS);
|
||||
export type SectionsEnum = z.infer<typeof sectionsEnum>;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user