Allow users to set max depth/hop out within scope (#816)
- Adds an input to the Workflow creation and edit form for specifying crawl depth. This input is conditionally shown for seeded crawls when the scope is set to "Pages on this domain", "Pages on this domain & subdomains" or "Custom page prefix". The "any" scope is also supported for backwards compatibility, but it is not shown by default or in new configs.
- API implementation: The depth value is set in the primary seed config, i.e. the first seed in `seeds: []`, not in the outer `.config.depth` property.
This commit is contained in:
parent
7409e0637e
commit
9fcbc3f87e
@ -304,6 +304,18 @@ export class ConfigDetails extends LiteElement {
|
||||
: msg("None"),
|
||||
true
|
||||
)}
|
||||
${when(
|
||||
["host", "domain", "custom", "any"].includes(
|
||||
primarySeedConfig.scopeType || seedsConfig.scopeType
|
||||
),
|
||||
() =>
|
||||
this.renderSetting(
|
||||
msg("Max Depth"),
|
||||
primarySeedConfig.depth
|
||||
? msg(str`${primarySeedConfig.depth} hop(s)`)
|
||||
: msg("None")
|
||||
)
|
||||
)}
|
||||
${this.renderSetting(
|
||||
msg("Include Any Linked Page (“one hop out”)"),
|
||||
Boolean(primarySeedConfig.extraHops ?? seedsConfig.extraHops)
|
||||
|
@ -79,6 +79,7 @@ type FormState = {
|
||||
behaviorTimeoutSeconds: number | null;
|
||||
pageLoadTimeoutSeconds: number | null;
|
||||
pageExtraDelaySeconds: number | null;
|
||||
maxScopeDepth: number | null;
|
||||
scopeType: WorkflowParams["config"]["scopeType"];
|
||||
exclusions: WorkflowParams["config"]["exclude"];
|
||||
pageLimit: WorkflowParams["config"]["limit"];
|
||||
@ -148,6 +149,7 @@ const getDefaultFormState = (): FormState => ({
|
||||
behaviorTimeoutSeconds: null,
|
||||
pageLoadTimeoutSeconds: null,
|
||||
pageExtraDelaySeconds: null,
|
||||
maxScopeDepth: null,
|
||||
scopeType: "host",
|
||||
exclusions: [],
|
||||
pageLimit: null,
|
||||
@ -485,6 +487,7 @@ export class CrawlConfigEditor extends LiteElement {
|
||||
seedsConfig.pageLoadTimeout ?? defaultFormState.pageLoadTimeoutSeconds,
|
||||
pageExtraDelaySeconds:
|
||||
seedsConfig.pageExtraDelay ?? defaultFormState.pageExtraDelaySeconds,
|
||||
maxScopeDepth: primarySeedConfig.depth ?? defaultFormState.maxScopeDepth,
|
||||
scale: this.initialWorkflow.scale,
|
||||
blockAds: this.initialWorkflow.config.blockAds,
|
||||
lang: this.initialWorkflow.config.lang,
|
||||
@ -1044,6 +1047,29 @@ https://example.com/path`}
|
||||
${this.renderHelpTextCol(
|
||||
msg(`Tells the crawler which pages it can visit.`)
|
||||
)}
|
||||
${when(
|
||||
["host", "domain", "custom", "any"].includes(this.formState.scopeType),
|
||||
() => html`
|
||||
${this.renderFormCol(html`
|
||||
<sl-input
|
||||
name="maxScopeDepth"
|
||||
label=${msg("Max Depth")}
|
||||
value=${this.formState.maxScopeDepth}
|
||||
placeholder=${msg("Default: Unlimited")}
|
||||
min="0"
|
||||
type="number"
|
||||
inputmode="numeric"
|
||||
>
|
||||
<span slot="suffix">${msg("hops")}</span>
|
||||
</sl-input>
|
||||
`)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(
|
||||
`Limits how many hops away the crawler can visit while staying within the Start URL Scope.`
|
||||
)
|
||||
)}
|
||||
`
|
||||
)}
|
||||
${when(
|
||||
this.formState.scopeType === "custom",
|
||||
() => html`
|
||||
@ -1076,7 +1102,7 @@ https://example.net`}
|
||||
`)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(`If checked, the crawler will visit pages one link away outside of
|
||||
Crawl Scope.`),
|
||||
Start URL Scope.`),
|
||||
false
|
||||
)}
|
||||
<div class="col-span-5">
|
||||
@ -2047,6 +2073,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
behaviorTimeout: this.formState.behaviorTimeoutSeconds,
|
||||
pageLoadTimeout: this.formState.pageLoadTimeoutSeconds,
|
||||
pageExtraDelay: this.formState.pageExtraDelaySeconds,
|
||||
|
||||
limit: this.formState.pageLimit,
|
||||
lang: this.formState.lang || "",
|
||||
blockAds: this.formState.blockAds,
|
||||
@ -2103,6 +2130,13 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
: [],
|
||||
extraHops: this.formState.includeLinkedPages ? 1 : 0,
|
||||
};
|
||||
|
||||
if (
|
||||
["host", "domain", "custom", "any"].includes(this.formState.scopeType)
|
||||
) {
|
||||
primarySeed.depth = this.formState.maxScopeDepth;
|
||||
}
|
||||
|
||||
const config = {
|
||||
seeds: [primarySeed, ...additionalSeedUrlList],
|
||||
scopeType: additionalSeedUrlList.length
|
||||
|
@ -14,6 +14,7 @@ export type Seed = {
|
||||
exclude?: string[] | null;
|
||||
limit?: number | null;
|
||||
extraHops?: number | null;
|
||||
depth?: number | null;
|
||||
};
|
||||
|
||||
export type SeedConfig = Pick<
|
||||
@ -28,6 +29,7 @@ export type SeedConfig = Pick<
|
||||
pageExtraDelay: number | null;
|
||||
behaviors?: string | null;
|
||||
extraHops?: number | null;
|
||||
depth?: number | null;
|
||||
};
|
||||
|
||||
/**
 * The job type of a workflow: one of the string literals
 * "url-list", "seed-crawl" or "custom".
 */
export type JobType =
  | "url-list"
  | "seed-crawl"
  | "custom";
|
||||
|
Loading…
Reference in New Issue
Block a user