Allow users to set max depth/hop out within scope (#816)

- Adds an input to the Workflow creation and edit form for specifying crawl depth. This input is conditionally shown for seeded crawls when the scope is set to "Pages on this domain", "Pages on this domain & subdomains", or "Custom page prefix". The input is also shown for the legacy "any" scope for backwards compatibility, although that scope is not offered by default or in new configs.
- API implementation: The depth value is set in the primary seed config (i.e. the first seed in the `seeds: []` array), not in the outer `.config.depth` property.
This commit is contained in:
sua yoo 2023-05-05 14:26:48 -07:00 committed by GitHub
parent 7409e0637e
commit 9fcbc3f87e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 49 additions and 1 deletions

View File

@ -304,6 +304,18 @@ export class ConfigDetails extends LiteElement {
: msg("None"),
true
)}
${when(
["host", "domain", "custom", "any"].includes(
primarySeedConfig.scopeType || seedsConfig.scopeType
),
() =>
this.renderSetting(
msg("Max Depth"),
primarySeedConfig.depth
? msg(str`${primarySeedConfig.depth} hop(s)`)
: msg("None")
)
)}
${this.renderSetting(
msg("Include Any Linked Page (“one hop out”)"),
Boolean(primarySeedConfig.extraHops ?? seedsConfig.extraHops)

View File

@ -79,6 +79,7 @@ type FormState = {
behaviorTimeoutSeconds: number | null;
pageLoadTimeoutSeconds: number | null;
pageExtraDelaySeconds: number | null;
maxScopeDepth: number | null;
scopeType: WorkflowParams["config"]["scopeType"];
exclusions: WorkflowParams["config"]["exclude"];
pageLimit: WorkflowParams["config"]["limit"];
@ -148,6 +149,7 @@ const getDefaultFormState = (): FormState => ({
behaviorTimeoutSeconds: null,
pageLoadTimeoutSeconds: null,
pageExtraDelaySeconds: null,
maxScopeDepth: null,
scopeType: "host",
exclusions: [],
pageLimit: null,
@ -485,6 +487,7 @@ export class CrawlConfigEditor extends LiteElement {
seedsConfig.pageLoadTimeout ?? defaultFormState.pageLoadTimeoutSeconds,
pageExtraDelaySeconds:
seedsConfig.pageExtraDelay ?? defaultFormState.pageExtraDelaySeconds,
maxScopeDepth: primarySeedConfig.depth ?? defaultFormState.maxScopeDepth,
scale: this.initialWorkflow.scale,
blockAds: this.initialWorkflow.config.blockAds,
lang: this.initialWorkflow.config.lang,
@ -1044,6 +1047,29 @@ https://example.com/path`}
${this.renderHelpTextCol(
msg(`Tells the crawler which pages it can visit.`)
)}
${when(
["host", "domain", "custom", "any"].includes(this.formState.scopeType),
() => html`
${this.renderFormCol(html`
<sl-input
name="maxScopeDepth"
label=${msg("Max Depth")}
value=${this.formState.maxScopeDepth}
placeholder=${msg("Default: Unlimited")}
min="0"
type="number"
inputmode="numeric"
>
<span slot="suffix">${msg("hops")}</span>
</sl-input>
`)}
${this.renderHelpTextCol(
msg(
`Limits how many hops away the crawler can visit while staying within the Start URL Scope.`
)
)}
`
)}
${when(
this.formState.scopeType === "custom",
() => html`
@ -1076,7 +1102,7 @@ https://example.net`}
`)}
${this.renderHelpTextCol(
msg(`If checked, the crawler will visit pages one link away outside of
Crawl Scope.`),
Start URL Scope.`),
false
)}
<div class="col-span-5">
@ -2047,6 +2073,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
behaviorTimeout: this.formState.behaviorTimeoutSeconds,
pageLoadTimeout: this.formState.pageLoadTimeoutSeconds,
pageExtraDelay: this.formState.pageExtraDelaySeconds,
limit: this.formState.pageLimit,
lang: this.formState.lang || "",
blockAds: this.formState.blockAds,
@ -2103,6 +2130,13 @@ https://archiveweb.page/images/${"logo.svg"}`}
: [],
extraHops: this.formState.includeLinkedPages ? 1 : 0,
};
if (
["host", "domain", "custom", "any"].includes(this.formState.scopeType)
) {
primarySeed.depth = this.formState.maxScopeDepth;
}
const config = {
seeds: [primarySeed, ...additionalSeedUrlList],
scopeType: additionalSeedUrlList.length

View File

@ -14,6 +14,7 @@ export type Seed = {
exclude?: string[] | null;
limit?: number | null;
extraHops?: number | null;
depth?: number | null;
};
export type SeedConfig = Pick<
@ -28,6 +29,7 @@ export type SeedConfig = Pick<
pageExtraDelay: number | null;
behaviors?: string | null;
extraHops?: number | null;
depth?: number | null;
};
export type JobType = "url-list" | "seed-crawl" | "custom";