diff --git a/frontend/src/components/config-details.ts b/frontend/src/components/config-details.ts index faea67f3..1b14bd37 100644 --- a/frontend/src/components/config-details.ts +++ b/frontend/src/components/config-details.ts @@ -304,6 +304,18 @@ export class ConfigDetails extends LiteElement { : msg("None"), true )} + ${when( + ["host", "domain", "custom", "any"].includes( + primarySeedConfig.scopeType || seedsConfig.scopeType + ), + () => + this.renderSetting( + msg("Max Depth"), + primarySeedConfig.depth != null /* only null/undefined mean unset; 0 is a real depth */ + ? msg(str`${primarySeedConfig.depth} hop(s)`) + : msg("None") + ) + )} ${this.renderSetting( msg("Include Any Linked Page (“one hop out”)"), Boolean(primarySeedConfig.extraHops ?? seedsConfig.extraHops) diff --git a/frontend/src/pages/org/workflow-editor.ts b/frontend/src/pages/org/workflow-editor.ts index 99924e58..79f54c54 100644 --- a/frontend/src/pages/org/workflow-editor.ts +++ b/frontend/src/pages/org/workflow-editor.ts @@ -79,6 +79,7 @@ type FormState = { behaviorTimeoutSeconds: number | null; pageLoadTimeoutSeconds: number | null; pageExtraDelaySeconds: number | null; + maxScopeDepth: number | null; scopeType: WorkflowParams["config"]["scopeType"]; exclusions: WorkflowParams["config"]["exclude"]; pageLimit: WorkflowParams["config"]["limit"]; @@ -148,6 +149,7 @@ const getDefaultFormState = (): FormState => ({ behaviorTimeoutSeconds: null, pageLoadTimeoutSeconds: null, pageExtraDelaySeconds: null, + maxScopeDepth: null, scopeType: "host", exclusions: [], pageLimit: null, @@ -485,6 +487,7 @@ export class CrawlConfigEditor extends LiteElement { seedsConfig.pageLoadTimeout ?? defaultFormState.pageLoadTimeoutSeconds, pageExtraDelaySeconds: seedsConfig.pageExtraDelay ?? defaultFormState.pageExtraDelaySeconds, + maxScopeDepth: primarySeedConfig.depth ?? 
defaultFormState.maxScopeDepth, scale: this.initialWorkflow.scale, blockAds: this.initialWorkflow.config.blockAds, lang: this.initialWorkflow.config.lang, @@ -1044,6 +1047,29 @@ https://example.com/path`} ${this.renderHelpTextCol( msg(`Tells the crawler which pages it can visit.`) )} + ${when( + ["host", "domain", "custom", "any"].includes(this.formState.scopeType), + () => html` + ${this.renderFormCol(html` + + ${msg("hops")} + + `)} + ${this.renderHelpTextCol( + msg( + `Limits how many hops away the crawler can visit while staying within the Start URL Scope.` + ) + )} + ` + )} ${when( this.formState.scopeType === "custom", () => html` @@ -1076,7 +1102,7 @@ https://example.net`} `)} ${this.renderHelpTextCol( msg(`If checked, the crawler will visit pages one link away outside of - Crawl Scope.`), + Start URL Scope.`), false )}
@@ -2047,6 +2073,7 @@ https://archiveweb.page/images/${"logo.svg"}`} behaviorTimeout: this.formState.behaviorTimeoutSeconds, pageLoadTimeout: this.formState.pageLoadTimeoutSeconds, pageExtraDelay: this.formState.pageExtraDelaySeconds, + limit: this.formState.pageLimit, lang: this.formState.lang || "", blockAds: this.formState.blockAds, @@ -2103,6 +2130,13 @@ https://archiveweb.page/images/${"logo.svg"}`} : [], extraHops: this.formState.includeLinkedPages ? 1 : 0, }; + + if ( + ["host", "domain", "custom", "any"].includes(this.formState.scopeType) + ) { + primarySeed.depth = this.formState.maxScopeDepth; + } + const config = { seeds: [primarySeed, ...additionalSeedUrlList], scopeType: additionalSeedUrlList.length diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts index e8c0003e..9e935bc1 100644 --- a/frontend/src/types/crawler.ts +++ b/frontend/src/types/crawler.ts @@ -14,6 +14,7 @@ export type Seed = { exclude?: string[] | null; limit?: number | null; extraHops?: number | null; + depth?: number | null; }; export type SeedConfig = Pick< @@ -28,6 +29,7 @@ export type SeedConfig = Pick< pageExtraDelay: number | null; behaviors?: string | null; extraHops?: number | null; + depth?: number | null; }; export type JobType = "url-list" | "seed-crawl" | "custom";