Fix Extra URLs in Scope (#913)

* scope fix: when using 'Custom Page Prefix' scope (fixes #873)
- don't include primary seed URL in include list
- don't always add trailing slash to extra in-scope URLs
- set seed scope to 'prefix' (supported via webrecorder/browsertrix-crawler#318) instead of re-including seed URL
- add comments on using 'custom' to indicate 'Custom Prefix Scope' semantics on frontend, setting actual scope to 'prefix' on backend
- remove unneeded conditional for additional urls, main scopeType overridden per seed anyway
This commit is contained in:
Ilya Kreymer 2023-06-12 17:29:41 -07:00 committed by GitHub
parent 79703baa69
commit ec3404c798
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -103,6 +103,8 @@ type FormState = {
autoscrollBehavior: boolean;
};
// Scope types for which the primary seed's `depth` is populated from
// `formState.maxScopeDepth` when the crawl config is built. The same list
// gates an extra form column in the scope section of the editor
// (presumably the max-depth input — confirm against the template).
const DEPTH_SUPPORTED_SCOPES = ["prefix", "host", "domain", "custom", "any"];
const getDefaultProgressState = (hasConfigId = false): ProgressState => {
let activeTab: StepName = "crawlSetup";
if (window.location.hash) {
@ -422,13 +424,15 @@ export class CrawlConfigEditor extends LiteElement {
formState.primarySeedUrl = primarySeedConfig.url;
}
if (
primarySeedConfig.scopeType === "custom" &&
primarySeedConfig.include?.length
) {
formState.customIncludeUrlList = primarySeedConfig.include
// Unescape regex
.map((url) => url.replace(/(\\|\/\.\*)/g, ""))
.join("\n");
// if we have additional include URLs, set to "custom" scope here
// to indicate 'Custom Page Prefix' option
formState.scopeType = "custom";
}
const additionalSeeds = seeds.slice(1);
if (additionalSeeds.length) {
@ -1044,7 +1048,7 @@ https://example.com/path`}
msg(`Tells the crawler which pages it can visit.`)
)}
${when(
["host", "domain", "custom", "any"].includes(this.formState.scopeType),
DEPTH_SUPPORTED_SCOPES.includes(this.formState.scopeType),
() => html`
${this.renderFormCol(html`
<sl-input
@ -2110,28 +2114,25 @@ https://archiveweb.page/images/${"logo.svg"}`}
: [];
const primarySeed: Seed = {
url: primarySeedUrl,
scopeType: this.formState.scopeType,
// the 'custom' scope here indicates we have extra URLs, actually set to 'prefix'
// scope on backend to ensure seed URL is also added as part of standard prefix scope
scopeType: this.formState.scopeType === "custom" ? "prefix" : this.formState.scopeType,
include:
this.formState.scopeType === "custom"
? [
`${regexEscape(primarySeedUrl)}\/.*`,
...includeUrlList.map((url) => `${regexEscape(url)}\/.*`),
...includeUrlList.map((url) => regexEscape(url)),
]
: [],
extraHops: this.formState.includeLinkedPages ? 1 : 0,
};
if (
["host", "domain", "custom", "any"].includes(this.formState.scopeType)
) {
if (DEPTH_SUPPORTED_SCOPES.includes(this.formState.scopeType)) {
primarySeed.depth = this.formState.maxScopeDepth;
}
const config = {
seeds: [primarySeed, ...additionalSeedUrlList],
scopeType: additionalSeedUrlList.length
? "page"
: this.formState.scopeType,
scopeType: this.formState.scopeType,
};
return config;
}