Custom Page Prefix Usability Fixes (#2736)

- Automatically compute prefix from starting URL, if no other prefix is
set in custom prefix mode.
- Ensure each prefix actually matches as a prefix: prepend '^' to each
custom prefix URL, since each include URL is treated as a regex
- Rename 'Extra URL Prefixes' to just 'URL Prefixes' and adjust help
text to indicate that only URLs matching the prefix list are in scope
- fixes #2735, follow up to #2722

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Co-authored-by: sua yoo <sua@webrecorder.org>
This commit is contained in:
Ilya Kreymer 2025-07-15 13:19:20 -07:00 committed by GitHub
parent b0f2d87ce2
commit 5d2b34f3b6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 31 additions and 8 deletions

View File

@ -60,7 +60,7 @@ _Site Crawl_
: This scope will crawl all pages on the domain and any subdomains found. If `example.com` is set as the _Crawl Start URL_, both pages on `example.com` and `subdomain.example.com` will be crawled.
`Custom Page Prefix`
: This scope will crawl all pages that begin with the _Crawl Start URL_ as well as pages from any URL that begin with the URLs listed in `Extra URL Prefixes in Scope`
: This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in _URL Prefixes in Scope_. By default, _URL Prefixes in Scope_ will be prefilled with the prefix of the _Crawl Start URL_ up to and including the last `/`. For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to _URL Prefixes in Scope_. This prefix can then be removed or modified as needed.
### Page URL(s)

View File

@ -1034,12 +1034,12 @@ https://archiveweb.page/guide`}
break;
case ScopeType.Custom:
helpText = msg(
html`Will crawl all page URLs that begin with
html`Will start with
<span class="break-word text-blue-500"
>${exampleDomain}${examplePathname}</span
>
or any URL that begins with those specified in
<em>Extra URL Prefixes in Scope</em>`,
and include <em>only</em> URLs that start with the
<em>URL Prefixes in Scope</em> listed below.`,
);
break;
default:
@ -1092,6 +1092,30 @@ https://archiveweb.page/guide`}
true,
);
}
// When the Custom scope is selected and no URL prefixes have been entered
// yet, prefill the prefix list with a prefix derived from the primary seed URL.
if (
this.formState.primarySeedUrl &&
this.formState.scopeType === ScopeType.Custom &&
!this.formState.customIncludeUrlList
) {
// Fallback: use the seed URL unchanged if it cannot be parsed below.
let prefixUrl = this.formState.primarySeedUrl;
try {
const startingUrl = new URL(this.formState.primarySeedUrl);
// Keep the origin plus the path up to and including its last "/"
// (query string and hash are dropped), e.g.
// https://example.com/path/page -> https://example.com/path/
prefixUrl =
startingUrl.origin +
startingUrl.pathname.slice(
0,
startingUrl.pathname.lastIndexOf("/") + 1,
);
} catch (e) {
// Seed URL is not parseable by `new URL`; deliberately ignore the error
// and keep the unmodified seed URL as the prefix.
}
this.updateFormState(
{
customIncludeUrlList: prefixUrl,
},
true,
);
}
}}
>
<div slot="help-text">${helpText}</div>
@ -1104,7 +1128,7 @@ https://archiveweb.page/guide`}
${inputCol(html`
<sl-textarea
name="customIncludeUrlList"
label=${msg("Extra URL Prefixes in Scope")}
label=${msg("URL Prefixes in Scope")}
rows="3"
autocomplete="off"
inputmode="url"
@ -1115,8 +1139,7 @@ https://example.net`}
></sl-textarea>
`)}
${this.renderHelpTextCol(
msg(`If the crawler finds pages outside of the Crawl Scope they
will only be saved if they begin with URLs listed here.`),
msg(`Only crawl pages that begin with URLs listed here.`),
)}
`,
)}
@ -2643,7 +2666,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
scopeType: this.formState.scopeType as ScopeType,
include:
this.formState.scopeType === ScopeType.Custom
? [...includeUrlList.map((url) => regexEscape(url))]
? [...includeUrlList.map((url) => "^" + regexEscape(url))]
: [],
extraHops: this.formState.includeLinkedPages ? 1 : 0,
};