Custom Page Prefix Usability Fixes (#2736)
- Automatically compute the prefix from the starting URL if no other prefix is set in custom prefix mode.
- Ensure each prefix is actually a prefix: add '^' to each custom prefix URL, since each include URL path is a regex.
- Rename 'Extra URL Prefixes' to just 'URL Prefixes' and adjust the help text to indicate that the prefix list is the list that is in scope.
- Fixes #2735, follow-up to #2722.
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Co-authored-by: sua yoo <sua@webrecorder.org>
This commit is contained in:
parent
b0f2d87ce2
commit
5d2b34f3b6
@ -60,7 +60,7 @@ _Site Crawl_
|
||||
: This scope will crawl all pages on the domain and any subdomains found. If `example.com` is set as the _Crawl Start URL_, both pages on `example.com` and `subdomain.example.com` will be crawled.
|
||||
|
||||
`Custom Page Prefix`
|
||||
: This scope will crawl all pages that begin with the _Crawl Start URL_ as well as pages from any URL that begin with the URLs listed in `Extra URL Prefixes in Scope`
|
||||
: This scope will crawl the _Crawl Start URL_ and then include only those pages whose URLs begin with one of the prefixes listed in _URL Prefixes in Scope_. By default, _URL Prefixes in Scope_ will be prefilled with the portion of the _Crawl Start URL_ up to and including the last `/`. For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to _URL Prefixes in Scope_. This prefix can then be removed or modified as needed.
|
||||
|
||||
### Page URL(s)
|
||||
|
||||
|
@ -1034,12 +1034,12 @@ https://archiveweb.page/guide`}
|
||||
break;
|
||||
case ScopeType.Custom:
|
||||
helpText = msg(
|
||||
html`Will crawl all page URLs that begin with
|
||||
html`Will start with
|
||||
<span class="break-word text-blue-500"
|
||||
>${exampleDomain}${examplePathname}</span
|
||||
>
|
||||
or any URL that begins with those specified in
|
||||
<em>Extra URL Prefixes in Scope</em>`,
|
||||
and include <em>only</em> URLs that start with the
|
||||
<em>URL Prefixes in Scope</em> listed below.`,
|
||||
);
|
||||
break;
|
||||
default:
|
||||
@ -1092,6 +1092,30 @@ https://archiveweb.page/guide`}
|
||||
true,
|
||||
);
|
||||
}
|
||||
if (
|
||||
this.formState.primarySeedUrl &&
|
||||
this.formState.scopeType === ScopeType.Custom &&
|
||||
!this.formState.customIncludeUrlList
|
||||
) {
|
||||
let prefixUrl = this.formState.primarySeedUrl;
|
||||
try {
|
||||
const startingUrl = new URL(this.formState.primarySeedUrl);
|
||||
prefixUrl =
|
||||
startingUrl.origin +
|
||||
startingUrl.pathname.slice(
|
||||
0,
|
||||
startingUrl.pathname.lastIndexOf("/") + 1,
|
||||
);
|
||||
} catch (e) {
|
||||
// ignore
|
||||
}
|
||||
this.updateFormState(
|
||||
{
|
||||
customIncludeUrlList: prefixUrl,
|
||||
},
|
||||
true,
|
||||
);
|
||||
}
|
||||
}}
|
||||
>
|
||||
<div slot="help-text">${helpText}</div>
|
||||
@ -1104,7 +1128,7 @@ https://archiveweb.page/guide`}
|
||||
${inputCol(html`
|
||||
<sl-textarea
|
||||
name="customIncludeUrlList"
|
||||
label=${msg("Extra URL Prefixes in Scope")}
|
||||
label=${msg("URL Prefixes in Scope")}
|
||||
rows="3"
|
||||
autocomplete="off"
|
||||
inputmode="url"
|
||||
@ -1115,8 +1139,7 @@ https://example.net`}
|
||||
></sl-textarea>
|
||||
`)}
|
||||
${this.renderHelpTextCol(
|
||||
msg(`If the crawler finds pages outside of the Crawl Scope they
|
||||
will only be saved if they begin with URLs listed here.`),
|
||||
msg(`Only crawl pages that begin with URLs listed here.`),
|
||||
)}
|
||||
`,
|
||||
)}
|
||||
@ -2643,7 +2666,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
||||
scopeType: this.formState.scopeType as ScopeType,
|
||||
include:
|
||||
this.formState.scopeType === ScopeType.Custom
|
||||
? [...includeUrlList.map((url) => regexEscape(url))]
|
||||
? [...includeUrlList.map((url) => "^" + regexEscape(url))]
|
||||
: [],
|
||||
extraHops: this.formState.includeLinkedPages ? 1 : 0,
|
||||
};
|
||||
|
Loading…
Reference in New Issue
Block a user