Custom Page Prefix Usability Fixes (#2736)
- Automatically compute the prefix from the starting URL if no other prefix is set in custom prefix mode.
- Ensure each prefix is actually a prefix: add '^' to each custom prefix URL, as the include URL path is a regex.
- Rename 'Extra URL Prefixes' to just 'URL Prefixes' and adjust the help text to indicate that the prefix list is the list that is in scope.
- Fixes #2735, follow-up to #2722.
---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Co-authored-by: sua yoo <sua@webrecorder.org>
This commit is contained in:
parent
b0f2d87ce2
commit
5d2b34f3b6
@ -60,7 +60,7 @@ _Site Crawl_
|
|||||||
: This scope will crawl all pages on the domain and any subdomains found. If `example.com` is set as the _Crawl Start URL_, both pages on `example.com` and `subdomain.example.com` will be crawled.
|
: This scope will crawl all pages on the domain and any subdomains found. If `example.com` is set as the _Crawl Start URL_, both pages on `example.com` and `subdomain.example.com` will be crawled.
|
||||||
|
|
||||||
`Custom Page Prefix`
|
`Custom Page Prefix`
|
||||||
: This scope will crawl all pages that begin with the _Crawl Start URL_ as well as pages from any URL that begin with the URLs listed in `Extra URL Prefixes in Scope`
|
: This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in _URL Prefixes in Scope_. By default, _URL Prefixes in Scope_ will be prefilled with the portion of the _Crawl Start URL_ up to and including the last `/`. For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to _URL Prefixes in Scope_. This prefix can then be removed or modified as needed.
|
||||||
|
|
||||||
### Page URL(s)
|
### Page URL(s)
|
||||||
|
|
||||||
|
@ -1034,12 +1034,12 @@ https://archiveweb.page/guide`}
|
|||||||
break;
|
break;
|
||||||
case ScopeType.Custom:
|
case ScopeType.Custom:
|
||||||
helpText = msg(
|
helpText = msg(
|
||||||
html`Will crawl all page URLs that begin with
|
html`Will start with
|
||||||
<span class="break-word text-blue-500"
|
<span class="break-word text-blue-500"
|
||||||
>${exampleDomain}${examplePathname}</span
|
>${exampleDomain}${examplePathname}</span
|
||||||
>
|
>
|
||||||
or any URL that begins with those specified in
|
and include <em>only</em> URLs that start with the
|
||||||
<em>Extra URL Prefixes in Scope</em>`,
|
<em>URL Prefixes in Scope</em> listed below.`,
|
||||||
);
|
);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
@ -1092,6 +1092,30 @@ https://archiveweb.page/guide`}
|
|||||||
true,
|
true,
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
if (
|
||||||
|
this.formState.primarySeedUrl &&
|
||||||
|
this.formState.scopeType === ScopeType.Custom &&
|
||||||
|
!this.formState.customIncludeUrlList
|
||||||
|
) {
|
||||||
|
let prefixUrl = this.formState.primarySeedUrl;
|
||||||
|
try {
|
||||||
|
const startingUrl = new URL(this.formState.primarySeedUrl);
|
||||||
|
prefixUrl =
|
||||||
|
startingUrl.origin +
|
||||||
|
startingUrl.pathname.slice(
|
||||||
|
0,
|
||||||
|
startingUrl.pathname.lastIndexOf("/") + 1,
|
||||||
|
);
|
||||||
|
} catch (e) {
|
||||||
|
// Crawl Start URL could not be parsed: keep the raw value as the prefix
|
||||||
|
}
|
||||||
|
this.updateFormState(
|
||||||
|
{
|
||||||
|
customIncludeUrlList: prefixUrl,
|
||||||
|
},
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
}
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
<div slot="help-text">${helpText}</div>
|
<div slot="help-text">${helpText}</div>
|
||||||
@ -1104,7 +1128,7 @@ https://archiveweb.page/guide`}
|
|||||||
${inputCol(html`
|
${inputCol(html`
|
||||||
<sl-textarea
|
<sl-textarea
|
||||||
name="customIncludeUrlList"
|
name="customIncludeUrlList"
|
||||||
label=${msg("Extra URL Prefixes in Scope")}
|
label=${msg("URL Prefixes in Scope")}
|
||||||
rows="3"
|
rows="3"
|
||||||
autocomplete="off"
|
autocomplete="off"
|
||||||
inputmode="url"
|
inputmode="url"
|
||||||
@ -1115,8 +1139,7 @@ https://example.net`}
|
|||||||
></sl-textarea>
|
></sl-textarea>
|
||||||
`)}
|
`)}
|
||||||
${this.renderHelpTextCol(
|
${this.renderHelpTextCol(
|
||||||
msg(`If the crawler finds pages outside of the Crawl Scope they
|
msg(`Only crawl pages that begin with URLs listed here.`),
|
||||||
will only be saved if they begin with URLs listed here.`),
|
|
||||||
)}
|
)}
|
||||||
`,
|
`,
|
||||||
)}
|
)}
|
||||||
@ -2643,7 +2666,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
|
|||||||
scopeType: this.formState.scopeType as ScopeType,
|
scopeType: this.formState.scopeType as ScopeType,
|
||||||
include:
|
include:
|
||||||
this.formState.scopeType === ScopeType.Custom
|
this.formState.scopeType === ScopeType.Custom
|
||||||
? [...includeUrlList.map((url) => regexEscape(url))]
|
? [...includeUrlList.map((url) => "^" + regexEscape(url))]
|
||||||
: [],
|
: [],
|
||||||
extraHops: this.formState.includeLinkedPages ? 1 : 0,
|
extraHops: this.formState.includeLinkedPages ? 1 : 0,
|
||||||
};
|
};
|
||||||
|
Loading…
Reference in New Issue
Block a user