custom prefix additional fixes (#2746)
- Follow-up to #2736: use a utility function to remove the leading '^' from custom prefix URLs, so that '^' characters no longer accumulate. - Show URL prefix list in settings for custom prefix scope. - Update user guide with correct custom prefix field. --------- Co-authored-by: sua yoo <sua@webrecorder.org>
This commit is contained in:
parent
74aec5dfa3
commit
2f9a61f6be
@ -60,7 +60,7 @@ _Site Crawl_
|
||||
: This scope will crawl all pages on the domain and any subdomains found. If `example.com` is set as the _Crawl Start URL_, both pages on `example.com` and `subdomain.example.com` will be crawled.
|
||||
|
||||
`Custom Page Prefix`
|
||||
: This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in _URL Prefixes in Scope_. By default, _URL Prefixes in Scope_ will be prefilled with the prefix of the _Crawl Start URL_ to the last `/`. For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to `URL Prefixes in Scope`. This prefix can then be removed or modified as needed.
|
||||
: This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in [_URL Prefixes in Scope_](#url-prefixes-in-scope).
|
||||
|
||||
### Page URL(s)
|
||||
|
||||
@ -91,11 +91,13 @@ When enabled, the crawler will fail the entire crawl if any of the provided URLs
|
||||
|
||||
Instructs the crawler to stop visiting new links past a specified depth.
|
||||
|
||||
### Extra URL Prefixes in Scope
|
||||
### URL Prefixes in Scope
|
||||
|
||||
This field accepts additional URLs or domains that will be crawled if URLs that lead to them are found.
|
||||
When using a scope of `Custom Page Prefix`, this field accepts URLs or domains that will be crawled if URLs that lead to them are found.
|
||||
|
||||
This can be useful for crawling websites that span multiple domains such as `example.org` and `example.net`.
|
||||
By default, _URL Prefixes in Scope_ will be prefilled with the _Crawl Start URL_ up to the last slash (`/`). For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to _URL Prefixes in Scope_. This URL prefix can then be removed or modified as needed.
|
||||
|
||||
This field can also be useful for crawling websites that span multiple domains such as `https://example.org` and `https://example.net`. To crawl websites outside of scope for scope types other than `Custom Page Prefix`, see [_Additional Pages_](#additional-pages).
|
||||
|
||||
### Include Any Linked Page ("one hop out")
|
||||
|
||||
|
@ -3,9 +3,7 @@ import ISO6391 from "iso-639-1";
|
||||
import { html, nothing, type TemplateResult } from "lit";
|
||||
import { customElement, property, state } from "lit/decorators.js";
|
||||
import { when } from "lit/directives/when.js";
|
||||
import { html as staticHtml, unsafeStatic } from "lit/static-html.js";
|
||||
import capitalize from "lodash/fp/capitalize";
|
||||
import RegexColorize from "regex-colorize";
|
||||
|
||||
import { BtrixElement } from "@/classes/BtrixElement";
|
||||
import { none, notSpecified } from "@/layouts/empty";
|
||||
@ -21,6 +19,7 @@ import sectionStrings from "@/strings/crawl-workflows/section";
|
||||
import type { Collection } from "@/types/collection";
|
||||
import { WorkflowScopeType } from "@/types/workflow";
|
||||
import { isApiError } from "@/utils/api";
|
||||
import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix";
|
||||
import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler";
|
||||
import { humanizeSchedule } from "@/utils/cron";
|
||||
import { pluralOf } from "@/utils/pluralize";
|
||||
@ -433,19 +432,18 @@ export class ConfigDetails extends BtrixElement {
|
||||
: undefined,
|
||||
true,
|
||||
)}
|
||||
${when(scopeType === WorkflowScopeType.Prefix, () =>
|
||||
${when(scopeType === WorkflowScopeType.Custom, () =>
|
||||
this.renderSetting(
|
||||
msg("Extra URL Prefixes in Scope"),
|
||||
msg("URL Prefixes in Scope"),
|
||||
includeUrlList.length
|
||||
? html`
|
||||
<ul>
|
||||
${includeUrlList.map(
|
||||
(url: string) =>
|
||||
staticHtml`<li class="regex">${unsafeStatic(
|
||||
new RegexColorize().colorizeText(url) as string,
|
||||
)}</li>`,
|
||||
)}
|
||||
</ul>
|
||||
<btrix-data-table
|
||||
.columns=${[msg("URL Prefix")]}
|
||||
.rows=${includeUrlList.map((url) => [
|
||||
unescapeCustomPrefix(url),
|
||||
])}
|
||||
>
|
||||
</btrix-data-table>
|
||||
`
|
||||
: none,
|
||||
true,
|
||||
|
@ -0,0 +1,41 @@
|
||||
import { expect } from "@open-wc/testing";
|
||||
|
||||
import { unescapeCustomPrefix } from "./unescapeCustomPrefix";
|
||||
|
||||
// Spec for `unescapeCustomPrefix`: plain URLs pass through unchanged, escaped
// regex URLs are unescaped, and any leading `^` characters are removed.
describe("unescapeCustomPrefix", () => {
  it("doesn't modify a text URL", () => {
    expect(unescapeCustomPrefix("https://example.com/")).to.equal(
      "https://example.com/",
    );
  });

  it("doesn't modify a text URL with query params", () => {
    expect(
      unescapeCustomPrefix("https://example.com/page?query&foo=bar"),
    ).to.equal("https://example.com/page?query&foo=bar");
  });

  // The input carries an escaped `.`; the expected output is the plain URL.
  it("unescapes a regex URL", () => {
    expect(unescapeCustomPrefix("https://example\\.com/")).to.equal(
      "https://example.com/",
    );
  });

  // Both the escaped `.` and the escaped `?` are expected to be unescaped.
  it("unescapes a regex URL with query params", () => {
    expect(
      unescapeCustomPrefix("https://example\\.com/page\\?query&foo=bar"),
    ).to.equal("https://example.com/page?query&foo=bar");
  });

  it("removes leading ^ from a regex URL", () => {
    expect(unescapeCustomPrefix("^https://example\\.com/")).to.equal(
      "https://example.com/",
    );
  });

  // Guards against `^` accumulating at the start of a stored prefix.
  it("removes multiple leading ^ from a regex URL", () => {
    expect(unescapeCustomPrefix("^^^https://example\\.com/")).to.equal(
      "https://example.com/",
    );
  });
});
|
@ -0,0 +1,8 @@
|
||||
import { regexUnescape } from "@/utils/string";
|
||||
|
||||
/**
 * Turn a "custom" scope URL prefix into the form displayed to the user.
 *
 * Every `^` at the very start of the string is dropped first, and the
 * remainder is then handed to `regexUnescape`.
 *
 * @param urlPrefix - Custom scope prefix, possibly regex-escaped and
 *   possibly starting with one or more `^` characters.
 * @returns The result of `regexUnescape` applied to the prefix with its
 *   leading `^` characters removed.
 */
export function unescapeCustomPrefix(urlPrefix: string) {
  // Strip the whole run of leading carets, not just the first one.
  const withoutLeadingCarets = urlPrefix.replace(/^\^+/, "");
  return regexUnescape(withoutLeadingCarets);
}
|
@ -18,10 +18,10 @@ import {
|
||||
WorkflowScopeType,
|
||||
type NewWorkflowOnlyScopeType,
|
||||
} from "@/types/workflow";
|
||||
import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix";
|
||||
import { DEFAULT_MAX_SCALE, isPageScopeType } from "@/utils/crawler";
|
||||
import { getNextDate, getScheduleInterval } from "@/utils/cron";
|
||||
import localize, { getDefaultLang } from "@/utils/localize";
|
||||
import { regexUnescape } from "@/utils/string";
|
||||
|
||||
export const BYTES_PER_GB = 1e9;
|
||||
export const DEFAULT_SELECT_LINKS = ["a[href]->href" as const];
|
||||
@ -218,7 +218,7 @@ export function getInitialFormState(params: {
|
||||
if (primarySeedConfig.include?.length) {
|
||||
formState.customIncludeUrlList = primarySeedConfig.include
|
||||
// Unescape regex
|
||||
.map(regexUnescape)
|
||||
.map(unescapeCustomPrefix)
|
||||
.join("\n");
|
||||
// if we have additional include URLs, set to "custom" scope here
|
||||
// to indicate 'Custom Page Prefix' option
|
||||
|
Loading…
Reference in New Issue
Block a user