custom prefix additional fixes (#2746)

- follow-up to #2736: strip leading '^' from custom prefix URLs via a utility function, to avoid accumulating '^'
- Show URL prefix list in settings for custom prefix scope.
- Update user guide with correct custom prefix field.

---------

Co-authored-by: sua yoo <sua@webrecorder.org>
This commit is contained in:
Ilya Kreymer 2025-07-18 18:21:32 -07:00 committed by GitHub
parent 74aec5dfa3
commit 2f9a61f6be
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 67 additions and 18 deletions

View File

@ -60,7 +60,7 @@ _Site Crawl_
: This scope will crawl all pages on the domain and any subdomains found. If `example.com` is set as the _Crawl Start URL_, both pages on `example.com` and `subdomain.example.com` will be crawled.
`Custom Page Prefix`
: This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in _URL Prefixes in Scope_. By default, _URL Prefixes in Scope_ will be prefilled with the prefix of the _Crawl Start URL_ to the last `/`. For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to `URL Prefixes in Scope`. This prefix can then be removed or modified as needed.
: This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in [_URL Prefixes in Scope_](#url-prefixes-in-scope).
### Page URL(s)
@ -91,11 +91,13 @@ When enabled, the crawler will fail the entire crawl if any of the provided URLs
Instructs the crawler to stop visiting new links past a specified depth.
### Extra URL Prefixes in Scope
### URL Prefixes in Scope
This field accepts additional URLs or domains that will be crawled if URLs that lead to them are found.
When using a scope of `Custom Page Prefix`, this field accepts URLs or domains that will be crawled if URLs that lead to them are found.
This can be useful for crawling websites that span multiple domains such as `example.org` and `example.net`.
By default, _URL Prefixes in Scope_ will be prefilled with the _Crawl Start URL_ up to the last slash (`/`). For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to _URL Prefixes in Scope_. This URL prefix can then be removed or modified as needed.
This field can also be useful for crawling websites that span multiple domains such as `https://example.org` and `https://example.net`. To crawl websites outside of scope for scope types other than `Custom Page Prefix`, see [_Additional Pages_](#additional-pages).
### Include Any Linked Page ("one hop out")

View File

@ -3,9 +3,7 @@ import ISO6391 from "iso-639-1";
import { html, nothing, type TemplateResult } from "lit";
import { customElement, property, state } from "lit/decorators.js";
import { when } from "lit/directives/when.js";
import { html as staticHtml, unsafeStatic } from "lit/static-html.js";
import capitalize from "lodash/fp/capitalize";
import RegexColorize from "regex-colorize";
import { BtrixElement } from "@/classes/BtrixElement";
import { none, notSpecified } from "@/layouts/empty";
@ -21,6 +19,7 @@ import sectionStrings from "@/strings/crawl-workflows/section";
import type { Collection } from "@/types/collection";
import { WorkflowScopeType } from "@/types/workflow";
import { isApiError } from "@/utils/api";
import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix";
import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler";
import { humanizeSchedule } from "@/utils/cron";
import { pluralOf } from "@/utils/pluralize";
@ -433,19 +432,18 @@ export class ConfigDetails extends BtrixElement {
: undefined,
true,
)}
${when(scopeType === WorkflowScopeType.Prefix, () =>
${when(scopeType === WorkflowScopeType.Custom, () =>
this.renderSetting(
msg("Extra URL Prefixes in Scope"),
msg("URL Prefixes in Scope"),
includeUrlList.length
? html`
<ul>
${includeUrlList.map(
(url: string) =>
staticHtml`<li class="regex">${unsafeStatic(
new RegexColorize().colorizeText(url) as string,
)}</li>`,
)}
</ul>
<btrix-data-table
.columns=${[msg("URL Prefix")]}
.rows=${includeUrlList.map((url) => [
unescapeCustomPrefix(url),
])}
>
</btrix-data-table>
`
: none,
true,

View File

@ -0,0 +1,41 @@
import { expect } from "@open-wc/testing";
import { unescapeCustomPrefix } from "./unescapeCustomPrefix";
// Unit tests for `unescapeCustomPrefix`: plain URLs must pass through
// unchanged, regex-escaped URLs must come back unescaped, and any run of
// leading `^` characters must be dropped.
describe("unescapeCustomPrefix", () => {
  it("doesn't modify a text URL", () => {
    expect(unescapeCustomPrefix("https://example.com/")).to.equal(
      "https://example.com/",
    );
  });

  it("doesn't modify a text URL with query params", () => {
    expect(
      unescapeCustomPrefix("https://example.com/page?query&foo=bar"),
    ).to.equal("https://example.com/page?query&foo=bar");
  });

  // The input contains an escaped `.`; the expected output is the plain URL.
  it("unescapes a regex URL", () => {
    expect(unescapeCustomPrefix("https://example\\.com/")).to.equal(
      "https://example.com/",
    );
  });

  // Both the escaped `.` and the escaped `?` must be restored.
  it("unescapes a regex URL with query params", () => {
    expect(
      unescapeCustomPrefix("https://example\\.com/page\\?query&foo=bar"),
    ).to.equal("https://example.com/page?query&foo=bar");
  });

  it("removes leading ^ from a regex URL", () => {
    expect(unescapeCustomPrefix("^https://example\\.com/")).to.equal(
      "https://example.com/",
    );
  });

  // Guards against `^` characters accumulating at the start of the prefix.
  it("removes multiple leading ^ from a regex URL", () => {
    expect(unescapeCustomPrefix("^^^https://example\\.com/")).to.equal(
      "https://example.com/",
    );
  });
});

View File

@ -0,0 +1,8 @@
import { regexUnescape } from "@/utils/string";
/**
 * Turn a "custom" scope URL prefix into the form shown to the user.
 *
 * Every `^` at the very start of the prefix is dropped first; the remainder
 * is then handed to `regexUnescape`.
 *
 * @param urlPrefix - Prefix to unescape; may start with one or more `^`.
 * @returns The result of `regexUnescape` applied to the prefix without its
 *   leading `^` characters.
 */
export function unescapeCustomPrefix(urlPrefix: string) {
  // Remove the whole run of leading carets, not just the first one.
  const withoutLeadingCarets = urlPrefix.replace(/^\^+/, "");
  return regexUnescape(withoutLeadingCarets);
}

View File

@ -18,10 +18,10 @@ import {
WorkflowScopeType,
type NewWorkflowOnlyScopeType,
} from "@/types/workflow";
import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix";
import { DEFAULT_MAX_SCALE, isPageScopeType } from "@/utils/crawler";
import { getNextDate, getScheduleInterval } from "@/utils/cron";
import localize, { getDefaultLang } from "@/utils/localize";
import { regexUnescape } from "@/utils/string";
export const BYTES_PER_GB = 1e9;
export const DEFAULT_SELECT_LINKS = ["a[href]->href" as const];
@ -218,7 +218,7 @@ export function getInitialFormState(params: {
if (primarySeedConfig.include?.length) {
formState.customIncludeUrlList = primarySeedConfig.include
// Unescape regex
.map(regexUnescape)
.map(unescapeCustomPrefix)
.join("\n");
// if we have additional include URLs, set to "custom" scope here
// to indicate 'Custom Page Prefix' option