From 2f9a61f6beda9f2dfacd3caf14148a9616dbbc1a Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 18 Jul 2025 18:21:32 -0700 Subject: [PATCH] custom prefix additional fixes (#2746) - follow-up to: #2736: remove '^' custom prefix URLs to avoid accumulating '^' via utility function - Show URL prefix list in settings for custom prefix scope. - Update user guide with correct custom prefix field. --------- Co-authored-by: sua yoo --- .../docs/docs/user-guide/workflow-setup.md | 10 +++-- frontend/src/components/ui/config-details.ts | 22 +++++----- .../unescapeCustomPrefix.test.ts | 41 +++++++++++++++++++ .../crawl-workflows/unescapeCustomPrefix.ts | 8 ++++ frontend/src/utils/workflow.ts | 4 +- 5 files changed, 67 insertions(+), 18 deletions(-) create mode 100644 frontend/src/utils/crawl-workflows/unescapeCustomPrefix.test.ts create mode 100644 frontend/src/utils/crawl-workflows/unescapeCustomPrefix.ts diff --git a/frontend/docs/docs/user-guide/workflow-setup.md b/frontend/docs/docs/user-guide/workflow-setup.md index cc75711a..83bc49a8 100644 --- a/frontend/docs/docs/user-guide/workflow-setup.md +++ b/frontend/docs/docs/user-guide/workflow-setup.md @@ -60,7 +60,7 @@ _Site Crawl_ : This scope will crawl all pages on the domain and any subdomains found. If `example.com` is set as the _Crawl Start URL_, both pages on `example.com` and `subdomain.example.com` will be crawled. `Custom Page Prefix` -: This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in _URL Prefixes in Scope_. By default, _URL Prefixes in Scope_ will be prefilled with the prefix of the _Crawl Start URL_ to the last `/`. For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to `URL Prefixes in Scope`. This prefix can then be removed or modified as needed. +: This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in [_URL Prefixes in Scope_](#url-prefixes-in-scope). ### Page URL(s) @@ -91,11 +91,13 @@ When enabled, the crawler will fail the entire crawl if any of the provided URLs Instructs the crawler to stop visiting new links past a specified depth. -### Extra URL Prefixes in Scope +### URL Prefixes in Scope -This field accepts additional URLs or domains that will be crawled if URLs that lead to them are found. +When using a scope of `Custom Page Prefix`, this field accepts URLs or domains that will be crawled if URLs that lead to them are found. -This can be useful for crawling websites that span multiple domains such as `example.org` and `example.net`. +By default, _URL Prefixes in Scope_ will be prefilled with the _Crawl Start URL_ up to the last slash (`/`). For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to _URL Prefixes in Scope_. This URL prefix can then be removed or modified as needed. + +This field can also be useful for crawling websites that span multiple domains such as `https://example.org` and `https://example.net`. To crawl websites outside of scope for scope types other than `Custom Page Prefix`, see [_Additional Pages_](#additional-pages). ### Include Any Linked Page ("one hop out") diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts index 88123235..af164a86 100644 --- a/frontend/src/components/ui/config-details.ts +++ b/frontend/src/components/ui/config-details.ts @@ -3,9 +3,7 @@ import ISO6391 from "iso-639-1"; import { html, nothing, type TemplateResult } from "lit"; import { customElement, property, state } from "lit/decorators.js"; import { when } from "lit/directives/when.js"; -import { html as staticHtml, unsafeStatic } from "lit/static-html.js"; import capitalize from "lodash/fp/capitalize"; -import RegexColorize from "regex-colorize"; import { BtrixElement } from "@/classes/BtrixElement"; import { none, notSpecified } from "@/layouts/empty"; @@ -21,6 +19,7 @@ import sectionStrings from "@/strings/crawl-workflows/section"; import type { Collection } from "@/types/collection"; import { WorkflowScopeType } from "@/types/workflow"; import { isApiError } from "@/utils/api"; +import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix"; import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler"; import { humanizeSchedule } from "@/utils/cron"; import { pluralOf } from "@/utils/pluralize"; @@ -433,19 +432,18 @@ export class ConfigDetails extends BtrixElement { : undefined, true, )} - ${when(scopeType === WorkflowScopeType.Prefix, () => + ${when(scopeType === WorkflowScopeType.Custom, () => this.renderSetting( - msg("Extra URL Prefixes in Scope"), + msg("URL Prefixes in Scope"), includeUrlList.length ? html` - + [ + unescapeCustomPrefix(url), + ])} + > + ` : none, true, diff --git a/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.test.ts b/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.test.ts new file mode 100644 index 00000000..cf32b9e5 --- /dev/null +++ b/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.test.ts @@ -0,0 +1,41 @@ +import { expect } from "@open-wc/testing"; + +import { unescapeCustomPrefix } from "./unescapeCustomPrefix"; + +describe("unescapeCustomPrefix", () => { + it("doesn't modify a text URL", () => { + expect(unescapeCustomPrefix("https://example.com/")).to.equal( + "https://example.com/", + ); + }); + + it("doesn't modify a text URL with query params", () => { + expect( + unescapeCustomPrefix("https://example.com/page?query&foo=bar"), + ).to.equal("https://example.com/page?query&foo=bar"); + }); + + it("escapes a regex URL", () => { + expect(unescapeCustomPrefix("https://example\\.com/")).to.equal( + "https://example.com/", + ); + }); + + it("escapes a regex URL with query params", () => { + expect( + unescapeCustomPrefix("https://example\\.com/page\\?query&foo=bar"), + ).to.equal("https://example.com/page?query&foo=bar"); + }); + + it("removes leading ^ from a regex URL", () => { + expect(unescapeCustomPrefix("^https://example\\.com/")).to.equal( + "https://example.com/", + ); + }); + + it("removes multiple leading ^ from a regex URL", () => { + expect(unescapeCustomPrefix("^^^https://example\\.com/")).to.equal( + "https://example.com/", + ); + }); +}); diff --git a/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.ts b/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.ts new file mode 100644 index 00000000..79725736 --- /dev/null +++ b/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.ts @@ -0,0 +1,8 @@ +import { regexUnescape } from "@/utils/string"; + +/** + * Unescape "custom" scope prefix URL for user display + */ +export function unescapeCustomPrefix(urlPrefix: string) { + return regexUnescape(urlPrefix.replace(/^\^+/, "")); +} diff --git a/frontend/src/utils/workflow.ts b/frontend/src/utils/workflow.ts index fae9693e..ffa19d4d 100644 --- a/frontend/src/utils/workflow.ts +++ b/frontend/src/utils/workflow.ts @@ -18,10 +18,10 @@ import { WorkflowScopeType, type NewWorkflowOnlyScopeType, } from "@/types/workflow"; +import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix"; import { DEFAULT_MAX_SCALE, isPageScopeType } from "@/utils/crawler"; import { getNextDate, getScheduleInterval } from "@/utils/cron"; import localize, { getDefaultLang } from "@/utils/localize"; -import { regexUnescape } from "@/utils/string"; export const BYTES_PER_GB = 1e9; export const DEFAULT_SELECT_LINKS = ["a[href]->href" as const]; @@ -218,7 +218,7 @@ export function getInitialFormState(params: { if (primarySeedConfig.include?.length) { formState.customIncludeUrlList = primarySeedConfig.include // Unescape regex - .map(regexUnescape) + .map(unescapeCustomPrefix) .join("\n"); // if we have additional include URLs, set to "custom" scope here // to indicate 'Custom Page Prefix' option