From 2f9a61f6beda9f2dfacd3caf14148a9616dbbc1a Mon Sep 17 00:00:00 2001
From: Ilya Kreymer <ikreymer@users.noreply.github.com>
Date: Fri, 18 Jul 2025 18:21:32 -0700
Subject: [PATCH] custom prefix additional fixes (#2746)

- follow-up to #2736: strip leading '^' from custom prefix URLs via a utility function, to avoid accumulating '^'
- Show URL prefix list in settings for custom prefix scope.
- Update user guide with correct custom prefix field.

---------

Co-authored-by: sua yoo <sua@webrecorder.org>
---
 .../docs/docs/user-guide/workflow-setup.md    | 10 +++--
 frontend/src/components/ui/config-details.ts  | 22 +++++-----
 .../unescapeCustomPrefix.test.ts              | 41 +++++++++++++++++++
 .../crawl-workflows/unescapeCustomPrefix.ts   |  8 ++++
 frontend/src/utils/workflow.ts                |  4 +-
 5 files changed, 67 insertions(+), 18 deletions(-)
 create mode 100644 frontend/src/utils/crawl-workflows/unescapeCustomPrefix.test.ts
 create mode 100644 frontend/src/utils/crawl-workflows/unescapeCustomPrefix.ts

diff --git a/frontend/docs/docs/user-guide/workflow-setup.md b/frontend/docs/docs/user-guide/workflow-setup.md
index cc75711a..83bc49a8 100644
--- a/frontend/docs/docs/user-guide/workflow-setup.md
+++ b/frontend/docs/docs/user-guide/workflow-setup.md
@@ -60,7 +60,7 @@ _Site Crawl_
 :   This scope will crawl all pages on the domain and any subdomains found. If `example.com` is set as the _Crawl Start URL_, both pages on `example.com` and `subdomain.example.com` will be crawled.
 
 `Custom Page Prefix`
-:   This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in _URL Prefixes in Scope_. By default, _URL Prefixes in Scope_ will be prefilled with the prefix of the _Crawl Start URL_ to the last `/`. For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to `URL Prefixes in Scope`. This prefix can then be removed or modified as needed.
+:   This scope will crawl the _Crawl Start URL_ and then include only those pages whose URLs begin with one of the prefixes listed in [_URL Prefixes in Scope_](#url-prefixes-in-scope).
 
 ### Page URL(s)
 
@@ -91,11 +91,13 @@ When enabled, the crawler will fail the entire crawl if any of the provided URLs
 
 Instructs the crawler to stop visiting new links past a specified depth.
 
-### Extra URL Prefixes in Scope
+### URL Prefixes in Scope
 
-This field accepts additional URLs or domains that will be crawled if URLs that lead to them are found.
+When using a scope of `Custom Page Prefix`, this field accepts URLs or domains that will be crawled if URLs that lead to them are found.
 
-This can be useful for crawling websites that span multiple domains such as `example.org` and `example.net`.
+By default, _URL Prefixes in Scope_ will be prefilled with the _Crawl Start URL_ up to the last slash (`/`). For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to _URL Prefixes in Scope_. This URL prefix can then be removed or modified as needed.
+
+This field can also be useful for crawling websites that span multiple domains such as `https://example.org` and `https://example.net`. To crawl websites that are out of scope when using a scope type other than `Custom Page Prefix`, see [_Additional Pages_](#additional-pages).
 
 ### Include Any Linked Page ("one hop out")
 
diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts
index 88123235..af164a86 100644
--- a/frontend/src/components/ui/config-details.ts
+++ b/frontend/src/components/ui/config-details.ts
@@ -3,9 +3,7 @@ import ISO6391 from "iso-639-1";
 import { html, nothing, type TemplateResult } from "lit";
 import { customElement, property, state } from "lit/decorators.js";
 import { when } from "lit/directives/when.js";
-import { html as staticHtml, unsafeStatic } from "lit/static-html.js";
 import capitalize from "lodash/fp/capitalize";
-import RegexColorize from "regex-colorize";
 
 import { BtrixElement } from "@/classes/BtrixElement";
 import { none, notSpecified } from "@/layouts/empty";
@@ -21,6 +19,7 @@ import sectionStrings from "@/strings/crawl-workflows/section";
 import type { Collection } from "@/types/collection";
 import { WorkflowScopeType } from "@/types/workflow";
 import { isApiError } from "@/utils/api";
+import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix";
 import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler";
 import { humanizeSchedule } from "@/utils/cron";
 import { pluralOf } from "@/utils/pluralize";
@@ -433,19 +432,18 @@ export class ConfigDetails extends BtrixElement {
           : undefined,
         true,
       )}
-      ${when(scopeType === WorkflowScopeType.Prefix, () =>
+      ${when(scopeType === WorkflowScopeType.Custom, () =>
         this.renderSetting(
-          msg("Extra URL Prefixes in Scope"),
+          msg("URL Prefixes in Scope"),
           includeUrlList.length
             ? html`
-                <ul>
-                  ${includeUrlList.map(
-                    (url: string) =>
-                      staticHtml`<li class="regex">${unsafeStatic(
-                        new RegexColorize().colorizeText(url) as string,
-                      )}</li>`,
-                  )}
-                </ul>
+                <btrix-data-table
+                  .columns=${[msg("URL Prefix")]}
+                  .rows=${includeUrlList.map((url) => [
+                    unescapeCustomPrefix(url),
+                  ])}
+                >
+                </btrix-data-table>
               `
             : none,
           true,
diff --git a/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.test.ts b/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.test.ts
new file mode 100644
index 00000000..cf32b9e5
--- /dev/null
+++ b/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.test.ts
@@ -0,0 +1,41 @@
+import { expect } from "@open-wc/testing";
+
+import { unescapeCustomPrefix } from "./unescapeCustomPrefix";
+
+describe("unescapeCustomPrefix", () => {
+  it("doesn't modify a text URL", () => {
+    expect(unescapeCustomPrefix("https://example.com/")).to.equal(
+      "https://example.com/",
+    );
+  });
+
+  it("doesn't modify a text URL with query params", () => {
+    expect(
+      unescapeCustomPrefix("https://example.com/page?query&foo=bar"),
+    ).to.equal("https://example.com/page?query&foo=bar");
+  });
+
+  it("unescapes a regex URL", () => {
+    expect(unescapeCustomPrefix("https://example\\.com/")).to.equal(
+      "https://example.com/",
+    );
+  });
+
+  it("unescapes a regex URL with query params", () => {
+    expect(
+      unescapeCustomPrefix("https://example\\.com/page\\?query&foo=bar"),
+    ).to.equal("https://example.com/page?query&foo=bar");
+  });
+
+  it("removes leading ^ from a regex URL", () => {
+    expect(unescapeCustomPrefix("^https://example\\.com/")).to.equal(
+      "https://example.com/",
+    );
+  });
+
+  it("removes multiple leading ^ from a regex URL", () => {
+    expect(unescapeCustomPrefix("^^^https://example\\.com/")).to.equal(
+      "https://example.com/",
+    );
+  });
+});
diff --git a/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.ts b/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.ts
new file mode 100644
index 00000000..79725736
--- /dev/null
+++ b/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.ts
@@ -0,0 +1,8 @@
+import { regexUnescape } from "@/utils/string";
+
+/** Prepare a "custom" scope URL prefix for user display: drop every leading "^"
+ * (so anchors cannot accumulate) and pass the remainder through `regexUnescape`.
+ */
+export function unescapeCustomPrefix(urlPrefix: string) {
+  return regexUnescape(urlPrefix.replace(/^\^+/, ""));
+}
diff --git a/frontend/src/utils/workflow.ts b/frontend/src/utils/workflow.ts
index fae9693e..ffa19d4d 100644
--- a/frontend/src/utils/workflow.ts
+++ b/frontend/src/utils/workflow.ts
@@ -18,10 +18,10 @@ import {
   WorkflowScopeType,
   type NewWorkflowOnlyScopeType,
 } from "@/types/workflow";
+import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix";
 import { DEFAULT_MAX_SCALE, isPageScopeType } from "@/utils/crawler";
 import { getNextDate, getScheduleInterval } from "@/utils/cron";
 import localize, { getDefaultLang } from "@/utils/localize";
-import { regexUnescape } from "@/utils/string";
 
 export const BYTES_PER_GB = 1e9;
 export const DEFAULT_SELECT_LINKS = ["a[href]->href" as const];
@@ -218,7 +218,7 @@ export function getInitialFormState(params: {
     if (primarySeedConfig.include?.length) {
       formState.customIncludeUrlList = primarySeedConfig.include
         // Unescape regex
-        .map(regexUnescape)
+        .map(unescapeCustomPrefix)
         .join("\n");
       // if we have additional include URLs, set to "custom" scope here
       // to indicate 'Custom Page Prefix' option