custom prefix additional fixes (#2746)

- Follow-up to #2736: strip leading '^' from custom prefix URLs via a utility function to avoid accumulating '^'. - Show URL prefix list in settings for custom prefix scope. - Update user guide with correct custom prefix field. --------- Co-authored-by: sua yoo <sua@webrecorder.org>
2025-07-18 18:21:32 -07:00 · 2025-07-18 18:21:32 -07:00 · 2f9a61f6be
commit 2f9a61f6be
parent 74aec5dfa3
5 changed files with 67 additions and 18 deletions
--- a/frontend/docs/docs/user-guide/workflow-setup.md
+++ b/frontend/docs/docs/user-guide/workflow-setup.md
@ -60,7 +60,7 @@ _Site Crawl_
 :   This scope will crawl all pages on the domain and any subdomains found. If `example.com` is set as the _Crawl Start URL_, both pages on `example.com` and `subdomain.example.com` will be crawled.

 `Custom Page Prefix`
-:   This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in _URL Prefixes in Scope_. By default, _URL Prefixes in Scope_ will be prefilled with the prefix of the _Crawl Start URL_ to the last `/`. For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to `URL Prefixes in Scope`. This prefix can then be removed or modified as needed.
+:   This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in [_URL Prefixes in Scope_](#url-prefixes-in-scope).

 ### Page URL(s)

@ -91,11 +91,13 @@ When enabled, the crawler will fail the entire crawl if any of the provided URLs

 Instructs the crawler to stop visiting new links past a specified depth.

-### Extra URL Prefixes in Scope
+### URL Prefixes in Scope

-This field accepts additional URLs or domains that will be crawled if URLs that lead to them are found.
+When using a scope of `Custom Page Prefix`, this field accepts URLs or domains that will be crawled if URLs that lead to them are found.

-This can be useful for crawling websites that span multiple domains such as `example.org` and `example.net`.
+By default, _URL Prefixes in Scope_ will be prefilled with the _Crawl Start URL_ up to the last slash (`/`). For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to _URL Prefixes in Scope_. This URL prefix can then be removed or modified as needed.
+
+This field can also be useful for crawling websites that span multiple domains such as `https://example.org` and `https://example.net`. To crawl websites that fall outside the crawl scope when using a scope type other than `Custom Page Prefix`, see [_Additional Pages_](#additional-pages).

 ### Include Any Linked Page ("one hop out")

--- a/frontend/src/components/ui/config-details.ts
+++ b/frontend/src/components/ui/config-details.ts
@ -3,9 +3,7 @@ import ISO6391 from "iso-639-1";
 import { html, nothing, type TemplateResult } from "lit";
 import { customElement, property, state } from "lit/decorators.js";
 import { when } from "lit/directives/when.js";
-import { html as staticHtml, unsafeStatic } from "lit/static-html.js";
 import capitalize from "lodash/fp/capitalize";
-import RegexColorize from "regex-colorize";

 import { BtrixElement } from "@/classes/BtrixElement";
 import { none, notSpecified } from "@/layouts/empty";
@ -21,6 +19,7 @@ import sectionStrings from "@/strings/crawl-workflows/section";
 import type { Collection } from "@/types/collection";
 import { WorkflowScopeType } from "@/types/workflow";
 import { isApiError } from "@/utils/api";
+import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix";
 import { DEPTH_SUPPORTED_SCOPES, isPageScopeType } from "@/utils/crawler";
 import { humanizeSchedule } from "@/utils/cron";
 import { pluralOf } from "@/utils/pluralize";
@ -433,19 +432,18 @@ export class ConfigDetails extends BtrixElement {
          : undefined,
        true,
      )}
-      ${when(scopeType === WorkflowScopeType.Prefix, () =>
+      ${when(scopeType === WorkflowScopeType.Custom, () =>
        this.renderSetting(
-          msg("Extra URL Prefixes in Scope"),
+          msg("URL Prefixes in Scope"),
          includeUrlList.length
            ? html`
-                <ul>
-                  ${includeUrlList.map(
-                    (url: string) =>
-                      staticHtml`<li class="regex">${unsafeStatic(
-                        new RegexColorize().colorizeText(url) as string,
-                      )}</li>`,
-                  )}
-                </ul>
+                <btrix-data-table
+                  .columns=${[msg("URL Prefix")]}
+                  .rows=${includeUrlList.map((url) => [
+                    unescapeCustomPrefix(url),
+                  ])}
+                >
+                </btrix-data-table>
              `
            : none,
          true,
--- a/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.test.ts
+++ b/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.test.ts
@ -0,0 +1,41 @@
+import { expect } from "@open-wc/testing";
+
+import { unescapeCustomPrefix } from "./unescapeCustomPrefix";
+
+// Behavior checks for unescapeCustomPrefix: plain URLs pass through
+// unchanged, regex-escaped characters are unescaped, and any leading "^"
+// characters are dropped.
+describe("unescapeCustomPrefix", () => {
+  it("doesn't modify a text URL", () => {
+    expect(unescapeCustomPrefix("https://example.com/")).to.equal(
+      "https://example.com/",
+    );
+  });
+
+  it("doesn't modify a text URL with query params", () => {
+    expect(
+      unescapeCustomPrefix("https://example.com/page?query&foo=bar"),
+    ).to.equal("https://example.com/page?query&foo=bar");
+  });
+
+  // Input is the regex-escaped form (`\.`); the expected output is plain text.
+  it("unescapes a regex URL", () => {
+    expect(unescapeCustomPrefix("https://example\\.com/")).to.equal(
+      "https://example.com/",
+    );
+  });
+
+  it("unescapes a regex URL with query params", () => {
+    expect(
+      unescapeCustomPrefix("https://example\\.com/page\\?query&foo=bar"),
+    ).to.equal("https://example.com/page?query&foo=bar");
+  });
+
+  it("removes leading ^ from a regex URL", () => {
+    expect(unescapeCustomPrefix("^https://example\\.com/")).to.equal(
+      "https://example.com/",
+    );
+  });
+
+  // Guards against "^" piling up when a prefix is saved repeatedly.
+  it("removes multiple leading ^ from a regex URL", () => {
+    expect(unescapeCustomPrefix("^^^https://example\\.com/")).to.equal(
+      "https://example.com/",
+    );
+  });
+});
--- a/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.ts
+++ b/frontend/src/utils/crawl-workflows/unescapeCustomPrefix.ts
@ -0,0 +1,8 @@
+import { regexUnescape } from "@/utils/string";
+
+/**
+ * Convert a stored "custom" scope prefix into the plain URL shown to users.
+ *
+ * Any run of `^` characters at the very start of the string is dropped, and
+ * the remainder is passed through `regexUnescape`. For example,
+ * `^https://example\.com/` is displayed as `https://example.com/`.
+ *
+ * @param urlPrefix - Prefix value, possibly regex-escaped and `^`-anchored
+ * @returns The result of `regexUnescape` applied to the un-anchored prefix
+ */
+export function unescapeCustomPrefix(urlPrefix: string) {
+  // Strip every leading "^" so repeated saves cannot accumulate anchors.
+  const withoutLeadingAnchors = urlPrefix.replace(/^\^+/, "");
+
+  return regexUnescape(withoutLeadingAnchors);
+}
--- a/frontend/src/utils/workflow.ts
+++ b/frontend/src/utils/workflow.ts
@ -18,10 +18,10 @@ import {
  WorkflowScopeType,
  type NewWorkflowOnlyScopeType,
 } from "@/types/workflow";
+import { unescapeCustomPrefix } from "@/utils/crawl-workflows/unescapeCustomPrefix";
 import { DEFAULT_MAX_SCALE, isPageScopeType } from "@/utils/crawler";
 import { getNextDate, getScheduleInterval } from "@/utils/cron";
 import localize, { getDefaultLang } from "@/utils/localize";
-import { regexUnescape } from "@/utils/string";

 export const BYTES_PER_GB = 1e9;
 export const DEFAULT_SELECT_LINKS = ["a[href]->href" as const];
@ -218,7 +218,7 @@ export function getInitialFormState(params: {
    if (primarySeedConfig.include?.length) {
      formState.customIncludeUrlList = primarySeedConfig.include
        // Unescape regex
-        .map(regexUnescape)
+        .map(unescapeCustomPrefix)
        .join("\n");
      // if we have additional include URLs, set to "custom" scope here
      // to indicate 'Custom Page Prefix' option