Custom Page Prefix Usability Fixes (#2736)

- Automatically compute the prefix from the starting URL if no other prefix is set in custom prefix mode.
- Ensure each prefix is actually a prefix: add '^' to each custom prefix URL, as the include URL path is a regex.
- Rename 'Extra URL Prefixes' to just 'URL Prefixes' and adjust the help text to indicate that the prefix list is the list that is in scope.
- Fixes #2735, follow-up to #2722.

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Co-authored-by: sua yoo <sua@webrecorder.org>
2025-07-15 13:19:20 -07:00 · 2025-07-15 13:19:20 -07:00 · 5d2b34f3b6
commit 5d2b34f3b6
parent b0f2d87ce2
2 changed files with 31 additions and 8 deletions
--- a/frontend/docs/docs/user-guide/workflow-setup.md
+++ b/frontend/docs/docs/user-guide/workflow-setup.md
@@ -60,7 +60,7 @@ _Site Crawl_
 :   This scope will crawl all pages on the domain and any subdomains found. If `example.com` is set as the _Crawl Start URL_, both pages on `example.com` and `subdomain.example.com` will be crawled.

 `Custom Page Prefix`
-:   This scope will crawl all pages that begin with the _Crawl Start URL_ as well as pages from any URL that begin with the URLs listed in `Extra URL Prefixes in Scope`
+:   This scope will crawl the _Crawl Start URL_ and then include only those pages that begin with the URLs listed in _URL Prefixes in Scope_. By default, _URL Prefixes in Scope_ will be prefilled with the prefix of the _Crawl Start URL_ up to and including the last `/`. For example, if `https://example.com/path/page` is set as the _Crawl Start URL_, `https://example.com/path/` will be automatically added to _URL Prefixes in Scope_. This prefix can then be removed or modified as needed.

 ### Page URL(s)

--- a/frontend/src/features/crawl-workflows/workflow-editor.ts
+++ b/frontend/src/features/crawl-workflows/workflow-editor.ts
@@ -1034,12 +1034,12 @@ https://archiveweb.page/guide`}
        break;
      case ScopeType.Custom:
        helpText = msg(
-          html`Will crawl all page URLs that begin with
+          html`Will start with
            <span class="break-word text-blue-500"
              >${exampleDomain}${examplePathname}</span
            >
-            or any URL that begins with those specified in
-            <em>Extra URL Prefixes in Scope</em>`,
+            and include <em>only</em> URLs that start with the
+            <em>URL Prefixes in Scope</em> listed below.`,
        );
        break;
      default:
@@ -1092,6 +1092,30 @@ https://archiveweb.page/guide`}
                true,
              );
            }
+            if (
+              this.formState.primarySeedUrl &&
+              this.formState.scopeType === ScopeType.Custom &&
+              !this.formState.customIncludeUrlList
+            ) {
+              let prefixUrl = this.formState.primarySeedUrl;
+              try {
+                const startingUrl = new URL(this.formState.primarySeedUrl);
+                prefixUrl =
+                  startingUrl.origin +
+                  startingUrl.pathname.slice(
+                    0,
+                    startingUrl.pathname.lastIndexOf("/") + 1,
+                  );
+              } catch (e) {
+                // ignore
+              }
+              this.updateFormState(
+                {
+                  customIncludeUrlList: prefixUrl,
+                },
+                true,
+              );
+            }
          }}
        >
          <div slot="help-text">${helpText}</div>
@@ -1104,7 +1128,7 @@ https://archiveweb.page/guide`}
          ${inputCol(html`
            <sl-textarea
              name="customIncludeUrlList"
-              label=${msg("Extra URL Prefixes in Scope")}
+              label=${msg("URL Prefixes in Scope")}
              rows="3"
              autocomplete="off"
              inputmode="url"
@@ -1115,8 +1139,7 @@ https://example.net`}
            ></sl-textarea>
          `)}
          ${this.renderHelpTextCol(
-            msg(`If the crawler finds pages outside of the Crawl Scope they
-            will only be saved if they begin with URLs listed here.`),
+            msg(`Only crawl pages that begin with URLs listed here.`),
          )}
        `,
      )}
@@ -2643,7 +2666,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
      scopeType: this.formState.scopeType as ScopeType,
      include:
        this.formState.scopeType === ScopeType.Custom
-          ? [...includeUrlList.map((url) => regexEscape(url))]
+          ? [...includeUrlList.map((url) => "^" + regexEscape(url))]
          : [],
      extraHops: this.formState.includeLinkedPages ? 1 : 0,
    };