Add option to fail crawl if not logged in (#2754)

This PR adds a new checkbox to both the page and seed crawl workflow types;
when enabled, the crawl fails if page behaviors detect that the browser is
not logged in on supported sites.

Changes include:

- Backend support for the new crawler flag
- A new `failed_not_logged_in` crawl state
- A checkbox in the workflow editor and a corresponding entry in the config
details view in the frontend (currently in the Scope section - I think it
makes sense to have this option up front, but worth considering)
- User Guide documentation for the new option
- A new nightly test covering the new workflow option and the
`failed_not_logged_in` state (an example request enabling the option is
sketched below)
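
For reference, here is a minimal sketch of enabling the option through the API, modeled on the nightly test added in this PR; `API_PREFIX`, `org_id`, and `auth_headers` are placeholders, not part of the change:

```python
# Hypothetical sketch: create and run a page crawl workflow with the new
# failOnContentCheck flag enabled, mirroring the payload used by the
# nightly test in this PR. API_PREFIX, org_id, and auth_headers are
# placeholders for a real deployment and credentials.
import requests

crawl_data = {
    "runNow": True,
    "name": "Fail Crawl Not Logged In",
    "config": {
        "seeds": [{"url": "https://x.com/webrecorder_io"}],
        "scopeType": "page",
        "limit": 1,
        "failOnContentCheck": True,  # the new crawler flag
    },
}

r = requests.post(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/",
    headers=auth_headers,
    json=crawl_data,
)
data = r.json()
config_id = data["id"]
crawl_id = data["run_now_job"]
```

If a site-specific behavior then reports that the browser is not logged in, the crawl finishes in the new `failed_not_logged_in` state instead of `complete`.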


---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: sua yoo <sua@webrecorder.org>
Tessa Walsh 2025-07-29 01:58:43 -04:00 committed by GitHub
parent feafeae1eb
commit 0c8c397fca
13 changed files with 188 additions and 9 deletions

View File

@@ -243,6 +243,7 @@ WAITING_STATES = get_args(TYPE_WAITING_STATES)
TYPE_FAILED_STATES = Literal[
"canceled",
"failed",
"failed_not_logged_in",
"skipped_storage_quota_reached",
"skipped_time_quota_reached",
]
@@ -358,6 +359,7 @@ class RawCrawlConfig(BaseModel):
useSitemap: Optional[bool] = False
failOnFailedSeed: Optional[bool] = False
failOnContentCheck: Optional[bool] = False
logging: Optional[str] = None
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"

View File

@@ -4,7 +4,7 @@ import traceback
import os
import math
from pprint import pprint
from typing import Optional, Any, Sequence
from typing import Optional, Any, Sequence, Literal
from datetime import datetime, timedelta
from uuid import UUID
@@ -827,15 +827,26 @@ class CrawlOperator(BaseOperator):
crawl: CrawlSpec,
status: CrawlStatus,
pods: dict,
stats: Optional[CrawlStats] = None,
stats: CrawlStats,
redis: Redis,
) -> bool:
"""Mark crawl as failed, log crawl state and print crawl logs, if possible"""
prev_state = status.state
if not await self.mark_finished(crawl, status, "failed", stats=stats):
failed_state: Literal["failed", "failed_not_logged_in"] = "failed"
fail_reason = await redis.get(f"{crawl.id}:failReason")
if fail_reason == "not_logged_in":
failed_state = "failed_not_logged_in"
if not await self.mark_finished(crawl, status, failed_state, stats=stats):
return False
if not self.log_failed_crawl_lines or prev_state == "failed":
if not self.log_failed_crawl_lines or prev_state in (
"failed",
"failed_not_logged_in",
):
return True
pod_names = list(pods.keys())
@@ -1579,7 +1590,7 @@
# check if one-page crawls actually succeeded
# if only one page found, and no files, assume failed
if status.pagesFound == 1 and not status.filesAdded:
await self.fail_crawl(crawl, status, pods, stats)
await self.fail_crawl(crawl, status, pods, stats, redis)
return status
state: TYPE_NON_RUNNING_STATES
@@ -1602,7 +1613,7 @@
if status.stopping and not status.pagesDone:
await self.mark_finished(crawl, status, "canceled", stats)
else:
await self.fail_crawl(crawl, status, pods, stats)
await self.fail_crawl(crawl, status, pods, stats, redis)
# check for other statuses, default to "running"
else:
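
In short: the crawler records a failure reason under the `{crawl.id}:failReason` Redis key, and `fail_crawl` now maps a `not_logged_in` reason to the new terminal state. A condensed, illustrative sketch of that mapping (a standalone helper for clarity, not the operator code itself):

```python
# Illustrative sketch of the state selection in fail_crawl above; `redis`
# is assumed to be an async Redis client returning decoded strings.
from typing import Literal

async def resolve_failed_state(
    redis, crawl_id: str
) -> Literal["failed", "failed_not_logged_in"]:
    # The crawler sets "<crawl_id>:failReason" when a content check fails;
    # "not_logged_in" is the only reason handled by this PR. Any other
    # value, or a missing key, falls back to the generic "failed" state.
    fail_reason = await redis.get(f"{crawl_id}:failReason")
    return "failed_not_logged_in" if fail_reason == "not_logged_in" else "failed"
```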

View File

@@ -0,0 +1,89 @@
import time
import pytest
import requests
from .conftest import API_PREFIX
config_id = None
@pytest.fixture(scope="session")
def fail_not_logged_in_crawl_id(admin_auth_headers, default_org_id):
# Start crawl
crawl_data = {
"runNow": True,
"name": "Fail Crawl Not Logged In",
"config": {
"seeds": [{"url": "https://x.com/webrecorder_io"}],
"scopeType": "page",
"limit": 1,
"failOnContentCheck": True,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()
global config_id
config_id = data["id"]
crawl_id = data["run_now_job"]
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] == "running":
# Give crawl time to start properly
time.sleep(30)
return crawl_id
time.sleep(5)
@pytest.fixture(scope="session")
def failed_crawl_finished(
admin_auth_headers, default_org_id, fail_not_logged_in_crawl_id
):
# Wait for crawl to complete
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] in ("complete", "failed", "failed_not_logged_in"):
# Give some time for WACZ files to be stored
time.sleep(30)
break
time.sleep(5)
def test_fail_crawl_not_logged_in(
admin_auth_headers,
default_org_id,
fail_not_logged_in_crawl_id,
failed_crawl_finished,
):
# Ensure crawl has expected state
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["state"] == "failed_not_logged_in"
# Ensure workflow lastCrawlState has expected state
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["lastCrawlState"] == "failed_not_logged_in"

View File

@@ -83,6 +83,12 @@ When enabled, the crawler will visit all the links it finds within each page def
??? example "Crawling tags & search queries with Page List crawls"
This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _Crawl URL(s)_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page.
### Fail Crawl if Not Logged In
When enabled, the crawl will fail if a [page behavior](#page-behavior) detects the presence or absence of content on supported pages indicating that the browser is not logged in.
For details about which websites are supported and how to add this functionality to your own [custom behaviors](#use-custom-behaviors), see the [Browsertrix Crawler documentation for Fail on Content Check](https://crawler.docs.browsertrix.com/user-guide/behaviors/#fail-on-content-check).
### Fail Crawl on Failed URL
When enabled, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled. The resulting archived item will have a status of "Failed".
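
Outside the UI, the new state is visible through the API as well. A hedged sketch of detecting it from a script, using the same `replay.json` endpoint the nightly test polls (`API_PREFIX`, `org_id`, `crawl_id`, and `auth_headers` are placeholders):

```python
# Hypothetical sketch: check whether a finished crawl ended in the new
# failed_not_logged_in state. All names below are placeholders.
import requests

r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
    headers=auth_headers,
)
if r.json()["state"] == "failed_not_logged_in":
    # A page behavior's content check determined the browser was not
    # logged in on a supported site, so the crawl was failed.
    print("Crawl failed: browser not logged in; update the login and retry the workflow.")
```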

View File

@@ -438,6 +438,10 @@ export class ConfigDetails extends BtrixElement {
msg("Include Any Linked Page (“one hop out”)"),
Boolean(config.extraHops),
)}
${this.renderSetting(
msg("Fail Crawl If Not Logged In"),
Boolean(config.failOnContentCheck),
)}
${when(
config.extraHops,
() => html`${this.renderLinkSelectors()}${this.renderExclusions()}`,
@@ -502,6 +506,10 @@ export class ConfigDetails extends BtrixElement {
msg("Check For Sitemap"),
Boolean(config.useSitemap),
)}
${this.renderSetting(
msg("Fail Crawl if Not Logged In"),
Boolean(config.failOnContentCheck),
)}
${this.renderLinkSelectors()}
${this.renderSetting(
msg("Additional Page URLs"),

View File

@@ -227,6 +227,16 @@ export class CrawlStatus extends TailwindElement {
label = msg("Failed");
break;
case "failed_not_logged_in":
color = "var(--danger)";
icon = html`<sl-icon
name="exclamation-triangle-fill"
slot="prefix"
style="color: ${color}"
></sl-icon>`;
label = msg("Failed: Not Logged In");
break;
case "skipped_storage_quota_reached":
color = "var(--danger)";
icon = html`<sl-icon

View File

@@ -1007,6 +1007,22 @@ export class WorkflowEditor extends BtrixElement {
msg(`If checked, the crawler will visit pages one link away.`),
false,
)}
${inputCol(html`
<sl-checkbox
name="failOnContentCheck"
?checked=${this.formState.failOnContentCheck}
>
${msg("Fail crawl if not logged in")}
</sl-checkbox>
`)}
${this.renderHelpTextCol(
html`${infoTextFor["failOnContentCheck"]}
${this.renderUserGuideLink({
hash: "fail-crawl-if-not-logged-in",
content: msg("More details"),
})}.`,
false,
)}
${when(this.formState.includeLinkedPages, () =>
this.renderLinkSelectors(),
)}
@@ -1495,6 +1511,22 @@ https://example.net`}
),
false,
)}
${inputCol(html`
<sl-checkbox
name="failOnContentCheck"
?checked=${this.formState.failOnContentCheck}
>
${msg("Fail crawl if not logged in")}
</sl-checkbox>
`)}
${this.renderHelpTextCol(
html`${infoTextFor["failOnContentCheck"]}
${this.renderUserGuideLink({
hash: "fail-crawl-if-not-logged-in",
content: msg("More details"),
})}.`,
false,
)}
${this.renderLinkSelectors()}
<div class="col-span-5">
@@ -3031,6 +3063,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
| "extraHops"
| "useSitemap"
| "failOnFailedSeed"
| "failOnContentCheck"
> {
const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;
@@ -3048,6 +3081,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
extraHops: this.formState.includeLinkedPages ? 1 : 0,
useSitemap: false,
failOnFailedSeed: this.formState.failOnFailedSeed,
failOnContentCheck: this.formState.failOnContentCheck,
};
return config;
@@ -3055,7 +3089,11 @@ https://archiveweb.page/images/${"logo.svg"}`}
private parseSeededConfig(): Pick<
CrawlConfigParams["config"],
"seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed"
| "seeds"
| "scopeType"
| "useSitemap"
| "failOnFailedSeed"
| "failOnContentCheck"
> {
const primarySeedUrl = this.formState.primarySeedUrl;
const includeUrlList = this.formState.customIncludeUrlList
@@ -3086,6 +3124,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
scopeType: this.formState.scopeType as ScopeType,
useSitemap: this.formState.useSitemap,
failOnFailedSeed: false,
failOnContentCheck: this.formState.failOnContentCheck,
};
return config;
}

View File

@@ -244,7 +244,8 @@ export class WorkflowListItem extends BtrixElement {
}
e.preventDefault();
await this.updateComplete;
const href = `/orgs/${this.orgSlugState}/workflows/${this.workflow?.id}/${this.workflow?.lastCrawlState === "failed" ? WorkflowTab.Logs : WorkflowTab.LatestCrawl}`;
const failedStates = ["failed", "failed_not_logged_in"];
const href = `/orgs/${this.orgSlugState}/workflows/${this.workflow?.id}/${failedStates.includes(this.workflow?.lastCrawlState || "") ? WorkflowTab.Logs : WorkflowTab.LatestCrawl}`;
this.navigate.to(href);
}}
>

View File

@@ -1909,6 +1909,7 @@ export class WorkflowDetail extends BtrixElement {
message = msg("This crawl cant be replayed since it was canceled.");
break;
case "failed":
case "failed_not_logged_in":
message = msg("This crawl cant be replayed because it failed.");
break;
default:
@@ -1920,7 +1921,9 @@
const actionButton = (workflow: Workflow) => {
if (!workflow.lastCrawlId) return;
if (workflow.lastCrawlState === "failed") {
const failedStates = ["failed", "failed_not_logged_in"];
if (failedStates.includes(workflow.lastCrawlState || "")) {
return html`<div class="mt-4">
<sl-button
size="small"

View File

@@ -76,6 +76,9 @@ export const infoTextFor = {
customBehavior: msg(
`Enable custom page actions with behavior scripts. You can specify any publicly accessible URL or public Git repository.`,
),
failOnContentCheck: msg(
`Fail the crawl if a page behavior detects the browser is not logged in on supported pages.`,
),
} as const satisfies Partial<Record<Field, string | TemplateResult>>;
export default infoTextFor;

View File

@@ -28,6 +28,7 @@ export const SUCCESSFUL_STATES = [
export const FAILED_STATES = [
"canceled",
"failed",
"failed_not_logged_in",
"skipped_storage_quota_reached",
"skipped_time_quota_reached",
] as const;

View File

@@ -44,6 +44,7 @@ export type SeedConfig = Expand<
extraHops?: number | null;
useSitemap?: boolean;
failOnFailedSeed?: boolean;
failOnContentCheck?: boolean;
depth?: number | null;
userAgent?: string | null;
selectLinks: string[];

View File

@@ -115,6 +115,7 @@ export type FormState = {
includeLinkedPages: boolean;
useSitemap: boolean;
failOnFailedSeed: boolean;
failOnContentCheck: boolean;
customIncludeUrlList: string;
crawlTimeoutMinutes: number;
behaviorTimeoutSeconds: number | null;
@@ -177,6 +178,7 @@ export const getDefaultFormState = (): FormState => ({
includeLinkedPages: false,
useSitemap: false,
failOnFailedSeed: false,
failOnContentCheck: false,
customIncludeUrlList: "",
crawlTimeoutMinutes: 0,
maxCrawlSizeGB: 0,
@@ -269,6 +271,7 @@ export function getInitialFormState(params: {
}
formState.failOnFailedSeed = seedsConfig.failOnFailedSeed;
formState.failOnContentCheck = seedsConfig.failOnContentCheck;
}
if (params.initialWorkflow.schedule) {
@@ -354,6 +357,8 @@ export function getInitialFormState(params: {
useSitemap: seedsConfig.useSitemap ?? defaultFormState.useSitemap,
failOnFailedSeed:
seedsConfig.failOnFailedSeed ?? defaultFormState.failOnFailedSeed,
failOnContentCheck:
seedsConfig.failOnContentCheck ?? defaultFormState.failOnContentCheck,
pageLimit:
params.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
autoscrollBehavior: params.initialWorkflow.config.behaviors