From 0c8c397fcac746f38e6513e7bb679cc17cfd4c69 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Tue, 29 Jul 2025 01:58:43 -0400
Subject: [PATCH] Add option to fail crawl if not logged in (#2754)

This PR adds a new checkbox to both page and seed crawl workflow types,
which will fail the crawl if behaviors detect the browser is not logged in
for supported sites.

Changes include:

- Backend support for the new crawler flag
- A new `failed_not_logged_in` crawl state
- Checkbox in the workflow editor and config details in the frontend
  (currently in the Scope section - I think it makes sense to have this
  option up front, but worth considering)
- User Guide documentation of the new option
- A new nightly test for the new workflow option and the
  `failed_not_logged_in` state

---------

Co-authored-by: Ilya Kreymer
Co-authored-by: sua yoo
---
 backend/btrixcloud/models.py                  |  2 +
 backend/btrixcloud/operator/crawls.py         | 23 +++--
 .../test_nightly/test_crawl_not_logged_in.py  | 89 +++++++++++++++++++
 .../docs/docs/user-guide/workflow-setup.md    |  6 ++
 frontend/src/components/ui/config-details.ts  |  8 ++
 .../features/archived-items/crawl-status.ts   | 10 +++
 .../crawl-workflows/workflow-editor.ts        | 41 ++++++++-
 .../features/crawl-workflows/workflow-list.ts |  3 +-
 frontend/src/pages/org/workflow-detail.ts     |  5 +-
 .../src/strings/crawl-workflows/infoText.ts   |  3 +
 frontend/src/types/crawlState.ts              |  1 +
 frontend/src/types/crawler.ts                 |  1 +
 frontend/src/utils/workflow.ts                |  5 ++
 13 files changed, 188 insertions(+), 9 deletions(-)
 create mode 100644 backend/test_nightly/test_crawl_not_logged_in.py

diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py
index b2f65d89..e31ac097 100644
--- a/backend/btrixcloud/models.py
+++ b/backend/btrixcloud/models.py
@@ -243,6 +243,7 @@ WAITING_STATES = get_args(TYPE_WAITING_STATES)
 TYPE_FAILED_STATES = Literal[
     "canceled",
     "failed",
+    "failed_not_logged_in",
     "skipped_storage_quota_reached",
     "skipped_time_quota_reached",
 ]
@@ -358,6 +359,7 @@ class RawCrawlConfig(BaseModel):
     useSitemap: Optional[bool] = False
     failOnFailedSeed: Optional[bool] = False
+    failOnContentCheck: Optional[bool] = False
 
     logging: Optional[str] = None
     behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py
index 29f5ecb0..32d47a7f 100644
--- a/backend/btrixcloud/operator/crawls.py
+++ b/backend/btrixcloud/operator/crawls.py
@@ -4,7 +4,7 @@ import traceback
 import os
 import math
 from pprint import pprint
-from typing import Optional, Any, Sequence
+from typing import Optional, Any, Sequence, Literal
 from datetime import datetime, timedelta
 from uuid import UUID
 
@@ -827,15 +827,26 @@ class CrawlOperator(BaseOperator):
         crawl: CrawlSpec,
         status: CrawlStatus,
         pods: dict,
-        stats: Optional[CrawlStats] = None,
+        stats: CrawlStats,
+        redis: Redis,
     ) -> bool:
         """Mark crawl as failed, log crawl state and print crawl logs, if possible"""
         prev_state = status.state
 
-        if not await self.mark_finished(crawl, status, "failed", stats=stats):
+        failed_state: Literal["failed", "failed_not_logged_in"] = "failed"
+
+        fail_reason = await redis.get(f"{crawl.id}:failReason")
+
+        if fail_reason == "not_logged_in":
+            failed_state = "failed_not_logged_in"
+
+        if not await self.mark_finished(crawl, status, failed_state, stats=stats):
             return False
 
-        if not self.log_failed_crawl_lines or prev_state == "failed":
+        if not self.log_failed_crawl_lines or prev_state in (
+            "failed",
+            "failed_not_logged_in",
+        ):
             return True
 
         pod_names = list(pods.keys())
@@ -1579,7 +1590,7 @@ class CrawlOperator(BaseOperator):
             # check if one-page crawls actually succeeded
             # if only one page found, and no files, assume failed
             if status.pagesFound == 1 and not status.filesAdded:
-                await self.fail_crawl(crawl, status, pods, stats)
+                await self.fail_crawl(crawl, status, pods, stats, redis)
                 return status
 
             state: TYPE_NON_RUNNING_STATES
@@ -1602,7 +1613,7 @@ class CrawlOperator(BaseOperator):
             if status.stopping and not status.pagesDone:
                 await self.mark_finished(crawl, status, "canceled", stats)
             else:
-                await self.fail_crawl(crawl, status, pods, stats)
+                await self.fail_crawl(crawl, status, pods, stats, redis)
 
         # check for other statuses, default to "running"
         else:
diff --git a/backend/test_nightly/test_crawl_not_logged_in.py b/backend/test_nightly/test_crawl_not_logged_in.py
new file mode 100644
index 00000000..0d65b686
--- /dev/null
+++ b/backend/test_nightly/test_crawl_not_logged_in.py
@@ -0,0 +1,89 @@
+import time
+
+import pytest
+import requests
+
+from .conftest import API_PREFIX
+
+config_id = None
+
+
+@pytest.fixture(scope="session")
+def fail_not_logged_in_crawl_id(admin_auth_headers, default_org_id):
+    # Start crawl
+    crawl_data = {
+        "runNow": True,
+        "name": "Fail Crawl Not Logged In",
+        "config": {
+            "seeds": [{"url": "https://x.com/webrecorder_io"}],
+            "scopeType": "page",
+            "limit": 1,
+            "failOnContentCheck": True,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    global config_id
+    config_id = data["id"]
+
+    crawl_id = data["run_now_job"]
+
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "running":
+            # Give crawl time to start properly
+            time.sleep(30)
+            return crawl_id
+        time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def failed_crawl_finished(
+    admin_auth_headers, default_org_id, fail_not_logged_in_crawl_id
+):
+    # Wait for crawl to complete
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] in ("complete", "failed", "failed_not_logged_in"):
+            # Give some time for WACZ files to be stored
+            time.sleep(30)
+            break
+        time.sleep(5)
+
+
+def test_fail_crawl_not_logged_in(
+    admin_auth_headers,
+    default_org_id,
+    fail_not_logged_in_crawl_id,
+    failed_crawl_finished,
+):
+    # Ensure crawl has expected state
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["state"] == "failed_not_logged_in"
+
+    # Ensure workflow lastCrawlState has expected state
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["lastCrawlState"] == "failed_not_logged_in"
diff --git a/frontend/docs/docs/user-guide/workflow-setup.md b/frontend/docs/docs/user-guide/workflow-setup.md
index 83bc49a8..a80b4355 100644
--- a/frontend/docs/docs/user-guide/workflow-setup.md
+++ b/frontend/docs/docs/user-guide/workflow-setup.md
@@ -83,6 +83,12 @@ When enabled, the crawler will visit all the links it finds within each page def
 ??? example "Crawling tags & search queries with Page List crawls"
     This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _Crawl URL(s)_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page.
 
+### Fail Crawl if Not Logged In
+
+When enabled, the crawl will fail if a [page behavior](#page-behavior) detects the presence or absence of content on supported pages indicating that the browser is not logged in.
+
+For details about which websites are supported and how to add this functionality to your own [custom behaviors](#use-custom-behaviors), see the [Browsertrix Crawler documentation for Fail on Content Check](https://crawler.docs.browsertrix.com/user-guide/behaviors/#fail-on-content-check).
+
 ### Fail Crawl on Failed URL
 
 When enabled, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled. The resulting archived item will have a status of "Failed".
diff --git a/frontend/src/components/ui/config-details.ts b/frontend/src/components/ui/config-details.ts
index b75fb32d..a9f6eeb1 100644
--- a/frontend/src/components/ui/config-details.ts
+++ b/frontend/src/components/ui/config-details.ts
@@ -438,6 +438,10 @@ export class ConfigDetails extends BtrixElement {
             msg("Include Any Linked Page (“one hop out”)"),
             Boolean(config.extraHops),
           )}
+          ${this.renderSetting(
+            msg("Fail Crawl If Not Logged In"),
+            Boolean(config.failOnContentCheck),
+          )}
           ${when(
             config.extraHops,
             () => html`${this.renderLinkSelectors()}${this.renderExclusions()}`,
@@ -502,6 +506,10 @@ export class ConfigDetails extends BtrixElement {
             msg("Check For Sitemap"),
             Boolean(config.useSitemap),
          )}
+          ${this.renderSetting(
+            msg("Fail Crawl if Not Logged In"),
+            Boolean(config.failOnContentCheck),
+          )}
           ${this.renderLinkSelectors()}
           ${this.renderSetting(
             msg("Additional Page URLs"),
diff --git a/frontend/src/features/archived-items/crawl-status.ts b/frontend/src/features/archived-items/crawl-status.ts
index 05f469d9..c9c9d347 100644
--- a/frontend/src/features/archived-items/crawl-status.ts
+++ b/frontend/src/features/archived-items/crawl-status.ts
@@ -227,6 +227,16 @@ export class CrawlStatus extends TailwindElement {
         label = msg("Failed");
         break;
 
+      case "failed_not_logged_in":
+        color = "var(--danger)";
+        icon = html``;
+        label = msg("Failed: Not Logged In");
+        break;
+
       case "skipped_storage_quota_reached":
         color = "var(--danger)";
         icon = html`
+              ${msg("Fail crawl if not logged in")}
+            `)}
+            ${this.renderHelpTextCol(
+              html`${infoTextFor["failOnContentCheck"]}
+                ${this.renderUserGuideLink({
+                  hash: "fail-crawl-if-not-logged-in",
+                  content: msg("More details"),
+                })}.`,
+              false,
+            )}
             ${when(this.formState.includeLinkedPages, () =>
               this.renderLinkSelectors(),
             )}
@@ -1495,6 +1511,22 @@ https://example.net`}
                 ),
                 false,
               )}
+              ${inputCol(html`
+
+                  ${msg("Fail crawl if not logged in")}
+
+              `)}
+              ${this.renderHelpTextCol(
+                html`${infoTextFor["failOnContentCheck"]}
+                  ${this.renderUserGuideLink({
+                    hash: "fail-crawl-if-not-logged-in",
+                    content: msg("More details"),
+                  })}.`,
+                false,
+              )}
               ${this.renderLinkSelectors()}
@@ -3031,6 +3063,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
     | "extraHops"
     | "useSitemap"
     | "failOnFailedSeed"
+    | "failOnContentCheck"
   > {
     const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;
 
@@ -3048,6 +3081,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
       extraHops: this.formState.includeLinkedPages ? 1 : 0,
       useSitemap: false,
       failOnFailedSeed: this.formState.failOnFailedSeed,
+      failOnContentCheck: this.formState.failOnContentCheck,
     };
 
     return config;
@@ -3055,7 +3089,11 @@ https://archiveweb.page/images/${"logo.svg"}`}
   private parseSeededConfig(): Pick<
     CrawlConfigParams["config"],
-    "seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed"
+    | "seeds"
+    | "scopeType"
+    | "useSitemap"
+    | "failOnFailedSeed"
+    | "failOnContentCheck"
   > {
     const primarySeedUrl = this.formState.primarySeedUrl;
     const includeUrlList = this.formState.customIncludeUrlList
@@ -3086,6 +3124,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
       scopeType: this.formState.scopeType as ScopeType,
       useSitemap: this.formState.useSitemap,
       failOnFailedSeed: false,
+      failOnContentCheck: this.formState.failOnContentCheck,
     };
     return config;
   }
diff --git a/frontend/src/features/crawl-workflows/workflow-list.ts b/frontend/src/features/crawl-workflows/workflow-list.ts
index fdedc926..ef416e8d 100644
--- a/frontend/src/features/crawl-workflows/workflow-list.ts
+++ b/frontend/src/features/crawl-workflows/workflow-list.ts
@@ -244,7 +244,8 @@ export class WorkflowListItem extends BtrixElement {
           }
           e.preventDefault();
           await this.updateComplete;
-          const href = `/orgs/${this.orgSlugState}/workflows/${this.workflow?.id}/${this.workflow?.lastCrawlState === "failed" ? WorkflowTab.Logs : WorkflowTab.LatestCrawl}`;
+          const failedStates = ["failed", "failed_not_logged_in"];
+          const href = `/orgs/${this.orgSlugState}/workflows/${this.workflow?.id}/${failedStates.includes(this.workflow?.lastCrawlState || "") ? WorkflowTab.Logs : WorkflowTab.LatestCrawl}`;
           this.navigate.to(href);
         }}
       >
diff --git a/frontend/src/pages/org/workflow-detail.ts b/frontend/src/pages/org/workflow-detail.ts
index fd44fec2..5777f94f 100644
--- a/frontend/src/pages/org/workflow-detail.ts
+++ b/frontend/src/pages/org/workflow-detail.ts
@@ -1909,6 +1909,7 @@ export class WorkflowDetail extends BtrixElement {
         message = msg("This crawl can’t be replayed since it was canceled.");
         break;
       case "failed":
+      case "failed_not_logged_in":
         message = msg("This crawl can’t be replayed because it failed.");
         break;
       default:
@@ -1920,7 +1921,9 @@ export class WorkflowDetail extends BtrixElement {
     const actionButton = (workflow: Workflow) => {
       if (!workflow.lastCrawlId) return;
 
-      if (workflow.lastCrawlState === "failed") {
+      const failedStates = ["failed", "failed_not_logged_in"];
+
+      if (failedStates.includes(workflow.lastCrawlState || "")) {
         return html`
 >;
 export default infoTextFor;
diff --git a/frontend/src/types/crawlState.ts b/frontend/src/types/crawlState.ts
index 1a63aecb..9c56c48d 100644
--- a/frontend/src/types/crawlState.ts
+++ b/frontend/src/types/crawlState.ts
@@ -28,6 +28,7 @@ export const SUCCESSFUL_STATES = [
 export const FAILED_STATES = [
   "canceled",
   "failed",
+  "failed_not_logged_in",
   "skipped_storage_quota_reached",
   "skipped_time_quota_reached",
 ] as const;
diff --git a/frontend/src/types/crawler.ts b/frontend/src/types/crawler.ts
index fcbab10a..45aa5d4f 100644
--- a/frontend/src/types/crawler.ts
+++ b/frontend/src/types/crawler.ts
@@ -44,6 +44,7 @@ export type SeedConfig = Expand<
     extraHops?: number | null;
     useSitemap?: boolean;
     failOnFailedSeed?: boolean;
+    failOnContentCheck?: boolean;
     depth?: number | null;
     userAgent?: string | null;
     selectLinks: string[];
diff --git a/frontend/src/utils/workflow.ts b/frontend/src/utils/workflow.ts
index 0f288af9..6d877ab6 100644
--- a/frontend/src/utils/workflow.ts
+++ b/frontend/src/utils/workflow.ts
@@ -115,6 +115,7 @@ export type FormState = {
   includeLinkedPages: boolean;
   useSitemap: boolean;
   failOnFailedSeed: boolean;
+  failOnContentCheck: boolean;
   customIncludeUrlList: string;
   crawlTimeoutMinutes: number;
   behaviorTimeoutSeconds: number | null;
@@ -177,6 +178,7 @@ export const getDefaultFormState = (): FormState => ({
   includeLinkedPages: false,
   useSitemap: false,
   failOnFailedSeed: false,
+  failOnContentCheck: false,
   customIncludeUrlList: "",
   crawlTimeoutMinutes: 0,
   maxCrawlSizeGB: 0,
@@ -269,6 +271,7 @@ export function getInitialFormState(params: {
     }
 
     formState.failOnFailedSeed = seedsConfig.failOnFailedSeed;
+    formState.failOnContentCheck = seedsConfig.failOnContentCheck;
   }
 
   if (params.initialWorkflow.schedule) {
@@ -354,6 +357,8 @@ export function getInitialFormState(params: {
     useSitemap: seedsConfig.useSitemap ?? defaultFormState.useSitemap,
     failOnFailedSeed:
       seedsConfig.failOnFailedSeed ?? defaultFormState.failOnFailedSeed,
+    failOnContentCheck:
+      seedsConfig.failOnContentCheck ?? defaultFormState.failOnContentCheck,
     pageLimit:
       params.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
     autoscrollBehavior: params.initialWorkflow.config.behaviors
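
For anyone trying the feature locally, the nightly test above doubles as an end-to-end example: create a workflow with `failOnContentCheck` set, run it against a page that requires login, and poll for the terminal state. A minimal sketch of that flow against the REST API follows; the base URL, token, and org ID are placeholders, while the endpoints, the `failOnContentCheck` field, and the `failed_not_logged_in` state are those used in the test.

import time

import requests

API_PREFIX = "https://btrix.example.com/api"  # placeholder: your Browsertrix API base URL
HEADERS = {"Authorization": "Bearer <access-token>"}  # placeholder: admin auth header
ORG_ID = "<org-id>"  # placeholder: target org UUID

# Create a one-page workflow with the new failOnContentCheck flag and run it immediately
r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawlconfigs/",
    headers=HEADERS,
    json={
        "runNow": True,
        "name": "Fail crawl if not logged in",
        "config": {
            "seeds": [{"url": "https://x.com/webrecorder_io"}],
            "scopeType": "page",
            "limit": 1,
            "failOnContentCheck": True,
        },
    },
)
crawl_id = r.json()["run_now_job"]

# Poll until the crawl reaches a terminal state; with no login profile attached,
# the content check should end the crawl in "failed_not_logged_in"
while True:
    r = requests.get(
        f"{API_PREFIX}/orgs/{ORG_ID}/crawls/{crawl_id}/replay.json",
        headers=HEADERS,
    )
    state = r.json()["state"]
    if state in ("complete", "failed", "failed_not_logged_in"):
        print(state)
        break
    time.sleep(5)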