Add option to fail crawl if not logged in (#2754)

This PR adds a new checkbox to both the page and seed crawl workflow types;
when enabled, the crawl fails if page behaviors detect that the browser is
not logged in on supported sites.

Changes include:

- Backend support for the new crawler flag
- A new `failed_not_logged_in` crawl state
- A checkbox in the workflow editor and a corresponding entry in the config
details view in the frontend (currently in the Scope section - I think it
makes sense to have this option up front, but worth considering)
- User Guide documentation for the new option
- A new nightly test covering the new workflow option and the
`failed_not_logged_in` state (an example request enabling the option is
sketched below)
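
For reference, here is a minimal sketch of enabling the option through the API, modeled on the nightly test added in this PR; `API_PREFIX`, `org_id`, and `auth_headers` are placeholders, not part of the change:

```python
# Hypothetical sketch: create and run a page crawl workflow with the new
# failOnContentCheck flag enabled, mirroring the payload used by the
# nightly test in this PR. API_PREFIX, org_id, and auth_headers are
# placeholders for a real deployment and credentials.
import requests

crawl_data = {
    "runNow": True,
    "name": "Fail Crawl Not Logged In",
    "config": {
        "seeds": [{"url": "https://x.com/webrecorder_io"}],
        "scopeType": "page",
        "limit": 1,
        "failOnContentCheck": True,  # the new crawler flag
    },
}

r = requests.post(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/",
    headers=auth_headers,
    json=crawl_data,
)
data = r.json()
config_id = data["id"]
crawl_id = data["run_now_job"]
```

If a site-specific behavior then reports that the browser is not logged in, the crawl finishes in the new `failed_not_logged_in` state instead of `complete`.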


---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: sua yoo <sua@webrecorder.org>
Tessa Walsh 2025-07-29 01:58:43 -04:00 committed by GitHub
parent feafeae1eb
commit 0c8c397fca
13 changed files with 188 additions and 9 deletions

View File

@@ -243,6 +243,7 @@ WAITING_STATES = get_args(TYPE_WAITING_STATES)
TYPE_FAILED_STATES = Literal[
"canceled",
"failed",
"failed_not_logged_in",
"skipped_storage_quota_reached",
"skipped_time_quota_reached",
]
@@ -358,6 +359,7 @@ class RawCrawlConfig(BaseModel):
useSitemap: Optional[bool] = False
failOnFailedSeed: Optional[bool] = False
failOnContentCheck: Optional[bool] = False
logging: Optional[str] = None
behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"

View File

@@ -4,7 +4,7 @@ import traceback
import os
import math
from pprint import pprint
from typing import Optional, Any, Sequence
from typing import Optional, Any, Sequence, Literal
from datetime import datetime, timedelta
from uuid import UUID
@@ -827,15 +827,26 @@ class CrawlOperator(BaseOperator):
crawl: CrawlSpec,
status: CrawlStatus,
pods: dict,
stats: Optional[CrawlStats] = None,
stats: CrawlStats,
redis: Redis,
) -> bool:
"""Mark crawl as failed, log crawl state and print crawl logs, if possible"""
prev_state = status.state
if not await self.mark_finished(crawl, status, "failed", stats=stats):
failed_state: Literal["failed", "failed_not_logged_in"] = "failed"
fail_reason = await redis.get(f"{crawl.id}:failReason")
if fail_reason == "not_logged_in":
failed_state = "failed_not_logged_in"
if not await self.mark_finished(crawl, status, failed_state, stats=stats):
return False
if not self.log_failed_crawl_lines or prev_state == "failed":
if not self.log_failed_crawl_lines or prev_state in (
"failed",
"failed_not_logged_in",
):
return True
pod_names = list(pods.keys())
@@ -1579,7 +1590,7 @@
# check if one-page crawls actually succeeded
# if only one page found, and no files, assume failed
if status.pagesFound == 1 and not status.filesAdded:
await self.fail_crawl(crawl, status, pods, stats)
await self.fail_crawl(crawl, status, pods, stats, redis)
return status
state: TYPE_NON_RUNNING_STATES
@@ -1602,7 +1613,7 @@
if status.stopping and not status.pagesDone:
await self.mark_finished(crawl, status, "canceled", stats)
else:
await self.fail_crawl(crawl, status, pods, stats)
await self.fail_crawl(crawl, status, pods, stats, redis)
# check for other statuses, default to "running"
else:
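
In short: the crawler records a failure reason under the `{crawl.id}:failReason` Redis key, and `fail_crawl` now maps a `not_logged_in` reason to the new terminal state. A condensed, illustrative sketch of that mapping (a standalone helper for clarity, not the operator code itself):

```python
# Illustrative sketch of the state selection in fail_crawl above; `redis`
# is assumed to be an async Redis client returning decoded strings.
from typing import Literal

async def resolve_failed_state(
    redis, crawl_id: str
) -> Literal["failed", "failed_not_logged_in"]:
    # The crawler sets "<crawl_id>:failReason" when a content check fails;
    # "not_logged_in" is the only reason handled by this PR. Any other
    # value, or a missing key, falls back to the generic "failed" state.
    fail_reason = await redis.get(f"{crawl_id}:failReason")
    return "failed_not_logged_in" if fail_reason == "not_logged_in" else "failed"
```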

View File

@@ -0,0 +1,89 @@
import time
import pytest
import requests
from .conftest import API_PREFIX
config_id = None
@pytest.fixture(scope="session")
def fail_not_logged_in_crawl_id(admin_auth_headers, default_org_id):
# Start crawl
crawl_data = {
"runNow": True,
"name": "Fail Crawl Not Logged In",
"config": {
"seeds": [{"url": "https://x.com/webrecorder_io"}],
"scopeType": "page",
"limit": 1,
"failOnContentCheck": True,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()
global config_id
config_id = data["id"]
crawl_id = data["run_now_job"]
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] == "running":
# Give crawl time to start properly
time.sleep(30)
return crawl_id
time.sleep(5)
@pytest.fixture(scope="session")
def failed_crawl_finished(
admin_auth_headers, default_org_id, fail_not_logged_in_crawl_id
):
# Wait for crawl to complete
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] in ("complete", "failed", "failed_not_logged_in"):
# Give some time for WACZ files to be stored
time.sleep(30)
break
time.sleep(5)
def test_fail_crawl_not_logged_in(
admin_auth_headers,
default_org_id,
fail_not_logged_in_crawl_id,
failed_crawl_finished,
):
# Ensure crawl has expected state
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["state"] == "failed_not_logged_in"
# Ensure workflow lastCrawlState has expected state
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["lastCrawlState"] == "failed_not_logged_in"

View File

@@ -83,6 +83,12 @@ When enabled, the crawler will visit all the links it finds within each page def
??? example "Crawling tags & search queries with Page List crawls"
This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _Crawl URL(s)_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page.
### Fail Crawl if Not Logged In
When enabled, the crawl will fail if a [page behavior](#page-behavior) detects the presence or absence of content on supported pages indicating that the browser is not logged in.
For details about which websites are supported and how to add this functionality to your own [custom behaviors](#use-custom-behaviors), see the [Browsertrix Crawler documentation for Fail on Content Check](https://crawler.docs.browsertrix.com/user-guide/behaviors/#fail-on-content-check).
### Fail Crawl on Failed URL
When enabled, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled. The resulting archived item will have a status of "Failed".
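
Outside the UI, the new state is visible through the API as well. A hedged sketch of detecting it from a script, using the same `replay.json` endpoint the nightly test polls (`API_PREFIX`, `org_id`, `crawl_id`, and `auth_headers` are placeholders):

```python
# Hypothetical sketch: check whether a finished crawl ended in the new
# failed_not_logged_in state. All names below are placeholders.
import requests

r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
    headers=auth_headers,
)
if r.json()["state"] == "failed_not_logged_in":
    # A page behavior's content check determined the browser was not
    # logged in on a supported site, so the crawl was failed.
    print("Crawl failed: browser not logged in; update the login and retry the workflow.")
```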

View File

@@ -438,6 +438,10 @@ export class ConfigDetails extends BtrixElement {
msg("Include Any Linked Page (“one hop out”)"),
Boolean(config.extraHops),
)}
${this.renderSetting(
msg("Fail Crawl If Not Logged In"),
Boolean(config.failOnContentCheck),
)}
${when(
config.extraHops,
() => html`${this.renderLinkSelectors()}${this.renderExclusions()}`,
@@ -502,6 +506,10 @@ export class ConfigDetails extends BtrixElement {
msg("Check For Sitemap"),
Boolean(config.useSitemap),
)}
${this.renderSetting(
msg("Fail Crawl if Not Logged In"),
Boolean(config.failOnContentCheck),
)}
${this.renderLinkSelectors()}
${this.renderSetting(
msg("Additional Page URLs"),

View File

@@ -227,6 +227,16 @@ export class CrawlStatus extends TailwindElement {
label = msg("Failed");
break;
case "failed_not_logged_in":
color = "var(--danger)";
icon = html`<sl-icon
name="exclamation-triangle-fill"
slot="prefix"
style="color: ${color}"
></sl-icon>`;
label = msg("Failed: Not Logged In");
break;
case "skipped_storage_quota_reached":
color = "var(--danger)";
icon = html`<sl-icon

View File

@@ -1007,6 +1007,22 @@ export class WorkflowEditor extends BtrixElement {
msg(`If checked, the crawler will visit pages one link away.`),
false,
)}
${inputCol(html`
<sl-checkbox
name="failOnContentCheck"
?checked=${this.formState.failOnContentCheck}
>
${msg("Fail crawl if not logged in")}
</sl-checkbox>
`)}
${this.renderHelpTextCol(
html`${infoTextFor["failOnContentCheck"]}
${this.renderUserGuideLink({
hash: "fail-crawl-if-not-logged-in",
content: msg("More details"),
})}.`,
false,
)}
${when(this.formState.includeLinkedPages, () =>
this.renderLinkSelectors(),
)}
@@ -1495,6 +1511,22 @@ https://example.net`}
),
false,
)}
${inputCol(html`
<sl-checkbox
name="failOnContentCheck"
?checked=${this.formState.failOnContentCheck}
>
${msg("Fail crawl if not logged in")}
</sl-checkbox>
`)}
${this.renderHelpTextCol(
html`${infoTextFor["failOnContentCheck"]}
${this.renderUserGuideLink({
hash: "fail-crawl-if-not-logged-in",
content: msg("More details"),
})}.`,
false,
)}
${this.renderLinkSelectors()}
<div class="col-span-5">
@@ -3031,6 +3063,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
| "extraHops"
| "useSitemap"
| "failOnFailedSeed"
| "failOnContentCheck"
> {
const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;
@@ -3048,6 +3081,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
extraHops: this.formState.includeLinkedPages ? 1 : 0,
useSitemap: false,
failOnFailedSeed: this.formState.failOnFailedSeed,
failOnContentCheck: this.formState.failOnContentCheck,
};
return config;
@@ -3055,7 +3089,11 @@ https://archiveweb.page/images/${"logo.svg"}`}
private parseSeededConfig(): Pick<
CrawlConfigParams["config"],
"seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed"
| "seeds"
| "scopeType"
| "useSitemap"
| "failOnFailedSeed"
| "failOnContentCheck"
> {
const primarySeedUrl = this.formState.primarySeedUrl;
const includeUrlList = this.formState.customIncludeUrlList
@@ -3086,6 +3124,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
scopeType: this.formState.scopeType as ScopeType,
useSitemap: this.formState.useSitemap,
failOnFailedSeed: false,
failOnContentCheck: this.formState.failOnContentCheck,
};
return config;
}

View File

@@ -244,7 +244,8 @@ export class WorkflowListItem extends BtrixElement {
}
e.preventDefault();
await this.updateComplete;
const href = `/orgs/${this.orgSlugState}/workflows/${this.workflow?.id}/${this.workflow?.lastCrawlState === "failed" ? WorkflowTab.Logs : WorkflowTab.LatestCrawl}`;
const failedStates = ["failed", "failed_not_logged_in"];
const href = `/orgs/${this.orgSlugState}/workflows/${this.workflow?.id}/${failedStates.includes(this.workflow?.lastCrawlState || "") ? WorkflowTab.Logs : WorkflowTab.LatestCrawl}`;
this.navigate.to(href);
}}
>

View File

@@ -1909,6 +1909,7 @@ export class WorkflowDetail extends BtrixElement {
message = msg("This crawl cant be replayed since it was canceled.");
break;
case "failed":
case "failed_not_logged_in":
message = msg("This crawl cant be replayed because it failed.");
break;
default:
@@ -1920,7 +1921,9 @@
const actionButton = (workflow: Workflow) => {
if (!workflow.lastCrawlId) return;
if (workflow.lastCrawlState === "failed") {
const failedStates = ["failed", "failed_not_logged_in"];
if (failedStates.includes(workflow.lastCrawlState || "")) {
return html`<div class="mt-4">
<sl-button
size="small"

View File

@@ -76,6 +76,9 @@ export const infoTextFor = {
customBehavior: msg(
`Enable custom page actions with behavior scripts. You can specify any publicly accessible URL or public Git repository.`,
),
failOnContentCheck: msg(
`Fail the crawl if a page behavior detects the browser is not logged in on supported pages.`,
),
} as const satisfies Partial<Record<Field, string | TemplateResult>>;
export default infoTextFor;

View File

@@ -28,6 +28,7 @@ export const SUCCESSFUL_STATES = [
export const FAILED_STATES = [
"canceled",
"failed",
"failed_not_logged_in",
"skipped_storage_quota_reached",
"skipped_time_quota_reached",
] as const;

View File

@@ -44,6 +44,7 @@ export type SeedConfig = Expand<
extraHops?: number | null;
useSitemap?: boolean;
failOnFailedSeed?: boolean;
failOnContentCheck?: boolean;
depth?: number | null;
userAgent?: string | null;
selectLinks: string[];

View File

@@ -115,6 +115,7 @@ export type FormState = {
includeLinkedPages: boolean;
useSitemap: boolean;
failOnFailedSeed: boolean;
failOnContentCheck: boolean;
customIncludeUrlList: string;
crawlTimeoutMinutes: number;
behaviorTimeoutSeconds: number | null;
@@ -177,6 +178,7 @@ export const getDefaultFormState = (): FormState => ({
includeLinkedPages: false,
useSitemap: false,
failOnFailedSeed: false,
failOnContentCheck: false,
customIncludeUrlList: "",
crawlTimeoutMinutes: 0,
maxCrawlSizeGB: 0,
@@ -269,6 +271,7 @@ export function getInitialFormState(params: {
}
formState.failOnFailedSeed = seedsConfig.failOnFailedSeed;
formState.failOnContentCheck = seedsConfig.failOnContentCheck;
}
if (params.initialWorkflow.schedule) {
@@ -354,6 +357,8 @@ export function getInitialFormState(params: {
useSitemap: seedsConfig.useSitemap ?? defaultFormState.useSitemap,
failOnFailedSeed:
seedsConfig.failOnFailedSeed ?? defaultFormState.failOnFailedSeed,
failOnContentCheck:
seedsConfig.failOnContentCheck ?? defaultFormState.failOnContentCheck,
pageLimit:
params.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
autoscrollBehavior: params.initialWorkflow.config.behaviors