Add option to fail crawl if not logged in (#2754)
This PR adds a new checkbox to both page and seed crawl workflow types, which will fail the crawl if behaviors detect that the browser is not logged in for supported sites. Changes include:

- Backend support for the new crawler flag
- A new `failed_not_logged_in` crawl state
- Checkbox in the workflow editor and config details in the frontend (currently in the Scope section - I think it makes sense to have this option up front, but worth considering)
- User Guide documentation of the new option
- A new nightly test for the new workflow option and `failed_not_logged_in` state

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: sua yoo <sua@webrecorder.org>
parent feafeae1eb
commit 0c8c397fca
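As a rough illustration (not part of this diff), the new flag is just a boolean in the workflow's `config`, and the outcome surfaces as the crawl state. A minimal sketch against the crawlconfigs API, mirroring the nightly test below; the base URL, org ID, and auth header are placeholder assumptions for a deployment:

```python
import time
import requests

# Placeholders — substitute values for your own Browsertrix deployment
API_PREFIX = "https://app.browsertrix.example/api"
ORG_ID = "<org-uuid>"
HEADERS = {"Authorization": "Bearer <access-token>"}

# Create a page crawl workflow with the new flag enabled and run it immediately
crawl_data = {
    "runNow": True,
    "name": "Fail Crawl Not Logged In",
    "config": {
        "seeds": [{"url": "https://x.com/webrecorder_io"}],
        "scopeType": "page",
        "limit": 1,
        "failOnContentCheck": True,
    },
}
r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawlconfigs/", headers=HEADERS, json=crawl_data
)
crawl_id = r.json()["run_now_job"]

# Poll until the crawl reaches a terminal state
while True:
    r = requests.get(
        f"{API_PREFIX}/orgs/{ORG_ID}/crawls/{crawl_id}/replay.json", headers=HEADERS
    )
    state = r.json()["state"]
    if state in ("complete", "failed", "failed_not_logged_in"):
        print("final state:", state)
        break
    time.sleep(5)
```

A crawl launched this way against a site where the browser profile is logged out should end in `failed_not_logged_in` rather than plain `failed`.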
@@ -243,6 +243,7 @@ WAITING_STATES = get_args(TYPE_WAITING_STATES)
 TYPE_FAILED_STATES = Literal[
     "canceled",
     "failed",
+    "failed_not_logged_in",
     "skipped_storage_quota_reached",
     "skipped_time_quota_reached",
 ]
@@ -358,6 +359,7 @@ class RawCrawlConfig(BaseModel):
 
     useSitemap: Optional[bool] = False
     failOnFailedSeed: Optional[bool] = False
+    failOnContentCheck: Optional[bool] = False
 
     logging: Optional[str] = None
     behaviors: Optional[str] = "autoscroll,autoplay,autofetch,siteSpecific"
@@ -4,7 +4,7 @@ import traceback
 import os
 import math
 from pprint import pprint
-from typing import Optional, Any, Sequence
+from typing import Optional, Any, Sequence, Literal
 from datetime import datetime, timedelta
 from uuid import UUID
 
@@ -827,15 +827,26 @@ class CrawlOperator(BaseOperator):
         crawl: CrawlSpec,
         status: CrawlStatus,
         pods: dict,
-        stats: Optional[CrawlStats] = None,
+        stats: CrawlStats,
+        redis: Redis,
     ) -> bool:
         """Mark crawl as failed, log crawl state and print crawl logs, if possible"""
         prev_state = status.state
 
-        if not await self.mark_finished(crawl, status, "failed", stats=stats):
+        failed_state: Literal["failed", "failed_not_logged_in"] = "failed"
+
+        fail_reason = await redis.get(f"{crawl.id}:failReason")
+
+        if fail_reason == "not_logged_in":
+            failed_state = "failed_not_logged_in"
+
+        if not await self.mark_finished(crawl, status, failed_state, stats=stats):
             return False
 
-        if not self.log_failed_crawl_lines or prev_state == "failed":
+        if not self.log_failed_crawl_lines or prev_state in (
+            "failed",
+            "failed_not_logged_in",
+        ):
             return True
 
         pod_names = list(pods.keys())
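Conceptually, the operator change above reduces to a small mapping from the crawler-reported fail reason (read from the `{crawl.id}:failReason` Redis key) to a terminal crawl state. A standalone sketch for clarity; the helper name is illustrative and not part of this diff:

```python
from typing import Literal, Optional

FailedState = Literal["failed", "failed_not_logged_in"]


def resolve_failed_state(fail_reason: Optional[str]) -> FailedState:
    """Map the crawler-reported fail reason to a crawl state.

    Only "not_logged_in" is special-cased; any other (or missing) reason
    keeps the generic "failed" state.
    """
    if fail_reason == "not_logged_in":
        return "failed_not_logged_in"
    return "failed"


assert resolve_failed_state("not_logged_in") == "failed_not_logged_in"
assert resolve_failed_state(None) == "failed"
```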
@@ -1579,7 +1590,7 @@ class CrawlOperator(BaseOperator):
         # check if one-page crawls actually succeeded
         # if only one page found, and no files, assume failed
         if status.pagesFound == 1 and not status.filesAdded:
-            await self.fail_crawl(crawl, status, pods, stats)
+            await self.fail_crawl(crawl, status, pods, stats, redis)
             return status
 
         state: TYPE_NON_RUNNING_STATES
@@ -1602,7 +1613,7 @@ class CrawlOperator(BaseOperator):
             if status.stopping and not status.pagesDone:
                 await self.mark_finished(crawl, status, "canceled", stats)
             else:
-                await self.fail_crawl(crawl, status, pods, stats)
+                await self.fail_crawl(crawl, status, pods, stats, redis)
 
         # check for other statuses, default to "running"
         else:
backend/test_nightly/test_crawl_not_logged_in.py (new file, 89 lines)
@@ -0,0 +1,89 @@
+import time
+
+import pytest
+import requests
+
+from .conftest import API_PREFIX
+
+config_id = None
+
+
+@pytest.fixture(scope="session")
+def fail_not_logged_in_crawl_id(admin_auth_headers, default_org_id):
+    # Start crawl
+    crawl_data = {
+        "runNow": True,
+        "name": "Fail Crawl Not Logged In",
+        "config": {
+            "seeds": [{"url": "https://x.com/webrecorder_io"}],
+            "scopeType": "page",
+            "limit": 1,
+            "failOnContentCheck": True,
+        },
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+    data = r.json()
+
+    global config_id
+    config_id = data["id"]
+
+    crawl_id = data["run_now_job"]
+
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] == "running":
+            # Give crawl time to start properly
+            time.sleep(30)
+            return crawl_id
+        time.sleep(5)
+
+
+@pytest.fixture(scope="session")
+def failed_crawl_finished(
+    admin_auth_headers, default_org_id, fail_not_logged_in_crawl_id
+):
+    # Wait for crawl to complete
+    while True:
+        r = requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
+            headers=admin_auth_headers,
+        )
+        data = r.json()
+        if data["state"] in ("complete", "failed", "failed_not_logged_in"):
+            # Give some time for WACZ files to be stored
+            time.sleep(30)
+            break
+        time.sleep(5)
+
+
+def test_fail_crawl_not_logged_in(
+    admin_auth_headers,
+    default_org_id,
+    fail_not_logged_in_crawl_id,
+    failed_crawl_finished,
+):
+    # Ensure crawl has expected state
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["state"] == "failed_not_logged_in"
+
+    # Ensure workflow lastCrawlState has expected state
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["lastCrawlState"] == "failed_not_logged_in"
@@ -83,6 +83,12 @@ When enabled, the crawler will visit all the links it finds within each page def
 ??? example "Crawling tags & search queries with Page List crawls"
     This setting can be useful for crawling the content of specific tags or search queries. Specify the tag or search query URL(s) in the _Crawl URL(s)_ field, e.g: `https://example.com/search?q=tag`, and enable _Include Any Linked Page_ to crawl all the content present on that search query page.
 
+### Fail Crawl if Not Logged In
+
+When enabled, the crawl will fail if a [page behavior](#page-behavior) detects the presence or absence of content on supported pages indicating that the browser is not logged in.
+
+For details about which websites are supported and how to add this functionality to your own [custom behaviors](#use-custom-behaviors), see the [Browsertrix Crawler documentation for Fail on Content Check](https://crawler.docs.browsertrix.com/user-guide/behaviors/#fail-on-content-check).
+
 ### Fail Crawl on Failed URL
 
 When enabled, the crawler will fail the entire crawl if any of the provided URLs are invalid or unsuccessfully crawled. The resulting archived item will have a status of "Failed".
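The new state also propagates to the workflow's `lastCrawlState`, which is what the nightly test asserts. A hedged sketch of detecting it via the API (base URL, IDs, and auth header are placeholder assumptions):

```python
import requests

# Placeholders — substitute values for your own Browsertrix deployment
API_PREFIX = "https://app.browsertrix.example/api"
ORG_ID = "<org-uuid>"
WORKFLOW_ID = "<crawlconfig-uuid>"
HEADERS = {"Authorization": "Bearer <access-token>"}

r = requests.get(
    f"{API_PREFIX}/orgs/{ORG_ID}/crawlconfigs/{WORKFLOW_ID}", headers=HEADERS
)
r.raise_for_status()

# "failed_not_logged_in" (rather than plain "failed") indicates the content
# check tripped — typically a sign the browser profile's login has expired.
if r.json().get("lastCrawlState") == "failed_not_logged_in":
    print("Last crawl failed because the browser was not logged in.")
```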
@@ -438,6 +438,10 @@ export class ConfigDetails extends BtrixElement {
           msg("Include Any Linked Page (“one hop out”)"),
           Boolean(config.extraHops),
         )}
+        ${this.renderSetting(
+          msg("Fail Crawl If Not Logged In"),
+          Boolean(config.failOnContentCheck),
+        )}
         ${when(
           config.extraHops,
           () => html`${this.renderLinkSelectors()}${this.renderExclusions()}`,
@@ -502,6 +506,10 @@ export class ConfigDetails extends BtrixElement {
           msg("Check For Sitemap"),
           Boolean(config.useSitemap),
         )}
+        ${this.renderSetting(
+          msg("Fail Crawl if Not Logged In"),
+          Boolean(config.failOnContentCheck),
+        )}
         ${this.renderLinkSelectors()}
         ${this.renderSetting(
           msg("Additional Page URLs"),
@@ -227,6 +227,16 @@ export class CrawlStatus extends TailwindElement {
         label = msg("Failed");
         break;
 
+      case "failed_not_logged_in":
+        color = "var(--danger)";
+        icon = html`<sl-icon
+          name="exclamation-triangle-fill"
+          slot="prefix"
+          style="color: ${color}"
+        ></sl-icon>`;
+        label = msg("Failed: Not Logged In");
+        break;
+
       case "skipped_storage_quota_reached":
         color = "var(--danger)";
         icon = html`<sl-icon
@@ -1007,6 +1007,22 @@ export class WorkflowEditor extends BtrixElement {
             msg(`If checked, the crawler will visit pages one link away.`),
             false,
           )}
+          ${inputCol(html`
+            <sl-checkbox
+              name="failOnContentCheck"
+              ?checked=${this.formState.failOnContentCheck}
+            >
+              ${msg("Fail crawl if not logged in")}
+            </sl-checkbox>
+          `)}
+          ${this.renderHelpTextCol(
+            html`${infoTextFor["failOnContentCheck"]}
+            ${this.renderUserGuideLink({
+              hash: "fail-crawl-if-not-logged-in",
+              content: msg("More details"),
+            })}.`,
+            false,
+          )}
           ${when(this.formState.includeLinkedPages, () =>
             this.renderLinkSelectors(),
           )}
@@ -1495,6 +1511,22 @@ https://example.net`}
             ),
             false,
           )}
+          ${inputCol(html`
+            <sl-checkbox
+              name="failOnContentCheck"
+              ?checked=${this.formState.failOnContentCheck}
+            >
+              ${msg("Fail crawl if not logged in")}
+            </sl-checkbox>
+          `)}
+          ${this.renderHelpTextCol(
+            html`${infoTextFor["failOnContentCheck"]}
+            ${this.renderUserGuideLink({
+              hash: "fail-crawl-if-not-logged-in",
+              content: msg("More details"),
+            })}.`,
+            false,
+          )}
           ${this.renderLinkSelectors()}
 
           <div class="col-span-5">
@@ -3031,6 +3063,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
     | "extraHops"
     | "useSitemap"
     | "failOnFailedSeed"
+    | "failOnContentCheck"
   > {
     const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;
 
@@ -3048,6 +3081,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
       extraHops: this.formState.includeLinkedPages ? 1 : 0,
       useSitemap: false,
       failOnFailedSeed: this.formState.failOnFailedSeed,
+      failOnContentCheck: this.formState.failOnContentCheck,
     };
 
     return config;
@@ -3055,7 +3089,11 @@ https://archiveweb.page/images/${"logo.svg"}`}
 
   private parseSeededConfig(): Pick<
     CrawlConfigParams["config"],
-    "seeds" | "scopeType" | "useSitemap" | "failOnFailedSeed"
+    | "seeds"
+    | "scopeType"
+    | "useSitemap"
+    | "failOnFailedSeed"
+    | "failOnContentCheck"
   > {
     const primarySeedUrl = this.formState.primarySeedUrl;
     const includeUrlList = this.formState.customIncludeUrlList
@@ -3086,6 +3124,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
       scopeType: this.formState.scopeType as ScopeType,
       useSitemap: this.formState.useSitemap,
       failOnFailedSeed: false,
+      failOnContentCheck: this.formState.failOnContentCheck,
     };
     return config;
   }
@@ -244,7 +244,8 @@ export class WorkflowListItem extends BtrixElement {
             }
             e.preventDefault();
             await this.updateComplete;
-            const href = `/orgs/${this.orgSlugState}/workflows/${this.workflow?.id}/${this.workflow?.lastCrawlState === "failed" ? WorkflowTab.Logs : WorkflowTab.LatestCrawl}`;
+            const failedStates = ["failed", "failed_not_logged_in"];
+            const href = `/orgs/${this.orgSlugState}/workflows/${this.workflow?.id}/${failedStates.includes(this.workflow?.lastCrawlState || "") ? WorkflowTab.Logs : WorkflowTab.LatestCrawl}`;
             this.navigate.to(href);
           }}
         >
@@ -1909,6 +1909,7 @@ export class WorkflowDetail extends BtrixElement {
         message = msg("This crawl can’t be replayed since it was canceled.");
         break;
       case "failed":
+      case "failed_not_logged_in":
         message = msg("This crawl can’t be replayed because it failed.");
         break;
       default:
@@ -1920,7 +1921,9 @@
     const actionButton = (workflow: Workflow) => {
       if (!workflow.lastCrawlId) return;
 
-      if (workflow.lastCrawlState === "failed") {
+      const failedStates = ["failed", "failed_not_logged_in"];
+
+      if (failedStates.includes(workflow.lastCrawlState || "")) {
         return html`<div class="mt-4">
           <sl-button
             size="small"
@@ -76,6 +76,9 @@ export const infoTextFor = {
   customBehavior: msg(
     `Enable custom page actions with behavior scripts. You can specify any publicly accessible URL or public Git repository.`,
   ),
+  failOnContentCheck: msg(
+    `Fail the crawl if a page behavior detects the browser is not logged in on supported pages.`,
+  ),
 } as const satisfies Partial<Record<Field, string | TemplateResult>>;
 
 export default infoTextFor;
@@ -28,6 +28,7 @@ export const SUCCESSFUL_STATES = [
 export const FAILED_STATES = [
   "canceled",
   "failed",
+  "failed_not_logged_in",
   "skipped_storage_quota_reached",
   "skipped_time_quota_reached",
 ] as const;
@@ -44,6 +44,7 @@ export type SeedConfig = Expand<
     extraHops?: number | null;
     useSitemap?: boolean;
     failOnFailedSeed?: boolean;
+    failOnContentCheck?: boolean;
     depth?: number | null;
     userAgent?: string | null;
     selectLinks: string[];
@@ -115,6 +115,7 @@ export type FormState = {
   includeLinkedPages: boolean;
   useSitemap: boolean;
   failOnFailedSeed: boolean;
+  failOnContentCheck: boolean;
   customIncludeUrlList: string;
   crawlTimeoutMinutes: number;
   behaviorTimeoutSeconds: number | null;
@@ -177,6 +178,7 @@ export const getDefaultFormState = (): FormState => ({
   includeLinkedPages: false,
   useSitemap: false,
   failOnFailedSeed: false,
+  failOnContentCheck: false,
   customIncludeUrlList: "",
   crawlTimeoutMinutes: 0,
   maxCrawlSizeGB: 0,
@@ -269,6 +271,7 @@ export function getInitialFormState(params: {
     }
 
     formState.failOnFailedSeed = seedsConfig.failOnFailedSeed;
+    formState.failOnContentCheck = seedsConfig.failOnContentCheck;
   }
 
   if (params.initialWorkflow.schedule) {
@@ -354,6 +357,8 @@
     useSitemap: seedsConfig.useSitemap ?? defaultFormState.useSitemap,
     failOnFailedSeed:
       seedsConfig.failOnFailedSeed ?? defaultFormState.failOnFailedSeed,
+    failOnContentCheck:
+      seedsConfig.failOnContentCheck ?? defaultFormState.failOnContentCheck,
     pageLimit:
       params.initialWorkflow.config.limit ?? defaultFormState.pageLimit,
     autoscrollBehavior: params.initialWorkflow.config.behaviors