browsertrix/backend/test_nightly/test_crawl_not_logged_in.py
Tessa Walsh 0c8c397fca
Add option to fail crawl if not logged in (#2754)
This PR adds a new checkbox to both the page and seed crawl workflow
types, which fails the crawl if behaviors detect that the browser is
not logged in on supported sites.

Changes include:

- Backend support for the new crawler flag (a minimal sketch of the
flag in a workflow config follows this list)
- A new `failed_not_logged_in` crawl state
- Checkbox in the workflow editor and config details in the frontend
(currently in the Scope section; I think it makes sense to have this
option up front, but worth considering)
- User Guide documentation of the new option
- A new nightly test for the new workflow option and the
`failed_not_logged_in` state
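
A minimal sketch of the flag in a workflow config, as exercised by the
nightly test below (the seed URL here is illustrative):

    # If behaviors detect the browser is not logged in, the crawl
    # ends in the new "failed_not_logged_in" state.
    crawl_data = {
        "runNow": True,
        "config": {
            "seeds": [{"url": "https://example.com/account"}],  # illustrative
            "scopeType": "page",
            "limit": 1,
            "failOnContentCheck": True,
        },
    }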


---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: sua yoo <sua@webrecorder.org>
2025-07-28 22:58:43 -07:00

import time

import pytest
import requests

from .conftest import API_PREFIX

config_id = None


@pytest.fixture(scope="session")
def fail_not_logged_in_crawl_id(admin_auth_headers, default_org_id):
    # Start crawl from a workflow with failOnContentCheck enabled
    crawl_data = {
        "runNow": True,
        "name": "Fail Crawl Not Logged In",
        "config": {
            "seeds": [{"url": "https://x.com/webrecorder_io"}],
            "scopeType": "page",
            "limit": 1,
            "failOnContentCheck": True,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global config_id
    config_id = data["id"]

    crawl_id = data["run_now_job"]

    # Wait for the crawl to start before handing its id to the tests
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "running":
            # Give crawl time to start properly
            time.sleep(30)
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def failed_crawl_finished(
    admin_auth_headers, default_org_id, fail_not_logged_in_crawl_id
):
    # Wait for crawl to finish in one of the terminal states
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] in ("complete", "failed", "failed_not_logged_in"):
            # Give some time for WACZ files to be stored
            time.sleep(30)
            break
        time.sleep(5)


def test_fail_crawl_not_logged_in(
    admin_auth_headers,
    default_org_id,
    fail_not_logged_in_crawl_id,
    failed_crawl_finished,
):
    # Ensure crawl has expected state
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["state"] == "failed_not_logged_in"

    # Ensure workflow lastCrawlState has expected state
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["lastCrawlState"] == "failed_not_logged_in"