This PR adds a new checkbox to both page and seed crawl workflow types that fails the crawl if behaviors detect the browser is not logged in for supported sites. Changes include:

- Backend support for the new crawler flag
- A new `failed_not_logged_in` crawl state
- A checkbox in the workflow editor and config details in the frontend (currently in the Scope section; I think it makes sense to have this option up front, but it's worth considering)
- User Guide documentation of the new option
- A new nightly test for the new workflow option and the `failed_not_logged_in` state

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Co-authored-by: sua yoo <sua@webrecorder.org>
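For reference, a minimal sketch of a workflow payload with the new option enabled. The `failOnContentCheck` field name and the resulting `failed_not_logged_in` state are taken from the nightly test below; the seed URL and workflow name here are placeholders.

```python
# Sketch only: enable the new option on a page crawl workflow.
# "failOnContentCheck" matches the flag exercised by the nightly test below;
# the seed URL and name are placeholders, not part of the test.
example_workflow = {
    "runNow": True,
    "name": "Example: fail crawl if not logged in",
    "config": {
        "seeds": [{"url": "https://example.com/account"}],
        "scopeType": "page",
        "failOnContentCheck": True,
    },
}
# When the content check detects a logged-out browser, the crawl (and the
# workflow's lastCrawlState) ends in the "failed_not_logged_in" state.
```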
import time

import pytest
import requests

from .conftest import API_PREFIX

# Set by the fixture below so the test can also check the workflow's lastCrawlState
config_id = None


@pytest.fixture(scope="session")
def fail_not_logged_in_crawl_id(admin_auth_headers, default_org_id):
    # Start crawl of a site where the browser won't be logged in (x.com),
    # with the new failOnContentCheck option enabled
    crawl_data = {
        "runNow": True,
        "name": "Fail Crawl Not Logged In",
        "config": {
            "seeds": [{"url": "https://x.com/webrecorder_io"}],
            "scopeType": "page",
            "limit": 1,
            "failOnContentCheck": True,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global config_id
    config_id = data["id"]

    crawl_id = data["run_now_job"]

    # Poll until the crawl is running before handing it to the tests
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "running":
            # Give crawl time to start properly
            time.sleep(30)
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def failed_crawl_finished(
    admin_auth_headers, default_org_id, fail_not_logged_in_crawl_id
):
    # Wait for crawl to complete
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] in ("complete", "failed", "failed_not_logged_in"):
            # Give some time for WACZ files to be stored
            time.sleep(30)
            break
        time.sleep(5)


def test_fail_crawl_not_logged_in(
    admin_auth_headers,
    default_org_id,
    fail_not_logged_in_crawl_id,
    failed_crawl_finished,
):
    # Ensure crawl has expected state
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{fail_not_logged_in_crawl_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["state"] == "failed_not_logged_in"

    # Ensure workflow lastCrawlState has expected state
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{config_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["lastCrawlState"] == "failed_not_logged_in"