browsertrix/backend/test_nightly/conftest.py
Ilya Kreymer 12f7db3ae2
tests: fixes for crawl cancel + crawl stopped (#864)
* tests:
- fix the cancel-crawl test by ensuring the state is neither running nor waiting
- fix the stop-crawl test by initiating the stop only after at least one page has been crawled;
otherwise the result may be 'failed', since no crawl data has been written yet (separate fix in the
crawler to avoid looping if stopped before any data is written, webrecorder/browsertrix-crawler#314)
- bump the page limit to 4 for tests so the crawl is only partially complete, not fully complete, when stopped
- allow canceled or partial_complete due to a race condition

* chart: bump frontend limits in the defaults, not just for tests (addresses #780)

* crawl stopped before starting:
- if a crawl is stopped before it has started, mark it as canceled
- add a test for stopping immediately, which should result in a 'canceled' crawl
- attempt to increase the resync interval for immediate failure
- nightly tests: increase the page limit to test the timeout

* backend:
- detect a stopped-before-start crawl as 'failed' instead of 'done'
- stats: return stats counters as int instead of string
2023-05-22 20:17:29 -07:00


import time

import pytest
import requests

HOST_PREFIX = "http://127.0.0.1:30870"
API_PREFIX = HOST_PREFIX + "/api"

ADMIN_USERNAME = "admin@example.com"
ADMIN_PW = "PASSW0RD!"


@pytest.fixture(scope="session")
def admin_auth_headers():
    # Retry until the backend is up and the admin login succeeds.
    while True:
        r = requests.post(
            f"{API_PREFIX}/auth/jwt/login",
            data={
                "username": ADMIN_USERNAME,
                "password": ADMIN_PW,
                "grant_type": "password",
            },
        )
        try:
            data = r.json()
            return {"Authorization": f"Bearer {data['access_token']}"}
        except (ValueError, KeyError):
            print("Waiting for admin_auth_headers")
            time.sleep(5)


@pytest.fixture(scope="session")
def default_org_id(admin_auth_headers):
    while True:
        r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
        try:
            for org in r.json()["items"]:
                if org["default"] is True:
                    return org["id"]
        except (ValueError, KeyError):
            print("Waiting for default org id")
        # Sleep outside the except clause so a response that has no
        # default org yet does not busy-loop.
        time.sleep(5)
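

# Every fixture below polls /crawls/{crawl_id}/replay.json in the same
# copy-pasted loop. A shared helper along these lines could replace them;
# this is a sketch, not part of the original file, and the name
# wait_for_crawl_state is illustrative.
def wait_for_crawl_state(org_id, crawl_id, headers, target="complete", poll=5):
    """Poll the crawl's replay.json until it reaches the target state."""
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
            headers=headers,
        )
        if r.json()["state"] == target:
            return crawl_id
        time.sleep(poll)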


@pytest.fixture(scope="session")
def crawl_id_wr(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Webrecorder admin test crawl",
        "tags": ["wr", "nightly testing"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()
    crawl_id = data["run_now_job"]

    # Wait for it to complete and then return the crawl ID.
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def crawl_id_wr_specs(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Webrecorder Specs admin test crawl",
        "tags": ["wr-specs", "nightly testing"],
        "config": {
            "seeds": [{"url": "https://specs.webrecorder.net/"}],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()
    crawl_id = data["run_now_job"]

    # Wait for it to complete and then return the crawl ID.
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def crawl_config_info(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Crawl config test",
        "config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1},
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()
    crawl_config_id = data["added"]
    crawl_id = data["run_now_job"]

    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            break
        time.sleep(5)

    # Run a second crawl from the crawlconfig and return info when it finishes.
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}/run",
        headers=admin_auth_headers,
    )
    data = r.json()
    second_crawl_id = data["started"]

    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{second_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return (crawl_config_id, crawl_id, second_crawl_id)
        time.sleep(5)


@pytest.fixture(scope="session")
def large_crawl_id(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Large Test Crawl",
        "tags": ["wacz-logs"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "scopeType": "domain",
            "limit": 100,
            "extraHops": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()
    crawl_id = data["run_now_job"]

    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "running":
            # Give the crawl time to start properly.
            time.sleep(30)
            return crawl_id
        time.sleep(5)
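

# With the wait_for_crawl_state sketch above, the polling loop in
# large_crawl_id could reduce to the following (illustrative only; the
# extra 30-second sleep matches the fixture's own startup grace period):
#
#     wait_for_crawl_state(
#         default_org_id, crawl_id, admin_auth_headers, target="running"
#     )
#     time.sleep(30)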


@pytest.fixture(scope="session")
def large_crawl_finished(admin_auth_headers, default_org_id, large_crawl_id):
    # Wait for the crawl to complete. This fixture returns nothing; it only
    # synchronizes tests that need the finished crawl.
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{large_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            # Give some time for WACZ files to be stored.
            time.sleep(30)
            break
        time.sleep(5)


@pytest.fixture(scope="session")
def timeout_crawl(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Crawl with crawl timeout",
        "crawlTimeout": 30,
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "scopeType": "domain",
            "limit": 100,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()
    # Return the crawl id immediately; the test itself observes the timeout.
    return data["run_now_job"]
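

# Per the commit message above, a crawl cut short by crawlTimeout may end
# as either "canceled" or "partial_complete" depending on a race. A test
# consuming the fixture above might therefore accept both final states;
# this is an illustrative sketch, not part of the original file.
def _final_timeout_state(org_id, crawl_id, headers):
    """Poll until the timed-out crawl settles, then return its state."""
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
            headers=headers,
        )
        state = r.json()["state"]
        if state in ("canceled", "partial_complete"):
            return state
        time.sleep(5)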


@pytest.fixture(scope="session")
def error_crawl_id(admin_auth_headers, default_org_id):
    crawl_data = {
        "runNow": True,
        "name": "Youtube crawl with errors",
        "config": {
            "seeds": [
                {"url": "https://invalid.webrecorder.net/"},
                {"url": "https://invalid-2.webrecorder.net/"},
            ],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()
    crawl_id = data["run_now_job"]

    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)
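

# The fixtures above also share the "create a config with runNow and read
# back the new crawl id" POST. A companion sketch to wait_for_crawl_state
# (again illustrative, not part of the original file):
def start_crawl(org_id, crawl_data, headers):
    """POST a crawl config with runNow=True and return the new crawl id."""
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/",
        headers=headers,
        json=crawl_data,
    )
    return r.json()["run_now_job"]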