browsertrix/backend/test/test_qa.py
Ilya Kreymer 4360e0c1b5
Update tests with latest crawler (#1711)
tests: use 'latest' crawler release for testing, now that 1.1.x is released.
2024-04-20 15:56:26 -07:00

578 lines
16 KiB
Python

from .conftest import API_PREFIX, HOST_PREFIX
import requests
import time
from datetime import datetime
import pytest
# Upper bound on polling attempts used by the wait loops in this module.
# Each unsuccessful attempt is followed by ``time.sleep(5)``, and the loops
# fail the test once this many attempts have been made without success.
MAX_ATTEMPTS = 24
@pytest.fixture(scope="module")
def qa_run_id(crawler_crawl_id, crawler_auth_headers, default_org_id):
    """Start a QA run for the crawl and return the id reported under "started".

    Module-scoped, so the QA run is started once and shared by every test
    in this module that requests the fixture.
    """
    start_url = (
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/start"
    )
    response = requests.post(start_url, headers=crawler_auth_headers)
    assert response.status_code == 200

    started_id = response.json()["started"]
    # The API must hand back a non-empty id for the newly started run.
    assert started_id
    return started_id
@pytest.fixture(scope="module")
def qa_run_pages_ready(
    crawler_crawl_id, crawler_auth_headers, default_org_id, qa_run_id
):
    """Block until the QA run is no longer active and its pages are listed.

    Returns nothing; tests request this fixture purely for its waiting
    side effect. Each phase polls up to MAX_ATTEMPTS times, sleeping five
    seconds between attempts, and fails on the last unsuccessful attempt.
    """
    qa_base = f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa"

    # Phase 1: wait until activeQA reports no running QA.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        active = requests.get(
            f"{qa_base}/activeQA", headers=crawler_auth_headers
        ).json()
        if not active["qa"]:
            break
        # Out of attempts: fail instead of sleeping again.
        assert attempt != MAX_ATTEMPTS
        time.sleep(5)

    # Phase 2: wait until the QA run exposes at least one page.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        pages = requests.get(
            f"{qa_base}/{qa_run_id}/pages", headers=crawler_auth_headers
        ).json()
        if len(pages["items"]) > 0:
            break
        assert attempt != MAX_ATTEMPTS
        time.sleep(5)
@pytest.fixture(scope="module")
def failed_qa_run_id(crawler_crawl_id, crawler_auth_headers, default_org_id):
    """Start a QA run, exercise the API while it is active, cancel it, and
    return its id once the QA list reports it as "canceled".

    While the run is active this fixture also checks that a second start is
    rejected, that activeQA describes the run, and that sorting crawls by
    qaState puts this crawl first on both listing endpoints.
    """
    org_base = f"{API_PREFIX}/orgs/{default_org_id}"
    qa_base = f"{org_base}/crawls/{crawler_crawl_id}/qa"

    response = requests.post(f"{qa_base}/start", headers=crawler_auth_headers)
    assert response.status_code == 200
    run_id = response.json()["started"]
    assert run_id

    # Wait until the run is reported as running.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        active = requests.get(
            f"{qa_base}/activeQA", headers=crawler_auth_headers
        ).json()
        active_qa = active.get("qa")
        if active_qa and active_qa.get("state") == "running":
            break
        # Out of attempts: fail instead of sleeping again.
        assert attempt != MAX_ATTEMPTS
        time.sleep(5)

    # A second QA run cannot be started while this one is running.
    response = requests.post(f"{qa_base}/start", headers=crawler_auth_headers)
    assert response.status_code == 400
    assert response.json()["detail"] == "qa_already_running"

    # activeQA describes a started, unfinished run.
    response = requests.get(f"{qa_base}/activeQA", headers=crawler_auth_headers)
    running_qa = response.json()["qa"]
    assert running_qa
    assert running_qa["state"]
    assert running_qa["started"]
    assert not running_qa["finished"]

    # Sorting by qaState floats this crawl to the top, first on the crawls
    # listing and then on the all-crawls listing.
    for listing in ("crawls", "all-crawls"):
        response = requests.get(
            f"{org_base}/{listing}?sortBy=qaState",
            headers=crawler_auth_headers,
        )
        assert response.status_code == 200
        first = response.json()["items"][0]
        assert first["id"] == crawler_crawl_id
        assert first["activeQAState"]
        assert first["activeQAStats"]
        assert first["lastQAState"]

    # Cancel the QA run.
    response = requests.post(f"{qa_base}/cancel", headers=crawler_auth_headers)
    assert response.status_code == 200
    assert response.json()["success"]

    # Wait for the run to show up as canceled in the QA run list.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        response = requests.get(qa_base, headers=crawler_auth_headers)
        assert response.status_code == 200
        states = [
            run.get("state")
            for run in response.json()
            if run.get("id") == run_id
        ]
        if states and states[0] == "canceled":
            break
        assert attempt != MAX_ATTEMPTS
        time.sleep(5)

    return run_id
def test_qa_completed(
    crawler_crawl_id, crawler_auth_headers, default_org_id, qa_run_pages_ready
):
    """Every listed QA run is finished, with one page found and one done."""
    response = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa",
        headers=crawler_auth_headers,
    )
    qa_runs = response.json()
    assert len(qa_runs) >= 1

    for run in qa_runs:
        assert run
        # These fields must all be present and non-empty.
        for field in ("state", "started", "finished"):
            assert run[field]
        stats = run["stats"]
        assert stats["found"] == 1
        assert stats["done"] == 1
        assert run["crawlExecSeconds"] > 0
def test_qa_org_stats(
    crawler_crawl_id, crawler_auth_headers, default_org_id, qa_run_pages_ready
):
    """QA execution time is recorded on both the crawl and the org.

    The crawl must report a positive ``qaCrawlExecSeconds``. The org must
    report positive ``qaCrawlExecSeconds`` and ``qaUsage`` entries under the
    current UTC ``YYYY-MM`` key.
    """
    # Function-local import so this block does not depend on changes to the
    # module's import section.
    from datetime import timezone

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
        headers=crawler_auth_headers,
    )
    crawl_stats = r.json()
    assert crawl_stats["qaCrawlExecSeconds"] > 0

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}",
        headers=crawler_auth_headers,
    )
    org_stats = r.json()

    # datetime.utcnow() is deprecated since Python 3.12; an aware UTC
    # datetime formats to the same "%Y-%m" string.
    yymm = datetime.now(timezone.utc).strftime("%Y-%m")
    assert org_stats["qaCrawlExecSeconds"][yymm] > 0
    assert org_stats["qaUsage"][yymm] > 0
def test_qa_page_data(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    qa_run_id,
    qa_run_pages_ready,
):
    """The QA page list and the single-page endpoint return the same data."""
    pages_url = (
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}"
        f"/qa/{qa_run_id}/pages"
    )
    expected_fields = {
        "title": "Webrecorder",
        "url": "https://webrecorder.net/",
        "mime": "text/html",
        "status": 200,
    }
    expected_qa = {
        "textMatch": 1.0,
        "screenshotMatch": 1.0,
        "resourceCounts": {
            "crawlGood": 16,
            "crawlBad": 0,
            "replayGood": 15,
            "replayBad": 1,
        },
    }

    def check_page(page):
        # Identical expectations apply to the listed page and the detail page.
        assert page["id"]
        for key, value in expected_fields.items():
            assert page[key] == value
        for key, value in expected_qa.items():
            assert page["qa"][key] == value

    listing = requests.get(pages_url, headers=crawler_auth_headers).json()
    assert len(listing["items"]) == 1
    listed_page = listing["items"][0]
    page_id = listed_page["id"]
    check_page(listed_page)

    detail_page = requests.get(
        f"{pages_url}/{page_id}", headers=crawler_auth_headers
    ).json()
    check_page(detail_page)
def test_qa_replay(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    qa_run_id,
    qa_run_pages_ready,
):
    """replay.json for the QA run lists exactly one resource with a path."""
    replay_url = (
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}"
        f"/qa/{qa_run_id}/replay.json"
    )
    resources = requests.get(replay_url, headers=crawler_auth_headers).json()[
        "resources"
    ]
    assert len(resources) == 1
    assert resources[0]["path"]
def test_qa_stats(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    qa_run_id,
    qa_run_pages_ready,
):
    """The QA stats endpoint buckets match scores by the given thresholds."""
    # We'll want to improve this test by having more pages to test
    # if we can figure out stable page scores to test against
    stats_url = (
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}"
        f"/qa/{qa_run_id}/stats"
    )
    expected_buckets = [
        {"lowerBoundary": "0.0", "count": 0},
        {"lowerBoundary": "0.7", "count": 0},
        {"lowerBoundary": "0.9", "count": 1},
    ]

    # The same buckets are expected with the 0 boundary implied and with it
    # given explicitly.
    for thresholds in ("0.7,0.9", "0,0.7,0.9"):
        response = requests.get(
            f"{stats_url}?screenshotThresholds={thresholds}"
            f"&textThresholds={thresholds}",
            headers=crawler_auth_headers,
        )
        assert response.status_code == 200
        histogram = response.json()
        assert histogram["screenshotMatch"] == expected_buckets
        assert histogram["textMatch"] == expected_buckets

    # Leaving out textThresholds is rejected with a 422 validation error.
    missing = requests.get(
        f"{stats_url}?screenshotThresholds=0.7",
        headers=crawler_auth_headers,
    )
    assert missing.status_code == 422
    assert missing.json()["detail"][0]["msg"] == "field required"

    # An invalid threshold value is rejected with a 400.
    invalid = requests.get(
        f"{stats_url}?screenshotThresholds=0.7&textThresholds=null",
        headers=crawler_auth_headers,
    )
    assert invalid.status_code == 400
    assert invalid.json()["detail"] == "invalid_thresholds"
def test_run_qa_not_running(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    failed_qa_run_id,
    qa_run_pages_ready,
):
    """Stopping QA when no run is active is rejected with ``qa_not_running``."""
    # The activeQA endpoint lives under ".../qa/", as in the fixtures of this
    # module. Without that segment the response carries no "qa" key, so the
    # wait below would exit at once without ever observing the QA state.
    active_qa_url = (
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA"
    )

    # Make sure no active QA is running
    for attempt in range(1, MAX_ATTEMPTS + 1):
        r = requests.get(active_qa_url, headers=crawler_auth_headers)
        # An error body must not be mistaken for "no active QA".
        assert r.status_code == 200
        if r.json().get("qa") is None:
            break
        if attempt == MAX_ATTEMPTS:
            pytest.fail("QA run was still active after the maximum wait")
        time.sleep(5)

    # Try to stop when there's no running QA run
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/stop",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "qa_not_running"
def test_failed_qa_run(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    failed_qa_run_id,
    qa_run_pages_ready,
):
    """The canceled run is listed by default and hidden by skipFailed=true."""
    qa_list_url = f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa"

    # Without filtering, both the completed and the canceled run are listed.
    all_runs = requests.get(qa_list_url, headers=crawler_auth_headers).json()
    assert len(all_runs) == 2

    canceled_matches = [
        run for run in all_runs if run.get("id") == failed_qa_run_id
    ]
    canceled_run = canceled_matches[0]
    assert canceled_run
    assert canceled_run["state"] == "canceled"
    for field in ("started", "finished", "stats"):
        assert canceled_run[field]
    assert canceled_run["crawlExecSeconds"] >= 0

    # With skipFailed, only the completed run remains.
    kept_runs = requests.get(
        f"{qa_list_url}?skipFailed=true", headers=crawler_auth_headers
    ).json()
    assert len(kept_runs) == 1

    completed_run = kept_runs[0]
    assert completed_run
    assert completed_run["state"] == "complete"
    assert completed_run["started"]
    assert completed_run["finished"]
    assert completed_run["stats"]["found"] == 1
    assert completed_run["stats"]["done"] == 1
    assert completed_run["crawlExecSeconds"] > 0
def test_sort_crawls_by_qa_runs(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    failed_qa_run_id,
    qa_run_pages_ready,
):
    """Sorting by qaRunCount works in both directions on both listings.

    The same check runs against the ``crawls`` and ``all-crawls`` endpoints,
    first with the default sort direction and then with ``sortDirection=1``.
    """
    org_url = f"{API_PREFIX}/orgs/{default_org_id}"

    for listing in ("crawls", "all-crawls"):
        # Default direction: the QA'd crawl comes first.
        _assert_crawls_sorted_by_qa_run_count(
            f"{org_url}/{listing}?sortBy=qaRunCount",
            crawler_auth_headers,
            crawler_crawl_id,
            ascending=False,
        )
        # Ascending sort: the QA'd crawl comes last.
        _assert_crawls_sorted_by_qa_run_count(
            f"{org_url}/{listing}?sortBy=qaRunCount&sortDirection=1",
            crawler_auth_headers,
            crawler_crawl_id,
            ascending=True,
        )


def _assert_crawls_sorted_by_qa_run_count(url, headers, crawl_id, *, ascending):
    """Fetch ``url`` and assert its items are ordered by ``qaRunCount``.

    The crawl ``crawl_id`` must sit at the extreme end of the listing (last
    when ``ascending``, first otherwise) with a positive ``qaRunCount``.
    Every other crawl must carry an integer ``qaRunCount`` that keeps the
    sequence monotonic in the requested direction.
    """
    r = requests.get(url, headers=headers)
    assert r.status_code == 200
    crawls = r.json()["items"]

    extreme = crawls[-1] if ascending else crawls[0]
    assert extreme["id"] == crawl_id
    assert extreme["qaRunCount"] > 0

    # Ascending walks up from zero; descending walks down from the top count.
    last_count = 0 if ascending else extreme["qaRunCount"]
    for crawl in crawls:
        if crawl["id"] == crawl_id:
            continue
        crawl_qa_count = crawl["qaRunCount"]
        assert isinstance(crawl_qa_count, int)
        if ascending:
            assert crawl_qa_count >= last_count
        else:
            assert crawl_qa_count <= last_count
        last_count = crawl_qa_count
def test_delete_qa_runs(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    qa_run_id,
    qa_run_pages_ready,
    failed_qa_run_id,
):
    """Deleting both QA runs empties the run list and clears page QA data."""
    qa_base = f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa"
    doomed_runs = (qa_run_id, failed_qa_run_id)

    response = requests.post(
        f"{qa_base}/delete",
        json={"qa_run_ids": [qa_run_id, failed_qa_run_id]},
        headers=crawler_auth_headers,
    )
    assert response.status_code == 200
    assert response.json()["deleted"] == 2

    # Wait for the QA run list to become empty.
    for attempt in range(1, MAX_ATTEMPTS + 1):
        remaining = requests.get(qa_base, headers=crawler_auth_headers).json()
        if len(remaining) == 0:
            break
        # Out of attempts: fail instead of sleeping again.
        assert attempt != MAX_ATTEMPTS
        time.sleep(5)

    # Wait until no page carries QA data keyed by either deleted run id.
    for run_id in doomed_runs:
        for attempt in range(1, MAX_ATTEMPTS + 1):
            pages = requests.get(
                f"{qa_base}/{run_id}/pages", headers=crawler_auth_headers
            ).json()
            leftovers = []
            for page in pages["items"]:
                page_qa = page.get("qa")
                if page_qa and page_qa.get(run_id):
                    leftovers.append(page)
            if not leftovers:
                break
            assert attempt != MAX_ATTEMPTS
            time.sleep(5)