browsertrix/backend/test/test_qa.py
Ilya Kreymer 4f676e4e82
QA Runs Initial Backend Implementation (#1586)
Supports running QA Runs via the QA API!

Builds on top of the `issue-1498-crawl-qa-backend-support` branch, fixes
#1498

Also requires the latest Browsertrix Crawler 1.1.0+ (from the
webrecorder/browsertrix-crawler#469 branch)

Notable changes:
- `QARun` objects contain info about QA runs, which are crawls
performed on data loaded from existing crawls.

- Various crawl DB operations can be performed on either the crawl or
the `qa.` object, and core crawl fields have been moved to `CoreCrawlable`.

- While running, `QARun` data is stored in a single `qa` object, while
finished QA runs are added to a `qaFinished` dictionary on the Crawl
(see the sketch below). The QA list API returns data from the finished
list, sorted by most recent first.
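
For illustration, a minimal sketch of what this can look like on a crawl document (the `qa` / `qaFinished` field names are from this PR; the exact shape and values shown are assumptions):

```python
# Hypothetical Crawl document while a QA run is in progress (sketch only):
crawl_doc = {
    "_id": "crawl-1",
    # the active QA run lives in the single `qa` object
    "qa": {
        "id": "qa-run-2",
        "state": "running",
        "started": "2024-03-20T22:00:00Z",
        "finished": None,
    },
    # finished QA runs are keyed by run id in `qaFinished`
    "qaFinished": {
        "qa-run-1": {
            "state": "complete",
            "started": "2024-03-19T10:00:00Z",
            "finished": "2024-03-19T10:05:00Z",
        },
    },
}
```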

- Includes additional type fixes / type safety, especially around
`BaseCrawl` / `Crawl` / `UploadedCrawl` functionality, and adds specific
`get_upload()`, `get_basecrawl()`, and `get_crawl()` getters for internal
use and `get_crawl_out()` for the API.

- Supports filtering and sorting pages via `qaFilterBy` (`screenshotMatch`, `textMatch`)
along with `gt`, `lt`, `gte`, and `lte` params to return pages based on QA results
(see the sketch below).
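
A minimal usage sketch, reusing `API_PREFIX` from the tests below (`org_id`, `crawl_id`, `qa_run_id`, and `auth_headers` are placeholders, and passing the threshold as a plain `gte` query param is an assumption):

```python
import requests

# Sketch: list pages from a finished QA run with screenshotMatch >= 0.9.
r = requests.get(
    f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/qa/{qa_run_id}/pages",
    params={"qaFilterBy": "screenshotMatch", "gte": 0.9},
    headers=auth_headers,
)
pages = r.json()["items"]
```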

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
2024-03-20 22:42:16 -07:00

import time
from datetime import datetime

import requests

from .conftest import API_PREFIX

qa_run_id = None


def test_run_qa(crawler_crawl_id, crawler_auth_headers, default_org_id):
    # Start a QA run on the finished crawl; remember its id for later tests.
    global qa_run_id

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/start",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["started"]
    qa_run_id = data["started"]


def test_run_qa_already_running(
    crawler_crawl_id, crawler_auth_headers, default_org_id
):
    # Starting a second QA run while one is active is rejected.
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/start",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "qa_already_running"


def test_active_qa(crawler_crawl_id, crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA",
        headers=crawler_auth_headers,
    )
    data = r.json()

    qa = data["qa"]
    assert qa
    assert qa["state"]
    assert qa["started"]
    assert not qa["finished"]


def test_qa_list(crawler_crawl_id, crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert len(data) == 1

    qa = data[0]
    assert qa
    assert qa["state"]
    assert qa["started"]
    assert not qa["finished"]


def test_wait_for_complete(crawler_crawl_id, crawler_auth_headers, default_org_id):
    # Poll activeQA until the run finishes (up to 24 * 5s = 2 minutes).
    count = 0
    completed = False
    while count < 24:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if not data["qa"]:
            completed = True
            break

        time.sleep(5)
        count += 1

    assert completed


def test_qa_completed(crawler_crawl_id, crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert len(data) == 1

    qa = data[0]
    assert qa
    assert qa["state"] == "complete"
    assert qa["started"]
    assert qa["finished"]
    assert qa["stats"]["found"] == 1
    assert qa["stats"]["done"] == 1
    assert qa["crawlExecSeconds"] > 0


def test_qa_org_stats(crawler_crawl_id, crawler_auth_headers, default_org_id):
    # QA execution time is rolled up onto both the crawl and the org.
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
        headers=crawler_auth_headers,
    )
    crawl_stats = r.json()
    assert crawl_stats["qaCrawlExecSeconds"] > 0

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}",
        headers=crawler_auth_headers,
    )
    org_stats = r.json()

    yymm = datetime.utcnow().strftime("%Y-%m")
    assert org_stats["qaCrawlExecSeconds"][yymm] > 0
    assert org_stats["qaUsage"][yymm] > 0


def test_qa_page_data(crawler_crawl_id, crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/pages",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert len(data["items"]) == 1

    page = data["items"][0]
    assert page["title"] == "Webrecorder"
    assert page["url"] == "https://webrecorder.net/"
    assert page["qa"]["textMatch"] == 1.0
    assert page["qa"]["screenshotMatch"] == 1.0
    assert page["qa"]["resourceCounts"] == {
        "crawlGood": 15,
        "crawlBad": 0,
        "replayGood": 15,
        "replayBad": 1,
    }


def test_qa_replay(crawler_crawl_id, crawler_auth_headers, default_org_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/replay.json",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert len(data["resources"]) == 1
    assert data["resources"][0]["path"]


def test_run_qa_not_running(crawler_crawl_id, crawler_auth_headers, default_org_id):
    # Stopping when no QA run is active is rejected.
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/stop",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "qa_not_running"


def test_delete_qa_run(crawler_crawl_id, crawler_auth_headers, default_org_id):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/delete",
        json={"qa_run_ids": [qa_run_id]},
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["deleted"]

    # deleted from finished qa list
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa",
        headers=crawler_auth_headers,
    )
    assert len(r.json()) == 0

    # deleted from pages
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/pages",
        headers=crawler_auth_headers,
    )
    assert len(r.json()["items"]) == 0