- When QA run is deleted
- When crawl is deleted

Also adds tests for WACZ deletion. Fixes #1713
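For context, here is a minimal sketch of the cleanup behavior described above: when a QA run (or the crawl it belongs to) is deleted, the WACZ files attached to that run are removed from object storage before the run record itself is dropped. This is only an illustration under assumed names — `QARun`, `QARunFile`, and `delete_file_from_storage` are hypothetical placeholders, not the actual Browsertrix backend API.

```python
# Illustrative sketch only: names and structures here are hypothetical
# placeholders, not the actual Browsertrix backend implementation.
from dataclasses import dataclass, field
from typing import Awaitable, Callable, Dict, List


@dataclass
class QARunFile:
    filename: str  # object-store key of the QA run's WACZ
    size: int = 0


@dataclass
class QARun:
    id: str
    files: List[QARunFile] = field(default_factory=list)


async def delete_qa_runs(
    qa_runs: Dict[str, QARun],
    qa_run_ids: List[str],
    delete_file_from_storage: Callable[[str], Awaitable[None]],
) -> int:
    """Delete the given QA runs, removing their WACZ files from storage first."""
    deleted = 0
    for qa_run_id in qa_run_ids:
        qa_run = qa_runs.pop(qa_run_id, None)
        if not qa_run:
            continue
        # Remove each WACZ belonging to this QA run before dropping the record;
        # the real system also clears per-page QA data keyed by this run id.
        for wacz in qa_run.files:
            await delete_file_from_storage(wacz.filename)
        deleted += 1
    return deleted
```

Crawl deletion would follow the same pattern, iterating over every QA run attached to the crawl before the crawl itself is removed; `test_delete_qa_runs` below exercises the run-level path by asserting a 404 on the QA WACZ URL after deletion.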
		
			
				
	
	
		
from .conftest import API_PREFIX, HOST_PREFIX
import requests
import time
from datetime import datetime

import pytest

MAX_ATTEMPTS = 24


@pytest.fixture(scope="module")
def qa_run_id(crawler_crawl_id, crawler_auth_headers, default_org_id):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/start",
        headers=crawler_auth_headers,
    )

    assert r.status_code == 200

    data = r.json()
    qa_run_id = data["started"]
    assert qa_run_id
    return qa_run_id


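# The fixtures and tests below poll the API in a loop (up to MAX_ATTEMPTS,
# sleeping 5s between attempts) because QA runs finish and page data is
# written asynchronously in the background.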
@pytest.fixture(scope="module")
def qa_run_pages_ready(
    crawler_crawl_id, crawler_auth_headers, default_org_id, qa_run_id
):
    # Wait until activeQA is finished
    count = 0
    while count < MAX_ATTEMPTS:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA",
            headers=crawler_auth_headers,
        )

        data = r.json()
        if not data["qa"]:
            break

        if count + 1 == MAX_ATTEMPTS:
            assert False

        time.sleep(5)
        count += 1

    # Wait until pages are ready
    count = 0
    while count < MAX_ATTEMPTS:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/pages",
            headers=crawler_auth_headers,
        )
        if len(r.json()["items"]) > 0:
            break

        if count + 1 == MAX_ATTEMPTS:
            assert False

        time.sleep(5)
        count += 1


@pytest.fixture(scope="module")
def failed_qa_run_id(crawler_crawl_id, crawler_auth_headers, default_org_id):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/start",
        headers=crawler_auth_headers,
    )

    assert r.status_code == 200

    data = r.json()
    failed_qa_run_id = data["started"]
    assert failed_qa_run_id

    # Wait until it's properly running
    count = 0
    while count < MAX_ATTEMPTS:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA",
            headers=crawler_auth_headers,
        )

        data = r.json()
        if data.get("qa") and data["qa"].get("state") == "running":
            break

        if count + 1 == MAX_ATTEMPTS:
            assert False

        time.sleep(5)
        count += 1

    # Ensure can't start another QA job while this one's running
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/start",
        headers=crawler_auth_headers,
    )

    assert r.status_code == 400
    assert r.json()["detail"] == "qa_already_running"

    # Ensure activeQA responds as expected
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA",
        headers=crawler_auth_headers,
    )

    data = r.json()
    qa = data["qa"]

    assert qa
    assert qa["state"]
    assert qa["started"]
    assert not qa["finished"]

    # Ensure sorting by lastQAState works as expected - current floated to top
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=lastQAState",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawls = r.json()["items"]
    assert crawls[0]["id"] == crawler_crawl_id
    assert crawls[0]["activeQAStats"]
    assert crawls[0]["lastQAState"]
    assert crawls[0]["lastQAStarted"]

    # Ensure sorting by lastQAState works as expected with all-crawls
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAState",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawls = r.json()["items"]
    assert crawls[0]["id"] == crawler_crawl_id
    assert crawls[0]["activeQAStats"]
    assert crawls[0]["lastQAState"]
    assert crawls[0]["lastQAStarted"]

    # Ensure sorting by lastQAStarted works as expected - current floated to top
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=lastQAStarted",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawls = r.json()["items"]
    assert crawls[0]["id"] == crawler_crawl_id
    assert crawls[0]["activeQAStats"]
    assert crawls[0]["lastQAState"]
    assert crawls[0]["lastQAStarted"]

    # Ensure sorting by lastQAStarted works as expected with all-crawls
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=lastQAStarted",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawls = r.json()["items"]
    assert crawls[0]["id"] == crawler_crawl_id
    assert crawls[0]["activeQAStats"]
    assert crawls[0]["lastQAState"]
    assert crawls[0]["lastQAStarted"]

    # Cancel QA run
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/cancel",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]

    # Wait for state to be changed
    count = 0
    while count < MAX_ATTEMPTS:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa",
            headers=crawler_auth_headers,
        )
        assert r.status_code == 200

        data = r.json()
        matching_runs = [
            qa_run for qa_run in data if qa_run.get("id") == failed_qa_run_id
        ]
        if matching_runs:
            matching_run = matching_runs[0]
            if matching_run.get("state") == "canceled":
                break

        if count + 1 == MAX_ATTEMPTS:
            assert False

        time.sleep(5)
        count += 1

    return failed_qa_run_id


def test_qa_completed(
    crawler_crawl_id, crawler_auth_headers, default_org_id, qa_run_pages_ready
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa",
        headers=crawler_auth_headers,
    )

    data = r.json()

    assert len(data) >= 1

    for qa in data:
        assert qa
        assert qa["state"]
        assert qa["started"]
        assert qa["finished"]
        assert qa["stats"]["found"] == 1
        assert qa["stats"]["done"] == 1
        assert qa["crawlExecSeconds"] > 0


def test_qa_org_stats(
    crawler_crawl_id, crawler_auth_headers, default_org_id, qa_run_pages_ready
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
        headers=crawler_auth_headers,
    )
    crawl_stats = r.json()
    assert crawl_stats["qaCrawlExecSeconds"] > 0

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}",
        headers=crawler_auth_headers,
    )
    org_stats = r.json()

    yymm = datetime.utcnow().strftime("%Y-%m")
    assert org_stats["qaCrawlExecSeconds"][yymm] > 0
    assert org_stats["qaUsage"][yymm] > 0


def test_qa_page_data(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    qa_run_id,
    qa_run_pages_ready,
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/pages",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert len(data["items"]) == 1
    page = data["items"][0]

    page_id = page["id"]
    assert page_id

    assert page["title"] == "Webrecorder"
    assert page["url"] == "https://webrecorder.net/"
    assert page["mime"] == "text/html"
    assert page["status"] == 200
    assert page["qa"]["textMatch"] == 1.0
    assert page["qa"]["screenshotMatch"] == 1.0
    assert page["qa"]["resourceCounts"] == {
        "crawlGood": 16,
        "crawlBad": 0,
        "replayGood": 15,
        "replayBad": 1,
    }

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/pages/{page_id}",
        headers=crawler_auth_headers,
    )
    page = r.json()
    assert page["id"]
    assert page["title"] == "Webrecorder"
    assert page["url"] == "https://webrecorder.net/"
    assert page["mime"] == "text/html"
    assert page["status"] == 200
    assert page["qa"]["textMatch"] == 1.0
    assert page["qa"]["screenshotMatch"] == 1.0
    assert page["qa"]["resourceCounts"] == {
        "crawlGood": 16,
        "crawlBad": 0,
        "replayGood": 15,
        "replayBad": 1,
    }


def test_qa_replay(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    qa_run_id,
    qa_run_pages_ready,
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/replay.json",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert len(data["resources"]) == 1
    assert data["resources"][0]["path"]


def test_qa_stats(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    qa_run_id,
    qa_run_pages_ready,
):
    # We'll want to improve this test by having more pages to test
    # if we can figure out stable page scores to test against
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/stats?screenshotThresholds=0.7,0.9&textThresholds=0.7,0.9",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["screenshotMatch"] == [
        {"lowerBoundary": "0.0", "count": 0},
        {"lowerBoundary": "0.7", "count": 0},
        {"lowerBoundary": "0.9", "count": 1},
    ]
    assert data["textMatch"] == [
        {"lowerBoundary": "0.0", "count": 0},
        {"lowerBoundary": "0.7", "count": 0},
        {"lowerBoundary": "0.9", "count": 1},
    ]

    # Test we get expected results with explicit 0 boundary
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/stats?screenshotThresholds=0,0.7,0.9&textThresholds=0,0.7,0.9",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["screenshotMatch"] == [
        {"lowerBoundary": "0.0", "count": 0},
        {"lowerBoundary": "0.7", "count": 0},
        {"lowerBoundary": "0.9", "count": 1},
    ]
    assert data["textMatch"] == [
        {"lowerBoundary": "0.0", "count": 0},
        {"lowerBoundary": "0.7", "count": 0},
        {"lowerBoundary": "0.9", "count": 1},
    ]

    # Test that missing threshold values result in 422 HTTPException
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/stats?screenshotThresholds=0.7",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 422
    assert r.json()["detail"][0]["msg"] == "field required"

    # Test that invalid threshold values result in 400 HTTPException
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/stats?screenshotThresholds=0.7&textThresholds=null",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "invalid_thresholds"


def test_run_qa_not_running(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    failed_qa_run_id,
    qa_run_pages_ready,
):
    # Make sure no active QA is running
    count = 0
    while count < MAX_ATTEMPTS:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/activeQA",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data.get("qa") is None:
            break

        if count + 1 == MAX_ATTEMPTS:
            assert False

        time.sleep(5)
        count += 1

    # Try to stop when there's no running QA run
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/stop",
        headers=crawler_auth_headers,
    )

    assert r.status_code == 400
    assert r.json()["detail"] == "qa_not_running"


def test_failed_qa_run(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    failed_qa_run_id,
    qa_run_pages_ready,
):
    # Ensure failed QA run is included in list endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa",
        headers=crawler_auth_headers,
    )

    data = r.json()

    assert len(data) == 2

    failed_run = [qa_run for qa_run in data if qa_run.get("id") == failed_qa_run_id][0]
    assert failed_run
    assert failed_run["state"] == "canceled"
    assert failed_run["started"]
    assert failed_run["finished"]
    assert failed_run["stats"]
    assert failed_run["crawlExecSeconds"] >= 0

    # Ensure failed QA run not included in list endpoint with skipFailed param
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa?skipFailed=true",
        headers=crawler_auth_headers,
    )

    data = r.json()

    assert len(data) == 1

    qa = data[0]
    assert qa
    assert qa["state"] == "complete"
    assert qa["started"]
    assert qa["finished"]
    assert qa["stats"]["found"] == 1
    assert qa["stats"]["done"] == 1
    assert qa["crawlExecSeconds"] > 0


def test_sort_crawls_by_qa_runs(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    failed_qa_run_id,
    qa_run_pages_ready,
):
    # Test that sorting by qaRunCount works as expected
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=qaRunCount",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawls = r.json()["items"]

    assert crawls[0]["id"] == crawler_crawl_id
    qa_run_count = crawls[0]["qaRunCount"]
    assert qa_run_count > 0

    last_count = qa_run_count
    for crawl in crawls:
        if crawl["id"] == crawler_crawl_id:
            continue
        crawl_qa_count = crawl["qaRunCount"]
        assert isinstance(crawl_qa_count, int)
        assert crawl_qa_count <= last_count
        last_count = crawl_qa_count

    # Test ascending sort
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls?sortBy=qaRunCount&sortDirection=1",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawls = r.json()["items"]

    assert crawls[-1]["id"] == crawler_crawl_id
    assert crawls[-1]["qaRunCount"] > 0

    last_count = 0
    for crawl in crawls:
        if crawl["id"] == crawler_crawl_id:
            continue
        crawl_qa_count = crawl["qaRunCount"]
        assert isinstance(crawl_qa_count, int)
        assert crawl_qa_count >= last_count
        last_count = crawl_qa_count

    # Test same with all-crawls
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=qaRunCount",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawls = r.json()["items"]

    assert crawls[0]["id"] == crawler_crawl_id
    qa_run_count = crawls[0]["qaRunCount"]
    assert qa_run_count > 0

    last_count = qa_run_count
    for crawl in crawls:
        if crawl["id"] == crawler_crawl_id:
            continue
        crawl_qa_count = crawl["qaRunCount"]
        assert isinstance(crawl_qa_count, int)
        assert crawl_qa_count <= last_count
        last_count = crawl_qa_count

    # Test ascending sort
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=qaRunCount&sortDirection=1",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    crawls = r.json()["items"]

    assert crawls[-1]["id"] == crawler_crawl_id
    assert crawls[-1]["qaRunCount"] > 0

    last_count = 0
    for crawl in crawls:
        if crawl["id"] == crawler_crawl_id:
            continue
        crawl_qa_count = crawl["qaRunCount"]
        assert isinstance(crawl_qa_count, int)
        assert crawl_qa_count >= last_count
        last_count = crawl_qa_count


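# Deleting QA runs should also remove their WACZ files from storage and the
# per-page QA data keyed by those runs; the test below verifies both by
# expecting a 404 on the WACZ URL and no remaining per-page QA entries.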
def test_delete_qa_runs(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    qa_run_id,
    qa_run_pages_ready,
    failed_qa_run_id,
):
    # Get download links for QA WACZs
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/replay.json",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert len(data["resources"]) == 1
    qa_wacz_url = data["resources"][0]["path"]

    # Delete QA runs
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/delete",
        json={"qa_run_ids": [qa_run_id, failed_qa_run_id]},
        headers=crawler_auth_headers,
    )

    assert r.status_code == 200
    assert r.json()["deleted"] == 2

    # Wait for QA runs to be deleted
    count = 0
    while count < MAX_ATTEMPTS:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa",
            headers=crawler_auth_headers,
        )

        if len(r.json()) == 0:
            break

        if count + 1 == MAX_ATTEMPTS:
            assert False

        time.sleep(5)
        count += 1

    # Ensure QA WACZ was deleted
    r = requests.get(f"http://localhost:30870{qa_wacz_url}")
    assert r.status_code == 404

    # Ensure associated QA run information in pages is also deleted
    for qa_run in (qa_run_id, failed_qa_run_id):
        count = 0
        while count < MAX_ATTEMPTS:
            r = requests.get(
                f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run}/pages",
                headers=crawler_auth_headers,
            )
            data = r.json()

            pages_with_qa_run = [
                page
                for page in data["items"]
                if page.get("qa") and page.get("qa").get(qa_run)
            ]

            if not pages_with_qa_run:
                break

            if count + 1 == MAX_ATTEMPTS:
                assert False

            time.sleep(5)
            count += 1