browsertrix/backend/test_nightly/test_crawlconfig_crawl_stats.py
Tessa Walsh 13bf818914
Fix nightly tests (#2460)
Fixes #2459 

- Set `/data/` as primary storage `access_endpoint_url` in nightly test
chart
- Modify nightly test GH Actions workflow to spawn a separate job per
nightly test module using a dynamic matrix (sketched after this list)
- Configure the workflow so that one failing job does not fail the others
- Modify failing tests:
  - Add fixture to background job nightly test module so it can run alone
  - Add retry loop to crawlconfig stats nightly test so it's less
    dependent on timing (see the retry sketch below)
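
The dynamic matrix is essentially a JSON list of nightly test modules that
downstream jobs consume via `fromJSON`. A minimal sketch of how such a list
could be generated; the path and setup-job wiring here are illustrative, not
the exact workflow step:

```python
# Illustrative only: build a JSON matrix of nightly test modules for a
# GitHub Actions setup job to expose as an output. Paths are assumptions.
import json
from pathlib import Path

modules = sorted(p.name for p in Path("backend/test_nightly").glob("test_*.py"))

# A setup job could write this to $GITHUB_OUTPUT; per-module jobs then use
# matrix: module: ${{ fromJSON(needs.setup.outputs.modules) }}
print(json.dumps(modules))
```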
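
The retry loop polls the crawlconfig stats until they reflect the deletion
instead of asserting immediately. A stripped-down sketch of the pattern,
with illustrative names (the actual loop is inlined in the test below):

```python
import time

def poll_until(check, max_attempts=18, delay=10):
    """Call check() until it returns True or attempts run out."""
    for _ in range(max_attempts):
        if check():
            return
        time.sleep(delay)
    assert False, "condition not met in time"
```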

GitHub limits each workflow to 256 jobs, so this approach should continue
to scale for us as more nightly test modules are added.

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
2025-03-06 16:23:30 -08:00


import requests
import time

from .conftest import API_PREFIX


def test_crawlconfig_crawl_stats(
    admin_auth_headers, default_org_id, crawl_config_info
):
    crawl_config_id, crawl_id, second_crawl_id = crawl_config_info

    # Verify the first crawl finished
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    first_crawl_finished = data["finished"]
    assert first_crawl_finished

    # Verify the second crawl finished
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{second_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    second_crawl_finished = data["finished"]
    assert second_crawl_finished

    # Verify crawl stats from /crawlconfigs
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["crawlAttemptCount"] == 2
    assert data["crawlCount"] == 2
    assert data["lastCrawlId"] == second_crawl_id
    assert data["lastCrawlState"] == "complete"
    assert data["lastCrawlTime"] == second_crawl_finished

    # Delete second crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [second_crawl_id]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"]

    # Verify crawl stats from /crawlconfigs now point to the first crawl
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["crawlAttemptCount"] == 2
    assert data["crawlCount"] == 1
    assert data["lastCrawlId"] == crawl_id
    assert data["lastCrawlState"] == "complete"
    assert data["lastCrawlTime"] == first_crawl_finished

    # Delete first crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [crawl_id]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"]

    # Verify crawl stats from /crawlconfigs are cleared, retrying for up
    # to three minutes since stats may take time to update after deletion
    max_attempts = 18
    attempts = 1
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}",
            headers=admin_auth_headers,
        )
        assert r.status_code == 200
        data = r.json()

        if data["crawlAttemptCount"] == 2 and data["crawlCount"] == 0:
            assert not data["lastCrawlId"]
            assert not data["lastCrawlState"]
            assert not data["lastCrawlTime"]
            break

        if attempts >= max_attempts:
            assert False

        time.sleep(10)
        attempts += 1