browsertrix/backend/test_nightly/test_concurrent_crawl_limit.py
Ilya Kreymer 00fb8ac048
Concurrent Crawl Limit (#874)
concurrent crawl limits: (addresses #866)
- support limits on concurrent crawls that can be run within a single org
- change 'waiting' state to 'waiting_org_limit' for the concurrent crawl limit and 'waiting_capacity' for capacity-based limits

orgs:
- add 'maxConcurrentCrawls' to new 'quotas' object on orgs
- add /quotas endpoint for updating the quotas object (see the request sketch after this list)
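A minimal sketch of setting the new quota through that endpoint, mirroring the nightly test further down; the base URL, org id, and token below are placeholder assumptions, not values from this change:

import requests

# placeholder assumptions: deployment base URL, org id, and admin token
API_PREFIX = "http://localhost:30870/api"
ORG_ID = "my-org-id"
HEADERS = {"Authorization": "Bearer ADMIN_TOKEN"}

# limit the org to one concurrently running crawl
r = requests.post(
    f"{API_PREFIX}/orgs/{ORG_ID}/quotas",
    headers=HEADERS,
    json={"maxConcurrentCrawls": 1},
)
assert r.json().get("updated") == True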

operator:
- add all crawljobs as related objects; they appear to be returned in creation order
- operator: if a concurrent crawl limit is set, ensure the current job is within the first N crawljobs (as provided via the 'related' list of crawljob objects) before it can proceed to 'starting'; otherwise set it to 'waiting_org_limit' (see the sketch after this list)
- api: add org /quotas endpoint for configuring quotas
- remove 'new' state, always start with 'starting'
- crawljob: add 'oid' to crawljob spec and label for easier querying
- more stringent state transitions: add allowed_from to set_state()
- ensure state transitions only happen from allowed states, while failed/canceled can happen from any state
- ensure 'finished' and 'state' are synced from the db if a transition is not allowed
- add crawl indices by oid and cid
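A rough sketch of that gating and transition logic, assuming the 'related' crawljobs are provided in creation order; the function and variable names are illustrative only, not the actual operator code:

# illustrative only: pick the next state for a crawljob given the org's
# concurrent crawl limit and the 'related' crawljobs (assumed creation order)
def next_state(crawljob_name, related_crawljobs, max_concurrent_crawls):
    if max_concurrent_crawls:
        # names of the first N crawljobs for this org
        first_n = [
            cj["metadata"]["name"]
            for cj in related_crawljobs[:max_concurrent_crawls]
        ]
        if crawljob_name not in first_n:
            return "waiting_org_limit"
    # the 'new' state was removed, so jobs otherwise begin at 'starting'
    return "starting"


# illustrative allowed_from check: transitions proceed only from allowed states,
# while 'failed'/'canceled' may be set from any state
def can_transition(current_state, new_state, allowed_from):
    return new_state in ("failed", "canceled") or current_state in allowed_from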

frontend: 
- show the different waiting states on the frontend: 'Waiting (Crawl Limit)' and 'Waiting (At Capacity)'
- add gear icon on orgs admin page
- add initial popup for setting org quotas, showing all properties from the org 'quotas' object

tests:
- add concurrent crawl limit nightly tests
- fix state waiting -> waiting_capacity
- ci: add logging of operator output on test failure
2023-05-30 15:38:03 -07:00


import requests
import time

from .conftest import API_PREFIX

crawl_id_a = None
crawl_id_b = None


def test_set_concurrent_crawl_limit(org_with_quotas, admin_auth_headers):
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_with_quotas}/quotas",
        headers=admin_auth_headers,
        json={"maxConcurrentCrawls": 1},
    )
    data = r.json()
    assert data.get("updated") == True


def test_run_two_only_one_concurrent(org_with_quotas, admin_auth_headers):
    # start two crawls in the same org, one second apart
    global crawl_id_a
    crawl_id_a = run_crawl(org_with_quotas, admin_auth_headers)
    time.sleep(1)

    global crawl_id_b
    crawl_id_b = run_crawl(org_with_quotas, admin_auth_headers)

    # the first crawl should leave 'starting' and run normally
    while (
        get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) == "starting"
    ):
        time.sleep(2)

    assert (
        get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) == "running"
    )

    # the second crawl should be held back by the org concurrency limit
    while (
        get_crawl_status(org_with_quotas, crawl_id_b, admin_auth_headers) == "starting"
    ):
        time.sleep(2)

    assert (
        get_crawl_status(org_with_quotas, crawl_id_b, admin_auth_headers)
        == "waiting_org_limit"
    )


def test_cancel_and_run_other(org_with_quotas, admin_auth_headers):
    # cancel the running crawl; the waiting crawl should then be allowed to start
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_with_quotas}/crawls/{crawl_id_a}/cancel",
        headers=admin_auth_headers,
    )
    data = r.json()
    assert data["success"] == True

    while (
        get_crawl_status(org_with_quotas, crawl_id_a, admin_auth_headers) != "canceled"
    ):
        time.sleep(2)

    while (
        get_crawl_status(org_with_quotas, crawl_id_b, admin_auth_headers)
        == "waiting_org_limit"
    ):
        time.sleep(5)

    assert get_crawl_status(org_with_quotas, crawl_id_b, admin_auth_headers) in (
        "starting",
        "running",
    )

    # cancel second crawl as well
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_with_quotas}/crawls/{crawl_id_b}/cancel",
        headers=admin_auth_headers,
    )
    data = r.json()
    assert data["success"] == True


def run_crawl(org_id, headers):
    crawl_data = {
        "runNow": True,
        "name": "Concurrent Crawl",
        "config": {
            "seeds": [{"url": "https://specs.webrecorder.net/"}],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/",
        headers=headers,
        json=crawl_data,
    )
    data = r.json()
    return data["run_now_job"]


def get_crawl_status(org_id, crawl_id, headers):
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
        headers=headers,
    )
    data = r.json()
    return data["state"]