concurrent crawl limits: (addresses #866) - support limits on concurrent crawls that can be run within a single org - change 'waiting' state to 'waiting_org_limit' for concurrent crawl limit and 'waiting_capacity' for capacity-based limits orgs: - add 'maxConcurrentCrawl' to new 'quotas' object on orgs - add /quotas endpoint for updating quotas object operator: - add all crawljobs as related, appear to be returned in creation order - operator: if concurrent crawl limit set, ensures current job is in the first N set of crawljobs (as provided via 'related' list of crawljob objects) before it can proceed to 'starting', otherwise set to 'waiting_org_limit' - api: add org /quotas endpoint for configuring quotas - remove 'new' state, always start with 'starting' - crawljob: add 'oid' to crawljob spec and label for easier querying - more stringent state transitions: add allowed_from to set_state() - ensure state transitions only happened from allowed states, while failed/canceled can happen from any state - ensure finished and state synched from db if transition not allowed - add crawl indices by oid and cid frontend: - show different waiting states on frontend: 'Waiting (Crawl Limit) and 'Waiting (At Capacity)' - add gear icon on orgs admin page - and initial popup for setting org quotas, showing all properties from org 'quotas' object tests: - add concurrent crawl limit nightly tests - fix state waiting -> waiting_capacity - ci: add logging of operator output on test failure
157 lines
4.4 KiB
Python
157 lines
4.4 KiB
Python
import requests
|
|
import time
|
|
import os
|
|
import pytest
|
|
|
|
from .conftest import API_PREFIX
|
|
|
|
crawl_id = None
|
|
|
|
|
|
def get_crawl(org_id, auth_headers, crawl_id):
|
|
r = requests.get(
|
|
f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
|
|
headers=auth_headers,
|
|
)
|
|
assert r.status_code == 200
|
|
return r.json()
|
|
|
|
|
|
def test_start_crawl_to_cancel(
|
|
default_org_id, crawler_config_id_only, crawler_auth_headers
|
|
):
|
|
r = requests.post(
|
|
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
|
|
headers=crawler_auth_headers,
|
|
)
|
|
assert r.status_code == 200
|
|
data = r.json()
|
|
|
|
assert data.get("started")
|
|
|
|
global crawl_id
|
|
crawl_id = data["started"]
|
|
|
|
|
|
def test_cancel_crawl(default_org_id, crawler_auth_headers):
|
|
data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
|
|
while data["state"] == "starting":
|
|
time.sleep(5)
|
|
data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
|
|
|
|
r = requests.post(
|
|
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/cancel",
|
|
headers=crawler_auth_headers,
|
|
)
|
|
data = r.json()
|
|
assert data["success"] == True
|
|
|
|
data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
|
|
|
|
while data["state"] in ("running", "waiting_capacity"):
|
|
data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
|
|
|
|
assert data["state"] == "canceled"
|
|
assert data["stopping"] == False
|
|
|
|
assert len(data["resources"]) == 0
|
|
|
|
|
|
def test_start_crawl_and_stop_immediately(
|
|
default_org_id, crawler_config_id_only, crawler_auth_headers
|
|
):
|
|
r = requests.post(
|
|
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
|
|
headers=crawler_auth_headers,
|
|
)
|
|
assert r.status_code == 200
|
|
data = r.json()
|
|
|
|
crawl_id = data["started"]
|
|
|
|
r = requests.post(
|
|
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
|
|
headers=crawler_auth_headers,
|
|
)
|
|
data = r.json()
|
|
assert data["success"] == True
|
|
|
|
# test crawl
|
|
data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
|
|
assert data["stopping"] == True
|
|
|
|
# test workflow
|
|
r = requests.get(
|
|
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
|
|
headers=crawler_auth_headers,
|
|
)
|
|
assert r.json()["lastCrawlStopping"] == True
|
|
|
|
while data["state"] in ("starting", "running", "waiting_capacity"):
|
|
data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
|
|
|
|
assert data["state"] in ("canceled", "partial_complete")
|
|
assert data["stopping"] == True
|
|
|
|
|
|
def test_start_crawl_to_stop_partial(
|
|
default_org_id, crawler_config_id_only, crawler_auth_headers
|
|
):
|
|
while True:
|
|
time.sleep(2)
|
|
r = requests.get(
|
|
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
|
|
headers=crawler_auth_headers,
|
|
)
|
|
if r.json().get("isCrawlRunning") is False:
|
|
break
|
|
|
|
r = requests.post(
|
|
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
|
|
headers=crawler_auth_headers,
|
|
)
|
|
assert r.status_code == 200
|
|
data = r.json()
|
|
|
|
assert data.get("started")
|
|
|
|
global crawl_id
|
|
crawl_id = data["started"]
|
|
|
|
|
|
def test_stop_crawl_partial(
|
|
default_org_id, crawler_config_id_only, crawler_auth_headers
|
|
):
|
|
data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
|
|
done = False
|
|
while not done:
|
|
time.sleep(2)
|
|
data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
|
|
done = data.get("stats") and data.get("stats").get("done") > 0
|
|
|
|
r = requests.post(
|
|
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
|
|
headers=crawler_auth_headers,
|
|
)
|
|
data = r.json()
|
|
assert data["success"] == True
|
|
|
|
# test crawl
|
|
data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
|
|
assert data["stopping"] == True
|
|
|
|
# test workflow
|
|
r = requests.get(
|
|
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
|
|
headers=crawler_auth_headers,
|
|
)
|
|
assert r.json()["lastCrawlStopping"] == True
|
|
|
|
while data["state"] == "running":
|
|
data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
|
|
|
|
assert data["state"] in ("partial_complete", "complete")
|
|
assert data["stopping"] == True
|
|
|
|
assert len(data["resources"]) == 1
|