* tests:
  - fix cancel crawl test by ensuring the state is not running or waiting
  - fix stop crawl test by ensuring stop is only initiated after at least one page has been crawled; otherwise the result may be failed, as no data has been crawled yet (separate fix in the crawler to avoid looping if stopped before any data is written: webrecorder/browsertrix-crawler#314)
  - bump page limit to 4 for tests to ensure the crawl is only partially complete, not fully complete, when stopping
  - allow canceled or partial_complete due to a race condition
* chart: bump frontend limits in the defaults, not just for tests (addresses #780)
* crawl stop before starting:
  - if a crawl is stopped before it started, mark it as canceled
  - add a test for stopping immediately, which should result in a 'canceled' crawl
  - attempt to increase the resync interval for immediate failure
  - nightly tests: increase page limit to test timeout
* backend:
  - detect a stopped-before-start crawl as 'failed' instead of 'done'
  - stats: return stats counters as int instead of string
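Both test fixes above come down to waiting on crawl state: the cancel test must poll until the crawl has left the "running"/"waiting" states before asserting its final state, and the stop test must wait until the stats counters (now plain ints) report at least one page done before issuing the stop. Below is a minimal sketch of such a polling wait, reusing the replay.json endpoint from the test file that follows; the wait_for_inactive name, the timeout, and the sleep interval are illustrative assumptions, not part of the actual change.

import time

import requests

from .conftest import API_PREFIX


def wait_for_inactive(org_id, auth_headers, crawl_id, timeout=120):
    # Hypothetical helper: poll the crawl's replay.json endpoint until the
    # crawl has left the active "running"/"waiting" states, or give up.
    deadline = time.monotonic() + timeout
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
            headers=auth_headers,
        )
        data = r.json()
        if data["state"] not in ("running", "waiting"):
            return data
        if time.monotonic() > deadline:
            raise TimeoutError(f"crawl {crawl_id} still {data['state']} after {timeout}s")
        time.sleep(5)

Bounding the poll with a deadline keeps a crawl that never leaves its active state from hanging the test run indefinitely.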
import requests
import time
import os
import pytest

from .conftest import API_PREFIX

crawl_id = None


def get_crawl(org_id, auth_headers, crawl_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
        headers=auth_headers,
    )
    assert r.status_code == 200
    return r.json()


def test_start_crawl_to_cancel(
    default_org_id, crawler_config_id_only, crawler_auth_headers
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data.get("started")

    global crawl_id
    crawl_id = data["started"]


def test_cancel_crawl(default_org_id, crawler_auth_headers):
    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
    while data["state"] == "starting":
        time.sleep(5)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/cancel",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert data["success"] == True

    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    while data["state"] in ("running", "waiting"):
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    assert data["state"] == "canceled"
    assert data["stopping"] == False

    assert len(data["resources"]) == 0


def test_start_crawl_and_stop_immediately(
    default_org_id, crawler_config_id_only, crawler_auth_headers
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    crawl_id = data["started"]

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert data["success"] == True

    # test crawl
    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
    assert data["stopping"] == True

    # test workflow
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
        headers=crawler_auth_headers,
    )
    assert r.json()["currCrawlStopping"] == True

    while data["state"] in ("starting", "running", "waiting"):
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    assert data["state"] in ("canceled", "partial_complete")
    assert data["stopping"] == True


def test_start_crawl_to_stop_partial(
    default_org_id, crawler_config_id_only, crawler_auth_headers
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data.get("started")

    global crawl_id
    crawl_id = data["started"]


def test_stop_crawl_partial(
    default_org_id, crawler_config_id_only, crawler_auth_headers
):
    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
    done = False
    while not done:
        time.sleep(2)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
        done = data.get("stats") and data.get("stats").get("done") > 0

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert data["success"] == True

    # test crawl
    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
    assert data["stopping"] == True

    # test workflow
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
        headers=crawler_auth_headers,
    )
    assert r.json()["currCrawlStopping"] == True

    while data["state"] == "running":
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    assert data["state"] == "partial_complete"
    assert data["stopping"] == True

    assert len(data["resources"]) == 1