* optimizations:
  - rename update_crawl_config_stats to stats_recompute_all, only used in migration to fetch all crawls and do a full recompute of all file sizes
  - add stats_recompute_last to only get the last crawl by size, increment the total size by a specified amount, and incr/decr the number of crawls
  - update migration 0007 to use stats_recompute_all
  - add isCrawlRunning, lastCrawlStopping, and lastRun to stats_recompute_last
  - increment crawlSuccessfulCount in stats_recompute_last

* operator/crawls:
  - operator: keep track of filesAddedSize in redis as well
  - rename update_crawl to update_crawl_state_if_changed() and only update if the state is different, otherwise return false
  - ensure mark_finished() operations only occur if the crawl state has changed
  - don't clear the 'stopping' flag, so it can be used to track whether a crawl was stopped
  - state always starts as "starting"; don't reset it to "starting"

* tests:
  - add test for incremental workflow stats updating
  - don't clear stopping==true; it indicates the crawl was manually stopped

---------

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
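The "only update if the state is different" behavior and the persistent `stopping` flag described above are what the tests in this file exercise: a cancel or stop request flips the crawl state once, and `stopping` stays set so a manually stopped crawl remains identifiable afterwards. Below is a minimal sketch of such a guarded update, assuming an async MongoDB (Motor) crawls collection and a hypothetical helper signature; the real `update_crawl_state_if_changed()` in the backend may differ.

```python
# Sketch only -- assumes a Motor (async MongoDB) "crawls" collection and a
# hypothetical signature; the real update_crawl_state_if_changed() may differ.
async def update_crawl_state_if_changed(crawls, crawl_id, new_state, **kwargs):
    """Set the crawl state only if it differs; return False if nothing changed."""
    result = await crawls.find_one_and_update(
        # the filter matches nothing when the crawl is already in new_state,
        # so repeated operator callbacks become no-ops
        {"_id": crawl_id, "state": {"$ne": new_state}},
        {"$set": {"state": new_state, **kwargs}},
    )
    # find_one_and_update returns None when no document matched the filter,
    # i.e. the crawl was already in new_state
    return result is not None
```

A True return value can then gate mark_finished() so finish-time side effects (such as the incremental stats_recompute_last update) run only once per crawl.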
157 lines · 4.4 KiB · Python
import requests
import time
import os
import pytest

from .conftest import API_PREFIX

crawl_id = None


def get_crawl(org_id, auth_headers, crawl_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
        headers=auth_headers,
    )
    assert r.status_code == 200
    return r.json()


def test_start_crawl_to_cancel(
    default_org_id, crawler_config_id_only, crawler_auth_headers
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data.get("started")

    global crawl_id
    crawl_id = data["started"]


def test_cancel_crawl(default_org_id, crawler_auth_headers):
    # wait for the crawl to leave the "starting" state before canceling
    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
    while data["state"] == "starting":
        time.sleep(5)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/cancel",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert data["success"] == True

    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    while data["state"] in ("running", "waiting"):
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    # canceled crawls end in "canceled", are not marked as stopping,
    # and produce no WACZ resources
    assert data["state"] == "canceled"
    assert data["stopping"] == False

    assert len(data["resources"]) == 0


def test_start_crawl_and_stop_immediately(
    default_org_id, crawler_config_id_only, crawler_auth_headers
):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    crawl_id = data["started"]

    # stop the crawl before it has produced any pages
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert data["success"] == True

    # test crawl: stopping flag should be set
    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
    assert data["stopping"] == True

    # test workflow: lastCrawlStopping should be set
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
        headers=crawler_auth_headers,
    )
    assert r.json()["lastCrawlStopping"] == True

    while data["state"] in ("starting", "running", "waiting"):
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    assert data["state"] in ("canceled", "partial_complete")
    assert data["stopping"] == True


def test_start_crawl_to_stop_partial(
    default_org_id, crawler_config_id_only, crawler_auth_headers
):
    # wait for any previous crawl in this workflow to finish
    while True:
        time.sleep(2)
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
            headers=crawler_auth_headers,
        )
        if r.json().get("isCrawlRunning") is False:
            break

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}/run",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert data.get("started")

    global crawl_id
    crawl_id = data["started"]


def test_stop_crawl_partial(
    default_org_id, crawler_config_id_only, crawler_auth_headers
):
    # wait until at least one page is done so stopping yields a partial crawl
    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
    done = False
    while not done:
        time.sleep(2)
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
        done = data.get("stats") and data.get("stats").get("done") > 0

    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop",
        headers=crawler_auth_headers,
    )
    data = r.json()
    assert data["success"] == True

    # test crawl: stopping flag should be set
    data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)
    assert data["stopping"] == True

    # test workflow: lastCrawlStopping should be set
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawler_config_id_only}",
        headers=crawler_auth_headers,
    )
    assert r.json()["lastCrawlStopping"] == True

    while data["state"] == "running":
        data = get_crawl(default_org_id, crawler_auth_headers, crawl_id)

    # stopping after some pages were crawled produces a WACZ resource
    assert data["state"] in ("partial_complete", "complete")
    assert data["stopping"] == True

    assert len(data["resources"]) == 1