* tests:
  - fix cancel crawl test by ensuring state is not running or waiting
  - fix stop crawl test by ensuring stop is only initiated after at least one page has been crawled; otherwise the result may be failed, as no crawl data has been written yet (separate fix in crawler to avoid a loop if stopped before any data is written: webrecorder/browsertrix-crawler#314)
  - bump page limit to 4 for tests to ensure the crawl is partially complete, not fully complete, when stopped
  - allow canceled or partial_complete due to a race condition
* chart: bump frontend limits in the defaults, not just for tests (addresses #780)
* crawl stop before starting:
  - if a crawl is stopped before it started, mark it as canceled
  - add a test for stopping immediately, which should result in a 'canceled' crawl
  - attempt to increase the resync interval for immediate failure
  - nightly tests: increase page limit to test timeout
* backend:
  - detect stopped-before-start crawl as 'failed' instead of 'done'
  - stats: return stats counters as int instead of string
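The stop-crawl fix above amounts to polling the crawl state until at least one page has actually been captured before issuing the stop request, so the crawl can end as partial_complete rather than failed. A minimal sketch of that polling pattern is below; the `/stop` endpoint path and the `stats`/`done` counter names are assumptions for illustration, not taken from the fixtures in this file:

```python
import time

import requests

API_PREFIX = "http://127.0.0.1:30870/api"  # matches the conftest below


def stop_after_first_page(org_id, crawl_id, headers):
    # Poll the crawl state until at least one page has been crawled.
    # Stats counters are assumed to come back as ints (per the backend change above),
    # but int() also tolerates older string values.
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
            headers=headers,
        )
        data = r.json()
        done = int((data.get("stats") or {}).get("done", 0))
        if data["state"] == "running" and done >= 1:
            break
        time.sleep(2)

    # Hypothetical stop endpoint; with partial progress the crawl should end as
    # 'partial_complete' (or 'canceled' if the stop raced with crawl startup).
    requests.post(
        f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/stop",
        headers=headers,
    )
```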
import pytest
import requests
import time


HOST_PREFIX = "http://127.0.0.1:30870"
API_PREFIX = HOST_PREFIX + "/api"

ADMIN_USERNAME = "admin@example.com"
ADMIN_PW = "PASSW0RD!"


@pytest.fixture(scope="session")
def admin_auth_headers():
    # Retry login until the backend is up and returns an access token
    while True:
        r = requests.post(
            f"{API_PREFIX}/auth/jwt/login",
            data={
                "username": ADMIN_USERNAME,
                "password": ADMIN_PW,
                "grant_type": "password",
            },
        )
        data = r.json()
        try:
            return {"Authorization": f"Bearer {data['access_token']}"}
        except:
            print("Waiting for admin_auth_headers")
            time.sleep(5)


@pytest.fixture(scope="session")
def default_org_id(admin_auth_headers):
    # Retry until the default org is available
    while True:
        r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
        data = r.json()
        try:
            for org in data["items"]:
                if org["default"] is True:
                    return org["id"]
        except:
            print("Waiting for default org id")
            time.sleep(5)


@pytest.fixture(scope="session")
def crawl_id_wr(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Webrecorder admin test crawl",
        "tags": ["wr", "nightly testing"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def crawl_id_wr_specs(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Webrecorder Specs admin test crawl",
        "tags": ["wr-specs", "nightly testing"],
        "config": {
            "seeds": [{"url": "https://specs.webrecorder.net/"}],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def crawl_config_info(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Crawl config test",
        "config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1},
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_config_id = data["added"]
    crawl_id = data["run_now_job"]
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            break
        time.sleep(5)

    # Run second crawl from crawlconfig and return info when it finishes
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}/run",
        headers=admin_auth_headers,
    )
    data = r.json()
    second_crawl_id = data["started"]
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{second_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return (crawl_config_id, crawl_id, second_crawl_id)
        time.sleep(5)


@pytest.fixture(scope="session")
def large_crawl_id(admin_auth_headers, default_org_id):
    # Start crawl
    crawl_data = {
        "runNow": True,
        "name": "Large Test Crawl",
        "tags": ["wacz-logs"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "scopeType": "domain",
            "limit": 100,
            "extraHops": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]

    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "running":
            # Give crawl time to start properly
            time.sleep(30)
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def large_crawl_finished(admin_auth_headers, default_org_id, large_crawl_id):
    # Wait for crawl to complete
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{large_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            # Give some time for WACZ files to be stored
            time.sleep(30)
            break
        time.sleep(5)


@pytest.fixture(scope="session")
def timeout_crawl(admin_auth_headers, default_org_id):
    # Start crawl
    crawl_data = {
        "runNow": True,
        "name": "Crawl with crawl timeout",
        "crawlTimeout": 30,
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "scopeType": "domain",
            "limit": 100,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()
    return data["run_now_job"]


@pytest.fixture(scope="session")
def error_crawl_id(admin_auth_headers, default_org_id):
    # Start crawl with unreachable seeds so the crawl generates errors
    crawl_data = {
        "runNow": True,
        "name": "Youtube crawl with errors",
        "config": {
            "seeds": [
                {"url": "https://invalid.webrecorder.net/"},
                {"url": "https://invalid-2.webrecorder.net/"},
            ],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]

    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)