* Add bytes stored per type to org and metrics

  The org now tracks bytesStored by type for crawls, uploads, and browser profiles in addition to the total, and returns these values in the org metrics endpoint. A migration is added to precompute these values in existing deployments. In addition, all /metrics storage values are now returned solely as bytes, as the GB form wasn't being used in the frontend and is unnecessary.

* Improve deletion of multiple archived item types via the `/all-crawls` delete endpoint

  - Update the `/all-crawls` delete test to check that org and workflow size values are correct following deletion.
  - Fix a bug where it was assumed that only one crawl was deleted per cid, and size was not tracked per cid.
  - Add a type check within delete_crawls.
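For context on the delete fix, here is a minimal, illustrative sketch of the per-cid and per-type accounting it implies: when a single request deletes several items, more than one of them can belong to the same workflow (cid), so counts and sizes must be accumulated per cid and per item type rather than assumed to be one per cid. The helper and the field names `cid`, `type`, and `fileSize` are assumptions for illustration only, not the actual delete_crawls implementation.

```python
from collections import defaultdict


def summarize_deleted_items(deleted_items):
    """Aggregate deleted counts and sizes per workflow (cid) and per item type.

    Illustrative only: "cid", "type", and "fileSize" are assumed field names,
    not the exact document shapes used by delete_crawls.
    """
    count_by_cid = defaultdict(int)
    size_by_cid = defaultdict(int)
    size_by_type = defaultdict(int)
    total_size = 0

    for item in deleted_items:
        # A workflow (cid) may have multiple crawls in one delete request,
        # so both the count and the size are accumulated per cid.
        count_by_cid[item["cid"]] += 1
        size_by_cid[item["cid"]] += item["fileSize"]
        # Size per type lets the org-level bytesStored for crawls, uploads,
        # and browser profiles be decremented alongside the overall total.
        size_by_type[item["type"]] += item["fileSize"]
        total_size += item["fileSize"]

    return count_by_cid, size_by_cid, size_by_type, total_size


# Example: two crawls from the same workflow plus one upload
print(
    summarize_deleted_items(
        [
            {"cid": "wf-1", "type": "crawl", "fileSize": 1_000},
            {"cid": "wf-1", "type": "crawl", "fileSize": 2_500},
            {"cid": "wf-2", "type": "upload", "fileSize": 400},
        ]
    )
)
```

The per-type totals are what the org's per-type bytesStored values would be reduced by, in addition to the total.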
486 lines · 13 KiB · Python

import os
import pytest
import requests
import socket
import subprocess
import time


HOST_PREFIX = "http://127.0.0.1:30870"
API_PREFIX = HOST_PREFIX + "/api"

ADMIN_USERNAME = "admin@example.com"
ADMIN_PW = "PASSW0RD!"

VIEWER_USERNAME = "viewer@example.com"
VIEWER_PW = "viewerPASSW0RD!"

CRAWLER_USERNAME = "crawler@example.com"
CRAWLER_PW = "crawlerPASSWORD!"

_admin_config_id = None
_crawler_config_id = None
_auto_add_config_id = None
_all_crawls_config_id = None
_all_crawls_delete_config_id = None

NON_DEFAULT_ORG_NAME = "Non-default org"

FINISHED_STATES = ("complete", "partial_complete", "canceled", "failed")

curr_dir = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))


@pytest.fixture(scope="session")
def admin_auth_headers():
    while True:
        r = requests.post(
            f"{API_PREFIX}/auth/jwt/login",
            data={
                "username": ADMIN_USERNAME,
                "password": ADMIN_PW,
                "grant_type": "password",
            },
        )
        data = r.json()
        try:
            return {"Authorization": f"Bearer {data['access_token']}"}
        except:
            print("Waiting for admin_auth_headers")
            time.sleep(5)


@pytest.fixture(scope="session")
def default_org_id(admin_auth_headers):
    while True:
        r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
        data = r.json()
        try:
            for org in data["items"]:
                if org["default"] is True:
                    return org["id"]
        except:
            print("Waiting for default org id")
            time.sleep(5)


@pytest.fixture(scope="session")
def non_default_org_id(admin_auth_headers):
    r = requests.post(
        f"{API_PREFIX}/orgs/create",
        headers=admin_auth_headers,
        json={"name": NON_DEFAULT_ORG_NAME},
    )
    assert r.status_code == 200

    while True:
        r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
        data = r.json()
        try:
            for org in data["items"]:
                if org["name"] == NON_DEFAULT_ORG_NAME:
                    return org["id"]
        except:
            print("Waiting for non-default org id")
            time.sleep(5)


@pytest.fixture(scope="session")
def admin_crawl_id(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Admin Test Crawl",
        "description": "Admin Test Crawl description",
        "tags": ["wr-test-1", "wr-test-2"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "exclude": "community",
            # limit now set via 'max_pages_per_crawl' global limit
            # "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _admin_config_id
    _admin_config_id = data["id"]

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] in FINISHED_STATES:
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def admin_config_id(admin_crawl_id):
    return _admin_config_id


@pytest.fixture(scope="session")
def admin_userid(admin_auth_headers):
    r = requests.get(f"{API_PREFIX}/users/me", headers=admin_auth_headers)
    return r.json()["id"]


@pytest.fixture(scope="session")
def viewer_auth_headers(admin_auth_headers, default_org_id):
    requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/add-user",
        json={
            "email": VIEWER_USERNAME,
            "password": VIEWER_PW,
            "name": "newviewer",
            "role": 10,
        },
        headers=admin_auth_headers,
    )
    r = requests.post(
        f"{API_PREFIX}/auth/jwt/login",
        data={
            "username": VIEWER_USERNAME,
            "password": VIEWER_PW,
            "grant_type": "password",
        },
    )
    data = r.json()
    access_token = data.get("access_token")
    return {"Authorization": f"Bearer {access_token}"}


@pytest.fixture(scope="session")
def crawler_auth_headers(admin_auth_headers, default_org_id):
    requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/add-user",
        json={
            "email": CRAWLER_USERNAME,
            "password": CRAWLER_PW,
            "name": "new-crawler",
            "description": "crawler test crawl",
            "role": 20,
        },
        headers=admin_auth_headers,
    )
    r = requests.post(
        f"{API_PREFIX}/auth/jwt/login",
        data={
            "username": CRAWLER_USERNAME,
            "password": CRAWLER_PW,
            "grant_type": "password",
        },
    )
    data = r.json()
    access_token = data.get("access_token")
    return {"Authorization": f"Bearer {access_token}"}


@pytest.fixture(scope="session")
def crawler_userid(crawler_auth_headers):
    r = requests.get(f"{API_PREFIX}/users/me", headers=crawler_auth_headers)
    return r.json()["id"]


@pytest.fixture(scope="session")
def _crawler_create_config_only(crawler_auth_headers, default_org_id):
    # Create crawl config only, without running a crawl.
    crawl_data = {
        "runNow": False,
        "name": "Crawler User Test Crawl",
        "description": "crawler test crawl",
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "pageExtraDelay": 10,
            "limit": 3,
            "exclude": "community",
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _crawler_config_id
    _crawler_config_id = data["id"]


@pytest.fixture(scope="session")
def crawler_crawl_id(crawler_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Crawler User Test Crawl",
        "description": "crawler test crawl",
        "tags": ["wr-test-2"],
        "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 1},
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _crawler_config_id
    _crawler_config_id = data["id"]

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] in FINISHED_STATES:
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def wr_specs_crawl_id(crawler_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Webrecorder Specs sample crawl",
        "config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1},
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] in FINISHED_STATES:
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def crawler_config_id(crawler_crawl_id):
    return _crawler_config_id


@pytest.fixture(scope="session")
def crawler_config_id_only(_crawler_create_config_only):
    return _crawler_config_id


@pytest.fixture(scope="session")
def sample_crawl_data():
    return {
        "runNow": False,
        "name": "Test Crawl",
        "config": {"seeds": [{"url": "https://example.com/"}]},
        "tags": ["tag1", "tag2"],
    }


@pytest.fixture(scope="session")
def auto_add_collection_id(crawler_auth_headers, default_org_id):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={"name": "Auto Add Collection"},
    )
    assert r.status_code == 200
    return r.json()["id"]


@pytest.fixture(scope="session")
def auto_add_crawl_id(crawler_auth_headers, default_org_id, auto_add_collection_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Auto Add",
        "description": "For testing auto-adding new workflow crawls to collections",
        "autoAddCollections": [auto_add_collection_id],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _auto_add_config_id
    _auto_add_config_id = data["id"]

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] in FINISHED_STATES:
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def auto_add_config_id(auto_add_crawl_id):
    return _auto_add_config_id


@pytest.fixture(scope="session")
def all_crawls_crawl_id(crawler_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "All Crawls Test Crawl",
        "description": "Lorem ipsum",
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "exclude": "community",
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _all_crawls_config_id
    _all_crawls_config_id = data["id"]

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] in FINISHED_STATES:
            break
        time.sleep(5)

    # Add description to crawl
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}",
        headers=crawler_auth_headers,
        json={"description": "Lorem ipsum"},
    )
    assert r.status_code == 200
    return crawl_id


@pytest.fixture(scope="session")
def all_crawls_config_id(all_crawls_crawl_id):
    return _all_crawls_config_id


@pytest.fixture(scope="session")
def uploads_collection_id(crawler_auth_headers, default_org_id):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=crawler_auth_headers,
        json={"name": "Upload test collection"},
    )
    assert r.status_code == 200
    return r.json()["id"]


@pytest.fixture(scope="session")
def all_crawls_delete_crawl_ids(admin_auth_headers, default_org_id):
    crawl_data = {
        "runNow": True,
        "name": "All Crawls Delete Test Workflow",
        "description": "Lorem ipsum",
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "exclude": "community",
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _all_crawls_delete_config_id
    _all_crawls_delete_config_id = data["id"]

    return_crawl_ids = []

    crawl_id = data["run_now_job"]
    return_crawl_ids.append(crawl_id)

    # Wait for crawl to complete
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] in FINISHED_STATES:
            break
        time.sleep(5)

    # Run workflow again and wait for second crawl to complete
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{_all_crawls_delete_config_id}/run",
        headers=admin_auth_headers,
    )
    crawl_2_id = r.json()["started"]
    return_crawl_ids.append(crawl_2_id)

    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_2_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] in FINISHED_STATES:
            break
        time.sleep(5)

    return return_crawl_ids


@pytest.fixture(scope="session")
def all_crawls_delete_config_id(all_crawls_delete_crawl_ids):
    # Depend on the delete-test crawl fixture so the config id has been set.
    return _all_crawls_delete_config_id


@pytest.fixture(scope="function")
def echo_server():
    print("Echo server starting", flush=True)
    p = subprocess.Popen(["python3", os.path.join(curr_dir, "echo_server.py")])
    print("Echo server started", flush=True)
    time.sleep(1)
    yield p
    time.sleep(10)
    print("Echo server terminating", flush=True)
    p.terminate()
    print("Echo server terminated", flush=True)
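Building on the fixtures above, here is a hedged sketch of how the updated `/all-crawls` delete test might verify that org size values are correct following deletion. The delete payload shape (`crawl_ids`), the metrics field name (`storageUsedBytes`), and the use of `fileSize` from replay.json are assumptions for illustration, not confirmed endpoint contracts; the actual test should use whatever fields the API really returns.

```python
def test_all_crawls_delete_updates_sizes(
    admin_auth_headers, default_org_id, all_crawls_delete_crawl_ids
):
    # Sum the sizes of the crawls we are about to delete.
    # "fileSize" in replay.json is an assumed field name.
    combined_size = 0
    for crawl_id in all_crawls_delete_crawl_ids:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        combined_size += r.json().get("fileSize", 0)

    # Org storage before deletion; /metrics values are now plain bytes.
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/metrics", headers=admin_auth_headers
    )
    bytes_before = r.json()["storageUsedBytes"]  # assumed field name

    # Delete both crawls in one request via the /all-crawls endpoint
    # (payload shape is an assumption for illustration).
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": all_crawls_delete_crawl_ids},
    )
    assert r.status_code == 200

    # Org storage should drop by the combined size of the deleted crawls.
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/metrics", headers=admin_auth_headers
    )
    assert r.json()["storageUsedBytes"] == bytes_before - combined_size
    # The real test also checks the workflow's (crawlconfig's) size after
    # deletion; the exact field name is omitted here.
```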