browsertrix/backend/test/conftest.py
Ilya Kreymer 887cb16146
Allow configurable max pages per crawl in deployment settings (#717)
* backend: max pages per crawl limit, part of fix for #716:
- set 'max_pages_crawl_limit' in values.yaml, default to 100,000
- if set/non-0, automatically set limit if none provided
- if set/non-0, return 400 if adding config with limit exceeding max limit
- return limit as 'maxPagesPerCrawl' in /api/settings
- api: /all/crawls - add runningOnly=0 to show all crawls, default to 1/true (for more reliable testing)

tests: add test for 'max_pages_per_crawl' setting
- ensure 'limit' can not be set higher than max_pages_per_crawl
- ensure pages crawled is at the limit
- set test limit to max 2 pages
- add settings test
- check for pages.jsonl and extraPages.jsonl when crawling 2 pages
2023-03-28 16:26:29 -07:00

250 lines
6.8 KiB
Python

import pytest
import requests
import time
HOST_PREFIX = "http://127.0.0.1:30870"
API_PREFIX = HOST_PREFIX + "/api"
ADMIN_USERNAME = "admin@example.com"
ADMIN_PW = "PASSW0RD!"
VIEWER_USERNAME = "viewer@example.com"
VIEWER_PW = "viewerPASSW0RD!"
CRAWLER_USERNAME = "crawler@example.com"
CRAWLER_PW = "crawlerPASSWORD!"
_admin_config_id = None
_crawler_config_id = None
NON_DEFAULT_ORG_NAME = "Non-default org"
@pytest.fixture(scope="session")
def admin_auth_headers():
while True:
r = requests.post(
f"{API_PREFIX}/auth/jwt/login",
data={
"username": ADMIN_USERNAME,
"password": ADMIN_PW,
"grant_type": "password",
},
)
data = r.json()
try:
return {"Authorization": f"Bearer {data['access_token']}"}
except:
print("Waiting for admin_auth_headers")
time.sleep(5)
@pytest.fixture(scope="session")
def default_org_id(admin_auth_headers):
while True:
r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
data = r.json()
try:
for org in data["items"]:
if org["default"] is True:
return org["id"]
except:
print("Waiting for default org id")
time.sleep(5)
@pytest.fixture(scope="session")
def non_default_org_id(admin_auth_headers):
r = requests.post(
f"{API_PREFIX}/orgs/create",
headers=admin_auth_headers,
json={"name": NON_DEFAULT_ORG_NAME},
)
assert r.status_code == 200
while True:
r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
data = r.json()
try:
for org in data["items"]:
if org["name"] == NON_DEFAULT_ORG_NAME:
return org["id"]
except:
print("Waiting for non-default org id")
time.sleep(5)
@pytest.fixture(scope="session")
def admin_crawl_id(admin_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Admin Test Crawl",
"description": "Admin Test Crawl description",
"tags": ["wr-test-1", "wr-test-2"],
"config": {
"seeds": [{"url": "https://webrecorder.net/"}],
# limit now set via 'max_pages_per_crawl' global limit
# "limit": 1,
},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
data = r.json()
global _admin_config_id
_admin_config_id = data["added"]
crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=admin_auth_headers,
)
data = r.json()
if data["state"] == "complete":
return crawl_id
time.sleep(5)
@pytest.fixture(scope="session")
def admin_config_id(admin_crawl_id):
return _admin_config_id
@pytest.fixture(scope="session")
def viewer_auth_headers(admin_auth_headers, default_org_id):
requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/add-user",
json={
"email": VIEWER_USERNAME,
"password": VIEWER_PW,
"name": "newviewer",
"role": 10,
},
headers=admin_auth_headers,
)
r = requests.post(
f"{API_PREFIX}/auth/jwt/login",
data={
"username": VIEWER_USERNAME,
"password": VIEWER_PW,
"grant_type": "password",
},
)
data = r.json()
access_token = data.get("access_token")
return {"Authorization": f"Bearer {access_token}"}
@pytest.fixture(scope="session")
def crawler_auth_headers(admin_auth_headers, default_org_id):
requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/add-user",
json={
"email": CRAWLER_USERNAME,
"password": CRAWLER_PW,
"name": "new-crawler",
"description": "crawler test crawl",
"role": 20,
},
headers=admin_auth_headers,
)
r = requests.post(
f"{API_PREFIX}/auth/jwt/login",
data={
"username": CRAWLER_USERNAME,
"password": CRAWLER_PW,
"grant_type": "password",
},
)
data = r.json()
access_token = data.get("access_token")
return {"Authorization": f"Bearer {access_token}"}
@pytest.fixture(scope="session")
def crawler_userid(crawler_auth_headers):
r = requests.get(f"{API_PREFIX}/users/me", headers=crawler_auth_headers)
return r.json()["id"]
@pytest.fixture(scope="session")
def crawler_crawl_id(crawler_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Crawler User Test Crawl",
"description": "crawler test crawl",
"config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 1},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=crawl_data,
)
data = r.json()
global _crawler_config_id
_crawler_config_id = data["added"]
crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=crawler_auth_headers,
)
data = r.json()
if data["state"] == "complete":
return crawl_id
time.sleep(5)
@pytest.fixture(scope="session")
def wr_specs_crawl_id(crawler_auth_headers, default_org_id):
# Start crawl.
crawl_data = {
"runNow": True,
"name": "Webrecorder Specs sample crawl",
"config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=crawler_auth_headers,
json=crawl_data,
)
data = r.json()
crawl_id = data["run_now_job"]
# Wait for it to complete and then return crawl ID
while True:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
headers=crawler_auth_headers,
)
data = r.json()
if data["state"] == "complete":
return crawl_id
time.sleep(5)
@pytest.fixture(scope="session")
def crawler_config_id(crawler_crawl_id):
return _crawler_config_id
@pytest.fixture(scope="session")
def sample_crawl_data():
return {
"runNow": False,
"name": "Test Crawl",
"config": {"seeds": [{"url": "https://example.com/"}]},
"tags": ["tag1", "tag2"],
}