browsertrix/backend/test/test_profiles.py
Tessa Walsh 123705c53f
Serialize datetimes with Z suffix (#2058)
Use timezone aware datetimes instead of timezone naive datetimes:
- Update mongodb client to use tz-aware conversion
- Convert dt_now() to return timezone aware UTC date
- Rename to_k8s_date -> date_to_str, just returns ISO UTC date with 'Z'
(instead of '+00:00' suffix)
- Rename from_k8s_date -> str_to_date, returns timezone aware date from
str
- Standardize all string<->date conversion to use either date_to_str or
str_to_date
- Update frontend to assume iso date, not append 'Z' directly
- Update tests to check for 'Z' suffix on some dates

---------
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
2024-09-12 16:16:13 -07:00

564 lines
18 KiB
Python

import time
from typing import Dict
from uuid import UUID
import requests
import pytest
from .conftest import API_PREFIX, FINISHED_STATES
# Metadata for the first test profile (created, then renamed mid-suite)
PROFILE_NAME = "Test profile"
PROFILE_DESC = "Profile used for backend tests"
PROFILE_NAME_UPDATED = "Updated test profile"
PROFILE_DESC_UPDATED = "Updated profile used for backend tests"
# Metadata for the second profile, used by the list/sort/delete tests
PROFILE_2_NAME = "Second test profile"
PROFILE_2_DESC = "Second profile used to test list endpoint"
def prepare_browser_for_profile_commit(
    browser_id: str, headers: Dict[str, str], oid: UUID
) -> None:
    """Get a profile browser into a committable state.

    Pings the browser to keep it alive, verifies its metadata, navigates it
    to a second page, then pings until the browser reports success (or the
    retry budget is exhausted; in that case the caller's next request will
    surface the failure).
    """
    # Ping to make sure it doesn't expire
    r = requests.post(
        f"{API_PREFIX}/orgs/{oid}/profiles/browser/{browser_id}/ping",
        headers=headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data.get("success")
    assert data.get("origins") or data.get("origins") == []

    # Verify browser seems good
    r = requests.get(
        f"{API_PREFIX}/orgs/{oid}/profiles/browser/{browser_id}",
        headers=headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["url"]
    assert data["path"]
    assert data["password"]
    assert data["auth_bearer"]
    assert data["scale"]
    assert data["oid"] == oid

    # Navigate to new URL
    r = requests.post(
        f"{API_PREFIX}/orgs/{oid}/profiles/browser/{browser_id}/navigate",
        headers=headers,
        json={"url": "https://webrecorder.net/tools"},
    )
    assert r.status_code == 200
    assert r.json()["success"]

    # Ping browser until ready, up to a fixed number of attempts
    max_attempts = 20
    for _ in range(max_attempts):
        try:
            r = requests.post(
                f"{API_PREFIX}/orgs/{oid}/profiles/browser/{browser_id}/ping",
                headers=headers,
            )
            data = r.json()
            if data["success"]:
                break
            time.sleep(5)
        except Exception:
            # Fix: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit; transient request/JSON errors
            # are expected while the browser spins up, so just retry.
            pass
@pytest.fixture(scope="module")
def profile_id(admin_auth_headers, default_org_id, profile_browser_id):
    """Create a profile from the first profile browser and return its id."""
    prepare_browser_for_profile_commit(
        profile_browser_id, admin_auth_headers, default_org_id
    )

    # Create profile, retrying while the browser is still starting up
    start_time = time.monotonic()
    time_limit = 300
    while True:
        try:
            r = requests.post(
                f"{API_PREFIX}/orgs/{default_org_id}/profiles",
                headers=admin_auth_headers,
                json={
                    "browserid": profile_browser_id,
                    "name": PROFILE_NAME,
                    "description": PROFILE_DESC,
                },
                timeout=10,
            )
            assert r.status_code == 200
            data = r.json()
            # Fix: `data.get("detail") and data.get("detail") == X` was
            # redundant — a single equality check is equivalent.
            if data.get("detail") == "waiting_for_browser":
                time.sleep(5)
                continue
            if data.get("added"):
                assert data["storageQuotaReached"] in (True, False)
                return data["id"]
        except Exception:
            # Fix: narrowed from bare `except:`; keep retrying transient
            # failures until the overall time limit is exceeded.
            if time.monotonic() - start_time > time_limit:
                raise
            time.sleep(5)
@pytest.fixture(scope="module")
def profile_config_id(admin_auth_headers, default_org_id, profile_id):
    """Verify the created profile's fields, then create a crawl workflow
    that uses it.

    Returns the id of the created crawl workflow.
    """
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/profiles/{profile_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["id"] == profile_id
    assert data["name"] == PROFILE_NAME
    assert data["description"] == PROFILE_DESC
    assert data["userid"]
    assert data["oid"] == default_org_id
    assert data.get("origins") or data.get("origins") == []
    assert data["createdBy"]
    assert data["createdByName"] == "admin"
    assert data["modifiedBy"]
    assert data["modifiedByName"] == "admin"
    assert not data["baseid"]

    # Dates must serialize as ISO UTC with a 'Z' suffix (not '+00:00')
    created = data["created"]
    assert created
    assert created.endswith("Z")
    modified = data["modified"]
    assert modified
    assert modified.endswith("Z")

    resource = data["resource"]
    assert resource
    assert resource["filename"]
    assert resource["hash"]
    assert resource["size"]
    assert resource["storage"]
    assert resource["storage"]["name"]
    assert resource.get("replicas") or resource.get("replicas") == []

    # No workflows reference the profile yet
    assert data.get("crawlconfigs") == []

    # Use profile in a workflow
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json={
            "runNow": False,
            "name": "Profile Test Crawl",
            "description": "Crawl using browser profile",
            "config": {
                "seeds": [{"url": "https://webrecorder.net/"}],
                "exclude": "community",
            },
            "profileid": profile_id,
        },
    )
    # Fix: the response was used without checking the status code; a failed
    # workflow creation would surface as an opaque KeyError on "id".
    assert r.status_code == 200
    data = r.json()
    return data["id"]
@pytest.fixture(scope="module")
def profile_2_id(admin_auth_headers, default_org_id, profile_browser_2_id):
    """Create a second profile (for list/sort/delete tests) and return its id."""
    prepare_browser_for_profile_commit(
        profile_browser_2_id, admin_auth_headers, default_org_id
    )

    # Create profile, retrying while the browser is still starting up
    start_time = time.monotonic()
    time_limit = 300
    while True:
        try:
            r = requests.post(
                f"{API_PREFIX}/orgs/{default_org_id}/profiles",
                headers=admin_auth_headers,
                json={
                    "browserid": profile_browser_2_id,
                    "name": PROFILE_2_NAME,
                    "description": PROFILE_2_DESC,
                },
                timeout=10,
            )
            assert r.status_code == 200
            data = r.json()
            if data.get("detail") == "waiting_for_browser":
                time.sleep(5)
                # Fix: `continue` was missing here (present in the sibling
                # profile_id fixture) — without it the loop fell through to
                # the "added" check before retrying.
                continue
            if data.get("added"):
                assert data["storageQuotaReached"] in (True, False)
                return data["id"]
        except Exception:
            # Fix: narrowed from bare `except:`; retry until time limit
            if time.monotonic() - start_time > time_limit:
                raise
            time.sleep(5)
def test_commit_browser_to_new_profile(admin_auth_headers, default_org_id, profile_id):
    # The heavy lifting happens in the profile_id fixture (browser commit +
    # profile creation); this test only verifies a non-empty id came back.
    assert profile_id
def test_get_profile(admin_auth_headers, default_org_id, profile_id, profile_config_id):
    """Fetch the profile and verify crawlconfigs now lists the workflow."""
    start_time = time.monotonic()
    time_limit = 10
    # Poll until the profile's crawlconfigs field reflects the new workflow
    while True:
        try:
            r = requests.get(
                f"{API_PREFIX}/orgs/{default_org_id}/profiles/{profile_id}",
                headers=admin_auth_headers,
            )
            assert r.status_code == 200
            data = r.json()
            assert data["id"] == profile_id
            assert data["name"] == PROFILE_NAME
            assert data["description"] == PROFILE_DESC
            assert data["userid"]
            assert data["oid"] == default_org_id
            assert data.get("origins") or data.get("origins") == []
            assert data["created"]
            assert data["createdBy"]
            assert data["createdByName"] == "admin"
            assert data["modified"]
            assert data["modifiedBy"]
            assert data["modifiedByName"] == "admin"
            assert not data["baseid"]

            resource = data["resource"]
            assert resource
            assert resource["filename"]
            assert resource["hash"]
            assert resource["size"]
            assert resource["storage"]
            assert resource["storage"]["name"]
            assert resource.get("replicas") or resource.get("replicas") == []

            crawl_configs = data.get("crawlconfigs")
            assert crawl_configs
            assert len(crawl_configs) == 1
            assert crawl_configs[0]["id"] == profile_config_id
            assert crawl_configs[0]["name"] == "Profile Test Crawl"
            assert crawl_configs[0]["firstSeed"] == "https://webrecorder.net/"
            assert crawl_configs[0]["seedCount"] == 1
            break
        except Exception:
            # Fix: narrowed from bare `except:` so Ctrl-C / SystemExit are
            # not swallowed; AssertionError is still retried until timeout.
            if time.monotonic() - start_time > time_limit:
                raise
            time.sleep(1)
def test_commit_second_profile(profile_2_id):
    # Creation happens in the profile_2_id fixture; just verify the id.
    assert profile_2_id
def test_list_profiles(admin_auth_headers, default_org_id, profile_id, profile_2_id):
start_time = time.monotonic()
time_limit = 10
# Check get endpoint again and check that crawlconfigs is updated
while True:
try:
r = requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/profiles",
headers=admin_auth_headers,
)
assert r.status_code == 200
data = r.json()
assert data["total"] == 2
profiles = data["items"]
assert len(profiles) == 2
# Second profile should be listed first by default because it was
# modified more recently
profile_2 = profiles[0]
assert profile_2["id"] == profile_2_id
assert profile_2["name"] == PROFILE_2_NAME
assert profile_2["description"] == PROFILE_2_DESC
assert profile_2["userid"]
assert profile_2["oid"] == default_org_id
assert profile_2.get("origins") or data.get("origins") == []
assert profile_2["created"]
assert profile_2["createdBy"]
assert profile_2["createdByName"] == "admin"
assert profile_2["modified"]
assert profile_2["modifiedBy"]
assert profile_2["modifiedByName"] == "admin"
assert not profile_2["baseid"]
resource = profile_2["resource"]
assert resource
assert resource["filename"]
assert resource["hash"]
assert resource["size"]
assert resource["storage"]
assert resource["storage"]["name"]
assert resource.get("replicas") or resource.get("replicas") == []
# First profile should be listed second by default because it was
# modified less recently
profile_1 = profiles[1]
assert profile_1["id"] == profile_id
assert profile_1["name"] == PROFILE_NAME
assert profile_1["description"] == PROFILE_DESC
assert profile_1["userid"]
assert profile_1["oid"] == default_org_id
assert profile_1.get("origins") or data.get("origins") == []
assert profile_1["created"]
assert profile_1["createdBy"]
assert profile_1["createdByName"] == "admin"
assert profile_1["modified"]
assert profile_1["modifiedBy"]
assert profile_1["modifiedByName"] == "admin"
assert not profile_1["baseid"]
resource = profile_1["resource"]
assert resource
assert resource["filename"]
assert resource["hash"]
assert resource["size"]
assert resource["storage"]
assert resource["storage"]["name"]
assert resource.get("replicas") or resource.get("replicas") == []
break
except:
if time.monotonic() - start_time > time_limit:
raise
time.sleep(1)
def test_update_profile_metadata(crawler_auth_headers, default_org_id, profile_id):
    """Update name/description and verify modified advances while created stays."""
    profile_url = f"{API_PREFIX}/orgs/{default_org_id}/profiles/{profile_id}"

    # Capture created/modified timestamps before the update
    resp = requests.get(profile_url, headers=crawler_auth_headers)
    assert resp.status_code == 200
    before = resp.json()
    created_before = before["created"]
    modified_before = before["modified"]

    # Apply the metadata update
    resp = requests.patch(
        profile_url,
        headers=crawler_auth_headers,
        json={
            "name": PROFILE_NAME_UPDATED,
            "description": PROFILE_DESC_UPDATED,
        },
    )
    assert resp.status_code == 200
    assert resp.json()["updated"]

    time.sleep(5)

    # Re-fetch and confirm the new metadata took effect
    resp = requests.get(profile_url, headers=crawler_auth_headers)
    assert resp.status_code == 200
    after = resp.json()
    assert after["id"] == profile_id
    assert after["name"] == PROFILE_NAME_UPDATED
    assert after["description"] == PROFILE_DESC_UPDATED

    # Modified timestamp/user advanced; created is untouched
    assert after["modified"] > modified_before
    assert after["modifiedBy"]
    assert after["modifiedByName"] == "new-crawler"
    assert after["created"] == created_before
    assert after["createdBy"]
    assert after["createdByName"] == "admin"
def test_commit_browser_to_existing_profile(
    admin_auth_headers, default_org_id, profile_browser_3_id, profile_id
):
    """Commit a fresh browser onto an existing profile; created must not change."""
    profile_url = f"{API_PREFIX}/orgs/{default_org_id}/profiles/{profile_id}"

    # Record timestamps before the new commit
    resp = requests.get(profile_url, headers=admin_auth_headers)
    assert resp.status_code == 200
    before = resp.json()
    created_before = before["created"]
    modified_before = before["modified"]

    prepare_browser_for_profile_commit(
        profile_browser_3_id, admin_auth_headers, default_org_id
    )

    # Commit new browser to existing profile
    resp = requests.patch(
        profile_url,
        headers=admin_auth_headers,
        json={
            "browserid": profile_browser_3_id,
            "name": PROFILE_NAME_UPDATED,
            "description": PROFILE_DESC_UPDATED,
        },
    )
    assert resp.status_code == 200
    assert resp.json()["updated"]

    time.sleep(5)

    # Modified advanced; created unchanged
    resp = requests.get(profile_url, headers=admin_auth_headers)
    assert resp.status_code == 200
    after = resp.json()
    assert after["modified"] > modified_before
    assert after["modifiedBy"]
    assert after["modifiedByName"] == "admin"
    assert after["created"] == created_before
    assert after["createdBy"]
    assert after["createdByName"] == "admin"
@pytest.mark.parametrize(
    "sort_by,sort_direction,profile_1_index,profile_2_index",
    [
        # Modified, descending
        ("modified", -1, 0, 1),
        # Modified, ascending
        ("modified", 1, 1, 0),
        # Created, descending
        ("created", -1, 1, 0),
        # Created, ascending
        ("created", 1, 0, 1),
        # Name, descending
        ("name", -1, 0, 1),
        # Name, ascending
        ("name", 1, 1, 0),
        # URL, descending
        ("url", -1, 0, 1),
        # URL, ascending
        ("url", 1, 1, 0),
    ],
)
def test_sort_profiles(
    admin_auth_headers,
    default_org_id,
    profile_id,
    profile_2_id,
    sort_by,
    sort_direction,
    profile_1_index,
    profile_2_index,
):
    """Verify each sortBy/sortDirection combination orders the two profiles."""
    start_time = time.monotonic()
    time_limit = 10
    # Poll until the listing is stable, retrying transient failures
    while True:
        try:
            r = requests.get(
                f"{API_PREFIX}/orgs/{default_org_id}/profiles?sortBy={sort_by}&sortDirection={sort_direction}",
                headers=admin_auth_headers,
            )
            assert r.status_code == 200
            data = r.json()
            assert data["total"] == 2
            profiles = data["items"]
            assert len(profiles) == 2
            profile_1 = profiles[profile_1_index]
            assert profile_1["id"] == profile_id
            assert profile_1["name"] == PROFILE_NAME_UPDATED
            profile_2 = profiles[profile_2_index]
            assert profile_2["id"] == profile_2_id
            assert profile_2["name"] == PROFILE_2_NAME
            break
        except Exception:
            # Fix: narrowed from bare `except:` so Ctrl-C isn't swallowed
            if time.monotonic() - start_time > time_limit:
                raise
            time.sleep(1)
def test_delete_profile(admin_auth_headers, default_org_id, profile_2_id):
    """Delete the second profile, then confirm both GET and re-delete 404."""
    profile_url = f"{API_PREFIX}/orgs/{default_org_id}/profiles/{profile_2_id}"

    # Delete second profile
    resp = requests.delete(profile_url, headers=admin_auth_headers)
    assert resp.status_code == 200
    assert resp.json()["success"]

    # GET should now report the profile as missing
    resp = requests.get(profile_url, headers=admin_auth_headers)
    assert resp.status_code == 404
    assert resp.json()["detail"] == "profile_not_found"

    # A repeated delete must also return 404
    resp = requests.delete(profile_url, headers=admin_auth_headers)
    assert resp.status_code == 404
    assert resp.json()["detail"] == "profile_not_found"
def test_create_profile_read_only_org(
    admin_auth_headers, default_org_id, profile_browser_4_id
):
    """Profile creation must be rejected with 403 while the org is read-only."""
    # Set org to read-only
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/read-only",
        headers=admin_auth_headers,
        json={"readOnly": True, "readOnlyReason": "For testing purposes"},
    )
    assert r.json()["updated"]

    # Fix: the read-only flag was only reset on the straight-line path;
    # any failure below left the org read-only and poisoned later tests.
    # try/finally guarantees the reset always runs.
    try:
        prepare_browser_for_profile_commit(
            profile_browser_4_id, admin_auth_headers, default_org_id
        )

        # Try to create profile, verify we get 403 forbidden
        start_time = time.monotonic()
        time_limit = 300
        while True:
            try:
                r = requests.post(
                    f"{API_PREFIX}/orgs/{default_org_id}/profiles",
                    headers=admin_auth_headers,
                    json={
                        "browserid": profile_browser_4_id,
                        "name": "uncreatable",
                        "description": "because org is read-only",
                    },
                    timeout=10,
                )
                detail = r.json().get("detail")
                if detail == "waiting_for_browser":
                    time.sleep(5)
                    continue
                if detail == "org_set_to_read_only":
                    assert r.status_code == 403
                    break
            except Exception:
                # Narrowed from bare `except:`; retry until time limit
                if time.monotonic() - start_time > time_limit:
                    raise
                time.sleep(5)
    finally:
        # Set readOnly back to false on org
        r = requests.post(
            f"{API_PREFIX}/orgs/{default_org_id}/read-only",
            headers=admin_auth_headers,
            json={"readOnly": False},
        )
        assert r.json()["updated"]