Fixes #1597

New endpoints (replacing the old migration) to re-add crawl pages to the database from WACZs. After a few implementation attempts, we settled on using [remotezip](https://github.com/gtsystem/python-remotezip) to parse the zip files and stream the pages files line by line. I've also switched the sync log streaming over to remotezip, which lets us remove our own zip module and leave the complexity of zip parsing to remotezip. Database inserts for pages from WACZs are batched 100 at a time to speed up the endpoint, and the work is kicked off with asyncio.create_task so the response isn't blocked. StorageOps now has a method for streaming the bytes of any file in a remote WACZ, given only the presigned URL for the WACZ and the name of the file to stream.
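Roughly how the streaming and batching fit together (a minimal sketch, not the actual StorageOps/PageOps code; the helper names and the Motor-style `pages_coll` collection are illustrative only):

```python
# Sketch: stream pages JSONL out of a remote WACZ via its presigned URL and
# batch-insert the parsed pages 100 at a time. remotezip only fetches the
# byte ranges it needs, so the whole archive is never downloaded.
import json

from remotezip import RemoteZip


def stream_wacz_file_lines(presigned_url: str, filename: str):
    """Yield decoded lines of one member file inside a remote WACZ."""
    with RemoteZip(presigned_url) as rz:
        with rz.open(filename) as fh:
            for line in fh:
                yield line.decode("utf-8")


async def re_add_pages(presigned_url: str, pages_coll, batch_size: int = 100):
    """Parse page entries from the WACZ pages files and insert them in batches."""
    batch = []
    for name in ("pages/pages.jsonl", "pages/extraPages.jsonl"):
        for line in stream_wacz_file_lines(presigned_url, name):
            line = line.strip()
            if not line:
                continue
            page = json.loads(line)
            if "url" not in page:
                continue  # skip the JSONL header line
            batch.append(page)
            if len(batch) >= batch_size:
                await pages_coll.insert_many(batch)
                batch = []
    if batch:
        await pages_coll.insert_many(batch)
```

The endpoint would then schedule this work with something like `asyncio.create_task(re_add_pages(...))` and return `{"started": ...}` right away, which is what the `pages/reAdd` test below relies on before polling the pages list.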
import requests
import hashlib
import time
import io
import zipfile
import re
import csv
import codecs

from .conftest import API_PREFIX, HOST_PREFIX
from .test_collections import UPDATED_NAME as COLLECTION_NAME

wacz_path = None
wacz_size = None
wacz_hash = None

wacz_content = None

page_id = None


def test_list_orgs(admin_auth_headers, default_org_id):
    r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
    data = r.json()

    orgs = data["items"]
    assert len(orgs) > 0
    assert data["total"] > 0

    org_ids = []
    for org in orgs:
        org_ids.append(org["id"])

    assert default_org_id in org_ids


def test_create_new_config(admin_auth_headers, default_org_id):
    crawl_data = {
        "runNow": False,
        "name": "Test Crawl",
        "config": {"seeds": [{"url": "https://webrecorder.net/"}]},
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )

    assert r.status_code == 200

    data = r.json()
    assert data["added"]
    assert data["run_now_job"] is None
    assert data["storageQuotaReached"] is False


def test_wait_for_complete(admin_auth_headers, default_org_id, admin_crawl_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
        headers=admin_auth_headers,
    )
    data = r.json()
    assert data["state"] == "complete"

    assert len(data["resources"]) == 1
    assert data["resources"][0]["path"]

    # ensure filename matches specified pattern
    # set in default_crawl_filename_template
    assert re.search("/[\\d]+-testing-[\\w-]+\\.wacz", data["resources"][0]["path"])

    assert data["tags"] == ["wr-test-1", "wr-test-2"]

    global wacz_path
    global wacz_size
    global wacz_hash
    wacz_path = data["resources"][0]["path"]
    wacz_size = data["resources"][0]["size"]
    wacz_hash = data["resources"][0]["hash"]


def test_crawl_info(admin_auth_headers, default_org_id, admin_crawl_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    data = r.json()
    assert data["fileSize"] == wacz_size
    assert data["fileCount"] == 1
    assert data["userName"]


def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_crawl_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    data = r.json()
    assert data["firstSeed"] == "https://webrecorder.net/"
    assert data["seedCount"] == 1

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
        headers=admin_auth_headers,
    )
    data = r.json()
    crawls = data["items"]
    assert crawls
    for crawl in crawls:
        assert crawl["firstSeed"]
        assert crawl["seedCount"] > 0

    r = requests.get(
        f"{API_PREFIX}/orgs/all/crawls?runningOnly=0",
        headers=admin_auth_headers,
    )
    data = r.json()
    crawls = data["items"]
    assert crawls
    for crawl in crawls:
        assert crawl["firstSeed"]
        assert crawl["seedCount"] > 0


def test_crawl_seeds_endpoint(admin_auth_headers, default_org_id, admin_crawl_id):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/seeds",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200

    data = r.json()
    assert data["total"] == 1
    assert data["items"][0]["url"] == "https://webrecorder.net/"
    assert data["items"][0]["depth"] == 1


def test_crawls_exclude_errors(admin_auth_headers, default_org_id, admin_crawl_id):
    # Get endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data.get("errors") == []

    # replay.json endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data.get("errors") == []

    # List endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    crawls = r.json()["items"]
    for crawl in crawls:
        # check each listed crawl, not the previously fetched data
        assert crawl.get("errors") == []


def test_crawls_exclude_full_seeds(admin_auth_headers, default_org_id, admin_crawl_id):
    # Get endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    config = data.get("config")
    assert config is None or config.get("seeds") is None

    # replay.json endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    config = r.json().get("config")
    assert config is None or config.get("seeds") is None

    # List endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    crawls = r.json()["items"]
    for crawl in crawls:
        config = crawl.get("config")
        assert config is None or config.get("seeds") is None


def test_download_wacz():
    r = requests.get(HOST_PREFIX + wacz_path)
    assert r.status_code == 200
    assert len(r.content) == wacz_size

    h = hashlib.sha256()
    h.update(r.content)
    assert h.hexdigest() == wacz_hash, (h.hexdigest(), wacz_hash)

    global wacz_content
    wacz_content = r.content


def test_verify_wacz():
    b = io.BytesIO(wacz_content)
    z = zipfile.ZipFile(b)

    assert "pages/pages.jsonl" in z.namelist()

    # 1 seed page
    pages = z.open("pages/pages.jsonl").read().decode("utf-8")
    assert '"https://webrecorder.net/"' in pages

    # 1 seed page + header line
    assert len(pages.strip().split("\n")) == 2

    # 1 other page
    pages = z.open("pages/extraPages.jsonl").read().decode("utf-8")
    assert '"https://webrecorder.net/blog"' in pages

    # 3 other pages + header line
    assert len(pages.strip().split("\n")) == 4


def test_update_crawl(
    admin_auth_headers,
    default_org_id,
    admin_crawl_id,
):
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert sorted(data["tags"]) == ["wr-test-1", "wr-test-2"]
    assert len(data["collectionIds"]) == 1

    # Make new collection
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/collections",
        headers=admin_auth_headers,
        json={"name": "Crawl Update Test Collection"},
    )
    new_coll_id = r.json()["id"]

    # Submit patch request
    UPDATED_TAGS = ["wr-test-1-updated", "wr-test-2-updated"]
    UPDATED_DESC = "Lorem ipsum test note."
    UPDATED_NAME = "Updated crawl name"
    UPDATED_COLLECTION_IDS = [new_coll_id]
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
        json={
            "tags": UPDATED_TAGS,
            "description": UPDATED_DESC,
            "name": UPDATED_NAME,
            "collectionIds": UPDATED_COLLECTION_IDS,
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["updated"]

    # Verify update was successful
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
    assert data["description"] == UPDATED_DESC
    assert data["name"] == UPDATED_NAME
    assert data["collectionIds"] == UPDATED_COLLECTION_IDS
    assert data.get("reviewStatus") is None

    # Update reviewStatus and verify
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
        json={
            "reviewStatus": "good",
        },
    )
    assert r.status_code == 200
    data = r.json()
    assert data["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["reviewStatus"] == "good"

    # Try to update to invalid reviewStatus
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
        json={
            "reviewStatus": "invalid",
        },
    )
    assert r.status_code == 422

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["reviewStatus"] == "good"

    # Verify deleting works as well
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
        json={"tags": [], "description": None},
    )
    assert r.status_code == 200

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["tags"] == []
    assert not data["description"]


def test_crawl_stats_all_orgs_not_superadmin(crawler_auth_headers):
    r = requests.get(
        f"{API_PREFIX}/orgs/all/crawls/stats", headers=crawler_auth_headers
    )
    assert r.status_code == 403


def test_crawl_stats_all_orgs(admin_auth_headers):
    with requests.get(
        f"{API_PREFIX}/orgs/all/crawls/stats", headers=admin_auth_headers, stream=True
    ) as r:
        assert r.status_code == 200

        # Wait for stream content
        if not r.content:
            while True:
                if r.content:
                    break
                time.sleep(5)

        buffer = r.iter_lines()
        for row in csv.DictReader(
            codecs.iterdecode(buffer, "utf-8"), skipinitialspace=True
        ):
            assert row["id"]
            assert row["oid"]
            assert row["org"]
            assert row["cid"]
            assert row["name"] or row["name"] == ""
            assert row["state"]
            assert row["userid"]
            assert row["user"]
            assert row["started"]
            assert row["finished"] or row["finished"] is None
            assert row["duration"] or row["duration"] == 0
            assert row["pages"] or row["pages"] == 0
            assert row["filesize"] or row["filesize"] == 0
            assert row["avg_page_time"] or row["avg_page_time"] == 0


def test_crawl_stats(crawler_auth_headers, default_org_id):
    with requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/stats",
        headers=crawler_auth_headers,
        stream=True,
    ) as r:
        assert r.status_code == 200

        # Wait for stream content
        if not r.content:
            while True:
                if r.content:
                    break
                time.sleep(5)

        buffer = r.iter_lines()
        for row in csv.DictReader(
            codecs.iterdecode(buffer, "utf-8"), skipinitialspace=True
        ):
            assert row["id"]
            assert row["oid"] == default_org_id
            assert row["org"]
            assert row["cid"]
            assert row["name"] or row["name"] == ""
            assert row["state"]
            assert row["userid"]
            assert row["user"]
            assert row["started"]
            assert row["finished"] or row["finished"] is None
            assert row["duration"] or row["duration"] == 0
            assert row["pages"] or row["pages"] == 0
            assert row["filesize"] or row["filesize"] == 0
            assert row["avg_page_time"] or row["avg_page_time"] == 0


def test_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
    # Test GET list endpoint
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] >= 0

    pages = data["items"]
    assert pages

    for page in pages:
        assert page["id"]
        assert page["oid"]
        assert page["crawl_id"]
        assert page["url"]
        assert page["timestamp"]
        assert page.get("title") or page.get("title") is None
        assert page["load_state"]
        assert page["status"]

    # Test GET page endpoint
    global page_id
    page_id = pages[0]["id"]
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    page = r.json()

    assert page["id"] == page_id
    assert page["oid"]
    assert page["crawl_id"]
    assert page["url"]
    assert page["timestamp"]
    assert page.get("title") or page.get("title") is None
    assert page["load_state"]

    assert page["screenshotMatch"] == {}
    assert page["textMatch"] == {}
    assert page["resourceCounts"] == {}

    assert page["notes"] == []
    assert page.get("userid") is None
    assert page.get("modified") is None
    assert page.get("approved") is None

    # Update page with approval
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
        headers=crawler_auth_headers,
        json={
            "approved": True,
        },
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    page = r.json()

    assert page["id"] == page_id
    assert page["oid"]
    assert page["crawl_id"]
    assert page["url"]
    assert page["timestamp"]
    assert page.get("title") or page.get("title") is None
    assert page["load_state"]

    assert page["notes"] == []
    assert page["userid"]
    assert page["modified"]
    assert page["approved"]


def test_re_add_crawl_pages(crawler_auth_headers, default_org_id, crawler_crawl_id):
    # Re-add pages and verify they were correctly added
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/reAdd",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["started"]

    time.sleep(10)

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    assert data["total"] >= 0

    pages = data["items"]
    assert pages

    for page in pages:
        assert page["id"]
        assert page["oid"]
        assert page["crawl_id"]
        assert page["url"]
        assert page["timestamp"]
        assert page.get("title") or page.get("title") is None
        assert page["load_state"]
        assert page["status"]

    # Ensure only superuser can re-add pages for all crawls in an org
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/all/pages/reAdd",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 403


def test_crawl_page_notes(crawler_auth_headers, default_org_id, crawler_crawl_id):
    note_text = "testing"
    updated_note_text = "updated"
    untouched_text = "untouched"

    # Add note
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes",
        headers=crawler_auth_headers,
        json={"text": note_text},
    )
    assert r.status_code == 200
    assert r.json()["added"]

    # Check that note was added
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()

    assert len(data["notes"]) == 1

    first_note = data["notes"][0]

    first_note_id = first_note["id"]
    assert first_note_id

    assert first_note["created"]
    assert first_note["userid"]
    assert first_note["userName"]
    assert first_note["text"] == note_text

    # Add second note to test selective updates/deletes
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes",
        headers=crawler_auth_headers,
        json={"text": untouched_text},
    )
    assert r.status_code == 200
    assert r.json()["added"]

    # Edit first note
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes",
        headers=crawler_auth_headers,
        json={"text": updated_note_text, "id": first_note_id},
    )
    assert r.status_code == 200
    assert r.json()["updated"]

    # Verify notes look as expected
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    notes = data["notes"]

    assert len(notes) == 2

    updated_note = [note for note in notes if note["id"] == first_note_id][0]
    assert updated_note["text"] == updated_note_text

    second_note_id = [note["id"] for note in notes if note["text"] == untouched_text][0]
    assert second_note_id

    # Delete both notes
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}/notes/delete",
        headers=crawler_auth_headers,
        json={"delete_list": [first_note_id, second_note_id]},
    )
    assert r.status_code == 200
    assert r.json()["deleted"]

    # Verify notes were deleted
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages/{page_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    notes = data.get("notes")
    assert notes == []


def test_delete_crawls_crawler(
    crawler_auth_headers, default_org_id, admin_crawl_id, crawler_crawl_id
):
    # Test that crawler user can't delete another user's crawls
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=crawler_auth_headers,
        json={"crawl_ids": [admin_crawl_id]},
    )
    assert r.status_code == 403
    data = r.json()
    assert data["detail"] == "not_allowed"

    # Check that pages exist for crawl
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["total"] > 0

    # Test that crawler user can delete own crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=crawler_auth_headers,
        json={"crawl_ids": [crawler_crawl_id]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"] == 1
    assert data["storageQuotaReached"] is False

    time.sleep(5)

    # Test that crawl is not found after deleting
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 404

    # Test that associated pages are also deleted
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
        headers=crawler_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["total"] == 0


def test_delete_crawls_org_owner(
    admin_auth_headers,
    crawler_auth_headers,
    default_org_id,
    admin_crawl_id,
    crawler_crawl_id,
    wr_specs_crawl_id,
):
    # Test that org owner can delete own crawl
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [admin_crawl_id]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"]
    assert data["storageQuotaReached"] is False

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404

    # Test that org owner can delete another org user's crawls
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [wr_specs_crawl_id]},
    )
    assert r.status_code == 200
    data = r.json()
    assert data["deleted"] == 1

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{wr_specs_crawl_id}",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404