import requests import os from urllib.parse import urljoin from .conftest import API_PREFIX from .utils import read_in_chunks upload_id = None upload_id_2 = None upload_dl_path = None _coll_id = None curr_dir = os.path.dirname(os.path.realpath(__file__)) def test_upload_stream(admin_auth_headers, default_org_id, uploads_collection_id): with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh: r = requests.put( f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload&description=Testing%0AData&collections={uploads_collection_id}&tags=one%2Ctwo", headers=admin_auth_headers, data=read_in_chunks(fh), ) assert r.status_code == 200 assert r.json()["added"] global upload_id upload_id = r.json()["id"] def test_list_stream_upload(admin_auth_headers, default_org_id, uploads_collection_id): r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/uploads", headers=admin_auth_headers, ) results = r.json() assert len(results["items"]) > 0 found = None for res in results["items"]: if res["id"] == upload_id: found = res assert found assert found["name"] == "My Upload" assert found["description"] == "Testing\nData" assert found["collectionIds"] == [uploads_collection_id] assert sorted(found["tags"]) == ["one", "two"] assert "files" not in found assert "resources" not in found def test_get_stream_upload(admin_auth_headers, default_org_id, uploads_collection_id): r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json", headers=admin_auth_headers, ) assert r.status_code == 200 result = r.json() assert uploads_collection_id in result["collectionIds"] assert "files" not in result upload_dl_path = result["resources"][0]["path"] assert "test-" in result["resources"][0]["name"] assert result["resources"][0]["name"].endswith(".wacz") dl_path = urljoin(API_PREFIX, upload_dl_path) wacz_resp = requests.get(dl_path) actual = wacz_resp.content with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh: expected = fh.read() assert len(actual) == len(expected) assert actual == expected r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id}", headers=admin_auth_headers, ) assert r.status_code == 200 def test_upload_form(admin_auth_headers, default_org_id, uploads_collection_id): with open(os.path.join(curr_dir, "data", "example.wacz"), "rb") as fh: data = fh.read() files = [ ("uploads", ("test.wacz", data, "application/octet-stream")), ("uploads", ("test-2.wacz", data, "application/octet-stream")), ("uploads", ("test.wacz", data, "application/octet-stream")), ] r = requests.put( f"{API_PREFIX}/orgs/{default_org_id}/uploads/formdata?name=test2.wacz&collections={uploads_collection_id}&tags=three%2Cfour", headers=admin_auth_headers, files=files, ) assert r.status_code == 200 assert r.json()["added"] global upload_id_2 upload_id_2 = r.json()["id"] def test_list_uploads(admin_auth_headers, default_org_id, uploads_collection_id): r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/uploads", headers=admin_auth_headers, ) results = r.json() assert len(results["items"]) > 1 found = None for res in results["items"]: if res["id"] == upload_id_2: found = res assert found assert found["name"] == "test2.wacz" assert found["collectionIds"] == [uploads_collection_id] assert sorted(found["tags"]) == ["four", "three"] assert "files" not in res assert "resources" not in res def test_collection_uploads(admin_auth_headers, default_org_id, uploads_collection_id): # Test uploads filtered by collection r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/uploads?collectionId={uploads_collection_id}", headers=admin_auth_headers, ) results = r.json() assert len(results["items"]) == 2 assert results["items"][0]["id"] in (upload_id, upload_id_2) assert results["items"][1]["id"] in (upload_id, upload_id_2) # Test all crawls filtered by collection r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={uploads_collection_id}", headers=admin_auth_headers, ) results = r.json() assert len(results["items"]) == 2 assert results["items"][0]["id"] in (upload_id, upload_id_2) assert results["items"][1]["id"] in (upload_id, upload_id_2) def test_get_upload_replay_json( admin_auth_headers, default_org_id, uploads_collection_id ): r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/replay.json", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data assert data["id"] == upload_id assert data["name"] == "My Upload" assert data["collectionIds"] == [uploads_collection_id] assert sorted(data["tags"]) == ["one", "two"] assert data["resources"] assert data["resources"][0]["path"] assert data["resources"][0]["size"] assert data["resources"][0]["hash"] assert data["errors"] == None assert "files" not in data def test_get_upload_replay_json_admin( admin_auth_headers, default_org_id, uploads_collection_id ): r = requests.get( f"{API_PREFIX}/orgs/all/uploads/{upload_id}/replay.json", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data assert data["id"] == upload_id assert data["name"] == "My Upload" assert data["collectionIds"] == [uploads_collection_id] assert sorted(data["tags"]) == ["one", "two"] assert data["resources"] assert data["resources"][0]["path"] assert data["resources"][0]["size"] assert data["resources"][0]["hash"] assert data["errors"] == None assert "files" not in data def test_replace_upload(admin_auth_headers, default_org_id, uploads_collection_id): actual_id = do_upload_replace( admin_auth_headers, default_org_id, upload_id, uploads_collection_id ) assert upload_id == actual_id def do_upload_replace( admin_auth_headers, default_org_id, upload_id, uploads_collection_id ): with open(os.path.join(curr_dir, "data", "example-2.wacz"), "rb") as fh: r = requests.put( f"{API_PREFIX}/orgs/{default_org_id}/uploads/stream?filename=test.wacz&name=My%20Upload%20Updated&replaceId={upload_id}&collections={uploads_collection_id}", headers=admin_auth_headers, data=read_in_chunks(fh), ) assert r.status_code == 200 assert r.json()["added"] actual_id = r.json()["id"] r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/uploads/{actual_id}/replay.json", headers=admin_auth_headers, ) assert r.status_code == 200 result = r.json() # only one file, previous file removed assert len(result["resources"]) == 1 dl_path = urljoin(API_PREFIX, result["resources"][0]["path"]) wacz_resp = requests.get(dl_path) actual = wacz_resp.content with open(os.path.join(curr_dir, "data", "example-2.wacz"), "rb") as fh: expected = fh.read() assert len(actual) == len(expected) assert actual == expected return actual_id def test_update_upload_metadata(admin_auth_headers, default_org_id): r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data["name"] == "My Upload Updated" assert not data["tags"] assert not data["description"] assert len(data["collectionIds"]) == 1 # Make new collection r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/collections", headers=admin_auth_headers, json={"name": "Patch Update Test Collection"}, ) new_coll_id = r.json()["id"] # Submit patch request to update name, tags, and description UPDATED_NAME = "New Upload Name" UPDATED_TAGS = ["wr-test-1-updated", "wr-test-2-updated"] UPDATED_DESC = "Lorem ipsum test note." UPDATED_COLLECTION_IDS = [new_coll_id] r = requests.patch( f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}", headers=admin_auth_headers, json={ "tags": UPDATED_TAGS, "description": UPDATED_DESC, "name": UPDATED_NAME, "collectionIds": UPDATED_COLLECTION_IDS, }, ) assert r.status_code == 200 data = r.json() assert data["updated"] # Verify update was successful r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert sorted(data["tags"]) == sorted(UPDATED_TAGS) assert data["description"] == UPDATED_DESC assert data["name"] == UPDATED_NAME assert data["collectionIds"] == UPDATED_COLLECTION_IDS def test_delete_stream_upload(admin_auth_headers, default_org_id): r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/uploads/delete", headers=admin_auth_headers, json={"crawl_ids": [upload_id]}, ) assert r.json()["deleted"] == True def test_ensure_deleted(admin_auth_headers, default_org_id): r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/uploads", headers=admin_auth_headers, ) results = r.json() for res in results["items"]: if res["id"] == upload_id: assert False def test_replace_upload_non_existent( admin_auth_headers, default_org_id, uploads_collection_id ): global upload_id # same replacement, but now to a non-existent upload actual_id = do_upload_replace( admin_auth_headers, default_org_id, upload_id, uploads_collection_id ) # new upload_id created assert actual_id != upload_id upload_id = actual_id def test_verify_from_upload_resource_count(admin_auth_headers, default_org_id): r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id_2}/replay.json", headers=admin_auth_headers, ) assert r.status_code == 200 result = r.json() assert "files" not in result assert len(result["resources"]) == 3 r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id_2}", headers=admin_auth_headers, ) assert r.status_code == 200 def test_list_all_crawls(admin_auth_headers, default_org_id): """Test that /all-crawls lists crawls and uploads before deleting uploads""" r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() items = data["items"] assert len(items) == data["total"] crawls = [item for item in items if item["type"] == "crawl"] assert len(crawls) > 0 uploads = [item for item in items if item["type"] == "upload"] assert len(uploads) > 0 for item in items: assert item["type"] in ("crawl", "upload") if item["type"] == "crawl": assert item["firstSeed"] assert item["seedCount"] assert item.get("name") or item.get("name") == "" assert item["id"] assert item["userid"] assert item["oid"] == default_org_id assert item["started"] assert item["finished"] assert item["state"] def test_get_all_crawls_by_name(admin_auth_headers, default_org_id): """Test filtering /all-crawls by name""" r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name=test2.wacz", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data["total"] == 1 items = data["items"] assert items[0]["id"] == upload_id_2 assert items[0]["name"] == "test2.wacz" crawl_name = "Crawler User Test Crawl" r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?name={crawl_name}", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data["total"] == 3 for item in data["items"]: assert item["name"] == crawl_name def test_get_all_crawls_by_first_seed( admin_auth_headers, default_org_id, crawler_crawl_id ): """Test filtering /all-crawls by first seed""" first_seed = "https://webrecorder.net/" r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?firstSeed={first_seed}", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data["total"] == 3 for item in data["items"]: assert item["firstSeed"] == first_seed def test_get_all_crawls_by_type(admin_auth_headers, default_org_id, admin_crawl_id): """Test filtering /all-crawls by crawl type""" r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=crawl", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data["total"] == 3 for item in data["items"]: assert item["type"] == "crawl" r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=upload", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data["total"] == 3 for item in data["items"]: assert item["type"] == "upload" r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?crawlType=invalid", headers=admin_auth_headers, ) assert r.status_code == 400 assert r.json()["detail"] == "invalid_crawl_type" def test_get_all_crawls_by_user(admin_auth_headers, default_org_id, crawler_userid): """Test filtering /all-crawls by userid""" r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?userid={crawler_userid}", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data["total"] == 4 for item in data["items"]: assert item["userid"] == crawler_userid def test_get_all_crawls_by_cid( admin_auth_headers, default_org_id, all_crawls_config_id ): """Test filtering /all-crawls by cid""" r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?cid={all_crawls_config_id}", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data["total"] == 1 assert data["items"][0]["cid"] == all_crawls_config_id def test_get_all_crawls_by_state(admin_auth_headers, default_org_id, admin_crawl_id): """Test filtering /all-crawls by cid""" r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?state=complete,partial_complete", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data["total"] == 5 items = data["items"] for item in items: assert item["state"] in ("complete", "partial_complete") def test_get_all_crawls_by_collection_id( admin_auth_headers, default_org_id, admin_config_id, all_crawls_crawl_id ): """Test filtering /all-crawls by collection id""" # Create collection and add upload to it r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/collections", headers=admin_auth_headers, json={ "crawlIds": [all_crawls_crawl_id], "name": "all-crawls collection", }, ) assert r.status_code == 200 global _coll_id _coll_id = r.json()["id"] r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?collectionId={_coll_id}", headers=admin_auth_headers, ) assert r.status_code == 200 assert r.json()["total"] == 1 assert r.json()["items"][0]["id"] == all_crawls_crawl_id def test_sort_all_crawls(admin_auth_headers, default_org_id, admin_crawl_id): # Sort by started, descending (default) r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started", headers=admin_auth_headers, ) data = r.json() assert data["total"] == 7 items = data["items"] assert len(items) == 7 last_created = None for crawl in items: if last_created: assert crawl["started"] <= last_created last_created = crawl["started"] # Sort by started, ascending r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=1", headers=admin_auth_headers, ) data = r.json() items = data["items"] last_created = None for crawl in items: if last_created: assert crawl["started"] >= last_created last_created = crawl["started"] # Sort by finished r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished", headers=admin_auth_headers, ) data = r.json() items = data["items"] last_finished = None for crawl in items: if not crawl["finished"]: continue if last_finished: assert crawl["finished"] <= last_finished last_finished = crawl["finished"] # Sort by finished, ascending r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=finished&sortDirection=1", headers=admin_auth_headers, ) data = r.json() items = data["items"] last_finished = None for crawl in items: if not crawl["finished"]: continue if last_finished: assert crawl["finished"] >= last_finished last_finished = crawl["finished"] # Sort by fileSize r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize", headers=admin_auth_headers, ) data = r.json() items = data["items"] last_size = None for crawl in items: if last_size: assert crawl["fileSize"] <= last_size last_size = crawl["fileSize"] # Sort by fileSize, ascending r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=fileSize&sortDirection=1", headers=admin_auth_headers, ) data = r.json() items = data["items"] last_size = None for crawl in items: if last_size: assert crawl["fileSize"] >= last_size last_size = crawl["fileSize"] # Invalid sort value r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=invalid", headers=admin_auth_headers, ) assert r.status_code == 400 assert r.json()["detail"] == "invalid_sort_by" # Invalid sort_direction value r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls?sortBy=started&sortDirection=0", headers=admin_auth_headers, ) assert r.status_code == 400 assert r.json()["detail"] == "invalid_sort_direction" def test_all_crawls_search_values(admin_auth_headers, default_org_id): """Test that all-crawls search values return expected results""" r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert len(data["names"]) == 5 expected_names = [ "Crawler User Test Crawl", "My Upload Updated", "test2.wacz", "All Crawls Test Crawl", ] for expected_name in expected_names: assert expected_name in data["names"] assert sorted(data["descriptions"]) == ["Lorem ipsum"] assert sorted(data["firstSeeds"]) == ["https://webrecorder.net/"] # Test filtering by crawls r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values?crawlType=crawl", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert len(data["names"]) == 2 expected_names = [ "Crawler User Test Crawl", "All Crawls Test Crawl", ] for expected_name in expected_names: assert expected_name in data["names"] assert sorted(data["descriptions"]) == ["Lorem ipsum"] assert sorted(data["firstSeeds"]) == ["https://webrecorder.net/"] # Test filtering by uploads r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values?crawlType=upload", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert len(data["names"]) == 3 expected_names = [ "My Upload Updated", "test2.wacz", ] for expected_name in expected_names: assert expected_name in data["names"] assert sorted(data["descriptions"]) == [] assert sorted(data["firstSeeds"]) == [] # Test invalid filter r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/search-values?crawlType=invalid", headers=admin_auth_headers, ) assert r.status_code == 400 assert r.json()["detail"] == "invalid_crawl_type" def test_get_upload_from_all_crawls(admin_auth_headers, default_org_id): """Test that /all-crawls lists crawls and uploads before deleting uploads""" r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id_2}", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data["name"] == "test2.wacz" assert "files" not in data assert data["resources"] def test_get_upload_replay_json_from_all_crawls(admin_auth_headers, default_org_id): r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id_2}/replay.json", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data assert data["id"] == upload_id_2 assert data["name"] == "test2.wacz" assert data["resources"] assert data["resources"][0]["path"] assert data["resources"][0]["size"] assert data["resources"][0]["hash"] assert data["errors"] == None assert "files" not in data def test_get_upload_replay_json_admin_from_all_crawls( admin_auth_headers, default_org_id ): r = requests.get( f"{API_PREFIX}/orgs/all/all-crawls/{upload_id_2}/replay.json", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data assert data["id"] == upload_id_2 assert data["name"] == "test2.wacz" assert data["resources"] assert data["resources"][0]["path"] assert data["resources"][0]["size"] assert data["resources"][0]["hash"] assert data["errors"] == None assert "files" not in data def test_update_upload_metadata_all_crawls(admin_auth_headers, default_org_id): r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id}", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert data["name"] == "My Upload Updated" assert not data["tags"] assert not data["description"] assert len(data["collectionIds"]) == 1 # Make new collection r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/collections", headers=admin_auth_headers, json={"name": "Patch Update Test Collection 2"}, ) new_coll_id = r.json()["id"] # Submit patch request to update name, tags, and description UPDATED_NAME = "New Upload Name 2" UPDATED_TAGS = ["wr-test-1-updated-again", "wr-test-2-updated-again"] UPDATED_DESC = "Lorem ipsum test note 2." UPDATED_COLLECTION_IDS = [new_coll_id] r = requests.patch( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id}", headers=admin_auth_headers, json={ "tags": UPDATED_TAGS, "description": UPDATED_DESC, "name": UPDATED_NAME, "collectionIds": UPDATED_COLLECTION_IDS, }, ) assert r.status_code == 200 data = r.json() assert data["updated"] # Verify update was successful r = requests.get( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/{upload_id}", headers=admin_auth_headers, ) assert r.status_code == 200 data = r.json() assert sorted(data["tags"]) == sorted(UPDATED_TAGS) assert data["description"] == UPDATED_DESC assert data["name"] == UPDATED_NAME assert data["collectionIds"] == UPDATED_COLLECTION_IDS def test_delete_form_upload_from_all_crawls(admin_auth_headers, default_org_id): r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/all-crawls/delete", headers=admin_auth_headers, json={"crawl_ids": [upload_id_2]}, ) assert r.json()["deleted"] == True