Fixes #2459

- Set `/data/` as primary storage `access_endpoint_url` in nightly test chart
- Modify nightly test GH Actions workflow to spawn a separate job per nightly test module using dynamic matrix
- Set configuration not to fail other jobs if one job fails
- Modify failing tests:
  - Add fixture to background job nightly test module so it can run alone
  - Add retry loop to crawlconfig stats nightly test so it's less dependent on timing

GitHub limits each workflow to 256 jobs, so this should continue to be able to scale up for us without issue.

---------

Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
		
			
				
	
	
		
			387 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			387 lines
		
	
	
		
			11 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| import pytest
 | |
| import requests
 | |
| import time
 | |
| import datetime
 | |
| 
 | |
| 
 | |
# Base URL of the deployment under test, and the API root derived from it.
HOST_PREFIX = "http://127.0.0.1:30870"
API_PREFIX = f"{HOST_PREFIX}/api"

# Credentials used to log in as the admin user.
ADMIN_USERNAME = "admin@example.com"
ADMIN_PW = "PASSW0RD!"

# Credentials for the additional user that the test session adds to the
# default org and then logs in as.
CRAWLER_USERNAME = "crawlernightly@example.com"
CRAWLER_PW = "crawlerPASSWORD!"
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def admin_auth_headers():
    """Log in as the admin user and return bearer-token request headers.

    The login endpoint is polled every 5 seconds until a response carrying
    an ``access_token`` is received, so the fixture keeps waiting while the
    backend is still starting up.

    Returns:
        dict: ``{"Authorization": "Bearer <access_token>"}``.
    """
    while True:
        try:
            # The request and the JSON decoding live inside the ``try`` so a
            # refused connection or a non-JSON body (e.g. a gateway error
            # page) is retried instead of aborting the whole test session.
            r = requests.post(
                f"{API_PREFIX}/auth/jwt/login",
                data={
                    "username": ADMIN_USERNAME,
                    "password": ADMIN_PW,
                    "grant_type": "password",
                },
            )
            data = r.json()
            return {"Authorization": f"Bearer {data['access_token']}"}
        except (
            requests.exceptions.ConnectionError,  # API not reachable yet
            ValueError,  # response body was not valid JSON
            KeyError,  # JSON body without an access_token
            TypeError,  # JSON body that is not an object
        ):
            print("Waiting for admin_auth_headers")
            time.sleep(5)
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def default_org_id(admin_auth_headers):
    """Return the id of the org flagged as default in the org listing.

    Polls ``GET /orgs`` every 5 seconds until an entry whose ``default``
    field is ``True`` shows up.

    Args:
        admin_auth_headers: Authorization headers sent with the request.

    Returns:
        The ``id`` value of the default org.
    """
    while True:
        r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
        try:
            for org in r.json()["items"]:
                if org["default"] is True:
                    return org["id"]
        except (ValueError, KeyError, TypeError):
            # Body was not JSON or lacks the expected shape; treat it the
            # same as "default org not listed yet" and wait below.
            pass
        # Always pause before retrying. Previously a well-formed listing
        # without a default org re-requested immediately in a tight loop.
        print("Waiting for default org id")
        time.sleep(5)
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def crawler_auth_headers(admin_auth_headers, default_org_id):
    """Add a second user to the default org and return its auth headers.

    The add-user response is not inspected. The token is read with
    ``dict.get``, so a failed login yields the header ``"Bearer None"``
    rather than raising.

    Args:
        admin_auth_headers: Authorization headers used for the add-user call.
        default_org_id: Org the new user is added to.

    Returns:
        dict: ``{"Authorization": "Bearer <access_token>"}``.
    """
    new_user = {
        "email": CRAWLER_USERNAME,
        "password": CRAWLER_PW,
        "name": "new-crawler",
        # NOTE(review): 20 is presumably the crawler role level; confirm
        # against the backend's role definitions.
        "role": 20,
    }
    requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/add-user",
        json=new_user,
        headers=admin_auth_headers,
    )

    credentials = {
        "username": CRAWLER_USERNAME,
        "password": CRAWLER_PW,
        "grant_type": "password",
    }
    login = requests.post(f"{API_PREFIX}/auth/jwt/login", data=credentials)
    token = login.json().get("access_token")
    return {"Authorization": f"Bearer {token}"}
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def crawl_id_wr(admin_auth_headers, default_org_id):
    """Run a one-page crawl of webrecorder.net and return its crawl id.

    Creates a crawl config with ``runNow`` set, then polls the crawl's
    ``replay.json`` every 5 seconds until its state is ``"complete"``.

    Args:
        admin_auth_headers: Authorization headers for all requests.
        default_org_id: Org the crawl config is created in.

    Returns:
        The ``run_now_job`` value from the create response.
    """
    workflow = {
        "runNow": True,
        "name": "Webrecorder admin test crawl",
        "tags": ["wr", "nightly testing"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "limit": 1,
        },
    }
    created = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=workflow,
    ).json()
    crawl_id = created["run_now_job"]

    # Poll until the crawl reports completion, pausing between requests.
    replay_url = f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json"
    while (
        requests.get(replay_url, headers=admin_auth_headers).json()["state"]
        != "complete"
    ):
        time.sleep(5)
    return crawl_id
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def crawl_id_wr_specs(admin_auth_headers, default_org_id):
    """Run a one-page crawl of specs.webrecorder.net and return its crawl id.

    Creates a crawl config with ``runNow`` set, then polls the crawl's
    ``replay.json`` every 5 seconds until its state is ``"complete"``.

    Args:
        admin_auth_headers: Authorization headers for all requests.
        default_org_id: Org the crawl config is created in.

    Returns:
        The ``run_now_job`` value from the create response.
    """
    org_url = f"{API_PREFIX}/orgs/{default_org_id}"
    payload = {
        "runNow": True,
        "name": "Webrecorder Specs admin test crawl",
        "tags": ["wr-specs", "nightly testing"],
        "config": {
            "seeds": [{"url": "https://specs.webrecorder.net/"}],
            "limit": 1,
        },
    }
    response = requests.post(
        f"{org_url}/crawlconfigs/", headers=admin_auth_headers, json=payload
    )
    crawl_id = response.json()["run_now_job"]

    def _state():
        # Fetch the crawl's current state from its replay.json document.
        replay = requests.get(
            f"{org_url}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        return replay.json()["state"]

    while _state() != "complete":
        time.sleep(5)
    return crawl_id
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def crawl_config_info(admin_auth_headers, default_org_id):
    """Create a crawl config, run it twice, and return the resulting ids.

    The first crawl is started by ``runNow`` at creation time. Once it is
    complete, a second crawl is started through the config's ``/run``
    endpoint and awaited as well.

    Args:
        admin_auth_headers: Authorization headers for all requests.
        default_org_id: Org the crawl config is created in.

    Returns:
        tuple: ``(crawl_config_id, crawl_id, second_crawl_id)``.
    """
    org_url = f"{API_PREFIX}/orgs/{default_org_id}"

    def _block_until_complete(cid):
        # Poll the crawl's replay.json every 5 seconds until it is complete.
        replay_url = f"{org_url}/crawls/{cid}/replay.json"
        while (
            requests.get(replay_url, headers=admin_auth_headers).json()["state"]
            != "complete"
        ):
            time.sleep(5)

    # First crawl: started immediately when the config is created.
    created = requests.post(
        f"{org_url}/crawlconfigs/",
        headers=admin_auth_headers,
        json={
            "runNow": True,
            "name": "Crawl config test",
            "config": {
                "seeds": [{"url": "https://specs.webrecorder.net/"}],
                "limit": 1,
            },
        },
    ).json()
    crawl_config_id = created["id"]
    crawl_id = created["run_now_job"]
    _block_until_complete(crawl_id)

    # Second crawl: started explicitly from the existing config.
    rerun = requests.post(
        f"{org_url}/crawlconfigs/{crawl_config_id}/run",
        headers=admin_auth_headers,
    ).json()
    second_crawl_id = rerun["started"]
    _block_until_complete(second_crawl_id)

    return (crawl_config_id, crawl_id, second_crawl_id)
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def large_crawl_id(admin_auth_headers, default_org_id):
    """Start a domain-scoped crawl (limit 100) and return it while running.

    Polls ``replay.json`` every 5 seconds until the state is ``"running"``,
    then waits a further 30 seconds before handing back the id.

    Args:
        admin_auth_headers: Authorization headers for all requests.
        default_org_id: Org the crawl config is created in.

    Returns:
        The ``run_now_job`` value from the create response.
    """
    settings = {
        "runNow": True,
        "name": "Large Test Crawl",
        "tags": ["wacz-logs"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "scopeType": "domain",
            "limit": 100,
            "extraHops": 1,
        },
    }
    create_resp = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=settings,
    )
    crawl_id = create_resp.json()["run_now_job"]

    status_url = f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json"
    while (
        requests.get(status_url, headers=admin_auth_headers).json()["state"]
        != "running"
    ):
        time.sleep(5)

    # Give crawl time to start properly
    time.sleep(30)
    return crawl_id
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def large_crawl_finished(admin_auth_headers, default_org_id, large_crawl_id):
    """Block until the large crawl has completed; returns nothing.

    Polls ``replay.json`` every 5 seconds until the state is ``"complete"``
    and then waits another 30 seconds.

    Args:
        admin_auth_headers: Authorization headers for the status requests.
        default_org_id: Org that owns the crawl.
        large_crawl_id: Id of the crawl to wait for.
    """
    status_url = (
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{large_crawl_id}/replay.json"
    )

    def _current_state():
        return requests.get(status_url, headers=admin_auth_headers).json()["state"]

    while _current_state() != "complete":
        time.sleep(5)

    # Give some time for WACZ files to be stored
    time.sleep(30)
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def timeout_crawl(admin_auth_headers, default_org_id):
    """Start a crawl configured with ``crawlTimeout`` and return its id.

    The fixture returns as soon as the crawl is created; it does not wait
    for any particular crawl state.

    Args:
        admin_auth_headers: Authorization headers for the request.
        default_org_id: Org the crawl config is created in.

    Returns:
        The ``run_now_job`` value from the create response.
    """
    crawl_config = {
        "seeds": [{"url": "https://webrecorder.net/"}],
        "scopeType": "domain",
        "limit": 100,
    }
    create_resp = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json={
            "runNow": True,
            "name": "Crawl with crawl timeout",
            # NOTE(review): presumably seconds; confirm against the API.
            "crawlTimeout": 15,
            "config": crawl_config,
        },
    )
    return create_resp.json()["run_now_job"]
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def max_crawl_size_crawl_id(admin_auth_headers, default_org_id):
    """Start a crawl configured with ``maxCrawlSize`` and return its id.

    The fixture returns as soon as the crawl is created; it does not wait
    for any particular crawl state.

    Args:
        admin_auth_headers: Authorization headers for the request.
        default_org_id: Org the crawl config is created in.

    Returns:
        The ``run_now_job`` value from the create response.
    """
    # 5 MB expressed in bytes (5242880).
    size_limit = 5 * 1024 * 1024
    crawl_config = {
        "seeds": [{"url": "https://webrecorder.net/"}],
        "scopeType": "domain",
        "limit": 100,
    }
    body = {
        "runNow": True,
        "name": "Crawl with 5 MB max crawl size limit",
        # Note crawl will exceed this size, as crawl begins to gracefully
        # shut down when operator notices this value has been exceeded.
        "maxCrawlSize": size_limit,
        "config": crawl_config,
    }
    create_resp = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=body,
    )
    return create_resp.json()["run_now_job"]
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def error_crawl_id(admin_auth_headers, default_org_id):
    """Run a one-page crawl against an invalid host and return its crawl id.

    Creates the crawl config with ``runNow`` set, then polls the crawl's
    ``replay.json`` every 5 seconds until its state is ``"complete"``.

    Args:
        admin_auth_headers: Authorization headers for all requests.
        default_org_id: Org the crawl config is created in.

    Returns:
        The ``run_now_job`` value from the create response.
    """
    org_url = f"{API_PREFIX}/orgs/{default_org_id}"
    bad_seed = {"url": "https://invalid-x.webrecorder.net/"}
    create_resp = requests.post(
        f"{org_url}/crawlconfigs/",
        headers=admin_auth_headers,
        json={
            "runNow": True,
            "name": "Invalid URL crawl",
            "config": {"seeds": [bad_seed], "limit": 1},
        },
    )
    crawl_id = create_resp.json()["run_now_job"]

    replay_url = f"{org_url}/crawls/{crawl_id}/replay.json"
    while True:
        replay = requests.get(replay_url, headers=admin_auth_headers).json()
        if replay["state"] != "complete":
            time.sleep(5)
            continue
        return crawl_id
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def org_with_quotas(admin_auth_headers):
    """Create a new, uniquely named org and return its id.

    The name is ``"Quota Org "`` followed by the current UTC time in ISO
    format.

    Args:
        admin_auth_headers: Authorization headers for the create request.

    Returns:
        The ``id`` value from the org-creation response.
    """
    # datetime.utcnow() is deprecated as of Python 3.12. Take an aware UTC
    # timestamp instead and drop tzinfo so the generated name keeps exactly
    # the same format as before (no "+00:00" suffix).
    now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    name = "Quota Org " + now_utc.isoformat()
    r = requests.post(
        f"{API_PREFIX}/orgs/create", headers=admin_auth_headers, json={"name": name}
    )
    data = r.json()

    return data["id"]
 | |
| 
 | |
| 
 | |
@pytest.fixture(scope="session")
def deleted_crawl_id(admin_auth_headers, default_org_id):
    """Run a crawl, delete it, and return the id of the deleted crawl.

    Steps, in order:
      1. create a one-page crawl and wait until its state is ``"complete"``;
      2. wait until the org lists exactly one successful ``create-replica``
         background job;
      3. delete the crawl;
      4. wait until the org lists exactly one successful ``delete-replica``
         background job.

    Args:
        admin_auth_headers: Authorization headers for all requests.
        default_org_id: Org the crawl is created in and deleted from.

    Returns:
        The ``run_now_job`` value from the create response.
    """
    org_url = f"{API_PREFIX}/orgs/{default_org_id}"

    def _await_single_successful_job(job_type):
        # NOTE(review): the exact ``== 1`` check never passes if the org
        # has more than one successful job of this type; confirm that is
        # intended when other crawls share the org.
        jobs_url = f"{org_url}/jobs/?jobType={job_type}&success=True"
        while True:
            jobs = requests.get(jobs_url, headers=admin_auth_headers)
            assert jobs.status_code == 200
            if jobs.json()["total"] == 1:
                return
            time.sleep(5)

    # Start crawl.
    create_resp = requests.post(
        f"{org_url}/crawlconfigs/",
        headers=admin_auth_headers,
        json={
            "runNow": True,
            "name": "Test crawl",
            "config": {
                "seeds": [{"url": "https://webrecorder.net/"}],
                "limit": 1,
            },
        },
    )
    crawl_id = create_resp.json()["run_now_job"]

    # Wait for it to complete
    replay_url = f"{org_url}/crawls/{crawl_id}/replay.json"
    while (
        requests.get(replay_url, headers=admin_auth_headers).json()["state"]
        != "complete"
    ):
        time.sleep(5)

    # Wait until replica background job completes
    _await_single_successful_job("create-replica")

    # Delete crawl
    delete_resp = requests.post(
        f"{org_url}/crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [crawl_id]},
    )
    assert delete_resp.status_code == 200

    # Wait until delete replica background job completes
    _await_single_successful_job("delete-replica")

    return crawl_id
 |