Fixes #1395

- Adds new `POST /orgs/<orgid>/jobs/retryFailed` API endpoint to retry all failed background jobs for a specific org.
- Also adds `POST /orgs/all/jobs/retryFailed` for superadmins to retry all failed background jobs across all orgs.
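For reviewers, here is a minimal sketch of how the new endpoints could be exercised from the nightly test suite. The `success` response field and the test names are assumptions for illustration, not something this PR confirms; the fixtures referenced are the session fixtures from the nightly conftest below.

```python
import requests

API_PREFIX = "http://127.0.0.1:30870/api"


def test_retry_failed_org_jobs(admin_auth_headers, default_org_id):
    # Per-org endpoint: retries all failed background jobs for one org.
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/jobs/retryFailed",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    # Response shape is assumed; adjust to the actual API contract.
    assert r.json().get("success")


def test_retry_failed_all_org_jobs(admin_auth_headers):
    # Superadmin-only endpoint: retries failed jobs across all orgs.
    r = requests.post(
        f"{API_PREFIX}/orgs/all/jobs/retryFailed",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
```

The `admin_auth_headers` and `default_org_id` fixtures used here are defined in the nightly-test conftest that follows.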
import datetime
import time

import pytest
import requests


HOST_PREFIX = "http://127.0.0.1:30870"
API_PREFIX = HOST_PREFIX + "/api"

ADMIN_USERNAME = "admin@example.com"
ADMIN_PW = "PASSW0RD!"

CRAWLER_USERNAME = "crawlernightly@example.com"
CRAWLER_PW = "crawlerPASSWORD!"


@pytest.fixture(scope="session")
def admin_auth_headers():
    # Retry until the backend is up and the admin login succeeds.
    while True:
        r = requests.post(
            f"{API_PREFIX}/auth/jwt/login",
            data={
                "username": ADMIN_USERNAME,
                "password": ADMIN_PW,
                "grant_type": "password",
            },
        )
        data = r.json()
        try:
            return {"Authorization": f"Bearer {data['access_token']}"}
        except KeyError:
            print("Waiting for admin_auth_headers")
            time.sleep(5)


@pytest.fixture(scope="session")
def default_org_id(admin_auth_headers):
    # Retry until the default org is available.
    while True:
        r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
        data = r.json()
        try:
            for org in data["items"]:
                if org["default"] is True:
                    return org["id"]
        except KeyError:
            print("Waiting for default org id")
            time.sleep(5)


@pytest.fixture(scope="session")
def crawler_auth_headers(admin_auth_headers, default_org_id):
    # Add a crawler user (role 20) to the default org, then log in as it.
    requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/add-user",
        json={
            "email": CRAWLER_USERNAME,
            "password": CRAWLER_PW,
            "name": "new-crawler",
            "description": "crawler test crawl",
            "role": 20,
        },
        headers=admin_auth_headers,
    )
    r = requests.post(
        f"{API_PREFIX}/auth/jwt/login",
        data={
            "username": CRAWLER_USERNAME,
            "password": CRAWLER_PW,
            "grant_type": "password",
        },
    )
    data = r.json()
    access_token = data.get("access_token")
    return {"Authorization": f"Bearer {access_token}"}


@pytest.fixture(scope="session")
def crawl_id_wr(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Webrecorder admin test crawl",
        "tags": ["wr", "nightly testing"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]

    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def crawl_id_wr_specs(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Webrecorder Specs admin test crawl",
        "tags": ["wr-specs", "nightly testing"],
        "config": {
            "seeds": [{"url": "https://specs.webrecorder.net/"}],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]

    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def crawl_config_info(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Crawl config test",
        "config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1},
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_config_id = data["id"]
    crawl_id = data["run_now_job"]
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            break
        time.sleep(5)

    # Run second crawl from crawlconfig and return info when it finishes
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{crawl_config_id}/run",
        headers=admin_auth_headers,
    )
    data = r.json()
    second_crawl_id = data["started"]
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{second_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return (crawl_config_id, crawl_id, second_crawl_id)
        time.sleep(5)


@pytest.fixture(scope="session")
def large_crawl_id(admin_auth_headers, default_org_id):
    # Start crawl
    crawl_data = {
        "runNow": True,
        "name": "Large Test Crawl",
        "tags": ["wacz-logs"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "scopeType": "domain",
            "limit": 100,
            "extraHops": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]

    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "running":
            # Give crawl time to start properly
            time.sleep(30)
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def large_crawl_finished(admin_auth_headers, default_org_id, large_crawl_id):
    # Wait for crawl to complete
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{large_crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            # Give some time for WACZ files to be stored
            time.sleep(30)
            break
        time.sleep(5)


@pytest.fixture(scope="session")
def timeout_crawl(admin_auth_headers, default_org_id):
    # Start crawl
    crawl_data = {
        "runNow": True,
        "name": "Crawl with crawl timeout",
        "crawlTimeout": 15,
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "scopeType": "domain",
            "limit": 100,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()
    return data["run_now_job"]


@pytest.fixture(scope="session")
def max_crawl_size_crawl_id(admin_auth_headers, default_org_id):
    # Start crawl
    crawl_data = {
        "runNow": True,
        "name": "Crawl with 5 MB max crawl size limit",
        # Note: the crawl will exceed this size, since it only begins to
        # gracefully shut down once the operator notices the limit has
        # been exceeded.
        "maxCrawlSize": 5242880,
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            "scopeType": "domain",
            "limit": 100,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()
    return data["run_now_job"]


@pytest.fixture(scope="session")
def error_crawl_id(admin_auth_headers, default_org_id):
    crawl_data = {
        "runNow": True,
        "name": "Invalid URL crawl",
        "config": {
            "seeds": [
                {"url": "https://invalid-x.webrecorder.net/"},
            ],
            "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]

    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def org_with_quotas(admin_auth_headers):
    name = "Quota Org " + datetime.datetime.utcnow().isoformat()
    r = requests.post(
        f"{API_PREFIX}/orgs/create", headers=admin_auth_headers, json={"name": name}
    )
    data = r.json()

    return data["id"]
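One observation on the conftest: the replay.json polling loop is repeated across most of the fixtures above. A small helper along these lines (hypothetical, not part of this PR) could consolidate it:

```python
def wait_for_crawl_state(org_id, crawl_id, headers, state="complete", poll=5):
    # Poll the crawl's replay.json until it reaches the desired state.
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{org_id}/crawls/{crawl_id}/replay.json",
            headers=headers,
        )
        if r.json()["state"] == state:
            return crawl_id
        time.sleep(poll)
```

With that, a fixture like `crawl_id_wr` would reduce to creating the crawlconfig and then `return wait_for_crawl_state(default_org_id, crawl_id, admin_auth_headers)`.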