* backend: max pages per crawl limit, part of fix for #716:
  - set 'max_pages_crawl_limit' in values.yaml, defaulting to 100,000
  - if set/non-zero, automatically apply the limit when a config provides none
  - if set/non-zero, return 400 when adding a config whose limit exceeds the max
  - return the limit as 'maxPagesPerCrawl' in /api/settings
  - api: /all/crawls - add runningOnly=0 to show all crawls; defaults to 1/true (for more reliable testing)

  tests: add tests for the 'max_pages_per_crawl' setting (see the sketches below and after the fixtures):
  - ensure 'limit' cannot be set higher than max_pages_per_crawl
  - ensure the number of pages crawled is at the limit
  - set the test limit to a max of 2 pages
  - add a settings test
  - check for pages.jsonl and extraPages.jsonl when crawling 2 pages
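A minimal sketch of what the settings test could look like, written against the same test deployment the fixtures below use (HOST_PREFIX http://127.0.0.1:30870). The 'maxPagesPerCrawl' key is named in the commit message; the expected value of 2 assumes the test chart's max_pages_crawl_limit of 2 pages, and any other response fields are not asserted because their shape is an assumption.

import requests

API_PREFIX = "http://127.0.0.1:30870/api"


def test_settings_max_pages_per_crawl():
    # /api/settings should expose the global page limit added by this change.
    r = requests.get(f"{API_PREFIX}/settings")
    assert r.status_code == 200
    data = r.json()
    # Assumes the test deployment sets max_pages_crawl_limit to 2.
    assert data["maxPagesPerCrawl"] == 2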
import pytest
import requests
import time


HOST_PREFIX = "http://127.0.0.1:30870"
API_PREFIX = HOST_PREFIX + "/api"

ADMIN_USERNAME = "admin@example.com"
ADMIN_PW = "PASSW0RD!"

VIEWER_USERNAME = "viewer@example.com"
VIEWER_PW = "viewerPASSW0RD!"

CRAWLER_USERNAME = "crawler@example.com"
CRAWLER_PW = "crawlerPASSWORD!"

_admin_config_id = None
_crawler_config_id = None

NON_DEFAULT_ORG_NAME = "Non-default org"

@pytest.fixture(scope="session")
def admin_auth_headers():
    while True:
        r = requests.post(
            f"{API_PREFIX}/auth/jwt/login",
            data={
                "username": ADMIN_USERNAME,
                "password": ADMIN_PW,
                "grant_type": "password",
            },
        )
        data = r.json()
        try:
            return {"Authorization": f"Bearer {data['access_token']}"}
        except KeyError:
            # No token in the response yet -- the API may still be
            # starting up, so retry.
            print("Waiting for admin_auth_headers")
            time.sleep(5)


@pytest.fixture(scope="session")
def default_org_id(admin_auth_headers):
    while True:
        r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
        data = r.json()
        try:
            for org in data["items"]:
                if org["default"] is True:
                    return org["id"]
        except KeyError:
            print("Waiting for default org id")
        # Sleep in both paths so a response without a default org
        # does not busy-loop.
        time.sleep(5)

@pytest.fixture(scope="session")
def non_default_org_id(admin_auth_headers):
    r = requests.post(
        f"{API_PREFIX}/orgs/create",
        headers=admin_auth_headers,
        json={"name": NON_DEFAULT_ORG_NAME},
    )
    assert r.status_code == 200

    while True:
        r = requests.get(f"{API_PREFIX}/orgs", headers=admin_auth_headers)
        data = r.json()
        try:
            for org in data["items"]:
                if org["name"] == NON_DEFAULT_ORG_NAME:
                    return org["id"]
        except KeyError:
            print("Waiting for non-default org id")
        # The new org may not be listed yet -- wait and poll again.
        time.sleep(5)

@pytest.fixture(scope="session")
def admin_crawl_id(admin_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Admin Test Crawl",
        "description": "Admin Test Crawl description",
        "tags": ["wr-test-1", "wr-test-2"],
        "config": {
            "seeds": [{"url": "https://webrecorder.net/"}],
            # limit now set via 'max_pages_per_crawl' global limit
            # "limit": 1,
        },
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _admin_config_id
    _admin_config_id = data["added"]

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=admin_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def admin_config_id(admin_crawl_id):
    return _admin_config_id

@pytest.fixture(scope="session")
def viewer_auth_headers(admin_auth_headers, default_org_id):
    requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/add-user",
        json={
            "email": VIEWER_USERNAME,
            "password": VIEWER_PW,
            "name": "newviewer",
            "role": 10,
        },
        headers=admin_auth_headers,
    )
    r = requests.post(
        f"{API_PREFIX}/auth/jwt/login",
        data={
            "username": VIEWER_USERNAME,
            "password": VIEWER_PW,
            "grant_type": "password",
        },
    )
    data = r.json()
    access_token = data.get("access_token")
    return {"Authorization": f"Bearer {access_token}"}


@pytest.fixture(scope="session")
def crawler_auth_headers(admin_auth_headers, default_org_id):
    requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/add-user",
        json={
            "email": CRAWLER_USERNAME,
            "password": CRAWLER_PW,
            "name": "new-crawler",
            "description": "crawler test crawl",
            "role": 20,
        },
        headers=admin_auth_headers,
    )
    r = requests.post(
        f"{API_PREFIX}/auth/jwt/login",
        data={
            "username": CRAWLER_USERNAME,
            "password": CRAWLER_PW,
            "grant_type": "password",
        },
    )
    data = r.json()
    access_token = data.get("access_token")
    return {"Authorization": f"Bearer {access_token}"}


@pytest.fixture(scope="session")
def crawler_userid(crawler_auth_headers):
    r = requests.get(f"{API_PREFIX}/users/me", headers=crawler_auth_headers)
    return r.json()["id"]

@pytest.fixture(scope="session")
def crawler_crawl_id(crawler_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Crawler User Test Crawl",
        "description": "crawler test crawl",
        "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 1},
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    global _crawler_config_id
    _crawler_config_id = data["added"]

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def wr_specs_crawl_id(crawler_auth_headers, default_org_id):
    # Start crawl.
    crawl_data = {
        "runNow": True,
        "name": "Webrecorder Specs sample crawl",
        "config": {"seeds": [{"url": "https://specs.webrecorder.net/"}], "limit": 1},
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    data = r.json()

    crawl_id = data["run_now_job"]
    # Wait for it to complete and then return crawl ID
    while True:
        r = requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/replay.json",
            headers=crawler_auth_headers,
        )
        data = r.json()
        if data["state"] == "complete":
            return crawl_id
        time.sleep(5)


@pytest.fixture(scope="session")
def crawler_config_id(crawler_crawl_id):
    return _crawler_config_id


@pytest.fixture(scope="session")
def sample_crawl_data():
    return {
        "runNow": False,
        "name": "Test Crawl",
        "config": {"seeds": [{"url": "https://example.com/"}]},
        "tags": ["tag1", "tag2"],
    }
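

To round out the commit message's test list, a hedged sketch of the over-limit check, written as if it lived in a test module alongside these fixtures (so requests and API_PREFIX are in scope). The 400 status is what the backend change promises; the oversized limit value and the test name are illustrative assumptions.

def test_limit_above_max_pages_rejected(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    # Copy the sample data so the session-scoped fixture is not mutated.
    crawl_data = dict(sample_crawl_data)
    crawl_data["config"] = {
        "seeds": [{"url": "https://example.com/"}],
        # Far above the test deployment's max of 2 pages (an assumption).
        "limit": 1_000_000,
    }
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=crawler_auth_headers,
        json=crawl_data,
    )
    # Configs whose limit exceeds max_pages_crawl_limit should be refused.
    assert r.status_code == 400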