Allow configurable max pages per crawl in deployment settings (#717)

* backend: max pages per crawl limit, part of fix for #716:
- set 'max_pages_per_crawl' in values.yaml; the chart default is 0 (no global limit)
- if set to a non-zero value, automatically apply it as the page limit when a workflow config provides none
- if set to a non-zero value, return 400 when adding or updating a config whose limit exceeds the max (see the request sketch below)
- return the limit as 'maxPagesPerCrawl' in /api/settings
- api: /orgs/all/crawls - add 'runningOnly' query param; set runningOnly=0 to list all crawls, defaults to 1/true (for more reliable testing)

* tests: add tests for the 'max_pages_per_crawl' setting
- ensure 'limit' cannot be set higher than max_pages_per_crawl
- ensure the number of pages crawled is at the limit
- set the test limit to a max of 2 pages
- add /api/settings test
- check for pages.jsonl and extraPages.jsonl when crawling 2 pages
Ilya Kreymer 2023-03-28 16:26:29 -07:00 committed by GitHub
parent 948cce3d30
commit 887cb16146
10 changed files with 131 additions and 6 deletions
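As a rough end-to-end illustration of the behavior described above (not part of this commit): the sketch below assumes a deployment where max_pages_per_crawl is set to 2, as in the test chart values further down; the base URL, org id, and auth headers are placeholders.

import requests

API_PREFIX = "http://localhost:30870/api"  # placeholder base URL (port from the test values below)
org_id = "<org-uuid>"                      # placeholder org id
auth_headers = {"Authorization": "Bearer <token>"}  # placeholder superuser credentials

# the global limit is exposed as 'maxPagesPerCrawl' via /api/settings (0 = no limit)
max_pages = requests.get(f"{API_PREFIX}/settings").json()["maxPagesPerCrawl"]

# adding a workflow whose 'limit' exceeds the global max is rejected with a 400
crawl_data = {
    "runNow": False,
    "name": "Over-limit Crawl",
    "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": max_pages + 1},
}
r = requests.post(
    f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/",
    headers=auth_headers,
    json=crawl_data,
)
assert r.status_code == 400
assert r.json()["detail"] == "crawl_page_limit_exceeds_allowed"

# /orgs/all/crawls (superuser only) returns only running crawls by default;
# pass runningOnly=0 to list all crawls
r = requests.get(f"{API_PREFIX}/orgs/all/crawls?runningOnly=0", headers=auth_headers)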

@@ -7,6 +7,7 @@ from enum import Enum
import uuid
import asyncio
import re
import os
from datetime import datetime
import urllib.parse
@@ -20,6 +21,8 @@ from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .db import BaseMongoModel
# pylint: disable=too-many-lines
# ============================================================================
class JobType(str, Enum):
@@ -222,7 +225,7 @@ class UpdateCrawlConfig(BaseModel):
# ============================================================================
# pylint: disable=too-many-instance-attributes,too-many-arguments
# pylint: disable=too-many-instance-attributes,too-many-arguments,too-many-public-methods
class CrawlConfigOps:
"""Crawl Config Operations"""
@@ -246,6 +249,8 @@ class CrawlConfigOps:
self.coll_ops = None
self._file_rx = re.compile("\\W+")
self.max_pages_per_crawl = int(os.environ.get("MAX_PAGES_PER_CRAWL", 0))
def set_crawl_ops(self, ops):
"""set crawl ops reference"""
self.crawl_ops = ops
@@ -295,6 +300,9 @@ class CrawlConfigOps:
user: User,
):
"""Add new crawl config"""
self.validate_crawl_limit(config.config)
data = config.dict()
data["oid"] = org.id
data["createdBy"] = user.id
@@ -361,6 +369,8 @@ class CrawlConfigOps:
if not orig_crawl_config:
raise HTTPException(status_code=400, detail="config_not_found")
self.validate_crawl_limit(update.config)
# indicates if any k8s crawl config settings changed
changed = False
changed = changed or (
@@ -857,6 +867,18 @@ class CrawlConfigOps:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
def validate_crawl_limit(self, config: Optional[RawCrawlConfig]):
"""Ensure max pages per crawl limit is not exceeded.
Set limit if not provided. if provided config exceeds limit, raise exception
"""
if config and self.max_pages_per_crawl:
if config.limit <= 0:
config.limit = self.max_pages_per_crawl
elif config.limit > self.max_pages_per_crawl:
raise HTTPException(
status_code=400, detail="crawl_page_limit_exceeds_allowed"
)
# ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments

@@ -779,6 +779,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
description: Optional[str] = None,
sortBy: Optional[str] = None,
sortDirection: Optional[int] = -1,
runningOnly: Optional[bool] = True,
):
if not user.is_superuser:
raise HTTPException(status_code=403, detail="Not Allowed")
@@ -799,7 +800,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
None,
userid=userid,
cid=cid,
running_only=True,
running_only=runningOnly,
state=state,
first_seed=firstSeed,
name=name,

@@ -54,6 +54,7 @@ def main():
"defaultBehaviorTimeSeconds": int(
os.environ.get("DEFAULT_BEHAVIOR_TIME_SECONDS", 300)
),
"maxPagesPerCrawl": int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)),
}
invites = init_invites(mdb, email)

@@ -85,7 +85,8 @@ def admin_crawl_id(admin_auth_headers, default_org_id):
"tags": ["wr-test-1", "wr-test-2"],
"config": {
"seeds": [{"url": "https://webrecorder.net/"}],
"limit": 1,
# limit now set via 'max_pages_per_crawl' global limit
# "limit": 1,
},
}
r = requests.post(

@@ -86,6 +86,46 @@ def test_verify_update(crawler_auth_headers, default_org_id):
assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
def test_update_config_invalid_format(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={
"config": {
"seeds": ["https://example.com/"],
"scopeType": "domain",
"limit": 10,
}
},
)
assert r.status_code == 422
def test_update_config_invalid_limit(
crawler_auth_headers, default_org_id, sample_crawl_data
):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
headers=crawler_auth_headers,
json={
"config": {
"seeds": [{"url": "https://example.com/"}],
"scopeType": "domain",
"limit": 10,
}
},
)
assert r.status_code == 400
data = r.json()
assert data["detail"] == "crawl_page_limit_exceeds_allowed"
def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
r = requests.patch(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",

@@ -27,10 +27,29 @@ def test_list_orgs(admin_auth_headers, default_org_id):
assert default_org_id in org_ids
def test_create_new_config(admin_auth_headers, default_org_id):
def test_create_new_config_invalid_limit(admin_auth_headers, default_org_id):
crawl_data = {
"runNow": True,
"name": "Test Crawl",
"config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 10},
}
r = requests.post(
f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
headers=admin_auth_headers,
json=crawl_data,
)
assert r.status_code == 400
data = r.json()
assert data["detail"] == "crawl_page_limit_exceeds_allowed"
def test_create_new_config(admin_auth_headers, default_org_id):
crawl_data = {
"runNow": False,
"name": "Test Crawl",
"config": {"seeds": [{"url": "https://webrecorder.net/"}]},
}
r = requests.post(
@@ -43,7 +62,7 @@ def test_create_new_config(admin_auth_headers, default_org_id):
data = r.json()
assert data["added"]
assert data["run_now_job"]
assert data["run_now_job"] == None
def test_wait_for_complete(admin_auth_headers, default_org_id, admin_crawl_id):
@@ -98,7 +117,7 @@ def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_craw
assert crawl["seedCount"] > 0
r = requests.get(
f"{API_PREFIX}/orgs/all/crawls",
f"{API_PREFIX}/orgs/all/crawls?runningOnly=0",
headers=admin_auth_headers,
)
data = r.json()
@@ -128,9 +147,20 @@ def test_verify_wacz():
assert "pages/pages.jsonl" in z.namelist()
# 1 seed page
pages = z.open("pages/pages.jsonl").read().decode("utf-8")
assert '"https://webrecorder.net/"' in pages
# 1 seed page + header line
assert len(pages.strip().split("\n")) == 2
# 1 other page
pages = z.open("pages/extraPages.jsonl").read().decode("utf-8")
assert '"https://webrecorder.net/blog"' in pages
# 1 other page + header line
assert len(pages.strip().split("\n")) == 2
def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
r = requests.get(

@@ -0,0 +1,17 @@
import requests
from .conftest import API_PREFIX
def test_settings():
r = requests.get(f"{API_PREFIX}/settings")
assert r.status_code == 200
data = r.json()
assert data == {
"registrationEnabled": False,
"jwtTokenLifetime": 86400,
"defaultBehaviorTimeSeconds": 300,
"maxPagesPerCrawl": 2,
}

@@ -54,6 +54,8 @@ data:
DEFAULT_BEHAVIOR_TIME_SECONDS: "{{ .Values.default_behavior_time_seconds }}"
MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
WEB_CONCURRENCY: "{{ .Values.backend_workers | default 4 }}"
IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"

@@ -20,3 +20,9 @@ superuser:
local_service_port: 30870
# test max pages per crawl global limit
max_pages_per_crawl: 2
registration_enabled: "0"

@@ -18,6 +18,11 @@ jwt_token_lifetime_minutes: 1440
# default time to run behaviors on each page (in seconds)
default_behavior_time_seconds: 300
# max pages per crawl
# set to non-zero value to enforce global max pages per crawl limit
# if set, each workflow can have a lower limit, but not higher
max_pages_per_crawl: 0
# if set to "1", allow inviting same user to same org multiple times
allow_dupe_invites: "0"
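
For illustration only (not part of the chart or backend code), the rule described by the max_pages_per_crawl comments above can be restated as a small standalone Python helper; apply_page_limit is a hypothetical name, the actual enforcement lives in validate_crawl_limit earlier in this commit.

def apply_page_limit(workflow_limit, max_pages_per_crawl):
    """Hypothetical restatement of the global page-limit rule."""
    if not max_pages_per_crawl:
        # 0 / unset: no global limit is enforced
        return workflow_limit
    if not workflow_limit:
        # no per-workflow limit given: fall back to the global max
        return max_pages_per_crawl
    if workflow_limit > max_pages_per_crawl:
        # higher than the global max: rejected (backend returns HTTP 400)
        raise ValueError("crawl_page_limit_exceeds_allowed")
    # a lower per-workflow limit is allowed as-is
    return workflow_limit

assert apply_page_limit(0, 2) == 2   # unset limit raised to the max
assert apply_page_limit(1, 2) == 1   # lower limit allowed
# apply_page_limit(10, 2) would raise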