Allow configurable max pages per crawl in deployment settings (#717)
* backend: max pages per crawl limit, part of fix for #716:
  - set 'max_pages_per_crawl' in values.yaml, default to 100,000
  - if set/non-zero, automatically set the limit when none is provided
  - if set/non-zero, return 400 when adding a config whose limit exceeds the max limit
  - return the limit as 'maxPagesPerCrawl' in /api/settings
* api: /all/crawls - add runningOnly=0 to show all crawls, default to 1/true (for more reliable testing)
* tests: add tests for the 'max_pages_per_crawl' setting:
  - ensure 'limit' cannot be set higher than max_pages_per_crawl
  - ensure the number of pages crawled is at the limit
  - set the test limit to a max of 2 pages
  - add a settings test
  - check for pages.jsonl and extraPages.jsonl when crawling 2 pages
parent 948cce3d30
commit 887cb16146
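For orientation, a minimal standalone sketch of the limit rule described above; the helper name is hypothetical, and the actual check is CrawlConfigOps.validate_crawl_limit in the diff below, driven by the MAX_PAGES_PER_CRAWL env var that the chart sets:

import os


def apply_page_limit(requested_limit: int) -> int:
    """Return the effective page limit for a new or updated crawl workflow."""
    max_limit = int(os.environ.get("MAX_PAGES_PER_CRAWL", 0))

    if not max_limit:
        # 0 / unset means no global cap is enforced
        return requested_limit

    if requested_limit <= 0:
        # no per-workflow limit given: fall back to the global cap
        return max_limit

    if requested_limit > max_limit:
        # the API responds with 400 "crawl_page_limit_exceeds_allowed" in this case
        raise ValueError("crawl_page_limit_exceeds_allowed")

    return requested_limit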
@@ -7,6 +7,7 @@ from enum import Enum
 import uuid
 import asyncio
 import re
+import os
 from datetime import datetime
 import urllib.parse

@@ -20,6 +21,8 @@ from .pagination import DEFAULT_PAGE_SIZE, paginated_format

 from .db import BaseMongoModel

+# pylint: disable=too-many-lines
+

 # ============================================================================
 class JobType(str, Enum):
@@ -222,7 +225,7 @@ class UpdateCrawlConfig(BaseModel):


 # ============================================================================
-# pylint: disable=too-many-instance-attributes,too-many-arguments
+# pylint: disable=too-many-instance-attributes,too-many-arguments,too-many-public-methods
 class CrawlConfigOps:
     """Crawl Config Operations"""

@@ -246,6 +249,8 @@ class CrawlConfigOps:
         self.coll_ops = None
         self._file_rx = re.compile("\\W+")

+        self.max_pages_per_crawl = int(os.environ.get("MAX_PAGES_PER_CRAWL", 0))
+
     def set_crawl_ops(self, ops):
         """set crawl ops reference"""
         self.crawl_ops = ops
@@ -295,6 +300,9 @@ class CrawlConfigOps:
         user: User,
     ):
         """Add new crawl config"""
+
+        self.validate_crawl_limit(config.config)
+
         data = config.dict()
         data["oid"] = org.id
         data["createdBy"] = user.id
@@ -361,6 +369,8 @@ class CrawlConfigOps:
         if not orig_crawl_config:
             raise HTTPException(status_code=400, detail="config_not_found")

+        self.validate_crawl_limit(update.config)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
@@ -857,6 +867,18 @@ class CrawlConfigOps:
             # pylint: disable=raise-missing-from
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")

+    def validate_crawl_limit(self, config: Optional[RawCrawlConfig]):
+        """Ensure max pages per crawl limit is not exceeded.
+        Set limit if not provided. if provided config exceeds limit, raise exception
+        """
+        if config and self.max_pages_per_crawl:
+            if config.limit <= 0:
+                config.limit = self.max_pages_per_crawl
+            elif config.limit > self.max_pages_per_crawl:
+                raise HTTPException(
+                    status_code=400, detail="crawl_page_limit_exceeds_allowed"
+                )
+

 # ============================================================================
 # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
@@ -779,6 +779,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
         description: Optional[str] = None,
         sortBy: Optional[str] = None,
         sortDirection: Optional[int] = -1,
+        runningOnly: Optional[bool] = True,
     ):
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
@@ -799,7 +800,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
             None,
             userid=userid,
             cid=cid,
-            running_only=True,
+            running_only=runningOnly,
             state=state,
             first_seed=firstSeed,
             name=name,
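The new runningOnly query parameter can be exercised the same way the tests below do; a sketch, assuming a running deployment and a superuser token (the API_PREFIX value and headers here are placeholders, not part of this change):

import requests

API_PREFIX = "http://localhost:30870/api"  # assumed local deployment address
admin_auth_headers = {"Authorization": "Bearer <superuser-jwt>"}  # placeholder token

# default behavior (runningOnly=1): only currently running crawls are listed
r = requests.get(f"{API_PREFIX}/orgs/all/crawls", headers=admin_auth_headers)

# pass runningOnly=0 to list all crawls, including finished ones
r = requests.get(f"{API_PREFIX}/orgs/all/crawls?runningOnly=0", headers=admin_auth_headers)
print(r.json())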
@@ -54,6 +54,7 @@ def main():
         "defaultBehaviorTimeSeconds": int(
             os.environ.get("DEFAULT_BEHAVIOR_TIME_SECONDS", 300)
         ),
+        "maxPagesPerCrawl": int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)),
     }

     invites = init_invites(mdb, email)
@@ -85,7 +85,8 @@ def admin_crawl_id(admin_auth_headers, default_org_id):
         "tags": ["wr-test-1", "wr-test-2"],
         "config": {
             "seeds": [{"url": "https://webrecorder.net/"}],
-            "limit": 1,
+            # limit now set via 'max_pages_per_crawl' global limit
+            # "limit": 1,
         },
     }
     r = requests.post(
@@ -86,6 +86,46 @@ def test_verify_update(crawler_auth_headers, default_org_id):
     assert sorted(data["tags"]) == sorted(UPDATED_TAGS)


+def test_update_config_invalid_format(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "seeds": ["https://example.com/"],
+                "scopeType": "domain",
+                "limit": 10,
+            }
+        },
+    )
+
+    assert r.status_code == 422
+
+
+def test_update_config_invalid_limit(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "seeds": [{"url": "https://example.com/"}],
+                "scopeType": "domain",
+                "limit": 10,
+            }
+        },
+    )
+
+    assert r.status_code == 400
+
+    data = r.json()
+
+    assert data["detail"] == "crawl_page_limit_exceeds_allowed"
+
+
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
@@ -27,10 +27,29 @@ def test_list_orgs(admin_auth_headers, default_org_id):
     assert default_org_id in org_ids


-def test_create_new_config(admin_auth_headers, default_org_id):
+def test_create_new_config_invalid_limit(admin_auth_headers, default_org_id):
     crawl_data = {
         "runNow": True,
         "name": "Test Crawl",
         "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 10},
     }
     r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+
+    assert r.status_code == 400
+
+    data = r.json()
+
+    assert data["detail"] == "crawl_page_limit_exceeds_allowed"
+
+
+def test_create_new_config(admin_auth_headers, default_org_id):
+    crawl_data = {
+        "runNow": False,
+        "name": "Test Crawl",
+        "config": {"seeds": [{"url": "https://webrecorder.net/"}]},
+    }
+    r = requests.post(
@@ -43,7 +62,7 @@ def test_create_new_config(admin_auth_headers, default_org_id):

     data = r.json()
     assert data["added"]
-    assert data["run_now_job"]
+    assert data["run_now_job"] == None


 def test_wait_for_complete(admin_auth_headers, default_org_id, admin_crawl_id):
@@ -98,7 +117,7 @@ def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_craw
         assert crawl["seedCount"] > 0

     r = requests.get(
-        f"{API_PREFIX}/orgs/all/crawls",
+        f"{API_PREFIX}/orgs/all/crawls?runningOnly=0",
         headers=admin_auth_headers,
     )
     data = r.json()
@@ -128,9 +147,20 @@ def test_verify_wacz():

     assert "pages/pages.jsonl" in z.namelist()

+    # 1 seed page
     pages = z.open("pages/pages.jsonl").read().decode("utf-8")
     assert '"https://webrecorder.net/"' in pages

+    # 1 seed page + header line
+    assert len(pages.strip().split("\n")) == 2
+
+    # 1 other page
+    pages = z.open("pages/extraPages.jsonl").read().decode("utf-8")
+    assert '"https://webrecorder.net/blog"' in pages
+
+    # 1 other page + header line
+    assert len(pages.strip().split("\n")) == 2
+

 def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
     r = requests.get(
backend/test/test_settings.py (new file, +17 lines)
@@ -0,0 +1,17 @@
+import requests
+
+from .conftest import API_PREFIX
+
+
+def test_settings():
+    r = requests.get(f"{API_PREFIX}/settings")
+    assert r.status_code == 200
+
+    data = r.json()
+
+    assert data == {
+        "registrationEnabled": False,
+        "jwtTokenLifetime": 86400,
+        "defaultBehaviorTimeSeconds": 300,
+        "maxPagesPerCrawl": 2,
+    }
@@ -54,6 +54,8 @@ data:

   DEFAULT_BEHAVIOR_TIME_SECONDS: "{{ .Values.default_behavior_time_seconds }}"

+  MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
+
   WEB_CONCURRENCY: "{{ .Values.backend_workers | default 4 }}"

   IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"
@@ -20,3 +20,9 @@ superuser:

 local_service_port: 30870

+# test max pages per crawl global limit
+max_pages_per_crawl: 2
+
+registration_enabled: "0"
+
@@ -18,6 +18,11 @@ jwt_token_lifetime_minutes: 1440

 # default time to run behaviors on each page (in seconds)
 default_behavior_time_seconds: 300

+# max pages per crawl
+# set to non-zero value to enforce global max pages per crawl limit
+# if set, each workflow can have a lower limit, but not higher
+max_pages_per_crawl: 0
+
 # if set to "1", allow inviting same user to same org multiple times
 allow_dupe_invites: "0"