Allow configurable max pages per crawl in deployment settings (#717)
* backend: max pages per crawl limit, part of fix for #716:
  - set 'max_pages_per_crawl' in values.yaml, default to 100,000
  - if set/non-0, automatically set the limit when none is provided
  - if set/non-0, return 400 when adding a config whose limit exceeds the max limit
  - return the limit as 'maxPagesPerCrawl' in /api/settings
* api: /all/crawls
  - add runningOnly=0 to show all crawls, default to 1/true (for more reliable testing)
* tests: add tests for the 'max_pages_per_crawl' setting
  - ensure 'limit' cannot be set higher than max_pages_per_crawl
  - ensure the number of pages crawled is at the limit
  - set the test limit to a max of 2 pages
  - add settings test
  - check for pages.jsonl and extraPages.jsonl when crawling 2 pages
This commit is contained in:
parent 948cce3d30 · commit 887cb16146
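
For reference, a minimal usage sketch (not part of this commit's diff) of how the new limit behaves from a client's point of view, assuming a deployment where max_pages_per_crawl is set to 2 (as in the test chart values below); api_prefix, auth_headers, org_id, and the token are placeholders:

    import requests

    # Placeholders -- adjust to a real deployment and access token.
    api_prefix = "http://localhost:30870/api"
    auth_headers = {"Authorization": "Bearer <access-token>"}
    org_id = "<org-id>"

    # The deployment-wide limit is exposed to clients via /api/settings.
    settings = requests.get(f"{api_prefix}/settings").json()
    print(settings["maxPagesPerCrawl"])  # e.g. 2 in the test deployment

    # A workflow whose per-crawl 'limit' exceeds the global max is rejected with 400.
    r = requests.post(
        f"{api_prefix}/orgs/{org_id}/crawlconfigs/",
        headers=auth_headers,
        json={
            "runNow": False,
            "name": "Too Many Pages",
            "config": {"seeds": [{"url": "https://example.com/"}], "limit": 10},
        },
    )
    assert r.status_code == 400
    assert r.json()["detail"] == "crawl_page_limit_exceeds_allowed"

    # Omitting 'limit' (or sending 0) is allowed: the backend fills in the global max.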
@@ -7,6 +7,7 @@ from enum import Enum
 import uuid
 import asyncio
 import re
+import os
 from datetime import datetime
 import urllib.parse
 
@@ -20,6 +21,8 @@ from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 
 from .db import BaseMongoModel
 
+# pylint: disable=too-many-lines
+
 
 # ============================================================================
 class JobType(str, Enum):
@@ -222,7 +225,7 @@ class UpdateCrawlConfig(BaseModel):
 
 
 # ============================================================================
-# pylint: disable=too-many-instance-attributes,too-many-arguments
+# pylint: disable=too-many-instance-attributes,too-many-arguments,too-many-public-methods
 class CrawlConfigOps:
     """Crawl Config Operations"""
 
@@ -246,6 +249,8 @@ class CrawlConfigOps:
         self.coll_ops = None
         self._file_rx = re.compile("\\W+")
 
+        self.max_pages_per_crawl = int(os.environ.get("MAX_PAGES_PER_CRAWL", 0))
+
     def set_crawl_ops(self, ops):
         """set crawl ops reference"""
         self.crawl_ops = ops
@@ -295,6 +300,9 @@ class CrawlConfigOps:
         user: User,
     ):
         """Add new crawl config"""
+
+        self.validate_crawl_limit(config.config)
+
         data = config.dict()
         data["oid"] = org.id
         data["createdBy"] = user.id
@@ -361,6 +369,8 @@ class CrawlConfigOps:
         if not orig_crawl_config:
             raise HTTPException(status_code=400, detail="config_not_found")
 
+        self.validate_crawl_limit(update.config)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
@@ -857,6 +867,18 @@ class CrawlConfigOps:
             # pylint: disable=raise-missing-from
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
 
+    def validate_crawl_limit(self, config: Optional[RawCrawlConfig]):
+        """Ensure max pages per crawl limit is not exceeded.
+
+        Set limit if not provided. If provided config exceeds limit, raise exception
+        """
+        if config and self.max_pages_per_crawl:
+            if config.limit <= 0:
+                config.limit = self.max_pages_per_crawl
+
+            elif config.limit > self.max_pages_per_crawl:
+                raise HTTPException(
+                    status_code=400, detail="crawl_page_limit_exceeds_allowed"
+                )
 
 
 # ============================================================================
 # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
@@ -779,6 +779,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
         description: Optional[str] = None,
         sortBy: Optional[str] = None,
         sortDirection: Optional[int] = -1,
+        runningOnly: Optional[bool] = True,
     ):
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
@@ -799,7 +800,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
             None,
             userid=userid,
             cid=cid,
-            running_only=True,
+            running_only=runningOnly,
             state=state,
             first_seed=firstSeed,
             name=name,
@@ -54,6 +54,7 @@ def main():
         "defaultBehaviorTimeSeconds": int(
             os.environ.get("DEFAULT_BEHAVIOR_TIME_SECONDS", 300)
         ),
+        "maxPagesPerCrawl": int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)),
     }
 
     invites = init_invites(mdb, email)
@@ -85,7 +85,8 @@ def admin_crawl_id(admin_auth_headers, default_org_id):
         "tags": ["wr-test-1", "wr-test-2"],
         "config": {
             "seeds": [{"url": "https://webrecorder.net/"}],
-            "limit": 1,
+            # limit now set via 'max_pages_per_crawl' global limit
+            # "limit": 1,
         },
     }
     r = requests.post(
@@ -86,6 +86,46 @@ def test_verify_update(crawler_auth_headers, default_org_id):
     assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
 
 
+def test_update_config_invalid_format(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "seeds": ["https://example.com/"],
+                "scopeType": "domain",
+                "limit": 10,
+            }
+        },
+    )
+
+    assert r.status_code == 422
+
+
+def test_update_config_invalid_limit(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "seeds": [{"url": "https://example.com/"}],
+                "scopeType": "domain",
+                "limit": 10,
+            }
+        },
+    )
+
+    assert r.status_code == 400
+
+    data = r.json()
+
+    assert data["detail"] == "crawl_page_limit_exceeds_allowed"
+
+
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
@@ -27,10 +27,29 @@ def test_list_orgs(admin_auth_headers, default_org_id):
     assert default_org_id in org_ids
 
 
-def test_create_new_config(admin_auth_headers, default_org_id):
+def test_create_new_config_invalid_limit(admin_auth_headers, default_org_id):
     crawl_data = {
         "runNow": True,
         "name": "Test Crawl",
+        "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 10},
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+
+    assert r.status_code == 400
+
+    data = r.json()
+
+    assert data["detail"] == "crawl_page_limit_exceeds_allowed"
+
+
+def test_create_new_config(admin_auth_headers, default_org_id):
+    crawl_data = {
+        "runNow": False,
+        "name": "Test Crawl",
         "config": {"seeds": [{"url": "https://webrecorder.net/"}]},
     }
     r = requests.post(
@@ -43,7 +62,7 @@ def test_create_new_config(admin_auth_headers, default_org_id):
 
     data = r.json()
     assert data["added"]
-    assert data["run_now_job"]
+    assert data["run_now_job"] == None
 
 
 def test_wait_for_complete(admin_auth_headers, default_org_id, admin_crawl_id):
@@ -98,7 +117,7 @@ def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_craw
     assert crawl["seedCount"] > 0
 
     r = requests.get(
-        f"{API_PREFIX}/orgs/all/crawls",
+        f"{API_PREFIX}/orgs/all/crawls?runningOnly=0",
        headers=admin_auth_headers,
     )
     data = r.json()
@@ -128,9 +147,20 @@ def test_verify_wacz():
 
     assert "pages/pages.jsonl" in z.namelist()
 
+    # 1 seed page
     pages = z.open("pages/pages.jsonl").read().decode("utf-8")
     assert '"https://webrecorder.net/"' in pages
 
+    # 1 seed page + header line
+    assert len(pages.strip().split("\n")) == 2
+
+    # 1 other page
+    pages = z.open("pages/extraPages.jsonl").read().decode("utf-8")
+    assert '"https://webrecorder.net/blog"' in pages
+
+    # 1 other page + header line
+    assert len(pages.strip().split("\n")) == 2
+
 
 def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
     r = requests.get(
backend/test/test_settings.py (new file, 17 lines)
@@ -0,0 +1,17 @@
+import requests
+
+from .conftest import API_PREFIX
+
+
+def test_settings():
+    r = requests.get(f"{API_PREFIX}/settings")
+    assert r.status_code == 200
+
+    data = r.json()
+
+    assert data == {
+        "registrationEnabled": False,
+        "jwtTokenLifetime": 86400,
+        "defaultBehaviorTimeSeconds": 300,
+        "maxPagesPerCrawl": 2,
+    }
@@ -54,6 +54,8 @@ data:
 
   DEFAULT_BEHAVIOR_TIME_SECONDS: "{{ .Values.default_behavior_time_seconds }}"
 
+  MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
+
   WEB_CONCURRENCY: "{{ .Values.backend_workers | default 4 }}"
 
   IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"
@@ -20,3 +20,9 @@ superuser:
 
 local_service_port: 30870
+
+# test max pages per crawl global limit
+max_pages_per_crawl: 2
+
+registration_enabled: "0"
+
@@ -18,6 +18,11 @@ jwt_token_lifetime_minutes: 1440
 # default time to run behaviors on each page (in seconds)
 default_behavior_time_seconds: 300
 
+# max pages per crawl
+# set to non-zero value to enforce global max pages per crawl limit
+# if set, each workflow can have a lower limit, but not higher
+max_pages_per_crawl: 0
+
 # if set to "1", allow inviting same user to same org multiple times
 allow_dupe_invites: "0"