diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 4b2861e7..4e1bb568 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -7,6 +7,7 @@ from enum import Enum
 import uuid
 import asyncio
 import re
+import os
 from datetime import datetime
 import urllib.parse
 
@@ -20,6 +21,8 @@ from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .db import BaseMongoModel
 
+# pylint: disable=too-many-lines
+
 
 # ============================================================================
 class JobType(str, Enum):
@@ -222,7 +225,7 @@ class UpdateCrawlConfig(BaseModel):
 
 
 # ============================================================================
-# pylint: disable=too-many-instance-attributes,too-many-arguments
+# pylint: disable=too-many-instance-attributes,too-many-arguments,too-many-public-methods
class CrawlConfigOps:
     """Crawl Config Operations"""
 
@@ -246,6 +249,8 @@ class CrawlConfigOps:
         self.coll_ops = None
         self._file_rx = re.compile("\\W+")
 
+        self.max_pages_per_crawl = int(os.environ.get("MAX_PAGES_PER_CRAWL", 0))
+
     def set_crawl_ops(self, ops):
         """set crawl ops reference"""
         self.crawl_ops = ops
@@ -295,6 +300,9 @@ class CrawlConfigOps:
         user: User,
     ):
         """Add new crawl config"""
+
+        self.validate_crawl_limit(config.config)
+
         data = config.dict()
         data["oid"] = org.id
         data["createdBy"] = user.id
@@ -361,6 +369,8 @@ class CrawlConfigOps:
         if not orig_crawl_config:
             raise HTTPException(status_code=400, detail="config_not_found")
 
+        self.validate_crawl_limit(update.config)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
@@ -857,6 +867,18 @@ class CrawlConfigOps:
             # pylint: disable=raise-missing-from
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
 
+    def validate_crawl_limit(self, config: Optional[RawCrawlConfig]):
+        """Ensure the max pages per crawl limit is not exceeded.
+        Set the limit if not provided; raise an exception if the provided limit exceeds the max.
+        """
+        if config and self.max_pages_per_crawl:
+            if config.limit <= 0:
+                config.limit = self.max_pages_per_crawl
+            elif config.limit > self.max_pages_per_crawl:
+                raise HTTPException(
+                    status_code=400, detail="crawl_page_limit_exceeds_allowed"
+                )
+
 
 # ============================================================================
 # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
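
For reference, a minimal standalone sketch of the check that validate_crawl_limit adds above. The function and constant names here are illustrative only and not part of the patch; the value 2 mirrors the chart test configuration.

MAX_PAGES_PER_CRAWL = 2  # illustrative; 0 would mean "no global cap"


def resolve_page_limit(requested: int) -> int:
    """Return the effective page limit, or raise if the request exceeds the cap."""
    if not MAX_PAGES_PER_CRAWL:
        return requested  # no global cap configured
    if requested <= 0:
        return MAX_PAGES_PER_CRAWL  # no limit requested: fall back to the cap
    if requested > MAX_PAGES_PER_CRAWL:
        raise ValueError("crawl_page_limit_exceeds_allowed")
    return requested


assert resolve_page_limit(0) == 2  # defaulted to the global max
assert resolve_page_limit(1) == 1  # lower limits pass through unchanged
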
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index d0f375a1..ca3f3d93 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -779,6 +779,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
         description: Optional[str] = None,
         sortBy: Optional[str] = None,
         sortDirection: Optional[int] = -1,
+        runningOnly: Optional[bool] = True,
     ):
         if not user.is_superuser:
             raise HTTPException(status_code=403, detail="Not Allowed")
@@ -799,7 +800,7 @@ def init_crawls_api(app, mdb, users, crawl_manager, crawl_config_ops, orgs, user
             None,
             userid=userid,
             cid=cid,
-            running_only=True,
+            running_only=runningOnly,
             state=state,
             first_seed=firstSeed,
             name=name,
diff --git a/backend/btrixcloud/main.py b/backend/btrixcloud/main.py
index d7ea1d98..8799709b 100644
--- a/backend/btrixcloud/main.py
+++ b/backend/btrixcloud/main.py
@@ -54,6 +54,7 @@ def main():
         "defaultBehaviorTimeSeconds": int(
             os.environ.get("DEFAULT_BEHAVIOR_TIME_SECONDS", 300)
         ),
+        "maxPagesPerCrawl": int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)),
     }
 
     invites = init_invites(mdb, email)
diff --git a/backend/test/conftest.py b/backend/test/conftest.py
index 2a578719..0506db9f 100644
--- a/backend/test/conftest.py
+++ b/backend/test/conftest.py
@@ -85,7 +85,8 @@ def admin_crawl_id(admin_auth_headers, default_org_id):
         "tags": ["wr-test-1", "wr-test-2"],
         "config": {
             "seeds": [{"url": "https://webrecorder.net/"}],
-            "limit": 1,
+            # limit now set via 'max_pages_per_crawl' global limit
+            # "limit": 1,
         },
     }
     r = requests.post(
diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py
index 840fbd69..34a0661c 100644
--- a/backend/test/test_crawlconfigs.py
+++ b/backend/test/test_crawlconfigs.py
@@ -86,6 +86,46 @@ def test_verify_update(crawler_auth_headers, default_org_id):
     assert sorted(data["tags"]) == sorted(UPDATED_TAGS)
 
 
+def test_update_config_invalid_format(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "seeds": ["https://example.com/"],
+                "scopeType": "domain",
+                "limit": 10,
+            }
+        },
+    )
+
+    assert r.status_code == 422
+
+
+def test_update_config_invalid_limit(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={
+            "config": {
+                "seeds": [{"url": "https://example.com/"}],
+                "scopeType": "domain",
+                "limit": 10,
+            }
+        },
+    )
+
+    assert r.status_code == 400
+
+    data = r.json()
+
+    assert data["detail"] == "crawl_page_limit_exceeds_allowed"
+
+
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
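
The two new tests above exercise different failure modes: bare-string seeds are rejected by request body validation (422), while a well-formed config whose limit exceeds the global cap is rejected by validate_crawl_limit (400, "crawl_page_limit_exceeds_allowed"). A client can avoid the 400 by reading the cap from the settings endpoint first. The helper below is a hypothetical sketch, not part of the patch; it assumes a reachable API at the API_PREFIX URL shown and a valid auth header.

import requests

API_PREFIX = "http://localhost:30870/api"  # assumed local deployment URL


def create_workflow_within_cap(auth_headers, org_id, name, seeds, requested_limit=0):
    """Clamp the requested page limit to the server's maxPagesPerCrawl before creating a workflow."""
    settings = requests.get(f"{API_PREFIX}/settings").json()
    max_pages = settings.get("maxPagesPerCrawl", 0)

    limit = requested_limit
    if max_pages and (limit <= 0 or limit > max_pages):
        limit = max_pages

    return requests.post(
        f"{API_PREFIX}/orgs/{org_id}/crawlconfigs/",
        headers=auth_headers,
        json={
            "runNow": False,
            "name": name,
            "config": {"seeds": seeds, "limit": limit},
        },
    )
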
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index b97dce96..21d6907a 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -27,10 +27,29 @@ def test_list_orgs(admin_auth_headers, default_org_id):
     assert default_org_id in org_ids
 
 
-def test_create_new_config(admin_auth_headers, default_org_id):
+def test_create_new_config_invalid_limit(admin_auth_headers, default_org_id):
     crawl_data = {
         "runNow": True,
         "name": "Test Crawl",
+        "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 10},
+    }
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=admin_auth_headers,
+        json=crawl_data,
+    )
+
+    assert r.status_code == 400
+
+    data = r.json()
+
+    assert data["detail"] == "crawl_page_limit_exceeds_allowed"
+
+
+def test_create_new_config(admin_auth_headers, default_org_id):
+    crawl_data = {
+        "runNow": False,
+        "name": "Test Crawl",
         "config": {"seeds": [{"url": "https://webrecorder.net/"}]},
     }
     r = requests.post(
@@ -43,7 +62,7 @@ def test_create_new_config(admin_auth_headers, default_org_id):
     data = r.json()
 
     assert data["added"]
-    assert data["run_now_job"]
+    assert data["run_now_job"] == None
 
 
 def test_wait_for_complete(admin_auth_headers, default_org_id, admin_crawl_id):
@@ -98,7 +117,7 @@ def test_crawls_include_seed_info(admin_auth_headers, default_org_id, admin_craw
     assert crawl["seedCount"] > 0
 
     r = requests.get(
-        f"{API_PREFIX}/orgs/all/crawls",
+        f"{API_PREFIX}/orgs/all/crawls?runningOnly=0",
        headers=admin_auth_headers,
     )
     data = r.json()
@@ -128,9 +147,20 @@ def test_verify_wacz():
     assert "pages/pages.jsonl" in z.namelist()
 
+    # 1 seed page
     pages = z.open("pages/pages.jsonl").read().decode("utf-8")
     assert '"https://webrecorder.net/"' in pages
 
+    # 1 seed page + header line
+    assert len(pages.strip().split("\n")) == 2
+
+    # 1 other page
+    pages = z.open("pages/extraPages.jsonl").read().decode("utf-8")
+    assert '"https://webrecorder.net/blog"' in pages
+
+    # 1 other page + header line
+    assert len(pages.strip().split("\n")) == 2
+
 
 def test_update_crawl(admin_auth_headers, default_org_id, admin_crawl_id):
     r = requests.get(
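
The crawl listing change in crawls.py pairs with the test above: /orgs/all/crawls still returns only running crawls by default, and superusers can pass runningOnly=0 to include finished ones. The snippet below is an illustrative sketch, not part of the patch; it assumes the same API_PREFIX and superuser auth headers used by the tests.

import requests

from .conftest import API_PREFIX


def list_crawls(admin_auth_headers, include_finished=False):
    # Default behavior is unchanged: only running crawls are returned.
    params = {"runningOnly": 0} if include_finished else {}
    r = requests.get(
        f"{API_PREFIX}/orgs/all/crawls",
        headers=admin_auth_headers,
        params=params,
    )
    return r.json()
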
diff --git a/backend/test/test_settings.py b/backend/test/test_settings.py
new file mode 100644
index 00000000..27697c8a
--- /dev/null
+++ b/backend/test/test_settings.py
@@ -0,0 +1,17 @@
+import requests
+
+from .conftest import API_PREFIX
+
+
+def test_settings():
+    r = requests.get(f"{API_PREFIX}/settings")
+    assert r.status_code == 200
+
+    data = r.json()
+
+    assert data == {
+        "registrationEnabled": False,
+        "jwtTokenLifetime": 86400,
+        "defaultBehaviorTimeSeconds": 300,
+        "maxPagesPerCrawl": 2,
+    }
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index b0320e49..f35b7d6e 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -54,6 +54,8 @@ data:
 
   DEFAULT_BEHAVIOR_TIME_SECONDS: "{{ .Values.default_behavior_time_seconds }}"
 
+  MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
+
   WEB_CONCURRENCY: "{{ .Values.backend_workers | default 4 }}"
 
   IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"
diff --git a/chart/test/test.yaml b/chart/test/test.yaml
index 353850b5..9bf5fb3b 100644
--- a/chart/test/test.yaml
+++ b/chart/test/test.yaml
@@ -20,3 +20,9 @@ superuser:
 
 local_service_port: 30870
 
+
+# test max pages per crawl global limit
+max_pages_per_crawl: 2
+
+registration_enabled: "0"
+
diff --git a/chart/values.yaml b/chart/values.yaml
index 5fa3c0b0..5fc83d53 100644
--- a/chart/values.yaml
+++ b/chart/values.yaml
@@ -18,6 +18,11 @@ jwt_token_lifetime_minutes: 1440
 
 # default time to run behaviors on each page (in seconds)
 default_behavior_time_seconds: 300
 
+# max pages per crawl
+# set to a non-zero value to enforce a global max pages per crawl limit
+# if set, each workflow can use a lower limit, but not a higher one
+max_pages_per_crawl: 0
+
 # if set to "1", allow inviting same user to same org multiple times
 allow_dupe_invites: "0"
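
With the chart test values above (max_pages_per_crawl: 2), the expected behavior for each requested limit can be summarized as a parametrized test. This is an illustrative extension of the tests in this patch, not part of it; it reuses the conftest fixtures and the same crawlconfigs endpoint.

import pytest
import requests

from .conftest import API_PREFIX


@pytest.mark.parametrize(
    "limit,rejected",
    [
        (0, False),   # no limit requested: backend fills in the global max (2)
        (1, False),   # below the global max: accepted as-is
        (2, False),   # equal to the global max: accepted
        (10, True),   # above the global max: rejected
    ],
)
def test_page_limit_enforcement(admin_auth_headers, default_org_id, limit, rejected):
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
        headers=admin_auth_headers,
        json={
            "runNow": False,
            "name": f"limit check ({limit})",
            "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": limit},
        },
    )
    if rejected:
        assert r.status_code == 400
        assert r.json()["detail"] == "crawl_page_limit_exceeds_allowed"
    else:
        assert r.json()["added"]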