diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py
index 87edc97c..e3f8dd08 100644
--- a/backend/btrixcloud/crawlconfigs.py
+++ b/backend/btrixcloud/crawlconfigs.py
@@ -7,7 +7,6 @@ from enum import Enum
 import uuid
 import asyncio
 import re
-import os
 from datetime import datetime
 import urllib.parse
 
@@ -249,8 +248,6 @@ class CrawlConfigOps:
         self.coll_ops = None
         self._file_rx = re.compile("\\W+")
 
-        self.max_pages_per_crawl = int(os.environ.get("MAX_PAGES_PER_CRAWL", 0))
-
     def set_crawl_ops(self, ops):
         """set crawl ops reference"""
         self.crawl_ops = ops
@@ -301,8 +298,6 @@ class CrawlConfigOps:
     ):
         """Add new crawl config"""
 
-        self.validate_crawl_limit(config.config)
-
         data = config.dict()
         data["oid"] = org.id
         data["createdBy"] = user.id
@@ -369,8 +364,6 @@ class CrawlConfigOps:
         if not orig_crawl_config:
             raise HTTPException(status_code=400, detail="config_not_found")
 
-        self.validate_crawl_limit(update.config)
-
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
@@ -867,18 +860,6 @@ class CrawlConfigOps:
             # pylint: disable=raise-missing-from
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
 
-    def validate_crawl_limit(self, config: Optional[RawCrawlConfig]):
-        """Ensure max pages per crawl limit is not exceeded.
-        Set limit if not provided. if provided config exceeds limit, raise exception
-        """
-        if config and self.max_pages_per_crawl:
-            if config.limit <= 0:
-                config.limit = self.max_pages_per_crawl
-            elif config.limit > self.max_pages_per_crawl:
-                raise HTTPException(
-                    status_code=400, detail="crawl_page_limit_exceeds_allowed"
-                )
-
 
 # ============================================================================
 # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
diff --git a/backend/test/test_crawlconfigs.py b/backend/test/test_crawlconfigs.py
index 34a0661c..2945e432 100644
--- a/backend/test/test_crawlconfigs.py
+++ b/backend/test/test_crawlconfigs.py
@@ -104,28 +104,6 @@ def test_update_config_invalid_format(
     assert r.status_code == 422
 
 
-def test_update_config_invalid_limit(
-    crawler_auth_headers, default_org_id, sample_crawl_data
-):
-    r = requests.patch(
-        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
-        headers=crawler_auth_headers,
-        json={
-            "config": {
-                "seeds": [{"url": "https://example.com/"}],
-                "scopeType": "domain",
-                "limit": 10,
-            }
-        },
-    )
-
-    assert r.status_code == 400
-
-    data = r.json()
-
-    assert data["detail"] == "crawl_page_limit_exceeds_allowed"
-
-
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index 21d6907a..7b72f15a 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -27,25 +27,6 @@ def test_list_orgs(admin_auth_headers, default_org_id):
     assert default_org_id in org_ids
 
 
-def test_create_new_config_invalid_limit(admin_auth_headers, default_org_id):
-    crawl_data = {
-        "runNow": True,
-        "name": "Test Crawl",
-        "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 10},
-    }
-    r = requests.post(
-        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
-        headers=admin_auth_headers,
-        json=crawl_data,
-    )
-
-    assert r.status_code == 400
-
-    data = r.json()
-
-    assert data["detail"] == "crawl_page_limit_exceeds_allowed"
-
-
 def test_create_new_config(admin_auth_headers, default_org_id):
     crawl_data = {
         "runNow": False,
diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml
index f35b7d6e..b71aeabc 100644
--- a/chart/templates/configmap.yaml
+++ b/chart/templates/configmap.yaml
@@ -69,7 +69,7 @@ metadata:
   namespace: {{ .Values.crawler_namespace }}
 
 data:
-  CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --healthCheckPort {{ .Values.crawler_liveness_port }}"
+  CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }}"
 
 ---
 apiVersion: v1
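Note: the configmap hunk above reads `.Values.max_pages_per_crawl` and forwards it to the crawler as `--maxPageLimit`, falling back to 0. A minimal Helm values override is sketched below; the key name comes from the template, while the sample value and the reading of 0 as "no page cap" are assumptions, not values confirmed by this diff.

    # values override -- a sketch, not part of this diff
    # cap each crawl at 50,000 pages (passed to the crawler as --maxPageLimit)
    max_pages_per_crawl: 50000
    # assumption: leaving this at 0 (the template's default) applies no cap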