Max page limit override (#737)

* max page limit: follow-up to #717; instead of setting --limit in each crawlconfig,
apply the override --maxPageLimit setting, implemented in the crawler, to override any individually configured page limit (see the sketch below)

* update tests: the API no longer returns 'crawl_page_limit_exceeds_allowed'
Ilya Kreymer 2023-04-03 14:01:32 -07:00 committed by GitHub
parent 3b99bdf26a
commit 1c47a648a9
4 changed files with 1 addition and 61 deletions
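
The backend-side validation removed below is replaced by a single crawler-side flag. The following is a minimal Python sketch of the intended override semantics, assuming the crawler treats 0 as "no limit" and caps (rather than rejects) a per-crawl limit that exceeds --maxPageLimit; the actual implementation lives in browsertrix-crawler and may differ.

def effective_page_limit(config_limit: int, max_page_limit: int) -> int:
    """Hypothetical helper illustrating the override semantics.

    Assumptions: a value of 0 means "no limit", and --maxPageLimit caps an
    individually configured limit instead of rejecting it (the backend check
    that raised crawl_page_limit_exceeds_allowed is removed in this commit).
    """
    if max_page_limit <= 0:
        # No deployment-wide override configured: keep the per-crawl limit as-is.
        return config_limit
    if config_limit <= 0:
        # No per-crawl limit set: fall back to the deployment-wide maximum.
        return max_page_limit
    # Both set: the smaller value wins, so the override caps the per-crawl limit.
    return min(config_limit, max_page_limit)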


@@ -7,7 +7,6 @@ from enum import Enum
 import uuid
 import asyncio
 import re
-import os
 from datetime import datetime
 import urllib.parse
@@ -249,8 +248,6 @@ class CrawlConfigOps:
         self.coll_ops = None
         self._file_rx = re.compile("\\W+")
-        self.max_pages_per_crawl = int(os.environ.get("MAX_PAGES_PER_CRAWL", 0))
     def set_crawl_ops(self, ops):
         """set crawl ops reference"""
         self.crawl_ops = ops
@ -301,8 +298,6 @@ class CrawlConfigOps:
): ):
"""Add new crawl config""" """Add new crawl config"""
self.validate_crawl_limit(config.config)
data = config.dict() data = config.dict()
data["oid"] = org.id data["oid"] = org.id
data["createdBy"] = user.id data["createdBy"] = user.id
@@ -369,8 +364,6 @@ class CrawlConfigOps:
         if not orig_crawl_config:
             raise HTTPException(status_code=400, detail="config_not_found")
-        self.validate_crawl_limit(update.config)
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
@ -867,18 +860,6 @@ class CrawlConfigOps:
# pylint: disable=raise-missing-from # pylint: disable=raise-missing-from
raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}") raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
def validate_crawl_limit(self, config: Optional[RawCrawlConfig]):
"""Ensure max pages per crawl limit is not exceeded.
Set limit if not provided. if provided config exceeds limit, raise exception
"""
if config and self.max_pages_per_crawl:
if config.limit <= 0:
config.limit = self.max_pages_per_crawl
elif config.limit > self.max_pages_per_crawl:
raise HTTPException(
status_code=400, detail="crawl_page_limit_exceeds_allowed"
)
# ============================================================================ # ============================================================================
# pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments


@@ -104,28 +104,6 @@ def test_update_config_invalid_format(
     assert r.status_code == 422
-def test_update_config_invalid_limit(
-    crawler_auth_headers, default_org_id, sample_crawl_data
-):
-    r = requests.patch(
-        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
-        headers=crawler_auth_headers,
-        json={
-            "config": {
-                "seeds": [{"url": "https://example.com/"}],
-                "scopeType": "domain",
-                "limit": 10,
-            }
-        },
-    )
-    assert r.status_code == 400
-    data = r.json()
-    assert data["detail"] == "crawl_page_limit_exceeds_allowed"
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",


@@ -27,25 +27,6 @@ def test_list_orgs(admin_auth_headers, default_org_id):
     assert default_org_id in org_ids
-def test_create_new_config_invalid_limit(admin_auth_headers, default_org_id):
-    crawl_data = {
-        "runNow": True,
-        "name": "Test Crawl",
-        "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 10},
-    }
-    r = requests.post(
-        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
-        headers=admin_auth_headers,
-        json=crawl_data,
-    )
-    assert r.status_code == 400
-    data = r.json()
-    assert data["detail"] == "crawl_page_limit_exceeds_allowed"
 def test_create_new_config(admin_auth_headers, default_org_id):
     crawl_data = {
         "runNow": False,


@@ -69,7 +69,7 @@ metadata:
   namespace: {{ .Values.crawler_namespace }}
 data:
-  CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --healthCheckPort {{ .Values.crawler_liveness_port }}"
+  CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }}"
 ---
 apiVersion: v1
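
As a usage note: setting max_pages_per_crawl in the chart's values (e.g. a hypothetical max_pages_per_crawl: 50000) would render --maxPageLimit 50000 into CRAWL_ARGS for every crawl, while leaving it unset falls back to the default of 0, which presumably leaves per-crawl limits uncapped by the deployment.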