Max page limit override (#737)
* max page limit: follow-up to #717. Instead of setting --limit in each crawlconfig, apply the --maxPageLimit override setting, implemented in the crawler, to override the individually configured page limit.
* update tests: the API no longer returns 'crawl_page_limit_exceeds_allowed'.
parent 3b99bdf26a
commit 1c47a648a9
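The mechanism, in short: the backend stops validating per-config limits against MAX_PAGES_PER_CRAWL and instead passes a deployment-wide --maxPageLimit to the crawler, which caps whatever --limit a crawlconfig sets. A minimal Python sketch of the assumed override semantics (the real enforcement lives in browsertrix-crawler; the function name and the 0-means-no-cap convention here are illustrative, not the crawler's actual code):

def effective_page_limit(limit: int, max_page_limit: int) -> int:
    """Return the page limit the crawler is assumed to enforce.

    limit: the per-crawlconfig --limit (0 or less = unlimited)
    max_page_limit: the deployment-wide --maxPageLimit (0 = no cap)
    """
    if max_page_limit <= 0:
        # no deployment-wide cap configured; the config limit stands
        return limit
    if limit <= 0:
        # unlimited configs are clamped to the cap
        return max_page_limit
    # a configured limit may be lower, but never higher, than the cap
    return min(limit, max_page_limit)

# illustrative values
assert effective_page_limit(10, 0) == 10         # no cap: config limit wins
assert effective_page_limit(0, 1000) == 1000     # no config limit: cap applies
assert effective_page_limit(5000, 1000) == 1000  # config above cap: clamped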
@@ -7,7 +7,6 @@ from enum import Enum
 import uuid
 import asyncio
 import re
-import os
 from datetime import datetime
 import urllib.parse
 
@@ -249,8 +248,6 @@ class CrawlConfigOps:
         self.coll_ops = None
         self._file_rx = re.compile("\\W+")
 
-        self.max_pages_per_crawl = int(os.environ.get("MAX_PAGES_PER_CRAWL", 0))
-
     def set_crawl_ops(self, ops):
         """set crawl ops reference"""
         self.crawl_ops = ops
@@ -301,8 +298,6 @@ class CrawlConfigOps:
     ):
         """Add new crawl config"""
 
-        self.validate_crawl_limit(config.config)
-
         data = config.dict()
         data["oid"] = org.id
         data["createdBy"] = user.id
@@ -369,8 +364,6 @@ class CrawlConfigOps:
         if not orig_crawl_config:
             raise HTTPException(status_code=400, detail="config_not_found")
 
-        self.validate_crawl_limit(update.config)
-
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
@@ -867,18 +860,6 @@ class CrawlConfigOps:
             # pylint: disable=raise-missing-from
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
 
-    def validate_crawl_limit(self, config: Optional[RawCrawlConfig]):
-        """Ensure max pages per crawl limit is not exceeded.
-        Set limit if not provided. if provided config exceeds limit, raise exception
-        """
-        if config and self.max_pages_per_crawl:
-            if config.limit <= 0:
-                config.limit = self.max_pages_per_crawl
-            elif config.limit > self.max_pages_per_crawl:
-                raise HTTPException(
-                    status_code=400, detail="crawl_page_limit_exceeds_allowed"
-                )
-
 
 # ============================================================================
 # pylint: disable=redefined-builtin,invalid-name,too-many-locals,too-many-arguments
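With validate_crawl_limit gone, the API no longer returns 400 'crawl_page_limit_exceeds_allowed' for configs whose limit exceeds the deployment maximum; the cap is applied at crawl time via --maxPageLimit instead. The two tests that asserted the old rejection behavior are removed accordingly: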
@@ -104,28 +104,6 @@ def test_update_config_invalid_format(
     assert r.status_code == 422
 
 
-def test_update_config_invalid_limit(
-    crawler_auth_headers, default_org_id, sample_crawl_data
-):
-    r = requests.patch(
-        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
-        headers=crawler_auth_headers,
-        json={
-            "config": {
-                "seeds": [{"url": "https://example.com/"}],
-                "scopeType": "domain",
-                "limit": 10,
-            }
-        },
-    )
-
-    assert r.status_code == 400
-
-    data = r.json()
-
-    assert data["detail"] == "crawl_page_limit_exceeds_allowed"
-
-
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
@@ -27,25 +27,6 @@ def test_list_orgs(admin_auth_headers, default_org_id):
     assert default_org_id in org_ids
 
 
-def test_create_new_config_invalid_limit(admin_auth_headers, default_org_id):
-    crawl_data = {
-        "runNow": True,
-        "name": "Test Crawl",
-        "config": {"seeds": [{"url": "https://webrecorder.net/"}], "limit": 10},
-    }
-    r = requests.post(
-        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
-        headers=admin_auth_headers,
-        json=crawl_data,
-    )
-
-    assert r.status_code == 400
-
-    data = r.json()
-
-    assert data["detail"] == "crawl_page_limit_exceeds_allowed"
-
-
 def test_create_new_config(admin_auth_headers, default_org_id):
     crawl_data = {
         "runNow": False,
@@ -69,7 +69,7 @@ metadata:
   namespace: {{ .Values.crawler_namespace }}
 
 data:
-  CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --healthCheckPort {{ .Values.crawler_liveness_port }}"
+  CRAWL_ARGS: "{{ .Values.crawler_args }} --workers {{ .Values.crawler_browser_instances | default 1 }} --sizeLimit {{ .Values.crawler_session_size_limit_bytes }} --timeLimit {{ .Values.crawler_session_time_limit_seconds }} --maxPageLimit {{ .Values.max_pages_per_crawl | default 0 }} --healthCheckPort {{ .Values.crawler_liveness_port }}"
 
 ---
 apiVersion: v1
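Illustrative rendering: with max_pages_per_crawl: 1000 set in the chart values, the template adds --maxPageLimit 1000 to CRAWL_ARGS; leaving the value unset falls back to --maxPageLimit 0, which is assumed to mean no cap, matching the int(os.environ.get("MAX_PAGES_PER_CRAWL", 0)) default removed from the backend above.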