Validate exclusion regexes on backend (#2316)
parent 763c654484
commit 9363095d62
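Summary: exclusion regexes are now validated on the backend at every point where they enter the system: when a crawl config is created (add_crawl_config), when one is updated (update_crawl_config), and when an exclusion is added to a running crawl. Invalid patterns are rejected with HTTP 400 and detail "invalid_regex". A minimal standalone sketch of the mechanism (not part of the diff itself):

    import re

    # "[" opens a character set that is never closed, so re.compile() raises
    # re.error; the new validate_regexes() helper below maps exactly this
    # failure to an HTTP 400 "invalid_regex" response.
    try:
        re.compile("[")
    except re.error as exc:
        print(f"rejected: {exc}")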
@@ -43,7 +43,7 @@ from .models import (
     CrawlerProxy,
     CrawlerProxies,
 )
-from .utils import dt_now, slug_from_name
+from .utils import dt_now, slug_from_name, validate_regexes

 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -189,7 +189,7 @@ class CrawlConfigOps:

         return profile_filename

-    # pylint: disable=invalid-name
+    # pylint: disable=invalid-name, too-many-branches
     async def add_crawl_config(
         self,
         config_in: CrawlConfigIn,
@@ -215,6 +215,12 @@ class CrawlConfigOps:
         if not self.can_org_use_proxy(org, config_in.proxyId):
             raise HTTPException(status_code=404, detail="proxy_not_found")

+        if config_in.config.exclude:
+            exclude = config_in.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         now = dt_now()
         crawlconfig = CrawlConfig(
             id=uuid4(),
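Note the normalization above: exclude can evidently arrive as either a single pattern string or a list of patterns, so the string case is wrapped in a one-element list before being passed to validate_regexes. The update path below repeats the same idiom.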
@@ -317,11 +323,17 @@ class CrawlConfigOps:
     async def update_crawl_config(
         self, cid: UUID, org: Organization, user: User, update: UpdateCrawlConfig
     ) -> dict[str, bool | str]:
-        # pylint: disable=too-many-locals
+        # pylint: disable=too-many-locals, too-many-branches, too-many-statements
         """Update name, scale, schedule, and/or tags for an existing crawl config"""

        orig_crawl_config = await self.get_crawl_config(cid, org.id)

+        if update.config and update.config.exclude:
+            exclude = update.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
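Both entry points now carry the same normalize-then-validate block. A hypothetical consolidation (not part of this commit; validate_exclude is an invented name) could fold the normalization into one shared helper:

    from typing import List, Union

    def validate_exclude(exclude: Union[str, List[str]]) -> None:
        """Hypothetical helper: accept one pattern or a list of patterns."""
        if isinstance(exclude, str):
            exclude = [exclude]
        validate_regexes(exclude)  # raises HTTPException(400) on a bad pattern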
@@ -24,6 +24,7 @@ from .utils import (
     date_to_str,
     parse_jsonl_error_messages,
     stream_dict_list_as_csv,
+    validate_regexes,
 )
 from .basecrawls import BaseCrawlOps
 from .crawlmanager import CrawlManager
@@ -517,6 +518,9 @@ class CrawlOps(BaseCrawlOps):
         """add new exclusion to config or remove exclusion from config
         for given crawl_id, update config on crawl"""

+        if add:
+            validate_regexes([regex])
+
         crawl = await self.get_crawl(crawl_id, org)

         if crawl.state not in RUNNING_AND_WAITING_STATES:
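The check is gated on add: only newly added exclusions are validated, presumably so that removing an existing exclusion can never be blocked by regex validation.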
@@ -194,3 +194,13 @@ def get_origin(headers) -> str:
         return default_origin

     return scheme + "://" + host
+
+
+def validate_regexes(regexes: List[str]):
+    """Validate regular expressions, raise HTTPException if invalid"""
+    for regex in regexes:
+        try:
+            re.compile(regex)
+        except re.error:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=400, detail="invalid_regex")
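A short sketch of the helper's observable behavior, assuming FastAPI's HTTPException (which exposes status_code and detail) and validate_regexes as defined above:

    from fastapi import HTTPException

    try:
        validate_regexes(["abc.*", "["])  # first pattern compiles, second does not
    except HTTPException as exc:
        assert exc.status_code == 400
        assert exc.detail == "invalid_regex"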
@@ -153,6 +153,26 @@ def test_update_config_invalid_format(
     assert r.status_code == 422


+def test_update_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": "["}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": ["abc.*", "["]}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
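Note the status-code split with test_update_config_invalid_format just above: a structurally invalid payload fails framework schema validation with 422, while a well-formed payload whose regex is merely syntactically invalid now gets the application-level 400 with detail "invalid_regex". The second case also shows that every element of a list is validated, not just the first.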
@@ -487,3 +507,25 @@ def test_get_crawler_channels(crawler_auth_headers, default_org_id):
     for crawler_channel in crawler_channels:
         assert crawler_channel["id"]
         assert crawler_channel["image"]
+
+
+def test_add_crawl_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    sample_crawl_data["config"]["exclude"] = "["
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
@@ -148,6 +148,15 @@ def test_add_exclusion(admin_auth_headers, default_org_id):
     assert r.json()["success"] == True


+def test_add_invalid_exclusion(admin_auth_headers, default_org_id):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=[",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_remove_exclusion(admin_auth_headers, default_org_id):
     r = requests.delete(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=test",