Validate exclusion regexes on backend (#2316)

Tessa Walsh 2025-01-23 13:32:54 -05:00 committed by GitHub
parent 763c654484
commit 9363095d62
5 changed files with 80 additions and 3 deletions

backend/btrixcloud/crawlconfigs.py

@@ -43,7 +43,7 @@ from .models import (
    CrawlerProxy,
    CrawlerProxies,
)

-from .utils import dt_now, slug_from_name
+from .utils import dt_now, slug_from_name, validate_regexes

if TYPE_CHECKING:
    from .orgs import OrgOps
@@ -189,7 +189,7 @@ class CrawlConfigOps:
        return profile_filename

-    # pylint: disable=invalid-name
+    # pylint: disable=invalid-name, too-many-branches
    async def add_crawl_config(
        self,
        config_in: CrawlConfigIn,
@@ -215,6 +215,12 @@ class CrawlConfigOps:
        if not self.can_org_use_proxy(org, config_in.proxyId):
            raise HTTPException(status_code=404, detail="proxy_not_found")

+        if config_in.config.exclude:
+            exclude = config_in.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
        now = dt_now()
        crawlconfig = CrawlConfig(
            id=uuid4(),
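
Aside, not part of the diff: config.exclude is accepted as either a single pattern string or a list of patterns, which is why both write paths normalize to a list before validating. A minimal standalone sketch of that normalize-then-validate step (check_exclude is an illustrative name, not the Browsertrix API):

    import re
    from typing import List, Union

    def check_exclude(exclude: Union[str, List[str]]) -> None:
        """Normalize to a list, then reject any pattern that won't compile."""
        patterns = [exclude] if isinstance(exclude, str) else exclude
        for pattern in patterns:
            re.compile(pattern)  # raises re.error on malformed patterns like "["

    check_exclude(["abc.*"])  # valid patterns pass silently
    try:
        check_exclude("[")  # single-string form, malformed pattern
    except re.error as exc:
        print(f"rejected: {exc}")  # the endpoint maps this failure to 400 invalid_regex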
@@ -317,11 +323,17 @@ class CrawlConfigOps:
    async def update_crawl_config(
        self, cid: UUID, org: Organization, user: User, update: UpdateCrawlConfig
    ) -> dict[str, bool | str]:
-        # pylint: disable=too-many-locals
+        # pylint: disable=too-many-locals, too-many-branches, too-many-statements
        """Update name, scale, schedule, and/or tags for an existing crawl config"""

        orig_crawl_config = await self.get_crawl_config(cid, org.id)

+        if update.config and update.config.exclude:
+            exclude = update.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
        # indicates if any k8s crawl config settings changed
        changed = False
        changed = changed or (
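
The update path applies the same normalization and validation before any settings are persisted, so a PATCH cannot introduce a malformed exclusion into an existing workflow; the tests below exercise both the single-string and list forms.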

backend/btrixcloud/crawls.py

@@ -24,6 +24,7 @@ from .utils import (
    date_to_str,
    parse_jsonl_error_messages,
    stream_dict_list_as_csv,
+    validate_regexes,
)

from .basecrawls import BaseCrawlOps
from .crawlmanager import CrawlManager
@@ -517,6 +518,9 @@ class CrawlOps(BaseCrawlOps):
        """add new exclusion to config or remove exclusion from config
        for given crawl_id, update config on crawl"""

+        if add:
+            validate_regexes([regex])
+
        crawl = await self.get_crawl(crawl_id, org)

        if crawl.state not in RUNNING_AND_WAITING_STATES:
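
Note that the check is guarded by if add: only newly added exclusions are validated, so an exclusion that was stored before this change (even an invalid one) can still be removed from a running crawl.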

backend/btrixcloud/utils.py

@@ -194,3 +194,13 @@ def get_origin(headers) -> str:
        return default_origin

    return scheme + "://" + host
+
+
+def validate_regexes(regexes: List[str]):
+    """Validate regular expressions, raise HTTPException if invalid"""
+    for regex in regexes:
+        try:
+            re.compile(regex)
+        except re.error:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=400, detail="invalid_regex")
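
Aside, not part of the diff: re.compile is a cheap validity check, raising re.error for malformed input without running any matching. A standalone demonstration of the kinds of patterns the new helper accepts and rejects:

    import re

    # "[" is an unterminated character set; duplicate group names are
    # likewise rejected at compile time.
    for pattern in ("abc.*", "[", "(?P<dup>a)(?P<dup>b)"):
        try:
            re.compile(pattern)
            print(f"{pattern!r}: valid")
        except re.error as exc:
            print(f"{pattern!r}: invalid ({exc})")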

backend/test/test_crawlconfigs.py

@@ -153,6 +153,26 @@ def test_update_config_invalid_format(
    assert r.status_code == 422


+def test_update_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": "["}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": ["abc.*", "["]}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
@@ -487,3 +507,25 @@ def test_get_crawler_channels(crawler_auth_headers, default_org_id):
    for crawler_channel in crawler_channels:
        assert crawler_channel["id"]
        assert crawler_channel["image"]
+
+
+def test_add_crawl_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    sample_crawl_data["config"]["exclude"] = "["
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"

backend/test/test_run_crawl.py

@@ -148,6 +148,15 @@ def test_add_exclusion(admin_auth_headers, default_org_id):
    assert r.json()["success"] == True


+def test_add_invalid_exclusion(admin_auth_headers, default_org_id):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=[",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
def test_remove_exclusion(admin_auth_headers, default_org_id):
    r = requests.delete(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=test",