Add basic backend validation for selectLinks (#2510)

Follow-up to #2152 

Related to https://github.com/webrecorder/browsertrix/pull/2487

This PR provides very basic validation of the `config.selectLinks`
argument on workflow creation and update. Namely, it checks that:
- `config.selectLinks` is not an empty array
- Each entry consists of two non-empty text sequences separated by `->`

At this point we're not validating the actual CSS selector on the
backend, though we could add that down the road.

Tests have been added accordingly.

Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
Tessa Walsh 2025-04-07 15:36:05 -04:00 committed by GitHub
parent 23f9e08a22
commit f84f6f55e0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 64 additions and 1 deletions

View File

@ -233,6 +233,8 @@ class CrawlConfigOps:
exclude = [exclude]
validate_regexes(exclude)
self._validate_link_selectors(config_in.config.selectLinks)
if config_in.config.customBehaviors:
for url in config_in.config.customBehaviors:
self._validate_custom_behavior_url_syntax(url)
@ -297,6 +299,24 @@ class CrawlConfigOps:
execMinutesQuotaReached=exec_mins_quota_reached,
)
def _validate_link_selectors(self, link_selectors: List[str]):
"""Validate link selectors
Ensure at least one link selector is set and that all the link slectors passed
follow expected syntax: selector->attribute/property.
We don't yet check the validity of the CSS selector itself.
"""
if not link_selectors:
raise HTTPException(status_code=400, detail="invalid_link_selector")
for link_selector in link_selectors:
parts = link_selector.split("->")
if not len(parts) == 2:
raise HTTPException(status_code=400, detail="invalid_link_selector")
if not parts[0] or not parts[1]:
raise HTTPException(status_code=400, detail="invalid_link_selector")
def _validate_custom_behavior_url_syntax(self, url: str) -> Tuple[bool, List[str]]:
"""Validate custom behaviors are valid URLs after removing custom git syntax"""
git_prefix = "git+"
@ -379,6 +399,9 @@ class CrawlConfigOps:
exclude = [exclude]
validate_regexes(exclude)
if update.config and update.config.selectLinks is not None:
self._validate_link_selectors(update.config.selectLinks)
if update.config and update.config.customBehaviors:
for url in update.config.customBehaviors:
self._validate_custom_behavior_url_syntax(url)

View File

@ -172,6 +172,24 @@ def test_update_config_invalid_exclude_regex(
assert r.status_code == 400
assert r.json()["detail"] == "invalid_regex"
def test_update_config_invalid_link_selector(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    """Updating a workflow with invalid selectLinks is rejected with a 400"""
    endpoint = f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/"

    # First an empty list, then a list whose second entry has nothing before "->"
    invalid_values = ([], ["a[href]->href", "->href"])

    for select_links in invalid_values:
        resp = requests.patch(
            endpoint,
            headers=crawler_auth_headers,
            json={"config": {"selectLinks": select_links}},
        )
        assert resp.status_code == 400
        assert resp.json()["detail"] == "invalid_link_selector"
def test_verify_default_select_links(
crawler_auth_headers, default_org_id, sample_crawl_data
@ -545,6 +563,28 @@ def test_add_crawl_config_invalid_exclude_regex(
assert r.json()["detail"] == "invalid_regex"
def test_add_crawl_config_invalid_link_selectors(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    """Creating a workflow with invalid selectLinks is rejected with a 400"""
    endpoint = f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/"

    # First an empty list, then a list whose second entry has nothing before "->"
    invalid_values = ([], ["a[href]->href", "->href"])

    for select_links in invalid_values:
        # Overwrite the field on the fixture payload before each POST
        sample_crawl_data["config"]["selectLinks"] = select_links
        resp = requests.post(
            endpoint,
            headers=crawler_auth_headers,
            json=sample_crawl_data,
        )
        assert resp.status_code == 400
        assert resp.json()["detail"] == "invalid_link_selector"
def test_add_crawl_config_custom_behaviors_invalid_url(
crawler_auth_headers, default_org_id, sample_crawl_data
):

View File

@ -77,7 +77,7 @@ const errorFor: Record<ValidationErrorCode, string> = {
};
// Class strings, each built with the `tw` template tag, collected under `inputStyle`.
// The first two entries list the same five `--sl-input-*` custom properties with
// the same values; only the order of the utilities differs between them.
// NOTE(review): the `--sl-*` names look like Shoelace design tokens — confirm.
const inputStyle = [
tw`[--sl-input-background-color-hover:transparent] [--sl-input-background-color:transparent] [--sl-input-border-color-hover:transparent] [--sl-input-border-radius-medium:0] [--sl-input-spacing-medium:var(--sl-spacing-small)]`,
tw`[--sl-input-border-radius-medium:0] [--sl-input-spacing-medium:var(--sl-spacing-small)] [--sl-input-background-color-hover:transparent] [--sl-input-background-color:transparent] [--sl-input-border-color-hover:transparent]`,
// Targets the `data-valid` attribute state
tw`data-[valid]:[--sl-input-border-color:transparent]`,
// Margins for the `form-control-help-text` part
tw`part-[form-control-help-text]:mx-1 part-[form-control-help-text]:mb-1`,
];