Add basic backend validation for selectLinks (#2510)
Follow-up to #2152. Related to https://github.com/webrecorder/browsertrix/pull/2487. This PR provides very basic validation of the `config.selectLinks` argument on workflow creation and update. Namely, it checks that: (1) `config.selectLinks` is not an empty array; and (2) each entry consists of two non-empty text sequences separated by `->`. At this point we're not validating the actual CSS selector on the backend, though we could add that down the road. Tests have been added accordingly. Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
This commit is contained in:
		
							parent
							
								
									23f9e08a22
								
							
						
					
					
						commit
						f84f6f55e0
					
				@ -233,6 +233,8 @@ class CrawlConfigOps:
 | 
				
			|||||||
                exclude = [exclude]
 | 
					                exclude = [exclude]
 | 
				
			||||||
            validate_regexes(exclude)
 | 
					            validate_regexes(exclude)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        self._validate_link_selectors(config_in.config.selectLinks)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if config_in.config.customBehaviors:
 | 
					        if config_in.config.customBehaviors:
 | 
				
			||||||
            for url in config_in.config.customBehaviors:
 | 
					            for url in config_in.config.customBehaviors:
 | 
				
			||||||
                self._validate_custom_behavior_url_syntax(url)
 | 
					                self._validate_custom_behavior_url_syntax(url)
 | 
				
			||||||
@ -297,6 +299,24 @@ class CrawlConfigOps:
 | 
				
			|||||||
            execMinutesQuotaReached=exec_mins_quota_reached,
 | 
					            execMinutesQuotaReached=exec_mins_quota_reached,
 | 
				
			||||||
        )
 | 
					        )
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    def _validate_link_selectors(self, link_selectors: List[str]):
 | 
				
			||||||
 | 
					        """Validate link selectors
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        Ensure at least one link selector is set and that all the link selectors passed
 | 
				
			||||||
 | 
					        follow expected syntax: selector->attribute/property.
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        We don't yet check the validity of the CSS selector itself.
 | 
				
			||||||
 | 
					        """
 | 
				
			||||||
 | 
					        if not link_selectors:
 | 
				
			||||||
 | 
					            raise HTTPException(status_code=400, detail="invalid_link_selector")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        for link_selector in link_selectors:
 | 
				
			||||||
 | 
					            parts = link_selector.split("->")
 | 
				
			||||||
 | 
					            if not len(parts) == 2:
 | 
				
			||||||
 | 
					                raise HTTPException(status_code=400, detail="invalid_link_selector")
 | 
				
			||||||
 | 
					            if not parts[0] or not parts[1]:
 | 
				
			||||||
 | 
					                raise HTTPException(status_code=400, detail="invalid_link_selector")
 | 
				
			||||||
 | 
					
 | 
				
			||||||
    def _validate_custom_behavior_url_syntax(self, url: str) -> Tuple[bool, List[str]]:
 | 
					    def _validate_custom_behavior_url_syntax(self, url: str) -> Tuple[bool, List[str]]:
 | 
				
			||||||
        """Validate custom behaviors are valid URLs after removing custom git syntax"""
 | 
					        """Validate custom behaviors are valid URLs after removing custom git syntax"""
 | 
				
			||||||
        git_prefix = "git+"
 | 
					        git_prefix = "git+"
 | 
				
			||||||
@ -379,6 +399,9 @@ class CrawlConfigOps:
 | 
				
			|||||||
                exclude = [exclude]
 | 
					                exclude = [exclude]
 | 
				
			||||||
            validate_regexes(exclude)
 | 
					            validate_regexes(exclude)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					        if update.config and update.config.selectLinks is not None:
 | 
				
			||||||
 | 
					            self._validate_link_selectors(update.config.selectLinks)
 | 
				
			||||||
 | 
					
 | 
				
			||||||
        if update.config and update.config.customBehaviors:
 | 
					        if update.config and update.config.customBehaviors:
 | 
				
			||||||
            for url in update.config.customBehaviors:
 | 
					            for url in update.config.customBehaviors:
 | 
				
			||||||
                self._validate_custom_behavior_url_syntax(url)
 | 
					                self._validate_custom_behavior_url_syntax(url)
 | 
				
			||||||
 | 
				
			|||||||
@ -172,6 +172,24 @@ def test_update_config_invalid_exclude_regex(
 | 
				
			|||||||
    assert r.status_code == 400
 | 
					    assert r.status_code == 400
 | 
				
			||||||
    assert r.json()["detail"] == "invalid_regex"
 | 
					    assert r.json()["detail"] == "invalid_regex"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_update_config_invalid_link_selector(
 | 
				
			||||||
 | 
					    crawler_auth_headers, default_org_id, sample_crawl_data
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
 | 
					    r = requests.patch(
 | 
				
			||||||
 | 
					        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
 | 
				
			||||||
 | 
					        headers=crawler_auth_headers,
 | 
				
			||||||
 | 
					        json={"config": {"selectLinks": []}},
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    assert r.status_code == 400
 | 
				
			||||||
 | 
					    assert r.json()["detail"] == "invalid_link_selector"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    r = requests.patch(
 | 
				
			||||||
 | 
					        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
 | 
				
			||||||
 | 
					        headers=crawler_auth_headers,
 | 
				
			||||||
 | 
					        json={"config": {"selectLinks": ["a[href]->href", "->href"]}},
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    assert r.status_code == 400
 | 
				
			||||||
 | 
					    assert r.json()["detail"] == "invalid_link_selector"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_verify_default_select_links(
 | 
					def test_verify_default_select_links(
 | 
				
			||||||
    crawler_auth_headers, default_org_id, sample_crawl_data
 | 
					    crawler_auth_headers, default_org_id, sample_crawl_data
 | 
				
			||||||
@ -545,6 +563,28 @@ def test_add_crawl_config_invalid_exclude_regex(
 | 
				
			|||||||
    assert r.json()["detail"] == "invalid_regex"
 | 
					    assert r.json()["detail"] == "invalid_regex"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					def test_add_crawl_config_invalid_link_selectors(
 | 
				
			||||||
 | 
					    crawler_auth_headers, default_org_id, sample_crawl_data
 | 
				
			||||||
 | 
					):
 | 
				
			||||||
 | 
					    sample_crawl_data["config"]["selectLinks"] = []
 | 
				
			||||||
 | 
					    r = requests.post(
 | 
				
			||||||
 | 
					        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
 | 
				
			||||||
 | 
					        headers=crawler_auth_headers,
 | 
				
			||||||
 | 
					        json=sample_crawl_data,
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    assert r.status_code == 400
 | 
				
			||||||
 | 
					    assert r.json()["detail"] == "invalid_link_selector"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    sample_crawl_data["config"]["selectLinks"] = ["a[href]->href", "->href"]
 | 
				
			||||||
 | 
					    r = requests.post(
 | 
				
			||||||
 | 
					        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
 | 
				
			||||||
 | 
					        headers=crawler_auth_headers,
 | 
				
			||||||
 | 
					        json=sample_crawl_data,
 | 
				
			||||||
 | 
					    )
 | 
				
			||||||
 | 
					    assert r.status_code == 400
 | 
				
			||||||
 | 
					    assert r.json()["detail"] == "invalid_link_selector"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
def test_add_crawl_config_custom_behaviors_invalid_url(
 | 
					def test_add_crawl_config_custom_behaviors_invalid_url(
 | 
				
			||||||
    crawler_auth_headers, default_org_id, sample_crawl_data
 | 
					    crawler_auth_headers, default_org_id, sample_crawl_data
 | 
				
			||||||
):
 | 
					):
 | 
				
			||||||
 | 
				
			|||||||
@ -77,7 +77,7 @@ const errorFor: Record<ValidationErrorCode, string> = {
 | 
				
			|||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
const inputStyle = [
 | 
					const inputStyle = [
 | 
				
			||||||
  tw`[--sl-input-background-color-hover:transparent] [--sl-input-background-color:transparent] [--sl-input-border-color-hover:transparent] [--sl-input-border-radius-medium:0] [--sl-input-spacing-medium:var(--sl-spacing-small)]`,
 | 
					  tw`[--sl-input-border-radius-medium:0] [--sl-input-spacing-medium:var(--sl-spacing-small)] [--sl-input-background-color-hover:transparent] [--sl-input-background-color:transparent] [--sl-input-border-color-hover:transparent]`,
 | 
				
			||||||
  tw`data-[valid]:[--sl-input-border-color:transparent]`,
 | 
					  tw`data-[valid]:[--sl-input-border-color:transparent]`,
 | 
				
			||||||
  tw`part-[form-control-help-text]:mx-1 part-[form-control-help-text]:mb-1`,
 | 
					  tw`part-[form-control-help-text]:mx-1 part-[form-control-help-text]:mb-1`,
 | 
				
			||||||
];
 | 
					];
 | 
				
			||||||
 | 
				
			|||||||
		Loading…
	
		Reference in New Issue
	
	Block a user