Validate exclusion regexes on backend (#2316)
parent 763c654484
commit 9363095d62
@@ -43,7 +43,7 @@ from .models import (
     CrawlerProxy,
     CrawlerProxies,
 )
-from .utils import dt_now, slug_from_name
+from .utils import dt_now, slug_from_name, validate_regexes
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -189,7 +189,7 @@ class CrawlConfigOps:
 
         return profile_filename
 
-    # pylint: disable=invalid-name
+    # pylint: disable=invalid-name, too-many-branches
     async def add_crawl_config(
         self,
         config_in: CrawlConfigIn,
@@ -215,6 +215,12 @@ class CrawlConfigOps:
             if not self.can_org_use_proxy(org, config_in.proxyId):
                 raise HTTPException(status_code=404, detail="proxy_not_found")
 
+        if config_in.config.exclude:
+            exclude = config_in.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         now = dt_now()
         crawlconfig = CrawlConfig(
             id=uuid4(),
@@ -317,11 +323,17 @@ class CrawlConfigOps:
     async def update_crawl_config(
         self, cid: UUID, org: Organization, user: User, update: UpdateCrawlConfig
     ) -> dict[str, bool | str]:
-        # pylint: disable=too-many-locals
+        # pylint: disable=too-many-locals, too-many-branches, too-many-statements
        """Update name, scale, schedule, and/or tags for an existing crawl config"""
 
         orig_crawl_config = await self.get_crawl_config(cid, org.id)
 
+        if update.config and update.config.exclude:
+            exclude = update.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
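Both hunks above apply the same guard before anything is persisted: `exclude` may arrive as a single pattern string or a list of patterns, so it is coerced to a list and then validated. A minimal standalone sketch of that flow (the helper name `normalize_exclude` is illustrative, not part of this codebase):

from typing import List, Union

def normalize_exclude(exclude: Union[str, List[str]]) -> List[str]:
    # A single pattern string becomes a one-element list so the
    # validator can always iterate over a list of patterns.
    if isinstance(exclude, str):
        return [exclude]
    return exclude

assert normalize_exclude("abc.*") == ["abc.*"]
assert normalize_exclude(["abc.*", "def.*"]) == ["abc.*", "def.*"]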
@@ -24,6 +24,7 @@ from .utils import (
     date_to_str,
     parse_jsonl_error_messages,
     stream_dict_list_as_csv,
+    validate_regexes,
 )
 from .basecrawls import BaseCrawlOps
 from .crawlmanager import CrawlManager
@@ -517,6 +518,9 @@ class CrawlOps(BaseCrawlOps):
         """add new exclusion to config or remove exclusion from config
         for given crawl_id, update config on crawl"""
 
+        if add:
+            validate_regexes([regex])
+
         crawl = await self.get_crawl(crawl_id, org)
 
         if crawl.state not in RUNNING_AND_WAITING_STATES:
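Note that only additions are validated here: removing an exclusion names a pattern already stored on the config, so there is nothing new to check before the running crawl is updated. A self-contained sketch of the failure path, assuming FastAPI's `HTTPException` (the function name is hypothetical):

import re

from fastapi import HTTPException

def exclusion_guard(regex: str, add: bool) -> None:
    # Hypothetical distillation of the guard above: only an added pattern
    # needs validating, and a bad one is rejected before any crawl state
    # is touched.
    if not add:
        return
    try:
        re.compile(regex)
    except re.error:
        raise HTTPException(status_code=400, detail="invalid_regex") from None

exclusion_guard("test", add=False)  # removal path: no validation needed
exclusion_guard("abc.*", add=True)  # valid pattern: compiles, returns None
# exclusion_guard("[", add=True)    # would raise HTTPException(400, "invalid_regex")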
@@ -194,3 +194,13 @@ def get_origin(headers) -> str:
         return default_origin
 
     return scheme + "://" + host
+
+
+def validate_regexes(regexes: List[str]):
+    """Validate regular expressions, raise HTTPException if invalid"""
+    for regex in regexes:
+        try:
+            re.compile(regex)
+        except re.error:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=400, detail="invalid_regex")
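For reference, `re.compile` raises `re.error` on any syntactically invalid pattern, which is what the new helper turns into a 400 response; the unterminated character set `[` used throughout the tests below is the canonical example. A quick standalone check:

import re

def compiles(pattern: str) -> bool:
    """Return True if the pattern is a valid Python regular expression."""
    try:
        re.compile(pattern)
        return True
    except re.error:
        return False

assert compiles("abc.*")      # ordinary pattern
assert not compiles("[")      # unterminated character set -> re.error
assert not compiles("(foo")   # missing closing parenthesis -> re.error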
@@ -153,6 +153,26 @@ def test_update_config_invalid_format(
     assert r.status_code == 422
 
 
+def test_update_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": "["}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": ["abc.*", "["]}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
@@ -487,3 +507,25 @@ def test_get_crawler_channels(crawler_auth_headers, default_org_id):
     for crawler_channel in crawler_channels:
         assert crawler_channel["id"]
         assert crawler_channel["image"]
+
+
+def test_add_crawl_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    sample_crawl_data["config"]["exclude"] = "["
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
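A natural companion check (hypothetical, not part of this commit) is the positive case: a well-formed pattern should pass the new validation rather than trip the 400 rejection. In the style of the surrounding test module, with the same fixtures and `cid`:

def test_update_config_valid_exclude_regex(
    crawler_auth_headers, default_org_id, sample_crawl_data
):
    # Hypothetical positive case: a valid pattern must not be rejected
    # with the invalid_regex error added in this commit.
    r = requests.patch(
        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
        headers=crawler_auth_headers,
        json={"config": {"exclude": ["abc.*"]}},
    )
    assert r.status_code == 200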
@@ -148,6 +148,15 @@ def test_add_exclusion(admin_auth_headers, default_org_id):
     assert r.json()["success"] == True
 
 
+def test_add_invalid_exclusion(admin_auth_headers, default_org_id):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=[",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_remove_exclusion(admin_auth_headers, default_org_id):
     r = requests.delete(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=test",