Validate exclusion regexes on backend (#2316)
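This change validates exclusion regexes on the server rather than trusting the frontend: patterns are compiled with Python's re module when a crawl config is created or updated (CrawlConfigOps) and when an exclusion is added to a running crawl (CrawlOps). Any pattern that fails to compile is rejected with HTTP 400 and detail "invalid_regex". The shared helper validate_regexes lives in the utils module, and tests cover both the config endpoints and the live-crawl exclusion endpoint.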
Parent: 763c654484
Commit: 9363095d62
@@ -43,7 +43,7 @@ from .models import (
     CrawlerProxy,
     CrawlerProxies,
 )
-from .utils import dt_now, slug_from_name
+from .utils import dt_now, slug_from_name, validate_regexes
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
@@ -189,7 +189,7 @@ class CrawlConfigOps:
 
         return profile_filename
 
-    # pylint: disable=invalid-name
+    # pylint: disable=invalid-name, too-many-branches
     async def add_crawl_config(
         self,
         config_in: CrawlConfigIn,
@@ -215,6 +215,12 @@ class CrawlConfigOps:
             if not self.can_org_use_proxy(org, config_in.proxyId):
                 raise HTTPException(status_code=404, detail="proxy_not_found")
 
+        if config_in.config.exclude:
+            exclude = config_in.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         now = dt_now()
         crawlconfig = CrawlConfig(
             id=uuid4(),
@@ -317,11 +323,17 @@ class CrawlConfigOps:
     async def update_crawl_config(
         self, cid: UUID, org: Organization, user: User, update: UpdateCrawlConfig
     ) -> dict[str, bool | str]:
-        # pylint: disable=too-many-locals
+        # pylint: disable=too-many-locals, too-many-branches, too-many-statements
         """Update name, scale, schedule, and/or tags for an existing crawl config"""
 
         orig_crawl_config = await self.get_crawl_config(cid, org.id)
 
+        if update.config and update.config.exclude:
+            exclude = update.config.exclude
+            if isinstance(exclude, str):
+                exclude = [exclude]
+            validate_regexes(exclude)
+
         # indicates if any k8s crawl config settings changed
         changed = False
         changed = changed or (
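Both the create and update paths use the same normalize-then-validate step: config.exclude may arrive as a single string or a list of strings, so it is coerced to a list before being passed to validate_regexes. A minimal standalone sketch of that pattern (the function name here is illustrative, not from the codebase):

from typing import List, Union

def normalized_exclude(exclude: Union[str, List[str]]) -> List[str]:
    """Coerce a bare pattern string into a one-element list so callers
    can always iterate over a list of patterns."""
    if isinstance(exclude, str):
        return [exclude]
    return exclude

assert normalized_exclude("abc.*") == ["abc.*"]
assert normalized_exclude(["abc.*", "xyz"]) == ["abc.*", "xyz"]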
@@ -24,6 +24,7 @@ from .utils import (
     date_to_str,
     parse_jsonl_error_messages,
     stream_dict_list_as_csv,
+    validate_regexes,
 )
 from .basecrawls import BaseCrawlOps
 from .crawlmanager import CrawlManager
@@ -517,6 +518,9 @@ class CrawlOps(BaseCrawlOps):
         """add new exclusion to config or remove exclusion from config
         for given crawl_id, update config on crawl"""
 
+        if add:
+            validate_regexes([regex])
+
         crawl = await self.get_crawl(crawl_id, org)
 
         if crawl.state not in RUNNING_AND_WAITING_STATES:
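Note that validation runs only when add is true: removing an exclusion skips the check, so a pattern stored before this change can still be deleted even if it would no longer pass validation.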
@@ -194,3 +194,13 @@ def get_origin(headers) -> str:
         return default_origin
 
     return scheme + "://" + host
+
+
+def validate_regexes(regexes: List[str]):
+    """Validate regular expressions, raise HTTPException if invalid"""
+    for regex in regexes:
+        try:
+            re.compile(regex)
+        except re.error:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=400, detail="invalid_regex")
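The helper relies entirely on the standard library: re.compile() raises re.error for any syntactically invalid pattern. A self-contained sketch of the same check, with FastAPI's HTTPException swapped for ValueError so it runs outside the web app:

import re
from typing import List

def check_regexes(regexes: List[str]) -> None:
    """Raise ValueError for the first pattern that fails to compile."""
    for regex in regexes:
        try:
            re.compile(regex)
        except re.error as exc:
            raise ValueError(f"invalid_regex: {regex!r}") from exc

check_regexes(["abc.*", r"https?://example\.com/.*"])  # valid: no exception
try:
    check_regexes(["["])  # "[" opens an unterminated character set
except ValueError as err:
    print(err)  # invalid_regex: '['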
@@ -153,6 +153,26 @@ def test_update_config_invalid_format(
     assert r.status_code == 422
 
 
+def test_update_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": "["}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    r = requests.patch(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
+        headers=crawler_auth_headers,
+        json={"config": {"exclude": ["abc.*", "["]}},
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_update_config_data(crawler_auth_headers, default_org_id, sample_crawl_data):
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/{cid}/",
@@ -487,3 +507,25 @@ def test_get_crawler_channels(crawler_auth_headers, default_org_id):
     for crawler_channel in crawler_channels:
         assert crawler_channel["id"]
         assert crawler_channel["image"]
+
+
+def test_add_crawl_config_invalid_exclude_regex(
+    crawler_auth_headers, default_org_id, sample_crawl_data
+):
+    sample_crawl_data["config"]["exclude"] = "["
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+    sample_crawl_data["config"]["exclude"] = ["abc.*", "["]
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawlconfigs/",
+        headers=crawler_auth_headers,
+        json=sample_crawl_data,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
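These tests reuse "[" as the canonical invalid pattern (it opens an unterminated character set) and cover both accepted shapes of exclude: a bare string, and a list in which a single bad entry fails the whole request.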
@@ -148,6 +148,15 @@ def test_add_exclusion(admin_auth_headers, default_org_id):
     assert r.json()["success"] == True
 
 
+def test_add_invalid_exclusion(admin_auth_headers, default_org_id):
+    r = requests.post(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=[",
+        headers=admin_auth_headers,
+    )
+    assert r.status_code == 400
+    assert r.json()["detail"] == "invalid_regex"
+
+
 def test_remove_exclusion(admin_auth_headers, default_org_id):
     r = requests.delete(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{admin_crawl_id}/exclusions?regex=test",