- ensure crawlFilenameTemplate is part of the CrawlConfig model
- change CrawlConfig init to use type-safe construction
- add a run_now_internal() that is shared for starting a crawl, either on demand or from a new config
- add OrgOps.can_run_crawls() to check against org quotas for crawling
- clean up profile updates: remove _lookup_profile, only check for EmptyStr in update

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
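The heart of the change is replacing dict-based construction (`data = config.dict()` mutated by hand, then `CrawlConfig.from_dict(data)`) with direct keyword construction of the pydantic model, so a bad field fails at creation time instead of surfacing later as a malformed database record. A minimal sketch of the difference, using a stand-in model rather than the real `CrawlConfig`:

```python
# Minimal sketch with a stand-in model (not the real CrawlConfig, which has
# many more fields): keyword construction validates at write time, while a
# hand-built dict defers any mistake to whenever the record is read back.
from typing import Optional
from uuid import UUID, uuid4

from pydantic import BaseModel, ValidationError


class CrawlConfigSketch(BaseModel):
    id: UUID
    name: Optional[str] = None
    crawlFilenameTemplate: Optional[str] = None


# Dict route: nothing checks this until some later reader trips over it.
bad_data = {"id": "not-a-uuid", "name": "My Crawl"}

# Type-safe route: the same mistake fails immediately.
try:
    CrawlConfigSketch(**bad_data)
except ValidationError as err:
    print(err)  # "id" is not a valid UUID

# Valid keyword construction, mirroring the new add_crawl_config:
cfg = CrawlConfigSketch(id=uuid4(), name="My Crawl")
```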
parent 27059c91a5
commit 4db3053a9f
@@ -4,7 +4,7 @@ Crawl Config API handling
 
 # pylint: disable=too-many-lines
 
-from typing import List, Union, Optional, Tuple, TYPE_CHECKING, cast
+from typing import List, Union, Optional, TYPE_CHECKING, cast
 
 import asyncio
 import json
@@ -164,108 +164,96 @@ class CrawlConfigOps:
 
     async def get_profile_filename(
         self, profileid: Optional[UUID], org: Organization
-    ) -> Optional[str]:
+    ) -> str:
         """lookup filename from profileid"""
-        _, profile_filename = await self._lookup_profile(profileid, org)
-        return profile_filename
-
-    async def _lookup_profile(
-        self, profileid: Union[UUID, EmptyStr, None], org: Organization
-    ) -> tuple[Optional[UUID], Optional[str]]:
-        if profileid is None:
-            return None, None
-
-        if isinstance(profileid, EmptyStr) or profileid == "":
-            return None, ""
+        if not profileid:
+            return ""
 
         profile_filename = await self.profiles.get_profile_storage_path(profileid, org)
         if not profile_filename:
             raise HTTPException(status_code=400, detail="invalid_profile_id")
 
-        return profileid, profile_filename
+        return profile_filename
 
     # pylint: disable=invalid-name
     async def add_crawl_config(
         self,
-        config: CrawlConfigIn,
+        config_in: CrawlConfigIn,
         org: Organization,
         user: User,
-    ) -> Tuple[str, Optional[str], bool, bool]:
+    ) -> CrawlConfigAddedResponse:
         """Add new crawl config"""
-        data = config.dict()
-        data["oid"] = org.id
-        data["createdBy"] = user.id
-        data["createdByName"] = user.name
-        data["modifiedBy"] = user.id
-        data["modifiedByName"] = user.name
-        data["_id"] = uuid4()
-        data["created"] = dt_now()
-        data["modified"] = data["created"]
-
-        if config.runNow:
-            data["lastStartedBy"] = user.id
-            data["lastStartedByName"] = user.name
+        # ensure crawlerChannel is valid
+        if not self.get_channel_crawler_image(config_in.crawlerChannel):
+            raise HTTPException(status_code=404, detail="crawler_not_found")
+
+        # ensure profile is valid, if provided
+        if config_in.profileid:
+            await self.profiles.get_profile(config_in.profileid, org)
+
+        now = dt_now()
+        crawlconfig = CrawlConfig(
+            id=uuid4(),
+            oid=org.id,
+            createdBy=user.id,
+            createdByName=user.name,
+            modifiedBy=user.id,
+            modifiedByName=user.name,
+            created=now,
+            modified=now,
+            schedule=config_in.schedule,
+            config=config_in.config,
+            name=config_in.name,
+            description=config_in.description,
+            tags=config_in.tags,
+            jobType=config_in.jobType,
+            crawlTimeout=config_in.crawlTimeout,
+            maxCrawlSize=config_in.maxCrawlSize,
+            scale=config_in.scale,
+            autoAddCollections=config_in.autoAddCollections,
+            profileid=config_in.profileid,
+            crawlerChannel=config_in.crawlerChannel,
+            crawlFilenameTemplate=config_in.crawlFilenameTemplate,
+        )
+
+        if config_in.runNow:
+            crawlconfig.lastStartedBy = user.id
+            crawlconfig.lastStartedByName = user.name
 
         # Ensure page limit is below org maxPagesPerCall if set
         max_pages = await self.org_ops.get_max_pages_per_crawl(org.id)
         if max_pages > 0:
-            data["config"]["limit"] = max_pages
+            crawlconfig.config.limit = max_pages
 
-        data["profileid"], profile_filename = await self._lookup_profile(
-            config.profileid, org
-        )
-
-        if config.autoAddCollections:
-            data["autoAddCollections"] = config.autoAddCollections
-
-        if not self.get_channel_crawler_image(config.crawlerChannel):
-            raise HTTPException(status_code=404, detail="crawler_not_found")
-
-        result = await self.crawl_configs.insert_one(data)
-
-        crawlconfig = CrawlConfig.from_dict(data)
-
-        storage_filename = (
-            data.get("crawlFilenameTemplate") or self.default_filename_template
-        )
-
-        run_now = config.runNow
-        storage_quota_reached = await self.org_ops.storage_quota_reached(org.id)
-        exec_mins_quota_reached = await self.org_ops.exec_mins_quota_reached(org.id)
-
-        if org.readOnly:
-            run_now = False
-            print(f"Org {org.id} set to read-only", flush=True)
-
-        if storage_quota_reached:
-            run_now = False
-            print(f"Storage quota exceeded for org {org.id}", flush=True)
-
-        if exec_mins_quota_reached:
-            run_now = False
-            print(f"Execution minutes quota exceeded for org {org.id}", flush=True)
+        # add CrawlConfig to DB here
+        result = await self.crawl_configs.insert_one(crawlconfig.to_dict())
 
         await self.crawl_manager.update_scheduled_job(crawlconfig, str(user.id))
 
         crawl_id = None
+        storage_quota_reached = False
+        exec_mins_quota_reached = False
 
-        if run_now:
-            crawl_id = await self.crawl_manager.create_crawl_job(
-                crawlconfig,
-                org.storage,
-                userid=str(crawlconfig.modifiedBy),
-                warc_prefix=self.get_warc_prefix(org, crawlconfig),
-                storage_filename=storage_filename,
-                profile_filename=profile_filename or "",
-            )
+        if config_in.runNow:
+            try:
+                crawl_id = await self.run_now_internal(crawlconfig, org, user)
+            except HTTPException as e:
+                if e.detail == "storage_quota_reached":
+                    storage_quota_reached = True
+                elif e.detail == "exec_minutes_quota_reached":
+                    exec_mins_quota_reached = True
+                print(f"Can't run crawl now: {e.detail}", flush=True)
+        else:
+            storage_quota_reached = await self.org_ops.storage_quota_reached(org.id)
+            exec_mins_quota_reached = await self.org_ops.exec_mins_quota_reached(org.id)
 
-            await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
-
-        return (
-            result.inserted_id,
-            crawl_id,
-            storage_quota_reached,
-            exec_mins_quota_reached,
+        return CrawlConfigAddedResponse(
+            added=True,
+            id=str(result.inserted_id),
+            run_now_job=crawl_id,
+            storageQuotaReached=storage_quota_reached,
+            execMinutesQuotaReached=exec_mins_quota_reached,
         )
 
     async def add_new_crawl(
@@ -377,7 +365,13 @@ class CrawlConfigOps:
         query["modifiedByName"] = user.name
         query["modified"] = dt_now()
 
-        query["profileid"], _ = await self._lookup_profile(update.profileid, org)
+        # if empty str, just clear the profile
+        if isinstance(update.profileid, EmptyStr) or update.profileid == "":
+            query["profileid"] = None
+        # else, ensure it's a valid profile
+        elif update.profileid:
+            await self.profiles.get_profile(update.profileid, org)
+            query["profileid"] = update.profileid
 
         if update.config is not None:
             query["config"] = update.config.dict()
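The update path still accepts `EmptyStr`, so a client can clear a workflow's profile by sending `""`, while omitting the field leaves it unchanged. `EmptyStr` itself is defined elsewhere in the models and is not shown in this diff; a plausible pydantic v1-style sketch of such a type:

```python
# Assumed sketch of EmptyStr (the real definition lives elsewhere in
# models.py and is not shown in this diff): a str subtype that validates
# only the empty string, so Union[UUID, EmptyStr, None] can distinguish
# "clear the profile" ("") from "leave it unchanged" (None).
class EmptyStr(str):
    @classmethod
    def __get_validators__(cls):  # pydantic v1 custom-type hook
        yield cls.validate

    @classmethod
    def validate(cls, value: str) -> "EmptyStr":
        if value != "":
            raise ValueError("must be an empty string")
        return cls(value)
```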
@@ -822,35 +816,29 @@ class CrawlConfigOps:
             "workflowIds": workflow_ids,
         }
 
-    async def prepare_for_run_crawl(self, cid: UUID, org: Organization) -> CrawlConfig:
-        """prepare for running a crawl, returning crawlconfig and
-        validating that running crawls is allowed"""
+    async def run_now(self, cid: UUID, org: Organization, user: User) -> str:
+        """run new crawl for cid now, if possible"""
        crawlconfig = await self.get_crawl_config(cid, org.id)
 
         if not crawlconfig:
             raise HTTPException(
                 status_code=404, detail=f"Crawl Config '{cid}' not found"
             )
 
-        if org.readOnly:
-            raise HTTPException(status_code=403, detail="org_set_to_read_only")
+        return await self.run_now_internal(crawlconfig, org, user)
 
-        if await self.org_ops.storage_quota_reached(org.id):
-            raise HTTPException(status_code=403, detail="storage_quota_reached")
-
-        if await self.org_ops.exec_mins_quota_reached(org.id):
-            raise HTTPException(status_code=403, detail="exec_minutes_quota_reached")
-
-        return crawlconfig
-
-    async def run_now(self, cid: UUID, org: Organization, user: User):
-        """run specified crawlconfig now"""
-        crawlconfig = await self.prepare_for_run_crawl(cid, org)
+    async def run_now_internal(
+        self, crawlconfig: CrawlConfig, org: Organization, user: User
+    ) -> str:
+        """run new crawl for specified crawlconfig now"""
+        await self.org_ops.can_run_crawls(org)
 
         if await self.get_running_crawl(crawlconfig):
             raise HTTPException(status_code=400, detail="crawl_already_running")
 
         profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
+        storage_filename = (
+            crawlconfig.crawlFilenameTemplate or self.default_filename_template
+        )
 
         try:
             crawl_id = await self.crawl_manager.create_crawl_job(
@@ -858,7 +846,7 @@ class CrawlConfigOps:
                 org.storage,
                 userid=str(user.id),
                 warc_prefix=self.get_warc_prefix(org, crawlconfig),
-                storage_filename=self.default_filename_template,
+                storage_filename=storage_filename,
                 profile_filename=profile_filename or "",
             )
             await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
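With this change, `run_now_internal` honors the per-workflow `crawlFilenameTemplate` and falls back to the deployment-wide default only when the workflow does not set one. A toy illustration of the `or` fallback (the template strings are invented examples, not the actual defaults):

```python
# Toy illustration of the filename-template fallback; the template strings
# here are invented, not the deployment defaults.
from typing import Optional


def pick_storage_filename(
    crawl_filename_template: Optional[str], default_template: str
) -> str:
    # None and "" both fall back to the default, matching `or` semantics
    return crawl_filename_template or default_template


assert pick_storage_filename(None, "@ts-@id.wacz") == "@ts-@id.wacz"
assert pick_storage_filename("", "@ts-@id.wacz") == "@ts-@id.wacz"
assert pick_storage_filename("custom-@id.wacz", "@ts-@id.wacz") == "custom-@id.wacz"
```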
@@ -1120,19 +1108,7 @@ def init_crawl_config_api(
         org: Organization = Depends(org_crawl_dep),
         user: User = Depends(user_dep),
     ):
-        (
-            cid,
-            new_job_name,
-            storage_quota_reached,
-            exec_mins_quota_reached,
-        ) = await ops.add_crawl_config(config, org, user)
-        return {
-            "added": True,
-            "id": str(cid),
-            "run_now_job": new_job_name,
-            "storageQuotaReached": storage_quota_reached,
-            "execMinutesQuotaReached": exec_mins_quota_reached,
-        }
+        return await ops.add_crawl_config(config, org, user)
 
     @router.patch(
         "/{cid}",
@@ -776,7 +776,9 @@ class CrawlOps(BaseCrawlOps):
         if not crawl.cid or crawl.type != "crawl":
             raise HTTPException(status_code=400, detail="invalid_crawl_for_qa")
 
-        crawlconfig = await self.crawl_configs.prepare_for_run_crawl(crawl.cid, org)
+        await self.orgs.can_run_crawls(org)
+
+        crawlconfig = await self.crawl_configs.get_crawl_config(crawl.cid, org.id)
 
         try:
             qa_run_id = await self.crawl_manager.create_qa_crawl_job(
@@ -314,7 +314,7 @@ class CrawlConfigIn(BaseModel):
 
     jobType: Optional[JobType] = JobType.CUSTOM
 
-    profileid: Union[UUID, EmptyStr, None]
+    profileid: Optional[UUID] = None
     crawlerChannel: str = "default"
 
     autoAddCollections: Optional[List[UUID]] = []
@@ -407,6 +407,8 @@ class CrawlConfigAdditional(BaseModel):
 
     isCrawlRunning: Optional[bool] = False
 
+    crawlFilenameTemplate: Optional[str] = None
+
 
 # ============================================================================
 class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional):
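`add_crawl_config` now returns a `CrawlConfigAddedResponse`, whose definition is not part of these hunks. Inferred purely from the keyword arguments passed to it above, the model plausibly looks like the sketch below; the exact definition may differ. Returning a typed model also lets FastAPI serialize the endpoint response directly, which is why the hand-built dict in `init_crawl_config_api` collapses to a single `return`.

```python
# Inferred sketch of CrawlConfigAddedResponse, based solely on the fields
# used in add_crawl_config above; the real definition may differ.
from typing import Optional

from pydantic import BaseModel


class CrawlConfigAddedResponse(BaseModel):
    added: bool = True
    id: str
    run_now_job: Optional[str] = None
    storageQuotaReached: bool = False
    execMinutesQuotaReached: bool = False
```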
@@ -697,6 +697,17 @@ class OrgOps:
 
         return False
 
+    async def can_run_crawls(self, org: Organization) -> None:
+        """check crawl quotas and readOnly state, throw if cannot run"""
+        if org.readOnly:
+            raise HTTPException(status_code=403, detail="org_set_to_read_only")
+
+        if await self.storage_quota_reached(org.id):
+            raise HTTPException(status_code=403, detail="storage_quota_reached")
+
+        if await self.exec_mins_quota_reached(org.id):
+            raise HTTPException(status_code=403, detail="exec_minutes_quota_reached")
+
     async def get_monthly_crawl_exec_seconds(self, oid: UUID) -> int:
         """Return monthlyExecSeconds for current month"""
         org_data = await self.orgs.find_one({"_id": oid})
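`can_run_crawls` raises rather than returning a bool: endpoint paths fail fast with a 403, while `add_crawl_config` catches the exception and degrades to response flags. A sketch of that soft-check pattern, using a hypothetical helper that is not part of the codebase:

```python
# Hypothetical helper (not from the codebase) showing the soft-check
# pattern add_crawl_config uses: map the 403 detail back to flags
# instead of letting the exception propagate.
from fastapi import HTTPException


async def check_can_run(org_ops, org) -> tuple[bool, str]:
    """Return (allowed, reason) instead of raising."""
    try:
        await org_ops.can_run_crawls(org)
        return True, ""
    except HTTPException as exc:
        return False, str(exc.detail)
```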