From 4db3053a9f08e3509f46d2f3edfdd5adc550f421 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 17 Jul 2024 10:48:25 -0700 Subject: [PATCH] fix crawlFilenameTemplate + add_crawl_config cleanup (fixes #1932) (#1935) - ensure crawlFilenameTemplate is part of the CrawlConfig model - change CrawlConfig init to use type-safe construction - add a run_now_internal() that is shared for starting crawl, either on demand or from new config - add OrgOps.can_run_crawls() to check against org quotas for crawling - cleanup profile updates, remove _lookup_profile, only check for EmptyStr in update --------- Co-authored-by: Tessa Walsh --- backend/btrixcloud/crawlconfigs.py | 196 +++++++++++++---------------- backend/btrixcloud/crawls.py | 4 +- backend/btrixcloud/models.py | 4 +- backend/btrixcloud/orgs.py | 11 ++ 4 files changed, 103 insertions(+), 112 deletions(-) diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index d107bdf5..36c997cb 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -4,7 +4,7 @@ Crawl Config API handling # pylint: disable=too-many-lines -from typing import List, Union, Optional, Tuple, TYPE_CHECKING, cast +from typing import List, Union, Optional, TYPE_CHECKING, cast import asyncio import json @@ -164,108 +164,96 @@ class CrawlConfigOps: async def get_profile_filename( self, profileid: Optional[UUID], org: Organization - ) -> Optional[str]: + ) -> str: """lookup filename from profileid""" - _, profile_filename = await self._lookup_profile(profileid, org) - return profile_filename - - async def _lookup_profile( - self, profileid: Union[UUID, EmptyStr, None], org: Organization - ) -> tuple[Optional[UUID], Optional[str]]: - if profileid is None: - return None, None - - if isinstance(profileid, EmptyStr) or profileid == "": - return None, "" + if not profileid: + return "" profile_filename = await self.profiles.get_profile_storage_path(profileid, org) if not profile_filename: raise HTTPException(status_code=400, detail="invalid_profile_id") - return profileid, profile_filename + return profile_filename # pylint: disable=invalid-name async def add_crawl_config( self, - config: CrawlConfigIn, + config_in: CrawlConfigIn, org: Organization, user: User, - ) -> Tuple[str, Optional[str], bool, bool]: + ) -> CrawlConfigAddedResponse: """Add new crawl config""" - data = config.dict() - data["oid"] = org.id - data["createdBy"] = user.id - data["createdByName"] = user.name - data["modifiedBy"] = user.id - data["modifiedByName"] = user.name - data["_id"] = uuid4() - data["created"] = dt_now() - data["modified"] = data["created"] - if config.runNow: - data["lastStartedBy"] = user.id - data["lastStartedByName"] = user.name + # ensure crawlChannel is valid + if not self.get_channel_crawler_image(config_in.crawlerChannel): + raise HTTPException(status_code=404, detail="crawler_not_found") + + # ensure profile is valid, if provided + if config_in.profileid: + await self.profiles.get_profile(config_in.profileid, org) + + now = dt_now() + crawlconfig = CrawlConfig( + id=uuid4(), + oid=org.id, + createdBy=user.id, + createdByName=user.name, + modifiedBy=user.id, + modifiedByName=user.name, + created=now, + modified=now, + schedule=config_in.schedule, + config=config_in.config, + name=config_in.name, + description=config_in.description, + tags=config_in.tags, + jobType=config_in.jobType, + crawlTimeout=config_in.crawlTimeout, + maxCrawlSize=config_in.maxCrawlSize, + scale=config_in.scale, + 
autoAddCollections=config_in.autoAddCollections, + profileid=config_in.profileid, + crawlerChannel=config_in.crawlerChannel, + crawlFilenameTemplate=config_in.crawlFilenameTemplate, + ) + + if config_in.runNow: + crawlconfig.lastStartedBy = user.id + crawlconfig.lastStartedByName = user.name # Ensure page limit is below org maxPagesPerCall if set max_pages = await self.org_ops.get_max_pages_per_crawl(org.id) if max_pages > 0: - data["config"]["limit"] = max_pages + crawlconfig.config.limit = max_pages - data["profileid"], profile_filename = await self._lookup_profile( - config.profileid, org - ) - - if config.autoAddCollections: - data["autoAddCollections"] = config.autoAddCollections - - if not self.get_channel_crawler_image(config.crawlerChannel): - raise HTTPException(status_code=404, detail="crawler_not_found") - - result = await self.crawl_configs.insert_one(data) - - crawlconfig = CrawlConfig.from_dict(data) - - storage_filename = ( - data.get("crawlFilenameTemplate") or self.default_filename_template - ) - - run_now = config.runNow - storage_quota_reached = await self.org_ops.storage_quota_reached(org.id) - exec_mins_quota_reached = await self.org_ops.exec_mins_quota_reached(org.id) - - if org.readOnly: - run_now = False - print(f"Org {org.id} set to read-only", flush=True) - - if storage_quota_reached: - run_now = False - print(f"Storage quota exceeded for org {org.id}", flush=True) - - if exec_mins_quota_reached: - run_now = False - print(f"Execution minutes quota exceeded for org {org.id}", flush=True) + # add CrawlConfig to DB here + result = await self.crawl_configs.insert_one(crawlconfig.to_dict()) await self.crawl_manager.update_scheduled_job(crawlconfig, str(user.id)) crawl_id = None + storage_quota_reached = False + exec_mins_quota_reached = False - if run_now: - crawl_id = await self.crawl_manager.create_crawl_job( - crawlconfig, - org.storage, - userid=str(crawlconfig.modifiedBy), - warc_prefix=self.get_warc_prefix(org, crawlconfig), - storage_filename=storage_filename, - profile_filename=profile_filename or "", - ) + if config_in.runNow: + try: + crawl_id = await self.run_now_internal(crawlconfig, org, user) + except HTTPException as e: + if e.detail == "storage_quota_reached": + storage_quota_reached = True + elif e.detail == "exec_minutes_quota_reached": + exec_mins_quota_reached = True + print(f"Can't run crawl now: {e.detail}", flush=True) + else: + storage_quota_reached = await self.org_ops.storage_quota_reached(org.id) + exec_mins_quota_reached = await self.org_ops.exec_mins_quota_reached(org.id) - await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True) - - return ( - result.inserted_id, - crawl_id, - storage_quota_reached, - exec_mins_quota_reached, + return CrawlConfigAddedResponse( + added=True, + id=str(result.inserted_id), + run_now_job=crawl_id, + storageQuotaReached=storage_quota_reached, + execMinutesQuotaReached=exec_mins_quota_reached, ) async def add_new_crawl( @@ -377,7 +365,13 @@ class CrawlConfigOps: query["modifiedByName"] = user.name query["modified"] = dt_now() - query["profileid"], _ = await self._lookup_profile(update.profileid, org) + # if empty str, just clear the profile + if isinstance(update.profileid, EmptyStr) or update.profileid == "": + query["profileid"] = None + # else, ensure its a valid profile + elif update.profileid: + await self.profiles.get_profile(update.profileid, org) + query["profileid"] = update.profileid if update.config is not None: query["config"] = update.config.dict() @@ -822,35 +816,29 @@ class 
CrawlConfigOps: "workflowIds": workflow_ids, } - async def prepare_for_run_crawl(self, cid: UUID, org: Organization) -> CrawlConfig: - """prepare for running a crawl, returning crawlconfig and - validating that running crawls is allowed""" + async def run_now(self, cid: UUID, org: Organization, user: User) -> str: + """run new crawl for cid now, if possible""" crawlconfig = await self.get_crawl_config(cid, org.id) - if not crawlconfig: raise HTTPException( status_code=404, detail=f"Crawl Config '{cid}' not found" ) - if org.readOnly: - raise HTTPException(status_code=403, detail="org_set_to_read_only") + return await self.run_now_internal(crawlconfig, org, user) - if await self.org_ops.storage_quota_reached(org.id): - raise HTTPException(status_code=403, detail="storage_quota_reached") - - if await self.org_ops.exec_mins_quota_reached(org.id): - raise HTTPException(status_code=403, detail="exec_minutes_quota_reached") - - return crawlconfig - - async def run_now(self, cid: UUID, org: Organization, user: User): - """run specified crawlconfig now""" - crawlconfig = await self.prepare_for_run_crawl(cid, org) + async def run_now_internal( + self, crawlconfig: CrawlConfig, org: Organization, user: User + ) -> str: + """run new crawl for specified crawlconfig now""" + await self.org_ops.can_run_crawls(org) if await self.get_running_crawl(crawlconfig): raise HTTPException(status_code=400, detail="crawl_already_running") profile_filename = await self.get_profile_filename(crawlconfig.profileid, org) + storage_filename = ( + crawlconfig.crawlFilenameTemplate or self.default_filename_template + ) try: crawl_id = await self.crawl_manager.create_crawl_job( @@ -858,7 +846,7 @@ class CrawlConfigOps: org.storage, userid=str(user.id), warc_prefix=self.get_warc_prefix(org, crawlconfig), - storage_filename=self.default_filename_template, + storage_filename=storage_filename, profile_filename=profile_filename or "", ) await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True) @@ -1120,19 +1108,7 @@ def init_crawl_config_api( org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep), ): - ( - cid, - new_job_name, - storage_quota_reached, - exec_mins_quota_reached, - ) = await ops.add_crawl_config(config, org, user) - return { - "added": True, - "id": str(cid), - "run_now_job": new_job_name, - "storageQuotaReached": storage_quota_reached, - "execMinutesQuotaReached": exec_mins_quota_reached, - } + return await ops.add_crawl_config(config, org, user) @router.patch( "/{cid}", diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py index 725b95d3..d1e6215b 100644 --- a/backend/btrixcloud/crawls.py +++ b/backend/btrixcloud/crawls.py @@ -776,7 +776,9 @@ class CrawlOps(BaseCrawlOps): if not crawl.cid or crawl.type != "crawl": raise HTTPException(status_code=400, detail="invalid_crawl_for_qa") - crawlconfig = await self.crawl_configs.prepare_for_run_crawl(crawl.cid, org) + await self.orgs.can_run_crawls(org) + + crawlconfig = await self.crawl_configs.get_crawl_config(crawl.cid, org.id) try: qa_run_id = await self.crawl_manager.create_qa_crawl_job( diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 450df4cb..184df795 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -314,7 +314,7 @@ class CrawlConfigIn(BaseModel): jobType: Optional[JobType] = JobType.CUSTOM - profileid: Union[UUID, EmptyStr, None] + profileid: Optional[UUID] = None crawlerChannel: str = "default" autoAddCollections: Optional[List[UUID]] = [] @@ -407,6 
+407,8 @@ class CrawlConfigAdditional(BaseModel): isCrawlRunning: Optional[bool] = False + crawlFilenameTemplate: Optional[str] = None + # ============================================================================ class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional): diff --git a/backend/btrixcloud/orgs.py b/backend/btrixcloud/orgs.py index aeeae67e..44974e53 100644 --- a/backend/btrixcloud/orgs.py +++ b/backend/btrixcloud/orgs.py @@ -697,6 +697,17 @@ class OrgOps: return False + async def can_run_crawls(self, org: Organization) -> None: + """check crawl quotas and readOnly state, throw if can not run""" + if org.readOnly: + raise HTTPException(status_code=403, detail="org_set_to_read_only") + + if await self.storage_quota_reached(org.id): + raise HTTPException(status_code=403, detail="storage_quota_reached") + + if await self.exec_mins_quota_reached(org.id): + raise HTTPException(status_code=403, detail="exec_minutes_quota_reached") + async def get_monthly_crawl_exec_seconds(self, oid: UUID) -> int: """Return monthlyExecSeconds for current month""" org_data = await self.orgs.find_one({"_id": oid})
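
A minimal, self-contained sketch of the quota-gating flow this patch introduces: run_now_internal() calls OrgOps.can_run_crawls(), which raises an HTTPException whose detail names the blocking condition, and the new add_crawl_config() catches that exception to fill in the storageQuotaReached / execMinutesQuotaReached flags on CrawlConfigAddedResponse rather than failing config creation. Everything outside those names is an assumption for illustration only: the HTTPException class below is a stand-in for fastapi.HTTPException, and the boolean arguments and placeholder ids replace the real OrgOps quota checks and crawl_manager.create_crawl_job() call.

    # Illustrative sketch only -- not part of the patch above.
    import asyncio


    class HTTPException(Exception):
        """Minimal stand-in for fastapi.HTTPException."""

        def __init__(self, status_code: int, detail: str):
            super().__init__(detail)
            self.status_code = status_code
            self.detail = detail


    async def can_run_crawls(read_only: bool, storage_full: bool, exec_mins_used_up: bool) -> None:
        """Mirrors OrgOps.can_run_crawls(): raise if the org may not crawl right now."""
        if read_only:
            raise HTTPException(status_code=403, detail="org_set_to_read_only")
        if storage_full:
            raise HTTPException(status_code=403, detail="storage_quota_reached")
        if exec_mins_used_up:
            raise HTTPException(status_code=403, detail="exec_minutes_quota_reached")


    async def add_crawl_config_sketch(
        run_now: bool,
        read_only: bool = False,
        storage_full: bool = False,
        exec_mins_used_up: bool = False,
    ) -> dict:
        """Mirrors the new add_crawl_config() flow: a quota failure no longer aborts
        config creation, it just reports why the crawl was not started."""
        crawl_id = None
        storage_quota_reached = False
        exec_mins_quota_reached = False

        if run_now:
            try:
                # run_now_internal() checks quotas before creating the crawl job
                await can_run_crawls(read_only, storage_full, exec_mins_used_up)
                crawl_id = "manual-crawl-id"  # placeholder for crawl_manager.create_crawl_job()
            except HTTPException as e:
                if e.detail == "storage_quota_reached":
                    storage_quota_reached = True
                elif e.detail == "exec_minutes_quota_reached":
                    exec_mins_quota_reached = True
                print(f"Can't run crawl now: {e.detail}", flush=True)
        else:
            # when not running now, still report current quota state to the client
            storage_quota_reached = storage_full
            exec_mins_quota_reached = exec_mins_used_up

        # shaped like CrawlConfigAddedResponse
        return {
            "added": True,
            "id": "new-config-id",
            "run_now_job": crawl_id,
            "storageQuotaReached": storage_quota_reached,
            "execMinutesQuotaReached": exec_mins_quota_reached,
        }


    if __name__ == "__main__":
        print(asyncio.run(add_crawl_config_sketch(run_now=True, storage_full=True)))

The point of the pattern is that workflow creation still succeeds when the crawl itself cannot start; the quota or read-only state is surfaced in the response flags instead of a 403 on the whole request.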