fix crawlFilenameTemplate + add_crawl_config cleanup (fixes #1932) (#1935)

- ensure crawlFilenameTemplate is part of the CrawlConfig model
- change CrawlConfig init to use type-safe construction (see the sketch
after the add_crawl_config hunk below)
- add a run_now_internal() that is shared for starting a crawl, either
on demand or from a new config
- add OrgOps.can_run_crawls() to check org quotas and read-only state
before crawling (see the sketch after the orgs diff below)
- clean up profile updates: remove _lookup_profile, and only check for
EmptyStr in update

---------
Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2024-07-17 10:48:25 -07:00 committed by GitHub
parent 27059c91a5
commit 4db3053a9f
4 changed files with 103 additions and 112 deletions
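
Note: the new CrawlConfigAddedResponse return type is referenced but not
defined in the hunks below. Judging from the constructor call in
add_crawl_config, a minimal sketch of the model might look like this
(field names are taken from the diff; types and defaults are assumptions):

from typing import Optional
from pydantic import BaseModel

class CrawlConfigAddedResponse(BaseModel):
    # sketch only: reconstructed from the constructor call in the diff,
    # not the actual definition in the models module
    added: bool = True
    id: str                            # str(result.inserted_id)
    run_now_job: Optional[str] = None  # crawl id if runNow started a job
    storageQuotaReached: bool = False
    execMinutesQuotaReached: bool = False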

View File

@@ -4,7 +4,7 @@ Crawl Config API handling
 # pylint: disable=too-many-lines
-from typing import List, Union, Optional, Tuple, TYPE_CHECKING, cast
+from typing import List, Union, Optional, TYPE_CHECKING, cast
 import asyncio
 import json
@@ -164,108 +164,96 @@ class CrawlConfigOps:
     async def get_profile_filename(
         self, profileid: Optional[UUID], org: Organization
-    ) -> Optional[str]:
+    ) -> str:
         """lookup filename from profileid"""
-        _, profile_filename = await self._lookup_profile(profileid, org)
-        return profile_filename
-
-    async def _lookup_profile(
-        self, profileid: Union[UUID, EmptyStr, None], org: Organization
-    ) -> tuple[Optional[UUID], Optional[str]]:
-        if profileid is None:
-            return None, None
-
-        if isinstance(profileid, EmptyStr) or profileid == "":
-            return None, ""
+        if not profileid:
+            return ""
 
         profile_filename = await self.profiles.get_profile_storage_path(profileid, org)
         if not profile_filename:
             raise HTTPException(status_code=400, detail="invalid_profile_id")
 
-        return profileid, profile_filename
+        return profile_filename
 
     # pylint: disable=invalid-name
     async def add_crawl_config(
         self,
-        config: CrawlConfigIn,
+        config_in: CrawlConfigIn,
         org: Organization,
         user: User,
-    ) -> Tuple[str, Optional[str], bool, bool]:
+    ) -> CrawlConfigAddedResponse:
         """Add new crawl config"""
-        data = config.dict()
-        data["oid"] = org.id
-        data["createdBy"] = user.id
-        data["createdByName"] = user.name
-        data["modifiedBy"] = user.id
-        data["modifiedByName"] = user.name
-        data["_id"] = uuid4()
-        data["created"] = dt_now()
-        data["modified"] = data["created"]
-        if config.runNow:
-            data["lastStartedBy"] = user.id
-            data["lastStartedByName"] = user.name
+        # ensure crawlChannel is valid
+        if not self.get_channel_crawler_image(config_in.crawlerChannel):
+            raise HTTPException(status_code=404, detail="crawler_not_found")
+
+        # ensure profile is valid, if provided
+        if config_in.profileid:
+            await self.profiles.get_profile(config_in.profileid, org)
+
+        now = dt_now()
+        crawlconfig = CrawlConfig(
+            id=uuid4(),
+            oid=org.id,
+            createdBy=user.id,
+            createdByName=user.name,
+            modifiedBy=user.id,
+            modifiedByName=user.name,
+            created=now,
+            modified=now,
+            schedule=config_in.schedule,
+            config=config_in.config,
+            name=config_in.name,
+            description=config_in.description,
+            tags=config_in.tags,
+            jobType=config_in.jobType,
+            crawlTimeout=config_in.crawlTimeout,
+            maxCrawlSize=config_in.maxCrawlSize,
+            scale=config_in.scale,
+            autoAddCollections=config_in.autoAddCollections,
+            profileid=config_in.profileid,
+            crawlerChannel=config_in.crawlerChannel,
+            crawlFilenameTemplate=config_in.crawlFilenameTemplate,
+        )
+
+        if config_in.runNow:
+            crawlconfig.lastStartedBy = user.id
+            crawlconfig.lastStartedByName = user.name
 
         # Ensure page limit is below org maxPagesPerCall if set
         max_pages = await self.org_ops.get_max_pages_per_crawl(org.id)
         if max_pages > 0:
-            data["config"]["limit"] = max_pages
+            crawlconfig.config.limit = max_pages
 
-        data["profileid"], profile_filename = await self._lookup_profile(
-            config.profileid, org
-        )
-
-        if config.autoAddCollections:
-            data["autoAddCollections"] = config.autoAddCollections
-
-        if not self.get_channel_crawler_image(config.crawlerChannel):
-            raise HTTPException(status_code=404, detail="crawler_not_found")
-
-        result = await self.crawl_configs.insert_one(data)
-
-        crawlconfig = CrawlConfig.from_dict(data)
-
-        storage_filename = (
-            data.get("crawlFilenameTemplate") or self.default_filename_template
-        )
-
-        run_now = config.runNow
-        storage_quota_reached = await self.org_ops.storage_quota_reached(org.id)
-        exec_mins_quota_reached = await self.org_ops.exec_mins_quota_reached(org.id)
-
-        if org.readOnly:
-            run_now = False
-            print(f"Org {org.id} set to read-only", flush=True)
-
-        if storage_quota_reached:
-            run_now = False
-            print(f"Storage quota exceeded for org {org.id}", flush=True)
-
-        if exec_mins_quota_reached:
-            run_now = False
-            print(f"Execution minutes quota exceeded for org {org.id}", flush=True)
+        # add CrawlConfig to DB here
+        result = await self.crawl_configs.insert_one(crawlconfig.to_dict())
 
         await self.crawl_manager.update_scheduled_job(crawlconfig, str(user.id))
 
         crawl_id = None
+        storage_quota_reached = False
+        exec_mins_quota_reached = False
 
-        if run_now:
-            crawl_id = await self.crawl_manager.create_crawl_job(
-                crawlconfig,
-                org.storage,
-                userid=str(crawlconfig.modifiedBy),
-                warc_prefix=self.get_warc_prefix(org, crawlconfig),
-                storage_filename=storage_filename,
-                profile_filename=profile_filename or "",
-            )
+        if config_in.runNow:
+            try:
+                crawl_id = await self.run_now_internal(crawlconfig, org, user)
+            except HTTPException as e:
+                if e.detail == "storage_quota_reached":
+                    storage_quota_reached = True
+                elif e.detail == "exec_minutes_quota_reached":
+                    exec_mins_quota_reached = True
+
+                print(f"Can't run crawl now: {e.detail}", flush=True)
+        else:
+            storage_quota_reached = await self.org_ops.storage_quota_reached(org.id)
+            exec_mins_quota_reached = await self.org_ops.exec_mins_quota_reached(org.id)
 
-        await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
-
-        return (
-            result.inserted_id,
-            crawl_id,
-            storage_quota_reached,
-            exec_mins_quota_reached,
+        return CrawlConfigAddedResponse(
+            added=True,
+            id=str(result.inserted_id),
+            run_now_job=crawl_id,
+            storageQuotaReached=storage_quota_reached,
+            execMinutesQuotaReached=exec_mins_quota_reached,
         )
 
     async def add_new_crawl(
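
Why the keyword-argument construction above is safer than the old dict
assembly: Pydantic validates at construction time, so a missing or
misspelled field fails immediately rather than when the stored dict is
later parsed. A toy example (Workflow is a hypothetical stand-in, not
the real CrawlConfig):

from typing import Optional
from uuid import UUID, uuid4
from pydantic import BaseModel, ValidationError

class Workflow(BaseModel):
    id: UUID
    name: str
    crawlFilenameTemplate: Optional[str] = None

# dict-style assembly hides the misspelled key until the data is read back
data = {"id": uuid4(), "nme": "My Crawl"}

# keyword construction fails fast: "name" was never set, so Pydantic
# raises a ValidationError here instead of at insert or from_dict() time
try:
    Workflow(id=uuid4(), nme="My Crawl")
except ValidationError as err:
    print(err)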
@@ -377,7 +365,13 @@ class CrawlConfigOps:
         query["modifiedByName"] = user.name
         query["modified"] = dt_now()
 
-        query["profileid"], _ = await self._lookup_profile(update.profileid, org)
+        # if empty str, just clear the profile
+        if isinstance(update.profileid, EmptyStr) or update.profileid == "":
+            query["profileid"] = None
+        # else, ensure its a valid profile
+        elif update.profileid:
+            await self.profiles.get_profile(update.profileid, org)
+            query["profileid"] = update.profileid
 
         if update.config is not None:
             query["config"] = update.config.dict()
@@ -822,35 +816,29 @@ class CrawlConfigOps:
             "workflowIds": workflow_ids,
         }
 
-    async def prepare_for_run_crawl(self, cid: UUID, org: Organization) -> CrawlConfig:
-        """prepare for running a crawl, returning crawlconfig and
-        validating that running crawls is allowed"""
+    async def run_now(self, cid: UUID, org: Organization, user: User) -> str:
+        """run new crawl for cid now, if possible"""
         crawlconfig = await self.get_crawl_config(cid, org.id)
 
         if not crawlconfig:
             raise HTTPException(
                 status_code=404, detail=f"Crawl Config '{cid}' not found"
             )
 
-        if org.readOnly:
-            raise HTTPException(status_code=403, detail="org_set_to_read_only")
+        return await self.run_now_internal(crawlconfig, org, user)
 
-        if await self.org_ops.storage_quota_reached(org.id):
-            raise HTTPException(status_code=403, detail="storage_quota_reached")
-
-        if await self.org_ops.exec_mins_quota_reached(org.id):
-            raise HTTPException(status_code=403, detail="exec_minutes_quota_reached")
-
-        return crawlconfig
-
-    async def run_now(self, cid: UUID, org: Organization, user: User):
-        """run specified crawlconfig now"""
-        crawlconfig = await self.prepare_for_run_crawl(cid, org)
+    async def run_now_internal(
+        self, crawlconfig: CrawlConfig, org: Organization, user: User
+    ) -> str:
+        """run new crawl for specified crawlconfig now"""
+        await self.org_ops.can_run_crawls(org)
 
         if await self.get_running_crawl(crawlconfig):
             raise HTTPException(status_code=400, detail="crawl_already_running")
 
         profile_filename = await self.get_profile_filename(crawlconfig.profileid, org)
+
+        storage_filename = (
+            crawlconfig.crawlFilenameTemplate or self.default_filename_template
+        )
 
         try:
             crawl_id = await self.crawl_manager.create_crawl_job(
@@ -858,7 +846,7 @@ class CrawlConfigOps:
                 org.storage,
                 userid=str(user.id),
                 warc_prefix=self.get_warc_prefix(org, crawlconfig),
-                storage_filename=self.default_filename_template,
+                storage_filename=storage_filename,
                 profile_filename=profile_filename or "",
             )
             await self.add_new_crawl(crawl_id, crawlconfig, user, manual=True)
@@ -1120,19 +1108,7 @@ def init_crawl_config_api(
         org: Organization = Depends(org_crawl_dep),
         user: User = Depends(user_dep),
     ):
-        (
-            cid,
-            new_job_name,
-            storage_quota_reached,
-            exec_mins_quota_reached,
-        ) = await ops.add_crawl_config(config, org, user)
-
-        return {
-            "added": True,
-            "id": str(cid),
-            "run_now_job": new_job_name,
-            "storageQuotaReached": storage_quota_reached,
-            "execMinutesQuotaReached": exec_mins_quota_reached,
-        }
+        return await ops.add_crawl_config(config, org, user)
 
     @router.patch(
         "/{cid}",

View File

@@ -776,7 +776,9 @@ class CrawlOps(BaseCrawlOps):
         if not crawl.cid or crawl.type != "crawl":
             raise HTTPException(status_code=400, detail="invalid_crawl_for_qa")
 
-        crawlconfig = await self.crawl_configs.prepare_for_run_crawl(crawl.cid, org)
+        await self.orgs.can_run_crawls(org)
+
+        crawlconfig = await self.crawl_configs.get_crawl_config(crawl.cid, org.id)
 
         try:
             qa_run_id = await self.crawl_manager.create_qa_crawl_job(

View File

@@ -314,7 +314,7 @@ class CrawlConfigIn(BaseModel):
     jobType: Optional[JobType] = JobType.CUSTOM
 
-    profileid: Union[UUID, EmptyStr, None]
+    profileid: Optional[UUID] = None
     crawlerChannel: str = "default"
 
     autoAddCollections: Optional[List[UUID]] = []
@@ -407,6 +407,8 @@ class CrawlConfigAdditional(BaseModel):
     isCrawlRunning: Optional[bool] = False
 
+    crawlFilenameTemplate: Optional[str] = None
+
 
 # ============================================================================
 class CrawlConfig(CrawlConfigCore, CrawlConfigAdditional):
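
With crawlFilenameTemplate persisted on CrawlConfig, run_now_internal()
can fall back to the server default when a workflow does not set one. A
small sketch of that fallback (the default value shown is an assumed
example; the real one comes from deployment configuration):

from typing import Optional

DEFAULT_FILENAME_TEMPLATE = "@ts-@id.wacz"  # assumed example value

def storage_filename(crawl_filename_template: Optional[str]) -> str:
    # mirrors: crawlconfig.crawlFilenameTemplate or self.default_filename_template
    return crawl_filename_template or DEFAULT_FILENAME_TEMPLATE

print(storage_filename(None))               # falls back to the server default
print(storage_filename("custom-@id.wacz"))  # per-workflow template wins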

View File

@@ -697,6 +697,17 @@ class OrgOps:
 
         return False
 
+    async def can_run_crawls(self, org: Organization) -> None:
+        """check crawl quotas and readOnly state, throw if can not run"""
+        if org.readOnly:
+            raise HTTPException(status_code=403, detail="org_set_to_read_only")
+
+        if await self.storage_quota_reached(org.id):
+            raise HTTPException(status_code=403, detail="storage_quota_reached")
+
+        if await self.exec_mins_quota_reached(org.id):
+            raise HTTPException(status_code=403, detail="exec_minutes_quota_reached")
+
     async def get_monthly_crawl_exec_seconds(self, oid: UUID) -> int:
         """Return monthlyExecSeconds for current month"""
         org_data = await self.orgs.find_one({"_id": oid})
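
can_run_crawls() centralizes the read-only and quota policy:
run_now_internal() and the QA-run path let the resulting 403 propagate,
while add_crawl_config() catches it and reports the state as response
flags. A condensed sketch of that catch-and-translate pattern (org_ops
and org are stand-ins for the real objects):

from fastapi import HTTPException

async def start_if_allowed(org_ops, org):
    """try to start a crawl; map quota 403s onto response flags"""
    crawl_id = None
    storage_quota_reached = False
    exec_mins_quota_reached = False
    try:
        await org_ops.can_run_crawls(org)  # raises HTTPException(403, detail=...)
        crawl_id = "crawl-job-id"          # placeholder for run_now_internal()
    except HTTPException as e:
        if e.detail == "storage_quota_reached":
            storage_quota_reached = True
        elif e.detail == "exec_minutes_quota_reached":
            exec_mins_quota_reached = True
    return crawl_id, storage_quota_reached, exec_mins_quota_reached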