Implement retry API endpoint for failed background jobs (#1356)
Fixes #1328

- Adds a /retry endpoint for retrying failed background jobs.
- Returns a 400 error if the previous job is still running or has already succeeded.
- Keeps track of previous failed attempts in a previousAttempts array on the failed job.
- Also amends the similar webhook /retry endpoint to use `POST` for consistency.
- Removes a duplicate API tag for backgroundjobs.
parent 82a5d1e4e4
commit 1afc411114
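As a usage sketch of the new endpoint (the `/orgs/{oid}/jobs` mount point, host, and credentials below are assumptions modeled on the routes and tests in this diff, not confirmed by it):

```python
import requests

API_PREFIX = "http://localhost:30870/api"  # assumed local backend URL
ORG_ID = "<org-uuid>"  # placeholder
JOB_ID = "<failed-job-id>"  # placeholder
HEADERS = {"Authorization": "Bearer <token>"}  # admin auth assumed

# Retry a failed background job; assumes the backgroundjobs router is
# mounted under /orgs/{oid}/jobs, matching the /{job_id}/retry route below.
r = requests.post(f"{API_PREFIX}/orgs/{ORG_ID}/jobs/{JOB_ID}/retry", headers=HEADERS)

if r.status_code == 200:
    assert r.json() == {"success": True}
else:
    # 404 "job_not_found"; 400 "job_not_finished" while the previous job is
    # still running, or "job_already_succeeded" if it has already succeeded
    print(r.status_code, r.json()["detail"])
```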
@@ -20,6 +20,7 @@ from .models import (
     DeleteReplicaJob,
     PaginatedResponse,
     AnyJob,
+    StorageRef,
 )
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@@ -90,9 +91,8 @@ class BackgroundJobOps:

     async def create_replica_jobs(
         self, oid: UUID, file: BaseFile, object_id: str, object_type: str
-    ) -> Dict:
-        """Create k8s background job to replicate a file to another storage location."""
-
+    ) -> Dict[str, Union[bool, List[str]]]:
+        """Create k8s background job to replicate a file to all replica storage locations."""
         org = await self.org_ops.get_org_by_id(oid)

         primary_storage = self.storage_ops.get_org_storage_by_ref(org, file.storage)
@@ -105,23 +105,42 @@ class BackgroundJobOps:
         ids = []

         for replica_ref in self.storage_ops.get_org_replicas_storage_refs(org):
-            replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
-            replica_endpoint, bucket_suffix = self.strip_bucket(
-                replica_storage.endpoint_url
+            job_id = await self.create_replica_job(
+                org,
+                file,
+                object_id,
+                object_type,
+                replica_ref,
+                primary_file_path,
+                primary_endpoint,
             )
-            replica_file_path = bucket_suffix + file.filename
+            ids.append(job_id)

-            # print(f"primary: {file.storage.get_storage_secret_name(str(oid))}")
-            # print(f"  endpoint: {primary_endpoint}")
-            # print(f"  path: {primary_file_path}")
-            # print(f"replica: {replica_ref.get_storage_secret_name(str(oid))}")
-            # print(f"  endpoint: {replica_endpoint}")
-            # print(f"  path: {replica_file_path}")
+        return {"added": True, "ids": ids}

-            job_type = BgJobType.CREATE_REPLICA.value
+    async def create_replica_job(
+        self,
+        org: Organization,
+        file: BaseFile,
+        object_id: str,
+        object_type: str,
+        replica_ref: StorageRef,
+        primary_file_path: str,
+        primary_endpoint: str,
+        existing_job_id: Optional[str] = None,
+    ) -> str:
+        """Create k8s background job to replicate a file to a specific replica storage location."""
+        replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
+        replica_endpoint, bucket_suffix = self.strip_bucket(
+            replica_storage.endpoint_url
+        )
+        replica_file_path = bucket_suffix + file.filename
+
+        job_type = BgJobType.CREATE_REPLICA.value
+
         try:
             job_id = await self.crawl_manager.run_replica_job(
-                oid=str(oid),
+                oid=str(org.id),
                 job_type=job_type,
                 primary_storage=file.storage,
                 primary_file_path=primary_file_path,
@@ -130,74 +149,122 @@ class BackgroundJobOps:
                 replica_file_path=replica_file_path,
                 replica_endpoint=replica_endpoint,
                 job_id_prefix=f"{job_type}-{object_id}",
+                existing_job_id=existing_job_id,
             )
-            replication_job = CreateReplicaJob(
-                id=job_id,
-                oid=oid,
-                started=datetime.now(),
-                file_path=file.filename,
-                object_type=object_type,
-                object_id=object_id,
-                primary=file.storage,
-                replica_storage=replica_ref,
-            )
-            # print(
-            #     f"File path written into replication_job: {file.filename}", flush=True
-            # )
+            if existing_job_id:
+                replication_job = await self.get_background_job(existing_job_id, org.id)
+                previous_attempt = {
+                    "started": replication_job.started,
+                    "finished": replication_job.finished,
+                }
+                if replication_job.previousAttempts:
+                    replication_job.previousAttempts.append(previous_attempt)
+                else:
+                    replication_job.previousAttempts = [previous_attempt]
+                replication_job.started = datetime.now()
+                replication_job.finished = None
+                replication_job.success = None
+            else:
+                replication_job = CreateReplicaJob(
+                    id=job_id,
+                    oid=org.id,
+                    started=datetime.now(),
+                    file_path=file.filename,
+                    object_type=object_type,
+                    object_id=object_id,
+                    primary=file.storage,
+                    replica_storage=replica_ref,
+                )

             await self.jobs.find_one_and_update(
                 {"_id": job_id}, {"$set": replication_job.to_dict()}, upsert=True
             )
-            ids.append(job_id)

-        return {"added": True, "ids": ids}
+            return job_id
         except Exception as exc:
             # pylint: disable=raise-missing-from
             raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")

     async def create_delete_replica_jobs(
         self, org: Organization, file: BaseFile, object_id: str, object_type: str
     ) -> Dict[str, Union[bool, List[str]]]:
         """Create a job to delete each replica for the given file"""

         ids = []
-        oid = str(org.id)

         for replica_ref in file.replicas or []:
-            replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
-            replica_endpoint, bucket_suffix = self.strip_bucket(
-                replica_storage.endpoint_url
+            job_id = await self.create_delete_replica_job(
+                org, file, object_id, object_type, replica_ref
             )
-            replica_file_path = bucket_suffix + file.filename
+            ids.append(job_id)

-            # print(f"replica: {replica_ref.get_storage_secret_name(oid)}")
-            # print(f"  endpoint: {replica_endpoint}")
-            # print(f"  path: {replica_file_path}")
+        return {"added": True, "ids": ids}

-            job_type = BgJobType.DELETE_REPLICA.value
+    async def create_delete_replica_job(
+        self,
+        org: Organization,
+        file: BaseFile,
+        object_id: str,
+        object_type: str,
+        replica_ref: StorageRef,
+        existing_job_id: Optional[str] = None,
+    ) -> str:
+        """Create a job to delete one replica of a given file"""
+        replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
+        replica_endpoint, bucket_suffix = self.strip_bucket(
+            replica_storage.endpoint_url
+        )
+        replica_file_path = bucket_suffix + file.filename
+
+        job_type = BgJobType.DELETE_REPLICA.value
+
         try:
             job_id = await self.crawl_manager.run_replica_job(
-                oid=oid,
+                oid=str(org.id),
                 job_type=job_type,
                 replica_storage=replica_ref,
                 replica_file_path=replica_file_path,
                 replica_endpoint=replica_endpoint,
                 job_id_prefix=f"{job_type}-{object_id}",
+                existing_job_id=existing_job_id,
             )

-            delete_replica_job = DeleteReplicaJob(
-                id=job_id,
-                oid=oid,
-                started=datetime.now(),
-                file_path=file.filename,
-                object_id=object_id,
-                object_type=object_type,
-                replica_storage=replica_ref,
-            )
+            if existing_job_id:
+                delete_replica_job = await self.get_background_job(
+                    existing_job_id, org.id
+                )
+                previous_attempt = {
+                    "started": delete_replica_job.started,
+                    "finished": delete_replica_job.finished,
+                }
+                if delete_replica_job.previousAttempts:
+                    delete_replica_job.previousAttempts.append(previous_attempt)
+                else:
+                    delete_replica_job.previousAttempts = [previous_attempt]
+                delete_replica_job.started = datetime.now()
+                delete_replica_job.finished = None
+                delete_replica_job.success = None
+            else:
+                delete_replica_job = DeleteReplicaJob(
+                    id=job_id,
+                    oid=org.id,
+                    started=datetime.now(),
+                    file_path=file.filename,
+                    object_id=object_id,
+                    object_type=object_type,
+                    replica_storage=replica_ref,
+                )

             await self.jobs.find_one_and_update(
                 {"_id": job_id}, {"$set": delete_replica_job.to_dict()}, upsert=True
             )

-            ids.append(job_id)
+            return job_id

-        return {"added": True, "ids": ids}
         except Exception as exc:
             # pylint: disable=raise-missing-from
             raise HTTPException(
                 status_code=400, detail=f"Error starting background job: {exc}"
             )

     async def job_finished(
         self,
@@ -241,7 +308,9 @@ class BackgroundJobOps:
             {"$set": {"success": success, "finished": finished}},
         )

-    async def get_background_job(self, job_id: str, oid: UUID) -> BackgroundJob:
+    async def get_background_job(
+        self, job_id: str, oid: UUID
+    ) -> Union[CreateReplicaJob, DeleteReplicaJob]:
         """Get background job"""
         query: dict[str, object] = {"_id": job_id, "oid": oid}
         res = await self.jobs.find_one(query)
@@ -255,10 +324,9 @@ class BackgroundJobOps:
         if data["type"] == BgJobType.CREATE_REPLICA:
             return CreateReplicaJob.from_dict(data)

-        if data["type"] == BgJobType.DELETE_REPLICA:
-            return DeleteReplicaJob.from_dict(data)
+        return DeleteReplicaJob.from_dict(data)

-        return BackgroundJob.from_dict(data)
+        # return BackgroundJob.from_dict(data)

     async def list_background_jobs(
         self,
@@ -324,6 +392,69 @@ class BackgroundJobOps:

         return jobs, total

+    async def get_replica_job_file(
+        self, job: Union[CreateReplicaJob, DeleteReplicaJob], org: Organization
+    ) -> BaseFile:
+        """Return file from replica job"""
+        try:
+            if job.object_type == "profile":
+                profile = await self.profile_ops.get_profile(UUID(job.object_id), org)
+                return BaseFile(**profile.resource.dict())
+
+            item_res = await self.base_crawl_ops.get_crawl_raw(job.object_id, org)
+            matching_file = [
+                f for f in item_res.get("files", []) if f["filename"] == job.file_path
+            ][0]
+            return BaseFile(**matching_file)
+        # pylint: disable=broad-exception-caught, raise-missing-from
+        except Exception:
+            raise HTTPException(status_code=404, detail="file_not_found")
+
+    async def retry_background_job(
+        self, job_id: str, org: Organization
+    ) -> Dict[str, Union[bool, Optional[str]]]:
+        """Retry background job and return new job id"""
+        job = await self.get_background_job(job_id, org.id)
+        if not job:
+            raise HTTPException(status_code=404, detail="job_not_found")
+
+        if not job.finished:
+            raise HTTPException(status_code=400, detail="job_not_finished")
+
+        if job.success:
+            raise HTTPException(status_code=400, detail="job_already_succeeded")
+
+        file = await self.get_replica_job_file(job, org)
+
+        if job.type == BgJobType.CREATE_REPLICA:
+            primary_storage = self.storage_ops.get_org_storage_by_ref(org, file.storage)
+            primary_endpoint, bucket_suffix = self.strip_bucket(
+                primary_storage.endpoint_url
+            )
+            primary_file_path = bucket_suffix + file.filename
+            await self.create_replica_job(
+                org,
+                file,
+                job.object_id,
+                job.object_type,
+                job.replica_storage,
+                primary_file_path,
+                primary_endpoint,
+                existing_job_id=job_id,
+            )
+
+        if job.type == BgJobType.DELETE_REPLICA:
+            await self.create_delete_replica_job(
+                org,
+                file,
+                job.object_id,
+                job.object_type,
+                job.replica_storage,
+                existing_job_id=job_id,
+            )
+
+        return {"success": True}
+
+
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
@@ -344,7 +475,6 @@ def init_background_jobs_api(

     @router.get(
         "/{job_id}",
-        tags=["backgroundjobs"],
         response_model=AnyJob,
     )
     async def get_background_job(
@@ -354,7 +484,17 @@ def init_background_jobs_api(
         """Retrieve information for background job"""
         return await ops.get_background_job(job_id, org.id)

-    @router.get("", tags=["backgroundjobs"], response_model=PaginatedResponse)
+    @router.post(
+        "/{job_id}/retry",
+    )
+    async def retry_background_job(
+        job_id: str,
+        org: Organization = Depends(org_crawl_dep),
+    ):
+        """Retry background job"""
+        return await ops.retry_background_job(job_id, org)
+
+    @router.get("", response_model=PaginatedResponse)
     async def list_background_jobs(
         org: Organization = Depends(org_crawl_dep),
         pageSize: int = DEFAULT_PAGE_SIZE,
@@ -74,14 +74,18 @@ class CrawlManager(K8sAPI):
         primary_file_path: Optional[str] = None,
         primary_endpoint: Optional[str] = None,
         job_id_prefix: Optional[str] = None,
+        existing_job_id: Optional[str] = None,
     ):
         """run job to replicate file from primary storage to replica storage"""

-        if not job_id_prefix:
-            job_id_prefix = job_type
+        if existing_job_id:
+            job_id = existing_job_id
+        else:
+            if not job_id_prefix:
+                job_id_prefix = job_type

-        # ensure name is <=63 characters
-        job_id = f"{job_id_prefix[:52]}-{secrets.token_hex(5)}"
+            # ensure name is <=63 characters
+            job_id = f"{job_id_prefix[:52]}-{secrets.token_hex(5)}"

         params = {
             "id": job_id,
@@ -1237,6 +1237,8 @@ class BackgroundJob(BaseMongoModel):
     started: datetime
     finished: Optional[datetime] = None

+    previousAttempts: Optional[List[Dict[str, Optional[datetime]]]] = None
+

 # ============================================================================
 class CreateReplicaJob(BackgroundJob):
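The previousAttempts entries added above are written by create_replica_job and create_delete_replica_job before they reset started, finished, and success for the retry. A rough sketch of a retried job document (all values illustrative; the "create-replica" string assumes BgJobType.CREATE_REPLICA's value):

```python
from datetime import datetime

# Hypothetical MongoDB job document after one failed run and one retry.
job_doc = {
    "_id": "create-replica-profile-1234-abcdef0123",  # made-up job id
    "type": "create-replica",  # assumed BgJobType.CREATE_REPLICA value
    "started": datetime(2023, 11, 20, 12, 30),  # current (retried) attempt
    "finished": None,  # cleared until the retry completes
    "success": None,   # cleared as well
    "previousAttempts": [
        # timing of the original failed run, appended on retry
        {
            "started": datetime(2023, 11, 20, 11, 0),
            "finished": datetime(2023, 11, 20, 11, 2),
        }
    ],
}
```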
@@ -400,7 +400,7 @@ def init_event_webhooks_api(mdb, org_ops, app):
     ):
         return await ops.get_notification(org, notificationid)

-    @router.get("/{notificationid}/retry")
+    @router.post("/{notificationid}/retry")
     async def retry_notification(
         notificationid: UUID,
         org: Organization = Depends(org_owner_dep),
@@ -99,7 +99,7 @@ def test_get_webhook_event(admin_auth_headers, default_org_id):

 def test_retry_webhook_event(admin_auth_headers, default_org_id):
     # Expect to fail because we haven't set up URLs that accept webhooks
-    r = requests.get(
+    r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}/retry",
         headers=admin_auth_headers,
     )