Optionally delay replica deletion (#2252)

Fixes #2170

The number of days by which to delay replica file deletion is configurable
in the Helm chart with `replica_deletion_delay_days` (set by default to
7 days in `values.yaml` to encourage good practice, though we could
change this).

When `replica_deletion_delay_days` is set to an integer above 0, a delete
replica job that would otherwise be started immediately as a Kubernetes Job
is instead created as a CronJob, with a cron schedule set to run yearly
starting x days from the current moment. The operator then deletes this
CronJob after the job completes successfully. If a failed background job is
retried, it is re-run immediately as a Job rather than being scheduled out
into the future again.
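
For illustration, a minimal sketch of how that "yearly but effectively one-shot" cron schedule can be derived (mirroring `create_replica_deletion_scheduled_job` in the diff below; the standalone `delayed_cron_schedule()` helper and the `dt_now()` stub are assumptions for this sketch, not part of the actual code):

```python
from datetime import datetime, timedelta, timezone


def dt_now() -> datetime:
    """Assumed helper: current time in UTC."""
    return datetime.now(timezone.utc)


def delayed_cron_schedule(delay_days: int) -> str:
    """Build a cron expression that first fires delay_days from now.

    Pinning day-of-month and month means the next firing after that would be
    roughly a year later, by which time the operator has deleted the CronJob.
    """
    run_at = dt_now() + timedelta(days=delay_days)
    # cron fields: minute hour day-of-month month day-of-week
    return f"{run_at.minute} {run_at.hour} {run_at.day} {run_at.month} *"


print(delayed_cron_schedule(7))  # e.g. "28 14 27 12 *"
```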

---------
Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
Tessa Walsh 2024-12-19 21:50:28 -05:00 committed by GitHub
parent 2060ee78b4
commit 589819682e
8 changed files with 169 additions and 22 deletions


@ -96,6 +96,11 @@ class BackgroundJobOps:
if not res:
print("File deleted before replication job started, ignoring", flush=True)
async def handle_delete_replica_job_finished(self, job: DeleteReplicaJob) -> None:
"""After successful replica deletion, delete cronjob if scheduled"""
if job.schedule:
await self.crawl_manager.delete_replica_deletion_scheduled_job(job.id)
async def create_replica_jobs(
self, oid: UUID, file: BaseFile, object_id: str, object_type: str
) -> Dict[str, Union[bool, List[str]]]:
@ -146,7 +151,7 @@ class BackgroundJobOps:
job_type = BgJobType.CREATE_REPLICA.value
try:
job_id = await self.crawl_manager.run_replica_job(
job_id, _ = await self.crawl_manager.run_replica_job(
oid=str(org.id),
job_type=job_type,
primary_storage=file.storage,
@ -155,7 +160,7 @@ class BackgroundJobOps:
replica_storage=replica_ref,
replica_file_path=replica_file_path,
replica_endpoint=replica_endpoint,
job_id_prefix=f"{job_type}-{object_id}",
delay_days=0,
existing_job_id=existing_job_id,
)
if existing_job_id:
@ -188,9 +193,13 @@ class BackgroundJobOps:
)
return job_id
# pylint: disable=broad-exception-caught
except Exception as exc:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
print(
"warning: replica job could not be started "
+ f"for {object_type} {file}: {exc}"
)
return ""
async def create_delete_replica_jobs(
self, org: Organization, file: BaseFile, object_id: str, object_type: str
@ -214,8 +223,9 @@ class BackgroundJobOps:
object_id: str,
object_type: str,
replica_ref: StorageRef,
force_start_immediately: bool = False,
existing_job_id: Optional[str] = None,
) -> Optional[str]:
) -> str:
"""Create a job to delete one replica of a given file"""
try:
replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
@ -226,20 +236,23 @@ class BackgroundJobOps:
job_type = BgJobType.DELETE_REPLICA.value
job_id = await self.crawl_manager.run_replica_job(
delay_days = int(os.environ.get("REPLICA_DELETION_DELAY_DAYS", 0))
if force_start_immediately:
delay_days = 0
job_id, schedule = await self.crawl_manager.run_replica_job(
oid=str(org.id),
job_type=job_type,
replica_storage=replica_ref,
replica_file_path=replica_file_path,
replica_endpoint=replica_endpoint,
job_id_prefix=f"{job_type}-{object_id}",
delay_days=delay_days,
existing_job_id=existing_job_id,
)
if existing_job_id:
delete_replica_job = await self.get_background_job(
existing_job_id, org.id
)
job = await self.get_background_job(existing_job_id, org.id)
delete_replica_job = cast(DeleteReplicaJob, job)
previous_attempt = {
"started": delete_replica_job.started,
"finished": delete_replica_job.finished,
@ -251,6 +264,7 @@ class BackgroundJobOps:
delete_replica_job.started = dt_now()
delete_replica_job.finished = None
delete_replica_job.success = None
delete_replica_job.schedule = None
else:
delete_replica_job = DeleteReplicaJob(
id=job_id,
@ -260,6 +274,7 @@ class BackgroundJobOps:
object_id=object_id,
object_type=object_type,
replica_storage=replica_ref,
schedule=schedule,
)
await self.jobs.find_one_and_update(
@ -274,7 +289,7 @@ class BackgroundJobOps:
"warning: replica deletion job could not be started "
+ f"for {object_type} {file}: {exc}"
)
return None
return ""
async def create_delete_org_job(
self,
@ -387,6 +402,10 @@ class BackgroundJobOps:
if success:
if job_type == BgJobType.CREATE_REPLICA:
await self.handle_replica_job_finished(cast(CreateReplicaJob, job))
if job_type == BgJobType.DELETE_REPLICA:
await self.handle_delete_replica_job_finished(
cast(DeleteReplicaJob, job)
)
else:
print(
f"Background job {job.id} failed, sending email to superuser",
@ -560,6 +579,7 @@ class BackgroundJobOps:
job.object_id,
job.object_type,
job.replica_storage,
force_start_immediately=True,
existing_job_id=job_id,
)


@ -3,7 +3,7 @@
import os
import secrets
from typing import Optional, Dict
from typing import Optional, Dict, Tuple
from datetime import timedelta
from fastapi import HTTPException
@ -72,24 +72,21 @@ class CrawlManager(K8sAPI):
replica_storage: StorageRef,
replica_file_path: str,
replica_endpoint: str,
delay_days: int = 0,
primary_storage: Optional[StorageRef] = None,
primary_file_path: Optional[str] = None,
primary_endpoint: Optional[str] = None,
job_id_prefix: Optional[str] = None,
existing_job_id: Optional[str] = None,
):
) -> Tuple[str, Optional[str]]:
"""run job to replicate file from primary storage to replica storage"""
if existing_job_id:
job_id = existing_job_id
else:
if not job_id_prefix:
job_id_prefix = job_type
# Keep name shorter than in past to avoid k8s issues with length
job_id = f"{job_type}-{secrets.token_hex(5)}"
# ensure name is <=63 characters
job_id = f"{job_id_prefix[:52]}-{secrets.token_hex(5)}"
params = {
params: Dict[str, object] = {
"id": job_id,
"oid": oid,
"job_type": job_type,
@ -106,11 +103,17 @@ class CrawlManager(K8sAPI):
"BgJobType": BgJobType,
}
if job_type == BgJobType.DELETE_REPLICA.value and delay_days > 0:
# If replica deletion delay is configured, schedule as cronjob
return await self.create_replica_deletion_scheduled_job(
job_id, params, delay_days
)
data = self.templates.env.get_template("replica_job.yaml").render(params)
await self.create_from_yaml(data)
return job_id
return job_id, None
async def run_delete_org_job(
self,
@ -393,3 +396,37 @@ class CrawlManager(K8sAPI):
await self.create_from_yaml(data, self.namespace)
return cron_job_id
async def create_replica_deletion_scheduled_job(
self,
job_id: str,
params: Dict[str, object],
delay_days: int,
) -> Tuple[str, Optional[str]]:
"""create scheduled job to delay replica file in x days"""
now = dt_now()
run_at = now + timedelta(days=delay_days)
schedule = f"{run_at.minute} {run_at.hour} {run_at.day} {run_at.month} *"
params["schedule"] = schedule
print(f"Replica deletion cron schedule: '{schedule}'", flush=True)
data = self.templates.env.get_template("replica_deletion_cron_job.yaml").render(
params
)
await self.create_from_yaml(data, self.namespace)
return job_id, schedule
async def delete_replica_deletion_scheduled_job(self, job_id: str):
"""delete scheduled job to delay replica file in x days"""
cron_job = await self.batch_api.read_namespaced_cron_job(
name=job_id,
namespace=self.namespace,
)
if cron_job:
await self.batch_api.delete_namespaced_cron_job(
name=cron_job.metadata.name, namespace=self.namespace
)


@ -2058,6 +2058,7 @@ class DeleteReplicaJob(BackgroundJob):
object_type: str
object_id: str
replica_storage: StorageRef
schedule: Optional[str] = None
# ============================================================================


@ -35,7 +35,7 @@ class BgJobOperator(BaseOperator):
labels: dict[str, str] = metadata.get("labels", {})
oid: str = labels.get("btrix.org") or ""
job_type: str = labels.get("job_type") or ""
job_id: str = metadata.get("name")
job_id: str = labels.get("job_id") or metadata.get("name")
status = data.object["status"]
success = status.get("succeeded") == 1


@ -0,0 +1,81 @@
apiVersion: batch/v1
kind: CronJob
metadata:
  name: "{{ id }}"
  labels:
    role: "cron-background-job"
    job_type: {{ job_type }}
    btrix.org: {{ oid }}
spec:
  concurrencyPolicy: Forbid
  successfulJobsHistoryLimit: 0
  failedJobsHistoryLimit: 2
  schedule: "{{ schedule }}"
  jobTemplate:
    metadata:
      labels:
        role: "background-job"
        job_type: {{ job_type }}
        job_id: {{ id }}
        btrix.org: {{ oid }}
    spec:
      template:
        spec:
          restartPolicy: Never
          priorityClassName: bg-job
          podFailurePolicy:
            rules:
              - action: FailJob
                onExitCodes:
                  containerName: rclone
                  operator: NotIn
                  values: [0]
          containers:
            - name: rclone
              image: rclone/rclone:latest
              env:
                - name: RCLONE_CONFIG_REPLICA_TYPE
                  value: "s3"
                - name: RCLONE_CONFIG_REPLICA_ACCESS_KEY_ID
                  valueFrom:
                    secretKeyRef:
                      name: "{{ replica_secret_name }}"
                      key: STORE_ACCESS_KEY
                - name: RCLONE_CONFIG_REPLICA_SECRET_ACCESS_KEY
                  valueFrom:
                    secretKeyRef:
                      name: "{{ replica_secret_name }}"
                      key: STORE_SECRET_KEY
                - name: RCLONE_CONFIG_REPLICA_REGION
                  valueFrom:
                    secretKeyRef:
                      name: "{{ replica_secret_name }}"
                      key: STORE_REGION
                - name: RCLONE_CONFIG_REPLICA_PROVIDER
                  valueFrom:
                    secretKeyRef:
                      name: "{{ replica_secret_name }}"
                      key: STORE_S3_PROVIDER
                - name: RCLONE_CONFIG_REPLICA_ENDPOINT
                  value: "{{ replica_endpoint }}"
              command: ["rclone", "-vv", "delete", "replica:{{ replica_file_path }}"]
              resources:
                limits:
                  memory: "200Mi"
                requests:
                  memory: "200Mi"
                  cpu: "50m"


@ -85,6 +85,8 @@ data:
LOCALES_ENABLED: "{{ .Values.locales_enabled }}"
REPLICA_DELETION_DELAY_DAYS: "{{ .Values.replica_deletion_delay_days | default 0 }}"
---
apiVersion: v1


@ -96,6 +96,10 @@ superuser:
# Set name for default organization created with superuser
default_org: "My Organization"
# Set number of days replica file deletion should be delayed by
# if set >0, will keep replicas (if any) for this number of days
replica_deletion_delay_days: 0
# API Image
# =========================================


@ -133,6 +133,8 @@ storages:
access_endpoint_url: "https://my-custom-domain.example.com/path/"
```
When replica locations are set, the default behavior when a crawl, upload, or browser profile is deleted is that the replica files are deleted at the same time as the file in primary storage. To delay deletion of replicas, set `replica_deletion_delay_days` in the Helm chart to the number of days by which to delay replica file deletion. This gives Browsertrix administrators time to recover copies from the configured replica locations if files are deleted accidentally or maliciously.
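For example, a `values.yaml` override that keeps replicas for one week after deletion (the value shown is illustrative; `0`, the default, deletes replicas immediately):

```yaml
# keep replica files for 7 days after the primary file is deleted
replica_deletion_delay_days: 7
```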
## Horizontal Autoscaling
Browsertrix also includes support for horizontal auto-scaling for both the backend and frontend pods.