Optionally delay replica deletion (#2252)

Fixes #2170

The number of days by which to delay replica file deletion is configurable
in the Helm chart with `replica_deletion_delay_days` (set in `values.yaml`;
the chart default is 0, i.e. no delay unless an administrator opts in).
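
In the chart, this value is rendered into the backend ConfigMap as the
`REPLICA_DELETION_DELAY_DAYS` environment variable (see the configmap diff
below), and the background-job code reads it with a fallback of 0 when unset.
A minimal standalone sketch of that lookup (the variable name comes from this
PR; the snippet itself is illustrative, not the full `create_delete_replica_job` code):

```python
import os

# 0 (no delay) unless the chart sets replica_deletion_delay_days > 0
delay_days = int(os.environ.get("REPLICA_DELETION_DELAY_DAYS", 0))
```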

When `replica_deletion_delay_days` is set to an integer greater than 0, a
replica deletion job that would otherwise be started immediately as a
Kubernetes Job is instead created as a CronJob, with a cron schedule that
fires yearly, starting the configured number of days from the current moment.
The operator then deletes this CronJob after the job completes successfully.
If a failed background job is retried, it is re-run immediately as a Job
rather than being scheduled out into the future again.
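
Concretely, the delay is turned into a one-shot cron expression: the target
time is the current time plus the delay, and its minute, hour, day-of-month,
and month become the schedule fields, with day-of-week left as `*`, so the
expression matches only one calendar date per year. A minimal sketch of that
computation (the standalone `replica_deletion_schedule` helper below is
hypothetical; in this PR the logic lives in
`CrawlManager.create_replica_deletion_scheduled_job`):

```python
from datetime import datetime, timedelta, timezone

def replica_deletion_schedule(delay_days: int) -> str:
    """Build a cron expression that first fires delay_days from now.

    Day-of-week is left as '*', so the expression matches one calendar
    date per year; since the operator deletes the CronJob after its first
    successful run, it never actually fires a second time.
    """
    run_at = datetime.now(timezone.utc) + timedelta(days=delay_days)
    return f"{run_at.minute} {run_at.hour} {run_at.day} {run_at.month} *"

# e.g. if called at 2024-12-19 21:50 UTC:
# replica_deletion_schedule(7) -> "50 21 26 12 *"
```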

---------
Co-authored-by: Ilya Kreymer <ikreymer@users.noreply.github.com>
Tessa Walsh committed 2024-12-19 21:50:28 -05:00 · commit 589819682e · parent 2060ee78b4
8 changed files with 169 additions and 22 deletions


@@ -96,6 +96,11 @@ class BackgroundJobOps:
         if not res:
             print("File deleted before replication job started, ignoring", flush=True)

+    async def handle_delete_replica_job_finished(self, job: DeleteReplicaJob) -> None:
+        """After successful replica deletion, delete cronjob if scheduled"""
+        if job.schedule:
+            await self.crawl_manager.delete_replica_deletion_scheduled_job(job.id)
+
     async def create_replica_jobs(
         self, oid: UUID, file: BaseFile, object_id: str, object_type: str
     ) -> Dict[str, Union[bool, List[str]]]:
@@ -146,7 +151,7 @@ class BackgroundJobOps:
         job_type = BgJobType.CREATE_REPLICA.value
         try:
-            job_id = await self.crawl_manager.run_replica_job(
+            job_id, _ = await self.crawl_manager.run_replica_job(
                 oid=str(org.id),
                 job_type=job_type,
                 primary_storage=file.storage,
@@ -155,7 +160,7 @@ class BackgroundJobOps:
                 replica_storage=replica_ref,
                 replica_file_path=replica_file_path,
                 replica_endpoint=replica_endpoint,
-                job_id_prefix=f"{job_type}-{object_id}",
+                delay_days=0,
                 existing_job_id=existing_job_id,
             )
             if existing_job_id:
@@ -188,9 +193,13 @@ class BackgroundJobOps:
             )
             return job_id

+        # pylint: disable=broad-exception-caught
         except Exception as exc:
-            # pylint: disable=raise-missing-from
-            raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
+            print(
+                "warning: replica job could not be started "
+                + f"for {object_type} {file}: {exc}"
+            )
+            return ""

     async def create_delete_replica_jobs(
         self, org: Organization, file: BaseFile, object_id: str, object_type: str
@@ -214,8 +223,9 @@ class BackgroundJobOps:
         object_id: str,
         object_type: str,
         replica_ref: StorageRef,
+        force_start_immediately: bool = False,
         existing_job_id: Optional[str] = None,
-    ) -> Optional[str]:
+    ) -> str:
         """Create a job to delete one replica of a given file"""
         try:
             replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
@@ -226,20 +236,23 @@ class BackgroundJobOps:
             job_type = BgJobType.DELETE_REPLICA.value

-            job_id = await self.crawl_manager.run_replica_job(
+            delay_days = int(os.environ.get("REPLICA_DELETION_DELAY_DAYS", 0))
+            if force_start_immediately:
+                delay_days = 0
+
+            job_id, schedule = await self.crawl_manager.run_replica_job(
                 oid=str(org.id),
                 job_type=job_type,
                 replica_storage=replica_ref,
                 replica_file_path=replica_file_path,
                 replica_endpoint=replica_endpoint,
-                job_id_prefix=f"{job_type}-{object_id}",
+                delay_days=delay_days,
                 existing_job_id=existing_job_id,
             )
             if existing_job_id:
-                delete_replica_job = await self.get_background_job(
-                    existing_job_id, org.id
-                )
+                job = await self.get_background_job(existing_job_id, org.id)
+                delete_replica_job = cast(DeleteReplicaJob, job)
                 previous_attempt = {
                     "started": delete_replica_job.started,
                     "finished": delete_replica_job.finished,
@@ -251,6 +264,7 @@ class BackgroundJobOps:
                 delete_replica_job.started = dt_now()
                 delete_replica_job.finished = None
                 delete_replica_job.success = None
+                delete_replica_job.schedule = None
             else:
                 delete_replica_job = DeleteReplicaJob(
                     id=job_id,
@@ -260,6 +274,7 @@ class BackgroundJobOps:
                     object_id=object_id,
                     object_type=object_type,
                     replica_storage=replica_ref,
+                    schedule=schedule,
                 )

             await self.jobs.find_one_and_update(
@@ -274,7 +289,7 @@ class BackgroundJobOps:
                 "warning: replica deletion job could not be started "
                 + f"for {object_type} {file}: {exc}"
             )
-            return None
+            return ""

     async def create_delete_org_job(
         self,
@@ -387,6 +402,10 @@ class BackgroundJobOps:
         if success:
             if job_type == BgJobType.CREATE_REPLICA:
                 await self.handle_replica_job_finished(cast(CreateReplicaJob, job))
+            if job_type == BgJobType.DELETE_REPLICA:
+                await self.handle_delete_replica_job_finished(
+                    cast(DeleteReplicaJob, job)
+                )
         else:
             print(
                 f"Background job {job.id} failed, sending email to superuser",
@@ -560,6 +579,7 @@ class BackgroundJobOps:
                 job.object_id,
                 job.object_type,
                 job.replica_storage,
+                force_start_immediately=True,
                 existing_job_id=job_id,
             )


@@ -3,7 +3,7 @@
 import os
 import secrets

-from typing import Optional, Dict
+from typing import Optional, Dict, Tuple
 from datetime import timedelta

 from fastapi import HTTPException
@@ -72,24 +72,21 @@ class CrawlManager(K8sAPI):
         replica_storage: StorageRef,
         replica_file_path: str,
         replica_endpoint: str,
+        delay_days: int = 0,
         primary_storage: Optional[StorageRef] = None,
         primary_file_path: Optional[str] = None,
         primary_endpoint: Optional[str] = None,
-        job_id_prefix: Optional[str] = None,
         existing_job_id: Optional[str] = None,
-    ):
+    ) -> Tuple[str, Optional[str]]:
         """run job to replicate file from primary storage to replica storage"""

         if existing_job_id:
             job_id = existing_job_id
         else:
-            if not job_id_prefix:
-                job_id_prefix = job_type
-
-            # ensure name is <=63 characters
-            job_id = f"{job_id_prefix[:52]}-{secrets.token_hex(5)}"
+            # Keep name shorter than in past to avoid k8s issues with length
+            job_id = f"{job_type}-{secrets.token_hex(5)}"

-        params = {
+        params: Dict[str, object] = {
             "id": job_id,
             "oid": oid,
             "job_type": job_type,
@@ -106,11 +103,17 @@ class CrawlManager(K8sAPI):
             "BgJobType": BgJobType,
         }

+        if job_type == BgJobType.DELETE_REPLICA.value and delay_days > 0:
+            # If replica deletion delay is configured, schedule as cronjob
+            return await self.create_replica_deletion_scheduled_job(
+                job_id, params, delay_days
+            )
+
         data = self.templates.env.get_template("replica_job.yaml").render(params)

         await self.create_from_yaml(data)

-        return job_id
+        return job_id, None

     async def run_delete_org_job(
         self,
@@ -393,3 +396,37 @@ class CrawlManager(K8sAPI):
         await self.create_from_yaml(data, self.namespace)

         return cron_job_id
+
+    async def create_replica_deletion_scheduled_job(
+        self,
+        job_id: str,
+        params: Dict[str, object],
+        delay_days: int,
+    ) -> Tuple[str, Optional[str]]:
+        """create scheduled job to delay replica file in x days"""
+        now = dt_now()
+        run_at = now + timedelta(days=delay_days)
+        schedule = f"{run_at.minute} {run_at.hour} {run_at.day} {run_at.month} *"
+
+        params["schedule"] = schedule
+
+        print(f"Replica deletion cron schedule: '{schedule}'", flush=True)
+
+        data = self.templates.env.get_template("replica_deletion_cron_job.yaml").render(
+            params
+        )
+
+        await self.create_from_yaml(data, self.namespace)
+
+        return job_id, schedule
+
+    async def delete_replica_deletion_scheduled_job(self, job_id: str):
+        """delete scheduled job to delay replica file in x days"""
+        cron_job = await self.batch_api.read_namespaced_cron_job(
+            name=job_id,
+            namespace=self.namespace,
+        )
+        if cron_job:
+            await self.batch_api.delete_namespaced_cron_job(
+                name=cron_job.metadata.name, namespace=self.namespace
+            )


@@ -2058,6 +2058,7 @@ class DeleteReplicaJob(BackgroundJob):
     object_type: str
     object_id: str
     replica_storage: StorageRef
+    schedule: Optional[str] = None


 # ============================================================================


@@ -35,7 +35,7 @@ class BgJobOperator(BaseOperator):
         labels: dict[str, str] = metadata.get("labels", {})
         oid: str = labels.get("btrix.org") or ""
         job_type: str = labels.get("job_type") or ""
-        job_id: str = metadata.get("name")
+        job_id: str = labels.get("job_id") or metadata.get("name")

         status = data.object["status"]
         success = status.get("succeeded") == 1


@@ -0,0 +1,81 @@
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: "{{ id }}"
+  labels:
+    role: "cron-background-job"
+    job_type: {{ job_type }}
+    btrix.org: {{ oid }}
+
+spec:
+  concurrencyPolicy: Forbid
+  successfulJobsHistoryLimit: 0
+  failedJobsHistoryLimit: 2
+
+  schedule: "{{ schedule }}"
+
+  jobTemplate:
+    metadata:
+      labels:
+        role: "background-job"
+        job_type: {{ job_type }}
+        job_id: {{ id }}
+        btrix.org: {{ oid }}
+
+    spec:
+      template:
+        spec:
+          restartPolicy: Never
+          priorityClassName: bg-job
+          podFailurePolicy:
+            rules:
+              - action: FailJob
+                onExitCodes:
+                  containerName: rclone
+                  operator: NotIn
+                  values: [0]
+
+          containers:
+            - name: rclone
+              image: rclone/rclone:latest
+              env:
+                - name: RCLONE_CONFIG_REPLICA_TYPE
+                  value: "s3"
+
+                - name: RCLONE_CONFIG_REPLICA_ACCESS_KEY_ID
+                  valueFrom:
+                    secretKeyRef:
+                      name: "{{ replica_secret_name }}"
+                      key: STORE_ACCESS_KEY
+
+                - name: RCLONE_CONFIG_REPLICA_SECRET_ACCESS_KEY
+                  valueFrom:
+                    secretKeyRef:
+                      name: "{{ replica_secret_name }}"
+                      key: STORE_SECRET_KEY
+
+                - name: RCLONE_CONFIG_REPLICA_REGION
+                  valueFrom:
+                    secretKeyRef:
+                      name: "{{ replica_secret_name }}"
+                      key: STORE_REGION
+
+                - name: RCLONE_CONFIG_REPLICA_PROVIDER
+                  valueFrom:
+                    secretKeyRef:
+                      name: "{{ replica_secret_name }}"
+                      key: STORE_S3_PROVIDER
+
+                - name: RCLONE_CONFIG_REPLICA_ENDPOINT
+                  value: "{{ replica_endpoint }}"
+
+              command: ["rclone", "-vv", "delete", "replica:{{ replica_file_path }}"]
+
+              resources:
+                limits:
+                  memory: "200Mi"
+
+                requests:
+                  memory: "200Mi"
+                  cpu: "50m"


@@ -85,6 +85,8 @@ data:
   LOCALES_ENABLED: "{{ .Values.locales_enabled }}"

+  REPLICA_DELETION_DELAY_DAYS: "{{ .Values.replica_deletion_delay_days | default 0 }}"
+
 ---
 apiVersion: v1


@@ -96,6 +96,10 @@ superuser:
 # Set name for default organization created with superuser
 default_org: "My Organization"

+# Set number of days replica file deletion should be delayed by
+# if set >0, will keep replicas (if any) for this number of days
+replica_deletion_delay_days: 0
+
 # API Image
 # =========================================


@@ -133,6 +133,8 @@ storages:
     access_endpoint_url: "https://my-custom-domain.example.com/path/"
 ```

+When replica locations are set, the default behavior when a crawl, upload, or browser profile is deleted is that the replica files are deleted at the same time as the file in primary storage. To delay deletion of replicas, set `replica_deletion_delay_days` in the Helm chart to the number of days by which to delay replica file deletion. This feature gives Browsertrix administrators time in the event of files being deleted accidentally or maliciously to recover copies from configured replica locations.
+
 ## Horizontal Autoscaling

 Browsertrix also includes support for horizontal auto-scaling for both the backend and frontend pods.