Backend: Crawls with Multiple WACZ files + Profile + Misc Fixes (#232)

* backend: k8s:
- support crawls with multiple WACZ files; don't assume a crawl is complete after the first WACZ is uploaded
- if a crawl is running and already has a WACZ file, still show it as running
- allow configuring the node selector for main pods (e.g. nodeType=main) and for crawler pods (e.g. nodeType=crawling)
- profiles: support uploading to alternate storage when the 'shared_profile_storage' value is set
- misc fixes for profiles

* backend: ensure the docker run_profile API matches k8s
k8s chart: don't delete PVCs and PVs in the helm chart

* dependency: bump authsign to 0.4.0
docker: disable public redis port

* profiles: fix profile path and profile browser return value

* fix typo in presigned URL caching
Ilya Kreymer 2022-05-19 18:40:41 -07:00 committed by GitHub
parent cdefb8d06e
commit 3df310ee4f
19 changed files with 154 additions and 40 deletions

View File

@ -44,6 +44,7 @@ class S3Storage(BaseModel):
access_key: str
secret_key: str
access_endpoint_url: Optional[str]
region: Optional[str] = ""
# ============================================================================

View File

@ -248,8 +248,12 @@ class CrawlConfigOps:
crawlconfig = CrawlConfig.from_dict(data)
suffix = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}"
# pylint: disable=line-too-long
out_filename = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}-@ts-@hostsuffix.wacz"
out_filename = (
f"data/{self.sanitize(crawlconfig.name)}-@id/{suffix}-@ts-@hostsuffix.wacz"
)
new_name = await self.crawl_manager.add_crawl_config(
crawlconfig=crawlconfig,

View File

@ -230,13 +230,14 @@ class CrawlOps:
print(f"Duration: {dura}", flush=True)
await self.archives.inc_usage(crawl.aid, dura)
# if crawl.finished:
if crawl.state == "complete":
await self.archives.inc_usage(crawl.aid, dura)
await self.crawl_configs.inc_crawls(
crawl.cid, crawl.id, crawl.finished, crawl.state
)
await self.crawl_configs.inc_crawls(
crawl.cid, crawl.id, crawl.finished, crawl.state
)
if crawl_file:
await self.delete_redis_keys(crawl)
return True
@ -317,9 +318,12 @@ class CrawlOps:
crawls = []
running_ids = set()
for crawl in running_crawls:
list_crawl = ListCrawlOut(**crawl.dict())
crawls.append(await self._resolve_crawl_refs(list_crawl, archive))
running_ids.add(list_crawl.id)
if not running_only:
aid = archive.id if archive else None
@ -327,7 +331,9 @@ class CrawlOps:
aid=aid, exclude_files=True
)
crawls.extend(finished_crawls)
for crawl in finished_crawls:
if crawl.id not in running_ids:
crawls.append(crawl)
return ListCrawls(crawls=crawls)
@ -339,21 +345,34 @@ class CrawlOps:
query["aid"] = archive.id
res = await self.crawls.find_one(query)
crawl = None
completed = False
if not res:
aid_str = archive.id_str if archive else None
crawl = await self.crawl_manager.get_running_crawl(crawlid, aid_str)
if crawl:
await self.get_redis_stats([crawl])
await self.cache_ips(crawl)
else:
if res:
files = [CrawlFile(**data) for data in res["files"]]
del res["files"]
res["resources"] = await self._resolve_signed_urls(files, archive)
crawl = CrawlOut.from_dict(res)
completed = crawl.state == "complete"
if not completed:
aid_str = archive.id_str if archive else None
running_crawl = await self.crawl_manager.get_running_crawl(crawlid, aid_str)
if running_crawl:
await self.get_redis_stats([running_crawl])
await self.cache_ips(running_crawl)
if crawl:
crawl.stats = running_crawl.stats
# pylint: disable=invalid-name
crawl.watchIPs = running_crawl.watchIPs
crawl.scale = running_crawl.scale
crawl.state = running_crawl.state
else:
crawl = running_crawl
if not crawl:
raise HTTPException(status_code=404, detail=f"Crawl not found: {crawlid}")
@ -383,7 +402,7 @@ class CrawlOps:
async with self.redis.pipeline(transaction=True) as pipe:
for file_ in files:
pipe.get(f"{file_.filename}")
pipe.get(f"f:{file_.filename}")
results = await pipe.execute()
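
Not part of this diff: a minimal sketch of the namespaced cache pattern the fix above relies on, using a redis-py asyncio client. The helper names and the one-hour TTL are illustrative assumptions; only the "f:" key prefix comes from the change itself.

from redis import asyncio as aioredis

async def cache_presigned_url(redis: aioredis.Redis, filename: str, url: str):
    # write side: prefix keys with "f:" so cached file URLs can't collide
    # with other keys stored in the same redis database
    await redis.set(f"f:{filename}", url, ex=3600)

async def get_presigned_urls(redis: aioredis.Redis, filenames: list):
    # read side: queue one GET per file and run them in a single round trip,
    # using the same "f:" prefix as the write side
    async with redis.pipeline(transaction=True) as pipe:
        for name in filenames:
            pipe.get(f"f:{name}")
        return await pipe.execute()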

View File

@ -421,13 +421,18 @@ class DockerManager:
self,
userid,
aid,
storage,
command,
storage=None,
storage_name=None,
baseprofile=None,
):
""" Run browser for profile creation """
storage_name = storage.name
storage, storage_path = await self._get_storage_and_path(storage)
if storage_name:
storage = self.storages[storage_name]
storage_path = storage.path
else:
storage_name = storage.name
storage, storage_path = await self._get_storage_and_path(storage)
env_vars = [
f"STORE_USER={userid}",

View File

@ -67,6 +67,12 @@ class K8SManager:
else:
self.crawl_volume["emptyDir"] = {}
crawl_node_type = os.environ.get("CRAWLER_NODE_TYPE")
if crawl_node_type:
self.crawl_node_selector = {"nodeType": crawl_node_type}
else:
self.crawl_node_selector = {}
self.loop = asyncio.get_running_loop()
self.loop.create_task(self.run_event_loop())
self.loop.create_task(self.init_redis(self.redis_url))
@ -172,7 +178,12 @@ class K8SManager:
)
async def add_crawl_config(
self, crawlconfig, storage, run_now, out_filename, profile_filename
self,
crawlconfig,
storage,
run_now,
out_filename,
profile_filename,
):
"""add new crawl as cron job, store crawl config in configmap"""
cid = str(crawlconfig.id)
@ -343,7 +354,7 @@ class K8SManager:
return None, None
manual = job.metadata.annotations.get("btrix.run.manual") == "1"
if manual and not self.no_delete_jobs:
if manual and not self.no_delete_jobs and crawlcomplete.completed:
self.loop.create_task(self._delete_job(job.metadata.name))
crawl = self._make_crawl_for_job(
@ -389,12 +400,14 @@ class K8SManager:
endpoint_url = self._secret_data(storage_secret, "STORE_ENDPOINT_URL")
access_key = self._secret_data(storage_secret, "STORE_ACCESS_KEY")
secret_key = self._secret_data(storage_secret, "STORE_SECRET_KEY")
region = self._secret_data(storage_secret, "STORE_REGION") or ""
self._default_storages[name] = S3Storage(
access_key=access_key,
secret_key=secret_key,
endpoint_url=endpoint_url,
access_endpoint_url=access_endpoint_url,
region=region,
)
return self._default_storages[name]
@ -542,17 +555,19 @@ class K8SManager:
return True
async def run_profile_browser(
self, userid, aid, storage, command, baseprofile=None
self, userid, aid, command, storage=None, storage_name=None, baseprofile=None
):
"""run browser for profile creation """
# Configure Annotations + Labels
if storage.type == "default":
# if default storage, use name and path + profiles/
if storage:
storage_name = storage.name
storage_path = storage.path
storage_path = storage.path + "profiles/"
# otherwise, use storage name and existing path from secret
else:
storage_name = aid
storage_path = ""
# Configure Annotations + Labels
labels = {
"btrix.user": userid,
"btrix.archive": aid,
@ -560,7 +575,7 @@ class K8SManager:
}
if baseprofile:
labels["btrix.baseprofile"] = baseprofile
labels["btrix.baseprofile"] = str(baseprofile)
await self.check_storage(storage_name)
@ -825,7 +840,7 @@ class K8SManager:
if profile_filename:
command.append("--profile")
command.append(f"@{profile_filename}")
command.append(f"@profiles/{profile_filename}")
job_template = {
"metadata": {"annotations": annotations},
@ -835,6 +850,7 @@ class K8SManager:
"template": {
"metadata": {"labels": labels},
"spec": {
"nodeSelector": self.crawl_node_selector,
"containers": [
{
"name": "crawler",
@ -891,7 +907,7 @@ class K8SManager:
},
self.crawl_volume,
],
"restartPolicy": "Never",
"restartPolicy": "OnFailure",
"terminationGracePeriodSeconds": self.grace_period,
},
},

View File

@ -4,6 +4,8 @@ from typing import Optional, List
from datetime import datetime
import uuid
import asyncio
import os
from urllib.parse import urlencode
from fastapi import APIRouter, Depends, Request, HTTPException
@ -100,6 +102,8 @@ class ProfileOps:
self.crawlconfigs = None
self.shared_profile_storage = os.environ.get("SHARED_PROFILE_STORAGE")
def set_crawlconfigs(self, crawlconfigs):
""" set crawlconfigs ops """
self.crawlconfigs = crawlconfigs
@ -116,16 +120,25 @@ class ProfileOps:
""" Create new profile """
command = await self.get_command(profile_launch, archive)
if self.shared_profile_storage:
storage_name = self.shared_profile_storage
storage = None
elif archive.storage and archive.storage.type == "default":
storage_name = None
storage = archive.storage
else:
storage_name = str(archive.id)
storage = None
browserid = await self.crawl_manager.run_profile_browser(
str(user.id),
str(archive.id),
archive.storage,
command,
baseprofile=str(profile_launch.profileId),
storage=storage,
storage_name=storage_name,
baseprofile=profile_launch.profileId,
)
print("base profile", str(profile_launch.profileId))
if not browserid:
raise HTTPException(status_code=400, detail="browser_not_created")
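
For readability, a condensed sketch of the storage-selection rules introduced above (a restatement of the hunk, not code from the repo; the helper name is made up):

def resolve_profile_storage(shared_profile_storage, archive):
    """ Return (storage, storage_name) for run_profile_browser """
    if shared_profile_storage:
        # 1. an operator-configured shared profile storage takes precedence
        return None, shared_profile_storage
    if archive.storage and archive.storage.type == "default":
        # 2. default storage: pass the storage object itself
        return archive.storage, None
    # 3. otherwise, reference the archive's own custom storage by name
    return None, str(archive.id)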
@ -231,7 +244,6 @@ class ProfileOps:
baseid = browser_data.get("btrix.baseprofile")
if baseid:
print("baseid", baseid)
baseid = uuid.UUID(baseid)
profile = Profile(

View File

@ -71,7 +71,7 @@ async def get_s3_client(storage, use_access=False):
async with session.create_client(
"s3",
region_name="",
region_name=storage.region,
endpoint_url=endpoint_url,
aws_access_key_id=storage.access_key,
aws_secret_access_key=storage.secret_key,
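
Not part of this diff: a minimal sketch of how the new optional region flows into client creation, assuming aiobotocore's get_session/create_client API; the function and bucket names are illustrative.

from aiobotocore.session import get_session

async def check_bucket(storage, bucket: str):
    session = get_session()
    async with session.create_client(
        "s3",
        # an unset region stays "", preserving the old behavior; a configured
        # STORE_REGION value now flows through S3Storage.region to the client
        region_name=storage.region or "",
        endpoint_url=storage.endpoint_url,
        aws_access_key_id=storage.access_key,
        aws_secret_access_key=storage.secret_key,
    ) as client:
        # any API call works once the client is created; listing one key is
        # just a connectivity check
        return await client.list_objects_v2(Bucket=bucket, MaxKeys=1)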

View File

@ -24,6 +24,11 @@ spec:
{{- end }}
spec:
{{- if .Values.main_node_type }}
nodeSelector:
nodeType: {{ .Values.main_node_type }}
{{- end }}
initContainers:
{{- if .Values.minio_local }}
- name: init-bucket

View File

@ -27,6 +27,8 @@ data:
CRAWLER_PV_CLAIM: "{{ .Values.crawler_pv_claim }}"
{{- end }}
CRAWLER_NODE_TYPE: "{{ .Values.crawler_node_type }}"
REDIS_URL: "{{ .Values.redis_url }}"
REDIS_CRAWLS_DONE_KEY: "crawls-done"

View File

@ -25,6 +25,11 @@ spec:
spec:
{{- if .Values.main_node_type }}
nodeSelector:
nodeType: {{ .Values.main_node_type }}
{{- end }}
volumes:
- name: nginx-resolver
emptyDir: {}

View File

@ -5,6 +5,8 @@ kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: minio-storage-pvc
annotations:
"helm.sh/resource-policy": keep
spec:
accessModes:
- ReadWriteOnce
@ -24,6 +26,8 @@ apiVersion: v1
kind: PersistentVolume
metadata:
name: "local-minio-store-pv"
annotations:
"helm.sh/resource-policy": keep
spec:
capacity:
storage: 5Gi
@ -54,6 +58,11 @@ spec:
app: local-minio
spec:
{{- if .Values.main_node_type }}
nodeSelector:
nodeType: {{ .Values.main_node_type }}
{{- end }}
volumes:
- name: data-minio
persistentVolumeClaim:

View File

@ -22,6 +22,8 @@ kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: mongo-storage-pvc
annotations:
"helm.sh/resource-policy": keep
spec:
accessModes:
- ReadWriteOnce
@ -41,6 +43,8 @@ apiVersion: v1
kind: PersistentVolume
metadata:
name: "local-mongo-store-pv"
annotations:
"helm.sh/resource-policy": keep
spec:
capacity:
storage: 2Gi
@ -69,6 +73,11 @@ spec:
app: local-mongo
spec:
{{- if .Values.main_node_type }}
nodeSelector:
nodeType: {{ .Values.main_node_type }}
{{- end }}
volumes:
- name: data-db
persistentVolumeClaim:

View File

@ -6,3 +6,4 @@ metadata:
release: {{ .Release.Name }}
annotations:
"helm.sh/resource-policy": keep

View File

@ -5,6 +5,8 @@ kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: redis-storage-pvc
annotations:
"helm.sh/resource-policy": keep
spec:
accessModes:
- ReadWriteOnce
@ -24,6 +26,8 @@ apiVersion: v1
kind: PersistentVolume
metadata:
name: "local-redis-store-pv"
annotations:
"helm.sh/resource-policy": keep
spec:
capacity:
storage: 1Gi

View File

@ -6,7 +6,7 @@ metadata:
name: crawler-run
rules:
- apiGroups: [""]
resources: ["pods", "pods/exec", "pods/log", "services", "configmaps", "secrets", "events"]
resources: ["pods", "pods/exec", "pods/log", "services", "configmaps", "secrets", "events", "persistentvolumeclaims"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete", "deletecollection"]
- apiGroups: ["batch", "extensions"]

View File

@ -26,6 +26,8 @@ stringData:
SUPERUSER_EMAIL: "{{ .Values.superuser.email }}"
SUPERUSER_PASSWORD: "{{ .Values.superuser.password }}"
SHARED_PROFILE_STORAGE: "{{ .Values.shared_profile_storage }}"
{{- range $storage := .Values.storages }}
---
apiVersion: v1
@ -53,6 +55,8 @@ stringData:
STORE_ACCESS_ENDPOINT_URL: "{{ $storage.endpoint_url }}"
{{- end }}
STORE_REGION: {{ $storage.region | default "" }}
{{- if $.Values.signer.auth_token }}
WACZ_SIGN_TOKEN: "{{ $.Values.signer.auth_token }}"
WACZ_SIGN_URL: "http://auth-signer.default:5053/sign"

View File

@ -42,6 +42,8 @@ kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: signer-storage-pvc
annotations:
"helm.sh/resource-policy": keep
spec:
accessModes:
- ReadWriteOnce
@ -61,6 +63,8 @@ apiVersion: v1
kind: PersistentVolume
metadata:
name: "signer-store-pv"
annotations:
"helm.sh/resource-policy": keep
spec:
capacity:
storage: 1Gi
@ -95,6 +99,11 @@ spec:
{{- end }}
spec:
{{- if .Values.main_node_type }}
nodeSelector:
nodeType: {{ .Values.main_node_type }}
{{- end }}
volumes:
- name: signer-config
secret:

View File

@ -6,6 +6,11 @@ name: browsertrix-cloud
# keep empty to use hostPath (eg. on minikube)
volume_storage_class:
# if set, set the node selector 'nodeType' for deployment pods
# main_node_type:
# if set, set the node selector 'nodeType' for crawling pods
# crawler_node_type:
registration_enabled: 1
jwt_token_lifetime_minutes: 60
@ -147,6 +152,10 @@ storages:
endpoint_url: "http://local-minio.default:9000/"
# optional: if the storages above include a separate storage for profiles, specify its name here to store profiles separately from wacz files
# may be useful if, for example, the wacz files are public, while profiles should not be
# shared_profile_storage:
# Email Options
# =========================================
@ -178,7 +187,7 @@ signer:
enabled: false
# host: <set to signer domain>
# cert_email: "test@example.com
# image: webrecorder/authsign:0.3.1
# image: webrecorder/authsign:0.4.0
# image_pull_policy: "IfNotPresent"
# auth_token: <set to custom value>

View File

@ -34,8 +34,8 @@ services:
image: redis
command: redis-server --appendonly yes
ports:
- 6379:6379
#ports:
# - 6379:6379
volumes:
- btrix-redis-data:/data
@ -80,7 +80,7 @@ services:
# enable to support signing of wacz files
# port 80 must be open to automatically generate cert via LetsEncrypt
authsign:
image: webrecorder/authsign:0.3.1
image: webrecorder/authsign:0.4.0
volumes:
- btrix-sign-data:/data