Backend: Crawls with Multiple WACZ files + Profile + Misc Fixes (#232)

* backend: k8s:
- support crawls with multiple WACZ files; don't assume the crawl is complete after the first WACZ is uploaded
- if a crawl is running and already has a WACZ file, still show it as running
- allow configuring the node selector for main pods (e.g. nodeType=main) and for crawler pods (e.g. nodeType=crawling)
- profiles: support uploading to an alternate storage when the 'shared_profile_storage' value is set (sketched below)
- misc fixes for profiles

* backend: ensure the docker run_profile API matches k8s
k8s chart: don't delete PVCs and PVs in the helm chart

* dependency: bump authsign to 0.4.0
docker: disable public redis port

* profiles: fix path, profile browser return value

* fix typo in presigned URL caching
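The profile-storage selection this adds boils down to a three-way choice. As a rough overview of the ProfileOps logic in the diff below (the standalone function wrapper is only for illustration; `archive` is the archive object passed to profile creation):

    import os

    def pick_profile_storage(archive):
        """Return (storage, storage_name) for the profile browser, mirroring the ProfileOps change below."""
        shared = os.environ.get("SHARED_PROFILE_STORAGE")
        if shared:
            # shared storage explicitly configured via the 'shared_profile_storage' helm value
            return None, shared
        if archive.storage and archive.storage.type == "default":
            # default storage: pass the storage object through, the crawl manager resolves the path
            return archive.storage, None
        # custom per-archive storage: reference it by name (the archive id); its path comes from the storage secret
        return None, str(archive.id)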
Ilya Kreymer authored 2022-05-19 18:40:41 -07:00, committed by GitHub
parent cdefb8d06e
commit 3df310ee4f
19 changed files with 154 additions and 40 deletions

View File

@@ -44,6 +44,7 @@ class S3Storage(BaseModel):
     access_key: str
     secret_key: str
     access_endpoint_url: Optional[str]
+    region: Optional[str] = ""


 # ============================================================================

View File

@@ -248,8 +248,12 @@ class CrawlConfigOps:
         crawlconfig = CrawlConfig.from_dict(data)

+        suffix = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}"
+
         # pylint: disable=line-too-long
-        out_filename = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}-@ts-@hostsuffix.wacz"
+        out_filename = (
+            f"data/{self.sanitize(crawlconfig.name)}-@id/{suffix}-@ts-@hostsuffix.wacz"
+        )

         new_name = await self.crawl_manager.add_crawl_config(
             crawlconfig=crawlconfig,
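For illustration only: with the new per-crawl prefix, a config named "My Crawl" created by user "admin" (both hypothetical, and assuming sanitize() lowercases and dash-joins names) would produce WACZ paths like the following; @id, @ts and @hostsuffix are placeholders the crawler substitutes at runtime:

    name = "my-crawl"          # assumed result of sanitize("My Crawl")
    suffix = "my-crawl-admin"  # assumed result of f"{sanitize('My Crawl')}-{sanitize('admin')}"

    out_filename = f"data/{name}-@id/{suffix}-@ts-@hostsuffix.wacz"
    print(out_filename)  # data/my-crawl-@id/my-crawl-admin-@ts-@hostsuffix.wacz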

View File

@@ -230,13 +230,14 @@ class CrawlOps:
         print(f"Duration: {dura}", flush=True)

-        await self.archives.inc_usage(crawl.aid, dura)
-
-        await self.crawl_configs.inc_crawls(
-            crawl.cid, crawl.id, crawl.finished, crawl.state
-        )
-
-        await self.delete_redis_keys(crawl)
+        # if crawl.finished:
+        if crawl.state == "complete":
+            await self.archives.inc_usage(crawl.aid, dura)
+            await self.crawl_configs.inc_crawls(
+                crawl.cid, crawl.id, crawl.finished, crawl.state
+            )
+            if crawl_file:
+                await self.delete_redis_keys(crawl)

         return True
@@ -317,9 +318,12 @@ class CrawlOps:
         crawls = []
+        running_ids = set()
+
         for crawl in running_crawls:
             list_crawl = ListCrawlOut(**crawl.dict())
             crawls.append(await self._resolve_crawl_refs(list_crawl, archive))
+            running_ids.add(list_crawl.id)

         if not running_only:
             aid = archive.id if archive else None
@@ -327,7 +331,9 @@ class CrawlOps:
                 aid=aid, exclude_files=True
             )
-            crawls.extend(finished_crawls)
+            for crawl in finished_crawls:
+                if crawl.id not in running_ids:
+                    crawls.append(crawl)

         return ListCrawls(crawls=crawls)
@@ -339,21 +345,34 @@ class CrawlOps:
             query["aid"] = archive.id

         res = await self.crawls.find_one(query)

-        if not res:
-            aid_str = archive.id_str if archive else None
-            crawl = await self.crawl_manager.get_running_crawl(crawlid, aid_str)
-            if crawl:
-                await self.get_redis_stats([crawl])
-                await self.cache_ips(crawl)
-
-        else:
+        crawl = None
+        completed = False
+
+        if res:
             files = [CrawlFile(**data) for data in res["files"]]
             del res["files"]
             res["resources"] = await self._resolve_signed_urls(files, archive)
             crawl = CrawlOut.from_dict(res)
+            completed = crawl.state == "complete"
+
+        if not completed:
+            aid_str = archive.id_str if archive else None
+            running_crawl = await self.crawl_manager.get_running_crawl(crawlid, aid_str)
+            if running_crawl:
+                await self.get_redis_stats([running_crawl])
+                await self.cache_ips(running_crawl)
+
+                if crawl:
+                    crawl.stats = running_crawl.stats
+                    # pylint: disable=invalid-name
+                    crawl.watchIPs = running_crawl.watchIPs
+                    crawl.scale = running_crawl.scale
+                    crawl.state = running_crawl.state
+                else:
+                    crawl = running_crawl

         if not crawl:
             raise HTTPException(status_code=404, detail=f"Crawl not found: {crawlid}")
@@ -383,7 +402,7 @@ class CrawlOps:
         async with self.redis.pipeline(transaction=True) as pipe:
             for file_ in files:
-                pipe.get(f"{file_.filename}")
+                pipe.get(f"f:{file_.filename}")

             results = await pipe.execute()
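The lookup above now reads cached presigned URLs under an "f:"-prefixed key. A minimal sketch of that convention, assuming an asyncio redis client and that the writer side uses the same prefix (both helper names are hypothetical):

    async def cache_presigned_url(redis, filename, url, ttl_seconds):
        # write under the same "f:<filename>" key the pipeline above reads
        await redis.set(f"f:{filename}", url, ex=ttl_seconds)

    async def cached_presigned_url(redis, filename):
        return await redis.get(f"f:{filename}")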

View File

@@ -421,13 +421,18 @@ class DockerManager:
         self,
         userid,
         aid,
-        storage,
         command,
+        storage=None,
+        storage_name=None,
         baseprofile=None,
     ):
         """ Run browser for profile creation """

-        storage_name = storage.name
-        storage, storage_path = await self._get_storage_and_path(storage)
+        if storage_name:
+            storage = self.storages[storage_name]
+            storage_path = storage.path
+        else:
+            storage_name = storage.name
+            storage, storage_path = await self._get_storage_and_path(storage)

         env_vars = [
             f"STORE_USER={userid}",

View File

@@ -67,6 +67,12 @@ class K8SManager:
         else:
             self.crawl_volume["emptyDir"] = {}

+        crawl_node_type = os.environ.get("CRAWLER_NODE_TYPE")
+        if crawl_node_type:
+            self.crawl_node_selector = {"nodeType": crawl_node_type}
+        else:
+            self.crawl_node_selector = {}
+
         self.loop = asyncio.get_running_loop()
         self.loop.create_task(self.run_event_loop())
         self.loop.create_task(self.init_redis(self.redis_url))
@@ -172,7 +178,12 @@ class K8SManager:
         )

     async def add_crawl_config(
-        self, crawlconfig, storage, run_now, out_filename, profile_filename
+        self,
+        crawlconfig,
+        storage,
+        run_now,
+        out_filename,
+        profile_filename,
     ):
         """add new crawl as cron job, store crawl config in configmap"""
         cid = str(crawlconfig.id)
@@ -343,7 +354,7 @@ class K8SManager:
             return None, None

         manual = job.metadata.annotations.get("btrix.run.manual") == "1"
-        if manual and not self.no_delete_jobs:
+        if manual and not self.no_delete_jobs and crawlcomplete.completed:
             self.loop.create_task(self._delete_job(job.metadata.name))

         crawl = self._make_crawl_for_job(
@@ -389,12 +400,14 @@ class K8SManager:
             endpoint_url = self._secret_data(storage_secret, "STORE_ENDPOINT_URL")
             access_key = self._secret_data(storage_secret, "STORE_ACCESS_KEY")
             secret_key = self._secret_data(storage_secret, "STORE_SECRET_KEY")
+            region = self._secret_data(storage_secret, "STORE_REGION") or ""

             self._default_storages[name] = S3Storage(
                 access_key=access_key,
                 secret_key=secret_key,
                 endpoint_url=endpoint_url,
                 access_endpoint_url=access_endpoint_url,
+                region=region,
             )

         return self._default_storages[name]
@@ -542,17 +555,19 @@ class K8SManager:
         return True

     async def run_profile_browser(
-        self, userid, aid, storage, command, baseprofile=None
+        self, userid, aid, command, storage=None, storage_name=None, baseprofile=None
     ):
         """run browser for profile creation """

-        # Configure Annotations + Labels
-        if storage.type == "default":
+        # if default storage, use name and path + profiles/
+        if storage:
             storage_name = storage.name
-            storage_path = storage.path
+            storage_path = storage.path + "profiles/"
+        # otherwise, use storage name and existing path from secret
         else:
-            storage_name = aid
             storage_path = ""

+        # Configure Annotations + Labels
         labels = {
             "btrix.user": userid,
             "btrix.archive": aid,
@@ -560,7 +575,7 @@ class K8SManager:
         }

         if baseprofile:
-            labels["btrix.baseprofile"] = baseprofile
+            labels["btrix.baseprofile"] = str(baseprofile)

         await self.check_storage(storage_name)
@@ -825,7 +840,7 @@ class K8SManager:
         if profile_filename:
             command.append("--profile")
-            command.append(f"@{profile_filename}")
+            command.append(f"@profiles/{profile_filename}")

         job_template = {
             "metadata": {"annotations": annotations},
@@ -835,6 +850,7 @@ class K8SManager:
                 "template": {
                     "metadata": {"labels": labels},
                     "spec": {
+                        "nodeSelector": self.crawl_node_selector,
                         "containers": [
                             {
                                 "name": "crawler",
@@ -891,7 +907,7 @@ class K8SManager:
                             },
                             self.crawl_volume,
                         ],
-                        "restartPolicy": "Never",
+                        "restartPolicy": "OnFailure",
                         "terminationGracePeriodSeconds": self.grace_period,
                     },
                 },
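Taken together, the crawler job's pod template now carries a node selector derived from the CRAWLER_NODE_TYPE environment variable (set from the helm 'crawler_node_type' value). A minimal standalone sketch of that wiring; the example value is hypothetical:

    import os

    # e.g. CRAWLER_NODE_TYPE=crawling
    crawl_node_type = os.environ.get("CRAWLER_NODE_TYPE")
    crawl_node_selector = {"nodeType": crawl_node_type} if crawl_node_type else {}

    pod_spec = {
        # an empty selector leaves the crawler schedulable on any node
        "nodeSelector": crawl_node_selector,
        "restartPolicy": "OnFailure",
    }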

View File

@@ -4,6 +4,8 @@ from typing import Optional, List
 from datetime import datetime
 import uuid
 import asyncio
+import os

 from urllib.parse import urlencode

 from fastapi import APIRouter, Depends, Request, HTTPException
@@ -100,6 +102,8 @@ class ProfileOps:
         self.crawlconfigs = None

+        self.shared_profile_storage = os.environ.get("SHARED_PROFILE_STORAGE")
+
     def set_crawlconfigs(self, crawlconfigs):
         """ set crawlconfigs ops """
         self.crawlconfigs = crawlconfigs
@@ -116,16 +120,25 @@ class ProfileOps:
         """ Create new profile """
         command = await self.get_command(profile_launch, archive)

+        if self.shared_profile_storage:
+            storage_name = self.shared_profile_storage
+            storage = None
+        elif archive.storage and archive.storage.type == "default":
+            storage_name = None
+            storage = archive.storage
+        else:
+            storage_name = str(archive.id)
+            storage = None
+
         browserid = await self.crawl_manager.run_profile_browser(
             str(user.id),
             str(archive.id),
-            archive.storage,
             command,
-            baseprofile=str(profile_launch.profileId),
+            storage=storage,
+            storage_name=storage_name,
+            baseprofile=profile_launch.profileId,
         )

-        print("base profile", str(profile_launch.profileId))
-
         if not browserid:
             raise HTTPException(status_code=400, detail="browser_not_created")
@@ -231,7 +244,6 @@ class ProfileOps:
         baseid = browser_data.get("btrix.baseprofile")
         if baseid:
-            print("baseid", baseid)
             baseid = uuid.UUID(baseid)

         profile = Profile(

View File

@@ -71,7 +71,7 @@ async def get_s3_client(storage, use_access=False):

     async with session.create_client(
         "s3",
-        region_name="",
+        region_name=storage.region,
         endpoint_url=endpoint_url,
         aws_access_key_id=storage.access_key,
         aws_secret_access_key=storage.secret_key,
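With the new optional `region` field on S3Storage, the configured region now flows straight into the S3 client. A condensed sketch, assuming the aiobotocore 2.x `get_session()` import path and eliding the store-vs-access endpoint selection that get_s3_client also performs:

    from contextlib import asynccontextmanager

    from aiobotocore.session import get_session

    @asynccontextmanager
    async def s3_client(storage):
        session = get_session()
        # region defaults to "" when not configured, matching the S3Storage field default
        async with session.create_client(
            "s3",
            region_name=storage.region,
            endpoint_url=storage.endpoint_url,
            aws_access_key_id=storage.access_key,
            aws_secret_access_key=storage.secret_key,
        ) as client:
            yield client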

View File

@@ -24,6 +24,11 @@ spec:
       {{- end }}
     spec:
+      {{- if .Values.main_node_type }}
+      nodeSelector:
+        nodeType: {{ .Values.main_node_type }}
+      {{- end }}
+
       initContainers:
         {{- if .Values.minio_local }}
         - name: init-bucket

View File

@@ -27,6 +27,8 @@ data:
   CRAWLER_PV_CLAIM: "{{ .Values.crawler_pv_claim }}"
   {{- end }}

+  CRAWLER_NODE_TYPE: "{{ .Values.crawler_node_type }}"
+
   REDIS_URL: "{{ .Values.redis_url }}"

   REDIS_CRAWLS_DONE_KEY: "crawls-done"

View File

@@ -25,6 +25,11 @@ spec:
     spec:
+      {{- if .Values.main_node_type }}
+      nodeSelector:
+        nodeType: {{ .Values.main_node_type }}
+      {{- end }}
+
       volumes:
         - name: nginx-resolver
           emptyDir: {}

View File

@@ -5,6 +5,8 @@ kind: PersistentVolumeClaim
 apiVersion: v1
 metadata:
   name: minio-storage-pvc
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   accessModes:
     - ReadWriteOnce
@@ -24,6 +26,8 @@ apiVersion: v1
 kind: PersistentVolume
 metadata:
   name: "local-minio-store-pv"
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   capacity:
     storage: 5Gi
@@ -54,6 +58,11 @@ spec:
         app: local-minio

     spec:
+      {{- if .Values.main_node_type }}
+      nodeSelector:
+        nodeType: {{ .Values.main_node_type }}
+      {{- end }}
+
       volumes:
         - name: data-minio
           persistentVolumeClaim:

View File

@@ -22,6 +22,8 @@ kind: PersistentVolumeClaim
 apiVersion: v1
 metadata:
   name: mongo-storage-pvc
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   accessModes:
     - ReadWriteOnce
@@ -41,6 +43,8 @@ apiVersion: v1
 kind: PersistentVolume
 metadata:
   name: "local-mongo-store-pv"
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   capacity:
     storage: 2Gi
@@ -69,6 +73,11 @@ spec:
         app: local-mongo

     spec:
+      {{- if .Values.main_node_type }}
+      nodeSelector:
+        nodeType: {{ .Values.main_node_type }}
+      {{- end }}
+
       volumes:
         - name: data-db
           persistentVolumeClaim:

View File

@@ -6,3 +6,4 @@ metadata:
     release: {{ .Release.Name }}
   annotations:
     "helm.sh/resource-policy": keep
+

View File

@@ -5,6 +5,8 @@ kind: PersistentVolumeClaim
 apiVersion: v1
 metadata:
   name: redis-storage-pvc
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   accessModes:
     - ReadWriteOnce
@@ -24,6 +26,8 @@ apiVersion: v1
 kind: PersistentVolume
 metadata:
   name: "local-redis-store-pv"
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   capacity:
     storage: 1Gi

View File

@@ -6,7 +6,7 @@ metadata:
   name: crawler-run
 rules:
 - apiGroups: [""]
-  resources: ["pods", "pods/exec", "pods/log", "services", "configmaps", "secrets", "events"]
+  resources: ["pods", "pods/exec", "pods/log", "services", "configmaps", "secrets", "events", "persistentvolumeclaims"]
   verbs: ["get", "list", "watch", "create", "update", "patch", "delete", "deletecollection"]
 - apiGroups: ["batch", "extensions"]

View File

@@ -26,6 +26,8 @@ stringData:
   SUPERUSER_EMAIL: "{{ .Values.superuser.email }}"
   SUPERUSER_PASSWORD: "{{ .Values.superuser.password }}"

+  SHARED_PROFILE_STORAGE: "{{ .Values.shared_profile_storage }}"
+
 {{- range $storage := .Values.storages }}
 ---
 apiVersion: v1
@@ -53,6 +55,8 @@ stringData:
   STORE_ACCESS_ENDPOINT_URL: "{{ $storage.endpoint_url }}"
   {{- end }}

+  STORE_REGION: {{ $storage.region | default "" }}
+
 {{- if $.Values.signer.auth_token }}
   WACZ_SIGN_TOKEN: "{{ $.Values.signer.auth_token }}"
   WACZ_SIGN_URL: "http://auth-signer.default:5053/sign"

View File

@@ -42,6 +42,8 @@ kind: PersistentVolumeClaim
 apiVersion: v1
 metadata:
   name: signer-storage-pvc
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   accessModes:
     - ReadWriteOnce
@@ -61,6 +63,8 @@ apiVersion: v1
 kind: PersistentVolume
 metadata:
   name: "signer-store-pv"
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   capacity:
     storage: 1Gi
@@ -95,6 +99,11 @@ spec:
       {{- end }}
     spec:
+      {{- if .Values.main_node_type }}
+      nodeSelector:
+        nodeType: {{ .Values.main_node_type }}
+      {{- end }}
+
       volumes:
         - name: signer-config
           secret:

View File

@@ -6,6 +6,11 @@ name: browsertrix-cloud
 # keep empty to use hostPath (eg. on minikube)
 volume_storage_class:

+# if set, set the node selector 'nodeType' for deployment pods
+# main_node_type:
+
+# if set, set the node selector 'nodeType' to this crawling pods
+# crawler_node_type:
+
 registration_enabled: 1
 jwt_token_lifetime_minutes: 60
@@ -147,6 +152,10 @@ storages:
     endpoint_url: "http://local-minio.default:9000/"

+# optional: if above includes a separate storage for profiles, specify here to store profiles separately from wacz files
+# may be useful if, for example, the wacz files are public, while profiles should not be
+# shared_storage_profile:
+
 # Email Options
 # =========================================
@@ -178,7 +187,7 @@ signer:
   enabled: false
   # host: <set to signer domain>
   # cert_email: "test@example.com
-  # image: webrecorder/authsign:0.3.1
+  # image: webrecorder/authsign:0.4.0
   # image_pull_policy: "IfNotPresent"
   # auth_token: <set to custom value>

View File

@@ -34,8 +34,8 @@ services:
     image: redis
     command: redis-server --appendonly yes

-    ports:
-      - 6379:6379
+    #ports:
+    # - 6379:6379

     volumes:
       - btrix-redis-data:/data
@@ -80,7 +80,7 @@ services:
   # enable to support signing of wacz files
   # port 80 must be open to automatically generate cert via LetsEncrypt
   authsign:
-    image: webrecorder/authsign:0.3.1
+    image: webrecorder/authsign:0.4.0

     volumes:
       - btrix-sign-data:/data