From 3df310ee4f2a8c7c8b2fd7f45f9db1dd7dcd4bd6 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 19 May 2022 18:40:41 -0700 Subject: [PATCH] Backend: Crawls with Multiple WACZ files + Profile + Misc Fixes (#232) * backend: k8s: - support crawls with multiple wacz files, don't assume crawl complete after first wacz uploaded - if crawl is running and has wacz file, still show as running - k8s: allow configuring node selector for main pods (eg. nodeType=main) and for crawlers (eg. nodeType=crawling) - profiles: support uploading to alternate storage, specified via 'shared_profile_storage', when that value is set - misc fixes for profiles * backend: ensure docker run_profile api matches k8s k8s chart: don't delete pvc and pv in helm chart * dependency: bump authsign to 0.4.0 docker: disable public redis port * profiles: fix path, profile browser return value * fix typo in presigned url caching --- backend/archives.py | 1 + backend/crawlconfigs.py | 6 +++- backend/crawls.py | 49 +++++++++++++++++++++++---------- backend/dockerman.py | 11 ++++++-- backend/k8sman.py | 36 +++++++++++++++++------- backend/profiles.py | 22 +++++++++++---- backend/storages.py | 2 +- chart/templates/backend.yaml | 5 ++++ chart/templates/configmap.yaml | 2 ++ chart/templates/frontend.yaml | 5 ++++ chart/templates/minio.yaml | 9 ++++++ chart/templates/mongo.yaml | 9 ++++++ chart/templates/namespaces.yaml | 1 + chart/templates/redis.yaml | 4 +++ chart/templates/role.yaml | 2 +- chart/templates/secrets.yaml | 4 +++ chart/templates/signer.yaml | 9 ++++++ chart/values.yaml | 11 +++++++- docker-compose.yml | 6 ++-- 19 files changed, 154 insertions(+), 40 deletions(-) diff --git a/backend/archives.py b/backend/archives.py index 7c3dbedc..76f46ae8 100644 --- a/backend/archives.py +++ b/backend/archives.py @@ -44,6 +44,7 @@ class S3Storage(BaseModel): access_key: str secret_key: str access_endpoint_url: Optional[str] + region: Optional[str] = "" # 
============================================================================ diff --git a/backend/crawlconfigs.py b/backend/crawlconfigs.py index a747837e..c55e6505 100644 --- a/backend/crawlconfigs.py +++ b/backend/crawlconfigs.py @@ -248,8 +248,12 @@ class CrawlConfigOps: crawlconfig = CrawlConfig.from_dict(data) + suffix = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}" + # pylint: disable=line-too-long - out_filename = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}-@ts-@hostsuffix.wacz" + out_filename = ( + f"data/{self.sanitize(crawlconfig.name)}-@id/{suffix}-@ts-@hostsuffix.wacz" + ) new_name = await self.crawl_manager.add_crawl_config( crawlconfig=crawlconfig, diff --git a/backend/crawls.py b/backend/crawls.py index fba93cb9..ea0221b2 100644 --- a/backend/crawls.py +++ b/backend/crawls.py @@ -230,13 +230,14 @@ class CrawlOps: print(f"Duration: {dura}", flush=True) - await self.archives.inc_usage(crawl.aid, dura) + # if crawl.finished: + if crawl.state == "complete": + await self.archives.inc_usage(crawl.aid, dura) - await self.crawl_configs.inc_crawls( - crawl.cid, crawl.id, crawl.finished, crawl.state - ) + await self.crawl_configs.inc_crawls( + crawl.cid, crawl.id, crawl.finished, crawl.state + ) - if crawl_file: await self.delete_redis_keys(crawl) return True @@ -317,9 +318,12 @@ class CrawlOps: crawls = [] + running_ids = set() + for crawl in running_crawls: list_crawl = ListCrawlOut(**crawl.dict()) crawls.append(await self._resolve_crawl_refs(list_crawl, archive)) + running_ids.add(list_crawl.id) if not running_only: aid = archive.id if archive else None @@ -327,7 +331,9 @@ class CrawlOps: aid=aid, exclude_files=True ) - crawls.extend(finished_crawls) + for crawl in finished_crawls: + if crawl.id not in running_ids: + crawls.append(crawl) return ListCrawls(crawls=crawls) @@ -339,21 +345,34 @@ class CrawlOps: query["aid"] = archive.id res = await self.crawls.find_one(query) + crawl = None + completed = False - if not res: 
- aid_str = archive.id_str if archive else None - crawl = await self.crawl_manager.get_running_crawl(crawlid, aid_str) - if crawl: - await self.get_redis_stats([crawl]) - await self.cache_ips(crawl) - - else: + if res: files = [CrawlFile(**data) for data in res["files"]] del res["files"] res["resources"] = await self._resolve_signed_urls(files, archive) crawl = CrawlOut.from_dict(res) + completed = crawl.state == "complete" + + if not completed: + aid_str = archive.id_str if archive else None + running_crawl = await self.crawl_manager.get_running_crawl(crawlid, aid_str) + if running_crawl: + await self.get_redis_stats([running_crawl]) + await self.cache_ips(running_crawl) + + if crawl: + crawl.stats = running_crawl.stats + # pylint: disable=invalid-name + crawl.watchIPs = running_crawl.watchIPs + crawl.scale = running_crawl.scale + crawl.state = running_crawl.state + + else: + crawl = running_crawl if not crawl: raise HTTPException(status_code=404, detail=f"Crawl not found: {crawlid}") @@ -383,7 +402,7 @@ class CrawlOps: async with self.redis.pipeline(transaction=True) as pipe: for file_ in files: - pipe.get(f"{file_.filename}") + pipe.get(f"f:{file_.filename}") results = await pipe.execute() diff --git a/backend/dockerman.py b/backend/dockerman.py index 26328320..c45d9d62 100644 --- a/backend/dockerman.py +++ b/backend/dockerman.py @@ -421,13 +421,18 @@ class DockerManager: self, userid, aid, - storage, command, + storage=None, + storage_name=None, baseprofile=None, ): """ Run browser for profile creation """ - storage_name = storage.name - storage, storage_path = await self._get_storage_and_path(storage) + if storage_name: + storage = self.storages[storage_name] + storage_path = storage.path + else: + storage_name = storage.name + storage, storage_path = await self._get_storage_and_path(storage) env_vars = [ f"STORE_USER={userid}", diff --git a/backend/k8sman.py b/backend/k8sman.py index 49a1c6d5..76c1cfac 100644 --- a/backend/k8sman.py +++ b/backend/k8sman.py @@ 
-67,6 +67,12 @@ class K8SManager: else: self.crawl_volume["emptyDir"] = {} + crawl_node_type = os.environ.get("CRAWLER_NODE_TYPE") + if crawl_node_type: + self.crawl_node_selector = {"nodeType": crawl_node_type} + else: + self.crawl_node_selector = {} + self.loop = asyncio.get_running_loop() self.loop.create_task(self.run_event_loop()) self.loop.create_task(self.init_redis(self.redis_url)) @@ -172,7 +178,12 @@ class K8SManager: ) async def add_crawl_config( - self, crawlconfig, storage, run_now, out_filename, profile_filename + self, + crawlconfig, + storage, + run_now, + out_filename, + profile_filename, ): """add new crawl as cron job, store crawl config in configmap""" cid = str(crawlconfig.id) @@ -343,7 +354,7 @@ class K8SManager: return None, None manual = job.metadata.annotations.get("btrix.run.manual") == "1" - if manual and not self.no_delete_jobs: + if manual and not self.no_delete_jobs and crawlcomplete.completed: self.loop.create_task(self._delete_job(job.metadata.name)) crawl = self._make_crawl_for_job( @@ -389,12 +400,14 @@ class K8SManager: endpoint_url = self._secret_data(storage_secret, "STORE_ENDPOINT_URL") access_key = self._secret_data(storage_secret, "STORE_ACCESS_KEY") secret_key = self._secret_data(storage_secret, "STORE_SECRET_KEY") + region = self._secret_data(storage_secret, "STORE_REGION") or "" self._default_storages[name] = S3Storage( access_key=access_key, secret_key=secret_key, endpoint_url=endpoint_url, access_endpoint_url=access_endpoint_url, + region=region, ) return self._default_storages[name] @@ -542,17 +555,19 @@ class K8SManager: return True async def run_profile_browser( - self, userid, aid, storage, command, baseprofile=None + self, userid, aid, command, storage=None, storage_name=None, baseprofile=None ): """run browser for profile creation """ - # Configure Annotations + Labels - if storage.type == "default": + + # if default storage, use name and path + profiles/ + if storage: storage_name = storage.name - storage_path = 
storage.path + storage_path = storage.path + "profiles/" + # otherwise, use storage name and existing path from secret else: - storage_name = aid storage_path = "" + # Configure Annotations + Labels labels = { "btrix.user": userid, "btrix.archive": aid, @@ -560,7 +575,7 @@ class K8SManager: } if baseprofile: - labels["btrix.baseprofile"] = baseprofile + labels["btrix.baseprofile"] = str(baseprofile) await self.check_storage(storage_name) @@ -825,7 +840,7 @@ class K8SManager: if profile_filename: command.append("--profile") - command.append(f"@{profile_filename}") + command.append(f"@profiles/{profile_filename}") job_template = { "metadata": {"annotations": annotations}, @@ -835,6 +850,7 @@ class K8SManager: "template": { "metadata": {"labels": labels}, "spec": { + "nodeSelector": self.crawl_node_selector, "containers": [ { "name": "crawler", @@ -891,7 +907,7 @@ class K8SManager: }, self.crawl_volume, ], - "restartPolicy": "Never", + "restartPolicy": "OnFailure", "terminationGracePeriodSeconds": self.grace_period, }, }, diff --git a/backend/profiles.py b/backend/profiles.py index 2e56b0e8..a47ac949 100644 --- a/backend/profiles.py +++ b/backend/profiles.py @@ -4,6 +4,8 @@ from typing import Optional, List from datetime import datetime import uuid import asyncio +import os + from urllib.parse import urlencode from fastapi import APIRouter, Depends, Request, HTTPException @@ -100,6 +102,8 @@ class ProfileOps: self.crawlconfigs = None + self.shared_profile_storage = os.environ.get("SHARED_PROFILE_STORAGE") + def set_crawlconfigs(self, crawlconfigs): """ set crawlconfigs ops """ self.crawlconfigs = crawlconfigs @@ -116,16 +120,25 @@ class ProfileOps: """ Create new profile """ command = await self.get_command(profile_launch, archive) + if self.shared_profile_storage: + storage_name = self.shared_profile_storage + storage = None + elif archive.storage and archive.storage.type == "default": + storage_name = None + storage = archive.storage + else: + storage_name = 
str(archive.id) + storage = None + browserid = await self.crawl_manager.run_profile_browser( str(user.id), str(archive.id), - archive.storage, command, - baseprofile=str(profile_launch.profileId), + storage=storage, + storage_name=storage_name, + baseprofile=profile_launch.profileId, ) - print("base profile", str(profile_launch.profileId)) - if not browserid: raise HTTPException(status_code=400, detail="browser_not_created") @@ -231,7 +244,6 @@ class ProfileOps: baseid = browser_data.get("btrix.baseprofile") if baseid: - print("baseid", baseid) baseid = uuid.UUID(baseid) profile = Profile( diff --git a/backend/storages.py b/backend/storages.py index 5a807b0e..a43a4aab 100644 --- a/backend/storages.py +++ b/backend/storages.py @@ -71,7 +71,7 @@ async def get_s3_client(storage, use_access=False): async with session.create_client( "s3", - region_name="", + region_name=storage.region, endpoint_url=endpoint_url, aws_access_key_id=storage.access_key, aws_secret_access_key=storage.secret_key, diff --git a/chart/templates/backend.yaml b/chart/templates/backend.yaml index 3ed52364..7d16a237 100644 --- a/chart/templates/backend.yaml +++ b/chart/templates/backend.yaml @@ -24,6 +24,11 @@ spec: {{- end }} spec: + {{- if .Values.main_node_type }} + nodeSelector: + nodeType: {{ .Values.main_node_type }} + {{- end }} + initContainers: {{- if .Values.minio_local }} - name: init-bucket diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 281b7b78..d9a53ea3 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -27,6 +27,8 @@ data: CRAWLER_PV_CLAIM: "{{ .Values.crawler_pv_claim }}" {{- end }} + CRAWLER_NODE_TYPE: "{{ .Values.crawler_node_type }}" + REDIS_URL: "{{ .Values.redis_url }}" REDIS_CRAWLS_DONE_KEY: "crawls-done" diff --git a/chart/templates/frontend.yaml b/chart/templates/frontend.yaml index 065572f5..d22b60c7 100644 --- a/chart/templates/frontend.yaml +++ b/chart/templates/frontend.yaml @@ -25,6 +25,11 @@ spec: 
spec: + {{- if .Values.main_node_type }} + nodeSelector: + nodeType: {{ .Values.main_node_type }} + {{- end }} + volumes: - name: nginx-resolver emptyDir: {} diff --git a/chart/templates/minio.yaml b/chart/templates/minio.yaml index 6d8fa28c..b1b65b2f 100644 --- a/chart/templates/minio.yaml +++ b/chart/templates/minio.yaml @@ -5,6 +5,8 @@ kind: PersistentVolumeClaim apiVersion: v1 metadata: name: minio-storage-pvc + annotations: + "helm.sh/resource-policy": keep spec: accessModes: - ReadWriteOnce @@ -24,6 +26,8 @@ apiVersion: v1 kind: PersistentVolume metadata: name: "local-minio-store-pv" + annotations: + "helm.sh/resource-policy": keep spec: capacity: storage: 5Gi @@ -54,6 +58,11 @@ spec: app: local-minio spec: + {{- if .Values.main_node_type }} + nodeSelector: + nodeType: {{ .Values.main_node_type }} + {{- end }} + volumes: - name: data-minio persistentVolumeClaim: diff --git a/chart/templates/mongo.yaml b/chart/templates/mongo.yaml index fdb759f9..105254aa 100644 --- a/chart/templates/mongo.yaml +++ b/chart/templates/mongo.yaml @@ -22,6 +22,8 @@ kind: PersistentVolumeClaim apiVersion: v1 metadata: name: mongo-storage-pvc + annotations: + "helm.sh/resource-policy": keep spec: accessModes: - ReadWriteOnce @@ -41,6 +43,8 @@ apiVersion: v1 kind: PersistentVolume metadata: name: "local-mongo-store-pv" + annotations: + "helm.sh/resource-policy": keep spec: capacity: storage: 2Gi @@ -69,6 +73,11 @@ spec: app: local-mongo spec: + {{- if .Values.main_node_type }} + nodeSelector: + nodeType: {{ .Values.main_node_type }} + {{- end }} + volumes: - name: data-db persistentVolumeClaim: diff --git a/chart/templates/namespaces.yaml b/chart/templates/namespaces.yaml index fb4bc2de..491644f8 100644 --- a/chart/templates/namespaces.yaml +++ b/chart/templates/namespaces.yaml @@ -6,3 +6,4 @@ metadata: release: {{ .Release.Name }} annotations: "helm.sh/resource-policy": keep + diff --git a/chart/templates/redis.yaml b/chart/templates/redis.yaml index 0fa63e8c..5ef14033 100644 --- 
a/chart/templates/redis.yaml +++ b/chart/templates/redis.yaml @@ -5,6 +5,8 @@ kind: PersistentVolumeClaim apiVersion: v1 metadata: name: redis-storage-pvc + annotations: + "helm.sh/resource-policy": keep spec: accessModes: - ReadWriteOnce @@ -24,6 +26,8 @@ apiVersion: v1 kind: PersistentVolume metadata: name: "local-redis-store-pv" + annotations: + "helm.sh/resource-policy": keep spec: capacity: storage: 1Gi diff --git a/chart/templates/role.yaml b/chart/templates/role.yaml index 3250ca7d..a1692a38 100644 --- a/chart/templates/role.yaml +++ b/chart/templates/role.yaml @@ -6,7 +6,7 @@ metadata: name: crawler-run rules: - apiGroups: [""] - resources: ["pods", "pods/exec", "pods/log", "services", "configmaps", "secrets", "events"] + resources: ["pods", "pods/exec", "pods/log", "services", "configmaps", "secrets", "events", "persistentvolumeclaims"] verbs: ["get", "list", "watch", "create", "update", "patch", "delete", "deletecollection"] - apiGroups: ["batch", "extensions"] diff --git a/chart/templates/secrets.yaml b/chart/templates/secrets.yaml index 4376fe3e..64769998 100644 --- a/chart/templates/secrets.yaml +++ b/chart/templates/secrets.yaml @@ -26,6 +26,8 @@ stringData: SUPERUSER_EMAIL: "{{ .Values.superuser.email }}" SUPERUSER_PASSWORD: "{{ .Values.superuser.password }}" + SHARED_PROFILE_STORAGE: "{{ .Values.shared_profile_storage }}" + {{- range $storage := .Values.storages }} --- apiVersion: v1 @@ -53,6 +55,8 @@ stringData: STORE_ACCESS_ENDPOINT_URL: "{{ $storage.endpoint_url }}" {{- end }} + STORE_REGION: {{ $storage.region | default "" }} + {{- if $.Values.signer.auth_token }} WACZ_SIGN_TOKEN: "{{ $.Values.signer.auth_token }}" WACZ_SIGN_URL: "http://auth-signer.default:5053/sign" diff --git a/chart/templates/signer.yaml b/chart/templates/signer.yaml index d5bb8983..4449ca83 100644 --- a/chart/templates/signer.yaml +++ b/chart/templates/signer.yaml @@ -42,6 +42,8 @@ kind: PersistentVolumeClaim apiVersion: v1 metadata: name: signer-storage-pvc + annotations: 
+ "helm.sh/resource-policy": keep spec: accessModes: - ReadWriteOnce @@ -61,6 +63,8 @@ apiVersion: v1 kind: PersistentVolume metadata: name: "signer-store-pv" + annotations: + "helm.sh/resource-policy": keep spec: capacity: storage: 1Gi @@ -95,6 +99,11 @@ spec: {{- end }} spec: + {{- if .Values.main_node_type }} + nodeSelector: + nodeType: {{ .Values.main_node_type }} + {{- end }} + volumes: - name: signer-config secret: diff --git a/chart/values.yaml b/chart/values.yaml index 68325e8c..06dd28d1 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -6,6 +6,11 @@ name: browsertrix-cloud # keep empty to use hostPath (eg. on minikube) volume_storage_class: +# if set, set the node selector 'nodeType' for deployment pods +# main_node_type: + +# if set, set the node selector 'nodeType' for crawling pods +# crawler_node_type: registration_enabled: 1 jwt_token_lifetime_minutes: 60 @@ -147,6 +152,10 @@ storages: endpoint_url: "http://local-minio.default:9000/" +# optional: if above includes a separate storage for profiles, specify here to store profiles separately from wacz files +# may be useful if, for example, the wacz files are public, while profiles should not be +# shared_profile_storage: + # Email Options # ========================================= @@ -178,7 +187,7 @@ signer: enabled: false # host: # cert_email: "test@example.com - # image: webrecorder/authsign:0.3.1 + # image: webrecorder/authsign:0.4.0 # image_pull_policy: "IfNotPresent" # auth_token: diff --git a/docker-compose.yml b/docker-compose.yml index 46031d32..cde947e2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -34,8 +34,8 @@ services: image: redis command: redis-server --appendonly yes - ports: - - 6379:6379 + #ports: + # - 6379:6379 volumes: - btrix-redis-data:/data @@ -80,7 +80,7 @@ services: # enable to support signing of wacz files # port 80 must be open to automatically generate cert via LetsEncrypt authsign: - image: 
webrecorder/authsign:0.4.0 volumes: - btrix-sign-data:/data