Backend: Crawls with Multiple WACZ files + Profile + Misc Fixes (#232)

* backend: k8s:
- support crawls with multiple WACZ files; don't assume the crawl is complete after the first WACZ is uploaded
- if a crawl is running and already has a WACZ file, still show it as running
- allow configuring the node selector for main pods (e.g. nodeType=main) and for crawler pods (e.g. nodeType=crawling)
- profiles: support uploading to an alternate storage when the 'shared_profile_storage' value is set (sketched below)
- misc fixes for profiles

* backend: ensure the docker run_profile API matches k8s
k8s chart: don't delete PVCs and PVs in the helm chart

* dependency: bump authsign to 0.4.0
docker: disable public redis port

* profiles: fix path, profile browser return value

* fix typo in presigned URL caching
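The profile-storage selection this adds boils down to a three-way choice. As a rough overview of the ProfileOps logic in the diff below (the standalone function wrapper is only for illustration; `archive` is the archive object passed to profile creation):

    import os

    def pick_profile_storage(archive):
        """Return (storage, storage_name) for the profile browser, mirroring the ProfileOps change below."""
        shared = os.environ.get("SHARED_PROFILE_STORAGE")
        if shared:
            # shared storage explicitly configured via the 'shared_profile_storage' helm value
            return None, shared
        if archive.storage and archive.storage.type == "default":
            # default storage: pass the storage object through, the crawl manager resolves the path
            return archive.storage, None
        # custom per-archive storage: reference it by name (the archive id); its path comes from the storage secret
        return None, str(archive.id)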
Ilya Kreymer authored 2022-05-19 18:40:41 -07:00, committed by GitHub
parent cdefb8d06e
commit 3df310ee4f
19 changed files with 154 additions and 40 deletions

View File

@@ -44,6 +44,7 @@ class S3Storage(BaseModel):
     access_key: str
     secret_key: str
     access_endpoint_url: Optional[str]
+    region: Optional[str] = ""


 # ============================================================================

View File

@@ -248,8 +248,12 @@ class CrawlConfigOps:
         crawlconfig = CrawlConfig.from_dict(data)

+        suffix = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}"
+
         # pylint: disable=line-too-long
-        out_filename = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}-@ts-@hostsuffix.wacz"
+        out_filename = (
+            f"data/{self.sanitize(crawlconfig.name)}-@id/{suffix}-@ts-@hostsuffix.wacz"
+        )

         new_name = await self.crawl_manager.add_crawl_config(
             crawlconfig=crawlconfig,
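For illustration only: with the new per-crawl prefix, a config named "My Crawl" created by user "admin" (both hypothetical, and assuming sanitize() lowercases and dash-joins names) would produce WACZ paths like the following; @id, @ts and @hostsuffix are placeholders the crawler substitutes at runtime:

    name = "my-crawl"          # assumed result of sanitize("My Crawl")
    suffix = "my-crawl-admin"  # assumed result of f"{sanitize('My Crawl')}-{sanitize('admin')}"

    out_filename = f"data/{name}-@id/{suffix}-@ts-@hostsuffix.wacz"
    print(out_filename)  # data/my-crawl-@id/my-crawl-admin-@ts-@hostsuffix.wacz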

View File

@@ -230,13 +230,14 @@ class CrawlOps:
         print(f"Duration: {dura}", flush=True)

-        await self.archives.inc_usage(crawl.aid, dura)
-
-        await self.crawl_configs.inc_crawls(
-            crawl.cid, crawl.id, crawl.finished, crawl.state
-        )
-
-        await self.delete_redis_keys(crawl)
+        # if crawl.finished:
+        if crawl.state == "complete":
+            await self.archives.inc_usage(crawl.aid, dura)
+            await self.crawl_configs.inc_crawls(
+                crawl.cid, crawl.id, crawl.finished, crawl.state
+            )
+            if crawl_file:
+                await self.delete_redis_keys(crawl)

         return True
@@ -317,9 +318,12 @@ class CrawlOps:
         crawls = []
+        running_ids = set()
+
         for crawl in running_crawls:
             list_crawl = ListCrawlOut(**crawl.dict())
             crawls.append(await self._resolve_crawl_refs(list_crawl, archive))
+            running_ids.add(list_crawl.id)

         if not running_only:
             aid = archive.id if archive else None
@@ -327,7 +331,9 @@ class CrawlOps:
                 aid=aid, exclude_files=True
             )
-            crawls.extend(finished_crawls)
+            for crawl in finished_crawls:
+                if crawl.id not in running_ids:
+                    crawls.append(crawl)

         return ListCrawls(crawls=crawls)
@@ -339,21 +345,34 @@ class CrawlOps:
             query["aid"] = archive.id

         res = await self.crawls.find_one(query)

-        if not res:
-            aid_str = archive.id_str if archive else None
-            crawl = await self.crawl_manager.get_running_crawl(crawlid, aid_str)
-            if crawl:
-                await self.get_redis_stats([crawl])
-                await self.cache_ips(crawl)
-
-        else:
+        crawl = None
+        completed = False
+
+        if res:
             files = [CrawlFile(**data) for data in res["files"]]
             del res["files"]
             res["resources"] = await self._resolve_signed_urls(files, archive)
             crawl = CrawlOut.from_dict(res)
+            completed = crawl.state == "complete"
+
+        if not completed:
+            aid_str = archive.id_str if archive else None
+            running_crawl = await self.crawl_manager.get_running_crawl(crawlid, aid_str)
+            if running_crawl:
+                await self.get_redis_stats([running_crawl])
+                await self.cache_ips(running_crawl)
+
+                if crawl:
+                    crawl.stats = running_crawl.stats
+                    # pylint: disable=invalid-name
+                    crawl.watchIPs = running_crawl.watchIPs
+                    crawl.scale = running_crawl.scale
+                    crawl.state = running_crawl.state
+                else:
+                    crawl = running_crawl

         if not crawl:
             raise HTTPException(status_code=404, detail=f"Crawl not found: {crawlid}")
@@ -383,7 +402,7 @@ class CrawlOps:
         async with self.redis.pipeline(transaction=True) as pipe:
             for file_ in files:
-                pipe.get(f"{file_.filename}")
+                pipe.get(f"f:{file_.filename}")

             results = await pipe.execute()
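The lookup above now reads cached presigned URLs under an "f:"-prefixed key. A minimal sketch of that convention, assuming an asyncio redis client and that the writer side uses the same prefix (both helper names are hypothetical):

    async def cache_presigned_url(redis, filename, url, ttl_seconds):
        # write under the same "f:<filename>" key the pipeline above reads
        await redis.set(f"f:{filename}", url, ex=ttl_seconds)

    async def cached_presigned_url(redis, filename):
        return await redis.get(f"f:{filename}")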

View File

@@ -421,13 +421,18 @@ class DockerManager:
         self,
         userid,
         aid,
-        storage,
         command,
+        storage=None,
+        storage_name=None,
         baseprofile=None,
     ):
         """ Run browser for profile creation """

-        storage_name = storage.name
-        storage, storage_path = await self._get_storage_and_path(storage)
+        if storage_name:
+            storage = self.storages[storage_name]
+            storage_path = storage.path
+        else:
+            storage_name = storage.name
+            storage, storage_path = await self._get_storage_and_path(storage)

         env_vars = [
             f"STORE_USER={userid}",

View File

@@ -67,6 +67,12 @@ class K8SManager:
         else:
             self.crawl_volume["emptyDir"] = {}

+        crawl_node_type = os.environ.get("CRAWLER_NODE_TYPE")
+        if crawl_node_type:
+            self.crawl_node_selector = {"nodeType": crawl_node_type}
+        else:
+            self.crawl_node_selector = {}
+
         self.loop = asyncio.get_running_loop()
         self.loop.create_task(self.run_event_loop())
         self.loop.create_task(self.init_redis(self.redis_url))
@@ -172,7 +178,12 @@ class K8SManager:
         )

     async def add_crawl_config(
-        self, crawlconfig, storage, run_now, out_filename, profile_filename
+        self,
+        crawlconfig,
+        storage,
+        run_now,
+        out_filename,
+        profile_filename,
     ):
         """add new crawl as cron job, store crawl config in configmap"""
         cid = str(crawlconfig.id)
@@ -343,7 +354,7 @@ class K8SManager:
             return None, None

         manual = job.metadata.annotations.get("btrix.run.manual") == "1"
-        if manual and not self.no_delete_jobs:
+        if manual and not self.no_delete_jobs and crawlcomplete.completed:
             self.loop.create_task(self._delete_job(job.metadata.name))

         crawl = self._make_crawl_for_job(
@@ -389,12 +400,14 @@ class K8SManager:
             endpoint_url = self._secret_data(storage_secret, "STORE_ENDPOINT_URL")
             access_key = self._secret_data(storage_secret, "STORE_ACCESS_KEY")
             secret_key = self._secret_data(storage_secret, "STORE_SECRET_KEY")
+            region = self._secret_data(storage_secret, "STORE_REGION") or ""

             self._default_storages[name] = S3Storage(
                 access_key=access_key,
                 secret_key=secret_key,
                 endpoint_url=endpoint_url,
                 access_endpoint_url=access_endpoint_url,
+                region=region,
             )

         return self._default_storages[name]
@@ -542,17 +555,19 @@ class K8SManager:
         return True

     async def run_profile_browser(
-        self, userid, aid, storage, command, baseprofile=None
+        self, userid, aid, command, storage=None, storage_name=None, baseprofile=None
     ):
         """run browser for profile creation """

-        # Configure Annotations + Labels
-        if storage.type == "default":
+        # if default storage, use name and path + profiles/
+        if storage:
             storage_name = storage.name
-            storage_path = storage.path
+            storage_path = storage.path + "profiles/"
+        # otherwise, use storage name and existing path from secret
         else:
-            storage_name = aid
             storage_path = ""

+        # Configure Annotations + Labels
         labels = {
             "btrix.user": userid,
             "btrix.archive": aid,
@@ -560,7 +575,7 @@ class K8SManager:
         }

         if baseprofile:
-            labels["btrix.baseprofile"] = baseprofile
+            labels["btrix.baseprofile"] = str(baseprofile)

         await self.check_storage(storage_name)
@@ -825,7 +840,7 @@ class K8SManager:
         if profile_filename:
             command.append("--profile")
-            command.append(f"@{profile_filename}")
+            command.append(f"@profiles/{profile_filename}")

         job_template = {
             "metadata": {"annotations": annotations},
@@ -835,6 +850,7 @@ class K8SManager:
                 "template": {
                     "metadata": {"labels": labels},
                     "spec": {
+                        "nodeSelector": self.crawl_node_selector,
                         "containers": [
                             {
                                 "name": "crawler",
@@ -891,7 +907,7 @@ class K8SManager:
                             },
                             self.crawl_volume,
                         ],
-                        "restartPolicy": "Never",
+                        "restartPolicy": "OnFailure",
                         "terminationGracePeriodSeconds": self.grace_period,
                     },
                 },
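Taken together, the crawler job's pod template now carries a node selector derived from the CRAWLER_NODE_TYPE environment variable (set from the helm 'crawler_node_type' value). A minimal standalone sketch of that wiring; the example value is hypothetical:

    import os

    # e.g. CRAWLER_NODE_TYPE=crawling
    crawl_node_type = os.environ.get("CRAWLER_NODE_TYPE")
    crawl_node_selector = {"nodeType": crawl_node_type} if crawl_node_type else {}

    pod_spec = {
        # an empty selector leaves the crawler schedulable on any node
        "nodeSelector": crawl_node_selector,
        "restartPolicy": "OnFailure",
    }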

View File

@@ -4,6 +4,8 @@ from typing import Optional, List
 from datetime import datetime
 import uuid
 import asyncio
+import os

 from urllib.parse import urlencode

 from fastapi import APIRouter, Depends, Request, HTTPException
@@ -100,6 +102,8 @@ class ProfileOps:
         self.crawlconfigs = None

+        self.shared_profile_storage = os.environ.get("SHARED_PROFILE_STORAGE")
+
     def set_crawlconfigs(self, crawlconfigs):
         """ set crawlconfigs ops """
         self.crawlconfigs = crawlconfigs
@@ -116,16 +120,25 @@ class ProfileOps:
         """ Create new profile """
         command = await self.get_command(profile_launch, archive)

+        if self.shared_profile_storage:
+            storage_name = self.shared_profile_storage
+            storage = None
+        elif archive.storage and archive.storage.type == "default":
+            storage_name = None
+            storage = archive.storage
+        else:
+            storage_name = str(archive.id)
+            storage = None
+
         browserid = await self.crawl_manager.run_profile_browser(
             str(user.id),
             str(archive.id),
-            archive.storage,
             command,
-            baseprofile=str(profile_launch.profileId),
+            storage=storage,
+            storage_name=storage_name,
+            baseprofile=profile_launch.profileId,
         )

-        print("base profile", str(profile_launch.profileId))
-
         if not browserid:
             raise HTTPException(status_code=400, detail="browser_not_created")
@@ -231,7 +244,6 @@ class ProfileOps:
         baseid = browser_data.get("btrix.baseprofile")
         if baseid:
-            print("baseid", baseid)
             baseid = uuid.UUID(baseid)

         profile = Profile(

View File

@@ -71,7 +71,7 @@ async def get_s3_client(storage, use_access=False):

     async with session.create_client(
         "s3",
-        region_name="",
+        region_name=storage.region,
         endpoint_url=endpoint_url,
         aws_access_key_id=storage.access_key,
         aws_secret_access_key=storage.secret_key,
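With the new optional `region` field on S3Storage, the configured region now flows straight into the S3 client. A condensed sketch, assuming the aiobotocore 2.x `get_session()` import path and eliding the store-vs-access endpoint selection that get_s3_client also performs:

    from contextlib import asynccontextmanager

    from aiobotocore.session import get_session

    @asynccontextmanager
    async def s3_client(storage):
        session = get_session()
        # region defaults to "" when not configured, matching the S3Storage field default
        async with session.create_client(
            "s3",
            region_name=storage.region,
            endpoint_url=storage.endpoint_url,
            aws_access_key_id=storage.access_key,
            aws_secret_access_key=storage.secret_key,
        ) as client:
            yield client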

View File

@@ -24,6 +24,11 @@ spec:
       {{- end }}
     spec:
+      {{- if .Values.main_node_type }}
+      nodeSelector:
+        nodeType: {{ .Values.main_node_type }}
+      {{- end }}
+
       initContainers:
         {{- if .Values.minio_local }}
         - name: init-bucket

View File

@@ -27,6 +27,8 @@ data:
   CRAWLER_PV_CLAIM: "{{ .Values.crawler_pv_claim }}"
   {{- end }}

+  CRAWLER_NODE_TYPE: "{{ .Values.crawler_node_type }}"
+
   REDIS_URL: "{{ .Values.redis_url }}"

   REDIS_CRAWLS_DONE_KEY: "crawls-done"

View File

@@ -25,6 +25,11 @@ spec:
     spec:
+      {{- if .Values.main_node_type }}
+      nodeSelector:
+        nodeType: {{ .Values.main_node_type }}
+      {{- end }}
+
       volumes:
         - name: nginx-resolver
           emptyDir: {}

View File

@@ -5,6 +5,8 @@ kind: PersistentVolumeClaim
 apiVersion: v1
 metadata:
   name: minio-storage-pvc
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   accessModes:
     - ReadWriteOnce
@@ -24,6 +26,8 @@ apiVersion: v1
 kind: PersistentVolume
 metadata:
   name: "local-minio-store-pv"
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   capacity:
     storage: 5Gi
@@ -54,6 +58,11 @@ spec:
         app: local-minio

     spec:
+      {{- if .Values.main_node_type }}
+      nodeSelector:
+        nodeType: {{ .Values.main_node_type }}
+      {{- end }}
+
       volumes:
         - name: data-minio
           persistentVolumeClaim:

View File

@@ -22,6 +22,8 @@ kind: PersistentVolumeClaim
 apiVersion: v1
 metadata:
   name: mongo-storage-pvc
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   accessModes:
     - ReadWriteOnce
@@ -41,6 +43,8 @@ apiVersion: v1
 kind: PersistentVolume
 metadata:
   name: "local-mongo-store-pv"
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   capacity:
     storage: 2Gi
@@ -69,6 +73,11 @@ spec:
         app: local-mongo

     spec:
+      {{- if .Values.main_node_type }}
+      nodeSelector:
+        nodeType: {{ .Values.main_node_type }}
+      {{- end }}
+
       volumes:
         - name: data-db
           persistentVolumeClaim:

View File

@@ -6,3 +6,4 @@ metadata:
     release: {{ .Release.Name }}
   annotations:
     "helm.sh/resource-policy": keep
+

View File

@@ -5,6 +5,8 @@ kind: PersistentVolumeClaim
 apiVersion: v1
 metadata:
   name: redis-storage-pvc
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   accessModes:
     - ReadWriteOnce
@@ -24,6 +26,8 @@ apiVersion: v1
 kind: PersistentVolume
 metadata:
   name: "local-redis-store-pv"
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   capacity:
     storage: 1Gi

View File

@@ -6,7 +6,7 @@ metadata:
   name: crawler-run
 rules:
 - apiGroups: [""]
-  resources: ["pods", "pods/exec", "pods/log", "services", "configmaps", "secrets", "events"]
+  resources: ["pods", "pods/exec", "pods/log", "services", "configmaps", "secrets", "events", "persistentvolumeclaims"]
   verbs: ["get", "list", "watch", "create", "update", "patch", "delete", "deletecollection"]
 - apiGroups: ["batch", "extensions"]

View File

@@ -26,6 +26,8 @@ stringData:
   SUPERUSER_EMAIL: "{{ .Values.superuser.email }}"
   SUPERUSER_PASSWORD: "{{ .Values.superuser.password }}"

+  SHARED_PROFILE_STORAGE: "{{ .Values.shared_profile_storage }}"
+
 {{- range $storage := .Values.storages }}
 ---
 apiVersion: v1
@@ -53,6 +55,8 @@ stringData:
   STORE_ACCESS_ENDPOINT_URL: "{{ $storage.endpoint_url }}"
   {{- end }}

+  STORE_REGION: {{ $storage.region | default "" }}
+
 {{- if $.Values.signer.auth_token }}
   WACZ_SIGN_TOKEN: "{{ $.Values.signer.auth_token }}"
   WACZ_SIGN_URL: "http://auth-signer.default:5053/sign"

View File

@@ -42,6 +42,8 @@ kind: PersistentVolumeClaim
 apiVersion: v1
 metadata:
   name: signer-storage-pvc
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   accessModes:
     - ReadWriteOnce
@@ -61,6 +63,8 @@ apiVersion: v1
 kind: PersistentVolume
 metadata:
   name: "signer-store-pv"
+  annotations:
+    "helm.sh/resource-policy": keep

 spec:
   capacity:
     storage: 1Gi
@@ -95,6 +99,11 @@ spec:
       {{- end }}
     spec:
+      {{- if .Values.main_node_type }}
+      nodeSelector:
+        nodeType: {{ .Values.main_node_type }}
+      {{- end }}
+
       volumes:
         - name: signer-config
           secret:

View File

@@ -6,6 +6,11 @@ name: browsertrix-cloud
 # keep empty to use hostPath (eg. on minikube)
 volume_storage_class:

+# if set, set the node selector 'nodeType' for deployment pods
+# main_node_type:
+
+# if set, set the node selector 'nodeType' to this crawling pods
+# crawler_node_type:
+
 registration_enabled: 1
 jwt_token_lifetime_minutes: 60
@@ -147,6 +152,10 @@ storages:
     endpoint_url: "http://local-minio.default:9000/"

+# optional: if above includes a separate storage for profiles, specify here to store profiles separately from wacz files
+# may be useful if, for example, the wacz files are public, while profiles should not be
+# shared_storage_profile:
+
 # Email Options
 # =========================================
@@ -178,7 +187,7 @@ signer:
   enabled: false
   # host: <set to signer domain>
   # cert_email: "test@example.com
-  # image: webrecorder/authsign:0.3.1
+  # image: webrecorder/authsign:0.4.0
   # image_pull_policy: "IfNotPresent"
   # auth_token: <set to custom value>

View File

@@ -34,8 +34,8 @@ services:
     image: redis
     command: redis-server --appendonly yes

-    ports:
-      - 6379:6379
+    #ports:
+    # - 6379:6379

     volumes:
       - btrix-redis-data:/data
@@ -80,7 +80,7 @@ services:
   # enable to support signing of wacz files
   # port 80 must be open to automatically generate cert via LetsEncrypt
   authsign:
-    image: webrecorder/authsign:0.3.1
+    image: webrecorder/authsign:0.4.0

     volumes:
       - btrix-sign-data:/data