Backend: Crawls with Multiple WACZ files + Profile + Misc Fixes (#232)

* backend: k8s:
- support crawls with multiple WACZ files; don't assume a crawl is complete after the first WACZ is uploaded
- if a crawl is running and already has a WACZ file, still show it as running
- allow configuring the node selector for main pods (e.g. nodeType=main) and for crawler pods (e.g. nodeType=crawling)
- profiles: support uploading to alternate storage when the 'shared_profile_storage' value is set
- misc fixes for profiles

* backend: ensure the docker run_profile API matches k8s
k8s chart: don't delete PVCs and PVs in the helm chart

* dependency: bump authsign to 0.4.0
docker: disable public redis port

* profiles: fix profile path and profile browser return value

* fix typo in presigned URL caching
Ilya Kreymer 2022-05-19 18:40:41 -07:00 committed by GitHub
parent cdefb8d06e
commit 3df310ee4f
19 changed files with 154 additions and 40 deletions

View File

@ -44,6 +44,7 @@ class S3Storage(BaseModel):
access_key: str
secret_key: str
access_endpoint_url: Optional[str]
region: Optional[str] = ""
# ============================================================================

View File

@ -248,8 +248,12 @@ class CrawlConfigOps:
crawlconfig = CrawlConfig.from_dict(data)
suffix = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}"
# pylint: disable=line-too-long
out_filename = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}-@ts-@hostsuffix.wacz"
out_filename = (
f"data/{self.sanitize(crawlconfig.name)}-@id/{suffix}-@ts-@hostsuffix.wacz"
)
new_name = await self.crawl_manager.add_crawl_config(
crawlconfig=crawlconfig,

View File

@ -230,13 +230,14 @@ class CrawlOps:
print(f"Duration: {dura}", flush=True)
await self.archives.inc_usage(crawl.aid, dura)
# if crawl.finished:
if crawl.state == "complete":
await self.archives.inc_usage(crawl.aid, dura)
await self.crawl_configs.inc_crawls(
crawl.cid, crawl.id, crawl.finished, crawl.state
)
await self.crawl_configs.inc_crawls(
crawl.cid, crawl.id, crawl.finished, crawl.state
)
if crawl_file:
await self.delete_redis_keys(crawl)
return True
@ -317,9 +318,12 @@ class CrawlOps:
crawls = []
running_ids = set()
for crawl in running_crawls:
list_crawl = ListCrawlOut(**crawl.dict())
crawls.append(await self._resolve_crawl_refs(list_crawl, archive))
running_ids.add(list_crawl.id)
if not running_only:
aid = archive.id if archive else None
@ -327,7 +331,9 @@ class CrawlOps:
aid=aid, exclude_files=True
)
crawls.extend(finished_crawls)
for crawl in finished_crawls:
if crawl.id not in running_ids:
crawls.append(crawl)
return ListCrawls(crawls=crawls)
@ -339,21 +345,34 @@ class CrawlOps:
query["aid"] = archive.id
res = await self.crawls.find_one(query)
crawl = None
completed = False
if not res:
aid_str = archive.id_str if archive else None
crawl = await self.crawl_manager.get_running_crawl(crawlid, aid_str)
if crawl:
await self.get_redis_stats([crawl])
await self.cache_ips(crawl)
else:
if res:
files = [CrawlFile(**data) for data in res["files"]]
del res["files"]
res["resources"] = await self._resolve_signed_urls(files, archive)
crawl = CrawlOut.from_dict(res)
completed = crawl.state == "complete"
if not completed:
aid_str = archive.id_str if archive else None
running_crawl = await self.crawl_manager.get_running_crawl(crawlid, aid_str)
if running_crawl:
await self.get_redis_stats([running_crawl])
await self.cache_ips(running_crawl)
if crawl:
crawl.stats = running_crawl.stats
# pylint: disable=invalid-name
crawl.watchIPs = running_crawl.watchIPs
crawl.scale = running_crawl.scale
crawl.state = running_crawl.state
else:
crawl = running_crawl
if not crawl:
raise HTTPException(status_code=404, detail=f"Crawl not found: {crawlid}")
@ -383,7 +402,7 @@ class CrawlOps:
async with self.redis.pipeline(transaction=True) as pipe:
for file_ in files:
pipe.get(f"{file_.filename}")
pipe.get(f"f:{file_.filename}")
results = await pipe.execute()
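
Not part of this diff: a minimal sketch of the namespaced cache pattern the fix above relies on, using a redis-py asyncio client. The helper names and the one-hour TTL are illustrative assumptions; only the "f:" key prefix comes from the change itself.

from redis import asyncio as aioredis

async def cache_presigned_url(redis: aioredis.Redis, filename: str, url: str):
    # write side: prefix keys with "f:" so cached file URLs can't collide
    # with other keys stored in the same redis database
    await redis.set(f"f:{filename}", url, ex=3600)

async def get_presigned_urls(redis: aioredis.Redis, filenames: list):
    # read side: queue one GET per file and run them in a single round trip,
    # using the same "f:" prefix as the write side
    async with redis.pipeline(transaction=True) as pipe:
        for name in filenames:
            pipe.get(f"f:{name}")
        return await pipe.execute()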

View File

@ -421,13 +421,18 @@ class DockerManager:
self,
userid,
aid,
storage,
command,
storage=None,
storage_name=None,
baseprofile=None,
):
""" Run browser for profile creation """
storage_name = storage.name
storage, storage_path = await self._get_storage_and_path(storage)
if storage_name:
storage = self.storages[storage_name]
storage_path = storage.path
else:
storage_name = storage.name
storage, storage_path = await self._get_storage_and_path(storage)
env_vars = [
f"STORE_USER={userid}",

View File

@ -67,6 +67,12 @@ class K8SManager:
else:
self.crawl_volume["emptyDir"] = {}
crawl_node_type = os.environ.get("CRAWLER_NODE_TYPE")
if crawl_node_type:
self.crawl_node_selector = {"nodeType": crawl_node_type}
else:
self.crawl_node_selector = {}
self.loop = asyncio.get_running_loop()
self.loop.create_task(self.run_event_loop())
self.loop.create_task(self.init_redis(self.redis_url))
@ -172,7 +178,12 @@ class K8SManager:
)
async def add_crawl_config(
self, crawlconfig, storage, run_now, out_filename, profile_filename
self,
crawlconfig,
storage,
run_now,
out_filename,
profile_filename,
):
"""add new crawl as cron job, store crawl config in configmap"""
cid = str(crawlconfig.id)
@ -343,7 +354,7 @@ class K8SManager:
return None, None
manual = job.metadata.annotations.get("btrix.run.manual") == "1"
if manual and not self.no_delete_jobs:
if manual and not self.no_delete_jobs and crawlcomplete.completed:
self.loop.create_task(self._delete_job(job.metadata.name))
crawl = self._make_crawl_for_job(
@ -389,12 +400,14 @@ class K8SManager:
endpoint_url = self._secret_data(storage_secret, "STORE_ENDPOINT_URL")
access_key = self._secret_data(storage_secret, "STORE_ACCESS_KEY")
secret_key = self._secret_data(storage_secret, "STORE_SECRET_KEY")
region = self._secret_data(storage_secret, "STORE_REGION") or ""
self._default_storages[name] = S3Storage(
access_key=access_key,
secret_key=secret_key,
endpoint_url=endpoint_url,
access_endpoint_url=access_endpoint_url,
region=region,
)
return self._default_storages[name]
@ -542,17 +555,19 @@ class K8SManager:
return True
async def run_profile_browser(
self, userid, aid, storage, command, baseprofile=None
self, userid, aid, command, storage=None, storage_name=None, baseprofile=None
):
"""run browser for profile creation """
# Configure Annotations + Labels
if storage.type == "default":
# if default storage, use name and path + profiles/
if storage:
storage_name = storage.name
storage_path = storage.path
storage_path = storage.path + "profiles/"
# otherwise, use storage name and existing path from secret
else:
storage_name = aid
storage_path = ""
# Configure Annotations + Labels
labels = {
"btrix.user": userid,
"btrix.archive": aid,
@ -560,7 +575,7 @@ class K8SManager:
}
if baseprofile:
labels["btrix.baseprofile"] = baseprofile
labels["btrix.baseprofile"] = str(baseprofile)
await self.check_storage(storage_name)
@ -825,7 +840,7 @@ class K8SManager:
if profile_filename:
command.append("--profile")
command.append(f"@{profile_filename}")
command.append(f"@profiles/{profile_filename}")
job_template = {
"metadata": {"annotations": annotations},
@ -835,6 +850,7 @@ class K8SManager:
"template": {
"metadata": {"labels": labels},
"spec": {
"nodeSelector": self.crawl_node_selector,
"containers": [
{
"name": "crawler",
@ -891,7 +907,7 @@ class K8SManager:
},
self.crawl_volume,
],
"restartPolicy": "Never",
"restartPolicy": "OnFailure",
"terminationGracePeriodSeconds": self.grace_period,
},
},

View File

@ -4,6 +4,8 @@ from typing import Optional, List
from datetime import datetime
import uuid
import asyncio
import os
from urllib.parse import urlencode
from fastapi import APIRouter, Depends, Request, HTTPException
@ -100,6 +102,8 @@ class ProfileOps:
self.crawlconfigs = None
self.shared_profile_storage = os.environ.get("SHARED_PROFILE_STORAGE")
def set_crawlconfigs(self, crawlconfigs):
""" set crawlconfigs ops """
self.crawlconfigs = crawlconfigs
@ -116,16 +120,25 @@ class ProfileOps:
""" Create new profile """
command = await self.get_command(profile_launch, archive)
if self.shared_profile_storage:
storage_name = self.shared_profile_storage
storage = None
elif archive.storage and archive.storage.type == "default":
storage_name = None
storage = archive.storage
else:
storage_name = str(archive.id)
storage = None
browserid = await self.crawl_manager.run_profile_browser(
str(user.id),
str(archive.id),
archive.storage,
command,
baseprofile=str(profile_launch.profileId),
storage=storage,
storage_name=storage_name,
baseprofile=profile_launch.profileId,
)
print("base profile", str(profile_launch.profileId))
if not browserid:
raise HTTPException(status_code=400, detail="browser_not_created")
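
For readability, a condensed sketch of the storage-selection rules introduced above (a restatement of the hunk, not code from the repo; the helper name is made up):

def resolve_profile_storage(shared_profile_storage, archive):
    """ Return (storage, storage_name) for run_profile_browser """
    if shared_profile_storage:
        # 1. an operator-configured shared profile storage takes precedence
        return None, shared_profile_storage
    if archive.storage and archive.storage.type == "default":
        # 2. default storage: pass the storage object itself
        return archive.storage, None
    # 3. otherwise, reference the archive's own custom storage by name
    return None, str(archive.id)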
@ -231,7 +244,6 @@ class ProfileOps:
baseid = browser_data.get("btrix.baseprofile")
if baseid:
print("baseid", baseid)
baseid = uuid.UUID(baseid)
profile = Profile(

View File

@ -71,7 +71,7 @@ async def get_s3_client(storage, use_access=False):
async with session.create_client(
"s3",
region_name="",
region_name=storage.region,
endpoint_url=endpoint_url,
aws_access_key_id=storage.access_key,
aws_secret_access_key=storage.secret_key,
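
Not part of this diff: a minimal sketch of how the new optional region flows into client creation, assuming aiobotocore's get_session/create_client API; the function and bucket names are illustrative.

from aiobotocore.session import get_session

async def check_bucket(storage, bucket: str):
    session = get_session()
    async with session.create_client(
        "s3",
        # an unset region stays "", preserving the old behavior; a configured
        # STORE_REGION value now flows through S3Storage.region to the client
        region_name=storage.region or "",
        endpoint_url=storage.endpoint_url,
        aws_access_key_id=storage.access_key,
        aws_secret_access_key=storage.secret_key,
    ) as client:
        # any API call works once the client is created; listing one key is
        # just a connectivity check
        return await client.list_objects_v2(Bucket=bucket, MaxKeys=1)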

View File

@ -24,6 +24,11 @@ spec:
{{- end }}
spec:
{{- if .Values.main_node_type }}
nodeSelector:
nodeType: {{ .Values.main_node_type }}
{{- end }}
initContainers:
{{- if .Values.minio_local }}
- name: init-bucket

View File

@ -27,6 +27,8 @@ data:
CRAWLER_PV_CLAIM: "{{ .Values.crawler_pv_claim }}"
{{- end }}
CRAWLER_NODE_TYPE: "{{ .Values.crawler_node_type }}"
REDIS_URL: "{{ .Values.redis_url }}"
REDIS_CRAWLS_DONE_KEY: "crawls-done"

View File

@ -25,6 +25,11 @@ spec:
spec:
{{- if .Values.main_node_type }}
nodeSelector:
nodeType: {{ .Values.main_node_type }}
{{- end }}
volumes:
- name: nginx-resolver
emptyDir: {}

View File

@ -5,6 +5,8 @@ kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: minio-storage-pvc
annotations:
"helm.sh/resource-policy": keep
spec:
accessModes:
- ReadWriteOnce
@ -24,6 +26,8 @@ apiVersion: v1
kind: PersistentVolume
metadata:
name: "local-minio-store-pv"
annotations:
"helm.sh/resource-policy": keep
spec:
capacity:
storage: 5Gi
@ -54,6 +58,11 @@ spec:
app: local-minio
spec:
{{- if .Values.main_node_type }}
nodeSelector:
nodeType: {{ .Values.main_node_type }}
{{- end }}
volumes:
- name: data-minio
persistentVolumeClaim:

View File

@ -22,6 +22,8 @@ kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: mongo-storage-pvc
annotations:
"helm.sh/resource-policy": keep
spec:
accessModes:
- ReadWriteOnce
@ -41,6 +43,8 @@ apiVersion: v1
kind: PersistentVolume
metadata:
name: "local-mongo-store-pv"
annotations:
"helm.sh/resource-policy": keep
spec:
capacity:
storage: 2Gi
@ -69,6 +73,11 @@ spec:
app: local-mongo
spec:
{{- if .Values.main_node_type }}
nodeSelector:
nodeType: {{ .Values.main_node_type }}
{{- end }}
volumes:
- name: data-db
persistentVolumeClaim:

View File

@ -6,3 +6,4 @@ metadata:
release: {{ .Release.Name }}
annotations:
"helm.sh/resource-policy": keep

View File

@ -5,6 +5,8 @@ kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: redis-storage-pvc
annotations:
"helm.sh/resource-policy": keep
spec:
accessModes:
- ReadWriteOnce
@ -24,6 +26,8 @@ apiVersion: v1
kind: PersistentVolume
metadata:
name: "local-redis-store-pv"
annotations:
"helm.sh/resource-policy": keep
spec:
capacity:
storage: 1Gi

View File

@ -6,7 +6,7 @@ metadata:
name: crawler-run
rules:
- apiGroups: [""]
resources: ["pods", "pods/exec", "pods/log", "services", "configmaps", "secrets", "events"]
resources: ["pods", "pods/exec", "pods/log", "services", "configmaps", "secrets", "events", "persistentvolumeclaims"]
verbs: ["get", "list", "watch", "create", "update", "patch", "delete", "deletecollection"]
- apiGroups: ["batch", "extensions"]

View File

@ -26,6 +26,8 @@ stringData:
SUPERUSER_EMAIL: "{{ .Values.superuser.email }}"
SUPERUSER_PASSWORD: "{{ .Values.superuser.password }}"
SHARED_PROFILE_STORAGE: "{{ .Values.shared_profile_storage }}"
{{- range $storage := .Values.storages }}
---
apiVersion: v1
@ -53,6 +55,8 @@ stringData:
STORE_ACCESS_ENDPOINT_URL: "{{ $storage.endpoint_url }}"
{{- end }}
STORE_REGION: {{ $storage.region | default "" }}
{{- if $.Values.signer.auth_token }}
WACZ_SIGN_TOKEN: "{{ $.Values.signer.auth_token }}"
WACZ_SIGN_URL: "http://auth-signer.default:5053/sign"

View File

@ -42,6 +42,8 @@ kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: signer-storage-pvc
annotations:
"helm.sh/resource-policy": keep
spec:
accessModes:
- ReadWriteOnce
@ -61,6 +63,8 @@ apiVersion: v1
kind: PersistentVolume
metadata:
name: "signer-store-pv"
annotations:
"helm.sh/resource-policy": keep
spec:
capacity:
storage: 1Gi
@ -95,6 +99,11 @@ spec:
{{- end }}
spec:
{{- if .Values.main_node_type }}
nodeSelector:
nodeType: {{ .Values.main_node_type }}
{{- end }}
volumes:
- name: signer-config
secret:

View File

@ -6,6 +6,11 @@ name: browsertrix-cloud
# keep empty to use hostPath (eg. on minikube)
volume_storage_class:
# if set, set the node selector 'nodeType' for deployment pods
# main_node_type:
# if set, set the node selector 'nodeType' for crawling pods
# crawler_node_type:
registration_enabled: 1
jwt_token_lifetime_minutes: 60
@ -147,6 +152,10 @@ storages:
endpoint_url: "http://local-minio.default:9000/"
# optional: if the storages above include a separate storage for profiles, specify its name here to store profiles separately from wacz files
# may be useful if, for example, the wacz files are public, while profiles should not be
# shared_profile_storage:
# Email Options
# =========================================
@ -178,7 +187,7 @@ signer:
enabled: false
# host: <set to signer domain>
# cert_email: "test@example.com
# image: webrecorder/authsign:0.3.1
# image: webrecorder/authsign:0.4.0
# image_pull_policy: "IfNotPresent"
# auth_token: <set to custom value>

View File

@ -34,8 +34,8 @@ services:
image: redis
command: redis-server --appendonly yes
ports:
- 6379:6379
#ports:
# - 6379:6379
volumes:
- btrix-redis-data:/data
@ -80,7 +80,7 @@ services:
# enable to support signing of wacz files
# port 80 must be open to automatically generate cert via LetsEncrypt
authsign:
image: webrecorder/authsign:0.3.1
image: webrecorder/authsign:0.4.0
volumes:
- btrix-sign-data:/data