Backend: Crawls with Multiple WACZ files + Profile + Misc Fixes (#232)
* backend: k8s: - support crawls with multiple wacz files, don't assume crawl complete after first wacz uploaded - if crawl is running and has wacz file, still show as running - k8s: allow configuring node selector for main pods (eg. nodeType=main) and for crawlers (eg. nodeType=crawling) - profiles: support uploading to alternate storage specified via the 'shared_profile_storage' value, if set - misc fixes for profiles * backend: ensure docker run_profile api matches k8s k8s chart: don't delete pvc and pv in helm chart * dependency: bump authsign to 0.4.0 docker: disable public redis port * profiles: fix path, profile browser return value * fix typo in presigned url caching
This commit is contained in:
parent
cdefb8d06e
commit
3df310ee4f
@ -44,6 +44,7 @@ class S3Storage(BaseModel):
|
||||
access_key: str
|
||||
secret_key: str
|
||||
access_endpoint_url: Optional[str]
|
||||
region: Optional[str] = ""
|
||||
|
||||
|
||||
# ============================================================================
|
||||
|
@ -248,8 +248,12 @@ class CrawlConfigOps:
|
||||
|
||||
crawlconfig = CrawlConfig.from_dict(data)
|
||||
|
||||
suffix = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}"
|
||||
|
||||
# pylint: disable=line-too-long
|
||||
out_filename = f"{self.sanitize(crawlconfig.name)}-{self.sanitize(user.name)}-@ts-@hostsuffix.wacz"
|
||||
out_filename = (
|
||||
f"data/{self.sanitize(crawlconfig.name)}-@id/{suffix}-@ts-@hostsuffix.wacz"
|
||||
)
|
||||
|
||||
new_name = await self.crawl_manager.add_crawl_config(
|
||||
crawlconfig=crawlconfig,
|
||||
|
@ -230,13 +230,14 @@ class CrawlOps:
|
||||
|
||||
print(f"Duration: {dura}", flush=True)
|
||||
|
||||
await self.archives.inc_usage(crawl.aid, dura)
|
||||
# if crawl.finished:
|
||||
if crawl.state == "complete":
|
||||
await self.archives.inc_usage(crawl.aid, dura)
|
||||
|
||||
await self.crawl_configs.inc_crawls(
|
||||
crawl.cid, crawl.id, crawl.finished, crawl.state
|
||||
)
|
||||
await self.crawl_configs.inc_crawls(
|
||||
crawl.cid, crawl.id, crawl.finished, crawl.state
|
||||
)
|
||||
|
||||
if crawl_file:
|
||||
await self.delete_redis_keys(crawl)
|
||||
|
||||
return True
|
||||
@ -317,9 +318,12 @@ class CrawlOps:
|
||||
|
||||
crawls = []
|
||||
|
||||
running_ids = set()
|
||||
|
||||
for crawl in running_crawls:
|
||||
list_crawl = ListCrawlOut(**crawl.dict())
|
||||
crawls.append(await self._resolve_crawl_refs(list_crawl, archive))
|
||||
running_ids.add(list_crawl.id)
|
||||
|
||||
if not running_only:
|
||||
aid = archive.id if archive else None
|
||||
@ -327,7 +331,9 @@ class CrawlOps:
|
||||
aid=aid, exclude_files=True
|
||||
)
|
||||
|
||||
crawls.extend(finished_crawls)
|
||||
for crawl in finished_crawls:
|
||||
if crawl.id not in running_ids:
|
||||
crawls.append(crawl)
|
||||
|
||||
return ListCrawls(crawls=crawls)
|
||||
|
||||
@ -339,21 +345,34 @@ class CrawlOps:
|
||||
query["aid"] = archive.id
|
||||
|
||||
res = await self.crawls.find_one(query)
|
||||
crawl = None
|
||||
completed = False
|
||||
|
||||
if not res:
|
||||
aid_str = archive.id_str if archive else None
|
||||
crawl = await self.crawl_manager.get_running_crawl(crawlid, aid_str)
|
||||
if crawl:
|
||||
await self.get_redis_stats([crawl])
|
||||
await self.cache_ips(crawl)
|
||||
|
||||
else:
|
||||
if res:
|
||||
files = [CrawlFile(**data) for data in res["files"]]
|
||||
|
||||
del res["files"]
|
||||
|
||||
res["resources"] = await self._resolve_signed_urls(files, archive)
|
||||
crawl = CrawlOut.from_dict(res)
|
||||
completed = crawl.state == "complete"
|
||||
|
||||
if not completed:
|
||||
aid_str = archive.id_str if archive else None
|
||||
running_crawl = await self.crawl_manager.get_running_crawl(crawlid, aid_str)
|
||||
if running_crawl:
|
||||
await self.get_redis_stats([running_crawl])
|
||||
await self.cache_ips(running_crawl)
|
||||
|
||||
if crawl:
|
||||
crawl.stats = running_crawl.stats
|
||||
# pylint: disable=invalid-name
|
||||
crawl.watchIPs = running_crawl.watchIPs
|
||||
crawl.scale = running_crawl.scale
|
||||
crawl.state = running_crawl.state
|
||||
|
||||
else:
|
||||
crawl = running_crawl
|
||||
|
||||
if not crawl:
|
||||
raise HTTPException(status_code=404, detail=f"Crawl not found: {crawlid}")
|
||||
@ -383,7 +402,7 @@ class CrawlOps:
|
||||
|
||||
async with self.redis.pipeline(transaction=True) as pipe:
|
||||
for file_ in files:
|
||||
pipe.get(f"{file_.filename}")
|
||||
pipe.get(f"f:{file_.filename}")
|
||||
|
||||
results = await pipe.execute()
|
||||
|
||||
|
@ -421,13 +421,18 @@ class DockerManager:
|
||||
self,
|
||||
userid,
|
||||
aid,
|
||||
storage,
|
||||
command,
|
||||
storage=None,
|
||||
storage_name=None,
|
||||
baseprofile=None,
|
||||
):
|
||||
""" Run browser for profile creation """
|
||||
storage_name = storage.name
|
||||
storage, storage_path = await self._get_storage_and_path(storage)
|
||||
if storage_name:
|
||||
storage = self.storages[storage_name]
|
||||
storage_path = storage.path
|
||||
else:
|
||||
storage_name = storage.name
|
||||
storage, storage_path = await self._get_storage_and_path(storage)
|
||||
|
||||
env_vars = [
|
||||
f"STORE_USER={userid}",
|
||||
|
@ -67,6 +67,12 @@ class K8SManager:
|
||||
else:
|
||||
self.crawl_volume["emptyDir"] = {}
|
||||
|
||||
crawl_node_type = os.environ.get("CRAWLER_NODE_TYPE")
|
||||
if crawl_node_type:
|
||||
self.crawl_node_selector = {"nodeType": crawl_node_type}
|
||||
else:
|
||||
self.crawl_node_selector = {}
|
||||
|
||||
self.loop = asyncio.get_running_loop()
|
||||
self.loop.create_task(self.run_event_loop())
|
||||
self.loop.create_task(self.init_redis(self.redis_url))
|
||||
@ -172,7 +178,12 @@ class K8SManager:
|
||||
)
|
||||
|
||||
async def add_crawl_config(
|
||||
self, crawlconfig, storage, run_now, out_filename, profile_filename
|
||||
self,
|
||||
crawlconfig,
|
||||
storage,
|
||||
run_now,
|
||||
out_filename,
|
||||
profile_filename,
|
||||
):
|
||||
"""add new crawl as cron job, store crawl config in configmap"""
|
||||
cid = str(crawlconfig.id)
|
||||
@ -343,7 +354,7 @@ class K8SManager:
|
||||
return None, None
|
||||
|
||||
manual = job.metadata.annotations.get("btrix.run.manual") == "1"
|
||||
if manual and not self.no_delete_jobs:
|
||||
if manual and not self.no_delete_jobs and crawlcomplete.completed:
|
||||
self.loop.create_task(self._delete_job(job.metadata.name))
|
||||
|
||||
crawl = self._make_crawl_for_job(
|
||||
@ -389,12 +400,14 @@ class K8SManager:
|
||||
endpoint_url = self._secret_data(storage_secret, "STORE_ENDPOINT_URL")
|
||||
access_key = self._secret_data(storage_secret, "STORE_ACCESS_KEY")
|
||||
secret_key = self._secret_data(storage_secret, "STORE_SECRET_KEY")
|
||||
region = self._secret_data(storage_secret, "STORE_REGION") or ""
|
||||
|
||||
self._default_storages[name] = S3Storage(
|
||||
access_key=access_key,
|
||||
secret_key=secret_key,
|
||||
endpoint_url=endpoint_url,
|
||||
access_endpoint_url=access_endpoint_url,
|
||||
region=region,
|
||||
)
|
||||
|
||||
return self._default_storages[name]
|
||||
@ -542,17 +555,19 @@ class K8SManager:
|
||||
return True
|
||||
|
||||
async def run_profile_browser(
|
||||
self, userid, aid, storage, command, baseprofile=None
|
||||
self, userid, aid, command, storage=None, storage_name=None, baseprofile=None
|
||||
):
|
||||
"""run browser for profile creation """
|
||||
# Configure Annotations + Labels
|
||||
if storage.type == "default":
|
||||
|
||||
# if default storage, use name and path + profiles/
|
||||
if storage:
|
||||
storage_name = storage.name
|
||||
storage_path = storage.path
|
||||
storage_path = storage.path + "profiles/"
|
||||
# otherwise, use storage name and existing path from secret
|
||||
else:
|
||||
storage_name = aid
|
||||
storage_path = ""
|
||||
|
||||
# Configure Annotations + Labels
|
||||
labels = {
|
||||
"btrix.user": userid,
|
||||
"btrix.archive": aid,
|
||||
@ -560,7 +575,7 @@ class K8SManager:
|
||||
}
|
||||
|
||||
if baseprofile:
|
||||
labels["btrix.baseprofile"] = baseprofile
|
||||
labels["btrix.baseprofile"] = str(baseprofile)
|
||||
|
||||
await self.check_storage(storage_name)
|
||||
|
||||
@ -825,7 +840,7 @@ class K8SManager:
|
||||
|
||||
if profile_filename:
|
||||
command.append("--profile")
|
||||
command.append(f"@{profile_filename}")
|
||||
command.append(f"@profiles/{profile_filename}")
|
||||
|
||||
job_template = {
|
||||
"metadata": {"annotations": annotations},
|
||||
@ -835,6 +850,7 @@ class K8SManager:
|
||||
"template": {
|
||||
"metadata": {"labels": labels},
|
||||
"spec": {
|
||||
"nodeSelector": self.crawl_node_selector,
|
||||
"containers": [
|
||||
{
|
||||
"name": "crawler",
|
||||
@ -891,7 +907,7 @@ class K8SManager:
|
||||
},
|
||||
self.crawl_volume,
|
||||
],
|
||||
"restartPolicy": "Never",
|
||||
"restartPolicy": "OnFailure",
|
||||
"terminationGracePeriodSeconds": self.grace_period,
|
||||
},
|
||||
},
|
||||
|
@ -4,6 +4,8 @@ from typing import Optional, List
|
||||
from datetime import datetime
|
||||
import uuid
|
||||
import asyncio
|
||||
import os
|
||||
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from fastapi import APIRouter, Depends, Request, HTTPException
|
||||
@ -100,6 +102,8 @@ class ProfileOps:
|
||||
|
||||
self.crawlconfigs = None
|
||||
|
||||
self.shared_profile_storage = os.environ.get("SHARED_PROFILE_STORAGE")
|
||||
|
||||
def set_crawlconfigs(self, crawlconfigs):
|
||||
""" set crawlconfigs ops """
|
||||
self.crawlconfigs = crawlconfigs
|
||||
@ -116,16 +120,25 @@ class ProfileOps:
|
||||
""" Create new profile """
|
||||
command = await self.get_command(profile_launch, archive)
|
||||
|
||||
if self.shared_profile_storage:
|
||||
storage_name = self.shared_profile_storage
|
||||
storage = None
|
||||
elif archive.storage and archive.storage.type == "default":
|
||||
storage_name = None
|
||||
storage = archive.storage
|
||||
else:
|
||||
storage_name = str(archive.id)
|
||||
storage = None
|
||||
|
||||
browserid = await self.crawl_manager.run_profile_browser(
|
||||
str(user.id),
|
||||
str(archive.id),
|
||||
archive.storage,
|
||||
command,
|
||||
baseprofile=str(profile_launch.profileId),
|
||||
storage=storage,
|
||||
storage_name=storage_name,
|
||||
baseprofile=profile_launch.profileId,
|
||||
)
|
||||
|
||||
print("base profile", str(profile_launch.profileId))
|
||||
|
||||
if not browserid:
|
||||
raise HTTPException(status_code=400, detail="browser_not_created")
|
||||
|
||||
@ -231,7 +244,6 @@ class ProfileOps:
|
||||
|
||||
baseid = browser_data.get("btrix.baseprofile")
|
||||
if baseid:
|
||||
print("baseid", baseid)
|
||||
baseid = uuid.UUID(baseid)
|
||||
|
||||
profile = Profile(
|
||||
|
@ -71,7 +71,7 @@ async def get_s3_client(storage, use_access=False):
|
||||
|
||||
async with session.create_client(
|
||||
"s3",
|
||||
region_name="",
|
||||
region_name=storage.region,
|
||||
endpoint_url=endpoint_url,
|
||||
aws_access_key_id=storage.access_key,
|
||||
aws_secret_access_key=storage.secret_key,
|
||||
|
@ -24,6 +24,11 @@ spec:
|
||||
{{- end }}
|
||||
|
||||
spec:
|
||||
{{- if .Values.main_node_type }}
|
||||
nodeSelector:
|
||||
nodeType: {{ .Values.main_node_type }}
|
||||
{{- end }}
|
||||
|
||||
initContainers:
|
||||
{{- if .Values.minio_local }}
|
||||
- name: init-bucket
|
||||
|
@ -27,6 +27,8 @@ data:
|
||||
CRAWLER_PV_CLAIM: "{{ .Values.crawler_pv_claim }}"
|
||||
{{- end }}
|
||||
|
||||
CRAWLER_NODE_TYPE: "{{ .Values.crawler_node_type }}"
|
||||
|
||||
REDIS_URL: "{{ .Values.redis_url }}"
|
||||
|
||||
REDIS_CRAWLS_DONE_KEY: "crawls-done"
|
||||
|
@ -25,6 +25,11 @@ spec:
|
||||
|
||||
|
||||
spec:
|
||||
{{- if .Values.main_node_type }}
|
||||
nodeSelector:
|
||||
nodeType: {{ .Values.main_node_type }}
|
||||
{{- end }}
|
||||
|
||||
volumes:
|
||||
- name: nginx-resolver
|
||||
emptyDir: {}
|
||||
|
@ -5,6 +5,8 @@ kind: PersistentVolumeClaim
|
||||
apiVersion: v1
|
||||
metadata:
|
||||
name: minio-storage-pvc
|
||||
annotations:
|
||||
"helm.sh/resource-policy": keep
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
@ -24,6 +26,8 @@ apiVersion: v1
|
||||
kind: PersistentVolume
|
||||
metadata:
|
||||
name: "local-minio-store-pv"
|
||||
annotations:
|
||||
"helm.sh/resource-policy": keep
|
||||
spec:
|
||||
capacity:
|
||||
storage: 5Gi
|
||||
@ -54,6 +58,11 @@ spec:
|
||||
app: local-minio
|
||||
|
||||
spec:
|
||||
{{- if .Values.main_node_type }}
|
||||
nodeSelector:
|
||||
nodeType: {{ .Values.main_node_type }}
|
||||
{{- end }}
|
||||
|
||||
volumes:
|
||||
- name: data-minio
|
||||
persistentVolumeClaim:
|
||||
|
@ -22,6 +22,8 @@ kind: PersistentVolumeClaim
|
||||
apiVersion: v1
|
||||
metadata:
|
||||
name: mongo-storage-pvc
|
||||
annotations:
|
||||
"helm.sh/resource-policy": keep
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
@ -41,6 +43,8 @@ apiVersion: v1
|
||||
kind: PersistentVolume
|
||||
metadata:
|
||||
name: "local-mongo-store-pv"
|
||||
annotations:
|
||||
"helm.sh/resource-policy": keep
|
||||
spec:
|
||||
capacity:
|
||||
storage: 2Gi
|
||||
@ -69,6 +73,11 @@ spec:
|
||||
app: local-mongo
|
||||
|
||||
spec:
|
||||
{{- if .Values.main_node_type }}
|
||||
nodeSelector:
|
||||
nodeType: {{ .Values.main_node_type }}
|
||||
{{- end }}
|
||||
|
||||
volumes:
|
||||
- name: data-db
|
||||
persistentVolumeClaim:
|
||||
|
@ -6,3 +6,4 @@ metadata:
|
||||
release: {{ .Release.Name }}
|
||||
annotations:
|
||||
"helm.sh/resource-policy": keep
|
||||
|
||||
|
@ -5,6 +5,8 @@ kind: PersistentVolumeClaim
|
||||
apiVersion: v1
|
||||
metadata:
|
||||
name: redis-storage-pvc
|
||||
annotations:
|
||||
"helm.sh/resource-policy": keep
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
@ -24,6 +26,8 @@ apiVersion: v1
|
||||
kind: PersistentVolume
|
||||
metadata:
|
||||
name: "local-redis-store-pv"
|
||||
annotations:
|
||||
"helm.sh/resource-policy": keep
|
||||
spec:
|
||||
capacity:
|
||||
storage: 1Gi
|
||||
|
@ -6,7 +6,7 @@ metadata:
|
||||
name: crawler-run
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["pods", "pods/exec", "pods/log", "services", "configmaps", "secrets", "events"]
|
||||
resources: ["pods", "pods/exec", "pods/log", "services", "configmaps", "secrets", "events", "persistentvolumeclaims"]
|
||||
verbs: ["get", "list", "watch", "create", "update", "patch", "delete", "deletecollection"]
|
||||
|
||||
- apiGroups: ["batch", "extensions"]
|
||||
|
@ -26,6 +26,8 @@ stringData:
|
||||
SUPERUSER_EMAIL: "{{ .Values.superuser.email }}"
|
||||
SUPERUSER_PASSWORD: "{{ .Values.superuser.password }}"
|
||||
|
||||
SHARED_PROFILE_STORAGE: "{{ .Values.shared_profile_storage }}"
|
||||
|
||||
{{- range $storage := .Values.storages }}
|
||||
---
|
||||
apiVersion: v1
|
||||
@ -53,6 +55,8 @@ stringData:
|
||||
STORE_ACCESS_ENDPOINT_URL: "{{ $storage.endpoint_url }}"
|
||||
{{- end }}
|
||||
|
||||
STORE_REGION: {{ $storage.region | default "" }}
|
||||
|
||||
{{- if $.Values.signer.auth_token }}
|
||||
WACZ_SIGN_TOKEN: "{{ $.Values.signer.auth_token }}"
|
||||
WACZ_SIGN_URL: "http://auth-signer.default:5053/sign"
|
||||
|
@ -42,6 +42,8 @@ kind: PersistentVolumeClaim
|
||||
apiVersion: v1
|
||||
metadata:
|
||||
name: signer-storage-pvc
|
||||
annotations:
|
||||
"helm.sh/resource-policy": keep
|
||||
spec:
|
||||
accessModes:
|
||||
- ReadWriteOnce
|
||||
@ -61,6 +63,8 @@ apiVersion: v1
|
||||
kind: PersistentVolume
|
||||
metadata:
|
||||
name: "signer-store-pv"
|
||||
annotations:
|
||||
"helm.sh/resource-policy": keep
|
||||
spec:
|
||||
capacity:
|
||||
storage: 1Gi
|
||||
@ -95,6 +99,11 @@ spec:
|
||||
{{- end }}
|
||||
|
||||
spec:
|
||||
{{- if .Values.main_node_type }}
|
||||
nodeSelector:
|
||||
nodeType: {{ .Values.main_node_type }}
|
||||
{{- end }}
|
||||
|
||||
volumes:
|
||||
- name: signer-config
|
||||
secret:
|
||||
|
@ -6,6 +6,11 @@ name: browsertrix-cloud
|
||||
# keep empty to use hostPath (eg. on minikube)
|
||||
volume_storage_class:
|
||||
|
||||
# if set, set the node selector 'nodeType' for deployment pods
|
||||
# main_node_type:
|
||||
|
||||
# if set, set the node selector 'nodeType' to this value for crawling pods
|
||||
# crawler_node_type:
|
||||
|
||||
registration_enabled: 1
|
||||
jwt_token_lifetime_minutes: 60
|
||||
@ -147,6 +152,10 @@ storages:
|
||||
|
||||
endpoint_url: "http://local-minio.default:9000/"
|
||||
|
||||
# optional: if above includes a separate storage for profiles, specify here to store profiles separately from wacz files
|
||||
# may be useful if, for example, the wacz files are public, while profiles should not be
|
||||
# shared_profile_storage:
|
||||
|
||||
|
||||
# Email Options
|
||||
# =========================================
|
||||
@ -178,7 +187,7 @@ signer:
|
||||
enabled: false
|
||||
# host: <set to signer domain>
|
||||
# cert_email: "test@example.com"
|
||||
# image: webrecorder/authsign:0.3.1
|
||||
# image: webrecorder/authsign:0.4.0
|
||||
# image_pull_policy: "IfNotPresent"
|
||||
# auth_token: <set to custom value>
|
||||
|
||||
|
@ -34,8 +34,8 @@ services:
|
||||
image: redis
|
||||
command: redis-server --appendonly yes
|
||||
|
||||
ports:
|
||||
- 6379:6379
|
||||
#ports:
|
||||
# - 6379:6379
|
||||
|
||||
volumes:
|
||||
- btrix-redis-data:/data
|
||||
@ -80,7 +80,7 @@ services:
|
||||
# enable to support signing of wacz files
|
||||
# port 80 must be open to automatically generate cert via LetsEncrypt
|
||||
authsign:
|
||||
image: webrecorder/authsign:0.3.1
|
||||
image: webrecorder/authsign:0.4.0
|
||||
|
||||
volumes:
|
||||
- btrix-sign-data:/data
|
||||
|
Loading…
Reference in New Issue
Block a user