support committing to s3 storage

move mongo into separate optional deployment along with minio
support for configuring storages
support for deleting crawls, associated config and secrets
This commit is contained in:
Ilya Kreymer 2021-07-02 15:56:24 -07:00
parent a111bacfb5
commit f77eaccf41
11 changed files with 238 additions and 134 deletions

View File

@ -43,6 +43,8 @@ class BaseCrawlConfig(BaseModel):
seeds: List[Union[str, Seed]]
collection: Optional[str] = "my-web-archive"
scopeType: Optional[ScopeType] = ScopeType.PREFIX
scope: Union[str, List[str], None] = ""
exclude: Union[str, List[str], None] = ""
@ -66,6 +68,7 @@ class CrawlConfig(BaseCrawlConfig):
"""Schedulable config"""
schedule: Optional[str] = ""
storageName: Optional[str] = "default"
# ============================================================================
@ -85,8 +88,9 @@ def to_crawl_config(data, uid=None):
class CrawlOps:
"""Crawl Config Operations"""
def __init__(self, mdb, crawl_manager):
def __init__(self, mdb, storage_ops, crawl_manager):
self.crawl_configs = mdb["crawl_configs"]
self.storage_ops = storage_ops
self.crawl_manager = crawl_manager
self.default_crawl_params = [
"--collection",
@ -102,10 +106,18 @@ class CrawlOps:
data = config.dict()
data["user"] = user.id
storage = await self.storage_ops.get_storage_by_name(config.storageName, user)
if not storage:
raise HTTPException(
status_code=400,
detail=f"Invalid Config: Storage '{config.storageName}' not found",
)
result = await self.crawl_configs.insert_one(data)
out = to_crawl_config(data, result.inserted_id)
await self.crawl_manager.add_crawl_config(
out.dict(), str(user.id), self.default_crawl_params
out.dict(), str(user.id), storage, self.default_crawl_params
)
return result
@ -118,9 +130,7 @@ class CrawlOps:
async def delete_crawl_config(self, _id: str, user: User):
"""Delete config"""
await self.crawl_manager.delete_crawl_configs(f"btrix.crawlconfig={_id}")
return self.crawl_configs.delete_one(
{"_id": ObjectId(_id), "user": user.id}
)
return self.crawl_configs.delete_one({"_id": ObjectId(_id), "user": user.id})
async def delete_crawl_configs(self, user: User):
"""Delete all crawl configs for user"""
@ -143,9 +153,9 @@ class CrawlOps:
# ============================================================================
# pylint: disable=redefined-builtin,invalid-name
def init_crawl_config_api(app, mdb, user_dep: User, crawl_manager):
def init_crawl_config_api(app, mdb, user_dep: User, storage_ops, crawl_manager):
"""Init /crawlconfigs api routes"""
ops = CrawlOps(mdb, crawl_manager)
ops = CrawlOps(mdb, storage_ops, crawl_manager)
router = APIRouter(
prefix="/crawlconfigs",
@ -170,7 +180,7 @@ def init_crawl_config_api(app, mdb, user_dep: User, crawl_manager):
if not result or not result.deleted_count:
raise HTTPException(status_code=404, detail="Crawl Config Not Found")
return {"deleted": result.deleted_count}
return {"deleted": 1}
@router.get("/{id}")
async def get_crawl_config(id: str, user: User = Depends(user_dep)):

View File

@ -1,12 +0,0 @@
import aiodocker
class DockerDriver(BaseDriver):
def __init__(self):
self.docker = aiodocker.Docker()
self.crawl_image = os.environ.get(
"CRAWLER_IMAGE", "webrecorder/browsertrix-crawler"
)
def start_crawl(self):
container = await self.docker.containers.create(config=config)

View File

@ -12,13 +12,6 @@ from jinja2 import Environment, FileSystemLoader
# ============================================================================
if os.environ.get("IN_CLUSTER"):
print("Cluster Init")
config.load_incluster_config()
else:
config.load_kube_config()
DEFAULT_NAMESPACE = os.environ.get("CRAWLER_NAMESPACE") or "crawlers"
DEFAULT_NO_SCHEDULE = "* * 31 2 *"
@ -26,10 +19,12 @@ DEFAULT_NO_SCHEDULE = "* * 31 2 *"
# ============================================================================
class K8SManager:
# pylint: disable=too-few-public-methods
# pylint: disable=too-many-instance-attributes,too-many-locals
"""K8SManager, manages creation of k8s resources from crawl api requests"""
def __init__(self, namespace=DEFAULT_NAMESPACE):
config.load_incluster_config()
self.core_api = client.CoreV1Api()
self.batch_api = client.BatchV1Api()
self.batch_beta_api = client.BatchV1beta1Api()
@ -45,7 +40,11 @@ class K8SManager:
self.crawler_image_pull_policy = "IfNotPresent"
async def add_crawl_config(
self, crawlconfig: dict, userid: str, extra_crawl_params: list = None
self,
crawlconfig: dict,
userid: str,
storage: dict,
extra_crawl_params: list = None,
):
"""add new crawl as cron job, store crawl config in configmap"""
uid = str(crawlconfig["id"])
@ -54,6 +53,7 @@ class K8SManager:
extra_crawl_params = extra_crawl_params or []
# Create Config Map
config_map = client.V1ConfigMap(
metadata={
"name": f"crawl-config-{uid}",
@ -67,6 +67,30 @@ class K8SManager:
namespace=self.namespace, body=config_map
)
# Create Secret
endpoint_with_coll_url = os.path.join(
storage["endpoint_url"], crawlconfig["collection"] + "/"
)
crawl_secret = client.V1Secret(
metadata={
"name": f"crawl-secret-{uid}",
"namespace": self.namespace,
"labels": labels,
},
string_data={
"STORE_USER": userid,
"STORE_ENDPOINT_URL": endpoint_with_coll_url,
"STORE_ACCESS_KEY": storage["access_key"],
"STORE_SECRET_KEY": storage["secret_key"],
},
)
api_response = await self.core_api.create_namespaced_secret(
namespace=self.namespace, body=crawl_secret
)
# Create Cron Job
run_now = False
schedule = crawlconfig.get("schedule")
suspend = False
@ -100,21 +124,33 @@ class K8SManager:
namespace=self.namespace, body=cron_job
)
# print(api_response)
# Run Job Now
if run_now:
await self.create_run_now_job(api_response, labels)
return api_response
async def delete_crawl_configs(self, label):
"""Delete Crawl Cron Job and all dependent resources"""
"""Delete Crawl Cron Job and all dependent resources, including configmap and secrets"""
await self.batch_beta_api.delete_collection_namespaced_cron_job(
namespace=self.namespace,
label_selector=label,
propagation_policy="Foreground",
)
await self.core_api.delete_collection_namespaced_secret(
namespace=self.namespace,
label_selector=label,
propagation_policy="Foreground",
)
await self.core_api.delete_collection_namespaced_config_map(
namespace=self.namespace,
label_selector=label,
propagation_policy="Foreground",
)
async def create_run_now_job(self, cron_job, labels):
"""Create new job from cron job to run instantly"""
annotations = {}
@ -174,6 +210,9 @@ class K8SManager:
"readOnly": True,
}
],
"envFrom": [
{"secretRef": {"name": f"crawl-secret-{uid}"}}
],
}
],
"volumes": [

View File

@ -8,14 +8,13 @@ import os
from fastapi import FastAPI, Request
# from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
# from fastapi.staticfiles import StaticFiles
from users import init_users_api, UserDB
from db import init_db
from storages import init_storages_api
from crawls import init_crawl_config_api
from k8sman import K8SManager
@ -27,18 +26,19 @@ class BrowsertrixAPI:
# pylint: disable=too-many-instance-attributes
def __init__(self):
self.default_storage = os.environ.get(
"DEFAULT_STORAGE", "http://localhost:8010/store-bucket/"
self.default_storage_endpoint_url = os.environ.get(
"STORE_ENDPOINT_URL", "http://localhost:8010/store-bucket/"
)
self.default_storage_access_key = os.environ.get("STORE_ACCESS_KEY")
self.default_storage_secret_key = os.environ.get("STORE_SECRET_KEY")
self.app = FastAPI()
if os.environ.get("K8S"):
if os.environ.get("KUBERNETES_SERVICE_HOST"):
self.crawl_manager = K8SManager()
else:
self.crawl_manager = None
# self.app.mount("/static", StaticFiles(directory="static"), name="static")
# TODO: implement support for non-Kubernetes deployments
raise Exception("Currently, only running in Kubernetes is supported")
self.mdb = init_db()
@ -55,7 +55,11 @@ class BrowsertrixAPI:
self.storage_ops = init_storages_api(self.app, self.mdb, current_active_user)
self.crawl_config_ops = init_crawl_config_api(
self.app, self.mdb, current_active_user, self.crawl_manager
self.app,
self.mdb,
current_active_user,
self.storage_ops,
self.crawl_manager,
)
# @app.get("/")
@ -65,7 +69,14 @@ class BrowsertrixAPI:
# pylint: disable=no-self-use, unused-argument
async def on_after_register(self, user: UserDB, request):
"""callback after registration"""
await self.storage_ops.create_storage_for_user(self.default_storage, user)
await self.storage_ops.create_storage_for_user(
endpoint_url=self.default_storage_endpoint_url,
access_key=self.default_storage_access_key,
secret_key=self.default_storage_secret_key,
user=user,
)
print(f"User {user.id} has registered.")
# pylint: disable=no-self-use, unused-argument

View File

@ -2,7 +2,6 @@
Storage API handling
"""
import os
from typing import Optional
from pydantic import BaseModel, UUID4
@ -15,7 +14,7 @@ from users import User
class Storage(BaseModel):
"""Storage Base Model"""
title: str
name: str
user: UUID4
@ -24,6 +23,8 @@ class S3Storage(Storage):
"""S3 Storage Model"""
endpoint_url: str
access_key: str
secret_key: str
is_public: Optional[bool]
@ -38,11 +39,17 @@ class StorageOps:
"""Add new storage"""
return await self.storages_coll.insert_one(storage.dict())
async def create_storage_for_user(self, endpoint_prefix: str, user: User):
async def create_storage_for_user(
self, endpoint_url: str, access_key: str, secret_key: str, user: User
):
"""Create default storage for new user"""
endpoint_url = os.path.join(endpoint_prefix, str(user.id)) + "/"
storage = S3Storage(
endpoint_url=endpoint_url, is_public=False, user=user.id, title="default"
endpoint_url=endpoint_url,
access_key=access_key,
secret_key=secret_key,
user=user.id,
name="default",
)
print(f"Created Default Endpoint at ${endpoint_url}")
await self.add_storage(storage)
@ -52,12 +59,16 @@ class StorageOps:
cursor = self.storages_coll.find({"user": user.id})
return await cursor.to_list(length=1000)
async def get_storage(self, uid: str, user: User):
"""Get a storage for user"""
async def get_storage_by_id(self, uid: str, user: User):
"""Get a storage for user by unique id"""
return await self.storages_coll.find_one(
{"_id": ObjectId(uid), "user": user.id}
)
async def get_storage_by_name(self, name: str, user: User):
"""Get a storage for user by name"""
return await self.storages_coll.find_one({"name": name, "user": user.id})
# ============================================================================
def init_storages_api(app, mdb, user_dep: User):
@ -77,7 +88,7 @@ def init_storages_api(app, mdb, user_dep: User):
"storages": [
{
"id": str(res["_id"]),
"title": res["title"],
"name": res["name"],
"endpoint_url": res["endpoint_url"],
}
for res in results
@ -86,12 +97,12 @@ def init_storages_api(app, mdb, user_dep: User):
@router.get("/{id}")
async def get_storage(uid: str, user: User = Depends(user_dep)):
res = await ops.get_storage(uid, user)
res = await ops.get_storage_by_id(uid, user)
print(res)
if not res:
return {}
return {"id": uid, "title": res["title"], "endpoint_url": res["endpoint_url"]}
return {"id": uid, "name": res["name"], "endpoint_url": res["endpoint_url"]}
@router.post("/")
async def add_storage(storage: S3Storage, user: User = Depends(user_dep)):

View File

@ -17,24 +17,28 @@ class User(models.BaseUser):
Base User Model
"""
# ============================================================================
class UserCreate(models.BaseUserCreate):
"""
User Creation Model
"""
# ============================================================================
class UserUpdate(User, models.BaseUserUpdate):
"""
User Update Model
"""
# ============================================================================
class UserDB(User, models.BaseUserDB):
"""
User in DB Model
"""
# ============================================================================
def init_users_api(
app,

View File

@ -1,49 +0,0 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ .Values.name }}-env-config
namespace: {{ .Release.Namespace }}
data:
MONGO_HOST: localhost
CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }}
CRAWLER_IMAGE: {{ .Values.crawler_image }}
IN_CLUSTER: "1"
K8S: "1"
---
apiVersion: v1
kind: Secret
metadata:
name: mongo-auth
namespace: {{ .Release.Namespace }}
type: Opaque
stringData:
MONGO_INITDB_ROOT_USERNAME: {{ .Values.mongo_auth.username | quote }}
MONGO_INITDB_ROOT_PASSWORD: {{ .Values.mongo_auth.password | quote }}
---
apiVersion: v1
kind: Secret
metadata:
name: storage-auth
namespace: {{ .Release.Namespace }}
type: Opaque
stringData:
PASSWORD_SECRET: "{{ .Values.api_password_secret }}"
MINIO_ROOT_USER: "{{ .Values.storage.access_key }}"
MINIO_ROOT_PASSWORD: "{{ .Values.storage.secret_key }}"
AWS_ACCESS_KEY_ID: "{{ .Values.storage.access_key }}"
AWS_SECRET_ACCESS_KEY: "{{ .Values.storage.secret_key }}"
AWS_ENDPOINT: "{{ .Values.storage.endpoint }}"
#S3_FORCE_PATH_STYLE: "{{ .Values.storage.force_path_style | quote }}"

View File

@ -1,3 +1,40 @@
---
apiVersion: v1
kind: ConfigMap
metadata:
name: {{ .Values.name }}-env-config
namespace: {{ .Release.Namespace }}
data:
MONGO_HOST: {{ .Values.mongo_host }}
CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }}
CRAWLER_IMAGE: {{ .Values.crawler_image }}
---
apiVersion: v1
kind: Secret
metadata:
name: storage-auth
namespace: {{ .Release.Namespace }}
type: Opaque
stringData:
PASSWORD_SECRET: "{{ .Values.api_password_secret }}"
{{- if .Values.minio_local }}
MINIO_ROOT_USER: "{{ .Values.storage.access_key }}"
MINIO_ROOT_PASSWORD: "{{ .Values.storage.secret_key }}"
{{- end }}
STORE_ACCESS_KEY: "{{ .Values.storage.access_key }}"
STORE_SECRET_KEY: "{{ .Values.storage.secret_key }}"
STORE_ENDPOINT_URL: "{{ .Values.storage.endpoint }}"
#S3_FORCE_PATH_STYLE: "{{ .Values.storage.force_path_style | quote }}"
---
apiVersion: apps/v1
kind: Deployment
@ -14,19 +51,7 @@ spec:
labels:
app: {{ .Values.name }}
annotations:
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
spec:
volumes:
- name: data-db
hostPath:
path: /browsertrix-mongo-data
type: DirectoryOrCreate
- name: data-storage
emptyDir: {}
containers:
- name: api
image: {{ .Values.api_image }}
@ -47,17 +72,6 @@ spec:
requests:
cpu: {{ .Values.api_requests_cpu }}
- name: mongo
image: {{ .Values.mongo_image }}
imagePullPolicy: {{ .Values.mongo_pull_policy }}
envFrom:
- secretRef:
name: mongo-auth
volumeMounts:
- name: data-db
mountPath: /data/db
---

View File

@ -1,4 +1,4 @@
{{- if .Values.use_minio }}
{{- if .Values.minio_local }}
---
apiVersion: apps/v1
@ -17,13 +17,12 @@ spec:
labels:
app: local-minio
annotations:
checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
spec:
volumes:
- name: data-storage
emptyDir: {}
hostPath:
path: /browsertrix-minio-data
type: DirectoryOrCreate
containers:
- name: minio
@ -39,7 +38,6 @@ spec:
mountPath: /data
---
apiVersion: v1
kind: Service

View File

@ -0,0 +1,74 @@
{{- if .Values.mongo_local }}
---
apiVersion: v1
kind: Secret
metadata:
name: mongo-auth
namespace: {{ .Release.Namespace }}
type: Opaque
stringData:
MONGO_INITDB_ROOT_USERNAME: {{ .Values.mongo_auth.username | quote }}
MONGO_INITDB_ROOT_PASSWORD: {{ .Values.mongo_auth.password | quote }}
---
apiVersion: apps/v1
kind: Deployment
metadata:
name: local-mongo
namespace: {{ .Release.Namespace }}
spec:
selector:
matchLabels:
app: local-mongo
replicas: {{ .Values.api_num_replicas }}
template:
metadata:
labels:
app: local-mongo
spec:
volumes:
- name: data-db
hostPath:
path: /browsertrix-mongo-data
type: DirectoryOrCreate
containers:
- name: mongo
image: {{ .Values.mongo_image }}
imagePullPolicy: {{ .Values.mongo_pull_policy }}
envFrom:
- secretRef:
name: mongo-auth
volumeMounts:
- name: data-db
mountPath: /data/db
---
apiVersion: v1
kind: Service
metadata:
namespace: {{ .Release.Namespace }}
name: local-mongo
labels:
app: local-mongo
spec:
type: ClusterIP
selector:
app: local-mongo
ports:
- protocol: TCP
port: 27017
targetPort: 27017
name: minio
{{- end }}

View File

@ -16,6 +16,10 @@ api_requests_cpu: "25m"
# MongoDB Image
# =========================================
mongo_local: true
mongo_host: "local-mongo"
mongo_image: "mongo"
mongo_pull_policy: "IfNotPresent"
@ -41,15 +45,15 @@ crawler_namespace: "crawlers"
storage:
access_key: "ADMIN"
secret_key: "PASSW"
secret_key: "PASSW0RD"
# endpoint can be "" if using AWS S3, otherwise, set to your provider's S3 endpoint
endpoint: "http://local-minio.default:9000"
endpoint: "http://local-minio.default:9000/test-bucket/"
# if your provider requires path-style URLs for S3 objects, set force_path_style to "true" (any truthy string)
# https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html
force_path_style: "true"
# the target bucket's name and desired storage subpath, formatted as an s3:// URL for convenience
# (the protocol is ignored; the bucket == the netloc; the subpath == the rest)
storage_prefix: "s3://browsertrix/archives/"
storage_prefix: "s3://test/bucket/"
# acl settings for uploaded files, if any.
# for example, to enable uploaded files to be public, set to:
# acl: "public-read"
@ -60,7 +64,7 @@ storage:
# Local Minio Pod (optional)
# =========================================
# set to true to use a local minio image
use_minio: True
minio_local: True
minio_image: minio/minio
minio_pull_policy: "IfNotPresent"