From f77eaccf418fb0cd3ed907adcdb2a0c71be12116 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Fri, 2 Jul 2021 15:56:24 -0700 Subject: [PATCH] support committing to s3 storage move mongo into separate optional deployment along with minio support for configuring storages support for deleting crawls, associated config and secrets --- backend/crawls.py | 26 +++++--- backend/dockerdriver.py | 12 ---- backend/k8sman.py | 63 ++++++++++++++---- backend/main.py | 31 ++++++--- backend/storages.py | 31 ++++++--- backend/users.py | 4 ++ chart/templates/configmap.yaml | 49 -------------- chart/templates/{deploy.yaml => main.yaml} | 60 +++++++++++------- chart/templates/minio.yaml | 10 ++- chart/templates/mongo.yaml | 74 ++++++++++++++++++++++ chart/values.yaml | 12 ++-- 11 files changed, 238 insertions(+), 134 deletions(-) delete mode 100644 backend/dockerdriver.py delete mode 100644 chart/templates/configmap.yaml rename chart/templates/{deploy.yaml => main.yaml} (63%) create mode 100644 chart/templates/mongo.yaml diff --git a/backend/crawls.py b/backend/crawls.py index ca1c6917..c56a3c26 100644 --- a/backend/crawls.py +++ b/backend/crawls.py @@ -43,6 +43,8 @@ class BaseCrawlConfig(BaseModel): seeds: List[Union[str, Seed]] + collection: Optional[str] = "my-web-archive" + scopeType: Optional[ScopeType] = ScopeType.PREFIX scope: Union[str, List[str], None] = "" exclude: Union[str, List[str], None] = "" @@ -66,6 +68,7 @@ class CrawlConfig(BaseCrawlConfig): """Schedulable config""" schedule: Optional[str] = "" + storageName: Optional[str] = "default" # ============================================================================ @@ -85,8 +88,9 @@ def to_crawl_config(data, uid=None): class CrawlOps: """Crawl Config Operations""" - def __init__(self, mdb, crawl_manager): + def __init__(self, mdb, storage_ops, crawl_manager): self.crawl_configs = mdb["crawl_configs"] + self.storage_ops = storage_ops self.crawl_manager = crawl_manager self.default_crawl_params = [ "--collection", @@ -102,10 +106,18 @@ class CrawlOps: data = config.dict() data["user"] = user.id + storage = await self.storage_ops.get_storage_by_name(config.storageName, user) + + if not storage: + raise HTTPException( + status_code=400, + detail=f"Invalid Config: Storage '{config.storageName}' not found", + ) + result = await self.crawl_configs.insert_one(data) out = to_crawl_config(data, result.inserted_id) await self.crawl_manager.add_crawl_config( - out.dict(), str(user.id), self.default_crawl_params + out.dict(), str(user.id), storage, self.default_crawl_params ) return result @@ -118,9 +130,7 @@ class CrawlOps: async def delete_crawl_config(self, _id: str, user: User): """Delete config""" await self.crawl_manager.delete_crawl_configs(f"btrix.crawlconfig={_id}") - return self.crawl_configs.delete_one( - {"_id": ObjectId(_id), "user": user.id} - ) + return self.crawl_configs.delete_one({"_id": ObjectId(_id), "user": user.id}) async def delete_crawl_configs(self, user: User): """Delete all crawl configs for user""" @@ -143,9 +153,9 @@ class CrawlOps: # ============================================================================ # pylint: disable=redefined-builtin,invalid-name -def init_crawl_config_api(app, mdb, user_dep: User, crawl_manager): +def init_crawl_config_api(app, mdb, user_dep: User, storage_ops, crawl_manager): """Init /crawlconfigs api routes""" - ops = CrawlOps(mdb, crawl_manager) + ops = CrawlOps(mdb, storage_ops, crawl_manager) router = APIRouter( prefix="/crawlconfigs", @@ -170,7 +180,7 @@ def init_crawl_config_api(app, mdb, user_dep: User, crawl_manager): if not result or not result.deleted_count: raise HTTPException(status_code=404, detail="Crawl Config Not Found") - return {"deleted": result.deleted_count} + return {"deleted": 1} @router.get("/{id}") async def get_crawl_config(id: str, user: User = Depends(user_dep)): diff --git a/backend/dockerdriver.py b/backend/dockerdriver.py deleted file mode 100644 index b3fb88a1..00000000 --- a/backend/dockerdriver.py +++ /dev/null @@ -1,12 +0,0 @@ -import aiodocker - - -class DockerDriver(BaseDriver): - def __init__(self): - self.docker = aiodocker.Docker() - self.crawl_image = os.environ.get( - "CRAWLER_IMAGE", "webrecorder/browsertrix-crawler" - ) - - def start_crawl(self): - container = await self.docker.containers.create(config=config) diff --git a/backend/k8sman.py b/backend/k8sman.py index 57e3e925..91db46a3 100644 --- a/backend/k8sman.py +++ b/backend/k8sman.py @@ -12,13 +12,6 @@ from jinja2 import Environment, FileSystemLoader # ============================================================================ -if os.environ.get("IN_CLUSTER"): - print("Cluster Init") - config.load_incluster_config() -else: - config.load_kube_config() - - DEFAULT_NAMESPACE = os.environ.get("CRAWLER_NAMESPACE") or "crawlers" DEFAULT_NO_SCHEDULE = "* * 31 2 *" @@ -26,10 +19,12 @@ DEFAULT_NO_SCHEDULE = "* * 31 2 *" # ============================================================================ class K8SManager: - # pylint: disable=too-few-public-methods + # pylint: disable=too-many-instance-attributes,too-many-locals """K8SManager, manager creation of k8s resources from crawl api requests""" def __init__(self, namespace=DEFAULT_NAMESPACE): + config.load_incluster_config() + self.core_api = client.CoreV1Api() self.batch_api = client.BatchV1Api() self.batch_beta_api = client.BatchV1beta1Api() @@ -45,7 +40,11 @@ class K8SManager: self.crawler_image_pull_policy = "IfNotPresent" async def add_crawl_config( - self, crawlconfig: dict, userid: str, extra_crawl_params: list = None + self, + crawlconfig: dict, + userid: str, + storage: dict, + extra_crawl_params: list = None, ): """add new crawl as cron job, store crawl config in configmap""" uid = str(crawlconfig["id"]) @@ -54,6 +53,7 @@ class K8SManager: extra_crawl_params = extra_crawl_params or [] + # Create Config Map config_map = client.V1ConfigMap( metadata={ "name": f"crawl-config-{uid}", @@ -67,6 +67,30 @@ class K8SManager: namespace=self.namespace, body=config_map ) + # Create Secret + endpoint_with_coll_url = os.path.join( + storage["endpoint_url"], crawlconfig["collection"] + "/" + ) + + crawl_secret = client.V1Secret( + metadata={ + "name": f"crawl-secret-{uid}", + "namespace": self.namespace, + "labels": labels, + }, + string_data={ + "STORE_USER": userid, + "STORE_ENDPOINT_URL": endpoint_with_coll_url, + "STORE_ACCESS_KEY": storage["access_key"], + "STORE_SECRET_KEY": storage["secret_key"], + }, + ) + + api_response = await self.core_api.create_namespaced_secret( + namespace=self.namespace, body=crawl_secret + ) + + # Create Cron Job run_now = False schedule = crawlconfig.get("schedule") suspend = False @@ -100,21 +124,33 @@ class K8SManager: namespace=self.namespace, body=cron_job ) - # print(api_response) - + # Run Job Now if run_now: await self.create_run_now_job(api_response, labels) return api_response async def delete_crawl_configs(self, label): - """Delete Crawl Cron Job and all dependent resources""" + """Delete Crawl Cron Job and all dependent resources, including configmap and secrets""" + await self.batch_beta_api.delete_collection_namespaced_cron_job( namespace=self.namespace, label_selector=label, propagation_policy="Foreground", ) + await self.core_api.delete_collection_namespaced_secret( + namespace=self.namespace, + label_selector=label, + propagation_policy="Foreground", + ) + + await self.core_api.delete_collection_namespaced_config_map( + namespace=self.namespace, + label_selector=label, + propagation_policy="Foreground", + ) + async def create_run_now_job(self, cron_job, labels): """Create new job from cron job to run instantly""" annotations = {} @@ -174,6 +210,9 @@ class K8SManager: "readOnly": True, } ], + "envFrom": [ + {"secretRef": {"name": f"crawl-secret-{uid}"}} + ], } ], "volumes": [ diff --git a/backend/main.py b/backend/main.py index 42bfc138..d2a9bf8b 100644 --- a/backend/main.py +++ b/backend/main.py @@ -8,14 +8,13 @@ import os from fastapi import FastAPI, Request # from fastapi.responses import HTMLResponse -from fastapi.staticfiles import StaticFiles +# from fastapi.staticfiles import StaticFiles from users import init_users_api, UserDB from db import init_db from storages import init_storages_api from crawls import init_crawl_config_api - from k8sman import K8SManager @@ -27,18 +26,19 @@ class BrowsertrixAPI: # pylint: disable=too-many-instance-attributes def __init__(self): - self.default_storage = os.environ.get( - "DEFAULT_STORAGE", "http://localhost:8010/store-bucket/" + self.default_storage_endpoint_url = os.environ.get( + "STORE_ENDPOINT_URL", "http://localhost:8010/store-bucket/" ) + self.default_storage_access_key = os.environ.get("STORE_ACCESS_KEY") + self.default_storage_secret_key = os.environ.get("STORE_SECRET_KEY") self.app = FastAPI() - if os.environ.get("K8S"): + if os.environ.get("KUBERNETES_SERVICE_HOST"): self.crawl_manager = K8SManager() else: - self.crawl_manager = None - - # self.app.mount("/static", StaticFiles(directory="static"), name="static") + #to implement + raise Exception("Currently, only running in Kubernetes is supported") self.mdb = init_db() @@ -55,7 +55,11 @@ class BrowsertrixAPI: self.storage_ops = init_storages_api(self.app, self.mdb, current_active_user) self.crawl_config_ops = init_crawl_config_api( - self.app, self.mdb, current_active_user, self.crawl_manager + self.app, + self.mdb, + current_active_user, + self.storage_ops, + self.crawl_manager, ) # @app.get("/") @@ -65,7 +69,14 @@ class BrowsertrixAPI: # pylint: disable=no-self-use, unused-argument async def on_after_register(self, user: UserDB, request): """callback after registeration""" - await self.storage_ops.create_storage_for_user(self.default_storage, user) + + await self.storage_ops.create_storage_for_user( + endpoint_url=self.default_storage_endpoint_url, + access_key=self.default_storage_access_key, + secret_key=self.default_storage_secret_key, + user=user, + ) + print(f"User {user.id} has registered.") # pylint: disable=no-self-use, unused-argument diff --git a/backend/storages.py b/backend/storages.py index 06b3c833..6b5fd84a 100644 --- a/backend/storages.py +++ b/backend/storages.py @@ -2,7 +2,6 @@ Storage API handling """ -import os from typing import Optional from pydantic import BaseModel, UUID4 @@ -15,7 +14,7 @@ from users import User class Storage(BaseModel): """Storage Base Model""" - title: str + name: str user: UUID4 @@ -24,6 +23,8 @@ class S3Storage(Storage): """S3 Storage Model""" endpoint_url: str + access_key: str + secret_key: str is_public: Optional[bool] @@ -38,11 +39,17 @@ class StorageOps: """Add new storage""" return await self.storages_coll.insert_one(storage.dict()) - async def create_storage_for_user(self, endpoint_prefix: str, user: User): + async def create_storage_for_user( + self, endpoint_url: str, access_key: str, secret_key: str, user: User + ): """Create default storage for new user""" - endpoint_url = os.path.join(endpoint_prefix, str(user.id)) + "/" + storage = S3Storage( - endpoint_url=endpoint_url, is_public=False, user=user.id, title="default" + endpoint_url=endpoint_url, + access_key=access_key, + secret_key=secret_key, + user=user.id, + name="default", ) print(f"Created Default Endpoint at ${endpoint_url}") await self.add_storage(storage) @@ -52,12 +59,16 @@ class StorageOps: cursor = self.storages_coll.find({"user": user.id}) return await cursor.to_list(length=1000) - async def get_storage(self, uid: str, user: User): - """Get a storage for user""" + async def get_storage_by_id(self, uid: str, user: User): + """Get a storage for user by unique id""" return await self.storages_coll.find_one( {"_id": ObjectId(uid), "user": user.id} ) + async def get_storage_by_name(self, name: str, user: User): + """Get a storage for user by name""" + return await self.storages_coll.find_one({"name": name, "user": user.id}) + # ============================================================================ def init_storages_api(app, mdb, user_dep: User): @@ -77,7 +88,7 @@ def init_storages_api(app, mdb, user_dep: User): "storages": [ { "id": str(res["_id"]), - "title": res["title"], + "name": res["name"], "endpoint_url": res["endpoint_url"], } for res in results @@ -86,12 +97,12 @@ def init_storages_api(app, mdb, user_dep: User): @router.get("/{id}") async def get_storage(uid: str, user: User = Depends(user_dep)): - res = await ops.get_storage(uid, user) + res = await ops.get_storage_by_id(uid, user) print(res) if not res: return {} - return {"id": uid, "title": res["title"], "endpoint_url": res["endpoint_url"]} + return {"id": uid, "name": res["name"], "endpoint_url": res["endpoint_url"]} @router.post("/") async def add_storage(storage: S3Storage, user: User = Depends(user_dep)): diff --git a/backend/users.py b/backend/users.py index a84d8533..2d93f9b0 100644 --- a/backend/users.py +++ b/backend/users.py @@ -17,24 +17,28 @@ class User(models.BaseUser): Base User Model """ + # ============================================================================ class UserCreate(models.BaseUserCreate): """ User Creation Model """ + # ============================================================================ class UserUpdate(User, models.BaseUserUpdate): """ User Update Model """ + # ============================================================================ class UserDB(User, models.BaseUserDB): """ User in DB Model """ + # ============================================================================ def init_users_api( app, diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml deleted file mode 100644 index 7379dc62..00000000 --- a/chart/templates/configmap.yaml +++ /dev/null @@ -1,49 +0,0 @@ ---- -apiVersion: v1 -kind: ConfigMap -metadata: - name: {{ .Values.name }}-env-config - namespace: {{ .Release.Namespace }} - -data: - MONGO_HOST: localhost - - CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }} - CRAWLER_IMAGE: {{ .Values.crawler_image }} - - IN_CLUSTER: "1" - K8S: "1" - - ---- -apiVersion: v1 -kind: Secret -metadata: - name: mongo-auth - namespace: {{ .Release.Namespace }} - -type: Opaque -stringData: - MONGO_INITDB_ROOT_USERNAME: {{ .Values.mongo_auth.username | quote }} - MONGO_INITDB_ROOT_PASSWORD: {{ .Values.mongo_auth.password | quote }} - - ---- -apiVersion: v1 -kind: Secret -metadata: - name: storage-auth - namespace: {{ .Release.Namespace }} - -type: Opaque -stringData: - PASSWORD_SECRET: "{{ .Values.api_password_secret }}" - - MINIO_ROOT_USER: "{{ .Values.storage.access_key }}" - MINIO_ROOT_PASSWORD: "{{ .Values.storage.secret_key }}" - - AWS_ACCESS_KEY_ID: "{{ .Values.storage.access_key }}" - AWS_SECRET_ACCESS_KEY: "{{ .Values.storage.secret_key }}" - AWS_ENDPOINT: "{{ .Values.storage.endpoint }}" - #S3_FORCE_PATH_STYLE: "{{ .Values.storage.force_path_style | quote }}" - diff --git a/chart/templates/deploy.yaml b/chart/templates/main.yaml similarity index 63% rename from chart/templates/deploy.yaml rename to chart/templates/main.yaml index 6787f130..a643ba67 100644 --- a/chart/templates/deploy.yaml +++ b/chart/templates/main.yaml @@ -1,3 +1,40 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: {{ .Values.name }}-env-config + namespace: {{ .Release.Namespace }} + +data: + MONGO_HOST: {{ .Values.mongo_host }} + + CRAWLER_NAMESPACE: {{ .Values.crawler_namespace }} + CRAWLER_IMAGE: {{ .Values.crawler_image }} + + +--- +apiVersion: v1 +kind: Secret +metadata: + name: storage-auth + namespace: {{ .Release.Namespace }} + +type: Opaque +stringData: + PASSWORD_SECRET: "{{ .Values.api_password_secret }}" + +{{- if .Values.minio_local }} + MINIO_ROOT_USER: "{{ .Values.storage.access_key }}" + MINIO_ROOT_PASSWORD: "{{ .Values.storage.secret_key }}" +{{- end }} + + STORE_ACCESS_KEY: "{{ .Values.storage.access_key }}" + STORE_SECRET_KEY: "{{ .Values.storage.secret_key }}" + STORE_ENDPOINT_URL: "{{ .Values.storage.endpoint }}" + #S3_FORCE_PATH_STYLE: "{{ .Values.storage.force_path_style | quote }}" + + + --- apiVersion: apps/v1 kind: Deployment @@ -14,19 +51,7 @@ spec: labels: app: {{ .Values.name }} - annotations: - checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} - spec: - volumes: - - name: data-db - hostPath: - path: /browsertrix-mongo-data - type: DirectoryOrCreate - - - name: data-storage - emptyDir: {} - containers: - name: api image: {{ .Values.api_image }} @@ -47,17 +72,6 @@ spec: requests: cpu: {{ .Values.api_requests_cpu }} - - name: mongo - image: {{ .Values.mongo_image }} - imagePullPolicy: {{ .Values.mongo_pull_policy }} - envFrom: - - secretRef: - name: mongo-auth - - volumeMounts: - - name: data-db - mountPath: /data/db - --- diff --git a/chart/templates/minio.yaml b/chart/templates/minio.yaml index 88de262e..0539be3a 100644 --- a/chart/templates/minio.yaml +++ b/chart/templates/minio.yaml @@ -1,4 +1,4 @@ -{{- if .Values.use_minio }} +{{- if .Values.minio_local }} --- apiVersion: apps/v1 @@ -17,13 +17,12 @@ spec: labels: app: local-minio - annotations: - checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }} - spec: volumes: - name: data-storage - emptyDir: {} + hostPath: + path: /browsertrix-minio-data + type: DirectoryOrCreate containers: - name: minio @@ -39,7 +38,6 @@ spec: mountPath: /data --- - apiVersion: v1 kind: Service diff --git a/chart/templates/mongo.yaml b/chart/templates/mongo.yaml new file mode 100644 index 00000000..12a86c74 --- /dev/null +++ b/chart/templates/mongo.yaml @@ -0,0 +1,74 @@ +{{- if .Values.mongo_local }} + +--- +apiVersion: v1 +kind: Secret +metadata: + name: mongo-auth + namespace: {{ .Release.Namespace }} + +type: Opaque +stringData: + MONGO_INITDB_ROOT_USERNAME: {{ .Values.mongo_auth.username | quote }} + MONGO_INITDB_ROOT_PASSWORD: {{ .Values.mongo_auth.password | quote }} + + +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: local-mongo + namespace: {{ .Release.Namespace }} +spec: + selector: + matchLabels: + app: local-mongo + replicas: {{ .Values.api_num_replicas }} + template: + metadata: + labels: + app: local-mongo + + spec: + volumes: + - name: data-db + hostPath: + path: /browsertrix-mongo-data + type: DirectoryOrCreate + + containers: + - name: mongo + image: {{ .Values.mongo_image }} + imagePullPolicy: {{ .Values.mongo_pull_policy }} + envFrom: + - secretRef: + name: mongo-auth + + volumeMounts: + - name: data-db + mountPath: /data/db + +--- +apiVersion: v1 +kind: Service + +metadata: + namespace: {{ .Release.Namespace }} + name: local-mongo + labels: + app: local-mongo + +spec: + type: ClusterIP + selector: + app: local-mongo + + ports: + - protocol: TCP + port: 27017 + targetPort: 27017 + name: minio + +{{- end }} + + diff --git a/chart/values.yaml b/chart/values.yaml index da238c2b..6521c975 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -16,6 +16,10 @@ api_requests_cpu: "25m" # MongoDB Image # ========================================= +mongo_local: true + +mongo_host: "local-mongo" + mongo_image: "mongo" mongo_pull_policy: "IfNotPresent" @@ -41,15 +45,15 @@ crawler_namespace: "crawlers" storage: access_key: "ADMIN" - secret_key: "PASSW" + secret_key: "PASSW0RD" # api_endpoint can be "" if using AWS S3, otherwise, set to your provider's S3 endpoint - endpoint: "http://local-minio.default:9000" + endpoint: "http://local-minio.default:9000/test-bucket/" # if your provider requires path-style URLs for S3 objects, set force_path_style to "true" (any truthy string) # https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html force_path_style: "true" # the target bucket's name and desired storage subpath, formatted as an s3:// URL for convenience # (the protocol is ignored; the bucket == the netloc; the subpath == the rest) - storage_prefix: "s3://browsertrix/archives/" + storage_prefix: "s3://test/bucket/" # acl settings for uploaded files, if any. # for example, to enable uploaded files to be public, set to: # acl: "public-read" @@ -60,7 +64,7 @@ storage: # Local Minio Pod (optional) # ========================================= # set to true to use a local minio image -use_minio: True +minio_local: True minio_image: minio/minio minio_pull_policy: "IfNotPresent"