diff --git a/backend/crawls.py b/backend/crawls.py
index c3a59e7c..66e82535 100644
--- a/backend/crawls.py
+++ b/backend/crawls.py
@@ -3,6 +3,7 @@
 import asyncio
 import json
 import uuid
+import os
 
 from typing import Optional, List, Dict, Union
 from datetime import datetime
@@ -14,6 +15,7 @@ import aioredis
 
 from db import BaseMongoModel
 from archives import Archive
+from storages import get_presigned_url
 
 
 # ============================================================================
@@ -32,7 +34,7 @@ class CrawlScale(BaseModel):
 
 # ============================================================================
 class CrawlFile(BaseModel):
-    """ output of a crawl """
+    """ file from a crawl """
 
     filename: str
     hash: str
@@ -40,6 +42,16 @@
     def_storage_name: Optional[str]
 
 
+# ============================================================================
+class CrawlFileOut(BaseModel):
+    """ output for file from a crawl (conformance to Data Resource Spec) """
+
+    name: str
+    path: str
+    hash: str
+    size: int
+
+
 # ============================================================================
 class Crawl(BaseMongoModel):
     """ Store State of a Crawl (Finished or Running) """
@@ -74,6 +86,7 @@ class CrawlOut(Crawl):
 
     userName: Optional[str]
     configName: Optional[str]
+    resources: Optional[List[CrawlFileOut]] = []
 
 
 # ============================================================================
@@ -129,7 +142,7 @@ class CrawlCompleteIn(BaseModel):
 class CrawlOps:
     """ Crawl Ops """
 
-    # pylint: disable=too-many-arguments
+    # pylint: disable=too-many-arguments, too-many-instance-attributes
    def __init__(self, mdb, redis_url, users, crawl_manager, crawl_configs, archives):
         self.crawls = mdb["crawls"]
         self.crawl_manager = crawl_manager
@@ -138,6 +151,8 @@ class CrawlOps:
         self.archives = archives
         self.crawls_done_key = "crawls-done"
 
+        self.presign_duration = int(os.environ.get("PRESIGN_DURATION_SECONDS", 3600))
+
         self.redis = None
         asyncio.create_task(self.init_redis(redis_url))
         asyncio.create_task(self.init_index())
@@ -290,7 +305,7 @@
 
         for crawl in running_crawls:
             list_crawl = ListCrawlOut(**crawl.dict())
-            crawls.append(await self._resolve_crawl(list_crawl, archive))
+            crawls.append(await self._resolve_crawl_refs(list_crawl, archive))
 
         crawls.extend(finished_crawls)
 
@@ -309,13 +324,18 @@
                 status_code=404, detail=f"Crawl not found: {crawlid}"
             )
 
+        files = [CrawlFile(**data) for data in res["files"]]
+
+        del res["files"]
+
+        res["resources"] = await self._resolve_signed_urls(files, archive)
+
         crawl = CrawlOut.from_dict(res)
-        await self._resolve_filenames(crawl)
+        return await self._resolve_crawl_refs(crawl, archive)
 
-        return await self._resolve_crawl(crawl, archive)
-
-    async def _resolve_crawl(self, crawl: Union[CrawlOut, ListCrawlOut], archive):
+    async def _resolve_crawl_refs(
+        self, crawl: Union[CrawlOut, ListCrawlOut], archive: Archive
+    ):
         """ Resolve running crawl data """
 
         config = await self.crawl_configs.get_crawl_config(crawl.cid, archive)
@@ -328,6 +348,38 @@
 
         return crawl
 
+    async def _resolve_signed_urls(self, files, archive: Archive):
+        if not files:
+            return
+
+        async with self.redis.pipeline(transaction=True) as pipe:
+            for file_ in files:
+                pipe.get(f"f:{file_.filename}")
+
+            results = await pipe.execute()
+
+        out_files = []
+
+        for file_, presigned_url in zip(files, results):
+            if not presigned_url:
+                presigned_url = await get_presigned_url(
+                    archive, file_, self.crawl_manager, self.presign_duration
+                )
+                await self.redis.setex(
f"f:{file_.filename}", self.presign_duration - 1, presigned_url + ) + + out_files.append( + CrawlFileOut( + name=file_.filename, + path=presigned_url, + hash=file_.hash, + size=file_.size, + ) + ) + + return out_files + async def _resolve_filenames(self, crawl: CrawlOut): """ Resolve absolute filenames for each file """ if not crawl.files: @@ -448,21 +500,13 @@ def init_crawls_api( return {"deleted": res} @app.get( - "/archives/{aid}/crawls/{crawl_id}", tags=["crawls"], response_model=CrawlOut + "/archives/{aid}/crawls/{crawl_id}.json", + tags=["crawls"], + response_model=CrawlOut, ) async def get_crawl(crawl_id, archive: Archive = Depends(archive_crawl_dep)): return await ops.get_crawl(crawl_id, archive) - # @app.get( - # "/archives/{aid}/crawls/{crawl_id}/running", - # tags=["crawls"], - # ) - # async def get_running(crawl_id, archive: Archive = Depends(archive_crawl_dep)): - # if not crawl_manager.is_running(crawl_id, archive.id_str): - # raise HTTPException(status_code=404, detail="No Such Crawl") - # - # return {"running": True} - @app.post( "/archives/{aid}/crawls/{crawl_id}/scale", tags=["crawls"], diff --git a/backend/k8sman.py b/backend/k8sman.py index 9f1db917..59041dba 100644 --- a/backend/k8sman.py +++ b/backend/k8sman.py @@ -372,9 +372,10 @@ class K8SManager: return self._default_storages[name] - async def _secret_data(self, storage, name): - """ decode secret storage data """ - return base64.standard_b64decode(storage.data[name]).decode() + # pylint: disable=no-self-use + def _secret_data(self, secret, name): + """ decode secret data """ + return base64.standard_b64decode(secret.data[name]).decode() async def get_running_crawl(self, name, aid): """Get running crawl (job) with given name, or none diff --git a/backend/main.py b/backend/main.py index 634faed9..2c4e4a49 100644 --- a/backend/main.py +++ b/backend/main.py @@ -5,7 +5,7 @@ supports docker and kubernetes based deployments of multiple browsertrix-crawler import os -from fastapi import FastAPI, Response +from fastapi import FastAPI from db import init_db @@ -94,13 +94,6 @@ def main(): async def healthz(): return {} - @app.get("/replay/sw.js") - async def replay_sw(): - return Response( - content='importScripts("https://cdn.jsdelivr.net/npm/replaywebpage@1.5.7/sw.js");', - media_type="application/javascript", - ) - # ============================================================================ @app.on_event("startup") diff --git a/backend/requirements.txt b/backend/requirements.txt index 384134e7..b8cdb69b 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -1,6 +1,6 @@ uvicorn -fastapi==0.70.0 -fastapi-users[mongodb]==8.1.2 +fastapi==0.71.0 +fastapi-users[mongodb]==9.2.2 loguru aiofiles kubernetes-asyncio diff --git a/backend/users.py b/backend/users.py index eec26d96..275ddc8a 100644 --- a/backend/users.py +++ b/backend/users.py @@ -12,10 +12,15 @@ from pydantic import EmailStr, UUID4 import passlib.pwd from fastapi import Request, Response, HTTPException, Depends +from fastapi.security import OAuth2PasswordBearer from fastapi_users import FastAPIUsers, models, BaseUserManager from fastapi_users.manager import UserAlreadyExists -from fastapi_users.authentication import JWTAuthentication +from fastapi_users.authentication import ( + AuthenticationBackend, + BearerTransport, + JWTStrategy, +) from fastapi_users.db import MongoDBUserDatabase from invites import InvitePending, InviteRequest @@ -252,31 +257,71 @@ def init_user_manager(mdb, emailsender, invites): return UserManager(user_db, emailsender, 
     return UserManager(user_db, emailsender, invites)
 
+# ============================================================================
+class OA2BearerOrQuery(OAuth2PasswordBearer):
+    """ Override bearer check to also test query """
+
+    async def __call__(self, request: Request) -> Optional[str]:
+        param = None
+        exc = None
+        try:
+            param = await super().__call__(request)
+            if param:
+                return param
+
+        # pylint: disable=broad-except
+        except Exception as super_exc:
+            exc = super_exc
+
+        param = request.query_params.get("auth_bearer")
+
+        if not param and exc:
+            raise exc
+
+        return param
+
+
+# ============================================================================
+class BearerOrQueryTransport(BearerTransport):
+    """ Bearer or Query Transport """
+
+    scheme: OA2BearerOrQuery
+
+    def __init__(self, tokenUrl: str):
+        # pylint: disable=super-init-not-called
+        self.scheme = OA2BearerOrQuery(tokenUrl, auto_error=False)
+
+
 # ============================================================================
 def init_users_api(app, user_manager):
     """ init fastapi_users """
 
-    jwt_authentication = JWTAuthentication(
-        secret=PASSWORD_SECRET,
-        lifetime_seconds=JWT_TOKEN_LIFETIME,
-        tokenUrl="auth/jwt/login",
+    bearer_transport = BearerOrQueryTransport(tokenUrl="auth/jwt/login")
+
+    def get_jwt_strategy() -> JWTStrategy:
+        return JWTStrategy(secret=PASSWORD_SECRET, lifetime_seconds=JWT_TOKEN_LIFETIME)
+
+    auth_backend = AuthenticationBackend(
+        name="jwt",
+        transport=bearer_transport,
+        get_strategy=get_jwt_strategy,
     )
 
     fastapi_users = FastAPIUsers(
         lambda: user_manager,
-        [jwt_authentication],
+        [auth_backend],
         User,
         UserCreateIn,
         UserUpdate,
         UserDB,
     )
 
-    auth_router = fastapi_users.get_auth_router(jwt_authentication)
+    auth_router = fastapi_users.get_auth_router(auth_backend)
 
     current_active_user = fastapi_users.current_user(active=True)
 
     @auth_router.post("/refresh")
     async def refresh_jwt(response: Response, user=Depends(current_active_user)):
-        return await jwt_authentication.get_login_response(user, response, user_manager)
+        return await auth_backend.login(get_jwt_strategy(), user, response)
 
     app.include_router(
         auth_router,
diff --git a/frontend/Dockerfile b/frontend/Dockerfile
index b7ad559a..ef81976b 100644
--- a/frontend/Dockerfile
+++ b/frontend/Dockerfile
@@ -1,4 +1,8 @@
+# central place to configure the production replayweb.page loading prefix
+ARG RWP_BASE_URL=https://cdn.jsdelivr.net/npm/replaywebpage@1.5.8/
+
 FROM node:16 as build
+ARG RWP_BASE_URL
 
 WORKDIR /app
 COPY package.json .
@@ -8,9 +12,11 @@
 COPY *.* ./
 COPY src ./src/
 
 RUN yarn build
-
 FROM nginx
+ARG RWP_BASE_URL
+ENV RWP_BASE_URL=${RWP_BASE_URL}
+
 COPY --from=build /app/dist /usr/share/nginx/html
 
 COPY ./nginx.conf.template /etc/nginx/templates/
diff --git a/frontend/nginx.conf.template b/frontend/nginx.conf.template
index 4b1472f2..eadcba8a 100644
--- a/frontend/nginx.conf.template
+++ b/frontend/nginx.conf.template
@@ -53,10 +53,10 @@ server {
         index index.html index.htm;
     }
 
-    location /replay/ {
-        proxy_pass http://${BACKEND_HOST}:8000;
-        proxy_set_header Host $http_host;
-        proxy_set_header X-Forwarded-Proto $scheme;
+    # used in both k8s and docker: RWP_BASE_URL set in Dockerfile
+    location /replay/sw.js {
+        add_header Content-Type application/javascript;
+        return 200 'importScripts("${RWP_BASE_URL}sw.js");';
     }
 
     # used by docker only: k8s deployment handles /api directly via ingress
diff --git a/frontend/src/index.html b/frontend/src/index.ejs
similarity index 84%
rename from frontend/src/index.html
rename to frontend/src/index.ejs
index ee16ed36..16972bff 100644
--- a/frontend/src/index.html
+++ b/frontend/src/index.ejs
@@ -8,6 +8,7 @@
   />
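
Usage note (not part of the patch): a minimal sketch of how a client might consume the renamed `.json` crawl endpoint together with the query-parameter auth added in users.py above. The base URL, archive id, crawl id, and token below are placeholders, and it assumes the crawl endpoint's dependency authenticates through the same BearerOrQueryTransport, so the JWT may be passed as `auth_bearer` instead of an Authorization header.

    # sketch only: the endpoint path, "auth_bearer" param, and "resources" field
    # come from this diff; host, ids, and token are assumed placeholders
    import asyncio

    import httpx


    async def fetch_crawl_resources(api_base, aid, crawl_id, token):
        async with httpx.AsyncClient() as client:
            resp = await client.get(
                f"{api_base}/archives/{aid}/crawls/{crawl_id}.json",
                params={"auth_bearer": token},
            )
            resp.raise_for_status()
            crawl = resp.json()
        # each entry in "resources" is a CrawlFileOut whose "path" is a presigned URL
        return [(f["name"], f["path"]) for f in crawl.get("resources", [])]


    if __name__ == "__main__":
        urls = asyncio.run(
            fetch_crawl_resources(
                "https://app.example.com/api", "<aid>", "<crawl-id>", "<jwt>"
            )
        )
        print(urls)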