Presign and replay (#127)

* support for replay via replayweb.page embed, fixes #124

backend:
- pre-sign all file URLs
- cache pre-signed URLs in Redis, presign again when expired (default duration 3600 seconds, settable via PRESIGN_DURATION_SECONDS env var)
- change files output -> resources to conform to Data Package spec supported by replayweb.page
- add CrawlFileOut which contains 'name' (file id), 'path' (presigned URL), 'hash', and 'size'
- add /replay/sw.js endpoint to import sw.js from latest replay-web-page release
- update to fastapi-users 9.2.2
- customize backend auth to also check the 'auth_bearer' query arg if the 'Authorization' header is not set (client sketch below)
- remove sw.js endpoint from backend, now handled in frontend
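
A minimal client-side sketch of the new flow (not part of this commit; host, ids, and token are placeholders): it requests the crawl's new .json endpoint with the auth_bearer query param and reads the pre-signed 'resources' entries.

    # illustrative only: placeholder host, archive id, crawl id, and JWT token
    import requests

    api = "https://browsertrix.example.com/api"
    url = f"{api}/archives/EXAMPLE_ARCHIVE_ID/crawls/EXAMPLE_CRAWL_ID.json"

    resp = requests.get(url, params={"auth_bearer": "EXAMPLE_JWT_TOKEN"})
    resp.raise_for_status()

    for res in resp.json().get("resources", []):
        # each entry carries 'name' (file id), 'path' (pre-signed URL), 'hash', and 'size'
        print(res["name"], res["path"])

Repeated requests within the presign duration should get the same cached URL back from Redis.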

frontend:
- add <replay-web-page> to frontend, include rwp ui.js from latest release in index.html for now
- update crawl API endpoint to end in .json
- replay-web-page loads the api endpoint directly! (example document after this list)
- update Crawl type to use new format: 'resources' instead of 'files', each file has 'name' and 'path'

- nginx: add location block to serve the replay sw.js bootstrap script
- add defer attr to ui.js
- move 'Download' wording from per-file links into the section heading, now 'Download Files'
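
For reference, the document that <replay-web-page> loads via its source attribute looks roughly like this (sketched as a Python literal; values are illustrative, only the field names come from this change):

    # illustrative shape of GET /archives/{aid}/crawls/{crawl_id}.json
    crawl_json = {
        "id": "example-crawl-id",
        "state": "complete",
        "resources": [
            {
                "name": "example-crawl-0.wacz",  # file id
                "path": "https://storage.example.com/example-crawl-0.wacz?signature=EXAMPLE",  # pre-signed URL
                "hash": "EXAMPLE_HASH",
                "size": 123456789,
            },
        ],
    }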

* frontend: support customizing the replayweb.page loading URL via RWP_BASE_URL env var in the Dockerfile
- default prod value set in frontend Dockerfile (set to upcoming 1.5.8 release, needed for multi-wacz-file support); can be overridden during image build via --build-arg
- rename index.html -> index.ejs to allow interpolation
- RWP_BASE_URL defaults to latest https://replayweb.page/ for testing
- for local testing, add sw.js loading via devServer, also using RWP_BASE_URL (#131)
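
For illustration, the interpolation yields strings like the following (a sketch only; the sample prefix matches the Dockerfile default in the diff below):

    # sketch of the strings derived from RWP_BASE_URL; not code from this commit
    RWP_BASE_URL = "https://cdn.jsdelivr.net/npm/replaywebpage@1.5.8/"

    # body served at /replay/sw.js by nginx (prod) and by the webpack devServer (local testing)
    sw_js = f'importScripts("{RWP_BASE_URL}sw.js");'

    # script tag injected into index.ejs at build time
    ui_tag = f'<script defer src="{RWP_BASE_URL}ui.js"></script>'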

Co-authored-by: sua yoo <sua@suayoo.com>
Ilya Kreymer 2022-01-31 17:02:15 -08:00 committed by GitHub
parent 336cf11521
commit adb5c835f2
11 changed files with 164 additions and 60 deletions

View File

@@ -3,6 +3,7 @@
 import asyncio
 import json
 import uuid
+import os
 from typing import Optional, List, Dict, Union
 from datetime import datetime
@@ -14,6 +15,7 @@ import aioredis
 from db import BaseMongoModel
 from archives import Archive
+from storages import get_presigned_url
 
 
 # ============================================================================
@@ -32,7 +34,7 @@ class CrawlScale(BaseModel):
 # ============================================================================
 class CrawlFile(BaseModel):
-    """ output of a crawl """
+    """ file from a crawl """
 
     filename: str
     hash: str
@@ -40,6 +42,16 @@ class CrawlFile(BaseModel):
     def_storage_name: Optional[str]
 
 
+# ============================================================================
+class CrawlFileOut(BaseModel):
+    """ output for file from a crawl (conformance to Data Resource Spec) """
+
+    name: str
+    path: str
+    hash: str
+    size: int
+
+
 # ============================================================================
 class Crawl(BaseMongoModel):
     """ Store State of a Crawl (Finished or Running) """
@@ -74,6 +86,7 @@ class CrawlOut(Crawl):
     userName: Optional[str]
     configName: Optional[str]
+    resources: Optional[List[CrawlFileOut]] = []
 
 
 # ============================================================================
@@ -129,7 +142,7 @@ class CrawlCompleteIn(BaseModel):
 class CrawlOps:
     """ Crawl Ops """
 
-    # pylint: disable=too-many-arguments
+    # pylint: disable=too-many-arguments, too-many-instance-attributes
     def __init__(self, mdb, redis_url, users, crawl_manager, crawl_configs, archives):
         self.crawls = mdb["crawls"]
         self.crawl_manager = crawl_manager
@@ -138,6 +151,8 @@ class CrawlOps:
         self.archives = archives
         self.crawls_done_key = "crawls-done"
 
+        self.presign_duration = int(os.environ.get("PRESIGN_DURATION_SECONDS", 3600))
+
        self.redis = None
         asyncio.create_task(self.init_redis(redis_url))
         asyncio.create_task(self.init_index())
@@ -290,7 +305,7 @@ class CrawlOps:
         for crawl in running_crawls:
             list_crawl = ListCrawlOut(**crawl.dict())
-            crawls.append(await self._resolve_crawl(list_crawl, archive))
+            crawls.append(await self._resolve_crawl_refs(list_crawl, archive))
 
         crawls.extend(finished_crawls)
@@ -309,13 +324,18 @@
                 status_code=404, detail=f"Crawl not found: {crawlid}"
             )
 
+        files = [CrawlFile(**data) for data in res["files"]]
+        del res["files"]
+
+        res["resources"] = await self._resolve_signed_urls(files, archive)
+
         crawl = CrawlOut.from_dict(res)
-        await self._resolve_filenames(crawl)
-        return await self._resolve_crawl(crawl, archive)
+        return await self._resolve_crawl_refs(crawl, archive)
 
-    async def _resolve_crawl(self, crawl: Union[CrawlOut, ListCrawlOut], archive):
+    async def _resolve_crawl_refs(
+        self, crawl: Union[CrawlOut, ListCrawlOut], archive: Archive
+    ):
         """ Resolve running crawl data """
         config = await self.crawl_configs.get_crawl_config(crawl.cid, archive)
@@ -328,6 +348,38 @@ class CrawlOps:
 
         return crawl
 
+    async def _resolve_signed_urls(self, files, archive: Archive):
+        if not files:
+            return
+
+        async with self.redis.pipeline(transaction=True) as pipe:
+            for file_ in files:
+                pipe.get(f"f:{file_.filename}")
+
+            results = await pipe.execute()
+
+        out_files = []
+
+        for file_, presigned_url in zip(files, results):
+            if not presigned_url:
+                presigned_url = await get_presigned_url(
+                    archive, file_, self.crawl_manager, self.presign_duration
+                )
+
+                await self.redis.setex(
+                    f"f:{file_.filename}", self.presign_duration - 1, presigned_url
+                )
+
+            out_files.append(
+                CrawlFileOut(
+                    name=file_.filename,
+                    path=presigned_url,
+                    hash=file_.hash,
+                    size=file_.size,
+                )
+            )
+
+        return out_files
+
     async def _resolve_filenames(self, crawl: CrawlOut):
         """ Resolve absolute filenames for each file """
         if not crawl.files:
@@ -448,21 +500,13 @@
         return {"deleted": res}
 
     @app.get(
-        "/archives/{aid}/crawls/{crawl_id}", tags=["crawls"], response_model=CrawlOut
+        "/archives/{aid}/crawls/{crawl_id}.json",
+        tags=["crawls"],
+        response_model=CrawlOut,
     )
     async def get_crawl(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
         return await ops.get_crawl(crawl_id, archive)
 
-    # @app.get(
-    #     "/archives/{aid}/crawls/{crawl_id}/running",
-    #     tags=["crawls"],
-    # )
-    # async def get_running(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
-    #     if not crawl_manager.is_running(crawl_id, archive.id_str):
-    #         raise HTTPException(status_code=404, detail="No Such Crawl")
-    #
-    #     return {"running": True}
-
     @app.post(
         "/archives/{aid}/crawls/{crawl_id}/scale",
         tags=["crawls"],

View File

@@ -372,9 +372,10 @@ class K8SManager:
         return self._default_storages[name]
 
-    async def _secret_data(self, storage, name):
-        """ decode secret storage data """
-        return base64.standard_b64decode(storage.data[name]).decode()
+    # pylint: disable=no-self-use
+    def _secret_data(self, secret, name):
+        """ decode secret data """
+        return base64.standard_b64decode(secret.data[name]).decode()
 
     async def get_running_crawl(self, name, aid):
         """Get running crawl (job) with given name, or none

View File

@@ -5,7 +5,7 @@ supports docker and kubernetes based deployments of multiple browsertrix-crawler
 
 import os
 
-from fastapi import FastAPI, Response
+from fastapi import FastAPI
 
 from db import init_db
@@ -94,13 +94,6 @@ def main():
     async def healthz():
         return {}
 
-    @app.get("/replay/sw.js")
-    async def replay_sw():
-        return Response(
-            content='importScripts("https://cdn.jsdelivr.net/npm/replaywebpage@1.5.7/sw.js");',
-            media_type="application/javascript",
-        )
-
 
 # ============================================================================
 @app.on_event("startup")

View File

@@ -1,6 +1,6 @@
 uvicorn
-fastapi==0.70.0
-fastapi-users[mongodb]==8.1.2
+fastapi==0.71.0
+fastapi-users[mongodb]==9.2.2
 loguru
 aiofiles
 kubernetes-asyncio

View File

@@ -12,10 +12,15 @@ from pydantic import EmailStr, UUID4
 import passlib.pwd
 
 from fastapi import Request, Response, HTTPException, Depends
+from fastapi.security import OAuth2PasswordBearer
 from fastapi_users import FastAPIUsers, models, BaseUserManager
 from fastapi_users.manager import UserAlreadyExists
-from fastapi_users.authentication import JWTAuthentication
+from fastapi_users.authentication import (
+    AuthenticationBackend,
+    BearerTransport,
+    JWTStrategy,
+)
 from fastapi_users.db import MongoDBUserDatabase
 
 from invites import InvitePending, InviteRequest
@@ -252,31 +257,71 @@ def init_user_manager(mdb, emailsender, invites):
     return UserManager(user_db, emailsender, invites)
 
 
+# ============================================================================
+class OA2BearerOrQuery(OAuth2PasswordBearer):
+    """ Override bearer check to also test query """
+
+    async def __call__(self, request: Request) -> Optional[str]:
+        param = None
+        exc = None
+        try:
+            param = await super().__call__(request)
+            if param:
+                return param
+        # pylint: disable=broad-except
+        except Exception as super_exc:
+            exc = super_exc
+
+        param = request.query_params.get("auth_bearer")
+
+        if not param and exc:
+            raise exc
+
+        return param
+
+
+# ============================================================================
+class BearerOrQueryTransport(BearerTransport):
+    """ Bearer or Query Transport """
+
+    scheme: OA2BearerOrQuery
+
+    def __init__(self, tokenUrl: str):
+        # pylint: disable=super-init-not-called
+        self.scheme = OA2BearerOrQuery(tokenUrl, auto_error=False)
+
+
 # ============================================================================
 def init_users_api(app, user_manager):
     """ init fastapi_users """
 
-    jwt_authentication = JWTAuthentication(
-        secret=PASSWORD_SECRET,
-        lifetime_seconds=JWT_TOKEN_LIFETIME,
-        tokenUrl="auth/jwt/login",
+    bearer_transport = BearerOrQueryTransport(tokenUrl="auth/jwt/login")
+
+    def get_jwt_strategy() -> JWTStrategy:
+        return JWTStrategy(secret=PASSWORD_SECRET, lifetime_seconds=JWT_TOKEN_LIFETIME)
+
+    auth_backend = AuthenticationBackend(
+        name="jwt",
+        transport=bearer_transport,
+        get_strategy=get_jwt_strategy,
     )
 
     fastapi_users = FastAPIUsers(
         lambda: user_manager,
-        [jwt_authentication],
+        [auth_backend],
         User,
         UserCreateIn,
         UserUpdate,
         UserDB,
     )
 
-    auth_router = fastapi_users.get_auth_router(jwt_authentication)
+    auth_router = fastapi_users.get_auth_router(auth_backend)
 
     current_active_user = fastapi_users.current_user(active=True)
 
     @auth_router.post("/refresh")
     async def refresh_jwt(response: Response, user=Depends(current_active_user)):
-        return await jwt_authentication.get_login_response(user, response, user_manager)
+        return await auth_backend.login(get_jwt_strategy(), user, response)
 
     app.include_router(
         auth_router,

View File

@@ -1,4 +1,8 @@
+# central place to configure the production replayweb.page loading prefix
+ARG RWP_BASE_URL=https://cdn.jsdelivr.net/npm/replaywebpage@1.5.8/
+
 FROM node:16 as build
+ARG RWP_BASE_URL
 
 WORKDIR /app
 
 COPY package.json .
@@ -8,9 +12,11 @@ COPY *.* ./
 COPY src ./src/
 
 RUN yarn build
 
 FROM nginx
+ARG RWP_BASE_URL
+ENV RWP_BASE_URL=${RWP_BASE_URL}
 
 COPY --from=build /app/dist /usr/share/nginx/html
 COPY ./nginx.conf.template /etc/nginx/templates/

View File

@@ -53,10 +53,10 @@ server {
         index index.html index.htm;
     }
 
-    location /replay/ {
-        proxy_pass http://${BACKEND_HOST}:8000;
-        proxy_set_header Host $http_host;
-        proxy_set_header X-Forwarded-Proto $scheme;
+    # used in both k8s and docker: RWP_BASE_URL set in Dockerfile
+    location /replay/sw.js {
+        add_header Content-Type application/javascript;
+        return 200 'importScripts("${RWP_BASE_URL}sw.js");';
     }
 
     # used by docker only: k8s deployment handles /api directly via ingress

View File

@@ -8,6 +8,7 @@
     />
     <title>Browsertrix Cloud</title>
     <base href="/" />
+    <script defer src="<%= rwp_base_url %>ui.js"></script>
   </head>
   <body>
     <browsertrix-app></browsertrix-app>

View File

@@ -87,7 +87,7 @@ export class CrawlDetail extends LiteElement {
           </section>
 
           <section>
-            <h3 class="text-lg font-medium mb-2">${msg("Files")}</h3>
+            <h3 class="text-lg font-medium mb-2">${msg("Download Files")}</h3>
             ${this.renderFiles()}
           </section>
         </main>
@@ -97,6 +97,9 @@ export class CrawlDetail extends LiteElement {
   private renderWatch() {
     const isRunning = this.crawl?.state === "running";
 
+    const bearer = this.authState?.headers?.Authorization?.split(" ", 2)[1];
+    const fileJson = `/api/archives/${this.archiveId}/crawls/${this.crawlId}.json?auth_bearer=${bearer}`;
+
     return html`
       <div
         class="aspect-video rounded border ${isRunning
@@ -105,6 +108,8 @@ export class CrawlDetail extends LiteElement {
       >
         <!-- https://github.com/webrecorder/browsertrix-crawler/blob/9f541ab011e8e4bccf8de5bd7dc59b632c694bab/screencast/index.html -->
         [watch/replay]
+
+        ${this.crawl?.resources?.length ? html`<replay-web-page source="${fileJson}" coll="${this.crawl?.id}" replayBase="/replay/" noSandbox="true"></replay-web-page>` : ``}
       </div>
       <div
         class="absolute top-2 right-2 flex bg-white/90 hover:bg-white rounded-full"
@@ -318,23 +323,19 @@ export class CrawlDetail extends LiteElement {
   private renderFiles() {
     return html`
       <ul class="border rounded text-sm">
-        ${this.crawl?.files?.map(
+        ${this.crawl?.resources?.map(
           (file) => html`
            <li class="flex justify-between p-3 border-t first:border-t-0">
              <div>
                <a
                  class="text-primary hover:underline"
-                  href=${file.filename}
+                  href=${file.path}
                  download
-                  title=${file.filename.slice(
-                    file.filename.lastIndexOf("/") + 1
+                  title=${file.name}
+                  >${file.name.slice(
+                    file.name.lastIndexOf("/") + 1
                  )}
-                  >${msg(
-                    str`Download ${file.filename.slice(
-                      file.filename.lastIndexOf(".")
-                    )}`
-                  )}</a
-                >
+                </a>
              </div>
              <div><sl-format-bytes value=${file.size}></sl-format-bytes></div>
            </li>
@@ -376,7 +377,7 @@ export class CrawlDetail extends LiteElement {
     // );
 
     const data: Crawl = await this.apiFetch(
-      `/archives/${this.archiveId}/crawls/${this.crawlId}`,
+      `/archives/${this.archiveId}/crawls/${this.crawlId}.json`,
       this.authState!
     );

View File

@@ -11,7 +11,7 @@ export type Crawl = {
   state: string; // "running" | "complete" | "failed" | "partial_complete"
   scale: number;
   stats: { done: string; found: string } | null;
-  files?: { filename: string; hash: string; size: number }[];
+  resources?: { name: string; path: string; hash: string; size: number }[];
   fileCount?: number;
   fileSize?: number;
   completions?: number;

View File

@@ -6,6 +6,10 @@ const CopyPlugin = require("copy-webpack-plugin");
 const Dotenv = require("dotenv-webpack");
 
 const isDevServer = process.env.WEBPACK_SERVE;
 
+// for testing: for prod, the Dockerfile should have the official prod version used
+const RWP_BASE_URL = process.env.RWP_BASE_URL || "https://replayweb.page/";
+
 const dotEnvPath = path.resolve(
   process.cwd(),
   `.env${isDevServer ? `.local` : ""}`
@@ -70,7 +74,6 @@ module.exports = {
         directory: shoelaceAssetsSrcPath,
         publicPath: "/" + shoelaceAssetsPublicPath,
       },
-
      {
        directory: path.join(__dirname),
        //publicPath: "/",
@@ -87,6 +90,13 @@ module.exports = {
        pathRewrite: { "^/api": "" },
      },
    },
+    // Serve replay service worker file
+    onBeforeSetupMiddleware: (server) => {
+      server.app.get("/replay/sw.js", (req, res) => {
+        res.set("Content-Type", "application/javascript");
+        res.send(`importScripts("${RWP_BASE_URL}sw.js")`);
+      });
+    },
    port: 9870,
  },
@@ -94,7 +104,10 @@ module.exports = {
    new Dotenv({ path: dotEnvPath }),
    new HtmlWebpackPlugin({
-      template: "src/index.html",
+      template: "src/index.ejs",
+      templateParameters: {
+        rwp_base_url: RWP_BASE_URL,
+      },
      // Need to block during local development for HMR:
      inject: isDevServer ? "head" : true,
      scriptLoading: isDevServer ? "blocking" : "defer",