Presign and replay (#127)

* support for replay via replayweb.page embed, fixes #124

backend:
- pre-sign all file URLs
- cache pre-signed URLs in Redis, presign again when expired (default duration 3600 seconds, settable via PRESIGN_DURATION_SECONDS env var); see the sketch after this list
- change files output -> resources to conform to the Data Package spec supported by replayweb.page
- add CrawlFileOut which contains 'name' (file id), 'path' (presigned URL), 'hash', and 'size'
- add /replay/sw.js endpoint to import sw.js from latest replay-web-page release
- update to fastapi-users 9.2.2
- customize backend auth to also check the 'auth_bearer' query arg if the 'Authorization' header is not set
- remove sw.js endpoint, now handled in the frontend
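A minimal sketch of the presign-and-cache flow described above (it assumes the `get_presigned_url` helper and the `CrawlFileOut` model from the backend diff below, plus an async Redis client; the actual change also batches cache lookups with a Redis pipeline):

```python
import os
from storages import get_presigned_url  # as in the backend diff below

PRESIGN_DURATION = int(os.environ.get("PRESIGN_DURATION_SECONDS", 3600))

async def resolve_signed_urls(redis, files, archive, crawl_manager):
    """Return CrawlFileOut entries, caching presigned URLs in Redis."""
    out_files = []
    for file_ in files:
        key = f"f:{file_.filename}"
        presigned_url = await redis.get(key)
        if not presigned_url:
            presigned_url = await get_presigned_url(
                archive, file_, crawl_manager, PRESIGN_DURATION
            )
            # expire the cached entry just before the signed URL itself expires
            await redis.setex(key, PRESIGN_DURATION - 1, presigned_url)
        out_files.append(
            CrawlFileOut(
                name=file_.filename,
                path=presigned_url,
                hash=file_.hash,
                size=file_.size,
            )
        )
    return out_files
```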

frontend:
- add <replay-web-page> to frontend, include rwp ui.js from latest release in index.html for now
- update crawl API endpoint to end in .json
- replay-web-page loads the API endpoint directly! (see the client sketch after this list)
- update Crawl type to use the new format: 'resources' instead of 'files'; each file has 'name' and 'path'

- nginx: add endpoint to serve the replay sw.js
- add defer attr to ui.js script tag
- rename 'Files' heading to 'Download Files'
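For illustration, a client can fetch the new .json endpoint the same way the embedded <replay-web-page> does, passing the token via the auth_bearer query arg instead of an Authorization header (hypothetical host, ids, and token; sketch only):

```python
import requests

# placeholder values for illustration only
api_base = "https://app.example.com/api"
archive_id = "<archive uuid>"
crawl_id = "<crawl id>"
jwt_token = "<jwt access token>"

# same URL shape the frontend builds for <replay-web-page source="...">
url = f"{api_base}/archives/{archive_id}/crawls/{crawl_id}.json"
resp = requests.get(url, params={"auth_bearer": jwt_token})
resp.raise_for_status()

# 'resources' lists {name, path, hash, size}, where 'path' is a presigned URL
for resource in resp.json()["resources"]:
    print(resource["name"], resource["size"])
```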

* frontend: support customizing replayweb.page loading URL via RWP_BASE_URL env var in Dockerfile
- default prod value set in frontend Dockerfile (set to upcoming 1.5.8 release, needed for multi-WACZ file support); can be overridden during image build via --build-arg
- rename index.html -> index.ejs to allow interpolation
- RWP_BASE_URL defaults to latest https://replayweb.page/ for testing
- for local testing, add sw.js loading via devServer, also using RWP_BASE_URL (#131)

Co-authored-by: sua yoo <sua@suayoo.com>

View File

@@ -3,6 +3,7 @@
import asyncio
import json
import uuid
import os
from typing import Optional, List, Dict, Union
from datetime import datetime
@@ -14,6 +15,7 @@ import aioredis
from db import BaseMongoModel
from archives import Archive
from storages import get_presigned_url
# ============================================================================
@@ -32,7 +34,7 @@ class CrawlScale(BaseModel):
# ============================================================================
class CrawlFile(BaseModel):
""" output of a crawl """
""" file from a crawl """
filename: str
hash: str
@@ -40,6 +42,16 @@ class CrawlFile(BaseModel):
def_storage_name: Optional[str]
# ============================================================================
class CrawlFileOut(BaseModel):
""" output for file from a crawl (conformance to Data Resource Spec) """
name: str
path: str
hash: str
size: int
# ============================================================================
class Crawl(BaseMongoModel):
""" Store State of a Crawl (Finished or Running) """
@@ -74,6 +86,7 @@ class CrawlOut(Crawl):
userName: Optional[str]
configName: Optional[str]
resources: Optional[List[CrawlFileOut]] = []
# ============================================================================
@@ -129,7 +142,7 @@ class CrawlCompleteIn(BaseModel):
class CrawlOps:
""" Crawl Ops """
# pylint: disable=too-many-arguments
# pylint: disable=too-many-arguments, too-many-instance-attributes
def __init__(self, mdb, redis_url, users, crawl_manager, crawl_configs, archives):
self.crawls = mdb["crawls"]
self.crawl_manager = crawl_manager
@@ -138,6 +151,8 @@ class CrawlOps:
self.archives = archives
self.crawls_done_key = "crawls-done"
self.presign_duration = int(os.environ.get("PRESIGN_DURATION_SECONDS", 3600))
self.redis = None
asyncio.create_task(self.init_redis(redis_url))
asyncio.create_task(self.init_index())
@@ -290,7 +305,7 @@ class CrawlOps:
for crawl in running_crawls:
list_crawl = ListCrawlOut(**crawl.dict())
crawls.append(await self._resolve_crawl(list_crawl, archive))
crawls.append(await self._resolve_crawl_refs(list_crawl, archive))
crawls.extend(finished_crawls)
@@ -309,13 +324,18 @@ class CrawlOps:
status_code=404, detail=f"Crawl not found: {crawlid}"
)
files = [CrawlFile(**data) for data in res["files"]]
del res["files"]
res["resources"] = await self._resolve_signed_urls(files, archive)
crawl = CrawlOut.from_dict(res)
await self._resolve_filenames(crawl)
return await self._resolve_crawl_refs(crawl, archive)
return await self._resolve_crawl(crawl, archive)
async def _resolve_crawl(self, crawl: Union[CrawlOut, ListCrawlOut], archive):
async def _resolve_crawl_refs(
self, crawl: Union[CrawlOut, ListCrawlOut], archive: Archive
):
""" Resolve running crawl data """
config = await self.crawl_configs.get_crawl_config(crawl.cid, archive)
@@ -328,6 +348,38 @@ class CrawlOps:
return crawl
async def _resolve_signed_urls(self, files, archive: Archive):
if not files:
return
async with self.redis.pipeline(transaction=True) as pipe:
for file_ in files:
pipe.get(f"f:{file_.filename}")
results = await pipe.execute()
out_files = []
for file_, presigned_url in zip(files, results):
if not presigned_url:
presigned_url = await get_presigned_url(
archive, file_, self.crawl_manager, self.presign_duration
)
await self.redis.setex(
f"f:{file_.filename}", self.presign_duration - 1, presigned_url
)
out_files.append(
CrawlFileOut(
name=file_.filename,
path=presigned_url,
hash=file_.hash,
size=file_.size,
)
)
return out_files
async def _resolve_filenames(self, crawl: CrawlOut):
""" Resolve absolute filenames for each file """
if not crawl.files:
@@ -448,21 +500,13 @@ def init_crawls_api(
return {"deleted": res}
@app.get(
"/archives/{aid}/crawls/{crawl_id}", tags=["crawls"], response_model=CrawlOut
"/archives/{aid}/crawls/{crawl_id}.json",
tags=["crawls"],
response_model=CrawlOut,
)
async def get_crawl(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
return await ops.get_crawl(crawl_id, archive)
# @app.get(
# "/archives/{aid}/crawls/{crawl_id}/running",
# tags=["crawls"],
# )
# async def get_running(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
# if not crawl_manager.is_running(crawl_id, archive.id_str):
# raise HTTPException(status_code=404, detail="No Such Crawl")
#
# return {"running": True}
@app.post(
"/archives/{aid}/crawls/{crawl_id}/scale",
tags=["crawls"],

View File

@@ -372,9 +372,10 @@ class K8SManager:
return self._default_storages[name]
async def _secret_data(self, storage, name):
""" decode secret storage data """
return base64.standard_b64decode(storage.data[name]).decode()
# pylint: disable=no-self-use
def _secret_data(self, secret, name):
""" decode secret data """
return base64.standard_b64decode(secret.data[name]).decode()
async def get_running_crawl(self, name, aid):
"""Get running crawl (job) with given name, or none

View File

@@ -5,7 +5,7 @@ supports docker and kubernetes based deployments of multiple browsertrix-crawler
import os
from fastapi import FastAPI, Response
from fastapi import FastAPI
from db import init_db
@@ -94,13 +94,6 @@ def main():
async def healthz():
return {}
@app.get("/replay/sw.js")
async def replay_sw():
return Response(
content='importScripts("https://cdn.jsdelivr.net/npm/replaywebpage@1.5.7/sw.js");',
media_type="application/javascript",
)
# ============================================================================
@app.on_event("startup")

View File

@@ -1,6 +1,6 @@
uvicorn
fastapi==0.70.0
fastapi-users[mongodb]==8.1.2
fastapi==0.71.0
fastapi-users[mongodb]==9.2.2
loguru
aiofiles
kubernetes-asyncio

View File

@@ -12,10 +12,15 @@ from pydantic import EmailStr, UUID4
import passlib.pwd
from fastapi import Request, Response, HTTPException, Depends
from fastapi.security import OAuth2PasswordBearer
from fastapi_users import FastAPIUsers, models, BaseUserManager
from fastapi_users.manager import UserAlreadyExists
from fastapi_users.authentication import JWTAuthentication
from fastapi_users.authentication import (
AuthenticationBackend,
BearerTransport,
JWTStrategy,
)
from fastapi_users.db import MongoDBUserDatabase
from invites import InvitePending, InviteRequest
@@ -252,31 +257,71 @@ def init_user_manager(mdb, emailsender, invites):
return UserManager(user_db, emailsender, invites)
# ============================================================================
class OA2BearerOrQuery(OAuth2PasswordBearer):
""" Override bearer check to also test query """
async def __call__(self, request: Request) -> Optional[str]:
param = None
exc = None
try:
param = await super().__call__(request)
if param:
return param
# pylint: disable=broad-except
except Exception as super_exc:
exc = super_exc
param = request.query_params.get("auth_bearer")
if not param and exc:
raise exc
return param
# ============================================================================
class BearerOrQueryTransport(BearerTransport):
""" Bearer or Query Transport """
scheme: OA2BearerOrQuery
def __init__(self, tokenUrl: str):
# pylint: disable=super-init-not-called
self.scheme = OA2BearerOrQuery(tokenUrl, auto_error=False)
# ============================================================================
def init_users_api(app, user_manager):
""" init fastapi_users """
jwt_authentication = JWTAuthentication(
secret=PASSWORD_SECRET,
lifetime_seconds=JWT_TOKEN_LIFETIME,
tokenUrl="auth/jwt/login",
bearer_transport = BearerOrQueryTransport(tokenUrl="auth/jwt/login")
def get_jwt_strategy() -> JWTStrategy:
return JWTStrategy(secret=PASSWORD_SECRET, lifetime_seconds=JWT_TOKEN_LIFETIME)
auth_backend = AuthenticationBackend(
name="jwt",
transport=bearer_transport,
get_strategy=get_jwt_strategy,
)
fastapi_users = FastAPIUsers(
lambda: user_manager,
[jwt_authentication],
[auth_backend],
User,
UserCreateIn,
UserUpdate,
UserDB,
)
auth_router = fastapi_users.get_auth_router(jwt_authentication)
auth_router = fastapi_users.get_auth_router(auth_backend)
current_active_user = fastapi_users.current_user(active=True)
@auth_router.post("/refresh")
async def refresh_jwt(response: Response, user=Depends(current_active_user)):
return await jwt_authentication.get_login_response(user, response, user_manager)
return await auth_backend.login(get_jwt_strategy(), user, response)
app.include_router(
auth_router,

View File

@@ -1,4 +1,8 @@
# central place to configure the production replayweb.page loading prefix
ARG RWP_BASE_URL=https://cdn.jsdelivr.net/npm/replaywebpage@1.5.8/
FROM node:16 as build
ARG RWP_BASE_URL
WORKDIR /app
COPY package.json .
@@ -8,9 +12,11 @@ COPY *.* ./
COPY src ./src/
RUN yarn build
FROM nginx
ARG RWP_BASE_URL
ENV RWP_BASE_URL=${RWP_BASE_URL}
COPY --from=build /app/dist /usr/share/nginx/html
COPY ./nginx.conf.template /etc/nginx/templates/

View File

@@ -53,10 +53,10 @@ server {
index index.html index.htm;
}
location /replay/ {
proxy_pass http://${BACKEND_HOST}:8000;
proxy_set_header Host $http_host;
proxy_set_header X-Forwarded-Proto $scheme;
# used in both k8s and docker: RWP_BASE_URL set in Dockerfile
location /replay/sw.js {
add_header Content-Type application/javascript;
return 200 'importScripts("${RWP_BASE_URL}sw.js");';
}
# used by docker only: k8s deployment handles /api directly via ingress

View File

@@ -8,6 +8,7 @@
/>
<title>Browsertrix Cloud</title>
<base href="/" />
<script defer src="<%= rwp_base_url %>ui.js"></script>
</head>
<body>
<browsertrix-app></browsertrix-app>

View File

@@ -87,7 +87,7 @@ export class CrawlDetail extends LiteElement {
</section>
<section>
<h3 class="text-lg font-medium mb-2">${msg("Files")}</h3>
<h3 class="text-lg font-medium mb-2">${msg("Download Files")}</h3>
${this.renderFiles()}
</section>
</main>
@@ -97,6 +97,9 @@ export class CrawlDetail extends LiteElement {
private renderWatch() {
const isRunning = this.crawl?.state === "running";
const bearer = this.authState?.headers?.Authorization?.split(" ", 2)[1];
const fileJson = `/api/archives/${this.archiveId}/crawls/${this.crawlId}.json?auth_bearer=${bearer}`;
return html`
<div
class="aspect-video rounded border ${isRunning
@@ -105,6 +108,8 @@ export class CrawlDetail extends LiteElement {
>
<!-- https://github.com/webrecorder/browsertrix-crawler/blob/9f541ab011e8e4bccf8de5bd7dc59b632c694bab/screencast/index.html -->
[watch/replay]
${this.crawl?.resources?.length ? html`<replay-web-page source="${fileJson}" coll="${this.crawl?.id}" replayBase="/replay/" noSandbox="true"></replay-web-page>` : ``}
</div>
<div
class="absolute top-2 right-2 flex bg-white/90 hover:bg-white rounded-full"
@@ -318,23 +323,19 @@ export class CrawlDetail extends LiteElement {
private renderFiles() {
return html`
<ul class="border rounded text-sm">
${this.crawl?.files?.map(
${this.crawl?.resources?.map(
(file) => html`
<li class="flex justify-between p-3 border-t first:border-t-0">
<div>
<a
class="text-primary hover:underline"
href=${file.filename}
href=${file.path}
download
title=${file.filename.slice(
file.filename.lastIndexOf("/") + 1
title=${file.name}
>${file.name.slice(
file.name.lastIndexOf("/") + 1
)}
>${msg(
str`Download ${file.filename.slice(
file.filename.lastIndexOf(".")
)}`
)}</a
>
</a>
</div>
<div><sl-format-bytes value=${file.size}></sl-format-bytes></div>
</li>
@@ -376,7 +377,7 @@ export class CrawlDetail extends LiteElement {
// );
const data: Crawl = await this.apiFetch(
`/archives/${this.archiveId}/crawls/${this.crawlId}`,
`/archives/${this.archiveId}/crawls/${this.crawlId}.json`,
this.authState!
);

View File

@@ -11,7 +11,7 @@ export type Crawl = {
state: string; // "running" | "complete" | "failed" | "partial_complete"
scale: number;
stats: { done: string; found: string } | null;
files?: { filename: string; hash: string; size: number }[];
resources?: { name: string; path: string; hash: string; size: number }[];
fileCount?: number;
fileSize?: number;
completions?: number;

View File

@@ -6,6 +6,10 @@ const CopyPlugin = require("copy-webpack-plugin");
const Dotenv = require("dotenv-webpack");
const isDevServer = process.env.WEBPACK_SERVE;
// for testing: for prod, the Dockerfile should have the official prod version used
const RWP_BASE_URL = process.env.RWP_BASE_URL || "https://replayweb.page/";
const dotEnvPath = path.resolve(
process.cwd(),
`.env${isDevServer ? `.local` : ""}`
@@ -70,7 +74,6 @@ module.exports = {
directory: shoelaceAssetsSrcPath,
publicPath: "/" + shoelaceAssetsPublicPath,
},
{
directory: path.join(__dirname),
//publicPath: "/",
@@ -87,6 +90,13 @@ module.exports = {
pathRewrite: { "^/api": "" },
},
},
// Serve replay service worker file
onBeforeSetupMiddleware: (server) => {
server.app.get("/replay/sw.js", (req, res) => {
res.set("Content-Type", "application/javascript");
res.send(`importScripts("${RWP_BASE_URL}sw.js")`);
});
},
port: 9870,
},
@@ -94,7 +104,10 @@ module.exports = {
new Dotenv({ path: dotEnvPath }),
new HtmlWebpackPlugin({
template: "src/index.html",
template: "src/index.ejs",
templateParameters: {
rwp_base_url: RWP_BASE_URL,
},
// Need to block during local development for HMR:
inject: isDevServer ? "head" : true,
scriptLoading: isDevServer ? "blocking" : "defer",