Presign and replay (#127)
* Support replay via replayweb.page embed; fixes #124

  backend:
  - pre-sign all file URLs
  - cache pre-signed URLs in Redis; presign again when expired (default duration 3600s, settable via the PRESIGN_DURATION_SECONDS env var)
  - change 'files' output -> 'resources' to conform to the Data Package spec supported by replayweb.page
  - add CrawlFileOut, which contains 'name' (file id), 'path' (presigned URL), 'hash', and 'size'
  - add /replay/sw.js endpoint to import sw.js from the latest replayweb.page release
  - update to fastapi-users 9.2.2
  - customize backend auth to also check the 'auth_bearer' query arg if the 'Authorization' header is not set
  - remove the backend sw.js endpoint; now handled in the frontend

  frontend:
  - add <replay-web-page> to the frontend; include the rwp ui.js from the latest release in index.html for now
  - update the crawl API endpoint to end in .json, so replay-web-page can load the API endpoint directly
  - update the Crawl type to the new format: 'resources' instead of 'files', each entry with 'name' and 'path'
  - nginx: add an endpoint to serve the replay sw.js
  - add the defer attr to ui.js
  - rename 'Download' to 'Download Files'

* frontend: support customizing the replayweb.page loading URL via the RWP_BASE_URL env var in the Dockerfile
  - default prod value set in the frontend Dockerfile (set to the upcoming 1.5.8 release, needed for multi-WACZ-file support); can be overridden during image build via --build-arg
  - rename index.html -> index.ejs to allow interpolation
  - RWP_BASE_URL defaults to the latest https://replayweb.page/ for testing
  - for local testing, add sw.js loading via devServer, also using RWP_BASE_URL (#131)

Co-authored-by: sua yoo <sua@suayoo.com>
This commit is contained in:
parent
336cf11521
commit
adb5c835f2
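For orientation before the diff: the crawl detail endpoint now ends in .json and lists its files as Data Package style 'resources', which the <replay-web-page> embed loads directly as its source. A rough sketch of the shape, with made-up values only ('name' is the stored filename, 'path' is a presigned URL that expires after PRESIGN_DURATION_SECONDS, 3600 by default):

# illustrative only -- not a literal response from this commit
crawl_json = {
    # ...other CrawlOut fields (state, userName, configName, fileCount, etc.)
    "resources": [
        {
            "name": "crawl-20220101-example.wacz",  # file id / stored filename
            "path": "https://storage.example.com/crawl-20220101-example.wacz?X-Amz-Signature=...",  # presigned URL
            "hash": "...",
            "size": 123456789,
        },
    ],
}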
@@ -3,6 +3,7 @@
 import asyncio
 import json
 import uuid
+import os

 from typing import Optional, List, Dict, Union
 from datetime import datetime
@@ -14,6 +15,7 @@ import aioredis

 from db import BaseMongoModel
 from archives import Archive
+from storages import get_presigned_url


 # ============================================================================
@@ -32,7 +34,7 @@ class CrawlScale(BaseModel):

 # ============================================================================
 class CrawlFile(BaseModel):
-    """ output of a crawl """
+    """ file from a crawl """

     filename: str
     hash: str
@@ -40,6 +42,16 @@ class CrawlFile(BaseModel):
     def_storage_name: Optional[str]


+# ============================================================================
+class CrawlFileOut(BaseModel):
+    """ output for file from a crawl (conformance to Data Resource Spec) """
+
+    name: str
+    path: str
+    hash: str
+    size: int
+
+
 # ============================================================================
 class Crawl(BaseMongoModel):
     """ Store State of a Crawl (Finished or Running) """
@@ -74,6 +86,7 @@ class CrawlOut(Crawl):

     userName: Optional[str]
     configName: Optional[str]
+    resources: Optional[List[CrawlFileOut]] = []


 # ============================================================================
@@ -129,7 +142,7 @@ class CrawlCompleteIn(BaseModel):
 class CrawlOps:
     """ Crawl Ops """

-    # pylint: disable=too-many-arguments
+    # pylint: disable=too-many-arguments, too-many-instance-attributes
     def __init__(self, mdb, redis_url, users, crawl_manager, crawl_configs, archives):
         self.crawls = mdb["crawls"]
         self.crawl_manager = crawl_manager
@@ -138,6 +151,8 @@ class CrawlOps:
         self.archives = archives
         self.crawls_done_key = "crawls-done"

+        self.presign_duration = int(os.environ.get("PRESIGN_DURATION_SECONDS", 3600))
+
         self.redis = None
         asyncio.create_task(self.init_redis(redis_url))
         asyncio.create_task(self.init_index())
@@ -290,7 +305,7 @@ class CrawlOps:

         for crawl in running_crawls:
             list_crawl = ListCrawlOut(**crawl.dict())
-            crawls.append(await self._resolve_crawl(list_crawl, archive))
+            crawls.append(await self._resolve_crawl_refs(list_crawl, archive))

         crawls.extend(finished_crawls)

@@ -309,13 +324,18 @@ class CrawlOps:
                 status_code=404, detail=f"Crawl not found: {crawlid}"
             )

+        files = [CrawlFile(**data) for data in res["files"]]
+
+        del res["files"]
+
+        res["resources"] = await self._resolve_signed_urls(files, archive)
+
         crawl = CrawlOut.from_dict(res)

-        await self._resolve_filenames(crawl)
+        return await self._resolve_crawl_refs(crawl, archive)

-        return await self._resolve_crawl(crawl, archive)
-
-    async def _resolve_crawl(self, crawl: Union[CrawlOut, ListCrawlOut], archive):
+    async def _resolve_crawl_refs(
+        self, crawl: Union[CrawlOut, ListCrawlOut], archive: Archive
+    ):
         """ Resolve running crawl data """
         config = await self.crawl_configs.get_crawl_config(crawl.cid, archive)

@@ -328,6 +348,38 @@ class CrawlOps:

         return crawl

+    async def _resolve_signed_urls(self, files, archive: Archive):
+        if not files:
+            return
+
+        async with self.redis.pipeline(transaction=True) as pipe:
+            for file_ in files:
+                pipe.get(f"{file_.filename}")
+
+            results = await pipe.execute()
+
+        out_files = []
+
+        for file_, presigned_url in zip(files, results):
+            if not presigned_url:
+                presigned_url = await get_presigned_url(
+                    archive, file_, self.crawl_manager, self.presign_duration
+                )
+                await self.redis.setex(
+                    f"f:{file_.filename}", self.presign_duration - 1, presigned_url
+                )
+
+            out_files.append(
+                CrawlFileOut(
+                    name=file_.filename,
+                    path=presigned_url,
+                    hash=file_.hash,
+                    size=file_.size,
+                )
+            )
+
+        return out_files
+
     async def _resolve_filenames(self, crawl: CrawlOut):
         """ Resolve absolute filenames for each file """
         if not crawl.files:
@@ -448,21 +500,13 @@ def init_crawls_api(
         return {"deleted": res}

     @app.get(
-        "/archives/{aid}/crawls/{crawl_id}", tags=["crawls"], response_model=CrawlOut
+        "/archives/{aid}/crawls/{crawl_id}.json",
+        tags=["crawls"],
+        response_model=CrawlOut,
     )
     async def get_crawl(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
         return await ops.get_crawl(crawl_id, archive)

-    # @app.get(
-    #     "/archives/{aid}/crawls/{crawl_id}/running",
-    #     tags=["crawls"],
-    # )
-    # async def get_running(crawl_id, archive: Archive = Depends(archive_crawl_dep)):
-    #     if not crawl_manager.is_running(crawl_id, archive.id_str):
-    #         raise HTTPException(status_code=404, detail="No Such Crawl")
-    #
-    #     return {"running": True}
-
     @app.post(
         "/archives/{aid}/crawls/{crawl_id}/scale",
         tags=["crawls"],
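The caching above pairs each presigned URL with a Redis TTL one second shorter than the presign duration, so a cached entry always drops out before the signature it holds goes stale. A standalone sketch of that pattern (the cached_presign helper and the presign callback are illustrative names, not part of this commit):

import os

PRESIGN_DURATION = int(os.environ.get("PRESIGN_DURATION_SECONDS", 3600))

async def cached_presign(redis, key, presign):
    """Return a cached presigned URL, or generate and cache a fresh one."""
    url = await redis.get(key)
    if url:
        return url

    # presign() stands in for a storage-specific presign call
    url = await presign(PRESIGN_DURATION)

    # expire the cache entry just before the presigned URL itself expires
    await redis.setex(key, PRESIGN_DURATION - 1, url)
    return url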
@@ -372,9 +372,10 @@ class K8SManager:

         return self._default_storages[name]

-    async def _secret_data(self, storage, name):
-        """ decode secret storage data """
-        return base64.standard_b64decode(storage.data[name]).decode()
+    # pylint: disable=no-self-use
+    def _secret_data(self, secret, name):
+        """ decode secret data """
+        return base64.standard_b64decode(secret.data[name]).decode()

     async def get_running_crawl(self, name, aid):
         """Get running crawl (job) with given name, or none
@@ -5,7 +5,7 @@ supports docker and kubernetes based deployments of multiple browsertrix-crawler

 import os

-from fastapi import FastAPI, Response
+from fastapi import FastAPI

 from db import init_db

@@ -94,13 +94,6 @@ def main():
     async def healthz():
         return {}

-    @app.get("/replay/sw.js")
-    async def replay_sw():
-        return Response(
-            content='importScripts("https://cdn.jsdelivr.net/npm/replaywebpage@1.5.7/sw.js");',
-            media_type="application/javascript",
-        )
-

 # ============================================================================
 @app.on_event("startup")
@@ -1,6 +1,6 @@
 uvicorn
-fastapi==0.70.0
-fastapi-users[mongodb]==8.1.2
+fastapi==0.71.0
+fastapi-users[mongodb]==9.2.2
 loguru
 aiofiles
 kubernetes-asyncio
@@ -12,10 +12,15 @@ from pydantic import EmailStr, UUID4
 import passlib.pwd

 from fastapi import Request, Response, HTTPException, Depends
+from fastapi.security import OAuth2PasswordBearer

 from fastapi_users import FastAPIUsers, models, BaseUserManager
 from fastapi_users.manager import UserAlreadyExists
-from fastapi_users.authentication import JWTAuthentication
+from fastapi_users.authentication import (
+    AuthenticationBackend,
+    BearerTransport,
+    JWTStrategy,
+)
 from fastapi_users.db import MongoDBUserDatabase

 from invites import InvitePending, InviteRequest
@@ -252,31 +257,71 @@ def init_user_manager(mdb, emailsender, invites):
     return UserManager(user_db, emailsender, invites)


+# ============================================================================
+class OA2BearerOrQuery(OAuth2PasswordBearer):
+    """ Override bearer check to also test query """
+
+    async def __call__(self, request: Request) -> Optional[str]:
+        param = None
+        exc = None
+        try:
+            param = await super().__call__(request)
+            if param:
+                return param
+
+        # pylint: disable=broad-except
+        except Exception as super_exc:
+            exc = super_exc
+
+        param = request.query_params.get("auth_bearer")
+
+        if not param and exc:
+            raise exc
+
+        return param
+
+
+# ============================================================================
+class BearerOrQueryTransport(BearerTransport):
+    """ Bearer or Query Transport """
+
+    scheme: OA2BearerOrQuery
+
+    def __init__(self, tokenUrl: str):
+        # pylint: disable=super-init-not-called
+        self.scheme = OA2BearerOrQuery(tokenUrl, auto_error=False)
+
+
 # ============================================================================
 def init_users_api(app, user_manager):
     """ init fastapi_users """
-    jwt_authentication = JWTAuthentication(
-        secret=PASSWORD_SECRET,
-        lifetime_seconds=JWT_TOKEN_LIFETIME,
-        tokenUrl="auth/jwt/login",
+    bearer_transport = BearerOrQueryTransport(tokenUrl="auth/jwt/login")
+
+    def get_jwt_strategy() -> JWTStrategy:
+        return JWTStrategy(secret=PASSWORD_SECRET, lifetime_seconds=JWT_TOKEN_LIFETIME)
+
+    auth_backend = AuthenticationBackend(
+        name="jwt",
+        transport=bearer_transport,
+        get_strategy=get_jwt_strategy,
     )

     fastapi_users = FastAPIUsers(
         lambda: user_manager,
-        [jwt_authentication],
+        [auth_backend],
         User,
         UserCreateIn,
         UserUpdate,
         UserDB,
     )

-    auth_router = fastapi_users.get_auth_router(jwt_authentication)
+    auth_router = fastapi_users.get_auth_router(auth_backend)

     current_active_user = fastapi_users.current_user(active=True)

     @auth_router.post("/refresh")
     async def refresh_jwt(response: Response, user=Depends(current_active_user)):
-        return await jwt_authentication.get_login_response(user, response, user_manager)
+        return await auth_backend.login(get_jwt_strategy(), user, response)

     app.include_router(
         auth_router,
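With the bearer-or-query transport above, the same JWT is accepted either from the Authorization header (regular API clients) or from an auth_bearer query parameter, which is what the embedded <replay-web-page> viewer uses since it loads the API endpoint directly as its source URL. A hedged client-side sketch (httpx is assumed for illustration; the host, archive id, and crawl id are placeholders):

import httpx

API = "https://btrix.example.com/api"  # placeholder deployment URL
CRAWL_JSON = f"{API}/archives/<aid>/crawls/<crawl_id>.json"  # placeholder ids

def fetch_crawl(token: str) -> dict:
    # regular clients: JWT in the Authorization header
    via_header = httpx.get(CRAWL_JSON, headers={"Authorization": f"Bearer {token}"})
    via_header.raise_for_status()

    # replay-web-page embed: the same JWT via the auth_bearer query arg
    via_query = httpx.get(CRAWL_JSON, params={"auth_bearer": token})
    via_query.raise_for_status()

    # both are authorized the same way and return the same CrawlOut document
    return via_query.json()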
@@ -1,4 +1,8 @@
+# central place to configure the production replayweb.page loading prefix
+ARG RWP_BASE_URL=https://cdn.jsdelivr.net/npm/replaywebpage@1.5.8/
+
 FROM node:16 as build
+ARG RWP_BASE_URL

 WORKDIR /app
 COPY package.json .
@@ -8,9 +12,11 @@ COPY *.* ./
 COPY src ./src/
 RUN yarn build

 FROM nginx

+ARG RWP_BASE_URL
+ENV RWP_BASE_URL=${RWP_BASE_URL}
+
 COPY --from=build /app/dist /usr/share/nginx/html

 COPY ./nginx.conf.template /etc/nginx/templates/
@@ -53,10 +53,10 @@ server {
         index index.html index.htm;
     }

-    location /replay/ {
-        proxy_pass http://${BACKEND_HOST}:8000;
-        proxy_set_header Host $http_host;
-        proxy_set_header X-Forwarded-Proto $scheme;
+    # used in both k8s and docker: RWP_BASE_URL set in Dockerfile
+    location /replay/sw.js {
+        add_header Content-Type application/javascript;
+        return 200 'importScripts("${RWP_BASE_URL}sw.js");';
     }

     # used by docker only: k8s deployment handles /api directly via ingress
@@ -8,6 +8,7 @@
     />
     <title>Browsertrix Cloud</title>
     <base href="/" />
+    <script defer src="<%= rwp_base_url %>ui.js"></script>
   </head>
   <body>
     <browsertrix-app></browsertrix-app>
@@ -87,7 +87,7 @@ export class CrawlDetail extends LiteElement {
         </section>

         <section>
-          <h3 class="text-lg font-medium mb-2">${msg("Files")}</h3>
+          <h3 class="text-lg font-medium mb-2">${msg("Download Files")}</h3>
           ${this.renderFiles()}
         </section>
       </main>
@@ -97,6 +97,9 @@ export class CrawlDetail extends LiteElement {
   private renderWatch() {
     const isRunning = this.crawl?.state === "running";

+    const bearer = this.authState?.headers?.Authorization?.split(" ", 2)[1];
+    const fileJson = `/api/archives/${this.archiveId}/crawls/${this.crawlId}.json?auth_bearer=${bearer}`;
+
     return html`
       <div
         class="aspect-video rounded border ${isRunning
@@ -105,6 +108,8 @@ export class CrawlDetail extends LiteElement {
       >
         <!-- https://github.com/webrecorder/browsertrix-crawler/blob/9f541ab011e8e4bccf8de5bd7dc59b632c694bab/screencast/index.html -->
         [watch/replay]
+        ${this.crawl?.resources?.length ? html`<replay-web-page source="${fileJson}" coll="${this.crawl?.id}" replayBase="/replay/" noSandbox="true"></replay-web-page>` : ``}
+
       </div>
       <div
         class="absolute top-2 right-2 flex bg-white/90 hover:bg-white rounded-full"
@@ -318,23 +323,19 @@ export class CrawlDetail extends LiteElement {
   private renderFiles() {
     return html`
       <ul class="border rounded text-sm">
-        ${this.crawl?.files?.map(
+        ${this.crawl?.resources?.map(
           (file) => html`
             <li class="flex justify-between p-3 border-t first:border-t-0">
               <div>
                 <a
                   class="text-primary hover:underline"
-                  href=${file.filename}
+                  href=${file.path}
                   download
-                  title=${file.filename.slice(
-                    file.filename.lastIndexOf("/") + 1
+                  title=${file.name}
+                  >${file.name.slice(
+                    file.name.lastIndexOf("/") + 1
                   )}
-                  >${msg(
-                    str`Download ${file.filename.slice(
-                      file.filename.lastIndexOf(".")
-                    )}`
-                  )}</a
-                >
+                </a>
               </div>
               <div><sl-format-bytes value=${file.size}></sl-format-bytes></div>
             </li>
@@ -376,7 +377,7 @@ export class CrawlDetail extends LiteElement {
     // );

     const data: Crawl = await this.apiFetch(
-      `/archives/${this.archiveId}/crawls/${this.crawlId}`,
+      `/archives/${this.archiveId}/crawls/${this.crawlId}.json`,
      this.authState!
    );

@@ -11,7 +11,7 @@ export type Crawl = {
   state: string; // "running" | "complete" | "failed" | "partial_complete"
   scale: number;
   stats: { done: string; found: string } | null;
-  files?: { filename: string; hash: string; size: number }[];
+  resources?: { name: string; path: string; hash: string; size: number }[];
   fileCount?: number;
   fileSize?: number;
   completions?: number;
@@ -6,6 +6,10 @@ const CopyPlugin = require("copy-webpack-plugin");
 const Dotenv = require("dotenv-webpack");

 const isDevServer = process.env.WEBPACK_SERVE;
+
+// for testing: for prod, the Dockerfile should have the official prod version used
+const RWP_BASE_URL = process.env.RWP_BASE_URL || "https://replayweb.page/";
+
 const dotEnvPath = path.resolve(
   process.cwd(),
   `.env${isDevServer ? `.local` : ""}`
@@ -70,7 +74,6 @@ module.exports = {
         directory: shoelaceAssetsSrcPath,
         publicPath: "/" + shoelaceAssetsPublicPath,
       },
-
       {
         directory: path.join(__dirname),
         //publicPath: "/",
@@ -87,6 +90,13 @@ module.exports = {
         pathRewrite: { "^/api": "" },
       },
     },
+    // Serve replay service worker file
+    onBeforeSetupMiddleware: (server) => {
+      server.app.get("/replay/sw.js", (req, res) => {
+        res.set("Content-Type", "application/javascript");
+        res.send(`importScripts("${RWP_BASE_URL}sw.js")`);
+      });
+    },
     port: 9870,
   },

@@ -94,7 +104,10 @@ module.exports = {
     new Dotenv({ path: dotEnvPath }),

     new HtmlWebpackPlugin({
-      template: "src/index.html",
+      template: "src/index.ejs",
+      templateParameters: {
+        rwp_base_url: RWP_BASE_URL,
+      },
       // Need to block during local development for HMR:
       inject: isDevServer ? "head" : true,
       scriptLoading: isDevServer ? "blocking" : "defer",