Implement downloading archived item + QA runs as multi-WACZ (#1933)

Fixes #1412 

## Changes

### Backend
- Adds `all-crawls`, `crawls`, and `uploads` API endpoints to download an
archived item as a multi-WACZ (see the usage sketch after this list)
- Adds an endpoint to download QA runs as a multi-WACZ
- Adds backend tests for the new endpoints
- Updates to a new version of the stream-zip library that no longer requires a
CRC-32 to be present for ZIP members and instead computes it after streaming,
fixing invalid CRC-32 issues, since CRC-32s previously computed by the crawler
may be invalid (a minimal assembly sketch also follows this list)
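
As a usage sketch of the new endpoints (the base URL, IDs, and bearer token below are hypothetical placeholders, not values from this PR), the streamed multi-WACZ can be saved to disk with `requests`, mirroring the new backend tests:

```python
import requests

API_PREFIX = "https://btrix.example.com/api"  # hypothetical deployment URL
ORG_ID = "<org-uuid>"  # placeholder
CRAWL_ID = "<crawl-id>"  # placeholder
headers = {"Authorization": "Bearer <access-token>"}  # placeholder token

# Stream the multi-WACZ for an archived item to disk without buffering it
# all in memory; the body is a ZIP of WACZ files plus a datapackage.json
with requests.get(
    f"{API_PREFIX}/orgs/{ORG_ID}/all-crawls/{CRAWL_ID}/download",
    headers=headers,
    stream=True,
) as r:
    r.raise_for_status()
    with open("archived-item.wacz", "wb") as fh:
        for chunk in r.iter_content(chunk_size=64 * 1024):
            fh.write(chunk)
```

The same pattern applies to the `crawls`, `uploads`, and QA run `download` routes added in the diffs below.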
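
A minimal sketch of the CRC-32 change at work when the multi-WACZ is assembled, loosely following the `member_files()` shape in the `StorageOps` hunk below (the `files` dict and `fetch()` helper are hypothetical stand-ins, and the placeholder-CRC behavior assumes the forked stream-zip pinned in the requirements hunk):

```python
from datetime import datetime

from stream_zip import stream_zip, NO_COMPRESSION_64

files = {"crawl-1.wacz": b"a" * 24, "crawl-2.wacz": b"b" * 24}  # placeholder members

def fetch(name):
    # Stand-in for streaming a member's bytes from object storage in chunks
    yield files[name]

def member_files():
    modified_at = datetime(year=1980, month=1, day=1)  # fixed date for determinism
    perms = 0o664
    for name, data in files.items():
        # With the crc32-optional fork, a placeholder CRC-32 of 0 is accepted
        # and the real value is computed after each member has been streamed
        yield (name, modified_at, perms, NO_COMPRESSION_64(len(data), 0), fetch(name))

# Each yielded chunk is written to the streaming HTTP response in the real code
for chunk in stream_zip(member_files(), chunk_size=32768):
    pass
```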

### Frontend

Adds the ability to download an archived item from:

- A button in the archived item detail Files tab
- The archived item detail actions menu
- The archived items list menu



---------

Co-authored-by: Henry Wilkinson <henry@wilkinson.graphics>
Co-authored-by: sua yoo <sua@webrecorder.org>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Committed by Tessa Walsh on 2024-07-25 13:28:57 -04:00 (commit 27ee16d308). 11 changed files with 224 additions and 38 deletions.

@@ -8,6 +8,7 @@ import urllib.parse
import asyncio
from fastapi import HTTPException, Depends
from fastapi.responses import StreamingResponse
from .models import (
CrawlFile,
@@ -797,6 +798,20 @@ class BaseCrawlOps:
"firstSeeds": list(first_seeds),
}
async def download_crawl_as_single_wacz(self, crawl_id: str, org: Organization):
"""Download all WACZs in archived item as streaming nested WACZ"""
crawl = await self.get_crawl_out(crawl_id, org)
if not crawl.resources:
raise HTTPException(status_code=400, detail="no_crawl_resources")
resp = await self.storage_ops.download_streaming_wacz(org, crawl.resources)
headers = {"Content-Disposition": f'attachment; filename="{crawl_id}.wacz"'}
return StreamingResponse(
resp, headers=headers, media_type="application/wacz+zip"
)
async def calculate_org_crawl_file_storage(
self, oid: UUID, type_: Optional[str] = None
) -> Tuple[int, int, int]:
@@ -928,6 +943,16 @@ def init_base_crawls_api(app, user_dep, *args):
async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
return await ops.get_crawl_out(crawl_id, org)
@app.get(
"/orgs/{oid}/all-crawls/{crawl_id}/download",
tags=["all-crawls"],
response_model=bytes,
)
async def download_base_crawl_as_single_wacz(
crawl_id: str, org: Organization = Depends(org_viewer_dep)
):
return await ops.download_crawl_as_single_wacz(crawl_id, org)
@app.patch(
"/orgs/{oid}/all-crawls/{crawl_id}",
tags=["all-crawls"],

@@ -1008,6 +1008,28 @@ class CrawlOps(BaseCrawlOps):
return QARunWithResources(**qa_run_dict)
async def download_qa_run_as_single_wacz(
self, crawl_id: str, qa_run_id: str, org: Organization
):
"""Download all WACZs in a QA run as streaming nested WACZ"""
qa_run = await self.get_qa_run_for_replay(crawl_id, qa_run_id, org)
if not qa_run.finished:
raise HTTPException(status_code=400, detail="qa_run_not_finished")
if not qa_run.resources:
raise HTTPException(status_code=400, detail="qa_run_no_resources")
resp = await self.storage_ops.download_streaming_wacz(org, qa_run.resources)
finished = qa_run.finished.isoformat()
headers = {
"Content-Disposition": f'attachment; filename="qa-{finished}-crawl-{crawl_id}.wacz"'
}
return StreamingResponse(
resp, headers=headers, media_type="application/wacz+zip"
)
async def get_qa_run_aggregate_stats(
self,
crawl_id: str,
@@ -1226,6 +1248,14 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
return await ops.get_crawl_out(crawl_id, org, "crawl")
@app.get(
"/orgs/{oid}/crawls/{crawl_id}/download", tags=["crawls"], response_model=bytes
)
async def download_crawl_as_single_wacz(
crawl_id: str, org: Organization = Depends(org_viewer_dep)
):
return await ops.download_crawl_as_single_wacz(crawl_id, org)
# QA APIs
# ---------------------
@app.get(
@@ -1249,6 +1279,16 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
):
return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org)
@app.get(
"/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/download",
tags=["qa"],
response_model=bytes,
)
async def download_qa_run_as_single_wacz(
crawl_id: str, qa_run_id: str, org: Organization = Depends(org_viewer_dep)
):
return await ops.download_qa_run_as_single_wacz(crawl_id, qa_run_id, org)
@app.get(
"/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/stats",
tags=["qa"],

@@ -11,6 +11,7 @@ from typing import (
AsyncIterator,
TYPE_CHECKING,
Any,
cast,
)
from urllib.parse import urlsplit
from contextlib import asynccontextmanager
@@ -26,7 +27,7 @@ from datetime import datetime
from zipfile import ZipInfo
from fastapi import Depends, HTTPException
from stream_zip import stream_zip, NO_COMPRESSION_64
from stream_zip import stream_zip, NO_COMPRESSION_64, Method
from remotezip import RemoteZip
import aiobotocore.session
@@ -698,7 +699,9 @@ class StorageOps:
response = client.get_object(Bucket=bucket, Key=key + name)
return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE)
def member_files():
def member_files() -> (
Iterable[tuple[str, datetime, int, Method, Iterable[bytes]]]
):
modified_at = datetime(year=1980, month=1, day=1)
perms = 0o664
for file_ in all_files:
@@ -706,7 +709,7 @@
file_.name,
modified_at,
perms,
NO_COMPRESSION_64(file_.size, file_.crc32),
NO_COMPRESSION_64(file_.size, 0),
get_file(file_.name),
)
@@ -720,7 +723,8 @@
(datapackage_bytes,),
)
return stream_zip(member_files(), chunk_size=CHUNK_SIZE)
# stream_zip() is an Iterator but defined as an Iterable, can cast
return cast(Iterator[bytes], stream_zip(member_files(), chunk_size=CHUNK_SIZE))
async def download_streaming_wacz(
self, org: Organization, files: List[CrawlFileOut]

@@ -423,6 +423,16 @@ def init_uploads_api(app, user_dep, *args):
async def get_upload_replay(crawl_id, org: Organization = Depends(org_viewer_dep)):
return await ops.get_crawl_out(crawl_id, org, "upload")
@app.get(
"/orgs/{oid}/uploads/{crawl_id}/download",
tags=["uploads"],
response_model=bytes,
)
async def download_upload_as_single_wacz(
crawl_id: str, org: Organization = Depends(org_viewer_dep)
):
return await ops.download_crawl_as_single_wacz(crawl_id, org)
@app.patch(
"/orgs/{oid}/uploads/{crawl_id}",
tags=["uploads"],

@@ -17,8 +17,7 @@ jinja2
humanize
python-multipart
pathvalidate
#https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-uncompress.zip
https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-ignore-local-crc32.zip
https://github.com/ikreymer/stream-zip/archive/refs/heads/crc32-optional.zip
boto3
backoff>=2.2.1
python-slugify>=8.0.1

@@ -2,6 +2,8 @@ from .conftest import API_PREFIX, HOST_PREFIX
import requests
import time
from datetime import datetime
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED
import pytest
@@ -541,6 +543,33 @@ def test_sort_crawls_by_qa_runs(
last_count = crawl_qa_count
def test_download_wacz_crawls(
crawler_crawl_id,
crawler_auth_headers,
default_org_id,
qa_run_id,
qa_run_pages_ready,
):
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/download",
headers=crawler_auth_headers,
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) >= 2
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED
def test_delete_qa_runs(
crawler_crawl_id,
crawler_auth_headers,

@@ -6,6 +6,10 @@ import zipfile
import re
import csv
import codecs
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED
import pytest
from .conftest import API_PREFIX, HOST_PREFIX, FINISHED_STATES
from .test_collections import UPDATED_NAME as COLLECTION_NAME
@@ -371,6 +375,38 @@ def test_verify_wacz():
assert len(pages.strip().split("\n")) == 4
@pytest.mark.parametrize(
"type_path",
[
# crawls endpoint
("crawls"),
# all-crawls endpoint
("all-crawls"),
],
)
def test_download_wacz_crawls(
admin_auth_headers, default_org_id, admin_crawl_id, type_path
):
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/{type_path}/{admin_crawl_id}/download",
headers=admin_auth_headers,
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) >= 2
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED
def test_update_crawl(
admin_auth_headers,
default_org_id,

@@ -1,7 +1,9 @@
import requests
import os
import time
from tempfile import TemporaryFile
from urllib.parse import urljoin
from zipfile import ZipFile, ZIP_STORED
import pytest
@@ -329,6 +331,27 @@ def test_update_upload_metadata(admin_auth_headers, default_org_id, upload_id):
assert data["collectionIds"] == UPDATED_COLLECTION_IDS
def test_download_wacz_uploads(admin_auth_headers, default_org_id, upload_id):
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/download",
headers=admin_auth_headers,
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) == 2
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED
def test_delete_stream_upload(
admin_auth_headers, crawler_auth_headers, default_org_id, upload_id
):

@@ -275,7 +275,21 @@ export class ArchivedItemDetail extends TailwindElement {
]);
break;
case "files":
sectionContent = this.renderPanel(msg("Files"), this.renderFiles());
sectionContent = this.renderPanel(
html` ${this.renderTitle(msg("Files"))}
<sl-tooltip content=${msg("Download all files as a single WACZ")}>
<sl-button
href=${`/api/orgs/${this.orgId}/all-crawls/${this.crawlId}/download?auth_bearer=${authToken}`}
download
size="small"
variant="primary"
>
<sl-icon slot="prefix" name="cloud-download"></sl-icon>
${msg("Download Item")}
</sl-button>
</sl-tooltip>`,
this.renderFiles(),
);
break;
case "logs":
sectionContent = this.renderPanel(
@@ -558,6 +572,8 @@ export class ArchivedItemDetail extends TailwindElement {
private renderMenu() {
if (!this.crawl) return;
const authToken = this.authState!.headers.Authorization.split(" ")[1];
return html`
<sl-dropdown placement="bottom-end" distance="4" hoist>
<sl-button slot="trigger" size="small" caret
@@ -609,6 +625,19 @@
<sl-icon name="tags" slot="prefix"></sl-icon>
${msg("Copy Tags")}
</sl-menu-item>
${when(
finishedCrawlStates.includes(this.crawl.state),
() => html`
<sl-divider></sl-divider>
<btrix-menu-item-link
href=${`/api/orgs/${this.orgId}/all-crawls/${this.crawlId}/download?auth_bearer=${authToken}`}
download
>
<sl-icon name="cloud-download" slot="prefix"></sl-icon>
${msg("Download Item")}
</btrix-menu-item-link>
`,
)}
${when(
this.isCrawler && !isActive(this.crawl.state),
() => html`
@@ -618,7 +647,7 @@
@click=${() => void this.deleteCrawl()}
>
<sl-icon name="trash3" slot="prefix"></sl-icon>
${msg("Delete Crawl")}
${msg("Delete Item")}
</sl-menu-item>
`,
)}

@@ -404,14 +404,8 @@ export class ArchivedItemDetailQA extends TailwindElement {
}
downloadLink.loading = true;
const file = await this.getQARunDownloadLink(run.id);
if (file) {
downloadLink.disabled = false;
downloadLink.href = file.path;
} else {
downloadLink.disabled = true;
}
downloadLink.loading = false;
downloadLink.disabled = false;
downloadLink.href = `/orgs/${this.orgId}/crawls/${this.crawlId}/qa/${run.id}/download`;
}}
>
<sl-menu>
@@ -933,19 +927,6 @@
);
}
private async getQARunDownloadLink(qaRunId: string) {
try {
const { resources } = await this.api.fetch<QARun>(
`/orgs/${this.orgId}/crawls/${this.crawlId}/qa/${qaRunId}/replay.json`,
this.authState!,
);
// TODO handle more than one file
return resources?.[0];
} catch (e) {
console.debug(e);
}
}
private async deleteQARun(id: string) {
try {
await this.api.fetch(

@@ -603,23 +603,19 @@ export class CrawlsList extends TailwindElement {
?showStatus=${this.itemType !== null}
>
<btrix-table-cell slot="actionCell" class="p-0">
<btrix-overflow-dropdown
@click=${(e: MouseEvent) => {
// Prevent navigation to detail view
e.preventDefault();
e.stopImmediatePropagation();
}}
>
<btrix-overflow-dropdown>
<sl-menu>${this.renderMenuItems(item)}</sl-menu>
</btrix-overflow-dropdown>
</btrix-table-cell>
</btrix-archived-item-list-item>
`;
private readonly renderMenuItems = (item: ArchivedItem) =>
private readonly renderMenuItems = (item: ArchivedItem) => {
// HACK shoelace doesn't current have a way to override non-hover
// color without resetting the --sl-color-neutral-700 variable
html`
const authToken = this.authState!.headers.Authorization.split(" ")[1];
return html`
${when(
this.isCrawler,
() => html`
@@ -664,6 +660,19 @@
<sl-icon name="tags" slot="prefix"></sl-icon>
${msg("Copy Tags")}
</sl-menu-item>
${when(
finishedCrawlStates.includes(item.state),
() => html`
<sl-divider></sl-divider>
<btrix-menu-item-link
href=${`/api/orgs/${this.orgId}/all-crawls/${item.id}/download?auth_bearer=${authToken}`}
download
>
<sl-icon name="cloud-download" slot="prefix"></sl-icon>
${msg("Download Item")}
</btrix-menu-item-link>
`,
)}
${when(
this.isCrawler && !isActive(item.state),
() => html`
@@ -678,6 +687,7 @@
`,
)}
`;
};
private readonly renderStatusMenuItem = (state: CrawlState) => {
const { icon, label } = CrawlStatus.getContent(state);