Implement downloading archived item + QA runs as multi-WACZ (#1933)

Fixes #1412

## Changes

### Backend
- Adds `all-crawls`, `crawls`, and `uploads` API endpoints to download an archived item as a multi-WACZ file
- Adds the ability to download QA runs as a multi-WACZ file
- Adds backend tests for the new endpoints
- Updates to a new version of the stream-zip library that no longer requires a CRC-32 to be present for ZIP members and instead computes it after streaming. This fixes invalid CRC-32 issues, since CRC-32s previously computed by the crawler could be invalid.

### Frontend
Adds the ability to download an archived item from:
- A button in the archived item detail Files tab
- The archived item details actions menu
- The archived items list menu

---------

Co-authored-by: Henry Wilkinson <henry@wilkinson.graphics>
Co-authored-by: sua yoo <sua@webrecorder.org>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
parent b288cd81cc
commit 27ee16d308
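For reference, the new download endpoints stream a multi-WACZ (a ZIP containing the item's WACZ files plus a `datapackage.json`) with media type `application/wacz+zip`. Below is a minimal client-side sketch of fetching and inspecting one, modeled on the backend tests in this change; the API base URL, org ID, item ID, and bearer token are placeholders, not values from this PR:

```python
# Sketch: download an archived item as a multi-WACZ via the new endpoint
# and inspect its members. All IDs and URLs below are placeholders.
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED

import requests

API_PREFIX = "https://app.example.com/api"     # placeholder deployment URL
ORG_ID = "<org-uuid>"                          # placeholder org ID
ITEM_ID = "<crawl-or-upload-id>"               # placeholder archived item ID
HEADERS = {"Authorization": "Bearer <token>"}  # placeholder auth token

with TemporaryFile() as fh:
    with requests.get(
        f"{API_PREFIX}/orgs/{ORG_ID}/all-crawls/{ITEM_ID}/download",
        headers=HEADERS,
        stream=True,
    ) as r:
        r.raise_for_status()
        for chunk in r.iter_content(chunk_size=1024 * 256):
            fh.write(chunk)

    fh.seek(0)
    with ZipFile(fh, "r") as zip_file:
        for name in zip_file.namelist():
            # Members are stored uncompressed; WACZ files are already compressed
            assert zip_file.getinfo(name).compress_type == ZIP_STORED
            print(name)
```

The same pattern applies to the `crawls`, `uploads`, and QA run (`/qa/{qa_run_id}/download`) endpoints added below.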
@@ -8,6 +8,7 @@ import urllib.parse

import asyncio
from fastapi import HTTPException, Depends
from fastapi.responses import StreamingResponse

from .models import (
    CrawlFile,
@@ -797,6 +798,20 @@ class BaseCrawlOps:
            "firstSeeds": list(first_seeds),
        }

    async def download_crawl_as_single_wacz(self, crawl_id: str, org: Organization):
        """Download all WACZs in archived item as streaming nested WACZ"""
        crawl = await self.get_crawl_out(crawl_id, org)

        if not crawl.resources:
            raise HTTPException(status_code=400, detail="no_crawl_resources")

        resp = await self.storage_ops.download_streaming_wacz(org, crawl.resources)

        headers = {"Content-Disposition": f'attachment; filename="{crawl_id}.wacz"'}
        return StreamingResponse(
            resp, headers=headers, media_type="application/wacz+zip"
        )

    async def calculate_org_crawl_file_storage(
        self, oid: UUID, type_: Optional[str] = None
    ) -> Tuple[int, int, int]:
@@ -928,6 +943,16 @@ def init_base_crawls_api(app, user_dep, *args):
    async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return await ops.get_crawl_out(crawl_id, org)

    @app.get(
        "/orgs/{oid}/all-crawls/{crawl_id}/download",
        tags=["all-crawls"],
        response_model=bytes,
    )
    async def download_base_crawl_as_single_wacz(
        crawl_id: str, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.download_crawl_as_single_wacz(crawl_id, org)

    @app.patch(
        "/orgs/{oid}/all-crawls/{crawl_id}",
        tags=["all-crawls"],
@@ -1008,6 +1008,28 @@ class CrawlOps(BaseCrawlOps):

        return QARunWithResources(**qa_run_dict)

    async def download_qa_run_as_single_wacz(
        self, crawl_id: str, qa_run_id: str, org: Organization
    ):
        """Download all WACZs in a QA run as streaming nested WACZ"""
        qa_run = await self.get_qa_run_for_replay(crawl_id, qa_run_id, org)
        if not qa_run.finished:
            raise HTTPException(status_code=400, detail="qa_run_not_finished")

        if not qa_run.resources:
            raise HTTPException(status_code=400, detail="qa_run_no_resources")

        resp = await self.storage_ops.download_streaming_wacz(org, qa_run.resources)

        finished = qa_run.finished.isoformat()

        headers = {
            "Content-Disposition": f'attachment; filename="qa-{finished}-crawl-{crawl_id}.wacz"'
        }
        return StreamingResponse(
            resp, headers=headers, media_type="application/wacz+zip"
        )

    async def get_qa_run_aggregate_stats(
        self,
        crawl_id: str,
@@ -1226,6 +1248,14 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
    async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return await ops.get_crawl_out(crawl_id, org, "crawl")

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/download", tags=["crawls"], response_model=bytes
    )
    async def download_crawl_as_single_wacz(
        crawl_id: str, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.download_crawl_as_single_wacz(crawl_id, org)

    # QA APIs
    # ---------------------
    @app.get(
@@ -1249,6 +1279,16 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
    ):
        return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/download",
        tags=["qa"],
        response_model=bytes,
    )
    async def download_qa_run_as_single_wacz(
        crawl_id: str, qa_run_id: str, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.download_qa_run_as_single_wacz(crawl_id, qa_run_id, org)

    @app.get(
        "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/stats",
        tags=["qa"],
@@ -11,6 +11,7 @@ from typing import (
    AsyncIterator,
    TYPE_CHECKING,
    Any,
    cast,
)
from urllib.parse import urlsplit
from contextlib import asynccontextmanager
@@ -26,7 +27,7 @@ from datetime import datetime
from zipfile import ZipInfo

from fastapi import Depends, HTTPException
from stream_zip import stream_zip, NO_COMPRESSION_64
from stream_zip import stream_zip, NO_COMPRESSION_64, Method
from remotezip import RemoteZip

import aiobotocore.session
@@ -698,7 +699,9 @@ class StorageOps:
            response = client.get_object(Bucket=bucket, Key=key + name)
            return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE)

        def member_files():
        def member_files() -> (
            Iterable[tuple[str, datetime, int, Method, Iterable[bytes]]]
        ):
            modified_at = datetime(year=1980, month=1, day=1)
            perms = 0o664
            for file_ in all_files:
@@ -706,7 +709,7 @@
                    file_.name,
                    modified_at,
                    perms,
                    NO_COMPRESSION_64(file_.size, file_.crc32),
                    NO_COMPRESSION_64(file_.size, 0),
                    get_file(file_.name),
                )

@@ -720,7 +723,8 @@
                (datapackage_bytes,),
            )

        return stream_zip(member_files(), chunk_size=CHUNK_SIZE)
        # stream_zip() is an Iterator but defined as an Iterable, can cast
        return cast(Iterator[bytes], stream_zip(member_files(), chunk_size=CHUNK_SIZE))

    async def download_streaming_wacz(
        self, org: Organization, files: List[CrawlFileOut]
@@ -423,6 +423,16 @@ def init_uploads_api(app, user_dep, *args):
    async def get_upload_replay(crawl_id, org: Organization = Depends(org_viewer_dep)):
        return await ops.get_crawl_out(crawl_id, org, "upload")

    @app.get(
        "/orgs/{oid}/uploads/{crawl_id}/download",
        tags=["uploads"],
        response_model=bytes,
    )
    async def download_upload_as_single_wacz(
        crawl_id: str, org: Organization = Depends(org_viewer_dep)
    ):
        return await ops.download_crawl_as_single_wacz(crawl_id, org)

    @app.patch(
        "/orgs/{oid}/uploads/{crawl_id}",
        tags=["uploads"],
@@ -17,8 +17,7 @@ jinja2
humanize
python-multipart
pathvalidate
#https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-uncompress.zip
https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-ignore-local-crc32.zip
https://github.com/ikreymer/stream-zip/archive/refs/heads/crc32-optional.zip
boto3
backoff>=2.2.1
python-slugify>=8.0.1
@@ -2,6 +2,8 @@ from .conftest import API_PREFIX, HOST_PREFIX
import requests
import time
from datetime import datetime
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED

import pytest

@@ -541,6 +543,33 @@ def test_sort_crawls_by_qa_runs(
        last_count = crawl_qa_count


def test_download_wacz_crawls(
    crawler_crawl_id,
    crawler_auth_headers,
    default_org_id,
    qa_run_id,
    qa_run_pages_ready,
):
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/download",
            headers=crawler_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) >= 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_delete_qa_runs(
    crawler_crawl_id,
    crawler_auth_headers,
@@ -6,6 +6,10 @@ import zipfile
import re
import csv
import codecs
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED

import pytest

from .conftest import API_PREFIX, HOST_PREFIX, FINISHED_STATES
from .test_collections import UPDATED_NAME as COLLECTION_NAME
@@ -371,6 +375,38 @@ def test_verify_wacz():
        assert len(pages.strip().split("\n")) == 4


@pytest.mark.parametrize(
    "type_path",
    [
        # crawls endpoint
        ("crawls"),
        # all-crawls endpoint
        ("all-crawls"),
    ],
)
def test_download_wacz_crawls(
    admin_auth_headers, default_org_id, admin_crawl_id, type_path
):
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/{type_path}/{admin_crawl_id}/download",
            headers=admin_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) >= 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_update_crawl(
    admin_auth_headers,
    default_org_id,
@@ -1,7 +1,9 @@
import requests
import os
import time
from tempfile import TemporaryFile
from urllib.parse import urljoin
from zipfile import ZipFile, ZIP_STORED

import pytest

@@ -329,6 +331,27 @@ def test_update_upload_metadata(admin_auth_headers, default_org_id, upload_id):
    assert data["collectionIds"] == UPDATED_COLLECTION_IDS


def test_download_wacz_uploads(admin_auth_headers, default_org_id, upload_id):
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/download",
            headers=admin_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            for chunk in r.iter_content():
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) == 2
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED


def test_delete_stream_upload(
    admin_auth_headers, crawler_auth_headers, default_org_id, upload_id
):
@@ -275,7 +275,21 @@ export class ArchivedItemDetail extends TailwindElement {
        ]);
        break;
      case "files":
        sectionContent = this.renderPanel(msg("Files"), this.renderFiles());
        sectionContent = this.renderPanel(
          html` ${this.renderTitle(msg("Files"))}
            <sl-tooltip content=${msg("Download all files as a single WACZ")}>
              <sl-button
                href=${`/api/orgs/${this.orgId}/all-crawls/${this.crawlId}/download?auth_bearer=${authToken}`}
                download
                size="small"
                variant="primary"
              >
                <sl-icon slot="prefix" name="cloud-download"></sl-icon>
                ${msg("Download Item")}
              </sl-button>
            </sl-tooltip>`,
          this.renderFiles(),
        );
        break;
      case "logs":
        sectionContent = this.renderPanel(
@@ -558,6 +572,8 @@ export class ArchivedItemDetail extends TailwindElement {
  private renderMenu() {
    if (!this.crawl) return;

    const authToken = this.authState!.headers.Authorization.split(" ")[1];

    return html`
      <sl-dropdown placement="bottom-end" distance="4" hoist>
        <sl-button slot="trigger" size="small" caret
@@ -609,6 +625,19 @@ export class ArchivedItemDetail extends TailwindElement {
          <sl-icon name="tags" slot="prefix"></sl-icon>
          ${msg("Copy Tags")}
        </sl-menu-item>
        ${when(
          finishedCrawlStates.includes(this.crawl.state),
          () => html`
            <sl-divider></sl-divider>
            <btrix-menu-item-link
              href=${`/api/orgs/${this.orgId}/all-crawls/${this.crawlId}/download?auth_bearer=${authToken}`}
              download
            >
              <sl-icon name="cloud-download" slot="prefix"></sl-icon>
              ${msg("Download Item")}
            </btrix-menu-item-link>
          `,
        )}
        ${when(
          this.isCrawler && !isActive(this.crawl.state),
          () => html`
@@ -618,7 +647,7 @@ export class ArchivedItemDetail extends TailwindElement {
              @click=${() => void this.deleteCrawl()}
            >
              <sl-icon name="trash3" slot="prefix"></sl-icon>
              ${msg("Delete Crawl")}
              ${msg("Delete Item")}
            </sl-menu-item>
          `,
        )}
@@ -404,14 +404,8 @@ export class ArchivedItemDetailQA extends TailwindElement {
            }

            downloadLink.loading = true;
            const file = await this.getQARunDownloadLink(run.id);
            if (file) {
              downloadLink.disabled = false;
              downloadLink.href = file.path;
            } else {
              downloadLink.disabled = true;
            }
            downloadLink.loading = false;
            downloadLink.disabled = false;
            downloadLink.href = `/orgs/${this.orgId}/crawls/${this.crawlId}/qa/${run.id}/download`;
          }}
        >
          <sl-menu>
@@ -933,19 +927,6 @@ export class ArchivedItemDetailQA extends TailwindElement {
    );
  }

  private async getQARunDownloadLink(qaRunId: string) {
    try {
      const { resources } = await this.api.fetch<QARun>(
        `/orgs/${this.orgId}/crawls/${this.crawlId}/qa/${qaRunId}/replay.json`,
        this.authState!,
      );
      // TODO handle more than one file
      return resources?.[0];
    } catch (e) {
      console.debug(e);
    }
  }

  private async deleteQARun(id: string) {
    try {
      await this.api.fetch(
@@ -603,23 +603,19 @@ export class CrawlsList extends TailwindElement {
          ?showStatus=${this.itemType !== null}
        >
          <btrix-table-cell slot="actionCell" class="p-0">
            <btrix-overflow-dropdown
              @click=${(e: MouseEvent) => {
                // Prevent navigation to detail view
                e.preventDefault();
                e.stopImmediatePropagation();
              }}
            >
            <btrix-overflow-dropdown>
              <sl-menu>${this.renderMenuItems(item)}</sl-menu>
            </btrix-overflow-dropdown>
          </btrix-table-cell>
        </btrix-archived-item-list-item>
      `;

  private readonly renderMenuItems = (item: ArchivedItem) =>
  private readonly renderMenuItems = (item: ArchivedItem) => {
    // HACK shoelace doesn't current have a way to override non-hover
    // color without resetting the --sl-color-neutral-700 variable
    html`
    const authToken = this.authState!.headers.Authorization.split(" ")[1];

    return html`
      ${when(
        this.isCrawler,
        () => html`
@@ -664,6 +660,19 @@ export class CrawlsList extends TailwindElement {
        <sl-icon name="tags" slot="prefix"></sl-icon>
        ${msg("Copy Tags")}
      </sl-menu-item>
      ${when(
        finishedCrawlStates.includes(item.state),
        () => html`
          <sl-divider></sl-divider>
          <btrix-menu-item-link
            href=${`/api/orgs/${this.orgId}/all-crawls/${item.id}/download?auth_bearer=${authToken}`}
            download
          >
            <sl-icon name="cloud-download" slot="prefix"></sl-icon>
            ${msg("Download Item")}
          </btrix-menu-item-link>
        `,
      )}
      ${when(
        this.isCrawler && !isActive(item.state),
        () => html`
@@ -678,6 +687,7 @@ export class CrawlsList extends TailwindElement {
        `,
      )}
    `;
  };

  private readonly renderStatusMenuItem = (state: CrawlState) => {
    const { icon, label } = CrawlStatus.getContent(state);