From 27ee16d308adb1b1bc10391cc0d514ff2c233512 Mon Sep 17 00:00:00 2001
From: Tessa Walsh
Date: Thu, 25 Jul 2024 13:28:57 -0400
Subject: [PATCH] Implement downloading archived item + QA runs as multi-WACZ
 (#1933)

Fixes #1412

## Changes

### Backend

- Adds `all-crawls`, `crawls`, and `uploads` API endpoints to download an archived item as a multi-WACZ
- Adds endpoint to download a QA run as a multi-WACZ
- Adds backend tests for the new endpoints
- Updates to a new version of the stream-zip library that does not require a crc-32 to be present for ZIP members and instead computes it after streaming. This fixes invalid crc-32 issues, as crc-32s previously computed by the crawler may be invalid.

### Frontend

Adds the ability to download an archived item from:

- Button in the archived item detail Files tab
- Archived item details actions menu
- Archived items list menu

---------

Co-authored-by: Henry Wilkinson
Co-authored-by: sua yoo
Co-authored-by: Ilya Kreymer
---
 backend/btrixcloud/basecrawls.py            | 25 ++++++++++++
 backend/btrixcloud/crawls.py                | 40 +++++++++++++++++++
 backend/btrixcloud/storages.py              | 12 ++++--
 backend/btrixcloud/uploads.py               | 10 +++++
 backend/requirements.txt                    |  3 +-
 backend/test/test_qa.py                     | 29 ++++++++++++++
 backend/test/test_run_crawl.py              | 36 +++++++++++++++++
 backend/test/test_uploads.py                | 23 +++++++++++
 .../archived-item-detail.ts                 | 33 ++++++++++++++-
 .../pages/org/archived-item-detail/ui/qa.ts | 23 +----------
 frontend/src/pages/org/archived-items.ts    | 28 ++++++++-----
 11 files changed, 224 insertions(+), 38 deletions(-)

diff --git a/backend/btrixcloud/basecrawls.py b/backend/btrixcloud/basecrawls.py
index 2284a876..0b21c0f3 100644
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -8,6 +8,7 @@ import urllib.parse
 import asyncio
 
 from fastapi import HTTPException, Depends
+from fastapi.responses import StreamingResponse
 
 from .models import (
     CrawlFile,
@@ -797,6 +798,20 @@ class BaseCrawlOps:
             "firstSeeds": list(first_seeds),
         }
 
+    async def download_crawl_as_single_wacz(self, crawl_id: str, org: Organization):
+        """Download all WACZs in archived item as streaming nested WACZ"""
+        crawl = await self.get_crawl_out(crawl_id, org)
+
+        if not crawl.resources:
+            raise HTTPException(status_code=400, detail="no_crawl_resources")
+
+        resp = await self.storage_ops.download_streaming_wacz(org, crawl.resources)
+
+        headers = {"Content-Disposition": f'attachment; filename="{crawl_id}.wacz"'}
+        return StreamingResponse(
+            resp, headers=headers, media_type="application/wacz+zip"
+        )
+
     async def calculate_org_crawl_file_storage(
         self, oid: UUID, type_: Optional[str] = None
     ) -> Tuple[int, int, int]:
@@ -928,6 +943,16 @@ def init_base_crawls_api(app, user_dep, *args):
     async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
         return await ops.get_crawl_out(crawl_id, org)
 
+    @app.get(
+        "/orgs/{oid}/all-crawls/{crawl_id}/download",
+        tags=["all-crawls"],
+        response_model=bytes,
+    )
+    async def download_base_crawl_as_single_wacz(
+        crawl_id: str, org: Organization = Depends(org_viewer_dep)
+    ):
+        return await ops.download_crawl_as_single_wacz(crawl_id, org)
+
     @app.patch(
         "/orgs/{oid}/all-crawls/{crawl_id}",
         tags=["all-crawls"],
diff --git a/backend/btrixcloud/crawls.py b/backend/btrixcloud/crawls.py
index c6229ee2..c5203fbf 100644
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -1008,6 +1008,28 @@ class CrawlOps(BaseCrawlOps):
 
         return QARunWithResources(**qa_run_dict)
 
+    async def download_qa_run_as_single_wacz(
+        self, crawl_id: str, qa_run_id: str, org: Organization
+    ):
+        """Download all WACZs in a QA run as streaming nested WACZ"""
+        qa_run = await self.get_qa_run_for_replay(crawl_id, qa_run_id, org)
+        if not qa_run.finished:
+            raise HTTPException(status_code=400, detail="qa_run_not_finished")
+
+        if not qa_run.resources:
+            raise HTTPException(status_code=400, detail="qa_run_no_resources")
+
+        resp = await self.storage_ops.download_streaming_wacz(org, qa_run.resources)
+
+        finished = qa_run.finished.isoformat()
+
+        headers = {
+            "Content-Disposition": f'attachment; filename="qa-{finished}-crawl-{crawl_id}.wacz"'
+        }
+        return StreamingResponse(
+            resp, headers=headers, media_type="application/wacz+zip"
+        )
+
     async def get_qa_run_aggregate_stats(
         self,
         crawl_id: str,
@@ -1226,6 +1248,14 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
     async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
         return await ops.get_crawl_out(crawl_id, org, "crawl")
 
+    @app.get(
+        "/orgs/{oid}/crawls/{crawl_id}/download", tags=["crawls"], response_model=bytes
+    )
+    async def download_crawl_as_single_wacz(
+        crawl_id: str, org: Organization = Depends(org_viewer_dep)
+    ):
+        return await ops.download_crawl_as_single_wacz(crawl_id, org)
+
     # QA APIs
     # ---------------------
     @app.get(
@@ -1249,6 +1279,16 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
     ):
         return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org)
 
+    @app.get(
+        "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/download",
+        tags=["qa"],
+        response_model=bytes,
+    )
+    async def download_qa_run_as_single_wacz(
+        crawl_id: str, qa_run_id: str, org: Organization = Depends(org_viewer_dep)
+    ):
+        return await ops.download_qa_run_as_single_wacz(crawl_id, qa_run_id, org)
+
     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/stats",
         tags=["qa"],
diff --git a/backend/btrixcloud/storages.py b/backend/btrixcloud/storages.py
index 8d86d127..d5529c28 100644
--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -11,6 +11,7 @@ from typing import (
     AsyncIterator,
     TYPE_CHECKING,
     Any,
+    cast,
 )
 from urllib.parse import urlsplit
 from contextlib import asynccontextmanager
@@ -26,7 +27,7 @@ from datetime import datetime
 from zipfile import ZipInfo
 
 from fastapi import Depends, HTTPException
-from stream_zip import stream_zip, NO_COMPRESSION_64
+from stream_zip import stream_zip, NO_COMPRESSION_64, Method
 from remotezip import RemoteZip
 
 import aiobotocore.session
@@ -698,7 +699,9 @@ class StorageOps:
             response = client.get_object(Bucket=bucket, Key=key + name)
             return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE)
 
-        def member_files():
+        def member_files() -> (
+            Iterable[tuple[str, datetime, int, Method, Iterable[bytes]]]
+        ):
             modified_at = datetime(year=1980, month=1, day=1)
             perms = 0o664
             for file_ in all_files:
@@ -706,7 +709,7 @@ class StorageOps:
                     file_.name,
                     modified_at,
                     perms,
-                    NO_COMPRESSION_64(file_.size, file_.crc32),
+                    NO_COMPRESSION_64(file_.size, 0),
                     get_file(file_.name),
                 )
 
@@ -720,7 +723,8 @@ class StorageOps:
             (datapackage_bytes,),
         )
 
-        return stream_zip(member_files(), chunk_size=CHUNK_SIZE)
+        # stream_zip() is an Iterator but defined as an Iterable, can cast
+        return cast(Iterator[bytes], stream_zip(member_files(), chunk_size=CHUNK_SIZE))
 
     async def download_streaming_wacz(
         self, org: Organization, files: List[CrawlFileOut]
diff --git a/backend/btrixcloud/uploads.py b/backend/btrixcloud/uploads.py
index 46ab0892..6f6a7036 100644
--- a/backend/btrixcloud/uploads.py
+++ b/backend/btrixcloud/uploads.py
@@ -423,6 +423,16 @@ def init_uploads_api(app, user_dep, *args):
     async def get_upload_replay(crawl_id, org: Organization = Depends(org_viewer_dep)):
         return await ops.get_crawl_out(crawl_id, org, "upload")
 
+    @app.get(
+        "/orgs/{oid}/uploads/{crawl_id}/download",
+        tags=["uploads"],
+        response_model=bytes,
+    )
+    async def download_upload_as_single_wacz(
+        crawl_id: str, org: Organization = Depends(org_viewer_dep)
+    ):
+        return await ops.download_crawl_as_single_wacz(crawl_id, org)
+
     @app.patch(
         "/orgs/{oid}/uploads/{crawl_id}",
         tags=["uploads"],
diff --git a/backend/requirements.txt b/backend/requirements.txt
index c07986a0..3c6b868b 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -17,8 +17,7 @@ jinja2
 humanize
 python-multipart
 pathvalidate
-#https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-uncompress.zip
-https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-ignore-local-crc32.zip
+https://github.com/ikreymer/stream-zip/archive/refs/heads/crc32-optional.zip
 boto3
 backoff>=2.2.1
 python-slugify>=8.0.1
diff --git a/backend/test/test_qa.py b/backend/test/test_qa.py
index 14fb9ea6..ed4c14e7 100644
--- a/backend/test/test_qa.py
+++ b/backend/test/test_qa.py
@@ -2,6 +2,8 @@ from .conftest import API_PREFIX, HOST_PREFIX
 import requests
 import time
 from datetime import datetime
+from tempfile import TemporaryFile
+from zipfile import ZipFile, ZIP_STORED
 
 import pytest
 
@@ -541,6 +543,33 @@ def test_sort_crawls_by_qa_runs(
 
         last_count = crawl_qa_count
 
+def test_download_wacz_crawls(
+    crawler_crawl_id,
+    crawler_auth_headers,
+    default_org_id,
+    qa_run_id,
+    qa_run_pages_ready,
+):
+    with TemporaryFile() as fh:
+        with requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/download",
+            headers=crawler_auth_headers,
+            stream=True,
+        ) as r:
+            assert r.status_code == 200
+            for chunk in r.iter_content():
+                fh.write(chunk)
+
+        fh.seek(0)
+        with ZipFile(fh, "r") as zip_file:
+            contents = zip_file.namelist()
+
+            assert len(contents) >= 2
+            for filename in contents:
+                assert filename.endswith(".wacz") or filename == "datapackage.json"
+                assert zip_file.getinfo(filename).compress_type == ZIP_STORED
+
+
 def test_delete_qa_runs(
     crawler_crawl_id,
     crawler_auth_headers,
diff --git a/backend/test/test_run_crawl.py b/backend/test/test_run_crawl.py
index c81e014f..9d2d4c94 100644
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -6,6 +6,10 @@ import zipfile
 import re
 import csv
 import codecs
+from tempfile import TemporaryFile
+from zipfile import ZipFile, ZIP_STORED
+
+import pytest
 
 from .conftest import API_PREFIX, HOST_PREFIX, FINISHED_STATES
 from .test_collections import UPDATED_NAME as COLLECTION_NAME
@@ -371,6 +375,38 @@ def test_verify_wacz():
 
     assert len(pages.strip().split("\n")) == 4
 
+@pytest.mark.parametrize(
+    "type_path",
+    [
+        # crawls endpoint
+        ("crawls"),
+        # all-crawls endpoint
+        ("all-crawls"),
+    ],
+)
+def test_download_wacz_crawls(
+    admin_auth_headers, default_org_id, admin_crawl_id, type_path
+):
+    with TemporaryFile() as fh:
+        with requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/{type_path}/{admin_crawl_id}/download",
+            headers=admin_auth_headers,
+            stream=True,
+        ) as r:
+            assert r.status_code == 200
+            for chunk in r.iter_content():
+                fh.write(chunk)
+
+        fh.seek(0)
+        with ZipFile(fh, "r") as zip_file:
+            contents = zip_file.namelist()
+
+            assert len(contents) >= 2
+            for filename in contents:
+                assert filename.endswith(".wacz") or filename == "datapackage.json"
+                assert zip_file.getinfo(filename).compress_type == ZIP_STORED
+
+
 def test_update_crawl(
     admin_auth_headers,
     default_org_id,
diff --git a/backend/test/test_uploads.py b/backend/test/test_uploads.py
index 22c88d8b..50931cc1 100644
--- a/backend/test/test_uploads.py
+++ b/backend/test/test_uploads.py
@@ -1,7 +1,9 @@
 import requests
 import os
 import time
+from tempfile import TemporaryFile
 from urllib.parse import urljoin
+from zipfile import ZipFile, ZIP_STORED
 
 import pytest
 
@@ -329,6 +331,27 @@ def test_update_upload_metadata(admin_auth_headers, default_org_id, upload_id):
 
     assert data["collectionIds"] == UPDATED_COLLECTION_IDS
 
+def test_download_wacz_uploads(admin_auth_headers, default_org_id, upload_id):
+    with TemporaryFile() as fh:
+        with requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/download",
+            headers=admin_auth_headers,
+            stream=True,
+        ) as r:
+            assert r.status_code == 200
+            for chunk in r.iter_content():
+                fh.write(chunk)
+
+        fh.seek(0)
+        with ZipFile(fh, "r") as zip_file:
+            contents = zip_file.namelist()
+
+            assert len(contents) == 2
+            for filename in contents:
+                assert filename.endswith(".wacz") or filename == "datapackage.json"
+                assert zip_file.getinfo(filename).compress_type == ZIP_STORED
+
+
 def test_delete_stream_upload(
     admin_auth_headers, crawler_auth_headers, default_org_id, upload_id
 ):
diff --git a/frontend/src/pages/org/archived-item-detail/archived-item-detail.ts b/frontend/src/pages/org/archived-item-detail/archived-item-detail.ts
index 79e23575..f066cbc0 100644
--- a/frontend/src/pages/org/archived-item-detail/archived-item-detail.ts
+++ b/frontend/src/pages/org/archived-item-detail/archived-item-detail.ts
@@ -275,7 +275,21 @@ export class ArchivedItemDetail extends TailwindElement {
         ]);
         break;
       case "files":
-        sectionContent = this.renderPanel(msg("Files"), this.renderFiles());
+        sectionContent = this.renderPanel(
+          html` ${this.renderTitle(msg("Files"))}
+
+
+
+              ${msg("Download Item")}
+
+          `,
+          this.renderFiles(),
+        );
         break;
       case "logs":
         sectionContent = this.renderPanel(
@@ -558,6 +572,8 @@ export class ArchivedItemDetail extends TailwindElement {
   private renderMenu() {
     if (!this.crawl) return;
 
+    const authToken = this.authState!.headers.Authorization.split(" ")[1];
+
     return html`
           ${msg("Copy Tags")}
+        ${when(
+          finishedCrawlStates.includes(this.crawl.state),
+          () => html`
+
+
+            ${msg("Download Item")}
+
+          `,
+        )}
         ${when(
           this.isCrawler && !isActive(this.crawl.state),
           () => html`
@@ -618,7 +647,7 @@ export class ArchivedItemDetail extends TailwindElement {
               @click=${() => void this.deleteCrawl()}
             >
-              ${msg("Delete Crawl")}
+              ${msg("Delete Item")}
             `,
           )}
diff --git a/frontend/src/pages/org/archived-item-detail/ui/qa.ts b/frontend/src/pages/org/archived-item-detail/ui/qa.ts
index c53bbad0..3e931b43 100644
--- a/frontend/src/pages/org/archived-item-detail/ui/qa.ts
+++ b/frontend/src/pages/org/archived-item-detail/ui/qa.ts
@@ -404,14 +404,8 @@ export class ArchivedItemDetailQA extends TailwindElement {
             }
             downloadLink.loading = true;
-            const file = await this.getQARunDownloadLink(run.id);
-            if (file) {
-              downloadLink.disabled = false;
-              downloadLink.href = file.path;
-            } else {
-              downloadLink.disabled = true;
-            }
-            downloadLink.loading = false;
+            downloadLink.disabled = false;
+            downloadLink.href = `/orgs/${this.orgId}/crawls/${this.crawlId}/qa/${run.id}/download`;
           }}
         >
@@ -933,19 +927,6 @@ export class ArchivedItemDetailQA extends TailwindElement {
     );
   }
 
-  private async getQARunDownloadLink(qaRunId: string) {
-    try {
-      const { resources } = await this.api.fetch(
-        `/orgs/${this.orgId}/crawls/${this.crawlId}/qa/${qaRunId}/replay.json`,
-        this.authState!,
-      );
-      // TODO handle more than one file
-      return resources?.[0];
-    } catch (e) {
-      console.debug(e);
-    }
-  }
-
   private async deleteQARun(id: string) {
     try {
       await this.api.fetch(
diff --git a/frontend/src/pages/org/archived-items.ts b/frontend/src/pages/org/archived-items.ts
index 33deb652..4e43970b 100644
--- a/frontend/src/pages/org/archived-items.ts
+++ b/frontend/src/pages/org/archived-items.ts
@@ -603,23 +603,19 @@ export class CrawlsList extends TailwindElement {
       ?showStatus=${this.itemType !== null}
     >
-        {
-          // Prevent navigation to detail view
-          e.preventDefault();
-          e.stopImmediatePropagation();
-        }}
-        >
+      ${this.renderMenuItems(item)}
     `;
 
-  private readonly renderMenuItems = (item: ArchivedItem) =>
+  private readonly renderMenuItems = (item: ArchivedItem) => {
     // HACK shoelace doesn't current have a way to override non-hover
     // color without resetting the --sl-color-neutral-700 variable
-    html`
+    const authToken = this.authState!.headers.Authorization.split(" ")[1];
+
+    return html`
       ${when(
         this.isCrawler,
        () => html`
@@ -664,6 +660,19 @@ export class CrawlsList extends TailwindElement {
           ${msg("Copy Tags")}
 
+        ${when(
+          finishedCrawlStates.includes(item.state),
+          () => html`
+
+
+            ${msg("Download Item")}
+
+          `,
+        )}
         ${when(
           this.isCrawler && !isActive(item.state),
           () => html`
@@ -678,7 +687,7 @@ export class CrawlsList extends TailwindElement {
           `,
         )}
       `;
+  };
 
   private readonly renderStatusMenuItem = (state: CrawlState) => {
     const { icon, label } = CrawlStatus.getContent(state);
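
### Example: downloading a multi-WACZ

A minimal client-side sketch of the new download endpoints, mirroring the backend tests above. The API prefix, org ID, item ID, and bearer token are placeholders, and the `requests` library is assumed; the same pattern works for the `crawls`, `uploads`, and QA run download routes.

```python
import requests
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED

# Placeholders: set these for your deployment
api_prefix = "https://example.com/api"
org_id = "<org-uuid>"
crawl_id = "<crawl-id>"
headers = {"Authorization": "Bearer <access-token>"}

with TemporaryFile() as fh:
    # Stream the multi-WACZ to a temporary file instead of buffering it in memory
    with requests.get(
        f"{api_prefix}/orgs/{org_id}/all-crawls/{crawl_id}/download",
        headers=headers,
        stream=True,
    ) as r:
        r.raise_for_status()
        for chunk in r.iter_content(chunk_size=65536):
            fh.write(chunk)

    # Verify the result: WACZ members plus datapackage.json, all stored uncompressed
    fh.seek(0)
    with ZipFile(fh, "r") as zip_file:
        for name in zip_file.namelist():
            assert name.endswith(".wacz") or name == "datapackage.json"
            assert zip_file.getinfo(name).compress_type == ZIP_STORED
```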
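### Example: the streaming ZIP pattern

For reference, a sketch of the `member_files()` pattern that `storages.py` uses to build the nested WACZ without compression. It assumes the callable `NO_COMPRESSION_64(size, crc_32)` method form used in the patch; here the crc-32 is computed locally so the sketch runs standalone, while the fork pinned in requirements.txt also accepts 0 and computes the real value while streaming. The sample file list is a hypothetical stand-in for the `CrawlFileOut` entries and their S3 chunk iterators.

```python
from datetime import datetime
from zlib import crc32

from stream_zip import stream_zip, NO_COMPRESSION_64

# Hypothetical stand-ins for WACZ files and their streamed byte chunks
files = [("crawl-part-1.wacz", b"wacz bytes 1"), ("crawl-part-2.wacz", b"wacz bytes 2")]


def member_files():
    modified_at = datetime(year=1980, month=1, day=1)
    perms = 0o664
    for name, data in files:
        # NO_COMPRESSION_64 stores the member uncompressed; the pinned fork
        # accepts a crc-32 of 0 and fills it in after streaming each member
        yield (name, modified_at, perms, NO_COMPRESSION_64(len(data), crc32(data)), (data,))


with open("multi.wacz", "wb") as out:
    for chunk in stream_zip(member_files(), chunk_size=65536):
        out.write(chunk)
```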