Implement downloading archived item + QA runs as multi-WACZ (#1933)

Fixes #1412 

## Changes

### Backend
- Adds `all-crawls`, `crawls`, and `uploads` API endpoints to download an
archived item as a multi-WACZ (see the usage sketch after this list)
- Adds an endpoint to download QA runs as a multi-WACZ
- Adds backend tests for the new endpoints
- Updates to a new version of the stream-zip library that no longer requires a
CRC-32 to be present for ZIP members and instead computes it after streaming,
fixing invalid CRC-32 issues, since CRC-32s previously computed by the crawler
may be invalid (a minimal assembly sketch also follows this list)
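
As a usage sketch of the new endpoints (the base URL, IDs, and bearer token below are hypothetical placeholders, not values from this PR), the streamed multi-WACZ can be saved to disk with `requests`, mirroring the new backend tests:

```python
import requests

API_PREFIX = "https://btrix.example.com/api"  # hypothetical deployment URL
ORG_ID = "<org-uuid>"  # placeholder
CRAWL_ID = "<crawl-id>"  # placeholder
headers = {"Authorization": "Bearer <access-token>"}  # placeholder token

# Stream the multi-WACZ for an archived item to disk without buffering it
# all in memory; the body is a ZIP of WACZ files plus a datapackage.json
with requests.get(
    f"{API_PREFIX}/orgs/{ORG_ID}/all-crawls/{CRAWL_ID}/download",
    headers=headers,
    stream=True,
) as r:
    r.raise_for_status()
    with open("archived-item.wacz", "wb") as fh:
        for chunk in r.iter_content(chunk_size=64 * 1024):
            fh.write(chunk)
```

The same pattern applies to the `crawls`, `uploads`, and QA run `download` routes added in the diffs below.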
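
A minimal sketch of the CRC-32 change at work when the multi-WACZ is assembled, loosely following the `member_files()` shape in the `StorageOps` hunk below (the `files` dict and `fetch()` helper are hypothetical stand-ins, and the placeholder-CRC behavior assumes the forked stream-zip pinned in the requirements hunk):

```python
from datetime import datetime

from stream_zip import stream_zip, NO_COMPRESSION_64

files = {"crawl-1.wacz": b"a" * 24, "crawl-2.wacz": b"b" * 24}  # placeholder members

def fetch(name):
    # Stand-in for streaming a member's bytes from object storage in chunks
    yield files[name]

def member_files():
    modified_at = datetime(year=1980, month=1, day=1)  # fixed date for determinism
    perms = 0o664
    for name, data in files.items():
        # With the crc32-optional fork, a placeholder CRC-32 of 0 is accepted
        # and the real value is computed after each member has been streamed
        yield (name, modified_at, perms, NO_COMPRESSION_64(len(data), 0), fetch(name))

# Each yielded chunk is written to the streaming HTTP response in the real code
for chunk in stream_zip(member_files(), chunk_size=32768):
    pass
```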

### Frontend

Adds the ability to download an archived item from:

- A button in the archived item detail Files tab
- The archived item detail actions menu
- The archived items list menu



---------

Co-authored-by: Henry Wilkinson <henry@wilkinson.graphics>
Co-authored-by: sua yoo <sua@webrecorder.org>
Co-authored-by: Ilya Kreymer <ikreymer@gmail.com>
Committed by Tessa Walsh on 2024-07-25 13:28:57 -04:00 (commit 27ee16d308). 11 changed files with 224 additions and 38 deletions.

@@ -8,6 +8,7 @@ import urllib.parse
import asyncio
from fastapi import HTTPException, Depends
from fastapi.responses import StreamingResponse
from .models import (
CrawlFile,
@@ -797,6 +798,20 @@ class BaseCrawlOps:
"firstSeeds": list(first_seeds),
}
async def download_crawl_as_single_wacz(self, crawl_id: str, org: Organization):
"""Download all WACZs in archived item as streaming nested WACZ"""
crawl = await self.get_crawl_out(crawl_id, org)
if not crawl.resources:
raise HTTPException(status_code=400, detail="no_crawl_resources")
resp = await self.storage_ops.download_streaming_wacz(org, crawl.resources)
headers = {"Content-Disposition": f'attachment; filename="{crawl_id}.wacz"'}
return StreamingResponse(
resp, headers=headers, media_type="application/wacz+zip"
)
async def calculate_org_crawl_file_storage(
self, oid: UUID, type_: Optional[str] = None
) -> Tuple[int, int, int]:
@@ -928,6 +943,16 @@ def init_base_crawls_api(app, user_dep, *args):
async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
return await ops.get_crawl_out(crawl_id, org)
@app.get(
"/orgs/{oid}/all-crawls/{crawl_id}/download",
tags=["all-crawls"],
response_model=bytes,
)
async def download_base_crawl_as_single_wacz(
crawl_id: str, org: Organization = Depends(org_viewer_dep)
):
return await ops.download_crawl_as_single_wacz(crawl_id, org)
@app.patch(
"/orgs/{oid}/all-crawls/{crawl_id}",
tags=["all-crawls"],

@@ -1008,6 +1008,28 @@ class CrawlOps(BaseCrawlOps):
return QARunWithResources(**qa_run_dict)
async def download_qa_run_as_single_wacz(
self, crawl_id: str, qa_run_id: str, org: Organization
):
"""Download all WACZs in a QA run as streaming nested WACZ"""
qa_run = await self.get_qa_run_for_replay(crawl_id, qa_run_id, org)
if not qa_run.finished:
raise HTTPException(status_code=400, detail="qa_run_not_finished")
if not qa_run.resources:
raise HTTPException(status_code=400, detail="qa_run_no_resources")
resp = await self.storage_ops.download_streaming_wacz(org, qa_run.resources)
finished = qa_run.finished.isoformat()
headers = {
"Content-Disposition": f'attachment; filename="qa-{finished}-crawl-{crawl_id}.wacz"'
}
return StreamingResponse(
resp, headers=headers, media_type="application/wacz+zip"
)
async def get_qa_run_aggregate_stats(
self,
crawl_id: str,
@@ -1226,6 +1248,14 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
async def get_crawl_out(crawl_id, org: Organization = Depends(org_viewer_dep)):
return await ops.get_crawl_out(crawl_id, org, "crawl")
@app.get(
"/orgs/{oid}/crawls/{crawl_id}/download", tags=["crawls"], response_model=bytes
)
async def download_crawl_as_single_wacz(
crawl_id: str, org: Organization = Depends(org_viewer_dep)
):
return await ops.download_crawl_as_single_wacz(crawl_id, org)
# QA APIs
# ---------------------
@app.get(
@@ -1249,6 +1279,16 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
):
return await ops.get_qa_run_for_replay(crawl_id, qa_run_id, org)
@app.get(
"/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/download",
tags=["qa"],
response_model=bytes,
)
async def download_qa_run_as_single_wacz(
crawl_id: str, qa_run_id: str, org: Organization = Depends(org_viewer_dep)
):
return await ops.download_qa_run_as_single_wacz(crawl_id, qa_run_id, org)
@app.get(
"/orgs/{oid}/crawls/{crawl_id}/qa/{qa_run_id}/stats",
tags=["qa"],

@@ -11,6 +11,7 @@ from typing import (
AsyncIterator,
TYPE_CHECKING,
Any,
cast,
)
from urllib.parse import urlsplit
from contextlib import asynccontextmanager
@@ -26,7 +27,7 @@ from datetime import datetime
from zipfile import ZipInfo
from fastapi import Depends, HTTPException
from stream_zip import stream_zip, NO_COMPRESSION_64
from stream_zip import stream_zip, NO_COMPRESSION_64, Method
from remotezip import RemoteZip
import aiobotocore.session
@@ -698,7 +699,9 @@ class StorageOps:
response = client.get_object(Bucket=bucket, Key=key + name)
return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE)
def member_files():
def member_files() -> (
Iterable[tuple[str, datetime, int, Method, Iterable[bytes]]]
):
modified_at = datetime(year=1980, month=1, day=1)
perms = 0o664
for file_ in all_files:
@@ -706,7 +709,7 @@
file_.name,
modified_at,
perms,
NO_COMPRESSION_64(file_.size, file_.crc32),
NO_COMPRESSION_64(file_.size, 0),
get_file(file_.name),
)
@@ -720,7 +723,8 @@
(datapackage_bytes,),
)
return stream_zip(member_files(), chunk_size=CHUNK_SIZE)
# stream_zip() is an Iterator but defined as an Iterable, can cast
return cast(Iterator[bytes], stream_zip(member_files(), chunk_size=CHUNK_SIZE))
async def download_streaming_wacz(
self, org: Organization, files: List[CrawlFileOut]

@@ -423,6 +423,16 @@ def init_uploads_api(app, user_dep, *args):
async def get_upload_replay(crawl_id, org: Organization = Depends(org_viewer_dep)):
return await ops.get_crawl_out(crawl_id, org, "upload")
@app.get(
"/orgs/{oid}/uploads/{crawl_id}/download",
tags=["uploads"],
response_model=bytes,
)
async def download_upload_as_single_wacz(
crawl_id: str, org: Organization = Depends(org_viewer_dep)
):
return await ops.download_crawl_as_single_wacz(crawl_id, org)
@app.patch(
"/orgs/{oid}/uploads/{crawl_id}",
tags=["uploads"],

@@ -17,8 +17,7 @@ jinja2
humanize
python-multipart
pathvalidate
#https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-uncompress.zip
https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-ignore-local-crc32.zip
https://github.com/ikreymer/stream-zip/archive/refs/heads/crc32-optional.zip
boto3
backoff>=2.2.1
python-slugify>=8.0.1

@@ -2,6 +2,8 @@ from .conftest import API_PREFIX, HOST_PREFIX
import requests
import time
from datetime import datetime
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED
import pytest
@@ -541,6 +543,33 @@ def test_sort_crawls_by_qa_runs(
last_count = crawl_qa_count
def test_download_wacz_crawls(
crawler_crawl_id,
crawler_auth_headers,
default_org_id,
qa_run_id,
qa_run_pages_ready,
):
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/download",
headers=crawler_auth_headers,
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) >= 2
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED
def test_delete_qa_runs(
crawler_crawl_id,
crawler_auth_headers,

@@ -6,6 +6,10 @@ import zipfile
import re
import csv
import codecs
from tempfile import TemporaryFile
from zipfile import ZipFile, ZIP_STORED
import pytest
from .conftest import API_PREFIX, HOST_PREFIX, FINISHED_STATES
from .test_collections import UPDATED_NAME as COLLECTION_NAME
@@ -371,6 +375,38 @@ def test_verify_wacz():
assert len(pages.strip().split("\n")) == 4
@pytest.mark.parametrize(
"type_path",
[
# crawls endpoint
("crawls"),
# all-crawls endpoint
("all-crawls"),
],
)
def test_download_wacz_crawls(
admin_auth_headers, default_org_id, admin_crawl_id, type_path
):
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/{type_path}/{admin_crawl_id}/download",
headers=admin_auth_headers,
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) >= 2
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED
def test_update_crawl(
admin_auth_headers,
default_org_id,

@@ -1,7 +1,9 @@
import requests
import os
import time
from tempfile import TemporaryFile
from urllib.parse import urljoin
from zipfile import ZipFile, ZIP_STORED
import pytest
@@ -329,6 +331,27 @@ def test_update_upload_metadata(admin_auth_headers, default_org_id, upload_id):
assert data["collectionIds"] == UPDATED_COLLECTION_IDS
def test_download_wacz_uploads(admin_auth_headers, default_org_id, upload_id):
with TemporaryFile() as fh:
with requests.get(
f"{API_PREFIX}/orgs/{default_org_id}/uploads/{upload_id}/download",
headers=admin_auth_headers,
stream=True,
) as r:
assert r.status_code == 200
for chunk in r.iter_content():
fh.write(chunk)
fh.seek(0)
with ZipFile(fh, "r") as zip_file:
contents = zip_file.namelist()
assert len(contents) == 2
for filename in contents:
assert filename.endswith(".wacz") or filename == "datapackage.json"
assert zip_file.getinfo(filename).compress_type == ZIP_STORED
def test_delete_stream_upload(
admin_auth_headers, crawler_auth_headers, default_org_id, upload_id
):

@@ -275,7 +275,21 @@ export class ArchivedItemDetail extends TailwindElement {
]);
break;
case "files":
sectionContent = this.renderPanel(msg("Files"), this.renderFiles());
sectionContent = this.renderPanel(
html` ${this.renderTitle(msg("Files"))}
<sl-tooltip content=${msg("Download all files as a single WACZ")}>
<sl-button
href=${`/api/orgs/${this.orgId}/all-crawls/${this.crawlId}/download?auth_bearer=${authToken}`}
download
size="small"
variant="primary"
>
<sl-icon slot="prefix" name="cloud-download"></sl-icon>
${msg("Download Item")}
</sl-button>
</sl-tooltip>`,
this.renderFiles(),
);
break;
case "logs":
sectionContent = this.renderPanel(
@@ -558,6 +572,8 @@ export class ArchivedItemDetail extends TailwindElement {
private renderMenu() {
if (!this.crawl) return;
const authToken = this.authState!.headers.Authorization.split(" ")[1];
return html`
<sl-dropdown placement="bottom-end" distance="4" hoist>
<sl-button slot="trigger" size="small" caret
@@ -609,6 +625,19 @@
<sl-icon name="tags" slot="prefix"></sl-icon>
${msg("Copy Tags")}
</sl-menu-item>
${when(
finishedCrawlStates.includes(this.crawl.state),
() => html`
<sl-divider></sl-divider>
<btrix-menu-item-link
href=${`/api/orgs/${this.orgId}/all-crawls/${this.crawlId}/download?auth_bearer=${authToken}`}
download
>
<sl-icon name="cloud-download" slot="prefix"></sl-icon>
${msg("Download Item")}
</btrix-menu-item-link>
`,
)}
${when(
this.isCrawler && !isActive(this.crawl.state),
() => html`
@@ -618,7 +647,7 @@
@click=${() => void this.deleteCrawl()}
>
<sl-icon name="trash3" slot="prefix"></sl-icon>
${msg("Delete Crawl")}
${msg("Delete Item")}
</sl-menu-item>
`,
)}

@@ -404,14 +404,8 @@ export class ArchivedItemDetailQA extends TailwindElement {
}
downloadLink.loading = true;
const file = await this.getQARunDownloadLink(run.id);
if (file) {
downloadLink.disabled = false;
downloadLink.href = file.path;
} else {
downloadLink.disabled = true;
}
downloadLink.loading = false;
downloadLink.disabled = false;
downloadLink.href = `/orgs/${this.orgId}/crawls/${this.crawlId}/qa/${run.id}/download`;
}}
>
<sl-menu>
@@ -933,19 +927,6 @@
);
}
private async getQARunDownloadLink(qaRunId: string) {
try {
const { resources } = await this.api.fetch<QARun>(
`/orgs/${this.orgId}/crawls/${this.crawlId}/qa/${qaRunId}/replay.json`,
this.authState!,
);
// TODO handle more than one file
return resources?.[0];
} catch (e) {
console.debug(e);
}
}
private async deleteQARun(id: string) {
try {
await this.api.fetch(

@@ -603,23 +603,19 @@ export class CrawlsList extends TailwindElement {
?showStatus=${this.itemType !== null}
>
<btrix-table-cell slot="actionCell" class="p-0">
<btrix-overflow-dropdown
@click=${(e: MouseEvent) => {
// Prevent navigation to detail view
e.preventDefault();
e.stopImmediatePropagation();
}}
>
<btrix-overflow-dropdown>
<sl-menu>${this.renderMenuItems(item)}</sl-menu>
</btrix-overflow-dropdown>
</btrix-table-cell>
</btrix-archived-item-list-item>
`;
private readonly renderMenuItems = (item: ArchivedItem) =>
private readonly renderMenuItems = (item: ArchivedItem) => {
// HACK shoelace doesn't current have a way to override non-hover
// color without resetting the --sl-color-neutral-700 variable
html`
const authToken = this.authState!.headers.Authorization.split(" ")[1];
return html`
${when(
this.isCrawler,
() => html`
@@ -664,6 +660,19 @@
<sl-icon name="tags" slot="prefix"></sl-icon>
${msg("Copy Tags")}
</sl-menu-item>
${when(
finishedCrawlStates.includes(item.state),
() => html`
<sl-divider></sl-divider>
<btrix-menu-item-link
href=${`/api/orgs/${this.orgId}/all-crawls/${item.id}/download?auth_bearer=${authToken}`}
download
>
<sl-icon name="cloud-download" slot="prefix"></sl-icon>
${msg("Download Item")}
</btrix-menu-item-link>
`,
)}
${when(
this.isCrawler && !isActive(item.state),
() => html`
@@ -678,6 +687,7 @@
`,
)}
`;
};
private readonly renderStatusMenuItem = (state: CrawlState) => {
const { icon, label } = CrawlStatus.getContent(state);