switch to simpler streaming download + multi-wacz metadata improvements (#1982)

- Download via presigned URLs using `requests` instead of the boto APIs; remove boto.
- Follow-up to #1933 for streaming download improvements.
- Fix datapackage.json in multi-wacz so that each `resources` object contains `name`, `path`, `hash`, and `bytes`, matching single WACZ.
- Add additional metadata to multi-wacz datapackage.json, including `type` (`crawl`, `upload`, `collection`, `qaRun`), `id` (unique id for the object), `title` / `description` if available (for crawl/upload/collection), and `crawlId` for `qaRun`.
2024-10-03 16:13:31 -07:00 · 2024-10-03 16:13:31 -07:00 · 104ea097c4
commit 104ea097c4
parent 2429bb620c
6 changed files with 69 additions and 56 deletions
--- a/backend/btrixcloud/basecrawls.py
+++ b/backend/btrixcloud/basecrawls.py
@@ -54,7 +54,7 @@ PRESIGN_MINUTES_DEFAULT = PRESIGN_MINUTES_MAX


 # ============================================================================
-# pylint: disable=too-many-instance-attributes, too-many-public-methods
+# pylint: disable=too-many-instance-attributes, too-many-public-methods, too-many-lines
 class BaseCrawlOps:
    """operations that apply to all crawls"""

@@ -823,7 +823,14 @@ class BaseCrawlOps:
        if not crawl.resources:
            raise HTTPException(status_code=400, detail="no_crawl_resources")

-        resp = await self.storage_ops.download_streaming_wacz(org, crawl.resources)
+        metadata = {"type": crawl.type, "id": crawl_id, "organization": org.slug}
+        if crawl.name:
+            metadata["title"] = crawl.name
+
+        if crawl.description:
+            metadata["description"] = crawl.description
+
+        resp = await self.storage_ops.download_streaming_wacz(metadata, crawl.resources)

        headers = {"Content-Disposition": f'attachment; filename="{crawl_id}.wacz"'}
        return StreamingResponse(
--- a/backend/btrixcloud/colls.py
+++ b/backend/btrixcloud/colls.py
@@ -323,7 +323,16 @@ class CollectionOps:
        """Download all WACZs in collection as streaming nested WACZ"""
        coll = await self.get_collection(coll_id, org, resources=True)

-        resp = await self.storage_ops.download_streaming_wacz(org, coll.resources)
+        metadata = {
+            "type": "collection",
+            "id": str(coll_id),
+            "title": coll.name,
+            "organization": org.slug,
+        }
+        if coll.description:
+            metadata["description"] = coll.description
+
+        resp = await self.storage_ops.download_streaming_wacz(metadata, coll.resources)

        headers = {"Content-Disposition": f'attachment; filename="{coll.name}.wacz"'}
        return StreamingResponse(
--- a/backend/btrixcloud/crawls.py
+++ b/backend/btrixcloud/crawls.py
@@ -1034,7 +1034,16 @@ class CrawlOps(BaseCrawlOps):
        if not qa_run.resources:
            raise HTTPException(status_code=400, detail="qa_run_no_resources")

-        resp = await self.storage_ops.download_streaming_wacz(org, qa_run.resources)
+        metadata = {
+            "type": "qaRun",
+            "id": qa_run_id,
+            "crawlId": crawl_id,
+            "organization": org.slug,
+        }
+
+        resp = await self.storage_ops.download_streaming_wacz(
+            metadata, qa_run.resources
+        )

        finished = qa_run.finished.isoformat()

--- a/backend/btrixcloud/storages.py
+++ b/backend/btrixcloud/storages.py
@@ -31,11 +31,10 @@ from stream_zip import stream_zip, NO_COMPRESSION_64, Method
 from remotezip import RemoteZip

 import aiobotocore.session
-import boto3
+import requests

-from mypy_boto3_s3.client import S3Client
-from mypy_boto3_s3.type_defs import CompletedPartTypeDef
 from types_aiobotocore_s3 import S3Client as AIOS3Client
+from types_aiobotocore_s3.type_defs import CompletedPartTypeDef

 from .models import (
    BaseFile,
@@ -52,6 +51,7 @@ from .models import (
 )

 from .utils import is_bool, slug_from_name
+from .version import __version__


 if TYPE_CHECKING:
@@ -289,35 +289,6 @@ class StorageOps:
        ) as client:
            yield client, bucket, key

-    @asynccontextmanager
-    async def get_sync_client(
-        self, org: Organization
-    ) -> AsyncIterator[tuple[S3Client, str, str]]:
-        """context manager for s3 client"""
-        storage = self.get_org_primary_storage(org)
-
-        endpoint_url = storage.endpoint_url
-
-        if not endpoint_url.endswith("/"):
-            endpoint_url += "/"
-
-        parts = urlsplit(endpoint_url)
-        bucket, key = parts.path[1:].split("/", 1)
-
-        endpoint_url = parts.scheme + "://" + parts.netloc
-
-        try:
-            client = boto3.client(
-                "s3",
-                region_name=storage.region,
-                endpoint_url=endpoint_url,
-                aws_access_key_id=storage.access_key,
-                aws_secret_access_key=storage.secret_key,
-            )
-            yield client, bucket, key
-        finally:
-            client.close()
-
    async def verify_storage_upload(self, storage: S3Storage, filename: str) -> None:
        """Test credentials and storage endpoint by uploading an empty test file"""

@@ -683,21 +654,32 @@ class StorageOps:
                yield from file_stream

    def _sync_dl(
-        self, all_files: List[CrawlFileOut], client: S3Client, bucket: str, key: str
+        self, metadata: dict[str, str], all_files: List[CrawlFileOut]
    ) -> Iterator[bytes]:
        """generate streaming zip as sync"""
-        for file_ in all_files:
-            file_.path = file_.name
-
        datapackage = {
            "profile": "multi-wacz-package",
-            "resources": [file_.dict() for file_ in all_files],
+            "resources": [
+                {
+                    "name": file_.name,
+                    "path": file_.name,
+                    "hash": "sha256:" + file_.hash,
+                    "bytes": file_.size,
+                }
+                for file_ in all_files
+            ],
+            "software": f"Browsertrix v{__version__}",
+            **metadata,
        }
-        datapackage_bytes = json.dumps(datapackage).encode("utf-8")
+        datapackage_bytes = json.dumps(datapackage, indent=2).encode("utf-8")

-        def get_file(name) -> Iterator[bytes]:
-            response = client.get_object(Bucket=bucket, Key=key + name)
-            return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE)
+        def get_datapackage() -> Iterable[bytes]:
+            yield datapackage_bytes
+
+        def get_file(path: str) -> Iterable[bytes]:
+            path = self.resolve_internal_access_path(path)
+            r = requests.get(path, stream=True, timeout=None)
+            yield from r.iter_content(CHUNK_SIZE)

        def member_files() -> (
            Iterable[tuple[str, datetime, int, Method, Iterable[bytes]]]
@@ -710,7 +692,7 @@ class StorageOps:
                    modified_at,
                    perms,
                    NO_COMPRESSION_64(file_.size, 0),
-                    get_file(file_.name),
+                    get_file(file_.path),
                )

            yield (
@@ -720,25 +702,22 @@ class StorageOps:
                NO_COMPRESSION_64(
                    len(datapackage_bytes), zlib.crc32(datapackage_bytes)
                ),
-                (datapackage_bytes,),
+                get_datapackage(),
            )

        # stream_zip() is an Iterator but defined as an Iterable, can cast
        return cast(Iterator[bytes], stream_zip(member_files(), chunk_size=CHUNK_SIZE))

    async def download_streaming_wacz(
-        self, org: Organization, files: List[CrawlFileOut]
+        self, metadata: dict[str, str], files: List[CrawlFileOut]
    ) -> Iterator[bytes]:
        """return an iter for downloading a stream nested wacz file
        from list of files"""
-        async with self.get_sync_client(org) as (client, bucket, key):
-            loop = asyncio.get_event_loop()
+        loop = asyncio.get_event_loop()

-            resp = await loop.run_in_executor(
-                None, self._sync_dl, files, client, bucket, key
-            )
+        resp = await loop.run_in_executor(None, self._sync_dl, metadata, files)

-            return resp
+        return resp


 # ============================================================================
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -11,6 +11,7 @@ aiofiles
 kubernetes-asyncio==29.0.0
 kubernetes
 aiobotocore
+requests
 redis>=5.0.0
 pyyaml
 jinja2
@@ -18,10 +19,8 @@ humanize
 python-multipart
 pathvalidate
 https://github.com/ikreymer/stream-zip/archive/refs/heads/crc32-optional.zip
-boto3
 backoff>=2.2.1
 python-slugify>=8.0.1
-mypy_boto3_s3
 types_aiobotocore_s3
 types-redis
 types-python-slugify
--- a/backend/test/test_run_crawl.py
+++ b/backend/test/test_run_crawl.py
@@ -6,6 +6,7 @@ import zipfile
 import re
 import csv
 import codecs
+import json
 from tempfile import TemporaryFile
 from zipfile import ZipFile, ZIP_STORED

@@ -406,6 +407,15 @@ def test_download_wacz_crawls(
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED

+                if filename == "datapackage.json":
+                    data = zip_file.read(filename).decode("utf-8")
+                    datapackage = json.loads(data)
+                    assert len(datapackage["resources"]) == 1
+                    for resource in datapackage["resources"]:
+                        assert resource["name"] == resource["path"]
+                        assert resource["hash"]
+                        assert resource["bytes"]
+

 def test_update_crawl(
    admin_auth_headers,