Streaming Download for Collections (#1012)

* support streaming download of collections (part of #927)
- WACZ zip created on the fly using stream-zip
- add 'Download Collection' option to collection detail and list
- after editing collection, return to collection view
- tests: add test for streaming download, ensure WACZ files + datapackage present, STORE compression used

---------

Co-authored-by: sua yoo <sua@suayoo.com>
This commit is contained in:
Ilya Kreymer 2023-07-26 15:42:17 -07:00 committed by GitHub
parent 6062042fae
commit 6506965d98
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 214 additions and 30 deletions

View File

@ -177,8 +177,7 @@ class BaseCrawlOps:
size = 0 size = 0
for file_ in crawl.files: for file_ in crawl.files:
size += file_.size size += file_.size
status_code = await delete_crawl_file_object(org, file_, self.crawl_manager) if not await delete_crawl_file_object(org, file_, self.crawl_manager):
if status_code != 204:
raise HTTPException(status_code=400, detail="file_deletion_error") raise HTTPException(status_code=400, detail="file_deletion_error")
return size return size

View File

@ -8,6 +8,7 @@ from typing import Optional, List
import pymongo import pymongo
from fastapi import Depends, HTTPException from fastapi import Depends, HTTPException
from fastapi.responses import StreamingResponse
from .basecrawls import SUCCESSFUL_STATES from .basecrawls import SUCCESSFUL_STATES
from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@ -21,6 +22,9 @@ from .models import (
Organization, Organization,
PaginatedResponse, PaginatedResponse,
) )
from .storages import (
download_streaming_wacz,
)
# ============================================================================ # ============================================================================
@ -259,6 +263,17 @@ class CollectionOps:
return {"success": True} return {"success": True}
async def download_collection(self, coll_id: uuid.UUID, org: Organization):
    """Download all WACZs in collection as streaming nested WACZ

    Looks up the collection (with its resources), asks the storage layer
    for a streaming zip of those resources, and wraps the stream in a
    StreamingResponse served as an attachment named "<collection name>.wacz".

    The collection name is user-supplied, so it is not placed in the
    header verbatim: the plain ``filename`` parameter carries a sanitized
    ASCII fallback, and the exact name is carried percent-encoded in the
    RFC 5987 ``filename*`` parameter.
    """
    # function-scope import so this method does not depend on the
    # module's import block
    from urllib.parse import quote

    coll = await self.get_collection(coll_id, org, resources=True)

    resp = await download_streaming_wacz(org, self.crawl_manager, coll.resources)

    filename = f"{coll.name}.wacz"

    # Quotes, backslashes, control characters and non-ASCII text would
    # corrupt the quoted-string form (or fail header encoding), so they
    # are replaced in the fallback name only.
    fallback = "".join(
        char if " " <= char <= "~" and char not in '"\\' else "_"
        for char in filename
    )

    disposition = (
        f'attachment; filename="{fallback}"; '
        f"filename*=UTF-8''{quote(filename, safe='')}"
    )

    return StreamingResponse(
        resp,
        headers={"Content-Disposition": disposition},
        media_type="application/wacz+zip",
    )
# ============================================================================ # ============================================================================
async def update_collection_counts_and_tags( async def update_collection_counts_and_tags(
@ -458,4 +473,10 @@ def init_collections_api(app, mdb, crawls, orgs, crawl_manager):
): ):
return await colls.delete_collection(coll_id, org) return await colls.delete_collection(coll_id, org)
# GET endpoint that returns a collection as a download.
# `org` is resolved through the `org_viewer_dep` dependency
# (presumably limiting access to users who can view the org - confirm
# against where org_viewer_dep is defined).
# All of the work is delegated to `colls.download_collection`.
@app.get("/orgs/{oid}/collections/{coll_id}/download", tags=["collections"])
async def download_collection(
coll_id: uuid.UUID, org: Organization = Depends(org_viewer_dep)
):
return await colls.download_collection(coll_id, org)
return colls return colls

View File

@ -5,13 +5,24 @@ from typing import Union
from urllib.parse import urlsplit from urllib.parse import urlsplit
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
import asyncio
import json
from datetime import datetime
from fastapi import Depends, HTTPException from fastapi import Depends, HTTPException
from aiobotocore.session import get_session from stream_zip import stream_zip, NO_COMPRESSION_64
import aiobotocore.session
import boto3
from .models import Organization, DefaultStorage, S3Storage, User from .models import Organization, DefaultStorage, S3Storage, User
from .zip import get_zip_file, extract_and_parse_log_file from .zip import get_zip_file, extract_and_parse_log_file
CHUNK_SIZE = 1024 * 256
# ============================================================================ # ============================================================================
def init_storages_api(org_ops, crawl_manager, user_dep): def init_storages_api(org_ops, crawl_manager, user_dep):
"""API for updating storage for an org""" """API for updating storage for an org"""
@ -65,7 +76,7 @@ async def get_s3_client(storage, use_access=False):
endpoint_url = parts.scheme + "://" + parts.netloc endpoint_url = parts.scheme + "://" + parts.netloc
session = get_session() session = aiobotocore.session.get_session()
async with session.create_client( async with session.create_client(
"s3", "s3",
@ -77,6 +88,34 @@ async def get_s3_client(storage, use_access=False):
yield client, bucket, key yield client, bucket, key
# ============================================================================
def get_sync_s3_client(storage, use_access=False):
    """Create a synchronous (boto3) S3 client for the given storage.

    The storage endpoint URL is expected to look like
    ``scheme://host/<bucket>/<key prefix>``; it is split into the bare
    endpoint (scheme + host) used for the client, the bucket name, and
    the key prefix.

    Returns a tuple ``(client, bucket, key, public_endpoint_url)`` where
    ``public_endpoint_url`` is ``storage.access_endpoint_url`` when
    ``use_access`` is true and ``storage.endpoint_url`` otherwise.
    """
    full_url = storage.endpoint_url
    if not full_url.endswith("/"):
        full_url = full_url + "/"

    split_url = urlsplit(full_url)

    # path is "/<bucket>/<prefix>": drop the leading slash, then split once
    bucket, key = split_url.path[1:].split("/", 1)

    client = boto3.client(
        "s3",
        region_name=storage.region,
        endpoint_url=f"{split_url.scheme}://{split_url.netloc}",
        aws_access_key_id=storage.access_key,
        aws_secret_access_key=storage.secret_key,
    )

    if use_access:
        public_endpoint_url = storage.access_endpoint_url
    else:
        public_endpoint_url = storage.endpoint_url

    return client, bucket, key, public_endpoint_url
# ============================================================================ # ============================================================================
async def verify_storage_upload(storage, filename): async def verify_storage_upload(storage, filename):
"""Test credentials and storage endpoint by uploading an empty test file""" """Test credentials and storage endpoint by uploading an empty test file"""
@ -108,6 +147,22 @@ async def do_upload_single(org, filename, data, crawl_manager, storage_name="def
return await client.put_object(Bucket=bucket, Key=key, Body=data) return await client.put_object(Bucket=bucket, Key=key, Body=data)
# ============================================================================
async def get_sync_client(org, crawl_manager, storage_name="default", use_access=False):
    """Resolve the storage for an org and return a sync S3 client for it.

    Uses the org's own storage when its type is "s3"; otherwise asks the
    crawl manager for the default storage called ``storage_name``.

    Returns whatever ``get_sync_s3_client`` returns for that storage.
    Raises TypeError when no storage could be resolved.
    """
    if org.storage.type == "s3":
        storage = org.storage
    else:
        storage = await crawl_manager.get_default_storage(storage_name)

    if storage:
        return get_sync_s3_client(storage, use_access=use_access)

    raise TypeError("No Default Storage Found, Invalid Storage Type")
# ============================================================================ # ============================================================================
# pylint: disable=too-many-arguments,too-many-locals # pylint: disable=too-many-arguments,too-many-locals
async def do_upload_multipart( async def do_upload_multipart(
@ -232,10 +287,18 @@ async def get_presigned_url(org, crawlfile, crawl_manager, duration=3600):
# ============================================================================ # ============================================================================
async def delete_crawl_file_object(org, crawlfile, crawl_manager): async def delete_crawl_file_object(org, crawlfile, crawl_manager):
"""delete crawl file from storage.""" """delete crawl file from storage."""
return await delete_file(
org, crawlfile.filename, crawl_manager, crawlfile.def_storage_name
)
# ============================================================================
async def delete_file(org, filename, crawl_manager, def_storage_name="default"):
"""delete specified file from storage"""
status_code = None status_code = None
if crawlfile.def_storage_name: if def_storage_name:
s3storage = await crawl_manager.get_default_storage(crawlfile.def_storage_name) s3storage = await crawl_manager.get_default_storage(def_storage_name)
elif org.storage.type == "s3": elif org.storage.type == "s3":
s3storage = org.storage s3storage = org.storage
@ -248,11 +311,11 @@ async def delete_crawl_file_object(org, crawlfile, crawl_manager):
bucket, bucket,
key, key,
): ):
key += crawlfile.filename key += filename
response = await client.delete_object(Bucket=bucket, Key=key) response = await client.delete_object(Bucket=bucket, Key=key)
status_code = response["ResponseMetadata"]["HTTPStatusCode"] status_code = response["ResponseMetadata"]["HTTPStatusCode"]
return status_code return status_code == 204
# ============================================================================ # ============================================================================
@ -289,3 +352,55 @@ async def get_wacz_logs(org, crawlfile, crawl_manager):
combined_log_lines.extend(parsed_log_lines) combined_log_lines.extend(parsed_log_lines)
return combined_log_lines return combined_log_lines
# ============================================================================
def _sync_dl(all_files, client, bucket, key):
    """Build a streaming zip of the given files (synchronous).

    Each file is fetched from the bucket (object key = ``key`` + file
    name) and added uncompressed under its own name; a final
    "datapackage.json" entry lists every file as a resource.

    Note: sets ``path`` on each entry of ``all_files`` to its ``name``
    before serializing it.

    Returns the iterable produced by ``stream_zip``; objects are only
    requested from storage as that iterable is consumed.
    """
    resources = []
    for entry in all_files:
        entry.path = entry.name
        resources.append(entry.dict())

    manifest = json.dumps(
        {"profile": "multi-wacz-package", "resources": resources}
    ).encode("utf-8")

    # every member gets the same fixed timestamp and permissions
    fixed_mtime = datetime(year=1980, month=1, day=1)
    mode = 0o664

    def open_chunks(name):
        obj = client.get_object(Bucket=bucket, Key=key + name)
        return obj["Body"].iter_chunks(chunk_size=CHUNK_SIZE)

    def members():
        for entry in all_files:
            chunks = open_chunks(entry.name)
            yield (entry.name, fixed_mtime, mode, NO_COMPRESSION_64, chunks)

        yield ("datapackage.json", fixed_mtime, mode, NO_COMPRESSION_64, (manifest,))

    return stream_zip(members(), chunk_size=CHUNK_SIZE)
# ============================================================================
async def download_streaming_wacz(org, crawl_manager, files):
    """Return an iterator for downloading a streaming nested WACZ file
    built from a list of files.

    Resolves a synchronous S3 client for the org, then builds the zip
    stream via ``_sync_dl`` in the event loop's default executor so the
    blocking setup does not run on the event loop itself.
    """
    client, bucket, key, _ = await get_sync_client(org, crawl_manager)

    # inside a coroutine there is always a running loop;
    # get_running_loop() is the documented way to obtain it
    loop = asyncio.get_running_loop()

    return await loop.run_in_executor(None, _sync_dl, files, client, bucket, key)

View File

@ -11,3 +11,5 @@ jinja2
humanize humanize
python-multipart python-multipart
pathvalidate pathvalidate
https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-uncompress.zip
boto3

View File

@ -1,6 +1,9 @@
import requests import requests
import os import os
from zipfile import ZipFile, ZIP_STORED
from tempfile import TemporaryFile
from .conftest import API_PREFIX from .conftest import API_PREFIX
from .utils import read_in_chunks from .utils import read_in_chunks
@ -310,6 +313,28 @@ def test_add_upload_to_collection(crawler_auth_headers, default_org_id):
assert _coll_id in r.json()["collections"] assert _coll_id in r.json()["collections"]
def test_download_streaming_collection(crawler_auth_headers, default_org_id):
    """Stream the collection download to a temp file and check the zip.

    The archive must contain exactly four entries, each either a .wacz
    file or datapackage.json, and every entry must be stored without
    compression.
    """
    with TemporaryFile() as fh:
        with requests.get(
            f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/download",
            headers=crawler_auth_headers,
            stream=True,
        ) as r:
            assert r.status_code == 200
            # explicit chunk size: the iter_content() default is one byte
            # per iteration, which makes the download needlessly slow
            for chunk in r.iter_content(chunk_size=64 * 1024):
                fh.write(chunk)

        fh.seek(0)
        with ZipFile(fh, "r") as zip_file:
            contents = zip_file.namelist()

            assert len(contents) == 4
            for filename in contents:
                assert filename.endswith(".wacz") or filename == "datapackage.json"
                assert zip_file.getinfo(filename).compress_type == ZIP_STORED
def test_list_collections( def test_list_collections(
crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
): ):

View File

@ -110,6 +110,8 @@ export class CollectionDetail extends LiteElement {
`; `;
private renderActions = () => { private renderActions = () => {
const authToken = this.authState!.headers.Authorization.split(" ")[1];
return html` return html`
<sl-dropdown distance="4"> <sl-dropdown distance="4">
<sl-button slot="trigger" size="small" caret <sl-button slot="trigger" size="small" caret
@ -125,6 +127,20 @@ export class CollectionDetail extends LiteElement {
<sl-icon name="gear" slot="prefix"></sl-icon> <sl-icon name="gear" slot="prefix"></sl-icon>
${msg("Edit Collection")} ${msg("Edit Collection")}
</sl-menu-item> </sl-menu-item>
<sl-divider></sl-divider>
<!-- Shoelace doesn't allow "href" on menu items,
see https://github.com/shoelace-style/shoelace/issues/1351 -->
<a
href=${`/api/orgs/${this.orgId}/collections/${this.collectionId}/download?auth_bearer=${authToken}`}
class="px-6 py-[0.6rem] flex gap-2 items-center whitespace-nowrap hover:bg-neutral-100"
@click=${(e: MouseEvent) => {
(e.target as HTMLAnchorElement).closest("sl-dropdown")?.hide();
}}
>
<sl-icon name="cloud-download" slot="prefix"></sl-icon>
${msg("Download Collection")}
</a>
<sl-divider></sl-divider>
<sl-menu-item <sl-menu-item
style="--sl-color-neutral-700: var(--danger)" style="--sl-color-neutral-700: var(--danger)"
@click=${this.confirmDelete} @click=${this.confirmDelete}

View File

@ -95,10 +95,10 @@ export class CollectionEdit extends LiteElement {
await this.saveMetadata({ name, description }); await this.saveMetadata({ name, description });
} }
this.navTo(`/orgs/${this.orgId}/collections/view/${this.collectionId}`);
this.notify({ this.notify({
message: msg( message: msg(
html`Successfully updated html`Successfully updated <strong>${name}</strong> Collection.`
<strong>${name}</strong> Collection.`
), ),
variant: "success", variant: "success",
icon: "check2-circle", icon: "check2-circle",

View File

@ -439,28 +439,17 @@ export class CollectionsList extends LiteElement {
private renderItem = (col: Collection) => private renderItem = (col: Collection) =>
html`<li class="mb-2 last:mb-0"> html`<li class="mb-2 last:mb-0">
<a <div class="block border rounded leading-none">
href=${`/orgs/${this.orgId}/collections/view/${col.id}`}
class="block border rounded shadow-sm leading-none hover:bg-neutral-50"
@click=${(e: MouseEvent) => {
if (
(
(e.currentTarget as HTMLElement)?.querySelector(
".actionsCol"
) as HTMLElement
).contains(e.target as HTMLElement)
) {
e.preventDefault();
} else {
this.navLink(e);
}
}}
>
<div <div
class="relative p-3 md:p-0 grid grid-cols-1 md:grid-cols-[repeat(2,1fr)_16ch_repeat(2,10ch)_2.5rem] gap-3 lg:h-10 items-center" class="relative p-3 md:p-0 grid grid-cols-1 md:grid-cols-[repeat(2,1fr)_16ch_repeat(2,10ch)_2.5rem] gap-3 lg:h-10 items-center"
> >
<div class="col-span-1 md:pl-3 truncate font-semibold"> <div class="col-span-1 md:pl-3 truncate font-semibold">
${col.name} <a
href=${`/orgs/${this.orgId}/collections/view/${col.id}`}
class="block text-primary hover:text-indigo-500"
>
${col.name}
</a>
</div> </div>
<div class="col-span-1 order-last md:order-none truncate"> <div class="col-span-1 order-last md:order-none truncate">
${col.tags ${col.tags
@ -500,10 +489,12 @@ export class CollectionsList extends LiteElement {
${this.isCrawler ? this.renderActions(col) : ""} ${this.isCrawler ? this.renderActions(col) : ""}
</div> </div>
</div> </div>
</a> </div>
</li>`; </li>`;
private renderActions = (col: Collection) => { private renderActions = (col: Collection) => {
const authToken = this.authState!.headers.Authorization.split(" ")[1];
return html` return html`
<sl-dropdown distance="4"> <sl-dropdown distance="4">
<btrix-button class="p-2" slot="trigger" label=${msg("Actions")} icon> <btrix-button class="p-2" slot="trigger" label=${msg("Actions")} icon>
@ -517,6 +508,21 @@ export class CollectionsList extends LiteElement {
<sl-icon name="gear" slot="prefix"></sl-icon> <sl-icon name="gear" slot="prefix"></sl-icon>
${msg("Edit Collection")} ${msg("Edit Collection")}
</sl-menu-item> </sl-menu-item>
<sl-divider></sl-divider>
<!-- Shoelace doesn't allow "href" on menu items,
see https://github.com/shoelace-style/shoelace/issues/1351 -->
<a
href=${`/api/orgs/${this.orgId}/collections/${col.id}/download?auth_bearer=${authToken}`}
class="px-6 py-[0.6rem] flex gap-2 items-center whitespace-nowrap hover:bg-neutral-100"
download
@click=${(e: MouseEvent) => {
(e.target as HTMLAnchorElement).closest("sl-dropdown")?.hide();
}}
>
<sl-icon name="cloud-download" slot="prefix"></sl-icon>
${msg("Download Collection")}
</a>
<sl-divider></sl-divider>
<sl-menu-item <sl-menu-item
style="--sl-color-neutral-700: var(--danger)" style="--sl-color-neutral-700: var(--danger)"
@click=${() => this.confirmDelete(col)} @click=${() => this.confirmDelete(col)}

View File

@ -14,4 +14,4 @@ export type CollectionList = Collection[];
export type CollectionSearchValues = { export type CollectionSearchValues = {
names: string[]; names: string[];
} };