Streaming Download for Collections (#1012)
* support streaming download of collections (part of #927)
  - WACZ zip created on the fly using stream-zip
  - add 'Download Collection' option to collection detail and list
  - after editing collection, return to collection view
  - tests: add test for streaming download, ensure WACZ files + datapackage present, STORE compression used

---------

Co-authored-by: sua yoo <sua@suayoo.com>
parent 6062042fae
commit 6506965d98
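The new endpoint streams the combined WACZ as it is assembled, so neither server nor client ever holds the whole archive in memory. A minimal sketch of consuming it with Python requests, mirroring the test added in this commit (the base URL, IDs, and token are placeholder values, not part of the commit):

import requests

# Placeholder values: substitute a real deployment URL, org ID,
# collection ID, and bearer token.
API = "https://app.example.com/api"
ORG_ID, COLL_ID, TOKEN = "ORG_UUID", "COLL_UUID", "ACCESS_TOKEN"

with requests.get(
    f"{API}/orgs/{ORG_ID}/collections/{COLL_ID}/download",
    headers={"Authorization": f"Bearer {TOKEN}"},
    stream=True,  # iterate the WACZ as the server generates it
) as r:
    r.raise_for_status()
    with open("collection.wacz", "wb") as fh:
        for chunk in r.iter_content(chunk_size=1024 * 256):
            fh.write(chunk)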
@@ -177,8 +177,7 @@ class BaseCrawlOps:
         size = 0
         for file_ in crawl.files:
             size += file_.size
-            status_code = await delete_crawl_file_object(org, file_, self.crawl_manager)
-            if status_code != 204:
+            if not await delete_crawl_file_object(org, file_, self.crawl_manager):
                 raise HTTPException(status_code=400, detail="file_deletion_error")
 
         return size
@@ -8,6 +8,7 @@ from typing import Optional, List
 
 import pymongo
 from fastapi import Depends, HTTPException
+from fastapi.responses import StreamingResponse
 
 from .basecrawls import SUCCESSFUL_STATES
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
@@ -21,6 +22,9 @@ from .models import (
     Organization,
     PaginatedResponse,
 )
+from .storages import (
+    download_streaming_wacz,
+)
 
 
 # ============================================================================
@@ -259,6 +263,17 @@ class CollectionOps:
 
         return {"success": True}
 
+    async def download_collection(self, coll_id: uuid.UUID, org: Organization):
+        """Download all WACZs in collection as streaming nested WACZ"""
+        coll = await self.get_collection(coll_id, org, resources=True)
+
+        resp = await download_streaming_wacz(org, self.crawl_manager, coll.resources)
+
+        headers = {"Content-Disposition": f'attachment; filename="{coll.name}.wacz"'}
+        return StreamingResponse(
+            resp, headers=headers, media_type="application/wacz+zip"
+        )
+
 
 # ============================================================================
 async def update_collection_counts_and_tags(
@@ -458,4 +473,10 @@ def init_collections_api(app, mdb, crawls, orgs, crawl_manager):
     ):
         return await colls.delete_collection(coll_id, org)
 
+    @app.get("/orgs/{oid}/collections/{coll_id}/download", tags=["collections"])
+    async def download_collection(
+        coll_id: uuid.UUID, org: Organization = Depends(org_viewer_dep)
+    ):
+        return await colls.download_collection(coll_id, org)
+
     return colls
@@ -5,13 +5,24 @@ from typing import Union
 from urllib.parse import urlsplit
 from contextlib import asynccontextmanager
 
+import asyncio
+import json
+
+from datetime import datetime
+
 from fastapi import Depends, HTTPException
-from aiobotocore.session import get_session
+from stream_zip import stream_zip, NO_COMPRESSION_64
+
+import aiobotocore.session
+import boto3
 
 from .models import Organization, DefaultStorage, S3Storage, User
 from .zip import get_zip_file, extract_and_parse_log_file
 
 
+CHUNK_SIZE = 1024 * 256
+
+
 # ============================================================================
 def init_storages_api(org_ops, crawl_manager, user_dep):
     """API for updating storage for an org"""
@@ -65,7 +76,7 @@ async def get_s3_client(storage, use_access=False):
 
     endpoint_url = parts.scheme + "://" + parts.netloc
 
-    session = get_session()
+    session = aiobotocore.session.get_session()
 
     async with session.create_client(
         "s3",
@@ -77,6 +88,34 @@ async def get_s3_client(storage, use_access=False):
         yield client, bucket, key
 
 
+# ============================================================================
+def get_sync_s3_client(storage, use_access=False):
+    """context manager for s3 client"""
+    endpoint_url = storage.endpoint_url
+
+    if not endpoint_url.endswith("/"):
+        endpoint_url += "/"
+
+    parts = urlsplit(endpoint_url)
+    bucket, key = parts.path[1:].split("/", 1)
+
+    endpoint_url = parts.scheme + "://" + parts.netloc
+
+    client = boto3.client(
+        "s3",
+        region_name=storage.region,
+        endpoint_url=endpoint_url,
+        aws_access_key_id=storage.access_key,
+        aws_secret_access_key=storage.secret_key,
+    )
+
+    public_endpoint_url = (
+        storage.endpoint_url if not use_access else storage.access_endpoint_url
+    )
+
+    return client, bucket, key, public_endpoint_url
+
+
 # ============================================================================
 async def verify_storage_upload(storage, filename):
     """Test credentials and storage endpoint by uploading an empty test file"""
@@ -108,6 +147,22 @@ async def do_upload_single(org, filename, data, crawl_manager, storage_name="default"):
         return await client.put_object(Bucket=bucket, Key=key, Body=data)
 
 
+# ============================================================================
+async def get_sync_client(org, crawl_manager, storage_name="default", use_access=False):
+    """get sync client"""
+    s3storage = None
+
+    if org.storage.type == "s3":
+        s3storage = org.storage
+    else:
+        s3storage = await crawl_manager.get_default_storage(storage_name)
+
+    if not s3storage:
+        raise TypeError("No Default Storage Found, Invalid Storage Type")
+
+    return get_sync_s3_client(s3storage, use_access=use_access)
+
+
 # ============================================================================
 # pylint: disable=too-many-arguments,too-many-locals
 async def do_upload_multipart(
@@ -232,10 +287,18 @@ async def get_presigned_url(org, crawlfile, crawl_manager, duration=3600):
 # ============================================================================
 async def delete_crawl_file_object(org, crawlfile, crawl_manager):
     """delete crawl file from storage."""
+    return await delete_file(
+        org, crawlfile.filename, crawl_manager, crawlfile.def_storage_name
+    )
+
+
+# ============================================================================
+async def delete_file(org, filename, crawl_manager, def_storage_name="default"):
+    """delete specified file from storage"""
     status_code = None
 
-    if crawlfile.def_storage_name:
-        s3storage = await crawl_manager.get_default_storage(crawlfile.def_storage_name)
+    if def_storage_name:
+        s3storage = await crawl_manager.get_default_storage(def_storage_name)
 
     elif org.storage.type == "s3":
         s3storage = org.storage
@@ -248,11 +311,11 @@ async def delete_crawl_file_object(org, crawlfile, crawl_manager):
         bucket,
         key,
     ):
-        key += crawlfile.filename
+        key += filename
         response = await client.delete_object(Bucket=bucket, Key=key)
         status_code = response["ResponseMetadata"]["HTTPStatusCode"]
 
-    return status_code
+    return status_code == 204
 
 
 # ============================================================================
@@ -289,3 +352,55 @@ async def get_wacz_logs(org, crawlfile, crawl_manager):
         combined_log_lines.extend(parsed_log_lines)
 
     return combined_log_lines
+
+
+# ============================================================================
+def _sync_dl(all_files, client, bucket, key):
+    """generate streaming zip as sync"""
+    for file_ in all_files:
+        file_.path = file_.name
+
+    datapackage = {
+        "profile": "multi-wacz-package",
+        "resources": [file_.dict() for file_ in all_files],
+    }
+    datapackage = json.dumps(datapackage).encode("utf-8")
+
+    def get_file(name):
+        response = client.get_object(Bucket=bucket, Key=key + name)
+        return response["Body"].iter_chunks(chunk_size=CHUNK_SIZE)
+
+    def member_files():
+        modified_at = datetime(year=1980, month=1, day=1)
+        perms = 0o664
+        for file_ in all_files:
+            yield (
+                file_.name,
+                modified_at,
+                perms,
+                NO_COMPRESSION_64,
+                get_file(file_.name),
+            )
+
+        yield (
+            "datapackage.json",
+            modified_at,
+            perms,
+            NO_COMPRESSION_64,
+            (datapackage,),
+        )
+
+    return stream_zip(member_files(), chunk_size=CHUNK_SIZE)
+
+
+# ============================================================================
+async def download_streaming_wacz(org, crawl_manager, files):
+    """return an iter for downloading a stream nested wacz file
+    from list of files"""
+    client, bucket, key, _ = await get_sync_client(org, crawl_manager)
+
+    loop = asyncio.get_event_loop()
+
+    resp = await loop.run_in_executor(None, _sync_dl, files, client, bucket, key)
+
+    return resp
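The streaming zip relies on stream-zip's generator API: each member is a (name, modified_at, mode, method, chunks) tuple whose contents may be any iterable of bytes, and stream_zip() yields the archive incrementally, so a multi-gigabyte collection is never buffered in full. A minimal standalone sketch of the same pattern (assuming the stream-zip fork pinned in requirements.txt below, which provides the uncompressed NO_COMPRESSION_64 / STORE method):

from datetime import datetime

from stream_zip import stream_zip, NO_COMPRESSION_64


def members():
    # zip timestamps cannot predate 1980, so use that as a fixed value
    modified_at = datetime(1980, 1, 1)
    perms = 0o664
    # member contents are lazy iterables of bytes; nothing is read up front
    yield ("example.wacz", modified_at, perms, NO_COMPRESSION_64, (b"wacz bytes",))
    yield ("datapackage.json", modified_at, perms, NO_COMPRESSION_64, (b"{}",))


# stream_zip() returns an iterator of zip chunks, ready to feed a
# streaming HTTP response chunk by chunk
for chunk in stream_zip(members(), chunk_size=1024 * 256):
    print(len(chunk))  # stand-in for forwarding the chunk to the client

Note that _sync_dl only builds the generator; Starlette's StreamingResponse then iterates the synchronous iterator in a threadpool, keeping the blocking boto3 reads off the event loop.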
@@ -11,3 +11,5 @@ jinja2
 humanize
 python-multipart
 pathvalidate
+https://github.com/ikreymer/stream-zip/archive/refs/heads/stream-uncompress.zip
+boto3
@@ -1,6 +1,9 @@
 import requests
 import os
 
+from zipfile import ZipFile, ZIP_STORED
+from tempfile import TemporaryFile
+
 from .conftest import API_PREFIX
 from .utils import read_in_chunks
 
@@ -310,6 +313,28 @@ def test_add_upload_to_collection(crawler_auth_headers, default_org_id):
     assert _coll_id in r.json()["collections"]
 
 
+def test_download_streaming_collection(crawler_auth_headers, default_org_id):
+    # Add upload
+    with TemporaryFile() as fh:
+        with requests.get(
+            f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/download",
+            headers=crawler_auth_headers,
+            stream=True,
+        ) as r:
+            assert r.status_code == 200
+            for chunk in r.iter_content():
+                fh.write(chunk)
+
+        fh.seek(0)
+        with ZipFile(fh, "r") as zip_file:
+            contents = zip_file.namelist()
+
+            assert len(contents) == 4
+            for filename in contents:
+                assert filename.endswith(".wacz") or filename == "datapackage.json"
+                assert zip_file.getinfo(filename).compress_type == ZIP_STORED
+
+
 def test_list_collections(
     crawler_auth_headers, default_org_id, crawler_crawl_id, admin_crawl_id
 ):
@@ -110,6 +110,8 @@ export class CollectionDetail extends LiteElement {
     `;
 
   private renderActions = () => {
+    const authToken = this.authState!.headers.Authorization.split(" ")[1];
+
     return html`
       <sl-dropdown distance="4">
         <sl-button slot="trigger" size="small" caret
@@ -125,6 +127,20 @@ export class CollectionDetail extends LiteElement {
           <sl-icon name="gear" slot="prefix"></sl-icon>
           ${msg("Edit Collection")}
         </sl-menu-item>
+        <sl-divider></sl-divider>
+        <!-- Shoelace doesn't allow "href" on menu items,
+          see https://github.com/shoelace-style/shoelace/issues/1351 -->
+        <a
+          href=${`/api/orgs/${this.orgId}/collections/${this.collectionId}/download?auth_bearer=${authToken}`}
+          class="px-6 py-[0.6rem] flex gap-2 items-center whitespace-nowrap hover:bg-neutral-100"
+          @click=${(e: MouseEvent) => {
+            (e.target as HTMLAnchorElement).closest("sl-dropdown")?.hide();
+          }}
+        >
+          <sl-icon name="cloud-download" slot="prefix"></sl-icon>
+          ${msg("Download Collection")}
+        </a>
+        <sl-divider></sl-divider>
         <sl-menu-item
           style="--sl-color-neutral-700: var(--danger)"
           @click=${this.confirmDelete}
@@ -95,10 +95,10 @@ export class CollectionEdit extends LiteElement {
       await this.saveMetadata({ name, description });
     }
 
+    this.navTo(`/orgs/${this.orgId}/collections/view/${this.collectionId}`);
     this.notify({
       message: msg(
-        html`Successfully updated
-          <strong>${name}</strong> Collection.`
+        html`Successfully updated <strong>${name}</strong> Collection.`
       ),
       variant: "success",
       icon: "check2-circle",
@@ -439,28 +439,17 @@ export class CollectionsList extends LiteElement {
 
   private renderItem = (col: Collection) =>
     html`<li class="mb-2 last:mb-0">
-      <a
-        href=${`/orgs/${this.orgId}/collections/view/${col.id}`}
-        class="block border rounded shadow-sm leading-none hover:bg-neutral-50"
-        @click=${(e: MouseEvent) => {
-          if (
-            (
-              (e.currentTarget as HTMLElement)?.querySelector(
-                ".actionsCol"
-              ) as HTMLElement
-            ).contains(e.target as HTMLElement)
-          ) {
-            e.preventDefault();
-          } else {
-            this.navLink(e);
-          }
-        }}
-      >
+      <div class="block border rounded leading-none">
         <div
           class="relative p-3 md:p-0 grid grid-cols-1 md:grid-cols-[repeat(2,1fr)_16ch_repeat(2,10ch)_2.5rem] gap-3 lg:h-10 items-center"
         >
           <div class="col-span-1 md:pl-3 truncate font-semibold">
-            ${col.name}
+            <a
+              href=${`/orgs/${this.orgId}/collections/view/${col.id}`}
+              class="block text-primary hover:text-indigo-500"
+            >
+              ${col.name}
+            </a>
           </div>
           <div class="col-span-1 order-last md:order-none truncate">
             ${col.tags
@@ -500,10 +489,12 @@ export class CollectionsList extends LiteElement {
           ${this.isCrawler ? this.renderActions(col) : ""}
         </div>
       </div>
-    </a>
+      </div>
     </li>`;
 
   private renderActions = (col: Collection) => {
+    const authToken = this.authState!.headers.Authorization.split(" ")[1];
+
     return html`
       <sl-dropdown distance="4">
         <btrix-button class="p-2" slot="trigger" label=${msg("Actions")} icon>
@@ -517,6 +508,21 @@ export class CollectionsList extends LiteElement {
          <sl-icon name="gear" slot="prefix"></sl-icon>
          ${msg("Edit Collection")}
        </sl-menu-item>
+        <sl-divider></sl-divider>
+        <!-- Shoelace doesn't allow "href" on menu items,
+          see https://github.com/shoelace-style/shoelace/issues/1351 -->
+        <a
+          href=${`/api/orgs/${this.orgId}/collections/${col.id}/download?auth_bearer=${authToken}`}
+          class="px-6 py-[0.6rem] flex gap-2 items-center whitespace-nowrap hover:bg-neutral-100"
+          download
+          @click=${(e: MouseEvent) => {
+            (e.target as HTMLAnchorElement).closest("sl-dropdown")?.hide();
+          }}
+        >
+          <sl-icon name="cloud-download" slot="prefix"></sl-icon>
+          ${msg("Download Collection")}
+        </a>
+        <sl-divider></sl-divider>
        <sl-menu-item
          style="--sl-color-neutral-700: var(--danger)"
          @click=${() => this.confirmDelete(col)}
@@ -14,4 +14,4 @@ export type CollectionList = Collection[];
 
 export type CollectionSearchValues = {
   names: string[];
-}
+};