browsertrix/backend/btrixcloud/colls.py
Tessa Walsh a031fab313
Backend work for public collections (#2198)
Fixes #2182 

This rather large PR adds the remaining backend support needed for public
collections work in the frontend.

New API endpoints include (see the request sketch after this list):

- Public collections endpoints: GET, streaming download
- Paginated list of URLs in collection with snapshot (page) info for
each
- Collection endpoint to set home URL
- Collection endpoint to upload thumbnail as stream
- DELETE endpoint to remove collection thumbnail
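
As a rough illustration only (placeholder URL, org slug, and collection ID, not code from this PR), the new public endpoints can be exercised with plain HTTP requests; any deployment-specific URL prefix is omitted:

```python
# Hedged sketch of calling the new public-collection endpoints with `requests`.
# BASE_URL, ORG_SLUG, and COLL_ID are placeholders; the routes match those
# registered in colls.py, minus any deployment-specific prefix.
import requests

BASE_URL = "https://btrix.example.org"
ORG_SLUG = "my-org"
COLL_ID = "11111111-2222-3333-4444-555555555555"

# Public collection metadata (404s unless the collection is public or unlisted)
coll = requests.get(
    f"{BASE_URL}/public/orgs/{ORG_SLUG}/collections/{COLL_ID}", timeout=30
).json()

# Streaming WACZ download of the whole collection
with requests.get(
    f"{BASE_URL}/public/orgs/{ORG_SLUG}/collections/{COLL_ID}/download",
    stream=True,
    timeout=120,
) as resp:
    resp.raise_for_status()
    with open(f"{ORG_SLUG}-collection.wacz", "wb") as fh:
        for chunk in resp.iter_content(chunk_size=1024 * 1024):
            fh.write(chunk)
```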

Changes to existing API endpoints include:

- Paginating public collection list results
- Several `pages` endpoints that previously only supported `/crawls/` in
their path, e.g. `/orgs/{oid}/crawls/all/pages/reAdd`, now support the
`/uploads/` and `/all-crawls/` namespaces as well. This is required now
that pages for uploads are added to the database (see below). For
`/orgs/{oid}/namespace/all/pages/reAdd`, `crawls` or `uploads` serves as
a filter so that only items of that type are affected. Other endpoints
are more liberal at this point and perform the same action regardless of
the namespace used in the route (we'll likely want to change this in a
follow-up to be more consistent). See the sketch after this list.
- `/orgs/{oid}/namespace/all/pages/reAdd` now kicks off a background job
rather than doing all of the computation in an asyncio task in the
backend container. The background job additionally updates collection
date ranges, page/size counts, and tags for each collection in the org
after pages have been (re)added.
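
A hedged sketch of the namespaced `reAdd` call described above (the route itself lives in the pages module rather than this file, so the HTTP method and auth header here are assumptions):

```python
# Hedged sketch; BASE_URL, OID, and the Bearer token are placeholders, and the
# POST method/auth scheme are assumptions (the route is defined in pages.py).
import requests

BASE_URL = "https://btrix.example.org"
OID = "11111111-2222-3333-4444-555555555555"
HEADERS = {"Authorization": "Bearer <access-token>"}

# "crawls" or "uploads" restricts the re-add to that item type;
# "all-crawls" covers both.
for namespace in ("crawls", "uploads", "all-crawls"):
    requests.post(
        f"{BASE_URL}/orgs/{OID}/{namespace}/all/pages/reAdd",
        headers=HEADERS,
        timeout=30,
    ).raise_for_status()
```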

Other big changes:

- New uploads will now have their pages read into the database!
Collection page counts now also include pages from uploads.
- A migration was added to start a background job for each org that will
add the pages for previously-uploaded WACZ files to the database and
update collections accordingly.
- Adds a new `ImageFile` subclass of `BaseFile` for thumbnails, which we
can also use for other user-uploaded image files moving forward, with
separate output models for authenticated and public endpoints (roughly
sketched below).
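
The `ImageFile` model itself lives in `models.py` rather than in this file; as a loose conceptual sketch only (field names are illustrative assumptions, and only the split between authenticated and public output models is implied by how `colls.py` uses the class):

```python
# Conceptual sketch only -- the real classes are defined in btrixcloud/models.py.
# Field names below are illustrative assumptions; the point shown is the split
# between an authenticated and a public output model for thumbnails.
from datetime import datetime
from pydantic import BaseModel


class PublicImageFileOut(BaseModel):
    """Subset of thumbnail info safe for public endpoints."""

    name: str
    path: str  # e.g. a presigned URL
    size: int


class ImageFileOut(PublicImageFileOut):
    """Adds uploader and creation metadata for authenticated endpoints."""

    originalFilename: str
    userName: str
    created: datetime
```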
2025-01-13 15:15:48 -08:00


"""
Collections API
"""
# pylint: disable=too-many-lines
from collections import Counter
from uuid import UUID, uuid4
from typing import Optional, List, TYPE_CHECKING, cast, Dict, Tuple, Any, Union
import os
import re
import urllib.parse
import asyncio
import pymongo
from fastapi import Depends, HTTPException, Response
from fastapi.responses import StreamingResponse
from starlette.requests import Request
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .models import (
Collection,
CollIn,
CollOut,
CollIdName,
UpdateColl,
AddRemoveCrawlList,
BaseCrawl,
CrawlOutWithResources,
CrawlFileOut,
Organization,
PaginatedCollOutResponse,
SUCCESSFUL_STATES,
AddedResponseIdName,
EmptyResponse,
UpdatedResponse,
SuccessResponse,
AddedResponse,
DeletedResponse,
CollectionSearchValuesResponse,
OrgPublicCollections,
PublicOrgDetails,
CollAccessType,
PageUrlCount,
PageIdTimestamp,
PaginatedPageUrlCountResponse,
UpdateCollHomeUrl,
User,
ImageFile,
ImageFilePreparer,
MIN_UPLOAD_PART_SIZE,
PublicCollOut,
)
from .utils import dt_now
if TYPE_CHECKING:
from .orgs import OrgOps
from .storages import StorageOps
from .webhooks import EventWebhookOps
from .crawls import CrawlOps
else:
OrgOps = StorageOps = EventWebhookOps = CrawlOps = object
THUMBNAIL_MAX_SIZE = 2_000_000
# ============================================================================
class CollectionOps:
"""ops for working with named collections of crawls"""
# pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods
orgs: OrgOps
storage_ops: StorageOps
event_webhook_ops: EventWebhookOps
crawl_ops: CrawlOps
def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
self.collections = mdb["collections"]
self.crawls = mdb["crawls"]
self.crawl_configs = mdb["crawl_configs"]
self.pages = mdb["pages"]
self.crawl_ops = cast(CrawlOps, None)
self.orgs = orgs
self.storage_ops = storage_ops
self.event_webhook_ops = event_webhook_ops
def set_crawl_ops(self, ops):
"""set crawl ops"""
self.crawl_ops = ops
def set_page_ops(self, ops):
"""set page ops"""
# pylint: disable=attribute-defined-outside-init
self.page_ops = ops
async def init_index(self):
"""init lookup index"""
await self.collections.create_index(
[("oid", pymongo.ASCENDING), ("name", pymongo.ASCENDING)], unique=True
)
await self.collections.create_index(
[("oid", pymongo.ASCENDING), ("description", pymongo.ASCENDING)]
)
async def add_collection(self, oid: UUID, coll_in: CollIn):
"""Add new collection"""
crawl_ids = coll_in.crawlIds if coll_in.crawlIds else []
coll_id = uuid4()
modified = dt_now()
coll = Collection(
id=coll_id,
oid=oid,
name=coll_in.name,
description=coll_in.description,
caption=coll_in.caption,
modified=modified,
access=coll_in.access,
defaultThumbnailName=coll_in.defaultThumbnailName,
allowPublicDownload=coll_in.allowPublicDownload,
)
try:
await self.collections.insert_one(coll.to_dict())
org = await self.orgs.get_org_by_id(oid)
if crawl_ids:
await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)
await self.update_collection_counts_and_tags(coll_id)
await self.update_collection_dates(coll_id)
asyncio.create_task(
self.event_webhook_ops.create_added_to_collection_notification(
crawl_ids, coll_id, org
)
)
return {"added": True, "id": coll_id, "name": coll.name}
except pymongo.errors.DuplicateKeyError:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=400, detail="collection_name_taken")
async def update_collection(
self, coll_id: UUID, org: Organization, update: UpdateColl
):
"""Update collection"""
query = update.dict(exclude_unset=True)
if len(query) == 0:
raise HTTPException(status_code=400, detail="no_update_data")
query["modified"] = dt_now()
try:
result = await self.collections.find_one_and_update(
{"_id": coll_id, "oid": org.id},
{"$set": query},
return_document=pymongo.ReturnDocument.AFTER,
)
except pymongo.errors.DuplicateKeyError:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=400, detail="collection_name_taken")
if not result:
raise HTTPException(status_code=404, detail="collection_not_found")
return {"updated": True}
async def add_crawls_to_collection(
self, coll_id: UUID, crawl_ids: List[str], org: Organization
) -> CollOut:
"""Add crawls to collection"""
await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)
modified = dt_now()
result = await self.collections.find_one_and_update(
{"_id": coll_id},
{"$set": {"modified": modified}},
return_document=pymongo.ReturnDocument.AFTER,
)
if not result:
raise HTTPException(status_code=404, detail="collection_not_found")
await self.update_collection_counts_and_tags(coll_id)
await self.update_collection_dates(coll_id)
asyncio.create_task(
self.event_webhook_ops.create_added_to_collection_notification(
crawl_ids, coll_id, org
)
)
return await self.get_collection_out(coll_id, org)
async def remove_crawls_from_collection(
self, coll_id: UUID, crawl_ids: List[str], org: Organization
) -> CollOut:
"""Remove crawls from collection"""
await self.crawl_ops.remove_from_collection(crawl_ids, coll_id)
modified = dt_now()
result = await self.collections.find_one_and_update(
{"_id": coll_id},
{"$set": {"modified": modified}},
return_document=pymongo.ReturnDocument.AFTER,
)
if not result:
raise HTTPException(status_code=404, detail="collection_not_found")
await self.update_collection_counts_and_tags(coll_id)
await self.update_collection_dates(coll_id)
asyncio.create_task(
self.event_webhook_ops.create_removed_from_collection_notification(
crawl_ids, coll_id, org
)
)
return await self.get_collection_out(coll_id, org)
async def get_collection_raw(
self, coll_id: UUID, public_or_unlisted_only: bool = False
) -> Dict[str, Any]:
"""Get collection by id as dict from database"""
query: dict[str, object] = {"_id": coll_id}
if public_or_unlisted_only:
query["access"] = {"$in": ["public", "unlisted"]}
result = await self.collections.find_one(query)
if not result:
raise HTTPException(status_code=404, detail="collection_not_found")
return result
async def get_collection(
self, coll_id: UUID, public_or_unlisted_only: bool = False
) -> Collection:
"""Get collection by id"""
result = await self.get_collection_raw(coll_id, public_or_unlisted_only)
return Collection.from_dict(result)
async def get_collection_out(
self,
coll_id: UUID,
org: Organization,
resources=False,
public_or_unlisted_only=False,
) -> CollOut:
"""Get CollOut by id"""
result = await self.get_collection_raw(coll_id, public_or_unlisted_only)
if resources:
result["resources"] = await self.get_collection_crawl_resources(coll_id)
thumbnail = result.get("thumbnail")
if thumbnail:
image_file = ImageFile(**thumbnail)
result["thumbnail"] = await image_file.get_image_file_out(
org, self.storage_ops
)
return CollOut.from_dict(result)
async def get_public_collection_out(
self, coll_id: UUID, org: Organization, allow_unlisted: bool = False
) -> PublicCollOut:
"""Get PublicCollOut by id"""
result = await self.get_collection_raw(coll_id)
allowed_access = [CollAccessType.PUBLIC]
if allow_unlisted:
allowed_access.append(CollAccessType.UNLISTED)
if result.get("access") not in allowed_access:
raise HTTPException(status_code=404, detail="collection_not_found")
result["resources"] = await self.get_collection_crawl_resources(coll_id)
thumbnail = result.get("thumbnail")
if thumbnail:
image_file = ImageFile(**thumbnail)
result["thumbnail"] = await image_file.get_public_image_file_out(
org, self.storage_ops
)
return PublicCollOut.from_dict(result)
async def list_collections(
self,
org: Organization,
public_colls_out: bool = False,
page_size: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sort_by: Optional[str] = None,
sort_direction: int = 1,
name: Optional[str] = None,
name_prefix: Optional[str] = None,
access: Optional[str] = None,
):
"""List all collections for org"""
# pylint: disable=too-many-locals, duplicate-code, too-many-branches
# Zero-index page for query
page = page - 1
skip = page * page_size
match_query: dict[str, object] = {"oid": org.id}
if name:
match_query["name"] = name
elif name_prefix:
regex_pattern = f"^{name_prefix}"
match_query["name"] = {"$regex": regex_pattern, "$options": "i"}
if public_colls_out:
match_query["access"] = CollAccessType.PUBLIC
elif access:
match_query["access"] = access
aggregate = [{"$match": match_query}]
if sort_by:
if sort_by not in ("modified", "name", "description", "totalSize"):
raise HTTPException(status_code=400, detail="invalid_sort_by")
if sort_direction not in (1, -1):
raise HTTPException(status_code=400, detail="invalid_sort_direction")
aggregate.extend([{"$sort": {sort_by: sort_direction}}])
aggregate.extend(
[
{
"$facet": {
"items": [
{"$skip": skip},
{"$limit": page_size},
],
"total": [{"$count": "count"}],
}
},
]
)
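# Note (illustrative, not in the original source): the $facet stage returns a
# single document shaped like {"items": [<page of colls>], "total": [{"count": <int>}]};
# "total" is an empty list when nothing matches, which is why IndexError is
# caught when reading the count below.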
cursor = self.collections.aggregate(
aggregate, collation=pymongo.collation.Collation(locale="en")
)
results = await cursor.to_list(length=1)
result = results[0]
items = result["items"]
try:
total = int(result["total"][0]["count"])
except (IndexError, ValueError):
total = 0
collections: List[Union[CollOut, PublicCollOut]] = []
for res in items:
res["resources"] = await self.get_collection_crawl_resources(res["_id"])
thumbnail = res.get("thumbnail")
if thumbnail:
image_file = ImageFile(**thumbnail)
if public_colls_out:
res["thumbnail"] = await image_file.get_public_image_file_out(
org, self.storage_ops
)
else:
res["thumbnail"] = await image_file.get_image_file_out(
org, self.storage_ops
)
if public_colls_out:
collections.append(PublicCollOut.from_dict(res))
else:
collections.append(CollOut.from_dict(res))
return collections, total
async def get_collection_crawl_resources(self, coll_id: UUID):
"""Return pre-signed resources for all collection crawl files."""
# Ensure collection exists
_ = await self.get_collection_raw(coll_id)
all_files = []
crawls, _ = await self.crawl_ops.list_all_base_crawls(
collection_id=coll_id,
states=list(SUCCESSFUL_STATES),
page_size=10_000,
cls_type=CrawlOutWithResources,
)
for crawl in crawls:
if crawl.resources:
all_files.extend(crawl.resources)
return all_files
async def get_collection_names(self, uuids: List[UUID]):
"""return object of {_id, names} given list of collection ids"""
cursor = self.collections.find(
{"_id": {"$in": uuids}}, projection=["_id", "name"]
)
names = await cursor.to_list(length=1000)
names = [
CollIdName(id=namedata["_id"], name=namedata["name"]) for namedata in names
]
return names
async def get_collection_search_values(self, org: Organization):
"""Return list of collection names"""
names = await self.collections.distinct("name", {"oid": org.id})
# Remove empty strings
names = [name for name in names if name]
return {"names": names}
async def get_collection_crawl_ids(self, coll_id: UUID) -> List[str]:
"""Return list of crawl ids in collection"""
crawl_ids = []
async for crawl_raw in self.crawls.find(
{"collectionIds": coll_id}, projection=["_id"]
):
crawl_id = crawl_raw.get("_id")
if crawl_id:
crawl_ids.append(crawl_id)
return crawl_ids
async def delete_collection(self, coll_id: UUID, org: Organization):
"""Delete collection and remove from associated crawls."""
await self.crawl_ops.remove_collection_from_all_crawls(coll_id)
result = await self.collections.delete_one({"_id": coll_id, "oid": org.id})
if result.deleted_count < 1:
raise HTTPException(status_code=404, detail="collection_not_found")
asyncio.create_task(
self.event_webhook_ops.create_collection_deleted_notification(coll_id, org)
)
return {"success": True}
async def download_collection(self, coll_id: UUID, org: Organization):
"""Download all WACZs in collection as streaming nested WACZ"""
coll = await self.get_collection_out(coll_id, org, resources=True)
metadata = {
"type": "collection",
"id": str(coll_id),
"title": coll.name,
"organization": org.slug,
}
if coll.description:
metadata["description"] = coll.description
resp = await self.storage_ops.download_streaming_wacz(metadata, coll.resources)
headers = {"Content-Disposition": f'attachment; filename="{coll.name}.wacz"'}
return StreamingResponse(
resp, headers=headers, media_type="application/wacz+zip"
)
async def recalculate_org_collection_counts_tags(self, org: Organization):
"""Recalculate counts and tags for collections in org"""
collections, _ = await self.list_collections(
org,
page_size=100_000,
)
for coll in collections:
await self.update_collection_counts_and_tags(coll.id)
async def update_collection_counts_and_tags(self, collection_id: UUID):
"""Set current crawl info in config when crawl begins"""
crawl_count = 0
page_count = 0
total_size = 0
tags = []
coll = await self.get_collection(collection_id)
org = await self.orgs.get_org_by_id(coll.oid)
async for crawl_raw in self.crawls.find({"collectionIds": collection_id}):
crawl = BaseCrawl.from_dict(crawl_raw)
if crawl.state not in SUCCESSFUL_STATES:
continue
crawl_count += 1
files = crawl.files or []
for file in files:
total_size += file.size
try:
_, crawl_pages = await self.page_ops.list_pages(
crawl.id, org, page_size=1_000_000
)
page_count += crawl_pages
# pylint: disable=broad-exception-caught
except Exception:
pass
if crawl.tags:
tags.extend(crawl.tags)
sorted_tags = [tag for tag, count in Counter(tags).most_common()]
await self.collections.find_one_and_update(
{"_id": collection_id},
{
"$set": {
"crawlCount": crawl_count,
"pageCount": page_count,
"totalSize": total_size,
"tags": sorted_tags,
}
},
)
async def recalculate_org_collection_dates(self, org: Organization):
"""Recalculate earliest and latest dates for collections in org"""
collections, _ = await self.list_collections(
org,
page_size=100_000,
)
for coll in collections:
await self.update_collection_dates(coll.id)
async def update_collection_dates(self, coll_id: UUID):
"""Update collection earliest and latest dates from page timestamps"""
coll = await self.get_collection(coll_id)
crawl_ids = await self.get_collection_crawl_ids(coll_id)
earliest_ts = None
latest_ts = None
match_query = {
"oid": coll.oid,
"crawl_id": {"$in": crawl_ids},
"ts": {"$ne": None},
}
cursor = self.pages.find(match_query).sort("ts", 1).limit(1)
pages = await cursor.to_list(length=1)
try:
earliest_page = pages[0]
earliest_ts = earliest_page.get("ts")
except IndexError:
pass
cursor = self.pages.find(match_query).sort("ts", -1).limit(1)
pages = await cursor.to_list(length=1)
try:
latest_page = pages[0]
latest_ts = latest_page.get("ts")
except IndexError:
pass
await self.collections.find_one_and_update(
{"_id": coll_id},
{
"$set": {
"dateEarliest": earliest_ts,
"dateLatest": latest_ts,
}
},
)
async def update_crawl_collections(self, crawl_id: str):
"""Update counts and tags for all collections in crawl"""
crawl = await self.crawls.find_one({"_id": crawl_id})
crawl_coll_ids = crawl.get("collectionIds")
for collection_id in crawl_coll_ids:
await self.update_collection_counts_and_tags(collection_id)
async def add_successful_crawl_to_collections(self, crawl_id: str, cid: UUID):
"""Add successful crawl to its auto-add collections."""
workflow = await self.crawl_configs.find_one({"_id": cid})
auto_add_collections = workflow.get("autoAddCollections")
if auto_add_collections:
await self.crawls.find_one_and_update(
{"_id": crawl_id},
{"$set": {"collectionIds": auto_add_collections}},
)
await self.update_crawl_collections(crawl_id)
async def get_org_public_collections(
self,
org_slug: str,
page_size: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sort_by: Optional[str] = None,
sort_direction: int = 1,
):
"""List public collections for org"""
try:
org = await self.orgs.get_org_by_slug(org_slug)
# pylint: disable=broad-exception-caught
except Exception:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=404, detail="public_profile_not_found")
if not org.enablePublicProfile:
raise HTTPException(status_code=404, detail="public_profile_not_found")
collections, _ = await self.list_collections(
org,
page_size=page_size,
page=page,
sort_by=sort_by,
sort_direction=sort_direction,
public_colls_out=True,
)
public_org_details = PublicOrgDetails(
name=org.name,
description=org.publicDescription or "",
url=org.publicUrl or "",
)
return OrgPublicCollections(org=public_org_details, collections=collections)
async def list_urls_in_collection(
self,
coll_id: UUID,
oid: UUID,
url_prefix: Optional[str] = None,
page_size: int = DEFAULT_PAGE_SIZE,
page: int = 1,
) -> Tuple[List[PageUrlCount], int]:
"""List all URLs in collection sorted desc by snapshot count"""
# pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
# Zero-index page for query
page = page - 1
skip = page_size * page
crawl_ids = await self.get_collection_crawl_ids(coll_id)
match_query: dict[str, object] = {"oid": oid, "crawl_id": {"$in": crawl_ids}}
if url_prefix:
url_prefix = urllib.parse.unquote(url_prefix)
regex_pattern = f"^{re.escape(url_prefix)}"
match_query["url"] = {"$regex": regex_pattern, "$options": "i"}
aggregate = [{"$match": match_query}]
aggregate.extend(
[
{
"$group": {
"_id": "$url",
"pages": {"$push": "$$ROOT"},
"count": {"$sum": 1},
},
},
{"$sort": {"count": -1}},
{"$set": {"url": "$_id"}},
{
"$facet": {
"items": [
{"$skip": skip},
{"$limit": page_size},
],
"total": [{"$count": "count"}],
}
},
]
)
# Get total
cursor = self.pages.aggregate(aggregate)
results = await cursor.to_list(length=1)
result = results[0]
items = result["items"]
try:
total = int(result["total"][0]["count"])
except (IndexError, ValueError):
total = 0
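# Illustrative note (not from the original source): each PageUrlCount returned
# below serializes roughly as
#   {"url": "https://example.com/", "count": 3,
#    "snapshots": [{"pageId": "<uuid>", "ts": "<timestamp>", "status": 200}, ...]}
# with entries ordered by descending snapshot count from the $sort stage above.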
return [
PageUrlCount(
url=data.get("url", ""),
count=data.get("count", 0),
snapshots=[
PageIdTimestamp(
pageId=p["_id"], ts=p.get("ts"), status=p.get("status", 200)
)
for p in data.get("pages", [])
],
)
for data in items
], total
async def set_home_url(
self, coll_id: UUID, update: UpdateCollHomeUrl, org: Organization
) -> Dict[str, bool]:
"""Set home URL for collection and save thumbnail to database"""
if update.pageId:
page = await self.page_ops.get_page(update.pageId, org.id)
update_query = {
"homeUrl": page.url,
"homeUrlTs": page.ts,
"homeUrlPageId": page.id,
}
else:
update_query = {
"homeUrl": None,
"homeUrlTs": None,
"homeUrlPageId": None,
}
await self.collections.find_one_and_update(
{"_id": coll_id, "oid": org.id},
{"$set": update_query},
)
return {"updated": True}
async def upload_thumbnail_stream(
self, stream, filename: str, coll_id: UUID, org: Organization, user: User
) -> Dict[str, bool]:
"""Upload file as stream to use as collection thumbnail"""
coll = await self.get_collection(coll_id)
_, extension = os.path.splitext(filename)
image_filename = f"thumbnail-{str(coll_id)}{extension}"
prefix = org.storage.get_storage_extra_path(str(org.id)) + "images/"
file_prep = ImageFilePreparer(
prefix,
image_filename,
original_filename=filename,
user=user,
created=dt_now(),
)
async def stream_iter():
"""iterate over each chunk and compute and digest + total size"""
async for chunk in stream:
file_prep.add_chunk(chunk)
yield chunk
print("Collection thumbnail stream upload starting", flush=True)
if not await self.storage_ops.do_upload_multipart(
org,
file_prep.upload_name,
stream_iter(),
MIN_UPLOAD_PART_SIZE,
):
print("Collection thumbnail stream upload failed", flush=True)
raise HTTPException(status_code=400, detail="upload_failed")
print("Collection thumbnail stream upload complete", flush=True)
thumbnail_file = file_prep.get_image_file(org.storage)
if thumbnail_file.size > THUMBNAIL_MAX_SIZE:
print(
"Collection thumbnail stream upload failed: max size (2 MB) exceeded",
flush=True,
)
await self.storage_ops.delete_file_object(org, thumbnail_file)
raise HTTPException(status_code=400, detail="upload_failed")
if coll.thumbnail:
if not await self.storage_ops.delete_file_object(org, coll.thumbnail):
print(
f"Unable to delete previous collection thumbnail: {coll.thumbnail.filename}"
)
coll.thumbnail = thumbnail_file
# Update entire document to avoid bson.errors.InvalidDocument exception
await self.collections.find_one_and_update(
{"_id": coll_id, "oid": org.id},
{"$set": coll.to_dict()},
)
return {"added": True}
async def delete_thumbnail(self, coll_id: UUID, org: Organization):
"""Delete collection thumbnail"""
coll = await self.get_collection(coll_id)
if not coll.thumbnail:
raise HTTPException(status_code=404, detail="thumbnail_not_found")
if not await self.storage_ops.delete_file_object(org, coll.thumbnail):
print(f"Unable to delete collection thumbnail: {coll.thumbnail.filename}")
raise HTTPException(status_code=400, detail="file_deletion_error")
# Delete from database
await self.collections.find_one_and_update(
{"_id": coll_id, "oid": org.id},
{"$set": {"thumbnail": None}},
)
return {"deleted": True}
# ============================================================================
# pylint: disable=too-many-locals
def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_dep):
"""init collections api"""
# pylint: disable=invalid-name, unused-argument, too-many-arguments
colls = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops)
org_crawl_dep = orgs.org_crawl_dep
org_viewer_dep = orgs.org_viewer_dep
org_public = orgs.org_public
@app.post(
"/orgs/{oid}/collections",
tags=["collections"],
response_model=AddedResponseIdName,
)
async def add_collection(
new_coll: CollIn, org: Organization = Depends(org_crawl_dep)
):
return await colls.add_collection(org.id, new_coll)
@app.get(
"/orgs/{oid}/collections",
tags=["collections"],
response_model=PaginatedCollOutResponse,
)
async def list_collection_all(
org: Organization = Depends(org_viewer_dep),
pageSize: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sortBy: Optional[str] = None,
sortDirection: int = 1,
name: Optional[str] = None,
namePrefix: Optional[str] = None,
access: Optional[str] = None,
):
collections, total = await colls.list_collections(
org,
page_size=pageSize,
page=page,
sort_by=sortBy,
sort_direction=sortDirection,
name=name,
name_prefix=namePrefix,
access=access,
)
return paginated_format(collections, total, page, pageSize)
@app.get(
"/orgs/{oid}/collections/$all",
tags=["collections"],
response_model=Dict[str, List[CrawlFileOut]],
)
async def get_collection_all(org: Organization = Depends(org_viewer_dep)):
results = {}
try:
all_collections, _ = await colls.list_collections(org, page_size=10_000)
for collection in all_collections:
results[collection.name] = await colls.get_collection_crawl_resources(
collection.id
)
except Exception as exc:
# pylint: disable=raise-missing-from
raise HTTPException(
status_code=400, detail="Error Listing All Crawled Files: " + str(exc)
)
return results
@app.get(
"/orgs/{oid}/collections/search-values",
tags=["collections"],
response_model=CollectionSearchValuesResponse,
)
async def get_collection_search_values(
org: Organization = Depends(org_viewer_dep),
):
return await colls.get_collection_search_values(org)
@app.get(
"/orgs/{oid}/collections/{coll_id}",
tags=["collections"],
response_model=CollOut,
)
async def get_collection(
coll_id: UUID, org: Organization = Depends(org_viewer_dep)
):
return await colls.get_collection_out(coll_id, org)
@app.get(
"/orgs/{oid}/collections/{coll_id}/replay.json",
tags=["collections"],
response_model=CollOut,
)
async def get_collection_replay(
coll_id: UUID, org: Organization = Depends(org_viewer_dep)
):
return await colls.get_collection_out(coll_id, org, resources=True)
@app.get(
"/orgs/{oid}/collections/{coll_id}/public/replay.json",
tags=["collections"],
response_model=CollOut,
)
async def get_collection_public_replay(
response: Response,
coll_id: UUID,
org: Organization = Depends(org_public),
):
coll = await colls.get_collection_out(
coll_id, org, resources=True, public_or_unlisted_only=True
)
response.headers["Access-Control-Allow-Origin"] = "*"
response.headers["Access-Control-Allow-Headers"] = "*"
return coll
@app.options(
"/orgs/{oid}/collections/{coll_id}/public/replay.json",
tags=["collections"],
response_model=EmptyResponse,
)
async def get_replay_preflight(response: Response):
response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
response.headers["Access-Control-Allow-Origin"] = "*"
response.headers["Access-Control-Allow-Headers"] = "*"
return {}
@app.patch(
"/orgs/{oid}/collections/{coll_id}",
tags=["collections"],
response_model=UpdatedResponse,
)
async def update_collection(
coll_id: UUID,
update: UpdateColl,
org: Organization = Depends(org_crawl_dep),
):
return await colls.update_collection(coll_id, org, update)
@app.post(
"/orgs/{oid}/collections/{coll_id}/add",
tags=["collections"],
response_model=CollOut,
)
async def add_crawl_to_collection(
crawlList: AddRemoveCrawlList,
coll_id: UUID,
org: Organization = Depends(org_crawl_dep),
) -> CollOut:
return await colls.add_crawls_to_collection(coll_id, crawlList.crawlIds, org)
@app.post(
"/orgs/{oid}/collections/{coll_id}/remove",
tags=["collections"],
response_model=CollOut,
)
async def remove_crawl_from_collection(
crawlList: AddRemoveCrawlList,
coll_id: UUID,
org: Organization = Depends(org_crawl_dep),
) -> CollOut:
return await colls.remove_crawls_from_collection(
coll_id, crawlList.crawlIds, org
)
@app.delete(
"/orgs/{oid}/collections/{coll_id}",
tags=["collections"],
response_model=SuccessResponse,
)
async def delete_collection(
coll_id: UUID, org: Organization = Depends(org_crawl_dep)
):
return await colls.delete_collection(coll_id, org)
@app.get(
"/orgs/{oid}/collections/{coll_id}/download",
tags=["collections"],
response_model=bytes,
)
async def download_collection(
coll_id: UUID, org: Organization = Depends(org_viewer_dep)
):
return await colls.download_collection(coll_id, org)
@app.get(
"/public/orgs/{org_slug}/collections",
tags=["collections", "public"],
response_model=OrgPublicCollections,
)
async def get_org_public_collections(
org_slug: str,
pageSize: int = DEFAULT_PAGE_SIZE,
page: int = 1,
sortBy: Optional[str] = None,
sortDirection: int = 1,
):
return await colls.get_org_public_collections(
org_slug,
page_size=pageSize,
page=page,
sort_by=sortBy,
sort_direction=sortDirection,
)
@app.get(
"/public/orgs/{org_slug}/collections/{coll_id}",
tags=["collections", "public"],
response_model=PublicCollOut,
)
async def get_public_collection(
org_slug: str,
coll_id: UUID,
):
try:
org = await colls.orgs.get_org_by_slug(org_slug)
# pylint: disable=broad-exception-caught
except Exception:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=404, detail="collection_not_found")
return await colls.get_public_collection_out(coll_id, org, allow_unlisted=True)
@app.get(
"/public/orgs/{org_slug}/collections/{coll_id}/download",
tags=["collections", "public"],
response_model=bytes,
)
async def download_public_collection(
org_slug: str,
coll_id: UUID,
):
try:
org = await colls.orgs.get_org_by_slug(org_slug)
# pylint: disable=broad-exception-caught
except Exception:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=404, detail="collection_not_found")
# Make sure collection exists and is public/unlisted
coll = await colls.get_collection(coll_id, public_or_unlisted_only=True)
if coll.allowPublicDownload is False:
raise HTTPException(status_code=403, detail="not_allowed")
return await colls.download_collection(coll_id, org)
@app.get(
"/orgs/{oid}/collections/{coll_id}/urls",
tags=["collections"],
response_model=PaginatedPageUrlCountResponse,
)
async def get_collection_url_list(
coll_id: UUID,
oid: UUID,
urlPrefix: Optional[str] = None,
pageSize: int = DEFAULT_PAGE_SIZE,
page: int = 1,
):
"""Retrieve paginated list of urls in collection sorted by snapshot count"""
pages, total = await colls.list_urls_in_collection(
coll_id=coll_id,
oid=oid,
url_prefix=urlPrefix,
page_size=pageSize,
page=page,
)
return paginated_format(pages, total, page, pageSize)
@app.post(
"/orgs/{oid}/collections/{coll_id}/home-url",
tags=["collections"],
response_model=UpdatedResponse,
)
async def set_collection_home_url(
update: UpdateCollHomeUrl,
coll_id: UUID,
org: Organization = Depends(org_crawl_dep),
):
return await colls.set_home_url(coll_id, update, org)
@app.put(
"/orgs/{oid}/collections/{coll_id}/thumbnail",
tags=["collections"],
response_model=AddedResponse,
)
async def upload_thumbnail_stream(
request: Request,
filename: str,
coll_id: UUID,
org: Organization = Depends(org_crawl_dep),
user: User = Depends(user_dep),
):
return await colls.upload_thumbnail_stream(
request.stream(), filename, coll_id, org, user
)
@app.delete(
"/orgs/{oid}/collections/{coll_id}/thumbnail",
tags=["collections"],
response_model=DeletedResponse,
)
async def delete_thumbnail_stream(
coll_id: UUID,
org: Organization = Depends(org_crawl_dep),
):
return await colls.delete_thumbnail(coll_id, org)
return colls
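# Hedged wiring sketch (how the app module calls this is an assumption, not
# shown in this file):
#
#   coll_ops = init_collections_api(
#       app, mdb, org_ops, storage_ops, event_webhook_ops, user_dep
#   )
#   coll_ops.set_crawl_ops(crawl_ops)
#   coll_ops.set_page_ops(page_ops)
#
# set_crawl_ops() and set_page_ops() must run after construction, since
# CollectionOps is created with crawl_ops set to None and no page_ops.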