""" Collections API """ # pylint: disable=too-many-lines from datetime import datetime from collections import Counter from uuid import UUID, uuid4 from typing import Optional, List, TYPE_CHECKING, cast, Dict, Any, Union import os import asyncio import pymongo from pymongo.collation import Collation from fastapi import Depends, HTTPException, Response from fastapi.responses import StreamingResponse from starlette.requests import Request from .pagination import DEFAULT_PAGE_SIZE, paginated_format from .models import ( AnyHttpUrl, Collection, CollIn, CollOut, CollIdName, CollectionThumbnailSource, UpdateColl, AddRemoveCrawlList, BaseCrawl, CrawlOutWithResources, CrawlFileOut, Organization, PaginatedCollOutResponse, SUCCESSFUL_STATES, AddedResponseIdName, EmptyResponse, UpdatedResponse, SuccessResponse, AddedResponse, DeletedResponse, CollectionSearchValuesResponse, OrgPublicCollections, PublicOrgDetails, CollAccessType, UpdateCollHomeUrl, User, ImageFile, ImageFilePreparer, MIN_UPLOAD_PART_SIZE, PublicCollOut, ) from .utils import dt_now, slug_from_name, get_duplicate_key_error_field, get_origin if TYPE_CHECKING: from .orgs import OrgOps from .storages import StorageOps from .webhooks import EventWebhookOps from .crawls import CrawlOps from .pages import PageOps else: OrgOps = StorageOps = EventWebhookOps = CrawlOps = PageOps = object THUMBNAIL_MAX_SIZE = 2_000_000 # ============================================================================ class CollectionOps: """ops for working with named collections of crawls""" # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods orgs: OrgOps storage_ops: StorageOps event_webhook_ops: EventWebhookOps crawl_ops: CrawlOps page_ops: PageOps def __init__(self, mdb, storage_ops, orgs, event_webhook_ops): self.collections = mdb["collections"] self.crawls = mdb["crawls"] self.crawl_configs = mdb["crawl_configs"] self.pages = mdb["pages"] self.crawl_ops = cast(CrawlOps, None) self.orgs = orgs self.storage_ops = storage_ops self.event_webhook_ops = event_webhook_ops def set_crawl_ops(self, ops): """set crawl ops""" self.crawl_ops = ops def set_page_ops(self, ops): """set page ops""" # pylint: disable=attribute-defined-outside-init self.page_ops = ops async def init_index(self): """init lookup index""" case_insensitive_collation = Collation(locale="en", strength=1) await self.collections.create_index( [("oid", pymongo.ASCENDING), ("name", pymongo.ASCENDING)], unique=True, collation=case_insensitive_collation, ) await self.collections.create_index( [("oid", pymongo.ASCENDING), ("slug", pymongo.ASCENDING)], unique=True, collation=case_insensitive_collation, ) await self.collections.create_index( [("oid", pymongo.ASCENDING), ("description", pymongo.ASCENDING)] ) async def add_collection(self, oid: UUID, coll_in: CollIn): """Add new collection""" crawl_ids = coll_in.crawlIds if coll_in.crawlIds else [] coll_id = uuid4() created = dt_now() slug = coll_in.slug or slug_from_name(coll_in.name) coll = Collection( id=coll_id, oid=oid, name=coll_in.name, slug=slug, description=coll_in.description, caption=coll_in.caption, created=created, modified=created, access=coll_in.access, defaultThumbnailName=coll_in.defaultThumbnailName, allowPublicDownload=coll_in.allowPublicDownload, ) try: await self.collections.insert_one(coll.to_dict()) org = await self.orgs.get_org_by_id(oid) await self.clear_org_previous_slugs_matching_slug(slug, org) if crawl_ids: await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org) await self.update_collection_counts_and_tags(coll_id) await self.update_collection_dates(coll_id) asyncio.create_task( self.event_webhook_ops.create_added_to_collection_notification( crawl_ids, coll_id, org ) ) return {"added": True, "id": coll_id, "name": coll.name} except pymongo.errors.DuplicateKeyError as err: # pylint: disable=raise-missing-from field = get_duplicate_key_error_field(err) raise HTTPException(status_code=400, detail=f"collection_{field}_taken") async def update_collection( self, coll_id: UUID, org: Organization, update: UpdateColl ): """Update collection""" query = update.dict(exclude_unset=True) if len(query) == 0: raise HTTPException(status_code=400, detail="no_update_data") name_update = query.get("name") slug_update = query.get("slug") previous_slug = None if name_update or slug_update: # If we're updating slug, save old one to previousSlugs to support redirects coll = await self.get_collection(coll_id) previous_slug = coll.slug if name_update and not slug_update: slug = slug_from_name(name_update) query["slug"] = slug slug_update = slug query["modified"] = dt_now() db_update = {"$set": query} if previous_slug: db_update["$push"] = {"previousSlugs": previous_slug} try: result = await self.collections.find_one_and_update( {"_id": coll_id, "oid": org.id}, db_update, return_document=pymongo.ReturnDocument.AFTER, ) except pymongo.errors.DuplicateKeyError as err: # pylint: disable=raise-missing-from field = get_duplicate_key_error_field(err) raise HTTPException(status_code=400, detail=f"collection_{field}_taken") if not result: raise HTTPException(status_code=404, detail="collection_not_found") if slug_update: await self.clear_org_previous_slugs_matching_slug(slug_update, org) return {"updated": True} async def clear_org_previous_slugs_matching_slug( self, slug: str, org: Organization ): """Clear new slug from previousSlugs array of other collections in same org""" await self.collections.update_many( {"oid": org.id, "previousSlugs": slug}, {"$pull": {"previousSlugs": slug}}, ) async def add_crawls_to_collection( self, coll_id: UUID, crawl_ids: List[str], org: Organization ) -> CollOut: """Add crawls to collection""" await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org) modified = dt_now() result = await self.collections.find_one_and_update( {"_id": coll_id}, {"$set": {"modified": modified}}, return_document=pymongo.ReturnDocument.AFTER, ) if not result: raise HTTPException(status_code=404, detail="collection_not_found") await self.update_collection_counts_and_tags(coll_id) await self.update_collection_dates(coll_id) asyncio.create_task( self.event_webhook_ops.create_added_to_collection_notification( crawl_ids, coll_id, org ) ) return await self.get_collection_out(coll_id, org) async def remove_crawls_from_collection( self, coll_id: UUID, crawl_ids: List[str], org: Organization ) -> CollOut: """Remove crawls from collection""" await self.crawl_ops.remove_from_collection(crawl_ids, coll_id) modified = dt_now() result = await self.collections.find_one_and_update( {"_id": coll_id}, {"$set": {"modified": modified}}, return_document=pymongo.ReturnDocument.AFTER, ) if not result: raise HTTPException(status_code=404, detail="collection_not_found") await self.update_collection_counts_and_tags(coll_id) await self.update_collection_dates(coll_id) asyncio.create_task( self.event_webhook_ops.create_removed_from_collection_notification( crawl_ids, coll_id, org ) ) return await self.get_collection_out(coll_id, org) async def get_collection_raw( self, coll_id: UUID, public_or_unlisted_only: bool = False ) -> Dict[str, Any]: """Get collection by id as dict from database""" query: dict[str, object] = {"_id": coll_id} if public_or_unlisted_only: query["access"] = {"$in": ["public", "unlisted"]} result = await self.collections.find_one(query) if not result: raise HTTPException(status_code=404, detail="collection_not_found") return result async def get_collection_raw_by_slug( self, coll_slug: str, previous_slugs: bool = False, public_or_unlisted_only: bool = False, ) -> Dict[str, Any]: """Get collection by slug (current or previous) as dict from database""" query: dict[str, object] = {} if previous_slugs: query["previousSlugs"] = coll_slug else: query["slug"] = coll_slug if public_or_unlisted_only: query["access"] = {"$in": ["public", "unlisted"]} result = await self.collections.find_one(query) if not result: raise HTTPException(status_code=404, detail="collection_not_found") return result async def get_collection( self, coll_id: UUID, public_or_unlisted_only: bool = False ) -> Collection: """Get collection by id""" result = await self.get_collection_raw(coll_id, public_or_unlisted_only) return Collection.from_dict(result) async def get_collection_by_slug( self, coll_slug: str, public_or_unlisted_only: bool = False ) -> Collection: """Get collection by slug""" try: result = await self.get_collection_raw_by_slug( coll_slug, public_or_unlisted_only=public_or_unlisted_only ) return Collection.from_dict(result) # pylint: disable=broad-exception-caught except Exception: pass result = await self.get_collection_raw_by_slug( coll_slug, previous_slugs=True, public_or_unlisted_only=public_or_unlisted_only, ) return Collection.from_dict(result) async def get_collection_out( self, coll_id: UUID, org: Organization, resources=False, public_or_unlisted_only=False, headers: Optional[dict] = None, ) -> CollOut: """Get CollOut by id""" # pylint: disable=too-many-locals result = await self.get_collection_raw(coll_id, public_or_unlisted_only) if resources: result["resources"], crawl_ids, pages_optimized = ( await self.get_collection_crawl_resources(coll_id) ) initial_pages, _ = await self.page_ops.list_pages( crawl_ids=crawl_ids, page_size=25, ) public = "public/" if public_or_unlisted_only else "" if pages_optimized: result["initialPages"] = initial_pages result["pagesQueryUrl"] = ( get_origin(headers) + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages" ) thumbnail = result.get("thumbnail") if thumbnail: image_file = ImageFile(**thumbnail) result["thumbnail"] = await image_file.get_image_file_out( org, self.storage_ops ) return CollOut.from_dict(result) async def get_public_collection_out( self, coll_id: UUID, org: Organization, allow_unlisted: bool = False, ) -> PublicCollOut: """Get PublicCollOut by id""" result = await self.get_collection_raw(coll_id) allowed_access = [CollAccessType.PUBLIC] if allow_unlisted: allowed_access.append(CollAccessType.UNLISTED) if result.get("access") not in allowed_access: raise HTTPException(status_code=404, detail="collection_not_found") result["resources"], _, _ = await self.get_collection_crawl_resources(coll_id) thumbnail = result.get("thumbnail") if thumbnail: image_file = ImageFile(**thumbnail) result["thumbnail"] = await image_file.get_public_image_file_out( org, self.storage_ops ) return PublicCollOut.from_dict(result) async def list_collections( self, org: Organization, public_colls_out: bool = False, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, sort_by: Optional[str] = None, sort_direction: int = 1, name: Optional[str] = None, name_prefix: Optional[str] = None, access: Optional[str] = None, ): """List all collections for org""" # pylint: disable=too-many-locals, duplicate-code, too-many-branches # Zero-index page for query page = page - 1 skip = page * page_size match_query: Dict[str, Union[str, UUID, int, object]] = {"oid": org.id} if name: match_query["name"] = name elif name_prefix: regex_pattern = f"^{name_prefix}" match_query["name"] = {"$regex": regex_pattern, "$options": "i"} if public_colls_out: match_query["access"] = CollAccessType.PUBLIC elif access: match_query["access"] = access aggregate: List[Dict[str, Union[str, UUID, int, object]]] = [ {"$match": match_query} ] if sort_by: if sort_by not in ( "created", "modified", "dateLatest", "name", "crawlCount", "pageCount", "totalSize", "description", "caption", ): raise HTTPException(status_code=400, detail="invalid_sort_by") if sort_direction not in (1, -1): raise HTTPException(status_code=400, detail="invalid_sort_direction") sort_query = {sort_by: sort_direction} # add secondary sort keys: if sort_by == "dateLatest": sort_query["dateEarliest"] = sort_direction aggregate.extend([{"$sort": sort_query}]) aggregate.extend( [ { "$facet": { "items": [ {"$skip": skip}, {"$limit": page_size}, ], "total": [{"$count": "count"}], } }, ] ) cursor = self.collections.aggregate( aggregate, collation=pymongo.collation.Collation(locale="en") ) results = await cursor.to_list(length=1) result = results[0] items = result["items"] try: total = int(result["total"][0]["count"]) except (IndexError, ValueError): total = 0 collections: List[Union[CollOut, PublicCollOut]] = [] for res in items: thumbnail = res.get("thumbnail") if thumbnail: image_file = ImageFile(**thumbnail) if public_colls_out: res["thumbnail"] = await image_file.get_public_image_file_out( org, self.storage_ops ) else: res["thumbnail"] = await image_file.get_image_file_out( org, self.storage_ops ) if public_colls_out: collections.append(PublicCollOut.from_dict(res)) else: collections.append(CollOut.from_dict(res)) return collections, total async def get_collection_crawl_resources( self, coll_id: UUID ) -> tuple[List[CrawlFileOut], List[str], bool]: """Return pre-signed resources for all collection crawl files.""" # Ensure collection exists _ = await self.get_collection_raw(coll_id) resources = [] pages_optimized = True crawls, _ = await self.crawl_ops.list_all_base_crawls( collection_id=coll_id, states=list(SUCCESSFUL_STATES), page_size=10_000, cls_type=CrawlOutWithResources, ) crawl_ids = [] for crawl in crawls: crawl_ids.append(crawl.id) if crawl.resources: resources.extend(crawl.resources) if crawl.version != 2: pages_optimized = False return resources, crawl_ids, pages_optimized async def get_collection_names(self, uuids: List[UUID]): """return object of {_id, names} given list of collection ids""" cursor = self.collections.find( {"_id": {"$in": uuids}}, projection=["_id", "name"] ) names = await cursor.to_list(length=1000) names = [ CollIdName(id=namedata["_id"], name=namedata["name"]) for namedata in names ] return names async def get_collection_search_values(self, org: Organization): """Return list of collection names""" names = await self.collections.distinct("name", {"oid": org.id}) # Remove empty strings names = [name for name in names if name] return {"names": names} async def get_collection_crawl_ids( self, coll_id: UUID, public_or_unlisted_only=False ) -> List[str]: """Return list of crawl ids in collection, including only public collections""" crawl_ids = [] # ensure collection is public or unlisted, else throw here if public_or_unlisted_only: await self.get_collection_raw(coll_id, public_or_unlisted_only) async for crawl_raw in self.crawls.find( {"collectionIds": coll_id}, projection=["_id"] ): crawl_id = crawl_raw.get("_id") if crawl_id: crawl_ids.append(crawl_id) return crawl_ids async def delete_collection(self, coll_id: UUID, org: Organization): """Delete collection and remove from associated crawls.""" await self.crawl_ops.remove_collection_from_all_crawls(coll_id) result = await self.collections.delete_one({"_id": coll_id, "oid": org.id}) if result.deleted_count < 1: raise HTTPException(status_code=404, detail="collection_not_found") asyncio.create_task( self.event_webhook_ops.create_collection_deleted_notification(coll_id, org) ) return {"success": True} async def download_collection(self, coll_id: UUID, org: Organization): """Download all WACZs in collection as streaming nested WACZ""" coll = await self.get_collection_out(coll_id, org, resources=True) metadata = { "type": "collection", "id": str(coll_id), "title": coll.name, "organization": org.slug, } if coll.description: metadata["description"] = coll.description resp = await self.storage_ops.download_streaming_wacz(metadata, coll.resources) headers = {"Content-Disposition": f'attachment; filename="{coll.name}.wacz"'} return StreamingResponse( resp, headers=headers, media_type="application/wacz+zip" ) async def recalculate_org_collection_stats(self, org: Organization): """recalculate counts, tags and dates for all collections in an org""" async for coll in self.collections.find({"oid": org.id}, projection={"_id": 1}): await self.update_collection_counts_and_tags(coll.get("_id")) await self.update_collection_dates(coll.get("_id")) async def update_collection_counts_and_tags(self, collection_id: UUID): """Set current crawl info in config when crawl begins""" # pylint: disable=too-many-locals crawl_count = 0 page_count = 0 total_size = 0 tags = [] crawl_ids = [] preload_resources = [] async for crawl_raw in self.crawls.find({"collectionIds": collection_id}): crawl = BaseCrawl.from_dict(crawl_raw) if crawl.state not in SUCCESSFUL_STATES: continue crawl_count += 1 files = crawl.files or [] for file in files: total_size += file.size try: crawl_page_count = await self.pages.count_documents( {"crawl_id": crawl.id} ) if crawl_page_count == 0: for file in files: preload_resources.append( { "name": os.path.basename(file.filename), "crawlId": crawl.id, } ) else: page_count += crawl_page_count # pylint: disable=broad-exception-caught except Exception: pass if crawl.tags: tags.extend(crawl.tags) crawl_ids.append(crawl.id) sorted_tags = [tag for tag, count in Counter(tags).most_common()] unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids) await self.collections.find_one_and_update( {"_id": collection_id}, { "$set": { "crawlCount": crawl_count, "pageCount": page_count, "uniquePageCount": unique_page_count, "totalSize": total_size, "tags": sorted_tags, "preloadResources": preload_resources, } }, ) async def update_collection_dates(self, coll_id: UUID): """Update collection earliest and latest dates from page timestamps""" # pylint: disable=too-many-locals coll = await self.get_collection(coll_id) crawl_ids = await self.get_collection_crawl_ids(coll_id) earliest_ts = None latest_ts = None match_query = { "oid": coll.oid, "crawl_id": {"$in": crawl_ids}, "ts": {"$ne": None}, } cursor = self.pages.find(match_query).sort("ts", 1).limit(1) pages = await cursor.to_list(length=1) try: earliest_page = pages[0] earliest_ts = earliest_page.get("ts") except IndexError: pass cursor = self.pages.find(match_query).sort("ts", -1).limit(1) pages = await cursor.to_list(length=1) try: latest_page = pages[0] latest_ts = latest_page.get("ts") except IndexError: pass await self.collections.find_one_and_update( {"_id": coll_id}, { "$set": { "dateEarliest": earliest_ts, "dateLatest": latest_ts, } }, ) async def update_crawl_collections(self, crawl_id: str): """Update counts, dates, and modified for all collections in crawl""" crawl = await self.crawls.find_one({"_id": crawl_id}) crawl_coll_ids = crawl.get("collectionIds") modified = dt_now() for coll_id in crawl_coll_ids: await self.update_collection_counts_and_tags(coll_id) await self.update_collection_dates(coll_id) await self.collections.find_one_and_update( {"_id": coll_id}, {"$set": {"modified": modified}}, return_document=pymongo.ReturnDocument.AFTER, ) async def add_successful_crawl_to_collections(self, crawl_id: str, cid: UUID): """Add successful crawl to its auto-add collections.""" workflow = await self.crawl_configs.find_one({"_id": cid}) auto_add_collections = workflow.get("autoAddCollections") if auto_add_collections: await self.crawls.find_one_and_update( {"_id": crawl_id}, {"$set": {"collectionIds": auto_add_collections}}, ) await self.update_crawl_collections(crawl_id) async def get_org_public_collections( self, org_slug: str, page_size: int = DEFAULT_PAGE_SIZE, page: int = 1, sort_by: Optional[str] = None, sort_direction: int = 1, ): """List public collections for org""" try: org = await self.orgs.get_org_by_slug(org_slug) # pylint: disable=broad-exception-caught except Exception: # pylint: disable=raise-missing-from raise HTTPException(status_code=404, detail="public_profile_not_found") if not org.enablePublicProfile: raise HTTPException(status_code=404, detail="public_profile_not_found") collections, _ = await self.list_collections( org, page_size=page_size, page=page, sort_by=sort_by, sort_direction=sort_direction, public_colls_out=True, ) public_org_details = PublicOrgDetails( name=org.name, description=org.publicDescription or "", url=org.publicUrl or "", ) return OrgPublicCollections(org=public_org_details, collections=collections) async def set_home_url( self, coll_id: UUID, update: UpdateCollHomeUrl, org: Organization ) -> Dict[str, bool]: """Set home URL for collection and save thumbnail to database""" if update.pageId: page = await self.page_ops.get_page(update.pageId, org.id) update_query = { "homeUrl": page.url, "homeUrlTs": page.ts, "homeUrlPageId": page.id, } else: update_query = { "homeUrl": None, "homeUrlTs": None, "homeUrlPageId": None, } await self.collections.find_one_and_update( {"_id": coll_id, "oid": org.id}, {"$set": update_query}, ) return {"updated": True} # pylint: disable=too-many-locals async def upload_thumbnail_stream( self, stream, filename: str, coll_id: UUID, org: Organization, user: User, source_url: Optional[AnyHttpUrl] = None, source_ts: Optional[datetime] = None, source_page_id: Optional[UUID] = None, ) -> Dict[str, bool]: """Upload file as stream to use as collection thumbnail""" coll = await self.get_collection(coll_id) _, extension = os.path.splitext(filename) image_filename = f"thumbnail-{str(coll_id)}{extension}" prefix = org.storage.get_storage_extra_path(str(org.id)) + "images/" file_prep = ImageFilePreparer( prefix, image_filename, original_filename=filename, user=user, created=dt_now(), ) async def stream_iter(): """iterate over each chunk and compute and digest + total size""" async for chunk in stream: file_prep.add_chunk(chunk) yield chunk print("Collection thumbnail stream upload starting", flush=True) if not await self.storage_ops.do_upload_multipart( org, file_prep.upload_name, stream_iter(), MIN_UPLOAD_PART_SIZE, ): print("Collection thumbnail stream upload failed", flush=True) raise HTTPException(status_code=400, detail="upload_failed") print("Collection thumbnail stream upload complete", flush=True) thumbnail_file = file_prep.get_image_file(org.storage) if thumbnail_file.size > THUMBNAIL_MAX_SIZE: print( "Collection thumbnail stream upload failed: max size (2 MB) exceeded", flush=True, ) await self.storage_ops.delete_file_object(org, thumbnail_file) raise HTTPException( status_code=400, detail="max_thumbnail_size_2_mb_exceeded", ) if coll.thumbnail: if not await self.storage_ops.delete_file_object(org, coll.thumbnail): print( f"Unable to delete previous collection thumbnail: {coll.thumbnail.filename}" ) coll.thumbnail = thumbnail_file if source_url and source_ts and source_page_id: coll.thumbnailSource = CollectionThumbnailSource( url=source_url, urlTs=source_ts, urlPageId=source_page_id, ) # Update entire document to avoid bson.errors.InvalidDocument exception await self.collections.find_one_and_update( {"_id": coll_id, "oid": org.id}, {"$set": coll.to_dict()}, ) return {"added": True} async def delete_thumbnail(self, coll_id: UUID, org: Organization): """Delete collection thumbnail""" coll = await self.get_collection(coll_id) if not coll.thumbnail: raise HTTPException(status_code=404, detail="thumbnail_not_found") if not await self.storage_ops.delete_file_object(org, coll.thumbnail): print(f"Unable to delete collection thumbnail: {coll.thumbnail.filename}") raise HTTPException(status_code=400, detail="file_deletion_error") # Delete from database await self.collections.find_one_and_update( {"_id": coll_id, "oid": org.id}, {"$set": {"thumbnail": None}}, ) return {"deleted": True} # ============================================================================ # pylint: disable=too-many-locals def init_collections_api( app, mdb, orgs, storage_ops, event_webhook_ops, user_dep ) -> CollectionOps: """init collections api""" # pylint: disable=invalid-name, unused-argument, too-many-arguments colls: CollectionOps = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops) org_crawl_dep = orgs.org_crawl_dep org_viewer_dep = orgs.org_viewer_dep org_public = orgs.org_public @app.post( "/orgs/{oid}/collections", tags=["collections"], response_model=AddedResponseIdName, ) async def add_collection( new_coll: CollIn, org: Organization = Depends(org_crawl_dep) ): return await colls.add_collection(org.id, new_coll) @app.get( "/orgs/{oid}/collections", tags=["collections"], response_model=PaginatedCollOutResponse, ) async def list_collection_all( org: Organization = Depends(org_viewer_dep), pageSize: int = DEFAULT_PAGE_SIZE, page: int = 1, sortBy: Optional[str] = None, sortDirection: int = 1, name: Optional[str] = None, namePrefix: Optional[str] = None, access: Optional[str] = None, ): collections, total = await colls.list_collections( org, page_size=pageSize, page=page, sort_by=sortBy, sort_direction=sortDirection, name=name, name_prefix=namePrefix, access=access, ) return paginated_format(collections, total, page, pageSize) @app.get( "/orgs/{oid}/collections/$all", tags=["collections"], response_model=Dict[str, List[CrawlFileOut]], ) async def get_collection_all(org: Organization = Depends(org_viewer_dep)): results = {} try: all_collections, _ = await colls.list_collections(org, page_size=10_000) for collection in all_collections: results[collection.name], _, _ = ( await colls.get_collection_crawl_resources(collection.id) ) except Exception as exc: # pylint: disable=raise-missing-from raise HTTPException( status_code=400, detail="Error Listing All Crawled Files: " + str(exc) ) return results @app.get( "/orgs/{oid}/collections/search-values", tags=["collections"], response_model=CollectionSearchValuesResponse, ) async def get_collection_search_values( org: Organization = Depends(org_viewer_dep), ): return await colls.get_collection_search_values(org) @app.get( "/orgs/{oid}/collections/{coll_id}", tags=["collections"], response_model=CollOut, ) async def get_collection( coll_id: UUID, org: Organization = Depends(org_viewer_dep) ): return await colls.get_collection_out(coll_id, org) @app.get( "/orgs/{oid}/collections/{coll_id}/replay.json", tags=["collections"], response_model=CollOut, ) async def get_collection_replay( request: Request, coll_id: UUID, org: Organization = Depends(org_viewer_dep) ): return await colls.get_collection_out( coll_id, org, resources=True, headers=dict(request.headers) ) @app.get( "/orgs/{oid}/collections/{coll_id}/public/replay.json", tags=["collections"], response_model=CollOut, ) async def get_collection_public_replay( request: Request, response: Response, coll_id: UUID, org: Organization = Depends(org_public), ): coll = await colls.get_collection_out( coll_id, org, resources=True, public_or_unlisted_only=True, headers=dict(request.headers), ) response.headers["Access-Control-Allow-Origin"] = "*" response.headers["Access-Control-Allow-Headers"] = "*" return coll @app.options( "/orgs/{oid}/collections/{coll_id}/public/replay.json", tags=["collections"], response_model=EmptyResponse, ) async def get_replay_preflight(response: Response): response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS" response.headers["Access-Control-Allow-Origin"] = "*" response.headers["Access-Control-Allow-Headers"] = "*" return {} @app.patch( "/orgs/{oid}/collections/{coll_id}", tags=["collections"], response_model=UpdatedResponse, ) async def update_collection( coll_id: UUID, update: UpdateColl, org: Organization = Depends(org_crawl_dep), ): return await colls.update_collection(coll_id, org, update) @app.post( "/orgs/{oid}/collections/{coll_id}/add", tags=["collections"], response_model=CollOut, ) async def add_crawl_to_collection( crawlList: AddRemoveCrawlList, coll_id: UUID, org: Organization = Depends(org_crawl_dep), ) -> CollOut: return await colls.add_crawls_to_collection(coll_id, crawlList.crawlIds, org) @app.post( "/orgs/{oid}/collections/{coll_id}/remove", tags=["collections"], response_model=CollOut, ) async def remove_crawl_from_collection( crawlList: AddRemoveCrawlList, coll_id: UUID, org: Organization = Depends(org_crawl_dep), ) -> CollOut: return await colls.remove_crawls_from_collection( coll_id, crawlList.crawlIds, org ) @app.delete( "/orgs/{oid}/collections/{coll_id}", tags=["collections"], response_model=SuccessResponse, ) async def delete_collection( coll_id: UUID, org: Organization = Depends(org_crawl_dep) ): return await colls.delete_collection(coll_id, org) @app.get( "/orgs/{oid}/collections/{coll_id}/download", tags=["collections"], response_model=bytes, ) async def download_collection( coll_id: UUID, org: Organization = Depends(org_viewer_dep) ): return await colls.download_collection(coll_id, org) @app.get( "/public/orgs/{org_slug}/collections", tags=["collections", "public"], response_model=OrgPublicCollections, ) async def get_org_public_collections( org_slug: str, pageSize: int = DEFAULT_PAGE_SIZE, page: int = 1, sortBy: Optional[str] = None, sortDirection: int = 1, ): return await colls.get_org_public_collections( org_slug, page_size=pageSize, page=page, sort_by=sortBy, sort_direction=sortDirection, ) @app.get( "/public/orgs/{org_slug}/collections/{coll_slug}", tags=["collections", "public"], response_model=PublicCollOut, ) async def get_public_collection( org_slug: str, coll_slug: str, ): try: org = await colls.orgs.get_org_by_slug(org_slug) # pylint: disable=broad-exception-caught except Exception: # pylint: disable=raise-missing-from raise HTTPException(status_code=404, detail="collection_not_found") coll = await colls.get_collection_by_slug(coll_slug) return await colls.get_public_collection_out(coll.id, org, allow_unlisted=True) @app.get( "/public/orgs/{org_slug}/collections/{coll_slug}/download", tags=["collections", "public"], response_model=bytes, ) async def download_public_collection( org_slug: str, coll_slug: str, ): try: org = await colls.orgs.get_org_by_slug(org_slug) # pylint: disable=broad-exception-caught except Exception: # pylint: disable=raise-missing-from raise HTTPException(status_code=404, detail="collection_not_found") # Make sure collection exists and is public/unlisted coll = await colls.get_collection_by_slug( coll_slug, public_or_unlisted_only=True ) if coll.allowPublicDownload is False: raise HTTPException(status_code=403, detail="not_allowed") return await colls.download_collection(coll.id, org) @app.post( "/orgs/{oid}/collections/{coll_id}/home-url", tags=["collections"], response_model=UpdatedResponse, ) async def set_collection_home_url( update: UpdateCollHomeUrl, coll_id: UUID, org: Organization = Depends(org_crawl_dep), ): return await colls.set_home_url(coll_id, update, org) @app.put( "/orgs/{oid}/collections/{coll_id}/thumbnail", tags=["collections"], response_model=AddedResponse, ) async def upload_thumbnail_stream( request: Request, filename: str, coll_id: UUID, sourceUrl: Optional[AnyHttpUrl], sourceTs: Optional[datetime], sourcePageId: Optional[UUID], org: Organization = Depends(org_crawl_dep), user: User = Depends(user_dep), ): return await colls.upload_thumbnail_stream( request.stream(), filename, coll_id, org, user, sourceUrl, sourceTs, sourcePageId, ) @app.delete( "/orgs/{oid}/collections/{coll_id}/thumbnail", tags=["collections"], response_model=DeletedResponse, ) async def delete_thumbnail_stream( coll_id: UUID, org: Organization = Depends(org_crawl_dep), ): return await colls.delete_thumbnail(coll_id, org) return colls