browsertrix/backend/btrixcloud/colls.py
Ilya Kreymer 8a507f0473
Consolidate list page endpoints + better QA sorting + optimize pages fix (#2417)
- consolidate list_pages() and list_replay_query_pages() into a single list_pages()
- to keep backwards compatibility, add <crawl>/pagesSearch, which does not include page totals, and keep <crawl>/pages, which includes the total (slower)
- QA frontend: add a default 'Crawl Order' sort order so pages are shown in a more useful order in the QA view
- bgjob: account for parallelism in background jobs; log when the succeeded count does not match the parallelism
- Optimize pages job: also cover crawls that have no pages in the database but report pages in their done stats
- bgjobs: give custom op jobs more memory
2025-02-21 13:47:20 -08:00
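To illustrate the backwards-compatibility note above: a client that needs page totals keeps calling the existing pages endpoint, while callers that only need the page listing can use the new pagesSearch endpoint. A minimal sketch, assuming a deployment URL, token handling, and query parameters that are not taken from this file; only the pages/pagesSearch split comes from the description above.

    import requests

    api = "https://btrix.example.com/api"  # hypothetical deployment URL
    headers = {"Authorization": "Bearer <access-token>"}
    org_id = "<org-uuid>"
    crawl_id = "<crawl-id>"

    # existing endpoint: includes total page count (slower)
    with_totals = requests.get(
        f"{api}/orgs/{org_id}/crawls/{crawl_id}/pages",
        params={"pageSize": 25},
        headers=headers,
    ).json()

    # new endpoint: same listing without totals (faster)
    without_totals = requests.get(
        f"{api}/orgs/{org_id}/crawls/{crawl_id}/pagesSearch",
        params={"pageSize": 25},
        headers=headers,
    ).json()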


"""
Collections API
"""
# pylint: disable=too-many-lines
from datetime import datetime
from collections import Counter
from uuid import UUID, uuid4
from typing import Optional, List, TYPE_CHECKING, cast, Dict, Any, Union
import os
import asyncio
import pymongo
from pymongo.collation import Collation
from fastapi import Depends, HTTPException, Response
from fastapi.responses import StreamingResponse
from starlette.requests import Request
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
from .models import (
AnyHttpUrl,
Collection,
CollIn,
CollOut,
CollIdName,
CollectionThumbnailSource,
UpdateColl,
AddRemoveCrawlList,
BaseCrawl,
CrawlOutWithResources,
CrawlFileOut,
Organization,
PaginatedCollOutResponse,
SUCCESSFUL_STATES,
AddedResponseIdName,
EmptyResponse,
UpdatedResponse,
SuccessResponse,
AddedResponse,
DeletedResponse,
CollectionSearchValuesResponse,
OrgPublicCollections,
PublicOrgDetails,
CollAccessType,
UpdateCollHomeUrl,
User,
ImageFile,
ImageFilePreparer,
MIN_UPLOAD_PART_SIZE,
PublicCollOut,
)
from .utils import dt_now, slug_from_name, get_duplicate_key_error_field, get_origin
if TYPE_CHECKING:
from .orgs import OrgOps
from .storages import StorageOps
from .webhooks import EventWebhookOps
from .crawls import CrawlOps
from .pages import PageOps
else:
OrgOps = StorageOps = EventWebhookOps = CrawlOps = PageOps = object
THUMBNAIL_MAX_SIZE = 2_000_000
# ============================================================================
class CollectionOps:
    """ops for working with named collections of crawls"""

    # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods

    orgs: OrgOps
    storage_ops: StorageOps
    event_webhook_ops: EventWebhookOps
    crawl_ops: CrawlOps
    page_ops: PageOps

    def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
        self.collections = mdb["collections"]
        self.crawls = mdb["crawls"]
        self.crawl_configs = mdb["crawl_configs"]
        self.pages = mdb["pages"]
        self.crawl_ops = cast(CrawlOps, None)

        self.orgs = orgs
        self.storage_ops = storage_ops
        self.event_webhook_ops = event_webhook_ops

    def set_crawl_ops(self, ops):
        """set crawl ops"""
        self.crawl_ops = ops

    def set_page_ops(self, ops):
        """set page ops"""
        # pylint: disable=attribute-defined-outside-init
        self.page_ops = ops

    async def init_index(self):
        """init lookup index"""
        case_insensitive_collation = Collation(locale="en", strength=1)

        await self.collections.create_index(
            [("oid", pymongo.ASCENDING), ("name", pymongo.ASCENDING)],
            unique=True,
            collation=case_insensitive_collation,
        )
        await self.collections.create_index(
            [("oid", pymongo.ASCENDING), ("slug", pymongo.ASCENDING)],
            unique=True,
            collation=case_insensitive_collation,
        )

        await self.collections.create_index(
            [("oid", pymongo.ASCENDING), ("description", pymongo.ASCENDING)]
        )

    async def add_collection(self, oid: UUID, coll_in: CollIn):
        """Add new collection"""
        crawl_ids = coll_in.crawlIds if coll_in.crawlIds else []
        coll_id = uuid4()
        created = dt_now()

        slug = coll_in.slug or slug_from_name(coll_in.name)

        coll = Collection(
            id=coll_id,
            oid=oid,
            name=coll_in.name,
            slug=slug,
            description=coll_in.description,
            caption=coll_in.caption,
            created=created,
            modified=created,
            access=coll_in.access,
            defaultThumbnailName=coll_in.defaultThumbnailName,
            allowPublicDownload=coll_in.allowPublicDownload,
        )
        try:
            await self.collections.insert_one(coll.to_dict())
            org = await self.orgs.get_org_by_id(oid)
            await self.clear_org_previous_slugs_matching_slug(slug, org)

            if crawl_ids:
                await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)
                await self.update_collection_counts_and_tags(coll_id)
                await self.update_collection_dates(coll_id)
                asyncio.create_task(
                    self.event_webhook_ops.create_added_to_collection_notification(
                        crawl_ids, coll_id, org
                    )
                )

            return {"added": True, "id": coll_id, "name": coll.name}
        except pymongo.errors.DuplicateKeyError as err:
            # pylint: disable=raise-missing-from
            field = get_duplicate_key_error_field(err)
            raise HTTPException(status_code=400, detail=f"collection_{field}_taken")

    async def update_collection(
        self, coll_id: UUID, org: Organization, update: UpdateColl
    ):
        """Update collection"""
        query = update.dict(exclude_unset=True)

        if len(query) == 0:
            raise HTTPException(status_code=400, detail="no_update_data")

        name_update = query.get("name")
        slug_update = query.get("slug")

        previous_slug = None

        if name_update or slug_update:
            # If we're updating slug, save old one to previousSlugs to support redirects
            coll = await self.get_collection(coll_id)
            previous_slug = coll.slug

        if name_update and not slug_update:
            slug = slug_from_name(name_update)
            query["slug"] = slug
            slug_update = slug

        query["modified"] = dt_now()

        db_update = {"$set": query}
        if previous_slug:
            db_update["$push"] = {"previousSlugs": previous_slug}

        try:
            result = await self.collections.find_one_and_update(
                {"_id": coll_id, "oid": org.id},
                db_update,
                return_document=pymongo.ReturnDocument.AFTER,
            )
        except pymongo.errors.DuplicateKeyError as err:
            # pylint: disable=raise-missing-from
            field = get_duplicate_key_error_field(err)
            raise HTTPException(status_code=400, detail=f"collection_{field}_taken")

        if not result:
            raise HTTPException(status_code=404, detail="collection_not_found")

        if slug_update:
            await self.clear_org_previous_slugs_matching_slug(slug_update, org)

        return {"updated": True}

    async def clear_org_previous_slugs_matching_slug(
        self, slug: str, org: Organization
    ):
        """Clear new slug from previousSlugs array of other collections in same org"""
        await self.collections.update_many(
            {"oid": org.id, "previousSlugs": slug},
            {"$pull": {"previousSlugs": slug}},
        )

    async def add_crawls_to_collection(
        self, coll_id: UUID, crawl_ids: List[str], org: Organization
    ) -> CollOut:
        """Add crawls to collection"""
        await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)

        modified = dt_now()
        result = await self.collections.find_one_and_update(
            {"_id": coll_id},
            {"$set": {"modified": modified}},
            return_document=pymongo.ReturnDocument.AFTER,
        )
        if not result:
            raise HTTPException(status_code=404, detail="collection_not_found")

        await self.update_collection_counts_and_tags(coll_id)
        await self.update_collection_dates(coll_id)

        asyncio.create_task(
            self.event_webhook_ops.create_added_to_collection_notification(
                crawl_ids, coll_id, org
            )
        )

        return await self.get_collection_out(coll_id, org)

    async def remove_crawls_from_collection(
        self, coll_id: UUID, crawl_ids: List[str], org: Organization
    ) -> CollOut:
        """Remove crawls from collection"""
        await self.crawl_ops.remove_from_collection(crawl_ids, coll_id)

        modified = dt_now()
        result = await self.collections.find_one_and_update(
            {"_id": coll_id},
            {"$set": {"modified": modified}},
            return_document=pymongo.ReturnDocument.AFTER,
        )
        if not result:
            raise HTTPException(status_code=404, detail="collection_not_found")

        await self.update_collection_counts_and_tags(coll_id)
        await self.update_collection_dates(coll_id)

        asyncio.create_task(
            self.event_webhook_ops.create_removed_from_collection_notification(
                crawl_ids, coll_id, org
            )
        )

        return await self.get_collection_out(coll_id, org)

    async def get_collection_raw(
        self, coll_id: UUID, public_or_unlisted_only: bool = False
    ) -> Dict[str, Any]:
        """Get collection by id as dict from database"""
        query: dict[str, object] = {"_id": coll_id}
        if public_or_unlisted_only:
            query["access"] = {"$in": ["public", "unlisted"]}

        result = await self.collections.find_one(query)
        if not result:
            raise HTTPException(status_code=404, detail="collection_not_found")

        return result

    async def get_collection_raw_by_slug(
        self,
        coll_slug: str,
        previous_slugs: bool = False,
        public_or_unlisted_only: bool = False,
    ) -> Dict[str, Any]:
        """Get collection by slug (current or previous) as dict from database"""
        query: dict[str, object] = {}
        if previous_slugs:
            query["previousSlugs"] = coll_slug
        else:
            query["slug"] = coll_slug
        if public_or_unlisted_only:
            query["access"] = {"$in": ["public", "unlisted"]}

        result = await self.collections.find_one(query)
        if not result:
            raise HTTPException(status_code=404, detail="collection_not_found")

        return result

    async def get_collection(
        self, coll_id: UUID, public_or_unlisted_only: bool = False
    ) -> Collection:
        """Get collection by id"""
        result = await self.get_collection_raw(coll_id, public_or_unlisted_only)
        return Collection.from_dict(result)

    async def get_collection_by_slug(
        self, coll_slug: str, public_or_unlisted_only: bool = False
    ) -> Collection:
        """Get collection by slug"""
        try:
            result = await self.get_collection_raw_by_slug(
                coll_slug, public_or_unlisted_only=public_or_unlisted_only
            )
            return Collection.from_dict(result)
        # pylint: disable=broad-exception-caught
        except Exception:
            pass

        result = await self.get_collection_raw_by_slug(
            coll_slug,
            previous_slugs=True,
            public_or_unlisted_only=public_or_unlisted_only,
        )
        return Collection.from_dict(result)

    async def get_collection_out(
        self,
        coll_id: UUID,
        org: Organization,
        resources=False,
        public_or_unlisted_only=False,
        headers: Optional[dict] = None,
    ) -> CollOut:
        """Get CollOut by id"""
        # pylint: disable=too-many-locals
        result = await self.get_collection_raw(coll_id, public_or_unlisted_only)

        if resources:
            result["resources"], crawl_ids, pages_optimized = (
                await self.get_collection_crawl_resources(coll_id)
            )

            initial_pages, _ = await self.page_ops.list_pages(
                crawl_ids=crawl_ids,
                page_size=25,
            )

            public = "public/" if public_or_unlisted_only else ""

            if pages_optimized:
                result["initialPages"] = initial_pages
                result["pagesQueryUrl"] = (
                    get_origin(headers)
                    + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages"
                )

        thumbnail = result.get("thumbnail")
        if thumbnail:
            image_file = ImageFile(**thumbnail)
            result["thumbnail"] = await image_file.get_image_file_out(
                org, self.storage_ops
            )

        return CollOut.from_dict(result)

    async def get_public_collection_out(
        self,
        coll_id: UUID,
        org: Organization,
        allow_unlisted: bool = False,
    ) -> PublicCollOut:
        """Get PublicCollOut by id"""
        result = await self.get_collection_raw(coll_id)

        allowed_access = [CollAccessType.PUBLIC]
        if allow_unlisted:
            allowed_access.append(CollAccessType.UNLISTED)

        if result.get("access") not in allowed_access:
            raise HTTPException(status_code=404, detail="collection_not_found")

        result["resources"], _, _ = await self.get_collection_crawl_resources(coll_id)

        thumbnail = result.get("thumbnail")
        if thumbnail:
            image_file = ImageFile(**thumbnail)
            result["thumbnail"] = await image_file.get_public_image_file_out(
                org, self.storage_ops
            )

        return PublicCollOut.from_dict(result)

    async def list_collections(
        self,
        org: Organization,
        public_colls_out: bool = False,
        page_size: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        sort_by: Optional[str] = None,
        sort_direction: int = 1,
        name: Optional[str] = None,
        name_prefix: Optional[str] = None,
        access: Optional[str] = None,
    ):
        """List all collections for org"""
        # pylint: disable=too-many-locals, duplicate-code, too-many-branches
        # Zero-index page for query
        page = page - 1
        skip = page * page_size

        match_query: Dict[str, Union[str, UUID, int, object]] = {"oid": org.id}

        if name:
            match_query["name"] = name
        elif name_prefix:
            regex_pattern = f"^{name_prefix}"
            match_query["name"] = {"$regex": regex_pattern, "$options": "i"}

        if public_colls_out:
            match_query["access"] = CollAccessType.PUBLIC
        elif access:
            match_query["access"] = access

        aggregate: List[Dict[str, Union[str, UUID, int, object]]] = [
            {"$match": match_query}
        ]

        if sort_by:
            if sort_by not in (
                "created",
                "modified",
                "dateLatest",
                "name",
                "crawlCount",
                "pageCount",
                "totalSize",
                "description",
                "caption",
            ):
                raise HTTPException(status_code=400, detail="invalid_sort_by")
            if sort_direction not in (1, -1):
                raise HTTPException(status_code=400, detail="invalid_sort_direction")

            sort_query = {sort_by: sort_direction}

            # add secondary sort keys:
            if sort_by == "dateLatest":
                sort_query["dateEarliest"] = sort_direction

            aggregate.extend([{"$sort": sort_query}])

        aggregate.extend(
            [
                {
                    "$facet": {
                        "items": [
                            {"$skip": skip},
                            {"$limit": page_size},
                        ],
                        "total": [{"$count": "count"}],
                    }
                },
            ]
        )

        cursor = self.collections.aggregate(
            aggregate, collation=pymongo.collation.Collation(locale="en")
        )
        results = await cursor.to_list(length=1)
        result = results[0]
        items = result["items"]

        try:
            total = int(result["total"][0]["count"])
        except (IndexError, ValueError):
            total = 0

        collections: List[Union[CollOut, PublicCollOut]] = []

        for res in items:
            thumbnail = res.get("thumbnail")
            if thumbnail:
                image_file = ImageFile(**thumbnail)
                if public_colls_out:
                    res["thumbnail"] = await image_file.get_public_image_file_out(
                        org, self.storage_ops
                    )
                else:
                    res["thumbnail"] = await image_file.get_image_file_out(
                        org, self.storage_ops
                    )

            if public_colls_out:
                collections.append(PublicCollOut.from_dict(res))
            else:
                collections.append(CollOut.from_dict(res))

        return collections, total

    async def get_collection_crawl_resources(
        self, coll_id: UUID
    ) -> tuple[List[CrawlFileOut], List[str], bool]:
        """Return pre-signed resources for all collection crawl files."""
        # Ensure collection exists
        _ = await self.get_collection_raw(coll_id)

        resources = []
        pages_optimized = True

        crawls, _ = await self.crawl_ops.list_all_base_crawls(
            collection_id=coll_id,
            states=list(SUCCESSFUL_STATES),
            page_size=10_000,
            cls_type=CrawlOutWithResources,
        )

        crawl_ids = []

        for crawl in crawls:
            crawl_ids.append(crawl.id)
            if crawl.resources:
                resources.extend(crawl.resources)
            if crawl.version != 2:
                pages_optimized = False

        return resources, crawl_ids, pages_optimized

    async def get_collection_names(self, uuids: List[UUID]):
        """return object of {_id, names} given list of collection ids"""
        cursor = self.collections.find(
            {"_id": {"$in": uuids}}, projection=["_id", "name"]
        )
        names = await cursor.to_list(length=1000)
        names = [
            CollIdName(id=namedata["_id"], name=namedata["name"]) for namedata in names
        ]
        return names

    async def get_collection_search_values(self, org: Organization):
        """Return list of collection names"""
        names = await self.collections.distinct("name", {"oid": org.id})
        # Remove empty strings
        names = [name for name in names if name]
        return {"names": names}

    async def get_collection_crawl_ids(
        self, coll_id: UUID, public_or_unlisted_only=False
    ) -> List[str]:
        """Return list of crawl ids in collection, optionally only if the
        collection itself is public or unlisted"""
        crawl_ids = []
        # ensure collection is public or unlisted, else throw here
        if public_or_unlisted_only:
            await self.get_collection_raw(coll_id, public_or_unlisted_only)

        async for crawl_raw in self.crawls.find(
            {"collectionIds": coll_id}, projection=["_id"]
        ):
            crawl_id = crawl_raw.get("_id")
            if crawl_id:
                crawl_ids.append(crawl_id)

        return crawl_ids

    async def delete_collection(self, coll_id: UUID, org: Organization):
        """Delete collection and remove from associated crawls."""
        await self.crawl_ops.remove_collection_from_all_crawls(coll_id)

        result = await self.collections.delete_one({"_id": coll_id, "oid": org.id})
        if result.deleted_count < 1:
            raise HTTPException(status_code=404, detail="collection_not_found")

        asyncio.create_task(
            self.event_webhook_ops.create_collection_deleted_notification(coll_id, org)
        )

        return {"success": True}

    async def download_collection(self, coll_id: UUID, org: Organization):
        """Download all WACZs in collection as streaming nested WACZ"""
        coll = await self.get_collection_out(coll_id, org, resources=True)

        metadata = {
            "type": "collection",
            "id": str(coll_id),
            "title": coll.name,
            "organization": org.slug,
        }
        if coll.description:
            metadata["description"] = coll.description

        resp = await self.storage_ops.download_streaming_wacz(metadata, coll.resources)

        headers = {"Content-Disposition": f'attachment; filename="{coll.name}.wacz"'}
        return StreamingResponse(
            resp, headers=headers, media_type="application/wacz+zip"
        )

    async def recalculate_org_collection_stats(self, org: Organization):
        """recalculate counts, tags and dates for all collections in an org"""
        async for coll in self.collections.find({"oid": org.id}, projection={"_id": 1}):
            await self.update_collection_counts_and_tags(coll.get("_id"))
            await self.update_collection_dates(coll.get("_id"))

    async def update_collection_counts_and_tags(self, collection_id: UUID):
        """Recalculate collection crawl and page counts, total size, and tags"""
        # pylint: disable=too-many-locals
        crawl_count = 0
        page_count = 0
        total_size = 0
        tags = []

        crawl_ids = []
        preload_resources = []

        async for crawl_raw in self.crawls.find({"collectionIds": collection_id}):
            crawl = BaseCrawl.from_dict(crawl_raw)
            if crawl.state not in SUCCESSFUL_STATES:
                continue
            crawl_count += 1

            files = crawl.files or []
            for file in files:
                total_size += file.size

            try:
                crawl_page_count = await self.pages.count_documents(
                    {"crawl_id": crawl.id}
                )
                if crawl_page_count == 0:
                    for file in files:
                        preload_resources.append(
                            {
                                "name": os.path.basename(file.filename),
                                "crawlId": crawl.id,
                            }
                        )
                else:
                    page_count += crawl_page_count
            # pylint: disable=broad-exception-caught
            except Exception:
                pass

            if crawl.tags:
                tags.extend(crawl.tags)

            crawl_ids.append(crawl.id)

        sorted_tags = [tag for tag, count in Counter(tags).most_common()]

        unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids)

        await self.collections.find_one_and_update(
            {"_id": collection_id},
            {
                "$set": {
                    "crawlCount": crawl_count,
                    "pageCount": page_count,
                    "uniquePageCount": unique_page_count,
                    "totalSize": total_size,
                    "tags": sorted_tags,
                    "preloadResources": preload_resources,
                }
            },
        )

    async def update_collection_dates(self, coll_id: UUID):
        """Update collection earliest and latest dates from page timestamps"""
        # pylint: disable=too-many-locals
        coll = await self.get_collection(coll_id)
        crawl_ids = await self.get_collection_crawl_ids(coll_id)

        earliest_ts = None
        latest_ts = None

        match_query = {
            "oid": coll.oid,
            "crawl_id": {"$in": crawl_ids},
            "ts": {"$ne": None},
        }

        cursor = self.pages.find(match_query).sort("ts", 1).limit(1)
        pages = await cursor.to_list(length=1)
        try:
            earliest_page = pages[0]
            earliest_ts = earliest_page.get("ts")
        except IndexError:
            pass

        cursor = self.pages.find(match_query).sort("ts", -1).limit(1)
        pages = await cursor.to_list(length=1)
        try:
            latest_page = pages[0]
            latest_ts = latest_page.get("ts")
        except IndexError:
            pass

        await self.collections.find_one_and_update(
            {"_id": coll_id},
            {
                "$set": {
                    "dateEarliest": earliest_ts,
                    "dateLatest": latest_ts,
                }
            },
        )

    async def update_crawl_collections(self, crawl_id: str):
        """Update counts, dates, and modified for all collections in crawl"""
        crawl = await self.crawls.find_one({"_id": crawl_id})
        crawl_coll_ids = crawl.get("collectionIds")
        modified = dt_now()

        for coll_id in crawl_coll_ids:
            await self.update_collection_counts_and_tags(coll_id)
            await self.update_collection_dates(coll_id)
            await self.collections.find_one_and_update(
                {"_id": coll_id},
                {"$set": {"modified": modified}},
                return_document=pymongo.ReturnDocument.AFTER,
            )

    async def add_successful_crawl_to_collections(self, crawl_id: str, cid: UUID):
        """Add successful crawl to its auto-add collections."""
        workflow = await self.crawl_configs.find_one({"_id": cid})
        auto_add_collections = workflow.get("autoAddCollections")

        if auto_add_collections:
            await self.crawls.find_one_and_update(
                {"_id": crawl_id},
                {"$set": {"collectionIds": auto_add_collections}},
            )
            await self.update_crawl_collections(crawl_id)

    async def get_org_public_collections(
        self,
        org_slug: str,
        page_size: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        sort_by: Optional[str] = None,
        sort_direction: int = 1,
    ):
        """List public collections for org"""
        try:
            org = await self.orgs.get_org_by_slug(org_slug)
        # pylint: disable=broad-exception-caught
        except Exception:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=404, detail="public_profile_not_found")

        if not org.enablePublicProfile:
            raise HTTPException(status_code=404, detail="public_profile_not_found")

        collections, _ = await self.list_collections(
            org,
            page_size=page_size,
            page=page,
            sort_by=sort_by,
            sort_direction=sort_direction,
            public_colls_out=True,
        )

        public_org_details = PublicOrgDetails(
            name=org.name,
            description=org.publicDescription or "",
            url=org.publicUrl or "",
        )

        return OrgPublicCollections(org=public_org_details, collections=collections)

    async def set_home_url(
        self, coll_id: UUID, update: UpdateCollHomeUrl, org: Organization
    ) -> Dict[str, bool]:
        """Set home URL for collection and save thumbnail to database"""
        if update.pageId:
            page = await self.page_ops.get_page(update.pageId, org.id)
            update_query = {
                "homeUrl": page.url,
                "homeUrlTs": page.ts,
                "homeUrlPageId": page.id,
            }
        else:
            update_query = {
                "homeUrl": None,
                "homeUrlTs": None,
                "homeUrlPageId": None,
            }

        await self.collections.find_one_and_update(
            {"_id": coll_id, "oid": org.id},
            {"$set": update_query},
        )

        return {"updated": True}

    # pylint: disable=too-many-locals
    async def upload_thumbnail_stream(
        self,
        stream,
        filename: str,
        coll_id: UUID,
        org: Organization,
        user: User,
        source_url: Optional[AnyHttpUrl] = None,
        source_ts: Optional[datetime] = None,
        source_page_id: Optional[UUID] = None,
    ) -> Dict[str, bool]:
        """Upload file as stream to use as collection thumbnail"""
        coll = await self.get_collection(coll_id)

        _, extension = os.path.splitext(filename)

        image_filename = f"thumbnail-{str(coll_id)}{extension}"

        prefix = org.storage.get_storage_extra_path(str(org.id)) + "images/"

        file_prep = ImageFilePreparer(
            prefix,
            image_filename,
            original_filename=filename,
            user=user,
            created=dt_now(),
        )

        async def stream_iter():
            """iterate over each chunk, computing the digest + total size"""
            async for chunk in stream:
                file_prep.add_chunk(chunk)
                yield chunk

        print("Collection thumbnail stream upload starting", flush=True)

        if not await self.storage_ops.do_upload_multipart(
            org,
            file_prep.upload_name,
            stream_iter(),
            MIN_UPLOAD_PART_SIZE,
        ):
            print("Collection thumbnail stream upload failed", flush=True)
            raise HTTPException(status_code=400, detail="upload_failed")

        print("Collection thumbnail stream upload complete", flush=True)

        thumbnail_file = file_prep.get_image_file(org.storage)

        if thumbnail_file.size > THUMBNAIL_MAX_SIZE:
            print(
                "Collection thumbnail stream upload failed: max size (2 MB) exceeded",
                flush=True,
            )
            await self.storage_ops.delete_file_object(org, thumbnail_file)
            raise HTTPException(
                status_code=400,
                detail="max_thumbnail_size_2_mb_exceeded",
            )

        if coll.thumbnail:
            if not await self.storage_ops.delete_file_object(org, coll.thumbnail):
                print(
                    f"Unable to delete previous collection thumbnail: {coll.thumbnail.filename}"
                )

        coll.thumbnail = thumbnail_file

        if source_url and source_ts and source_page_id:
            coll.thumbnailSource = CollectionThumbnailSource(
                url=source_url,
                urlTs=source_ts,
                urlPageId=source_page_id,
            )

        # Update entire document to avoid bson.errors.InvalidDocument exception
        await self.collections.find_one_and_update(
            {"_id": coll_id, "oid": org.id},
            {"$set": coll.to_dict()},
        )

        return {"added": True}

    async def delete_thumbnail(self, coll_id: UUID, org: Organization):
        """Delete collection thumbnail"""
        coll = await self.get_collection(coll_id)

        if not coll.thumbnail:
            raise HTTPException(status_code=404, detail="thumbnail_not_found")

        if not await self.storage_ops.delete_file_object(org, coll.thumbnail):
            print(f"Unable to delete collection thumbnail: {coll.thumbnail.filename}")
            raise HTTPException(status_code=400, detail="file_deletion_error")

        # Delete from database
        await self.collections.find_one_and_update(
            {"_id": coll_id, "oid": org.id},
            {"$set": {"thumbnail": None}},
        )

        return {"deleted": True}

# ============================================================================
# pylint: disable=too-many-locals
def init_collections_api(
    app, mdb, orgs, storage_ops, event_webhook_ops, user_dep
) -> CollectionOps:
    """init collections api"""
    # pylint: disable=invalid-name, unused-argument, too-many-arguments

    colls: CollectionOps = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops)

    org_crawl_dep = orgs.org_crawl_dep
    org_viewer_dep = orgs.org_viewer_dep
    org_public = orgs.org_public

    @app.post(
        "/orgs/{oid}/collections",
        tags=["collections"],
        response_model=AddedResponseIdName,
    )
    async def add_collection(
        new_coll: CollIn, org: Organization = Depends(org_crawl_dep)
    ):
        return await colls.add_collection(org.id, new_coll)

    @app.get(
        "/orgs/{oid}/collections",
        tags=["collections"],
        response_model=PaginatedCollOutResponse,
    )
    async def list_collection_all(
        org: Organization = Depends(org_viewer_dep),
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        sortBy: Optional[str] = None,
        sortDirection: int = 1,
        name: Optional[str] = None,
        namePrefix: Optional[str] = None,
        access: Optional[str] = None,
    ):
        collections, total = await colls.list_collections(
            org,
            page_size=pageSize,
            page=page,
            sort_by=sortBy,
            sort_direction=sortDirection,
            name=name,
            name_prefix=namePrefix,
            access=access,
        )
        return paginated_format(collections, total, page, pageSize)

    @app.get(
        "/orgs/{oid}/collections/$all",
        tags=["collections"],
        response_model=Dict[str, List[CrawlFileOut]],
    )
    async def get_collection_all(org: Organization = Depends(org_viewer_dep)):
        results = {}
        try:
            all_collections, _ = await colls.list_collections(org, page_size=10_000)
            for collection in all_collections:
                results[collection.name], _, _ = (
                    await colls.get_collection_crawl_resources(collection.id)
                )
        except Exception as exc:
            # pylint: disable=raise-missing-from
            raise HTTPException(
                status_code=400, detail="Error Listing All Crawled Files: " + str(exc)
            )

        return results

    @app.get(
        "/orgs/{oid}/collections/search-values",
        tags=["collections"],
        response_model=CollectionSearchValuesResponse,
    )
    async def get_collection_search_values(
        org: Organization = Depends(org_viewer_dep),
    ):
        return await colls.get_collection_search_values(org)

    @app.get(
        "/orgs/{oid}/collections/{coll_id}",
        tags=["collections"],
        response_model=CollOut,
    )
    async def get_collection(
        coll_id: UUID, org: Organization = Depends(org_viewer_dep)
    ):
        return await colls.get_collection_out(coll_id, org)

    @app.get(
        "/orgs/{oid}/collections/{coll_id}/replay.json",
        tags=["collections"],
        response_model=CollOut,
    )
    async def get_collection_replay(
        request: Request, coll_id: UUID, org: Organization = Depends(org_viewer_dep)
    ):
        return await colls.get_collection_out(
            coll_id, org, resources=True, headers=dict(request.headers)
        )

    @app.get(
        "/orgs/{oid}/collections/{coll_id}/public/replay.json",
        tags=["collections"],
        response_model=CollOut,
    )
    async def get_collection_public_replay(
        request: Request,
        response: Response,
        coll_id: UUID,
        org: Organization = Depends(org_public),
    ):
        coll = await colls.get_collection_out(
            coll_id,
            org,
            resources=True,
            public_or_unlisted_only=True,
            headers=dict(request.headers),
        )
        response.headers["Access-Control-Allow-Origin"] = "*"
        response.headers["Access-Control-Allow-Headers"] = "*"
        return coll

    @app.options(
        "/orgs/{oid}/collections/{coll_id}/public/replay.json",
        tags=["collections"],
        response_model=EmptyResponse,
    )
    async def get_replay_preflight(response: Response):
        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
        response.headers["Access-Control-Allow-Origin"] = "*"
        response.headers["Access-Control-Allow-Headers"] = "*"
        return {}

    @app.patch(
        "/orgs/{oid}/collections/{coll_id}",
        tags=["collections"],
        response_model=UpdatedResponse,
    )
    async def update_collection(
        coll_id: UUID,
        update: UpdateColl,
        org: Organization = Depends(org_crawl_dep),
    ):
        return await colls.update_collection(coll_id, org, update)

    @app.post(
        "/orgs/{oid}/collections/{coll_id}/add",
        tags=["collections"],
        response_model=CollOut,
    )
    async def add_crawl_to_collection(
        crawlList: AddRemoveCrawlList,
        coll_id: UUID,
        org: Organization = Depends(org_crawl_dep),
    ) -> CollOut:
        return await colls.add_crawls_to_collection(coll_id, crawlList.crawlIds, org)

    @app.post(
        "/orgs/{oid}/collections/{coll_id}/remove",
        tags=["collections"],
        response_model=CollOut,
    )
    async def remove_crawl_from_collection(
        crawlList: AddRemoveCrawlList,
        coll_id: UUID,
        org: Organization = Depends(org_crawl_dep),
    ) -> CollOut:
        return await colls.remove_crawls_from_collection(
            coll_id, crawlList.crawlIds, org
        )

    @app.delete(
        "/orgs/{oid}/collections/{coll_id}",
        tags=["collections"],
        response_model=SuccessResponse,
    )
    async def delete_collection(
        coll_id: UUID, org: Organization = Depends(org_crawl_dep)
    ):
        return await colls.delete_collection(coll_id, org)

    @app.get(
        "/orgs/{oid}/collections/{coll_id}/download",
        tags=["collections"],
        response_model=bytes,
    )
    async def download_collection(
        coll_id: UUID, org: Organization = Depends(org_viewer_dep)
    ):
        return await colls.download_collection(coll_id, org)

    @app.get(
        "/public/orgs/{org_slug}/collections",
        tags=["collections", "public"],
        response_model=OrgPublicCollections,
    )
    async def get_org_public_collections(
        org_slug: str,
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        sortBy: Optional[str] = None,
        sortDirection: int = 1,
    ):
        return await colls.get_org_public_collections(
            org_slug,
            page_size=pageSize,
            page=page,
            sort_by=sortBy,
            sort_direction=sortDirection,
        )

    @app.get(
        "/public/orgs/{org_slug}/collections/{coll_slug}",
        tags=["collections", "public"],
        response_model=PublicCollOut,
    )
    async def get_public_collection(
        org_slug: str,
        coll_slug: str,
    ):
        try:
            org = await colls.orgs.get_org_by_slug(org_slug)
        # pylint: disable=broad-exception-caught
        except Exception:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=404, detail="collection_not_found")

        coll = await colls.get_collection_by_slug(coll_slug)

        return await colls.get_public_collection_out(coll.id, org, allow_unlisted=True)

    @app.get(
        "/public/orgs/{org_slug}/collections/{coll_slug}/download",
        tags=["collections", "public"],
        response_model=bytes,
    )
    async def download_public_collection(
        org_slug: str,
        coll_slug: str,
    ):
        try:
            org = await colls.orgs.get_org_by_slug(org_slug)
        # pylint: disable=broad-exception-caught
        except Exception:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=404, detail="collection_not_found")

        # Make sure collection exists and is public/unlisted
        coll = await colls.get_collection_by_slug(
            coll_slug, public_or_unlisted_only=True
        )

        if coll.allowPublicDownload is False:
            raise HTTPException(status_code=403, detail="not_allowed")

        return await colls.download_collection(coll.id, org)

    @app.post(
        "/orgs/{oid}/collections/{coll_id}/home-url",
        tags=["collections"],
        response_model=UpdatedResponse,
    )
    async def set_collection_home_url(
        update: UpdateCollHomeUrl,
        coll_id: UUID,
        org: Organization = Depends(org_crawl_dep),
    ):
        return await colls.set_home_url(coll_id, update, org)

    @app.put(
        "/orgs/{oid}/collections/{coll_id}/thumbnail",
        tags=["collections"],
        response_model=AddedResponse,
    )
    async def upload_thumbnail_stream(
        request: Request,
        filename: str,
        coll_id: UUID,
        sourceUrl: Optional[AnyHttpUrl],
        sourceTs: Optional[datetime],
        sourcePageId: Optional[UUID],
        org: Organization = Depends(org_crawl_dep),
        user: User = Depends(user_dep),
    ):
        return await colls.upload_thumbnail_stream(
            request.stream(),
            filename,
            coll_id,
            org,
            user,
            sourceUrl,
            sourceTs,
            sourcePageId,
        )

    @app.delete(
        "/orgs/{oid}/collections/{coll_id}/thumbnail",
        tags=["collections"],
        response_model=DeletedResponse,
    )
    async def delete_thumbnail_stream(
        coll_id: UUID,
        org: Organization = Depends(org_crawl_dep),
    ):
        return await colls.delete_thumbnail(coll_id, org)

    return colls
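
# ----------------------------------------------------------------------------
# Usage sketch (kept as a comment; not part of this module): how an app
# entrypoint might wire up the collections API. Only the init_collections_api()
# signature and the set_crawl_ops()/set_page_ops() setters come from this file;
# the module name "main" and the ops objects shown are assumptions for
# illustration.
#
#     from fastapi import FastAPI
#
#     from .colls import init_collections_api
#
#     app = FastAPI()
#
#     # mdb, org_ops, storage_ops, event_webhook_ops, crawl_ops, page_ops, and
#     # current_active_user are assumed to be created elsewhere during startup
#     coll_ops = init_collections_api(
#         app, mdb, org_ops, storage_ops, event_webhook_ops, current_active_user
#     )
#     coll_ops.set_crawl_ops(crawl_ops)
#     coll_ops.set_page_ops(page_ops)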