"""
|
|
Collections API
|
|
"""
|
|
|
|
# pylint: disable=too-many-lines
|
|
from datetime import datetime
|
|
from collections import Counter
|
|
from uuid import UUID, uuid4
|
|
from typing import Optional, List, TYPE_CHECKING, cast, Dict, Any, Union
|
|
import os
|
|
|
|
import asyncio
|
|
import pymongo
|
|
from pymongo.collation import Collation
|
|
from fastapi import Depends, HTTPException, Response
|
|
from fastapi.responses import StreamingResponse
|
|
from starlette.requests import Request
|
|
|
|
from .pagination import DEFAULT_PAGE_SIZE, paginated_format
|
|
from .models import (
|
|
AnyHttpUrl,
|
|
Collection,
|
|
CollIn,
|
|
CollOut,
|
|
CollIdName,
|
|
CollectionThumbnailSource,
|
|
UpdateColl,
|
|
AddRemoveCrawlList,
|
|
BaseCrawl,
|
|
CrawlOutWithResources,
|
|
CrawlFileOut,
|
|
Organization,
|
|
PaginatedCollOutResponse,
|
|
SUCCESSFUL_STATES,
|
|
AddedResponseIdName,
|
|
EmptyResponse,
|
|
UpdatedResponse,
|
|
SuccessResponse,
|
|
AddedResponse,
|
|
DeletedResponse,
|
|
CollectionSearchValuesResponse,
|
|
OrgPublicCollections,
|
|
PublicOrgDetails,
|
|
CollAccessType,
|
|
UpdateCollHomeUrl,
|
|
User,
|
|
ImageFile,
|
|
ImageFilePreparer,
|
|
MIN_UPLOAD_PART_SIZE,
|
|
PublicCollOut,
|
|
)
|
|
from .utils import dt_now, slug_from_name, get_duplicate_key_error_field, get_origin
|
|
|
|
if TYPE_CHECKING:
|
|
from .orgs import OrgOps
|
|
from .storages import StorageOps
|
|
from .webhooks import EventWebhookOps
|
|
from .crawls import CrawlOps
|
|
from .pages import PageOps
|
|
else:
|
|
OrgOps = StorageOps = EventWebhookOps = CrawlOps = PageOps = object
|
|
|
|
|
|
THUMBNAIL_MAX_SIZE = 2_000_000
|
|
|
|
|
|


# ============================================================================
class CollectionOps:
    """ops for working with named collections of crawls"""

    # pylint: disable=too-many-arguments, too-many-instance-attributes, too-many-public-methods

    orgs: OrgOps
    storage_ops: StorageOps
    event_webhook_ops: EventWebhookOps
    crawl_ops: CrawlOps
    page_ops: PageOps

    def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
        self.collections = mdb["collections"]
        self.crawls = mdb["crawls"]
        self.crawl_configs = mdb["crawl_configs"]
        self.pages = mdb["pages"]
        self.crawl_ops = cast(CrawlOps, None)

        self.orgs = orgs
        self.storage_ops = storage_ops
        self.event_webhook_ops = event_webhook_ops

    def set_crawl_ops(self, ops):
        """set crawl ops"""
        self.crawl_ops = ops

    def set_page_ops(self, ops):
        """set page ops"""
        # pylint: disable=attribute-defined-outside-init
        self.page_ops = ops

    async def init_index(self):
        """init lookup index"""
        case_insensitive_collation = Collation(locale="en", strength=1)
        await self.collections.create_index(
            [("oid", pymongo.ASCENDING), ("name", pymongo.ASCENDING)],
            unique=True,
            collation=case_insensitive_collation,
        )

        await self.collections.create_index(
            [("oid", pymongo.ASCENDING), ("slug", pymongo.ASCENDING)],
            unique=True,
            collation=case_insensitive_collation,
        )

        await self.collections.create_index(
            [("oid", pymongo.ASCENDING), ("description", pymongo.ASCENDING)]
        )
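
    # Note: because init_index() builds the unique (oid, name) and (oid, slug)
    # indexes with a strength=1 collation, collection names and slugs are unique
    # per org case-insensitively (e.g. "My Archive" and "my archive" collide),
    # and the resulting DuplicateKeyError is surfaced as collection_<field>_taken.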

    async def add_collection(self, oid: UUID, coll_in: CollIn):
        """Add new collection"""
        crawl_ids = coll_in.crawlIds if coll_in.crawlIds else []
        coll_id = uuid4()
        created = dt_now()

        slug = coll_in.slug or slug_from_name(coll_in.name)

        coll = Collection(
            id=coll_id,
            oid=oid,
            name=coll_in.name,
            slug=slug,
            description=coll_in.description,
            caption=coll_in.caption,
            created=created,
            modified=created,
            access=coll_in.access,
            defaultThumbnailName=coll_in.defaultThumbnailName,
            allowPublicDownload=coll_in.allowPublicDownload,
        )
        try:
            await self.collections.insert_one(coll.to_dict())
            org = await self.orgs.get_org_by_id(oid)
            await self.clear_org_previous_slugs_matching_slug(slug, org)

            if crawl_ids:
                await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)
                await self.update_collection_counts_and_tags(coll_id)
                await self.update_collection_dates(coll_id)
                asyncio.create_task(
                    self.event_webhook_ops.create_added_to_collection_notification(
                        crawl_ids, coll_id, org
                    )
                )

            return {"added": True, "id": coll_id, "name": coll.name}
        except pymongo.errors.DuplicateKeyError as err:
            # pylint: disable=raise-missing-from
            field = get_duplicate_key_error_field(err)
            raise HTTPException(status_code=400, detail=f"collection_{field}_taken")

    async def update_collection(
        self, coll_id: UUID, org: Organization, update: UpdateColl
    ):
        """Update collection"""
        query = update.dict(exclude_unset=True)

        if len(query) == 0:
            raise HTTPException(status_code=400, detail="no_update_data")

        name_update = query.get("name")
        slug_update = query.get("slug")

        previous_slug = None

        if name_update or slug_update:
            # If we're updating slug, save old one to previousSlugs to support redirects
            coll = await self.get_collection(coll_id)
            previous_slug = coll.slug

        if name_update and not slug_update:
            slug = slug_from_name(name_update)
            query["slug"] = slug
            slug_update = slug

        query["modified"] = dt_now()

        db_update = {"$set": query}
        if previous_slug:
            db_update["$push"] = {"previousSlugs": previous_slug}

        try:
            result = await self.collections.find_one_and_update(
                {"_id": coll_id, "oid": org.id},
                db_update,
                return_document=pymongo.ReturnDocument.AFTER,
            )
        except pymongo.errors.DuplicateKeyError as err:
            # pylint: disable=raise-missing-from
            field = get_duplicate_key_error_field(err)
            raise HTTPException(status_code=400, detail=f"collection_{field}_taken")

        if not result:
            raise HTTPException(status_code=404, detail="collection_not_found")

        if slug_update:
            await self.clear_org_previous_slugs_matching_slug(slug_update, org)

        return {"updated": True}

    async def clear_org_previous_slugs_matching_slug(
        self, slug: str, org: Organization
    ):
        """Clear new slug from previousSlugs array of other collections in same org"""
        await self.collections.update_many(
            {"oid": org.id, "previousSlugs": slug},
            {"$pull": {"previousSlugs": slug}},
        )
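
    # Note: update_collection() keeps slug history so that links keep working
    # after a rename: the old slug is pushed onto previousSlugs, and the new
    # slug is pulled from every other collection's previousSlugs in the same
    # org, so any given slug redirects to at most one collection.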

    async def add_crawls_to_collection(
        self, coll_id: UUID, crawl_ids: List[str], org: Organization
    ) -> CollOut:
        """Add crawls to collection"""
        await self.crawl_ops.add_to_collection(crawl_ids, coll_id, org)

        modified = dt_now()
        result = await self.collections.find_one_and_update(
            {"_id": coll_id},
            {"$set": {"modified": modified}},
            return_document=pymongo.ReturnDocument.AFTER,
        )
        if not result:
            raise HTTPException(status_code=404, detail="collection_not_found")

        await self.update_collection_counts_and_tags(coll_id)
        await self.update_collection_dates(coll_id)

        asyncio.create_task(
            self.event_webhook_ops.create_added_to_collection_notification(
                crawl_ids, coll_id, org
            )
        )

        return await self.get_collection_out(coll_id, org)

    async def remove_crawls_from_collection(
        self, coll_id: UUID, crawl_ids: List[str], org: Organization
    ) -> CollOut:
        """Remove crawls from collection"""
        await self.crawl_ops.remove_from_collection(crawl_ids, coll_id)
        modified = dt_now()
        result = await self.collections.find_one_and_update(
            {"_id": coll_id},
            {"$set": {"modified": modified}},
            return_document=pymongo.ReturnDocument.AFTER,
        )
        if not result:
            raise HTTPException(status_code=404, detail="collection_not_found")

        await self.update_collection_counts_and_tags(coll_id)
        await self.update_collection_dates(coll_id)

        asyncio.create_task(
            self.event_webhook_ops.create_removed_from_collection_notification(
                crawl_ids, coll_id, org
            )
        )

        return await self.get_collection_out(coll_id, org)
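
    # Note: in the add/remove methods above, collection stats are recalculated
    # inline before responding, while webhook notifications are dispatched via
    # asyncio.create_task() so the HTTP response does not wait on webhook delivery.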

    async def get_collection_raw(
        self, coll_id: UUID, public_or_unlisted_only: bool = False
    ) -> Dict[str, Any]:
        """Get collection by id as dict from database"""
        query: dict[str, object] = {"_id": coll_id}
        if public_or_unlisted_only:
            query["access"] = {"$in": ["public", "unlisted"]}

        result = await self.collections.find_one(query)
        if not result:
            raise HTTPException(status_code=404, detail="collection_not_found")

        return result

    async def get_collection_raw_by_slug(
        self,
        coll_slug: str,
        previous_slugs: bool = False,
        public_or_unlisted_only: bool = False,
    ) -> Dict[str, Any]:
        """Get collection by slug (current or previous) as dict from database"""
        query: dict[str, object] = {}
        if previous_slugs:
            query["previousSlugs"] = coll_slug
        else:
            query["slug"] = coll_slug
        if public_or_unlisted_only:
            query["access"] = {"$in": ["public", "unlisted"]}

        result = await self.collections.find_one(query)
        if not result:
            raise HTTPException(status_code=404, detail="collection_not_found")

        return result

    async def get_collection(
        self, coll_id: UUID, public_or_unlisted_only: bool = False
    ) -> Collection:
        """Get collection by id"""
        result = await self.get_collection_raw(coll_id, public_or_unlisted_only)
        return Collection.from_dict(result)

    async def get_collection_by_slug(
        self, coll_slug: str, public_or_unlisted_only: bool = False
    ) -> Collection:
        """Get collection by slug"""
        try:
            result = await self.get_collection_raw_by_slug(
                coll_slug, public_or_unlisted_only=public_or_unlisted_only
            )
            return Collection.from_dict(result)
        # pylint: disable=broad-exception-caught
        except Exception:
            pass

        result = await self.get_collection_raw_by_slug(
            coll_slug,
            previous_slugs=True,
            public_or_unlisted_only=public_or_unlisted_only,
        )
        return Collection.from_dict(result)
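
    # Note: get_collection_by_slug() first resolves the current slug and, only
    # if that lookup fails, retries against previousSlugs, so requests made with
    # a pre-rename slug still find the renamed collection.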

    async def get_collection_out(
        self,
        coll_id: UUID,
        org: Organization,
        resources=False,
        public_or_unlisted_only=False,
        headers: Optional[dict] = None,
    ) -> CollOut:
        """Get CollOut by id"""
        # pylint: disable=too-many-locals
        result = await self.get_collection_raw(coll_id, public_or_unlisted_only)

        if resources:
            result["resources"], crawl_ids, pages_optimized = (
                await self.get_collection_crawl_resources(coll_id)
            )

            initial_pages, _ = await self.page_ops.list_pages(
                crawl_ids=crawl_ids,
                page_size=25,
            )

            public = "public/" if public_or_unlisted_only else ""

            if pages_optimized:
                result["initialPages"] = initial_pages
                result["pagesQueryUrl"] = (
                    get_origin(headers)
                    + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages"
                )

        thumbnail = result.get("thumbnail")
        if thumbnail:
            image_file = ImageFile(**thumbnail)
            result["thumbnail"] = await image_file.get_image_file_out(
                org, self.storage_ops
            )

        return CollOut.from_dict(result)
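
    # Note: when resources are requested, get_collection_out() embeds only the
    # first 25 pages as initialPages and returns a pagesQueryUrl pointing at the
    # collection's pages endpoint, and only does so when every crawl in the
    # collection is pages-optimized (see get_collection_crawl_resources below),
    # presumably so the replay frontend can fetch further pages on demand.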

    async def get_public_collection_out(
        self,
        coll_id: UUID,
        org: Organization,
        allow_unlisted: bool = False,
    ) -> PublicCollOut:
        """Get PublicCollOut by id"""
        result = await self.get_collection_raw(coll_id)

        allowed_access = [CollAccessType.PUBLIC]
        if allow_unlisted:
            allowed_access.append(CollAccessType.UNLISTED)

        if result.get("access") not in allowed_access:
            raise HTTPException(status_code=404, detail="collection_not_found")

        result["resources"], _, _ = await self.get_collection_crawl_resources(coll_id)

        thumbnail = result.get("thumbnail")
        if thumbnail:
            image_file = ImageFile(**thumbnail)
            result["thumbnail"] = await image_file.get_public_image_file_out(
                org, self.storage_ops
            )

        return PublicCollOut.from_dict(result)

    async def list_collections(
        self,
        org: Organization,
        public_colls_out: bool = False,
        page_size: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        sort_by: Optional[str] = None,
        sort_direction: int = 1,
        name: Optional[str] = None,
        name_prefix: Optional[str] = None,
        access: Optional[str] = None,
    ):
        """List all collections for org"""
        # pylint: disable=too-many-locals, duplicate-code, too-many-branches
        # Zero-index page for query
        page = page - 1
        skip = page * page_size

        match_query: Dict[str, Union[str, UUID, int, object]] = {"oid": org.id}

        if name:
            match_query["name"] = name
        elif name_prefix:
            regex_pattern = f"^{name_prefix}"
            match_query["name"] = {"$regex": regex_pattern, "$options": "i"}

        if public_colls_out:
            match_query["access"] = CollAccessType.PUBLIC
        elif access:
            match_query["access"] = access

        aggregate: List[Dict[str, Union[str, UUID, int, object]]] = [
            {"$match": match_query}
        ]

        if sort_by:
            if sort_by not in (
                "created",
                "modified",
                "dateLatest",
                "name",
                "crawlCount",
                "pageCount",
                "totalSize",
                "description",
                "caption",
            ):
                raise HTTPException(status_code=400, detail="invalid_sort_by")
            if sort_direction not in (1, -1):
                raise HTTPException(status_code=400, detail="invalid_sort_direction")

            sort_query = {sort_by: sort_direction}

            # add secondary sort keys:
            if sort_by == "dateLatest":
                sort_query["dateEarliest"] = sort_direction

            aggregate.extend([{"$sort": sort_query}])

        aggregate.extend(
            [
                {
                    "$facet": {
                        "items": [
                            {"$skip": skip},
                            {"$limit": page_size},
                        ],
                        "total": [{"$count": "count"}],
                    }
                },
            ]
        )

        cursor = self.collections.aggregate(
            aggregate, collation=pymongo.collation.Collation(locale="en")
        )
        results = await cursor.to_list(length=1)
        result = results[0]
        items = result["items"]

        try:
            total = int(result["total"][0]["count"])
        except (IndexError, ValueError):
            total = 0

        collections: List[Union[CollOut, PublicCollOut]] = []

        for res in items:
            thumbnail = res.get("thumbnail")
            if thumbnail:
                image_file = ImageFile(**thumbnail)

                if public_colls_out:
                    res["thumbnail"] = await image_file.get_public_image_file_out(
                        org, self.storage_ops
                    )
                else:
                    res["thumbnail"] = await image_file.get_image_file_out(
                        org, self.storage_ops
                    )

            if public_colls_out:
                collections.append(PublicCollOut.from_dict(res))
            else:
                collections.append(CollOut.from_dict(res))

        return collections, total
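
    # Note: list_collections() uses a single $facet stage so that one
    # aggregation returns both the requested page of items ($skip/$limit) and
    # the total matching count ($count), instead of issuing a separate count query.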

    async def get_collection_crawl_resources(
        self, coll_id: UUID
    ) -> tuple[List[CrawlFileOut], List[str], bool]:
        """Return pre-signed resources for all collection crawl files."""
        # Ensure collection exists
        _ = await self.get_collection_raw(coll_id)

        resources = []
        pages_optimized = True

        crawls, _ = await self.crawl_ops.list_all_base_crawls(
            collection_id=coll_id,
            states=list(SUCCESSFUL_STATES),
            page_size=10_000,
            cls_type=CrawlOutWithResources,
        )

        crawl_ids = []

        for crawl in crawls:
            crawl_ids.append(crawl.id)
            if crawl.resources:
                resources.extend(crawl.resources)
            if crawl.version != 2:
                pages_optimized = False

        return resources, crawl_ids, pages_optimized
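
    # Note: pages_optimized is True only if every successful crawl in the
    # collection has version == 2; a single older crawl (presumably one whose
    # pages have not been migrated to the optimized page format) disables the
    # initialPages/pagesQueryUrl fast path in get_collection_out().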

    async def get_collection_names(self, uuids: List[UUID]):
        """return object of {_id, names} given list of collection ids"""
        cursor = self.collections.find(
            {"_id": {"$in": uuids}}, projection=["_id", "name"]
        )
        names = await cursor.to_list(length=1000)
        names = [
            CollIdName(id=namedata["_id"], name=namedata["name"]) for namedata in names
        ]
        return names

    async def get_collection_search_values(self, org: Organization):
        """Return list of collection names"""
        names = await self.collections.distinct("name", {"oid": org.id})
        # Remove empty strings
        names = [name for name in names if name]
        return {"names": names}

    async def get_collection_crawl_ids(
        self, coll_id: UUID, public_or_unlisted_only=False
    ) -> List[str]:
        """Return list of crawl ids in collection, optionally only if the collection is public or unlisted"""
        crawl_ids = []
        # ensure collection is public or unlisted, else throw here
        if public_or_unlisted_only:
            await self.get_collection_raw(coll_id, public_or_unlisted_only)

        async for crawl_raw in self.crawls.find(
            {"collectionIds": coll_id}, projection=["_id"]
        ):
            crawl_id = crawl_raw.get("_id")
            if crawl_id:
                crawl_ids.append(crawl_id)
        return crawl_ids

    async def delete_collection(self, coll_id: UUID, org: Organization):
        """Delete collection and remove from associated crawls."""
        await self.crawl_ops.remove_collection_from_all_crawls(coll_id)

        result = await self.collections.delete_one({"_id": coll_id, "oid": org.id})
        if result.deleted_count < 1:
            raise HTTPException(status_code=404, detail="collection_not_found")

        asyncio.create_task(
            self.event_webhook_ops.create_collection_deleted_notification(coll_id, org)
        )

        return {"success": True}

    async def download_collection(self, coll_id: UUID, org: Organization):
        """Download all WACZs in collection as streaming nested WACZ"""
        coll = await self.get_collection_out(coll_id, org, resources=True)

        metadata = {
            "type": "collection",
            "id": str(coll_id),
            "title": coll.name,
            "organization": org.slug,
        }
        if coll.description:
            metadata["description"] = coll.description

        resp = await self.storage_ops.download_streaming_wacz(metadata, coll.resources)

        headers = {"Content-Disposition": f'attachment; filename="{coll.name}.wacz"'}
        return StreamingResponse(
            resp, headers=headers, media_type="application/wacz+zip"
        )

    async def recalculate_org_collection_stats(self, org: Organization):
        """recalculate counts, tags and dates for all collections in an org"""
        async for coll in self.collections.find({"oid": org.id}, projection={"_id": 1}):
            await self.update_collection_counts_and_tags(coll.get("_id"))
            await self.update_collection_dates(coll.get("_id"))
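
    # Note: download_collection() wraps the storage-layer WACZ stream in a
    # StreamingResponse, so the nested WACZ containing all of the collection's
    # WACZ files is sent to the client as it is produced rather than being
    # fully assembled first.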

    async def update_collection_counts_and_tags(self, collection_id: UUID):
        """Recalculate collection crawl count, page counts, total size, and tags"""
        # pylint: disable=too-many-locals
        crawl_count = 0
        page_count = 0
        total_size = 0
        tags = []

        crawl_ids = []
        preload_resources = []

        async for crawl_raw in self.crawls.find({"collectionIds": collection_id}):
            crawl = BaseCrawl.from_dict(crawl_raw)
            if crawl.state not in SUCCESSFUL_STATES:
                continue
            crawl_count += 1
            files = crawl.files or []
            for file in files:
                total_size += file.size

            try:
                crawl_page_count = await self.pages.count_documents(
                    {"crawl_id": crawl.id}
                )

                if crawl_page_count == 0:
                    for file in files:
                        preload_resources.append(
                            {
                                "name": os.path.basename(file.filename),
                                "crawlId": crawl.id,
                            }
                        )
                else:
                    page_count += crawl_page_count
            # pylint: disable=broad-exception-caught
            except Exception:
                pass

            if crawl.tags:
                tags.extend(crawl.tags)

            crawl_ids.append(crawl.id)

        sorted_tags = [tag for tag, count in Counter(tags).most_common()]

        unique_page_count = await self.page_ops.get_unique_page_count(crawl_ids)

        await self.collections.find_one_and_update(
            {"_id": collection_id},
            {
                "$set": {
                    "crawlCount": crawl_count,
                    "pageCount": page_count,
                    "uniquePageCount": unique_page_count,
                    "totalSize": total_size,
                    "tags": sorted_tags,
                    "preloadResources": preload_resources,
                }
            },
        )
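
    # Note: crawls that have no page documents in the database contribute their
    # WACZ files to preloadResources instead of the page count, presumably so
    # replay can still load those archives directly; tags are ordered by how
    # many crawls in the collection use them (Counter.most_common).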

    async def update_collection_dates(self, coll_id: UUID):
        """Update collection earliest and latest dates from page timestamps"""
        # pylint: disable=too-many-locals
        coll = await self.get_collection(coll_id)
        crawl_ids = await self.get_collection_crawl_ids(coll_id)

        earliest_ts = None
        latest_ts = None

        match_query = {
            "oid": coll.oid,
            "crawl_id": {"$in": crawl_ids},
            "ts": {"$ne": None},
        }

        cursor = self.pages.find(match_query).sort("ts", 1).limit(1)
        pages = await cursor.to_list(length=1)
        try:
            earliest_page = pages[0]
            earliest_ts = earliest_page.get("ts")
        except IndexError:
            pass

        cursor = self.pages.find(match_query).sort("ts", -1).limit(1)
        pages = await cursor.to_list(length=1)
        try:
            latest_page = pages[0]
            latest_ts = latest_page.get("ts")
        except IndexError:
            pass

        await self.collections.find_one_and_update(
            {"_id": coll_id},
            {
                "$set": {
                    "dateEarliest": earliest_ts,
                    "dateLatest": latest_ts,
                }
            },
        )
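
    # Note: dateEarliest/dateLatest are derived from page timestamps by sorting
    # the collection's pages on "ts" ascending and descending with limit(1),
    # i.e. two single-document lookups rather than scanning all pages in
    # application code.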

    async def update_crawl_collections(self, crawl_id: str):
        """Update counts, dates, and modified for all collections that contain the crawl"""
        crawl = await self.crawls.find_one({"_id": crawl_id})
        crawl_coll_ids = crawl.get("collectionIds")
        modified = dt_now()

        for coll_id in crawl_coll_ids:
            await self.update_collection_counts_and_tags(coll_id)
            await self.update_collection_dates(coll_id)
            await self.collections.find_one_and_update(
                {"_id": coll_id},
                {"$set": {"modified": modified}},
                return_document=pymongo.ReturnDocument.AFTER,
            )

    async def add_successful_crawl_to_collections(self, crawl_id: str, cid: UUID):
        """Add successful crawl to its auto-add collections."""
        workflow = await self.crawl_configs.find_one({"_id": cid})
        auto_add_collections = workflow.get("autoAddCollections")
        if auto_add_collections:
            await self.crawls.find_one_and_update(
                {"_id": crawl_id},
                {"$set": {"collectionIds": auto_add_collections}},
            )
            await self.update_crawl_collections(crawl_id)

    async def get_org_public_collections(
        self,
        org_slug: str,
        page_size: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        sort_by: Optional[str] = None,
        sort_direction: int = 1,
    ):
        """List public collections for org"""
        try:
            org = await self.orgs.get_org_by_slug(org_slug)
        # pylint: disable=broad-exception-caught
        except Exception:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=404, detail="public_profile_not_found")

        if not org.enablePublicProfile:
            raise HTTPException(status_code=404, detail="public_profile_not_found")

        collections, _ = await self.list_collections(
            org,
            page_size=page_size,
            page=page,
            sort_by=sort_by,
            sort_direction=sort_direction,
            public_colls_out=True,
        )

        public_org_details = PublicOrgDetails(
            name=org.name,
            description=org.publicDescription or "",
            url=org.publicUrl or "",
        )

        return OrgPublicCollections(org=public_org_details, collections=collections)

    async def set_home_url(
        self, coll_id: UUID, update: UpdateCollHomeUrl, org: Organization
    ) -> Dict[str, bool]:
        """Set home URL for collection and save thumbnail to database"""
        if update.pageId:
            page = await self.page_ops.get_page(update.pageId, org.id)
            update_query = {
                "homeUrl": page.url,
                "homeUrlTs": page.ts,
                "homeUrlPageId": page.id,
            }
        else:
            update_query = {
                "homeUrl": None,
                "homeUrlTs": None,
                "homeUrlPageId": None,
            }

        await self.collections.find_one_and_update(
            {"_id": coll_id, "oid": org.id},
            {"$set": update_query},
        )

        return {"updated": True}

    # pylint: disable=too-many-locals
    async def upload_thumbnail_stream(
        self,
        stream,
        filename: str,
        coll_id: UUID,
        org: Organization,
        user: User,
        source_url: Optional[AnyHttpUrl] = None,
        source_ts: Optional[datetime] = None,
        source_page_id: Optional[UUID] = None,
    ) -> Dict[str, bool]:
        """Upload file as stream to use as collection thumbnail"""
        coll = await self.get_collection(coll_id)

        _, extension = os.path.splitext(filename)

        image_filename = f"thumbnail-{str(coll_id)}{extension}"

        prefix = org.storage.get_storage_extra_path(str(org.id)) + "images/"

        file_prep = ImageFilePreparer(
            prefix,
            image_filename,
            original_filename=filename,
            user=user,
            created=dt_now(),
        )

        async def stream_iter():
            """iterate over each chunk and compute digest + total size"""
            async for chunk in stream:
                file_prep.add_chunk(chunk)
                yield chunk

        print("Collection thumbnail stream upload starting", flush=True)

        if not await self.storage_ops.do_upload_multipart(
            org,
            file_prep.upload_name,
            stream_iter(),
            MIN_UPLOAD_PART_SIZE,
        ):
            print("Collection thumbnail stream upload failed", flush=True)
            raise HTTPException(status_code=400, detail="upload_failed")

        print("Collection thumbnail stream upload complete", flush=True)

        thumbnail_file = file_prep.get_image_file(org.storage)

        if thumbnail_file.size > THUMBNAIL_MAX_SIZE:
            print(
                "Collection thumbnail stream upload failed: max size (2 MB) exceeded",
                flush=True,
            )
            await self.storage_ops.delete_file_object(org, thumbnail_file)
            raise HTTPException(
                status_code=400,
                detail="max_thumbnail_size_2_mb_exceeded",
            )

        if coll.thumbnail:
            if not await self.storage_ops.delete_file_object(org, coll.thumbnail):
                print(
                    f"Unable to delete previous collection thumbnail: {coll.thumbnail.filename}"
                )

        coll.thumbnail = thumbnail_file

        if source_url and source_ts and source_page_id:
            coll.thumbnailSource = CollectionThumbnailSource(
                url=source_url,
                urlTs=source_ts,
                urlPageId=source_page_id,
            )

        # Update entire document to avoid bson.errors.InvalidDocument exception
        await self.collections.find_one_and_update(
            {"_id": coll_id, "oid": org.id},
            {"$set": coll.to_dict()},
        )

        return {"added": True}
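
    # Note: the thumbnail is streamed to storage via a multipart upload while
    # its digest and size are accumulated chunk-by-chunk, so the 2 MB
    # THUMBNAIL_MAX_SIZE limit can only be enforced after the upload finishes;
    # an oversized file is deleted again and a 400 error returned.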

    async def delete_thumbnail(self, coll_id: UUID, org: Organization):
        """Delete collection thumbnail"""
        coll = await self.get_collection(coll_id)

        if not coll.thumbnail:
            raise HTTPException(status_code=404, detail="thumbnail_not_found")

        if not await self.storage_ops.delete_file_object(org, coll.thumbnail):
            print(f"Unable to delete collection thumbnail: {coll.thumbnail.filename}")
            raise HTTPException(status_code=400, detail="file_deletion_error")

        # Delete from database
        await self.collections.find_one_and_update(
            {"_id": coll_id, "oid": org.id},
            {"$set": {"thumbnail": None}},
        )

        return {"deleted": True}


# ============================================================================
# pylint: disable=too-many-locals
def init_collections_api(
    app, mdb, orgs, storage_ops, event_webhook_ops, user_dep
) -> CollectionOps:
    """init collections api"""
    # pylint: disable=invalid-name, unused-argument, too-many-arguments

    colls: CollectionOps = CollectionOps(mdb, storage_ops, orgs, event_webhook_ops)

    org_crawl_dep = orgs.org_crawl_dep
    org_viewer_dep = orgs.org_viewer_dep
    org_public = orgs.org_public

    @app.post(
        "/orgs/{oid}/collections",
        tags=["collections"],
        response_model=AddedResponseIdName,
    )
    async def add_collection(
        new_coll: CollIn, org: Organization = Depends(org_crawl_dep)
    ):
        return await colls.add_collection(org.id, new_coll)

    @app.get(
        "/orgs/{oid}/collections",
        tags=["collections"],
        response_model=PaginatedCollOutResponse,
    )
    async def list_collection_all(
        org: Organization = Depends(org_viewer_dep),
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        sortBy: Optional[str] = None,
        sortDirection: int = 1,
        name: Optional[str] = None,
        namePrefix: Optional[str] = None,
        access: Optional[str] = None,
    ):
        collections, total = await colls.list_collections(
            org,
            page_size=pageSize,
            page=page,
            sort_by=sortBy,
            sort_direction=sortDirection,
            name=name,
            name_prefix=namePrefix,
            access=access,
        )
        return paginated_format(collections, total, page, pageSize)

    @app.get(
        "/orgs/{oid}/collections/$all",
        tags=["collections"],
        response_model=Dict[str, List[CrawlFileOut]],
    )
    async def get_collection_all(org: Organization = Depends(org_viewer_dep)):
        results = {}
        try:
            all_collections, _ = await colls.list_collections(org, page_size=10_000)
            for collection in all_collections:
                results[collection.name], _, _ = (
                    await colls.get_collection_crawl_resources(collection.id)
                )
        except Exception as exc:
            # pylint: disable=raise-missing-from
            raise HTTPException(
                status_code=400, detail="Error Listing All Crawled Files: " + str(exc)
            )

        return results

    @app.get(
        "/orgs/{oid}/collections/search-values",
        tags=["collections"],
        response_model=CollectionSearchValuesResponse,
    )
    async def get_collection_search_values(
        org: Organization = Depends(org_viewer_dep),
    ):
        return await colls.get_collection_search_values(org)

    @app.get(
        "/orgs/{oid}/collections/{coll_id}",
        tags=["collections"],
        response_model=CollOut,
    )
    async def get_collection(
        coll_id: UUID, org: Organization = Depends(org_viewer_dep)
    ):
        return await colls.get_collection_out(coll_id, org)

    @app.get(
        "/orgs/{oid}/collections/{coll_id}/replay.json",
        tags=["collections"],
        response_model=CollOut,
    )
    async def get_collection_replay(
        request: Request, coll_id: UUID, org: Organization = Depends(org_viewer_dep)
    ):
        return await colls.get_collection_out(
            coll_id, org, resources=True, headers=dict(request.headers)
        )

    @app.get(
        "/orgs/{oid}/collections/{coll_id}/public/replay.json",
        tags=["collections"],
        response_model=CollOut,
    )
    async def get_collection_public_replay(
        request: Request,
        response: Response,
        coll_id: UUID,
        org: Organization = Depends(org_public),
    ):
        coll = await colls.get_collection_out(
            coll_id,
            org,
            resources=True,
            public_or_unlisted_only=True,
            headers=dict(request.headers),
        )
        response.headers["Access-Control-Allow-Origin"] = "*"
        response.headers["Access-Control-Allow-Headers"] = "*"
        return coll

    @app.options(
        "/orgs/{oid}/collections/{coll_id}/public/replay.json",
        tags=["collections"],
        response_model=EmptyResponse,
    )
    async def get_replay_preflight(response: Response):
        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
        response.headers["Access-Control-Allow-Origin"] = "*"
        response.headers["Access-Control-Allow-Headers"] = "*"
        return {}
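
    # Note: the public replay.json GET and OPTIONS handlers above set permissive
    # CORS headers (Access-Control-Allow-Origin: *), presumably so embedded
    # replay viewers on other origins can load public and unlisted collections
    # directly from the API.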

    @app.patch(
        "/orgs/{oid}/collections/{coll_id}",
        tags=["collections"],
        response_model=UpdatedResponse,
    )
    async def update_collection(
        coll_id: UUID,
        update: UpdateColl,
        org: Organization = Depends(org_crawl_dep),
    ):
        return await colls.update_collection(coll_id, org, update)

    @app.post(
        "/orgs/{oid}/collections/{coll_id}/add",
        tags=["collections"],
        response_model=CollOut,
    )
    async def add_crawl_to_collection(
        crawlList: AddRemoveCrawlList,
        coll_id: UUID,
        org: Organization = Depends(org_crawl_dep),
    ) -> CollOut:
        return await colls.add_crawls_to_collection(coll_id, crawlList.crawlIds, org)

    @app.post(
        "/orgs/{oid}/collections/{coll_id}/remove",
        tags=["collections"],
        response_model=CollOut,
    )
    async def remove_crawl_from_collection(
        crawlList: AddRemoveCrawlList,
        coll_id: UUID,
        org: Organization = Depends(org_crawl_dep),
    ) -> CollOut:
        return await colls.remove_crawls_from_collection(
            coll_id, crawlList.crawlIds, org
        )

    @app.delete(
        "/orgs/{oid}/collections/{coll_id}",
        tags=["collections"],
        response_model=SuccessResponse,
    )
    async def delete_collection(
        coll_id: UUID, org: Organization = Depends(org_crawl_dep)
    ):
        return await colls.delete_collection(coll_id, org)

    @app.get(
        "/orgs/{oid}/collections/{coll_id}/download",
        tags=["collections"],
        response_model=bytes,
    )
    async def download_collection(
        coll_id: UUID, org: Organization = Depends(org_viewer_dep)
    ):
        return await colls.download_collection(coll_id, org)

    @app.get(
        "/public/orgs/{org_slug}/collections",
        tags=["collections", "public"],
        response_model=OrgPublicCollections,
    )
    async def get_org_public_collections(
        org_slug: str,
        pageSize: int = DEFAULT_PAGE_SIZE,
        page: int = 1,
        sortBy: Optional[str] = None,
        sortDirection: int = 1,
    ):
        return await colls.get_org_public_collections(
            org_slug,
            page_size=pageSize,
            page=page,
            sort_by=sortBy,
            sort_direction=sortDirection,
        )

    @app.get(
        "/public/orgs/{org_slug}/collections/{coll_slug}",
        tags=["collections", "public"],
        response_model=PublicCollOut,
    )
    async def get_public_collection(
        org_slug: str,
        coll_slug: str,
    ):
        try:
            org = await colls.orgs.get_org_by_slug(org_slug)
        # pylint: disable=broad-exception-caught
        except Exception:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=404, detail="collection_not_found")

        coll = await colls.get_collection_by_slug(coll_slug)

        return await colls.get_public_collection_out(coll.id, org, allow_unlisted=True)

    @app.get(
        "/public/orgs/{org_slug}/collections/{coll_slug}/download",
        tags=["collections", "public"],
        response_model=bytes,
    )
    async def download_public_collection(
        org_slug: str,
        coll_slug: str,
    ):
        try:
            org = await colls.orgs.get_org_by_slug(org_slug)
        # pylint: disable=broad-exception-caught
        except Exception:
            # pylint: disable=raise-missing-from
            raise HTTPException(status_code=404, detail="collection_not_found")

        # Make sure collection exists and is public/unlisted
        coll = await colls.get_collection_by_slug(
            coll_slug, public_or_unlisted_only=True
        )

        if coll.allowPublicDownload is False:
            raise HTTPException(status_code=403, detail="not_allowed")

        return await colls.download_collection(coll.id, org)

    @app.post(
        "/orgs/{oid}/collections/{coll_id}/home-url",
        tags=["collections"],
        response_model=UpdatedResponse,
    )
    async def set_collection_home_url(
        update: UpdateCollHomeUrl,
        coll_id: UUID,
        org: Organization = Depends(org_crawl_dep),
    ):
        return await colls.set_home_url(coll_id, update, org)

    @app.put(
        "/orgs/{oid}/collections/{coll_id}/thumbnail",
        tags=["collections"],
        response_model=AddedResponse,
    )
    async def upload_thumbnail_stream(
        request: Request,
        filename: str,
        coll_id: UUID,
        sourceUrl: Optional[AnyHttpUrl],
        sourceTs: Optional[datetime],
        sourcePageId: Optional[UUID],
        org: Organization = Depends(org_crawl_dep),
        user: User = Depends(user_dep),
    ):
        return await colls.upload_thumbnail_stream(
            request.stream(),
            filename,
            coll_id,
            org,
            user,
            sourceUrl,
            sourceTs,
            sourcePageId,
        )

    @app.delete(
        "/orgs/{oid}/collections/{coll_id}/thumbnail",
        tags=["collections"],
        response_model=DeletedResponse,
    )
    async def delete_thumbnail_stream(
        coll_id: UUID,
        org: Organization = Depends(org_crawl_dep),
    ):
        return await colls.delete_thumbnail(coll_id, org)

    return colls