browsertrix/backend/colls.py
Ilya Kreymer 57a4b6b46f add collections api:
- collections defined by name per archive
- can update collections with additional metadata (currently just description)
- crawl config api accepts a list of collections by name, resolved to collection uids and stored in config
- finished crawls also associated with collection list
- /archives/{aid}/collections/{name} can list all crawl artifacts (wacz files) from a named collection (in frictionless data package-ish format)
- /archives/{aid}/collections/$all lists all crawled artifacts for the archive (usage sketch below)
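
For example, a minimal sketch of exercising these endpoints (the host, archive id, and auth token here are placeholders, not part of this change):

    import requests

    aid = "<archive-id>"  # hypothetical archive uid
    api = f"https://app.example.com/api/archives/{aid}/collections"
    auth = {"Authorization": "Bearer <token>"}

    # create a named collection (posting the same name again updates its metadata)
    requests.post(api, json={"name": "my-coll", "description": "test"}, headers=auth)

    # list wacz files in the collection, returned as {"resources": [...]}
    print(requests.get(f"{api}/my-coll", headers=auth).json()["resources"])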

readiness check: add /healthz endpoints for app and nginx
ingress: add /data/ route to local bucket
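
The app-side handler for a readiness check like this can be a trivial route that returns 200 once the server is up; a minimal FastAPI sketch (the actual handler is not shown in this file):

    @app.get("/healthz")
    async def healthz():
        """ k8s readiness probes poll this; 200 means the app is serving """
        return {}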

storage improvements:
- for default storages, store path only, and prepend default storage access endpoint (sketch below)
- collections api returns the paths using the storage access endpoint
- define default storages as secrets in k8s (can support multiple), hard-coded in docker (only one for now)
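
Resolving a stored path against a default storage is then a simple prefix concatenation, as in this sketch (values are illustrative, not the real config):

    storage_prefix = "http://local-bucket:9000/data/"  # default storage access endpoint
    stored_path = "crawl-abc/archive.wacz"             # path-only value kept in the db
    full_url = storage_prefix + stored_path            # what the collections api returns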
2021-10-27 09:39:14 -07:00

"""
Collections API
"""
import asyncio
import uuid
from typing import Optional, List
import pymongo
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from db import BaseMongoModel
from archives import Archive


# ============================================================================
class Collection(BaseMongoModel):
    """ Archive collection structure """

    name: str
    archive: str
    description: Optional[str]


# ============================================================================
class CollIn(BaseModel):
    """ Collection Passed in By User """

    name: str
    description: Optional[str]


# ============================================================================
class CollectionOps:
    """ ops for working with named collections of crawls """

    def __init__(self, mdb, crawls, crawl_manager):
        self.collections = mdb["collections"]
        self.crawls = crawls
        self.crawl_manager = crawl_manager
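
        # create the lookup index in the background (fire-and-forget),
        # so startup is not blocked on mongo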
        asyncio.create_task(self.init_index())

    async def init_index(self):
        """ init lookup index """
        await self.collections.create_index(
            [("archive", pymongo.ASCENDING), ("name", pymongo.ASCENDING)], unique=True
        )

    async def add_collection(self, archive, name, description=None):
        """ add new collection """
        coll = Collection(
            id=str(uuid.uuid4()), archive=archive, name=name, description=description
        )

        try:
            res = await self.collections.insert_one(coll.to_dict())
            return res.inserted_id
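        # the unique (archive, name) index rejects duplicates: the collection
        # already exists, so update its metadata in place instead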
        except pymongo.errors.DuplicateKeyError:
            res = await self.collections.find_one_and_update(
                {"archive": archive, "name": name},
                {"$set": {"name": name, "description": description}},
            )
            return res["_id"]

    async def find_collection(self, archive: str, name: str):
        """ find collection by archive + name """
        res = await self.collections.find_one({"archive": archive, "name": name})
        return Collection.from_dict(res) if res else None

    async def find_collections(self, aid: str, names: List[str]):
        """ find all collections for archive given a list of names """
        cursor = self.collections.find(
            {"archive": aid, "name": {"$in": names}}, projection=["_id", "name"]
        )
        results = await cursor.to_list(length=1000)
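        # if some names were not found, remove the found ones from the
        # (caller-supplied) list so the error reports exactly which are missing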
        if len(results) != len(names):
            for result in results:
                names.remove(result["name"])

            if names:
                raise HTTPException(
                    status_code=400,
                    detail=f"Specified collection(s) not found: {', '.join(names)}",
                )

        return [result["_id"] for result in results]

    async def list_collections(self, aid: str):
        """ list all collections for archive """
        cursor = self.collections.find({"archive": aid}, projection=["_id", "name"])
        results = await cursor.to_list(length=1000)
        return {result["name"]: result["_id"] for result in results}

    async def get_collection_crawls(self, archive, name=None):
        """ find collection and get all crawls by collection name per archive """
        aid = None
        collid = None

        if name:
            coll = await self.find_collection(archive, name)
            if not coll:
                return None

            collid = coll.id
        else:
            aid = archive

        crawls = await self.crawls.list_finished_crawls(aid=aid, collid=collid)

        all_files = []
        for crawl in crawls:
            if not crawl.files:
                continue

            for file_ in crawl.files:
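                # files on a default storage hold a relative path only;
                # prepend the storage's access endpoint to form the full path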
                if file_.def_storage_name:
                    storage_prefix = (
                        await self.crawl_manager.get_default_storage_access_endpoint(
                            file_.def_storage_name
                        )
                    )
                    file_.filename = storage_prefix + file_.filename

                all_files.append(file_.dict(exclude={"def_storage_name"}))

        return {"resources": all_files}


# ============================================================================
def init_collections_api(mdb, crawls, archives, crawl_manager):
    """ init collections api """
    colls = CollectionOps(mdb, crawls, crawl_manager)

    archive_crawl_dep = archives.archive_crawl_dep
    archive_viewer_dep = archives.archive_viewer_dep

    router = APIRouter(
        prefix="/collections",
        dependencies=[Depends(archive_crawl_dep)],
        responses={404: {"description": "Not found"}},
        tags=["collections"],
    )
@router.post("")
async def add_collection(
new_coll: CollIn, archive: Archive = Depends(archive_crawl_dep)
):
coll_id = None
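        # "$all" is reserved for the list-all endpoint below and
        # can not be used as a collection name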
        if new_coll.name == "$all":
            raise HTTPException(status_code=400, detail="Invalid Name")

        try:
            coll_id = await colls.add_collection(
                archive.id, new_coll.name, new_coll.description
            )
        except Exception as exc:
            # pylint: disable=raise-missing-from
            raise HTTPException(
                status_code=400, detail=f"Error Updating Collection: {exc}"
            )

        return {"collection": coll_id}
@router.get("")
async def list_collection_all(archive: Archive = Depends(archive_viewer_dep)):
return await colls.list_collections(archive.id)
@router.get("/$all")
async def get_collection_all(archive: Archive = Depends(archive_viewer_dep)):
try:
results = await colls.get_collection_crawls(archive.id)
except Exception as exc:
# pylint: disable=raise-missing-from
raise HTTPException(
status_code=400, detail="Error Listing All Crawled Files: " + str(exc)
)
return results
@router.get("/{coll_name}")
async def get_collection(
coll_name: str, archive: Archive = Depends(archive_viewer_dep)
):
try:
results = await colls.get_collection_crawls(archive.id, coll_name)
except Exception as exc:
# pylint: disable=raise-missing-from
raise HTTPException(
status_code=400, detail=f"Error Listing Collection: {exc}"
)
if not results:
raise HTTPException(
status_code=404, detail=f"Collection {coll_name} not found"
)
return results

    archives.router.include_router(router)

    return colls