- collections defined by name per archive
- can update collections with additional metadata (currently just description)
- crawl config api accepts a list of collections by name, resolved to collection uids and stored in config
- finished crawls also associated with collection list
- /archives/{aid}/collections/{name} can list all crawl artifacts (wacz files) from a named collection (in frictionless data package-ish format)
- /archives/{aid}/collections/$all lists all crawled artifacts for the archive
readiness check: add /healthz endpoints for app and nginx
ingress: add /data/ route to local bucket
storage improvements:
- for default storages, store path only, and prepend default storage access endpoint
- collections api returns the paths using the storage access endpoint
- define default storages as secrets in k8s (can support multiple), hard-coded in docker (only one for now)
		
	
			
		
			
				
	
	
		
			209 lines
		
	
	
		
			6.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			209 lines
		
	
	
		
			6.5 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| """
 | |
| Collections API
 | |
| """
 | |
| 
 | |
| import asyncio
 | |
| import uuid
 | |
| from typing import Optional, List
 | |
| 
 | |
| import pymongo
 | |
| from fastapi import APIRouter, Depends, HTTPException
 | |
| 
 | |
| from pydantic import BaseModel
 | |
| 
 | |
| from db import BaseMongoModel
 | |
| from archives import Archive
 | |
| 
 | |
| 
 | |
| # ============================================================================
 | |
class Collection(BaseMongoModel):
    """ Archive collection structure

    A named collection record scoped to a single archive. The pair
    (archive, name) is what the rest of this module uses to look a
    collection up.
    """

    # collection name, as supplied by the user
    name: str

    # id of the archive this collection belongs to
    archive: str

    # optional free-form description; explicitly defaults to None so the
    # field stays optional regardless of how the pydantic version in use
    # treats a bare Optional[...] annotation
    description: Optional[str] = None
 | |
| 
 | |
| 
 | |
| # ============================================================================
 | |
class CollIn(BaseModel):
    """ Collection Passed in By User

    Request body for creating or updating a collection.
    """

    # collection name
    name: str

    # optional description; explicit None default keeps the field optional
    # so a request body without a description stays valid
    description: Optional[str] = None
 | |
| 
 | |
| 
 | |
| # ============================================================================
 | |
class CollectionOps:
    """ ops for working with named collections of crawls """

    def __init__(self, mdb, crawls, crawl_manager):
        """ store handles and schedule creation of the lookup index

        mdb: database handle; the "collections" collection is used
        crawls: object providing list_finished_crawls(aid=..., collid=...)
        crawl_manager: object providing
            get_default_storage_access_endpoint(name)

        Must be called while an event loop is running, because it
        schedules init_index() as a task.
        """
        self.collections = mdb["collections"]

        self.crawls = crawls
        self.crawl_manager = crawl_manager

        # Keep a reference to the task: the event loop only holds weak
        # references to tasks, so an unreferenced task may be
        # garbage-collected before it completes.
        self._init_task = asyncio.create_task(self.init_index())

    async def init_index(self):
        """ init lookup index: unique on (archive, name) """
        await self.collections.create_index(
            [("archive", pymongo.ASCENDING), ("name", pymongo.ASCENDING)], unique=True
        )

    async def add_collection(self, archive, name, description=None):
        """ add new collection, or update the description of an existing one

        Returns the id of the inserted document, or, when a collection with
        the same (archive, name) already exists, the "_id" of that existing
        document after its description has been overwritten.
        """
        coll = Collection(
            id=str(uuid.uuid4()), archive=archive, name=name, description=description
        )
        try:
            res = await self.collections.insert_one(coll.to_dict())
            return res.inserted_id

        except pymongo.errors.DuplicateKeyError:
            # the unique (archive, name) index rejected the insert:
            # treat the call as an update of the existing collection
            res = await self.collections.find_one_and_update(
                {"archive": archive, "name": name},
                {"$set": {"name": name, "description": description}},
            )
            return res["_id"]

    async def find_collection(self, archive: str, name: str):
        """ find collection by archive + name, returns None if not found """
        res = await self.collections.find_one({"archive": archive, "name": name})
        return Collection.from_dict(res) if res else None

    async def find_collections(self, aid: str, names: List[str]):
        """ find all collections for archive given a list of names

        Returns the list of "_id" values of the matching collections.
        Raises HTTPException (400) listing every requested name that does
        not exist in the archive.

        The caller's `names` list is not modified, and duplicate names in
        the request are not reported as missing.
        """
        cursor = self.collections.find(
            {"archive": aid, "name": {"$in": names}}, projection=["_id", "name"]
        )
        results = await cursor.to_list(length=1000)

        found = {result["name"] for result in results}
        # dict.fromkeys de-duplicates while preserving the request order
        missing = [name for name in dict.fromkeys(names) if name not in found]
        if missing:
            raise HTTPException(
                status_code=400,
                detail=f"Specified collection(s) not found: {', '.join(missing)}",
            )

        return [result["_id"] for result in results]

    async def list_collections(self, aid: str):
        """ list all collections for archive as a {name: _id} mapping

        At most 1000 collections are returned.
        """
        cursor = self.collections.find({"archive": aid}, projection=["_id", "name"])
        results = await cursor.to_list(length=1000)
        return {result["name"]: result["_id"] for result in results}

    async def get_collection_crawls(self, archive, name=None):
        """ find collection and get all crawls by collection name per archive

        With a name, files of finished crawls in that collection are
        listed; returns None if the collection does not exist. Without a
        name, files of all finished crawls of the archive are listed.

        Returns {"resources": [...]} where each entry is the file's dict
        without "def_storage_name". For files kept in a default storage,
        the storage access endpoint is prepended to the filename.
        """
        aid = None
        collid = None
        if name:
            coll = await self.find_collection(archive, name)
            if not coll:
                return None

            collid = coll.id
        else:
            aid = archive

        crawls = await self.crawls.list_finished_crawls(aid=aid, collid=collid)

        # resolve each default storage endpoint once per call instead of
        # once per file
        storage_prefixes = {}
        all_files = []
        for crawl in crawls:
            if not crawl.files:
                continue

            for file_ in crawl.files:
                storage_name = file_.def_storage_name
                if storage_name:
                    if storage_name not in storage_prefixes:
                        storage_prefixes[storage_name] = (
                            await self.crawl_manager.get_default_storage_access_endpoint(
                                storage_name
                            )
                        )
                    file_.filename = storage_prefixes[storage_name] + file_.filename

                all_files.append(file_.dict(exclude={"def_storage_name"}))

        return {"resources": all_files}
 | |
| 
 | |
| 
 | |
| # ============================================================================
 | |
def init_collections_api(mdb, crawls, archives, crawl_manager):
    """ init collections api

    Creates the CollectionOps instance, defines the /collections routes,
    includes them in archives.router, and returns the CollectionOps
    instance.
    """
    colls = CollectionOps(mdb, crawls, crawl_manager)

    archive_crawl_dep = archives.archive_crawl_dep
    archive_viewer_dep = archives.archive_viewer_dep

    # NOTE(review): the router-level dependency below also runs for the
    # routes that declare archive_viewer_dep -- confirm that is intended.
    router = APIRouter(
        prefix="/collections",
        dependencies=[Depends(archive_crawl_dep)],
        responses={404: {"description": "Not found"}},
        tags=["collections"],
    )

    @router.post("")
    async def add_collection(
        new_coll: CollIn, archive: Archive = Depends(archive_crawl_dep)
    ):
        # "$all" is reserved for the route listing every crawl of the archive
        if new_coll.name == "$all":
            raise HTTPException(status_code=400, detail="Invalid Name")

        try:
            coll_id = await colls.add_collection(
                archive.id, new_coll.name, new_coll.description
            )

        except Exception as exc:
            raise HTTPException(
                status_code=400, detail=f"Error Updating Collection: {exc}"
            ) from exc

        return {"collection": coll_id}

    @router.get("")
    async def list_collection_all(archive: Archive = Depends(archive_viewer_dep)):
        return await colls.list_collections(archive.id)

    # registered before "/{coll_name}" so that "$all" is not captured as a
    # collection name
    @router.get("/$all")
    async def get_collection_all(archive: Archive = Depends(archive_viewer_dep)):
        try:
            results = await colls.get_collection_crawls(archive.id)

        except Exception as exc:
            raise HTTPException(
                status_code=400, detail="Error Listing All Crawled Files: " + str(exc)
            ) from exc

        return results

    @router.get("/{coll_name}")
    async def get_collection(
        coll_name: str, archive: Archive = Depends(archive_viewer_dep)
    ):
        try:
            results = await colls.get_collection_crawls(archive.id, coll_name)

        except Exception as exc:
            raise HTTPException(
                status_code=400, detail=f"Error Listing Collection: {exc}"
            ) from exc

        # get_collection_crawls returns None when the collection is unknown
        if not results:
            raise HTTPException(
                status_code=404, detail=f"Collection {coll_name} not found"
            )

        return results

    archives.router.include_router(router)

    return colls
 |