Add initial pages + pagesQuery endpoint to /replay.json APIs (#2380)
Fixes #2360

- Adds `initialPages` to the /replay.json response for collections, returning up to 25 pages (seed pages first, then sorted by capture time).
- Adds `pagesQueryUrl` to /replay.json.
- Adds a public pages search endpoint to support public collections.
- Adds `preloadResources` to /replay.json, including a list of WACZ files that should always be loaded.

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>

parent 73f9f949af, commit 7b2932c582
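As a quick illustration of how a client might consume the new fields (a minimal sketch; the base URL, IDs, and token are placeholders, while the endpoint paths and field names follow the changes in this PR):

```python
import requests

# Placeholder values for illustration only; substitute a real deployment,
# org id, collection id, and auth token.
API_BASE = "https://app.example.com/api"
ORG_ID = "<org-uuid>"
COLL_ID = "<coll-uuid>"
AUTH = {"Authorization": "Bearer <token>"}

resp = requests.get(
    f"{API_BASE}/orgs/{ORG_ID}/collections/{COLL_ID}/replay.json",
    headers=AUTH,
)
data = resp.json()

# Up to 25 pages (seed pages first) for immediate display in the replay UI
for page in data["initialPages"]:
    print(page["url"], page.get("ts"))

# Absolute URL of the filterable page search endpoint for this collection
print(data["pagesQueryUrl"])

# WACZ files that should always be loaded (their crawls have no indexed pages)
for res in data["preloadResources"]:
    print(res["crawlId"], res["name"])
```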
@@ -3,6 +3,7 @@
 from datetime import datetime, timedelta
 from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple
 from uuid import UUID
+import os
 import urllib.parse
 
 import asyncio
@@ -31,7 +32,7 @@ from .models import (
     PRESIGN_DURATION_SECONDS,
 )
 from .pagination import paginated_format, DEFAULT_PAGE_SIZE
-from .utils import dt_now, date_to_str
+from .utils import dt_now, date_to_str, get_origin
 
 if TYPE_CHECKING:
     from .crawlconfigs import CrawlConfigOps
@@ -156,6 +157,7 @@ class BaseCrawlOps:
         org: Optional[Organization] = None,
         type_: Optional[str] = None,
         skip_resources=False,
+        headers: Optional[dict] = None,
     ) -> CrawlOutWithResources:
         """Get crawl data for api output"""
         res = await self.get_crawl_raw(crawlid, org, type_)
@@ -168,6 +170,16 @@
             if coll_ids:
                 res["collections"] = await self.colls.get_collection_names(coll_ids)
 
+            res["initialPages"], _ = await self.page_ops.list_pages(
+                crawlid, is_seed=True, page_size=25
+            )
+
+            oid = res.get("oid")
+            if oid:
+                res["pagesQueryUrl"] = (
+                    get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages"
+                )
+
         crawl = CrawlOutWithResources.from_dict(res)
 
         if not skip_resources:
@@ -497,7 +509,7 @@
 
             out_files.append(
                 CrawlFileOut(
-                    name=file_.filename,
+                    name=os.path.basename(file_.filename),
                     path=presigned_url or "",
                     hash=file_.hash,
                     size=file_.size,
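`get_origin` is imported from `.utils` above but its body is not part of this diff; it evidently turns the incoming request headers into an absolute origin so that `pagesQueryUrl` can be a full URL. A hedged sketch of that idea follows; the header names and environment fallback are assumptions, not the actual helper:

```python
import os
from typing import Optional


def get_origin(headers: Optional[dict]) -> str:
    """Sketch: derive the request origin (scheme://host) from proxy headers.

    The host/x-forwarded-proto lookups and the APP_ORIGIN fallback are
    assumptions; the real helper lives in the backend's utils module.
    """
    default_origin = os.environ.get("APP_ORIGIN", "")
    if not headers:
        return default_origin

    host = headers.get("host")
    if not host:
        return default_origin

    scheme = headers.get("x-forwarded-proto", "https")
    return f"{scheme}://{host}"
```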
@@ -53,16 +53,18 @@ from .models import (
     ImageFilePreparer,
     MIN_UPLOAD_PART_SIZE,
     PublicCollOut,
+    PreloadResource,
 )
-from .utils import dt_now, slug_from_name, get_duplicate_key_error_field
+from .utils import dt_now, slug_from_name, get_duplicate_key_error_field, get_origin
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
     from .storages import StorageOps
     from .webhooks import EventWebhookOps
     from .crawls import CrawlOps
+    from .pages import PageOps
 else:
-    OrgOps = StorageOps = EventWebhookOps = CrawlOps = object
+    OrgOps = StorageOps = EventWebhookOps = CrawlOps = PageOps = object
 
 
 THUMBNAIL_MAX_SIZE = 2_000_000
@@ -78,6 +80,7 @@ class CollectionOps:
     storage_ops: StorageOps
     event_webhook_ops: EventWebhookOps
     crawl_ops: CrawlOps
+    page_ops: PageOps
 
     def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
         self.collections = mdb["collections"]
@@ -337,12 +340,28 @@
         org: Organization,
         resources=False,
         public_or_unlisted_only=False,
+        headers: Optional[dict] = None,
     ) -> CollOut:
         """Get CollOut by id"""
+        # pylint: disable=too-many-locals
         result = await self.get_collection_raw(coll_id, public_or_unlisted_only)
 
         if resources:
-            result["resources"] = await self.get_collection_crawl_resources(coll_id)
+            result["resources"], result["preloadResources"] = (
+                await self.get_collection_crawl_resources(
+                    coll_id, include_preloads=True
+                )
+            )
+
+            result["initialPages"], result["totalPages"] = (
+                await self.page_ops.list_collection_pages(coll_id, page_size=25)
+            )
+
+            public = "public/" if public_or_unlisted_only else ""
+            result["pagesQueryUrl"] = (
+                get_origin(headers)
+                + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages"
+            )
 
         thumbnail = result.get("thumbnail")
         if thumbnail:
@@ -369,7 +388,7 @@
         if result.get("access") not in allowed_access:
             raise HTTPException(status_code=404, detail="collection_not_found")
 
-        result["resources"] = await self.get_collection_crawl_resources(coll_id)
+        result["resources"], _ = await self.get_collection_crawl_resources(coll_id)
 
         thumbnail = result.get("thumbnail")
         if thumbnail:
@@ -468,7 +487,11 @@
         collections: List[Union[CollOut, PublicCollOut]] = []
 
         for res in items:
-            res["resources"] = await self.get_collection_crawl_resources(res["_id"])
+            res["resources"], res["preloadResources"] = (
+                await self.get_collection_crawl_resources(
+                    res["_id"], include_preloads=not public_colls_out
+                )
+            )
 
             thumbnail = res.get("thumbnail")
             if thumbnail:
@@ -490,12 +513,14 @@
 
         return collections, total
 
-    async def get_collection_crawl_resources(self, coll_id: UUID):
+    async def get_collection_crawl_resources(
+        self, coll_id: UUID, include_preloads=False
+    ):
         """Return pre-signed resources for all collection crawl files."""
         # Ensure collection exists
         _ = await self.get_collection_raw(coll_id)
 
-        all_files = []
+        resources = []
 
         crawls, _ = await self.crawl_ops.list_all_base_crawls(
             collection_id=coll_id,
@@ -506,9 +531,36 @@
 
         for crawl in crawls:
             if crawl.resources:
-                all_files.extend(crawl.resources)
+                resources.extend(crawl.resources)
 
-        return all_files
+        preload_resources: List[PreloadResource] = []
+
+        if include_preloads:
+            no_page_items = await self.get_collection_resources_with_no_pages(crawls)
+            for item in no_page_items:
+                preload_resources.append(item)
+
+        return resources, preload_resources
+
+    async def get_collection_resources_with_no_pages(
+        self, crawls: List[CrawlOutWithResources]
+    ) -> List[PreloadResource]:
+        """Return wacz files in collection that have no pages"""
+        resources_no_pages: List[PreloadResource] = []
+
+        for crawl in crawls:
+            _, page_count = await self.page_ops.list_pages(crawl.id)
+            if page_count == 0 and crawl.resources:
+                for resource in crawl.resources:
+                    resources_no_pages.append(
+                        PreloadResource(
+                            name=os.path.basename(resource.name),
+                            crawlId=crawl.id,
+                            hasPages=False,
+                        )
+                    )
+
+        return resources_no_pages
 
     async def get_collection_names(self, uuids: List[UUID]):
         """return object of {_id, names} given list of collection ids"""
@@ -528,9 +580,15 @@
         names = [name for name in names if name]
         return {"names": names}
 
-    async def get_collection_crawl_ids(self, coll_id: UUID) -> List[str]:
-        """Return list of crawl ids in collection"""
+    async def get_collection_crawl_ids(
+        self, coll_id: UUID, public_or_unlisted_only=False
+    ) -> List[str]:
+        """Return list of crawl ids in collection, including only public collections"""
         crawl_ids = []
+        # ensure collection is public or unlisted, else throw here
+        if public_or_unlisted_only:
+            await self.get_collection_raw(coll_id, public_or_unlisted_only)
+
         async for crawl_raw in self.crawls.find(
             {"collectionIds": coll_id}, projection=["_id"]
         ):
@@ -1010,8 +1068,8 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         try:
             all_collections, _ = await colls.list_collections(org, page_size=10_000)
             for collection in all_collections:
-                results[collection.name] = await colls.get_collection_crawl_resources(
-                    collection.id
+                results[collection.name], _ = (
+                    await colls.get_collection_crawl_resources(collection.id)
                 )
         except Exception as exc:
             # pylint: disable=raise-missing-from
@@ -1047,9 +1105,11 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         response_model=CollOut,
     )
     async def get_collection_replay(
-        coll_id: UUID, org: Organization = Depends(org_viewer_dep)
+        request: Request, coll_id: UUID, org: Organization = Depends(org_viewer_dep)
     ):
-        return await colls.get_collection_out(coll_id, org, resources=True)
+        return await colls.get_collection_out(
+            coll_id, org, resources=True, headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/replay.json",
@@ -1057,12 +1117,17 @@
         response_model=CollOut,
     )
     async def get_collection_public_replay(
+        request: Request,
         response: Response,
         coll_id: UUID,
         org: Organization = Depends(org_public),
     ):
         coll = await colls.get_collection_out(
-            coll_id, org, resources=True, public_or_unlisted_only=True
+            coll_id,
+            org,
+            resources=True,
+            public_or_unlisted_only=True,
+            headers=dict(request.headers),
         )
         response.headers["Access-Control-Allow-Origin"] = "*"
        response.headers["Access-Control-Allow-Headers"] = "*"
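Taken together, the collection-side changes mean a collection's /replay.json now bundles the pre-signed resources with an initial page list, a total page count, a pages query URL, and any preload resources. An abridged, illustrative response shape (all values are placeholders, not real output):

```python
# Illustrative only: approximate shape of a collection /replay.json response
# after this change, with placeholder values.
coll_replay = {
    "resources": [
        {"name": "crawl-1.wacz", "path": "https://storage.example.com/signed-url"},
    ],
    "preloadResources": [
        # WACZ files from crawls with zero indexed pages; RWP should always load these
        {"name": "upload-1.wacz", "crawlId": "manual-upload-1", "hasPages": False},
    ],
    "initialPages": [
        # up to 25 pages, seed pages first, then sorted by capture time
        {"url": "https://example.com/", "isSeed": True, "ts": "2025-01-01T00:00:00Z"},
    ],
    "totalPages": 100,
    "pagesQueryUrl": "https://app.example.com/api/orgs/<oid>/collections/<coll-id>/pages",
}

print(coll_replay["pagesQueryUrl"])
```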
@@ -881,14 +881,6 @@ class CrawlOut(BaseMongoModel):
     errorPageCount: Optional[int] = 0
 
 
-# ============================================================================
-class CrawlOutWithResources(CrawlOut):
-    """Crawl output model including resources"""
-
-    resources: Optional[List[CrawlFileOut]] = []
-    collections: Optional[List[CollIdName]] = []
-
-
 # ============================================================================
 class UpdateCrawl(BaseModel):
     """Update crawl"""
@@ -1222,6 +1214,173 @@ class ImageFilePreparer(FilePreparer):
         )
 
 
+# ============================================================================
+
+### PAGES ###
+
+
+# ============================================================================
+class PageReviewUpdate(BaseModel):
+    """Update model for page manual review/approval"""
+
+    approved: Optional[bool] = None
+
+
+# ============================================================================
+class PageNoteIn(BaseModel):
+    """Input model for adding page notes"""
+
+    text: str
+
+
+# ============================================================================
+class PageNoteEdit(BaseModel):
+    """Input model for editing page notes"""
+
+    id: UUID
+    text: str
+
+
+# ============================================================================
+class PageNoteDelete(BaseModel):
+    """Delete model for page notes"""
+
+    delete_list: List[UUID] = []
+
+
+# ============================================================================
+class PageNote(BaseModel):
+    """Model for page notes, tracking user and time"""
+
+    id: UUID
+    text: str
+    created: datetime
+    userid: UUID
+    userName: str
+
+
+# ============================================================================
+class PageQACompare(BaseModel):
+    """Model for updating pages from QA run"""
+
+    screenshotMatch: Optional[float] = None
+    textMatch: Optional[float] = None
+    resourceCounts: Optional[Dict[str, int]] = None
+
+
+# ============================================================================
+class Page(BaseMongoModel):
+    """Core page data, no QA"""
+
+    id: UUID
+
+    oid: UUID
+    crawl_id: str
+
+    # core page data
+    url: AnyHttpUrl
+    title: Optional[str] = None
+    ts: Optional[datetime] = None
+    loadState: Optional[int] = None
+    status: Optional[int] = None
+    mime: Optional[str] = None
+    filename: Optional[str] = None
+    depth: Optional[int] = None
+    favIconUrl: Optional[AnyHttpUrl] = None
+    isSeed: Optional[bool] = False
+
+    # manual review
+    userid: Optional[UUID] = None
+    modified: Optional[datetime] = None
+    approved: Optional[bool] = None
+    notes: List[PageNote] = []
+
+    isFile: Optional[bool] = False
+    isError: Optional[bool] = False
+
+    def compute_page_type(self):
+        """sets self.isFile or self.isError flags"""
+        self.isFile = False
+        self.isError = False
+        if self.loadState == 2:
+            # pylint: disable=unsupported-membership-test
+            if self.mime and "html" not in self.mime:
+                self.isFile = True
+            elif self.title is None and self.status == 200:
+                self.isFile = True
+
+        elif self.loadState == 0:
+            self.isError = True
+
+
+# ============================================================================
+class PageWithAllQA(Page):
+    """Model for core page data + qa"""
+
+    # automated heuristics, keyed by QA run id
+    qa: Optional[Dict[str, PageQACompare]] = {}
+
+
+# ============================================================================
+class PageOut(Page):
+    """Model for pages output, no QA"""
+
+    status: int = 200
+
+
+# ============================================================================
+class PageOutWithSingleQA(Page):
+    """Page out with single QA entry"""
+
+    qa: Optional[PageQACompare] = None
+
+
+# ============================================================================
+class PageNoteAddedResponse(BaseModel):
+    """Model for response to adding page"""
+
+    added: bool
+    data: PageNote
+
+
+# ============================================================================
+class PageNoteUpdatedResponse(BaseModel):
+    """Model for response to updating page"""
+
+    updated: bool
+    data: PageNote
+
+
+# ============================================================================
+class PageIdTimestamp(BaseModel):
+    """Simplified model for page info to include in PageUrlCount"""
+
+    pageId: UUID
+    ts: Optional[datetime] = None
+    status: int = 200
+
+
+# ============================================================================
+class PageUrlCount(BaseModel):
+    """Model for counting pages by URL"""
+
+    url: AnyHttpUrl
+    count: int = 0
+    snapshots: List[PageIdTimestamp] = []
+
+
+# ============================================================================
+class CrawlOutWithResources(CrawlOut):
+    """Crawl output model including resources"""
+
+    resources: Optional[List[CrawlFileOut]] = []
+    collections: Optional[List[CollIdName]] = []
+
+    initialPages: List[PageOut] = []
+    totalPages: Optional[int] = None
+    pagesQueryUrl: str = ""
+
+
 # ============================================================================
 
 ### COLLECTIONS ###
@@ -1245,6 +1404,15 @@ class CollectionThumbnailSource(BaseModel):
     urlPageId: UUID
 
 
+# ============================================================================
+class PreloadResource(BaseModel):
+    """Resources that will preloaded in RWP"""
+
+    name: str
+    crawlId: str
+    hasPages: bool
+
+
 # ============================================================================
 class Collection(BaseMongoModel):
     """Org collection structure"""
@@ -1338,6 +1506,11 @@ class CollOut(BaseMongoModel):
 
     allowPublicDownload: bool = True
 
+    initialPages: List[PageOut] = []
+    totalPages: Optional[int] = None
+    preloadResources: List[PreloadResource] = []
+    pagesQueryUrl: str = ""
+
 
 # ============================================================================
 class PublicCollOut(BaseMongoModel):
@@ -2435,161 +2608,6 @@ AnyJob = RootModel[
 ]
 
 
-# ============================================================================
-
-### PAGES ###
-
-
-# ============================================================================
-class PageReviewUpdate(BaseModel):
-    """Update model for page manual review/approval"""
-
-    approved: Optional[bool] = None
-
-
-# ============================================================================
-class PageNoteIn(BaseModel):
-    """Input model for adding page notes"""
-
-    text: str
-
-
-# ============================================================================
-class PageNoteEdit(BaseModel):
-    """Input model for editing page notes"""
-
-    id: UUID
-    text: str
-
-
-# ============================================================================
-class PageNoteDelete(BaseModel):
-    """Delete model for page notes"""
-
-    delete_list: List[UUID] = []
-
-
-# ============================================================================
-class PageNote(BaseModel):
-    """Model for page notes, tracking user and time"""
-
-    id: UUID
-    text: str
-    created: datetime
-    userid: UUID
-    userName: str
-
-
-# ============================================================================
-class PageQACompare(BaseModel):
-    """Model for updating pages from QA run"""
-
-    screenshotMatch: Optional[float] = None
-    textMatch: Optional[float] = None
-    resourceCounts: Optional[Dict[str, int]] = None
-
-
-# ============================================================================
-class Page(BaseMongoModel):
-    """Core page data, no QA"""
-
-    id: UUID
-
-    oid: UUID
-    crawl_id: str
-
-    # core page data
-    url: AnyHttpUrl
-    title: Optional[str] = None
-    ts: Optional[datetime] = None
-    loadState: Optional[int] = None
-    status: Optional[int] = None
-    mime: Optional[str] = None
-    filename: Optional[str] = None
-    depth: Optional[int] = None
-    favIconUrl: Optional[AnyHttpUrl] = None
-    isSeed: Optional[bool] = False
-
-    # manual review
-    userid: Optional[UUID] = None
-    modified: Optional[datetime] = None
-    approved: Optional[bool] = None
-    notes: List[PageNote] = []
-
-    isFile: Optional[bool] = False
-    isError: Optional[bool] = False
-
-    def compute_page_type(self):
-        """sets self.isFile or self.isError flags"""
-        self.isFile = False
-        self.isError = False
-        if self.loadState == 2:
-            # pylint: disable=unsupported-membership-test
-            if self.mime and "html" not in self.mime:
-                self.isFile = True
-            elif self.title is None and self.status == 200:
-                self.isFile = True
-
-        elif self.loadState == 0:
-            self.isError = True
-
-
-# ============================================================================
-class PageWithAllQA(Page):
-    """Model for core page data + qa"""
-
-    # automated heuristics, keyed by QA run id
-    qa: Optional[Dict[str, PageQACompare]] = {}
-
-
-# ============================================================================
-class PageOut(Page):
-    """Model for pages output, no QA"""
-
-    status: int = 200
-
-
-# ============================================================================
-class PageOutWithSingleQA(Page):
-    """Page out with single QA entry"""
-
-    qa: Optional[PageQACompare] = None
-
-
-# ============================================================================
-class PageNoteAddedResponse(BaseModel):
-    """Model for response to adding page"""
-
-    added: bool
-    data: PageNote
-
-
-# ============================================================================
-class PageNoteUpdatedResponse(BaseModel):
-    """Model for response to updating page"""
-
-    updated: bool
-    data: PageNote
-
-
-# ============================================================================
-class PageIdTimestamp(BaseModel):
-    """Simplified model for page info to include in PageUrlCount"""
-
-    pageId: UUID
-    ts: Optional[datetime] = None
-    status: int = 200
-
-
-# ============================================================================
-class PageUrlCount(BaseModel):
-    """Model for counting pages by URL"""
-
-    url: AnyHttpUrl
-    count: int = 0
-    snapshots: List[PageIdTimestamp] = []
-
-
 # ============================================================================
 
 ### GENERIC RESPONSE MODELS ###
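Since the page models (including `Page.compute_page_type`) moved wholesale in the hunk above, a small usage sketch may be helpful; importing the backend package as `btrixcloud` is an assumption here, inferred from the relative imports in the diff:

```python
from uuid import uuid4

from btrixcloud.models import Page  # package name assumed

# loadState == 2 with a non-HTML mime type marks the page as a file
pdf_page = Page(
    id=uuid4(),
    oid=uuid4(),
    crawl_id="crawl-1",
    url="https://example.com/report.pdf",
    loadState=2,
    mime="application/pdf",
)
pdf_page.compute_page_type()
assert pdf_page.isFile and not pdf_page.isError

# loadState == 0 marks the page as an error
failed_page = Page(
    id=uuid4(),
    oid=uuid4(),
    crawl_id="crawl-1",
    url="https://example.com/broken",
    loadState=0,
)
failed_page.compute_page_type()
assert failed_page.isError and not failed_page.isFile
```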
@@ -11,7 +11,7 @@ from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
 from uuid import UUID, uuid4
 
-from fastapi import Depends, HTTPException, Request
+from fastapi import Depends, HTTPException, Request, Response
 import pymongo
 
 from .models import (
@@ -35,6 +35,7 @@ from .models import (
     DeletedResponse,
     PageNoteAddedResponse,
     PageNoteUpdatedResponse,
+    EmptyResponse,
 )
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .utils import str_to_date, str_list_to_bools, dt_now
@@ -503,6 +504,7 @@ class PageOps:
         self,
         crawl_id: str,
         org: Optional[Organization] = None,
+        search: Optional[str] = None,
         url: Optional[str] = None,
         url_prefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -534,6 +536,13 @@
         if org:
             query["oid"] = org.id
 
+        if search:
+            search_regex = re.escape(urllib.parse.unquote(search))
+            query["$or"] = [
+                {"url": {"$regex": search_regex, "$options": "i"}},
+                {"title": {"$regex": search_regex, "$options": "i"}},
+            ]
+
         if url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
             regex_pattern = f"^{re.escape(url_prefix)}"
@@ -661,6 +670,7 @@
         self,
         coll_id: UUID,
         org: Optional[Organization] = None,
+        search: Optional[str] = None,
         url: Optional[str] = None,
         url_prefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -670,6 +680,7 @@
         page: int = 1,
         sort_by: Optional[str] = None,
         sort_direction: Optional[int] = -1,
+        public_or_unlisted_only=False,
     ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
         """List all pages in collection, with optional filtering"""
         # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
@@ -677,7 +688,9 @@
         page = page - 1
         skip = page_size * page
 
-        crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)
+        crawl_ids = await self.coll_ops.get_collection_crawl_ids(
+            coll_id, public_or_unlisted_only
+        )
 
         query: dict[str, object] = {
             "crawl_id": {"$in": crawl_ids},
@@ -685,7 +698,14 @@
         if org:
             query["oid"] = org.id
 
-        if url_prefix:
+        if search:
+            search_regex = re.escape(urllib.parse.unquote(search))
+            query["$or"] = [
+                {"url": {"$regex": search_regex, "$options": "i"}},
+                {"title": {"$regex": search_regex, "$options": "i"}},
+            ]
+
+        elif url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
             regex_pattern = f"^{re.escape(url_prefix)}"
             query["url"] = {"$regex": regex_pattern, "$options": "i"}
@@ -724,6 +744,9 @@
                 raise HTTPException(status_code=400, detail="invalid_sort_direction")
 
             aggregate.extend([{"$sort": {sort_by: sort_direction}}])
+        else:
+            # default sort: seeds first, then by timestamp
+            aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}])
 
         aggregate.extend(
             [
@@ -886,6 +909,7 @@ def init_pages_api(
 
     org_viewer_dep = org_ops.org_viewer_dep
     org_crawl_dep = org_ops.org_crawl_dep
+    org_public = org_ops.org_public
 
     @app.post(
         "/orgs/{oid}/crawls/all/pages/reAdd",
@@ -1056,6 +1080,7 @@ def init_pages_api(
     async def get_crawl_pages_list(
         crawl_id: str,
         org: Organization = Depends(org_crawl_dep),
+        search: Optional[str] = None,
        url: Optional[str] = None,
         urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -1077,6 +1102,7 @@ def init_pages_api(
         pages, total = await ops.list_pages(
             crawl_id=crawl_id,
             org=org,
+            search=search,
             url=url,
             url_prefix=urlPrefix,
             ts=ts,
@@ -1093,13 +1119,15 @@ def init_pages_api(
         return paginated_format(pages, total, page, pageSize)
 
     @app.get(
-        "/orgs/{oid}/collections/{coll_id}/pages",
+        "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],
         response_model=PaginatedPageOutResponse,
     )
-    async def get_collection_pages_list(
+    async def get_public_collection_pages_list(
         coll_id: UUID,
-        org: Organization = Depends(org_viewer_dep),
+        response: Response,
+        org: Organization = Depends(org_public),
+        search: Optional[str] = None,
         url: Optional[str] = None,
         urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -1114,6 +1142,58 @@ def init_pages_api(
         pages, total = await ops.list_collection_pages(
             coll_id=coll_id,
             org=org,
+            search=search,
             url=url,
             url_prefix=urlPrefix,
             ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            page_size=pageSize,
+            page=page,
+            sort_by=sortBy,
+            sort_direction=sortDirection,
+            public_or_unlisted_only=True,
+        )
+
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return paginated_format(pages, total, page, pageSize)
+
+    @app.options(
+        "/orgs/{oid}/collections/{coll_id}/public/pages",
+        tags=["pages", "collections"],
+        response_model=EmptyResponse,
+    )
+    async def get_replay_preflight(response: Response):
+        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return {}
+
+    @app.get(
+        "/orgs/{oid}/collections/{coll_id}/pages",
+        tags=["pages", "collections"],
+        response_model=PaginatedPageOutResponse,
+    )
+    async def get_collection_pages_list(
+        coll_id: UUID,
+        org: Organization = Depends(org_viewer_dep),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        urlPrefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        sortBy: Optional[str] = None,
+        sortDirection: Optional[int] = -1,
+    ):
+        """Retrieve paginated list of pages in collection"""
+        pages, total = await ops.list_collection_pages(
+            coll_id=coll_id,
+            org=org,
+            search=search,
+            url=url,
+            url_prefix=urlPrefix,
+            ts=ts,
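To exercise the new public pages endpoint and `search` filter added above (a case-insensitive substring match against page `url` and `title`), a request might look like this; the base URL and IDs are placeholders:

```python
import requests

API_BASE = "https://app.example.com/api"
ORG_ID = "<org-uuid>"
COLL_ID = "<coll-uuid>"

# No auth header needed: the endpoint returns 404 unless the collection is
# public or unlisted, and it sets permissive CORS headers for embedding.
resp = requests.get(
    f"{API_BASE}/orgs/{ORG_ID}/collections/{COLL_ID}/public/pages",
    params={"search": "example", "page": 1, "pageSize": 25},
)
resp.raise_for_status()
data = resp.json()
print(data["total"], [p["url"] for p in data["items"]])
```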
@@ -382,6 +382,11 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id):
     assert data["dateEarliest"]
     assert data["dateLatest"]
     assert data["defaultThumbnailName"]
+    assert data["initialPages"]
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/collections/{_coll_id}/pages"
+    )
+    assert "preloadResources" in data
 
     resources = data["resources"]
     assert resources
@@ -413,10 +418,27 @@ def test_collection_public(crawler_auth_headers, default_org_id):
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
         headers=crawler_auth_headers,
     )
+    data = r.json()
+    assert data["initialPages"]
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
+    )
+    assert "preloadResources" in data
+
     assert r.status_code == 200
     assert r.headers["Access-Control-Allow-Origin"] == "*"
     assert r.headers["Access-Control-Allow-Headers"] == "*"
 
+    # test public pages endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] > 0
+    assert data["items"]
+
     # make unlisted and test replay headers
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
@@ -451,6 +473,12 @@
     )
     assert r.status_code == 404
 
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 404
+
 
 def test_collection_access_invalid_value(crawler_auth_headers, default_org_id):
     r = requests.patch(
@@ -614,6 +642,39 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
     coll_page_id = coll_page["id"]
     coll_page_url = coll_page["url"]
     coll_page_ts = coll_page["ts"]
+    coll_page_title = coll_page["title"]
+
+    # Test search filter
+    partial_title = coll_page_title[:5]
+    partial_url = coll_page_url[:8]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert (
+            partial_title in matching_page["title"]
+            or partial_url in matching_page["url"]
+        )
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_url}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert (
+            partial_title in matching_page["title"]
+            or partial_url in matching_page["url"]
+        )
+
     # Test exact url filter
     r = requests.get(
@@ -184,6 +184,11 @@ def test_wait_for_complete(admin_auth_headers, default_org_id):
     assert len(data["resources"]) == 1
     assert data["resources"][0]["path"]
 
+    assert len(data["initialPages"]) == 1
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pages"
+    )
+
     # ensure filename matches specified pattern
     # set in default_crawl_filename_template
     assert re.search("/[\\d]+-testing-[\\w-]+\\.wacz", data["resources"][0]["path"])