Add initial pages + pagesQuery endpoint to /replay.json APIs (#2380)
Fixes #2360

- Adds `initialPages` to /replay.json response for collections, returning up to 25 pages (seed pages first, then sorted by capture time).
- Adds `pagesQueryUrl` to /replay.json.
- Adds a public pages search endpoint to support public collections.
- Adds `preloadResources`, a list of WACZ files that should always be loaded, to /replay.json.

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
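For orientation, a rough sketch of how the new fields surface to an API client. The base URL, ids, and token below are placeholders, not values from this PR:

```python
import requests

# Sketch only: host, ids, and token are placeholders
API = "https://btrix.example.com/api"
oid = "<org-id>"
coll_id = "<collection-id>"

r = requests.get(
    f"{API}/orgs/{oid}/collections/{coll_id}/replay.json",
    headers={"Authorization": "Bearer <token>"},
)
data = r.json()

# New fields added by this PR:
print(len(data["initialPages"]))  # up to 25 PageOut entries, seed pages first
print(data["totalPages"])         # total page count for the collection
print(data["pagesQueryUrl"])      # absolute URL of the pages search endpoint
print(data["preloadResources"])   # WACZ files to always load, e.g.
                                  # [{"name": ..., "crawlId": ..., "hasPages": False}]
```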
This commit is contained in:
parent 73f9f949af
commit 7b2932c582
```diff
@@ -3,6 +3,7 @@
 from datetime import datetime, timedelta
 from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple
 from uuid import UUID
+import os
 import urllib.parse
 
 import asyncio
@@ -31,7 +32,7 @@ from .models import (
     PRESIGN_DURATION_SECONDS,
 )
 from .pagination import paginated_format, DEFAULT_PAGE_SIZE
-from .utils import dt_now, date_to_str
+from .utils import dt_now, date_to_str, get_origin
 
 if TYPE_CHECKING:
     from .crawlconfigs import CrawlConfigOps
@@ -156,6 +157,7 @@ class BaseCrawlOps:
         org: Optional[Organization] = None,
         type_: Optional[str] = None,
         skip_resources=False,
+        headers: Optional[dict] = None,
     ) -> CrawlOutWithResources:
         """Get crawl data for api output"""
         res = await self.get_crawl_raw(crawlid, org, type_)
@@ -168,6 +170,16 @@ class BaseCrawlOps:
             if coll_ids:
                 res["collections"] = await self.colls.get_collection_names(coll_ids)
 
+            res["initialPages"], _ = await self.page_ops.list_pages(
+                crawlid, is_seed=True, page_size=25
+            )
+
+            oid = res.get("oid")
+            if oid:
+                res["pagesQueryUrl"] = (
+                    get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages"
+                )
+
         crawl = CrawlOutWithResources.from_dict(res)
 
         if not skip_resources:
@@ -497,7 +509,7 @@ class BaseCrawlOps:
 
             out_files.append(
                 CrawlFileOut(
-                    name=file_.filename,
+                    name=os.path.basename(file_.filename),
                     path=presigned_url or "",
                     hash=file_.hash,
                     size=file_.size,
```
```diff
@@ -53,16 +53,18 @@ from .models import (
     ImageFilePreparer,
     MIN_UPLOAD_PART_SIZE,
     PublicCollOut,
+    PreloadResource,
 )
-from .utils import dt_now, slug_from_name, get_duplicate_key_error_field
+from .utils import dt_now, slug_from_name, get_duplicate_key_error_field, get_origin
 
 if TYPE_CHECKING:
     from .orgs import OrgOps
     from .storages import StorageOps
     from .webhooks import EventWebhookOps
     from .crawls import CrawlOps
+    from .pages import PageOps
 else:
-    OrgOps = StorageOps = EventWebhookOps = CrawlOps = object
+    OrgOps = StorageOps = EventWebhookOps = CrawlOps = PageOps = object
 
 
 THUMBNAIL_MAX_SIZE = 2_000_000
@@ -78,6 +80,7 @@ class CollectionOps:
     storage_ops: StorageOps
     event_webhook_ops: EventWebhookOps
     crawl_ops: CrawlOps
+    page_ops: PageOps
 
     def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
         self.collections = mdb["collections"]
@@ -337,12 +340,28 @@ class CollectionOps:
         org: Organization,
         resources=False,
         public_or_unlisted_only=False,
+        headers: Optional[dict] = None,
     ) -> CollOut:
         """Get CollOut by id"""
+        # pylint: disable=too-many-locals
         result = await self.get_collection_raw(coll_id, public_or_unlisted_only)
 
         if resources:
-            result["resources"] = await self.get_collection_crawl_resources(coll_id)
+            result["resources"], result["preloadResources"] = (
+                await self.get_collection_crawl_resources(
+                    coll_id, include_preloads=True
+                )
+            )
+
+            result["initialPages"], result["totalPages"] = (
+                await self.page_ops.list_collection_pages(coll_id, page_size=25)
+            )
+
+            public = "public/" if public_or_unlisted_only else ""
+            result["pagesQueryUrl"] = (
+                get_origin(headers)
+                + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages"
+            )
 
         thumbnail = result.get("thumbnail")
         if thumbnail:
@@ -369,7 +388,7 @@ class CollectionOps:
         if result.get("access") not in allowed_access:
             raise HTTPException(status_code=404, detail="collection_not_found")
 
-        result["resources"] = await self.get_collection_crawl_resources(coll_id)
+        result["resources"], _ = await self.get_collection_crawl_resources(coll_id)
 
         thumbnail = result.get("thumbnail")
         if thumbnail:
@@ -468,7 +487,11 @@
         collections: List[Union[CollOut, PublicCollOut]] = []
 
         for res in items:
-            res["resources"] = await self.get_collection_crawl_resources(res["_id"])
+            res["resources"], res["preloadResources"] = (
+                await self.get_collection_crawl_resources(
+                    res["_id"], include_preloads=not public_colls_out
+                )
+            )
 
             thumbnail = res.get("thumbnail")
             if thumbnail:
@@ -490,12 +513,14 @@
 
         return collections, total
 
-    async def get_collection_crawl_resources(self, coll_id: UUID):
+    async def get_collection_crawl_resources(
+        self, coll_id: UUID, include_preloads=False
+    ):
         """Return pre-signed resources for all collection crawl files."""
         # Ensure collection exists
         _ = await self.get_collection_raw(coll_id)
 
-        all_files = []
+        resources = []
 
         crawls, _ = await self.crawl_ops.list_all_base_crawls(
             collection_id=coll_id,
@@ -506,9 +531,36 @@
 
         for crawl in crawls:
             if crawl.resources:
-                all_files.extend(crawl.resources)
+                resources.extend(crawl.resources)
 
-        return all_files
+        preload_resources: List[PreloadResource] = []
+
+        if include_preloads:
+            no_page_items = await self.get_collection_resources_with_no_pages(crawls)
+            for item in no_page_items:
+                preload_resources.append(item)
+
+        return resources, preload_resources
+
+    async def get_collection_resources_with_no_pages(
+        self, crawls: List[CrawlOutWithResources]
+    ) -> List[PreloadResource]:
+        """Return wacz files in collection that have no pages"""
+        resources_no_pages: List[PreloadResource] = []
+
+        for crawl in crawls:
+            _, page_count = await self.page_ops.list_pages(crawl.id)
+            if page_count == 0 and crawl.resources:
+                for resource in crawl.resources:
+                    resources_no_pages.append(
+                        PreloadResource(
+                            name=os.path.basename(resource.name),
+                            crawlId=crawl.id,
+                            hasPages=False,
+                        )
+                    )
+
+        return resources_no_pages
 
     async def get_collection_names(self, uuids: List[UUID]):
         """return object of {_id, names} given list of collection ids"""
@@ -528,9 +580,15 @@
         names = [name for name in names if name]
         return {"names": names}
 
-    async def get_collection_crawl_ids(self, coll_id: UUID) -> List[str]:
-        """Return list of crawl ids in collection"""
+    async def get_collection_crawl_ids(
+        self, coll_id: UUID, public_or_unlisted_only=False
+    ) -> List[str]:
+        """Return list of crawl ids in collection, optionally public/unlisted only"""
         crawl_ids = []
+        # ensure collection is public or unlisted, else throw here
+        if public_or_unlisted_only:
+            await self.get_collection_raw(coll_id, public_or_unlisted_only)
+
         async for crawl_raw in self.crawls.find(
             {"collectionIds": coll_id}, projection=["_id"]
         ):
@@ -1010,8 +1068,8 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         try:
             all_collections, _ = await colls.list_collections(org, page_size=10_000)
             for collection in all_collections:
-                results[collection.name] = await colls.get_collection_crawl_resources(
-                    collection.id
-                )
+                results[collection.name], _ = (
+                    await colls.get_collection_crawl_resources(collection.id)
+                )
         except Exception as exc:
             # pylint: disable=raise-missing-from
@@ -1047,9 +1105,11 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         response_model=CollOut,
     )
     async def get_collection_replay(
-        coll_id: UUID, org: Organization = Depends(org_viewer_dep)
+        request: Request, coll_id: UUID, org: Organization = Depends(org_viewer_dep)
     ):
-        return await colls.get_collection_out(coll_id, org, resources=True)
+        return await colls.get_collection_out(
+            coll_id, org, resources=True, headers=dict(request.headers)
+        )
 
     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/replay.json",
@@ -1057,12 +1117,17 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         response_model=CollOut,
     )
     async def get_collection_public_replay(
+        request: Request,
         response: Response,
         coll_id: UUID,
         org: Organization = Depends(org_public),
     ):
         coll = await colls.get_collection_out(
-            coll_id, org, resources=True, public_or_unlisted_only=True
+            coll_id,
+            org,
+            resources=True,
+            public_or_unlisted_only=True,
+            headers=dict(request.headers),
         )
         response.headers["Access-Control-Allow-Origin"] = "*"
         response.headers["Access-Control-Allow-Headers"] = "*"
```
```diff
@@ -881,14 +881,6 @@ class CrawlOut(BaseMongoModel):
     errorPageCount: Optional[int] = 0
 
 
-# ============================================================================
-class CrawlOutWithResources(CrawlOut):
-    """Crawl output model including resources"""
-
-    resources: Optional[List[CrawlFileOut]] = []
-    collections: Optional[List[CollIdName]] = []
-
-
 # ============================================================================
 class UpdateCrawl(BaseModel):
     """Update crawl"""
@@ -1222,6 +1214,173 @@ class ImageFilePreparer(FilePreparer):
         )
 
 
+# ============================================================================
+
+### PAGES ###
+
+
+# ============================================================================
+class PageReviewUpdate(BaseModel):
+    """Update model for page manual review/approval"""
+
+    approved: Optional[bool] = None
+
+
+# ============================================================================
+class PageNoteIn(BaseModel):
+    """Input model for adding page notes"""
+
+    text: str
+
+
+# ============================================================================
+class PageNoteEdit(BaseModel):
+    """Input model for editing page notes"""
+
+    id: UUID
+    text: str
+
+
+# ============================================================================
+class PageNoteDelete(BaseModel):
+    """Delete model for page notes"""
+
+    delete_list: List[UUID] = []
+
+
+# ============================================================================
+class PageNote(BaseModel):
+    """Model for page notes, tracking user and time"""
+
+    id: UUID
+    text: str
+    created: datetime
+    userid: UUID
+    userName: str
+
+
+# ============================================================================
+class PageQACompare(BaseModel):
+    """Model for updating pages from QA run"""
+
+    screenshotMatch: Optional[float] = None
+    textMatch: Optional[float] = None
+    resourceCounts: Optional[Dict[str, int]] = None
+
+
+# ============================================================================
+class Page(BaseMongoModel):
+    """Core page data, no QA"""
+
+    id: UUID
+
+    oid: UUID
+    crawl_id: str
+
+    # core page data
+    url: AnyHttpUrl
+    title: Optional[str] = None
+    ts: Optional[datetime] = None
+    loadState: Optional[int] = None
+    status: Optional[int] = None
+    mime: Optional[str] = None
+    filename: Optional[str] = None
+    depth: Optional[int] = None
+    favIconUrl: Optional[AnyHttpUrl] = None
+    isSeed: Optional[bool] = False
+
+    # manual review
+    userid: Optional[UUID] = None
+    modified: Optional[datetime] = None
+    approved: Optional[bool] = None
+    notes: List[PageNote] = []
+
+    isFile: Optional[bool] = False
+    isError: Optional[bool] = False
+
+    def compute_page_type(self):
+        """sets self.isFile or self.isError flags"""
+        self.isFile = False
+        self.isError = False
+        if self.loadState == 2:
+            # pylint: disable=unsupported-membership-test
+            if self.mime and "html" not in self.mime:
+                self.isFile = True
+            elif self.title is None and self.status == 200:
+                self.isFile = True
+
+        elif self.loadState == 0:
+            self.isError = True
+
+
+# ============================================================================
+class PageWithAllQA(Page):
+    """Model for core page data + qa"""
+
+    # automated heuristics, keyed by QA run id
+    qa: Optional[Dict[str, PageQACompare]] = {}
+
+
+# ============================================================================
+class PageOut(Page):
+    """Model for pages output, no QA"""
+
+    status: int = 200
+
+
+# ============================================================================
+class PageOutWithSingleQA(Page):
+    """Page out with single QA entry"""
+
+    qa: Optional[PageQACompare] = None
+
+
+# ============================================================================
+class PageNoteAddedResponse(BaseModel):
+    """Model for response to adding page"""
+
+    added: bool
+    data: PageNote
+
+
+# ============================================================================
+class PageNoteUpdatedResponse(BaseModel):
+    """Model for response to updating page"""
+
+    updated: bool
+    data: PageNote
+
+
+# ============================================================================
+class PageIdTimestamp(BaseModel):
+    """Simplified model for page info to include in PageUrlCount"""
+
+    pageId: UUID
+    ts: Optional[datetime] = None
+    status: int = 200
+
+
+# ============================================================================
+class PageUrlCount(BaseModel):
+    """Model for counting pages by URL"""
+
+    url: AnyHttpUrl
+    count: int = 0
+    snapshots: List[PageIdTimestamp] = []
+
+
+# ============================================================================
+class CrawlOutWithResources(CrawlOut):
+    """Crawl output model including resources"""
+
+    resources: Optional[List[CrawlFileOut]] = []
+    collections: Optional[List[CollIdName]] = []
+
+    initialPages: List[PageOut] = []
+    totalPages: Optional[int] = None
+    pagesQueryUrl: str = ""
+
+
 # ============================================================================
 
 ### COLLECTIONS ###
@@ -1245,6 +1404,15 @@ class CollectionThumbnailSource(BaseModel):
     urlPageId: UUID
 
 
+# ============================================================================
+class PreloadResource(BaseModel):
+    """Resources that will be preloaded in RWP"""
+
+    name: str
+    crawlId: str
+    hasPages: bool
+
+
 # ============================================================================
 class Collection(BaseMongoModel):
     """Org collection structure"""
@@ -1338,6 +1506,11 @@ class CollOut(BaseMongoModel):
 
     allowPublicDownload: bool = True
 
+    initialPages: List[PageOut] = []
+    totalPages: Optional[int] = None
+    preloadResources: List[PreloadResource] = []
+    pagesQueryUrl: str = ""
+
 
 # ============================================================================
 class PublicCollOut(BaseMongoModel):
@@ -2435,161 +2608,6 @@ AnyJob = RootModel[
 ]
 
 
-# ============================================================================
-
-### PAGES ###
-
-
-# ============================================================================
-class PageReviewUpdate(BaseModel):
-    """Update model for page manual review/approval"""
-
-    approved: Optional[bool] = None
-
-
-# ============================================================================
-class PageNoteIn(BaseModel):
-    """Input model for adding page notes"""
-
-    text: str
-
-
-# ============================================================================
-class PageNoteEdit(BaseModel):
-    """Input model for editing page notes"""
-
-    id: UUID
-    text: str
-
-
-# ============================================================================
-class PageNoteDelete(BaseModel):
-    """Delete model for page notes"""
-
-    delete_list: List[UUID] = []
-
-
-# ============================================================================
-class PageNote(BaseModel):
-    """Model for page notes, tracking user and time"""
-
-    id: UUID
-    text: str
-    created: datetime
-    userid: UUID
-    userName: str
-
-
-# ============================================================================
-class PageQACompare(BaseModel):
-    """Model for updating pages from QA run"""
-
-    screenshotMatch: Optional[float] = None
-    textMatch: Optional[float] = None
-    resourceCounts: Optional[Dict[str, int]] = None
-
-
-# ============================================================================
-class Page(BaseMongoModel):
-    """Core page data, no QA"""
-
-    id: UUID
-
-    oid: UUID
-    crawl_id: str
-
-    # core page data
-    url: AnyHttpUrl
-    title: Optional[str] = None
-    ts: Optional[datetime] = None
-    loadState: Optional[int] = None
-    status: Optional[int] = None
-    mime: Optional[str] = None
-    filename: Optional[str] = None
-    depth: Optional[int] = None
-    favIconUrl: Optional[AnyHttpUrl] = None
-    isSeed: Optional[bool] = False
-
-    # manual review
-    userid: Optional[UUID] = None
-    modified: Optional[datetime] = None
-    approved: Optional[bool] = None
-    notes: List[PageNote] = []
-
-    isFile: Optional[bool] = False
-    isError: Optional[bool] = False
-
-    def compute_page_type(self):
-        """sets self.isFile or self.isError flags"""
-        self.isFile = False
-        self.isError = False
-        if self.loadState == 2:
-            # pylint: disable=unsupported-membership-test
-            if self.mime and "html" not in self.mime:
-                self.isFile = True
-            elif self.title is None and self.status == 200:
-                self.isFile = True
-
-        elif self.loadState == 0:
-            self.isError = True
-
-
-# ============================================================================
-class PageWithAllQA(Page):
-    """Model for core page data + qa"""
-
-    # automated heuristics, keyed by QA run id
-    qa: Optional[Dict[str, PageQACompare]] = {}
-
-
-# ============================================================================
-class PageOut(Page):
-    """Model for pages output, no QA"""
-
-    status: int = 200
-
-
-# ============================================================================
-class PageOutWithSingleQA(Page):
-    """Page out with single QA entry"""
-
-    qa: Optional[PageQACompare] = None
-
-
-# ============================================================================
-class PageNoteAddedResponse(BaseModel):
-    """Model for response to adding page"""
-
-    added: bool
-    data: PageNote
-
-
-# ============================================================================
-class PageNoteUpdatedResponse(BaseModel):
-    """Model for response to updating page"""
-
-    updated: bool
-    data: PageNote
-
-
-# ============================================================================
-class PageIdTimestamp(BaseModel):
-    """Simplified model for page info to include in PageUrlCount"""
-
-    pageId: UUID
-    ts: Optional[datetime] = None
-    status: int = 200
-
-
-# ============================================================================
-class PageUrlCount(BaseModel):
-    """Model for counting pages by URL"""
-
-    url: AnyHttpUrl
-    count: int = 0
-    snapshots: List[PageIdTimestamp] = []
-
-
 # ============================================================================
 
 ### GENERIC RESPONSE MODELS ###
```
```diff
@@ -11,7 +11,7 @@ from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
 from uuid import UUID, uuid4
 
-from fastapi import Depends, HTTPException, Request
+from fastapi import Depends, HTTPException, Request, Response
 import pymongo
 
 from .models import (
@@ -35,6 +35,7 @@ from .models import (
     DeletedResponse,
     PageNoteAddedResponse,
     PageNoteUpdatedResponse,
+    EmptyResponse,
 )
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .utils import str_to_date, str_list_to_bools, dt_now
@@ -503,6 +504,7 @@ class PageOps:
         self,
         crawl_id: str,
         org: Optional[Organization] = None,
+        search: Optional[str] = None,
         url: Optional[str] = None,
         url_prefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -534,6 +536,13 @@
         if org:
             query["oid"] = org.id
 
+        if search:
+            search_regex = re.escape(urllib.parse.unquote(search))
+            query["$or"] = [
+                {"url": {"$regex": search_regex, "$options": "i"}},
+                {"title": {"$regex": search_regex, "$options": "i"}},
+            ]
+
         if url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
             regex_pattern = f"^{re.escape(url_prefix)}"
@@ -661,6 +670,7 @@
         self,
         coll_id: UUID,
         org: Optional[Organization] = None,
+        search: Optional[str] = None,
         url: Optional[str] = None,
         url_prefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -670,6 +680,7 @@
         page: int = 1,
         sort_by: Optional[str] = None,
         sort_direction: Optional[int] = -1,
+        public_or_unlisted_only=False,
     ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
         """List all pages in collection, with optional filtering"""
         # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
@@ -677,7 +688,9 @@
         page = page - 1
         skip = page_size * page
 
-        crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)
+        crawl_ids = await self.coll_ops.get_collection_crawl_ids(
+            coll_id, public_or_unlisted_only
+        )
 
         query: dict[str, object] = {
             "crawl_id": {"$in": crawl_ids},
@@ -685,7 +698,14 @@
         if org:
             query["oid"] = org.id
 
-        if url_prefix:
+        if search:
+            search_regex = re.escape(urllib.parse.unquote(search))
+            query["$or"] = [
+                {"url": {"$regex": search_regex, "$options": "i"}},
+                {"title": {"$regex": search_regex, "$options": "i"}},
+            ]
+
+        elif url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
             regex_pattern = f"^{re.escape(url_prefix)}"
             query["url"] = {"$regex": regex_pattern, "$options": "i"}
@@ -724,6 +744,9 @@
                 raise HTTPException(status_code=400, detail="invalid_sort_direction")
 
             aggregate.extend([{"$sort": {sort_by: sort_direction}}])
+        else:
+            # default sort: seeds first, then by timestamp
+            aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}])
 
         aggregate.extend(
             [
@@ -886,6 +909,7 @@ def init_pages_api(
 
     org_viewer_dep = org_ops.org_viewer_dep
     org_crawl_dep = org_ops.org_crawl_dep
+    org_public = org_ops.org_public
 
     @app.post(
         "/orgs/{oid}/crawls/all/pages/reAdd",
@@ -1056,6 +1080,7 @@
     async def get_crawl_pages_list(
         crawl_id: str,
         org: Organization = Depends(org_crawl_dep),
+        search: Optional[str] = None,
         url: Optional[str] = None,
         urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -1077,6 +1102,7 @@
         pages, total = await ops.list_pages(
             crawl_id=crawl_id,
             org=org,
+            search=search,
             url=url,
             url_prefix=urlPrefix,
             ts=ts,
@@ -1093,13 +1119,15 @@
         return paginated_format(pages, total, page, pageSize)
 
     @app.get(
-        "/orgs/{oid}/collections/{coll_id}/pages",
+        "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],
         response_model=PaginatedPageOutResponse,
     )
-    async def get_collection_pages_list(
+    async def get_public_collection_pages_list(
         coll_id: UUID,
-        org: Organization = Depends(org_viewer_dep),
+        response: Response,
+        org: Organization = Depends(org_public),
+        search: Optional[str] = None,
         url: Optional[str] = None,
         urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -1114,6 +1142,58 @@
         pages, total = await ops.list_collection_pages(
             coll_id=coll_id,
             org=org,
+            search=search,
+            url=url,
+            url_prefix=urlPrefix,
+            ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            page_size=pageSize,
+            page=page,
+            sort_by=sortBy,
+            sort_direction=sortDirection,
+            public_or_unlisted_only=True,
+        )
+
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return paginated_format(pages, total, page, pageSize)
+
+    @app.options(
+        "/orgs/{oid}/collections/{coll_id}/public/pages",
+        tags=["pages", "collections"],
+        response_model=EmptyResponse,
+    )
+    async def get_replay_preflight(response: Response):
+        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return {}
+
+    @app.get(
+        "/orgs/{oid}/collections/{coll_id}/pages",
+        tags=["pages", "collections"],
+        response_model=PaginatedPageOutResponse,
+    )
+    async def get_collection_pages_list(
+        coll_id: UUID,
+        org: Organization = Depends(org_viewer_dep),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        urlPrefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        sortBy: Optional[str] = None,
+        sortDirection: Optional[int] = -1,
+    ):
+        """Retrieve paginated list of pages in collection"""
+        pages, total = await ops.list_collection_pages(
+            coll_id=coll_id,
+            org=org,
+            search=search,
             url=url,
             url_prefix=urlPrefix,
             ts=ts,
```
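The new `search` filter above performs a case-insensitive substring match against both page URL and title: the term is URL-decoded, then regex-escaped so metacharacters match literally. A minimal sketch of the query construction (the helper name is illustrative, not part of the diff):

```python
import re
import urllib.parse

def build_search_clause(search: str) -> dict:
    """Illustrative helper mirroring the search handling in
    list_pages / list_collection_pages above."""
    # decode %-escapes, then escape so the term is matched literally
    search_regex = re.escape(urllib.parse.unquote(search))
    return {
        "$or": [
            {"url": {"$regex": search_regex, "$options": "i"}},
            {"title": {"$regex": search_regex, "$options": "i"}},
        ]
    }

# "example%20domain" matches any page whose url or title contains
# "example domain", regardless of case
print(build_search_clause("example%20domain"))
```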
```diff
@@ -382,6 +382,11 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id):
     assert data["dateEarliest"]
     assert data["dateLatest"]
     assert data["defaultThumbnailName"]
+    assert data["initialPages"]
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/collections/{_coll_id}/pages"
+    )
+    assert "preloadResources" in data
 
     resources = data["resources"]
     assert resources
@@ -413,10 +418,27 @@ def test_collection_public(crawler_auth_headers, default_org_id):
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
         headers=crawler_auth_headers,
     )
+    data = r.json()
+    assert data["initialPages"]
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
+    )
+    assert "preloadResources" in data
+
     assert r.status_code == 200
     assert r.headers["Access-Control-Allow-Origin"] == "*"
     assert r.headers["Access-Control-Allow-Headers"] == "*"
 
+    # test public pages endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] > 0
+    assert data["items"]
+
     # make unlisted and test replay headers
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
@@ -451,6 +473,12 @@
     )
     assert r.status_code == 404
 
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 404
+
 
 def test_collection_access_invalid_value(crawler_auth_headers, default_org_id):
     r = requests.patch(
@@ -614,6 +642,39 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
     coll_page_id = coll_page["id"]
     coll_page_url = coll_page["url"]
     coll_page_ts = coll_page["ts"]
+    coll_page_title = coll_page["title"]
+
+    # Test search filter
+    partial_title = coll_page_title[:5]
+    partial_url = coll_page_url[:8]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert (
+            partial_title in matching_page["title"]
+            or partial_url in matching_page["url"]
+        )
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_url}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert (
+            partial_title in matching_page["title"]
+            or partial_url in matching_page["url"]
+        )
 
     # Test exact url filter
     r = requests.get(
```
```diff
@@ -184,6 +184,11 @@ def test_wait_for_complete(admin_auth_headers, default_org_id):
     assert len(data["resources"]) == 1
     assert data["resources"][0]["path"]
 
+    assert len(data["initialPages"]) == 1
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pages"
+    )
+
     # ensure filename matches specified pattern
     # set in default_crawl_filename_template
     assert re.search("/[\\d]+-testing-[\\w-]+\\.wacz", data["resources"][0]["path"])
```
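Assuming a deployed instance, the new public endpoint and its OPTIONS preflight can be exercised along these lines. This is a hypothetical check with placeholder host and ids, not part of the PR's test suite:

```python
import requests

# Placeholder host and ids; the path and headers come from the diff above
url = "https://btrix.example.com/api/orgs/<oid>/collections/<coll_id>/public/pages"

# browser-style CORS preflight, served by the new OPTIONS handler
r = requests.options(url)
assert r.headers["Access-Control-Allow-Methods"] == "GET, HEAD, OPTIONS"
assert r.headers["Access-Control-Allow-Origin"] == "*"

# the GET also returns CORS headers, so embeds such as ReplayWeb.page can
# query pages of a public collection cross-origin, with optional search
r = requests.get(url, params={"search": "example", "pageSize": 25})
print(r.json()["total"])
```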