Add initial pages + pagesQuery endpoint to /replay.json APIs (#2380)
Fixes #2360

- Adds `initialPages` to the /replay.json response for collections, returning up to 25 pages (seed pages first, then sorted by capture time).
- Adds `pagesQueryUrl` to /replay.json.
- Adds a public pages search endpoint to support public collections.
- Adds `preloadResources`, including a list of WACZ files that should always be loaded, to /replay.json.

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
commit 7b2932c582
parent 73f9f949af
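As a rough usage sketch (not part of this commit), a client can render `initialPages` immediately and fetch further pages via `pagesQueryUrl`; the base URL, org/collection IDs, and auth token below are placeholders:

```python
import requests

API = "https://example.com/api"              # placeholder deployment URL
ORG_ID = "<org-id>"                          # placeholder org id
COLL_ID = "<coll-id>"                        # placeholder collection id
AUTH = {"Authorization": "Bearer <token>"}   # placeholder auth header

# /replay.json for a collection now returns initialPages, pagesQueryUrl,
# and preloadResources alongside the existing resources list
replay = requests.get(
    f"{API}/orgs/{ORG_ID}/collections/{COLL_ID}/replay.json", headers=AUTH
).json()

initial_pages = replay["initialPages"]       # up to 25 pages, seed pages first
preload = replay["preloadResources"]         # WACZ files that should always load

# further pages are queried via the URL the API hands back
more = requests.get(
    replay["pagesQueryUrl"],
    params={"search": "example", "pageSize": 25},
    headers=AUTH,
).json()
print(len(initial_pages), more["total"], len(more["items"]))
```

The public variants (`/public/replay.json`, `/public/pages`) return the same fields without authentication and set permissive CORS headers, as the diff below shows.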
@@ -3,6 +3,7 @@
 from datetime import datetime, timedelta
 from typing import Optional, List, Union, Dict, Any, Type, TYPE_CHECKING, cast, Tuple
 from uuid import UUID
+import os
 import urllib.parse

 import asyncio
@@ -31,7 +32,7 @@ from .models import (
     PRESIGN_DURATION_SECONDS,
 )
 from .pagination import paginated_format, DEFAULT_PAGE_SIZE
-from .utils import dt_now, date_to_str
+from .utils import dt_now, date_to_str, get_origin

 if TYPE_CHECKING:
     from .crawlconfigs import CrawlConfigOps
@@ -156,6 +157,7 @@ class BaseCrawlOps:
         org: Optional[Organization] = None,
         type_: Optional[str] = None,
         skip_resources=False,
+        headers: Optional[dict] = None,
     ) -> CrawlOutWithResources:
         """Get crawl data for api output"""
         res = await self.get_crawl_raw(crawlid, org, type_)
@@ -168,6 +170,16 @@
         if coll_ids:
             res["collections"] = await self.colls.get_collection_names(coll_ids)

+        res["initialPages"], _ = await self.page_ops.list_pages(
+            crawlid, is_seed=True, page_size=25
+        )
+
+        oid = res.get("oid")
+        if oid:
+            res["pagesQueryUrl"] = (
+                get_origin(headers) + f"/api/orgs/{oid}/crawls/{crawlid}/pages"
+            )
+
         crawl = CrawlOutWithResources.from_dict(res)

         if not skip_resources:
@@ -497,7 +509,7 @@ class BaseCrawlOps:

         out_files.append(
             CrawlFileOut(
-                name=file_.filename,
+                name=os.path.basename(file_.filename),
                 path=presigned_url or "",
                 hash=file_.hash,
                 size=file_.size,

@@ -53,16 +53,18 @@ from .models import (
     ImageFilePreparer,
     MIN_UPLOAD_PART_SIZE,
     PublicCollOut,
+    PreloadResource,
 )
-from .utils import dt_now, slug_from_name, get_duplicate_key_error_field
+from .utils import dt_now, slug_from_name, get_duplicate_key_error_field, get_origin

 if TYPE_CHECKING:
     from .orgs import OrgOps
     from .storages import StorageOps
     from .webhooks import EventWebhookOps
     from .crawls import CrawlOps
+    from .pages import PageOps
 else:
-    OrgOps = StorageOps = EventWebhookOps = CrawlOps = object
+    OrgOps = StorageOps = EventWebhookOps = CrawlOps = PageOps = object


 THUMBNAIL_MAX_SIZE = 2_000_000
@@ -78,6 +80,7 @@ class CollectionOps:
     storage_ops: StorageOps
     event_webhook_ops: EventWebhookOps
     crawl_ops: CrawlOps
+    page_ops: PageOps

     def __init__(self, mdb, storage_ops, orgs, event_webhook_ops):
         self.collections = mdb["collections"]
@@ -337,12 +340,28 @@ class CollectionOps:
         org: Organization,
         resources=False,
         public_or_unlisted_only=False,
+        headers: Optional[dict] = None,
     ) -> CollOut:
         """Get CollOut by id"""
+        # pylint: disable=too-many-locals
         result = await self.get_collection_raw(coll_id, public_or_unlisted_only)

         if resources:
-            result["resources"] = await self.get_collection_crawl_resources(coll_id)
+            result["resources"], result["preloadResources"] = (
+                await self.get_collection_crawl_resources(
+                    coll_id, include_preloads=True
+                )
+            )
+
+            result["initialPages"], result["totalPages"] = (
+                await self.page_ops.list_collection_pages(coll_id, page_size=25)
+            )
+
+            public = "public/" if public_or_unlisted_only else ""
+            result["pagesQueryUrl"] = (
+                get_origin(headers)
+                + f"/api/orgs/{org.id}/collections/{coll_id}/{public}pages"
+            )

         thumbnail = result.get("thumbnail")
         if thumbnail:
@@ -369,7 +388,7 @@ class CollectionOps:
         if result.get("access") not in allowed_access:
             raise HTTPException(status_code=404, detail="collection_not_found")

-        result["resources"] = await self.get_collection_crawl_resources(coll_id)
+        result["resources"], _ = await self.get_collection_crawl_resources(coll_id)

         thumbnail = result.get("thumbnail")
         if thumbnail:
@@ -468,7 +487,11 @@ class CollectionOps:
         collections: List[Union[CollOut, PublicCollOut]] = []

         for res in items:
-            res["resources"] = await self.get_collection_crawl_resources(res["_id"])
+            res["resources"], res["preloadResources"] = (
+                await self.get_collection_crawl_resources(
+                    res["_id"], include_preloads=not public_colls_out
+                )
+            )

             thumbnail = res.get("thumbnail")
             if thumbnail:
@@ -490,12 +513,14 @@ class CollectionOps:

         return collections, total

-    async def get_collection_crawl_resources(self, coll_id: UUID):
+    async def get_collection_crawl_resources(
+        self, coll_id: UUID, include_preloads=False
+    ):
         """Return pre-signed resources for all collection crawl files."""
         # Ensure collection exists
         _ = await self.get_collection_raw(coll_id)

-        all_files = []
+        resources = []

         crawls, _ = await self.crawl_ops.list_all_base_crawls(
             collection_id=coll_id,
@@ -506,9 +531,36 @@ class CollectionOps:

         for crawl in crawls:
             if crawl.resources:
-                all_files.extend(crawl.resources)
+                resources.extend(crawl.resources)

-        return all_files
+        preload_resources: List[PreloadResource] = []
+
+        if include_preloads:
+            no_page_items = await self.get_collection_resources_with_no_pages(crawls)
+            for item in no_page_items:
+                preload_resources.append(item)
+
+        return resources, preload_resources
+
+    async def get_collection_resources_with_no_pages(
+        self, crawls: List[CrawlOutWithResources]
+    ) -> List[PreloadResource]:
+        """Return wacz files in collection that have no pages"""
+        resources_no_pages: List[PreloadResource] = []
+
+        for crawl in crawls:
+            _, page_count = await self.page_ops.list_pages(crawl.id)
+            if page_count == 0 and crawl.resources:
+                for resource in crawl.resources:
+                    resources_no_pages.append(
+                        PreloadResource(
+                            name=os.path.basename(resource.name),
+                            crawlId=crawl.id,
+                            hasPages=False,
+                        )
+                    )
+
+        return resources_no_pages

     async def get_collection_names(self, uuids: List[UUID]):
         """return object of {_id, names} given list of collection ids"""
@@ -528,9 +580,15 @@ class CollectionOps:
         names = [name for name in names if name]
         return {"names": names}

-    async def get_collection_crawl_ids(self, coll_id: UUID) -> List[str]:
-        """Return list of crawl ids in collection"""
+    async def get_collection_crawl_ids(
+        self, coll_id: UUID, public_or_unlisted_only=False
+    ) -> List[str]:
+        """Return list of crawl ids in collection, including only public collections"""
         crawl_ids = []
+        # ensure collection is public or unlisted, else throw here
+        if public_or_unlisted_only:
+            await self.get_collection_raw(coll_id, public_or_unlisted_only)
+
         async for crawl_raw in self.crawls.find(
             {"collectionIds": coll_id}, projection=["_id"]
         ):
@@ -1010,8 +1068,8 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         try:
             all_collections, _ = await colls.list_collections(org, page_size=10_000)
             for collection in all_collections:
-                results[collection.name] = await colls.get_collection_crawl_resources(
-                    collection.id
+                results[collection.name], _ = (
+                    await colls.get_collection_crawl_resources(collection.id)
                 )
         except Exception as exc:
             # pylint: disable=raise-missing-from
@@ -1047,9 +1105,11 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         response_model=CollOut,
     )
     async def get_collection_replay(
-        coll_id: UUID, org: Organization = Depends(org_viewer_dep)
+        request: Request, coll_id: UUID, org: Organization = Depends(org_viewer_dep)
     ):
-        return await colls.get_collection_out(coll_id, org, resources=True)
+        return await colls.get_collection_out(
+            coll_id, org, resources=True, headers=dict(request.headers)
+        )

     @app.get(
         "/orgs/{oid}/collections/{coll_id}/public/replay.json",
@@ -1057,12 +1117,17 @@ def init_collections_api(app, mdb, orgs, storage_ops, event_webhook_ops, user_de
         response_model=CollOut,
     )
     async def get_collection_public_replay(
+        request: Request,
         response: Response,
         coll_id: UUID,
         org: Organization = Depends(org_public),
     ):
         coll = await colls.get_collection_out(
-            coll_id, org, resources=True, public_or_unlisted_only=True
+            coll_id,
+            org,
+            resources=True,
+            public_or_unlisted_only=True,
+            headers=dict(request.headers),
         )
         response.headers["Access-Control-Allow-Origin"] = "*"
         response.headers["Access-Control-Allow-Headers"] = "*"

@@ -881,14 +881,6 @@ class CrawlOut(BaseMongoModel):
     errorPageCount: Optional[int] = 0


-# ============================================================================
-class CrawlOutWithResources(CrawlOut):
-    """Crawl output model including resources"""
-
-    resources: Optional[List[CrawlFileOut]] = []
-    collections: Optional[List[CollIdName]] = []
-
-
 # ============================================================================
 class UpdateCrawl(BaseModel):
     """Update crawl"""
@@ -1222,6 +1214,173 @@ class ImageFilePreparer(FilePreparer):
         )


+# ============================================================================
+
+### PAGES ###
+
+
+# ============================================================================
+class PageReviewUpdate(BaseModel):
+    """Update model for page manual review/approval"""
+
+    approved: Optional[bool] = None
+
+
+# ============================================================================
+class PageNoteIn(BaseModel):
+    """Input model for adding page notes"""
+
+    text: str
+
+
+# ============================================================================
+class PageNoteEdit(BaseModel):
+    """Input model for editing page notes"""
+
+    id: UUID
+    text: str
+
+
+# ============================================================================
+class PageNoteDelete(BaseModel):
+    """Delete model for page notes"""
+
+    delete_list: List[UUID] = []
+
+
+# ============================================================================
+class PageNote(BaseModel):
+    """Model for page notes, tracking user and time"""
+
+    id: UUID
+    text: str
+    created: datetime
+    userid: UUID
+    userName: str
+
+
+# ============================================================================
+class PageQACompare(BaseModel):
+    """Model for updating pages from QA run"""
+
+    screenshotMatch: Optional[float] = None
+    textMatch: Optional[float] = None
+    resourceCounts: Optional[Dict[str, int]] = None
+
+
+# ============================================================================
+class Page(BaseMongoModel):
+    """Core page data, no QA"""
+
+    id: UUID
+
+    oid: UUID
+    crawl_id: str
+
+    # core page data
+    url: AnyHttpUrl
+    title: Optional[str] = None
+    ts: Optional[datetime] = None
+    loadState: Optional[int] = None
+    status: Optional[int] = None
+    mime: Optional[str] = None
+    filename: Optional[str] = None
+    depth: Optional[int] = None
+    favIconUrl: Optional[AnyHttpUrl] = None
+    isSeed: Optional[bool] = False
+
+    # manual review
+    userid: Optional[UUID] = None
+    modified: Optional[datetime] = None
+    approved: Optional[bool] = None
+    notes: List[PageNote] = []
+
+    isFile: Optional[bool] = False
+    isError: Optional[bool] = False
+
+    def compute_page_type(self):
+        """sets self.isFile or self.isError flags"""
+        self.isFile = False
+        self.isError = False
+        if self.loadState == 2:
+            # pylint: disable=unsupported-membership-test
+            if self.mime and "html" not in self.mime:
+                self.isFile = True
+            elif self.title is None and self.status == 200:
+                self.isFile = True
+
+        elif self.loadState == 0:
+            self.isError = True
+
+
+# ============================================================================
+class PageWithAllQA(Page):
+    """Model for core page data + qa"""
+
+    # automated heuristics, keyed by QA run id
+    qa: Optional[Dict[str, PageQACompare]] = {}
+
+
+# ============================================================================
+class PageOut(Page):
+    """Model for pages output, no QA"""
+
+    status: int = 200
+
+
+# ============================================================================
+class PageOutWithSingleQA(Page):
+    """Page out with single QA entry"""
+
+    qa: Optional[PageQACompare] = None
+
+
+# ============================================================================
+class PageNoteAddedResponse(BaseModel):
+    """Model for response to adding page"""
+
+    added: bool
+    data: PageNote
+
+
+# ============================================================================
+class PageNoteUpdatedResponse(BaseModel):
+    """Model for response to updating page"""
+
+    updated: bool
+    data: PageNote
+
+
+# ============================================================================
+class PageIdTimestamp(BaseModel):
+    """Simplified model for page info to include in PageUrlCount"""
+
+    pageId: UUID
+    ts: Optional[datetime] = None
+    status: int = 200
+
+
+# ============================================================================
+class PageUrlCount(BaseModel):
+    """Model for counting pages by URL"""
+
+    url: AnyHttpUrl
+    count: int = 0
+    snapshots: List[PageIdTimestamp] = []
+
+
+# ============================================================================
+class CrawlOutWithResources(CrawlOut):
+    """Crawl output model including resources"""
+
+    resources: Optional[List[CrawlFileOut]] = []
+    collections: Optional[List[CollIdName]] = []
+
+    initialPages: List[PageOut] = []
+    totalPages: Optional[int] = None
+    pagesQueryUrl: str = ""
+
+
 # ============================================================================

 ### COLLECTIONS ###
@@ -1245,6 +1404,15 @@ class CollectionThumbnailSource(BaseModel):
     urlPageId: UUID


+# ============================================================================
+class PreloadResource(BaseModel):
+    """Resources that will preloaded in RWP"""
+
+    name: str
+    crawlId: str
+    hasPages: bool
+
+
 # ============================================================================
 class Collection(BaseMongoModel):
     """Org collection structure"""
@@ -1338,6 +1506,11 @@ class CollOut(BaseMongoModel):

     allowPublicDownload: bool = True

+    initialPages: List[PageOut] = []
+    totalPages: Optional[int] = None
+    preloadResources: List[PreloadResource] = []
+    pagesQueryUrl: str = ""
+

 # ============================================================================
 class PublicCollOut(BaseMongoModel):
@@ -2435,161 +2608,6 @@ AnyJob = RootModel[
 ]


 # ============================================================================

-### PAGES ###
-
-
-# ============================================================================
-class PageReviewUpdate(BaseModel):
-    """Update model for page manual review/approval"""
-
-    approved: Optional[bool] = None
-
-
-# ============================================================================
-class PageNoteIn(BaseModel):
-    """Input model for adding page notes"""
-
-    text: str
-
-
-# ============================================================================
-class PageNoteEdit(BaseModel):
-    """Input model for editing page notes"""
-
-    id: UUID
-    text: str
-
-
-# ============================================================================
-class PageNoteDelete(BaseModel):
-    """Delete model for page notes"""
-
-    delete_list: List[UUID] = []
-
-
-# ============================================================================
-class PageNote(BaseModel):
-    """Model for page notes, tracking user and time"""
-
-    id: UUID
-    text: str
-    created: datetime
-    userid: UUID
-    userName: str
-
-
-# ============================================================================
-class PageQACompare(BaseModel):
-    """Model for updating pages from QA run"""
-
-    screenshotMatch: Optional[float] = None
-    textMatch: Optional[float] = None
-    resourceCounts: Optional[Dict[str, int]] = None
-
-
-# ============================================================================
-class Page(BaseMongoModel):
-    """Core page data, no QA"""
-
-    id: UUID
-
-    oid: UUID
-    crawl_id: str
-
-    # core page data
-    url: AnyHttpUrl
-    title: Optional[str] = None
-    ts: Optional[datetime] = None
-    loadState: Optional[int] = None
-    status: Optional[int] = None
-    mime: Optional[str] = None
-    filename: Optional[str] = None
-    depth: Optional[int] = None
-    favIconUrl: Optional[AnyHttpUrl] = None
-    isSeed: Optional[bool] = False
-
-    # manual review
-    userid: Optional[UUID] = None
-    modified: Optional[datetime] = None
-    approved: Optional[bool] = None
-    notes: List[PageNote] = []
-
-    isFile: Optional[bool] = False
-    isError: Optional[bool] = False
-
-    def compute_page_type(self):
-        """sets self.isFile or self.isError flags"""
-        self.isFile = False
-        self.isError = False
-        if self.loadState == 2:
-            # pylint: disable=unsupported-membership-test
-            if self.mime and "html" not in self.mime:
-                self.isFile = True
-            elif self.title is None and self.status == 200:
-                self.isFile = True
-
-        elif self.loadState == 0:
-            self.isError = True
-
-
-# ============================================================================
-class PageWithAllQA(Page):
-    """Model for core page data + qa"""
-
-    # automated heuristics, keyed by QA run id
-    qa: Optional[Dict[str, PageQACompare]] = {}
-
-
-# ============================================================================
-class PageOut(Page):
-    """Model for pages output, no QA"""
-
-    status: int = 200
-
-
-# ============================================================================
-class PageOutWithSingleQA(Page):
-    """Page out with single QA entry"""
-
-    qa: Optional[PageQACompare] = None
-
-
-# ============================================================================
-class PageNoteAddedResponse(BaseModel):
-    """Model for response to adding page"""
-
-    added: bool
-    data: PageNote
-
-
-# ============================================================================
-class PageNoteUpdatedResponse(BaseModel):
-    """Model for response to updating page"""
-
-    updated: bool
-    data: PageNote
-
-
-# ============================================================================
-class PageIdTimestamp(BaseModel):
-    """Simplified model for page info to include in PageUrlCount"""
-
-    pageId: UUID
-    ts: Optional[datetime] = None
-    status: int = 200
-
-
-# ============================================================================
-class PageUrlCount(BaseModel):
-    """Model for counting pages by URL"""
-
-    url: AnyHttpUrl
-    count: int = 0
-    snapshots: List[PageIdTimestamp] = []
-
-
-# ============================================================================
-
 ### GENERIC RESPONSE MODELS ###

@@ -11,7 +11,7 @@ from datetime import datetime
 from typing import TYPE_CHECKING, Optional, Tuple, List, Dict, Any, Union
 from uuid import UUID, uuid4

-from fastapi import Depends, HTTPException, Request
+from fastapi import Depends, HTTPException, Request, Response
 import pymongo

 from .models import (
@@ -35,6 +35,7 @@ from .models import (
     DeletedResponse,
     PageNoteAddedResponse,
     PageNoteUpdatedResponse,
+    EmptyResponse,
 )
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 from .utils import str_to_date, str_list_to_bools, dt_now
@@ -503,6 +504,7 @@ class PageOps:
         self,
         crawl_id: str,
         org: Optional[Organization] = None,
+        search: Optional[str] = None,
         url: Optional[str] = None,
         url_prefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -534,6 +536,13 @@ class PageOps:
         if org:
             query["oid"] = org.id

+        if search:
+            search_regex = re.escape(urllib.parse.unquote(search))
+            query["$or"] = [
+                {"url": {"$regex": search_regex, "$options": "i"}},
+                {"title": {"$regex": search_regex, "$options": "i"}},
+            ]
+
         if url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
             regex_pattern = f"^{re.escape(url_prefix)}"
@@ -661,6 +670,7 @@ class PageOps:
         self,
         coll_id: UUID,
         org: Optional[Organization] = None,
+        search: Optional[str] = None,
         url: Optional[str] = None,
         url_prefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -670,6 +680,7 @@ class PageOps:
         page: int = 1,
         sort_by: Optional[str] = None,
         sort_direction: Optional[int] = -1,
+        public_or_unlisted_only=False,
     ) -> Tuple[Union[List[PageOut], List[PageOutWithSingleQA]], int]:
         """List all pages in collection, with optional filtering"""
         # pylint: disable=duplicate-code, too-many-locals, too-many-branches, too-many-statements
@@ -677,7 +688,9 @@ class PageOps:
         page = page - 1
         skip = page_size * page

-        crawl_ids = await self.coll_ops.get_collection_crawl_ids(coll_id)
+        crawl_ids = await self.coll_ops.get_collection_crawl_ids(
+            coll_id, public_or_unlisted_only
+        )

         query: dict[str, object] = {
             "crawl_id": {"$in": crawl_ids},
@@ -685,7 +698,14 @@ class PageOps:
         if org:
             query["oid"] = org.id

-        if url_prefix:
+        if search:
+            search_regex = re.escape(urllib.parse.unquote(search))
+            query["$or"] = [
+                {"url": {"$regex": search_regex, "$options": "i"}},
+                {"title": {"$regex": search_regex, "$options": "i"}},
+            ]
+
+        elif url_prefix:
             url_prefix = urllib.parse.unquote(url_prefix)
             regex_pattern = f"^{re.escape(url_prefix)}"
             query["url"] = {"$regex": regex_pattern, "$options": "i"}
@@ -724,6 +744,9 @@ class PageOps:
                 raise HTTPException(status_code=400, detail="invalid_sort_direction")

             aggregate.extend([{"$sort": {sort_by: sort_direction}}])
+        else:
+            # default sort: seeds first, then by timestamp
+            aggregate.extend([{"$sort": {"isSeed": -1, "ts": 1}}])

         aggregate.extend(
             [
@@ -886,6 +909,7 @@ def init_pages_api(

     org_viewer_dep = org_ops.org_viewer_dep
     org_crawl_dep = org_ops.org_crawl_dep
+    org_public = org_ops.org_public

     @app.post(
         "/orgs/{oid}/crawls/all/pages/reAdd",
@@ -1056,6 +1080,7 @@ def init_pages_api(
     async def get_crawl_pages_list(
         crawl_id: str,
         org: Organization = Depends(org_crawl_dep),
+        search: Optional[str] = None,
         url: Optional[str] = None,
         urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -1077,6 +1102,7 @@ def init_pages_api(
         pages, total = await ops.list_pages(
             crawl_id=crawl_id,
             org=org,
+            search=search,
             url=url,
             url_prefix=urlPrefix,
             ts=ts,
@@ -1093,13 +1119,15 @@ def init_pages_api(
         return paginated_format(pages, total, page, pageSize)

     @app.get(
-        "/orgs/{oid}/collections/{coll_id}/pages",
+        "/orgs/{oid}/collections/{coll_id}/public/pages",
         tags=["pages", "collections"],
         response_model=PaginatedPageOutResponse,
     )
-    async def get_collection_pages_list(
+    async def get_public_collection_pages_list(
         coll_id: UUID,
-        org: Organization = Depends(org_viewer_dep),
+        response: Response,
+        org: Organization = Depends(org_public),
+        search: Optional[str] = None,
         url: Optional[str] = None,
         urlPrefix: Optional[str] = None,
         ts: Optional[datetime] = None,
@@ -1114,6 +1142,58 @@ def init_pages_api(
         pages, total = await ops.list_collection_pages(
             coll_id=coll_id,
             org=org,
+            search=search,
             url=url,
             url_prefix=urlPrefix,
             ts=ts,
+            is_seed=isSeed,
+            depth=depth,
+            page_size=pageSize,
+            page=page,
+            sort_by=sortBy,
+            sort_direction=sortDirection,
+            public_or_unlisted_only=True,
+        )
+
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return paginated_format(pages, total, page, pageSize)
+
+    @app.options(
+        "/orgs/{oid}/collections/{coll_id}/public/pages",
+        tags=["pages", "collections"],
+        response_model=EmptyResponse,
+    )
+    async def get_replay_preflight(response: Response):
+        response.headers["Access-Control-Allow-Methods"] = "GET, HEAD, OPTIONS"
+        response.headers["Access-Control-Allow-Origin"] = "*"
+        response.headers["Access-Control-Allow-Headers"] = "*"
+        return {}
+
+    @app.get(
+        "/orgs/{oid}/collections/{coll_id}/pages",
+        tags=["pages", "collections"],
+        response_model=PaginatedPageOutResponse,
+    )
+    async def get_collection_pages_list(
+        coll_id: UUID,
+        org: Organization = Depends(org_viewer_dep),
+        search: Optional[str] = None,
+        url: Optional[str] = None,
+        urlPrefix: Optional[str] = None,
+        ts: Optional[datetime] = None,
+        isSeed: Optional[bool] = None,
+        depth: Optional[int] = None,
+        pageSize: int = DEFAULT_PAGE_SIZE,
+        page: int = 1,
+        sortBy: Optional[str] = None,
+        sortDirection: Optional[int] = -1,
+    ):
+        """Retrieve paginated list of pages in collection"""
+        pages, total = await ops.list_collection_pages(
+            coll_id=coll_id,
+            org=org,
+            search=search,
+            url=url,
+            url_prefix=urlPrefix,
+            ts=ts,

@@ -382,6 +382,11 @@ def test_get_collection_replay(crawler_auth_headers, default_org_id):
     assert data["dateEarliest"]
     assert data["dateLatest"]
     assert data["defaultThumbnailName"]
+    assert data["initialPages"]
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/collections/{_coll_id}/pages"
+    )
+    assert "preloadResources" in data

     resources = data["resources"]
     assert resources
@@ -413,10 +418,27 @@ def test_collection_public(crawler_auth_headers, default_org_id):
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/replay.json",
         headers=crawler_auth_headers,
     )
+    data = r.json()
+    assert data["initialPages"]
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/collections/{_coll_id}/public/pages"
+    )
+    assert "preloadResources" in data
+
     assert r.status_code == 200
     assert r.headers["Access-Control-Allow-Origin"] == "*"
    assert r.headers["Access-Control-Allow-Headers"] == "*"

+    # test public pages endpoint
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert data["total"] > 0
+    assert data["items"]
+
     # make unlisted and test replay headers
     r = requests.patch(
         f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}",
@@ -451,6 +473,12 @@ def test_collection_public(crawler_auth_headers, default_org_id):
     )
     assert r.status_code == 404

+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/public/pages",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 404
+

 def test_collection_access_invalid_value(crawler_auth_headers, default_org_id):
     r = requests.patch(
@@ -614,6 +642,39 @@ def test_list_pages_in_collection(crawler_auth_headers, default_org_id):
     coll_page_id = coll_page["id"]
     coll_page_url = coll_page["url"]
     coll_page_ts = coll_page["ts"]
+    coll_page_title = coll_page["title"]
+
+    # Test search filter
+    partial_title = coll_page_title[:5]
+    partial_url = coll_page_url[:8]
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_title}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert (
+            partial_title in matching_page["title"]
+            or partial_url in matching_page["url"]
+        )
+
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/collections/{_coll_id}/pages?search={partial_url}",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+
+    assert data["total"] >= 1
+    for matching_page in data["items"]:
+        assert (
+            partial_title in matching_page["title"]
+            or partial_url in matching_page["url"]
+        )
+
     # Test exact url filter
     r = requests.get(

@@ -184,6 +184,11 @@ def test_wait_for_complete(admin_auth_headers, default_org_id):
     assert len(data["resources"]) == 1
     assert data["resources"][0]["path"]

+    assert len(data["initialPages"]) == 1
+    assert data["pagesQueryUrl"].endswith(
+        f"/orgs/{default_org_id}/crawls/{admin_crawl_id}/pages"
+    )
+
     # ensure filename matches specified pattern
     # set in default_crawl_filename_template
     assert re.search("/[\\d]+-testing-[\\w-]+\\.wacz", data["resources"][0]["path"])