Ensure collection stats are updated when WACZ is added on upload (#2351)

Fixes #2350 

Collection earliest/latest dates and the collection modified date are
also now updated when crawls or uploads are added to a collection via
the collection auto-add feature.
This commit is contained in:
Tessa Walsh 2025-01-30 16:05:56 -05:00 committed by GitHub
parent b0aebb599a
commit 0a8df62ab4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 41 additions and 4 deletions

View File

@ -684,11 +684,19 @@ class CollectionOps:
)
async def update_crawl_collections(self, crawl_id: str):
    """Update counts, dates, and modified for all collections in crawl

    Looks up the crawl document by ``crawl_id`` and, for every id listed in
    its ``collectionIds`` field, calls ``update_collection_counts_and_tags``
    and ``update_collection_dates`` and then sets the collection's
    ``modified`` field. Returns without doing anything when the crawl is not
    found or it belongs to no collections.
    """
    crawl = await self.crawls.find_one({"_id": crawl_id})
    if not crawl:
        # Nothing to refresh if the crawl document does not exist
        return

    # "collectionIds" may be absent or None; treat both as "no collections"
    crawl_coll_ids = crawl.get("collectionIds") or []
    if not crawl_coll_ids:
        return

    # One timestamp shared by every collection touched in this pass
    modified = dt_now()

    for coll_id in crawl_coll_ids:
        await self.update_collection_counts_and_tags(coll_id)
        await self.update_collection_dates(coll_id)
        await self.collections.find_one_and_update(
            {"_id": coll_id},
            {"$set": {"modified": modified}},
            return_document=pymongo.ReturnDocument.AFTER,
        )
async def add_successful_crawl_to_collections(self, crawl_id: str, cid: UUID):
"""Add successful crawl to its auto-add collections."""

View File

@ -190,7 +190,9 @@ class UploadOps(BaseCrawlOps):
self.event_webhook_ops.create_upload_finished_notification(crawl_id, org.id)
)
asyncio.create_task(self.page_ops.add_crawl_pages_to_db_from_wacz(crawl_id))
asyncio.create_task(
self._add_pages_and_update_collections(crawl_id, collections)
)
await self.orgs.inc_org_bytes_stored(org.id, file_size, "upload")
@ -204,6 +206,13 @@ class UploadOps(BaseCrawlOps):
return {"id": crawl_id, "added": True, "storageQuotaReached": quota_reached}
async def _add_pages_and_update_collections(
    self, crawl_id: str, collections: Optional[List[str]] = None
):
    """Add pages from the uploaded WACZ, then update the crawl's collections

    The collection update is awaited only after the page import has finished,
    and only when ``collections`` is non-empty.
    """
    await self.page_ops.add_crawl_pages_to_db_from_wacz(crawl_id)

    # Skip the collection pass when no collections were given for the upload
    if collections:
        await self.colls.update_crawl_collections(crawl_id)
async def delete_uploads(
self,
delete_list: DeleteCrawlList,

View File

@ -285,6 +285,26 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
assert data["uniquePageCount"] > 0
def test_uploads_collection_updated(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    """Verify that collection is updated when WACZ is added on upload

    Fetches the uploads collection and checks that its counts and size are
    positive, its earliest/latest dates are set, and ``modified`` is later
    than ``created``.
    """
    # upload_id is unused in the body; presumably requested so the upload
    # fixture has run before this check -- confirm against the fixtures
    coll_url = (
        f"{API_PREFIX}/orgs/{default_org_id}/collections/{uploads_collection_id}"
    )
    resp = requests.get(coll_url, headers=admin_auth_headers)
    assert resp.status_code == 200

    coll = resp.json()

    # Numeric stats must all be positive
    for stat in ("crawlCount", "pageCount", "uniquePageCount", "totalSize"):
        assert coll[stat] > 0

    # Date range must be populated
    for date_field in ("dateEarliest", "dateLatest"):
        assert coll[date_field]

    assert coll["modified"] > coll["created"]
def test_replace_upload(
admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):