Ensure collection stats are updated when WACZ is added on upload (#2351)
Fixes #2350. Collection earliest/latest dates and the collection modified date are now also updated when crawls or uploads are added to a collection via the collection auto-add feature.
This commit is contained in:
parent
b0aebb599a
commit
0a8df62ab4
@ -684,11 +684,19 @@ class CollectionOps:
|
||||
)
|
||||
|
||||
async def update_crawl_collections(self, crawl_id: str):
    """Update counts, dates, and modified for all collections in crawl.

    Looks up the crawl document by ``crawl_id`` and, for every collection id
    listed under its ``collectionIds`` key, recomputes the collection's
    counts/tags and dates and stamps ``modified`` with one shared timestamp.

    Does nothing when the crawl cannot be found or lists no collections.
    """
    crawl = await self.crawls.find_one({"_id": crawl_id})

    # The lookup may come back empty (presumably None for an unknown id), and
    # a crawl document may have no "collectionIds" key or a null value there.
    # In all of those cases there is nothing to refresh.
    crawl_coll_ids = (crawl or {}).get("collectionIds") or []
    if not crawl_coll_ids:
        return

    # Computed once so every collection touched by this call carries the
    # same modified timestamp.
    modified = dt_now()

    for coll_id in crawl_coll_ids:
        await self.update_collection_counts_and_tags(coll_id)
        await self.update_collection_dates(coll_id)
        await self.collections.find_one_and_update(
            {"_id": coll_id},
            {"$set": {"modified": modified}},
            return_document=pymongo.ReturnDocument.AFTER,
        )
|
||||
|
||||
async def add_successful_crawl_to_collections(self, crawl_id: str, cid: UUID):
|
||||
"""Add successful crawl to its auto-add collections."""
|
||||
|
@ -190,7 +190,9 @@ class UploadOps(BaseCrawlOps):
|
||||
self.event_webhook_ops.create_upload_finished_notification(crawl_id, org.id)
|
||||
)
|
||||
|
||||
asyncio.create_task(self.page_ops.add_crawl_pages_to_db_from_wacz(crawl_id))
|
||||
asyncio.create_task(
|
||||
self._add_pages_and_update_collections(crawl_id, collections)
|
||||
)
|
||||
|
||||
await self.orgs.inc_org_bytes_stored(org.id, file_size, "upload")
|
||||
|
||||
@ -204,6 +206,13 @@ class UploadOps(BaseCrawlOps):
|
||||
|
||||
return {"id": crawl_id, "added": True, "storageQuotaReached": quota_reached}
|
||||
|
||||
async def _add_pages_and_update_collections(
    self, crawl_id: str, collections: Optional[List[str]] = None
):
    """Add the upload's pages to the database, then refresh its collections.

    The page import is awaited first so that the collection update runs only
    after ``add_crawl_pages_to_db_from_wacz`` has finished for this crawl.

    Args:
        crawl_id: id of the uploaded crawl whose WACZ pages are imported.
        collections: collection ids supplied with the upload; when empty or
            None the collection update step is skipped.
    """
    await self.page_ops.add_crawl_pages_to_db_from_wacz(crawl_id)

    # Only the truthiness of `collections` is used here; the collection ids
    # themselves are not passed on to update_crawl_collections.
    if collections:
        await self.colls.update_crawl_collections(crawl_id)
|
||||
|
||||
async def delete_uploads(
|
||||
self,
|
||||
delete_list: DeleteCrawlList,
|
||||
|
@ -285,6 +285,26 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id):
|
||||
assert data["uniquePageCount"] > 0
|
||||
|
||||
|
||||
def test_uploads_collection_updated(
    admin_auth_headers, default_org_id, uploads_collection_id, upload_id
):
    """Check that a collection is refreshed when a WACZ is added on upload.

    ``upload_id`` is not referenced in the body; presumably it is requested
    so the upload fixture has run before the collection is fetched.
    """
    url = f"{API_PREFIX}/orgs/{default_org_id}/collections/{uploads_collection_id}"
    response = requests.get(url, headers=admin_auth_headers)
    assert response.status_code == 200

    coll = response.json()

    # Aggregate stats must all be non-zero once the upload is counted.
    for field in ("crawlCount", "pageCount", "uniquePageCount", "totalSize"):
        assert coll[field] > 0

    # Date range fields must be populated.
    for field in ("dateEarliest", "dateLatest"):
        assert coll[field]

    # The collection must have been modified after it was created.
    assert coll["modified"] > coll["created"]
|
||||
|
||||
|
||||
def test_replace_upload(
|
||||
admin_auth_headers, default_org_id, uploads_collection_id, upload_id
|
||||
):
|
||||
|
Loading…
Reference in New Issue
Block a user