Ensure collection stats are updated when WACZ is added on upload (#2351)
Fixes #2350. Collection earliest/latest dates and the collection modified date are now also updated when crawls or uploads are added to a collection via the collection auto-add feature.
This commit is contained in:
		
							parent
							
								
									b0aebb599a
								
							
						
					
					
						commit
						0a8df62ab4
					
				| @ -684,11 +684,19 @@ class CollectionOps: | ||||
|         ) | ||||
| 
 | ||||
|     async def update_crawl_collections(self, crawl_id: str): | ||||
|         """Update counts and tags for all collections in crawl""" | ||||
|         """Update counts, dates, and modified for all collections in crawl""" | ||||
|         crawl = await self.crawls.find_one({"_id": crawl_id}) | ||||
|         crawl_coll_ids = crawl.get("collectionIds") | ||||
|         for collection_id in crawl_coll_ids: | ||||
|             await self.update_collection_counts_and_tags(collection_id) | ||||
|         modified = dt_now() | ||||
| 
 | ||||
|         for coll_id in crawl_coll_ids: | ||||
|             await self.update_collection_counts_and_tags(coll_id) | ||||
|             await self.update_collection_dates(coll_id) | ||||
|             await self.collections.find_one_and_update( | ||||
|                 {"_id": coll_id}, | ||||
|                 {"$set": {"modified": modified}}, | ||||
|                 return_document=pymongo.ReturnDocument.AFTER, | ||||
|             ) | ||||
| 
 | ||||
|     async def add_successful_crawl_to_collections(self, crawl_id: str, cid: UUID): | ||||
|         """Add successful crawl to its auto-add collections.""" | ||||
|  | ||||
| @ -190,7 +190,9 @@ class UploadOps(BaseCrawlOps): | ||||
|             self.event_webhook_ops.create_upload_finished_notification(crawl_id, org.id) | ||||
|         ) | ||||
| 
 | ||||
|         asyncio.create_task(self.page_ops.add_crawl_pages_to_db_from_wacz(crawl_id)) | ||||
|         asyncio.create_task( | ||||
|             self._add_pages_and_update_collections(crawl_id, collections) | ||||
|         ) | ||||
| 
 | ||||
|         await self.orgs.inc_org_bytes_stored(org.id, file_size, "upload") | ||||
| 
 | ||||
| @ -204,6 +206,13 @@ class UploadOps(BaseCrawlOps): | ||||
| 
 | ||||
|         return {"id": crawl_id, "added": True, "storageQuotaReached": quota_reached} | ||||
| 
 | ||||
|     async def _add_pages_and_update_collections( | ||||
|         crawl_id: str, collections: Optional[List[str]] = None | ||||
|     ): | ||||
|         await self.page_ops.add_crawl_pages_to_db_from_wacz(crawl_id) | ||||
|         if collections: | ||||
|             await self.colls.update_crawl_collections(crawl_id) | ||||
| 
 | ||||
|     async def delete_uploads( | ||||
|         self, | ||||
|         delete_list: DeleteCrawlList, | ||||
|  | ||||
| @ -285,6 +285,26 @@ def test_get_upload_pages(admin_auth_headers, default_org_id, upload_id): | ||||
|     assert data["uniquePageCount"] > 0 | ||||
| 
 | ||||
| 
 | ||||
| def test_uploads_collection_updated( | ||||
|     admin_auth_headers, default_org_id, uploads_collection_id, upload_id | ||||
| ): | ||||
|     # Verify that collection is updated when WACZ is added on upload | ||||
|     r = requests.get( | ||||
|         f"{API_PREFIX}/orgs/{default_org_id}/collections/{uploads_collection_id}", | ||||
|         headers=admin_auth_headers, | ||||
|     ) | ||||
|     assert r.status_code == 200 | ||||
|     data = r.json() | ||||
| 
 | ||||
|     assert data["crawlCount"] > 0 | ||||
|     assert data["pageCount"] > 0 | ||||
|     assert data["uniquePageCount"] > 0 | ||||
|     assert data["totalSize"] > 0 | ||||
|     assert data["dateEarliest"] | ||||
|     assert data["dateLatest"] | ||||
|     assert data["modified"] > data["created"] | ||||
| 
 | ||||
| 
 | ||||
| def test_replace_upload( | ||||
|     admin_auth_headers, default_org_id, uploads_collection_id, upload_id | ||||
| ): | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user