Backend work for #1859 - Remove file count from qa stats endpoint - Compute isFile or isError per page when page is added - Increment filePageCount and errorPageCount per crawl to count number of isFile or isError pages - Add file and error counts to crawl replay.json endpoint (filePageCount and errorPageCount) - Add migration 0028 to set isFile / isError for each page, aggregate filePageCount / errorPageCount per crawl - Determine if page is a file based on loadState == 2, mime type or status code and lack of title
		
			
				
	
	
		
			72 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			72 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""
Migration 0028 - Page files and errors
"""

from btrixcloud.migrations import BaseMigration
from btrixcloud.models import Page, Crawl


# Version recorded by BaseMigration so this migration runs exactly once
MIGRATION_VERSION = "0028"
 | |
class Migration(BaseMigration):
    """Migration 0028: backfill file/error page data on crawls and pages.

    For every crawl not yet migrated, recomputes each page's isFile/isError
    flags and aggregates them into crawl.filePageCount / crawl.errorPageCount.
    """

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

    async def migrate_up(self):
        """Perform migration up.

        Update older crawls and their pages:
        - Add crawl.filePageCount and crawl.errorPageCount
        - Set Page.isFile and Page.isError

        Failures are logged per crawl and do not abort the migration.
        """
        pages_db = self.mdb["pages"]
        crawls_db = self.mdb["crawls"]

        # filePageCount == None matches both missing and explicitly-null
        # fields, i.e. crawls created before this migration.
        crawl_cursor = crawls_db.find({"type": "crawl", "filePageCount": None})
        async for crawl_dict in crawl_cursor:
            try:
                crawl = Crawl.from_dict(crawl_dict)
                crawl.filePageCount = 0
                crawl.errorPageCount = 0

                # Distinct cursor name: the original rebound `cursor` here
                # while the outer async-for was still iterating it, which
                # only worked because __aiter__ was captured up front.
                page_cursor = pages_db.find({"crawl_id": crawl.id})
                async for page_dict in page_cursor:
                    page = Page.from_dict(page_dict)

                    # Derives isFile/isError from loadState, mime type,
                    # status code, and title (see Page model).
                    page.compute_page_type()
                    if page.isFile:
                        crawl.filePageCount += 1

                    if page.isError:
                        crawl.errorPageCount += 1

                    if page.isFile or page.isError:
                        # update_one: the returned document was unused, so
                        # find_one_and_update was needless overhead.
                        await pages_db.update_one(
                            {"_id": page.id},
                            {
                                "$set": page.dict(
                                    include={"isFile": True, "isError": True}
                                )
                            },
                        )

                await crawls_db.update_one(
                    {"_id": crawl.id, "type": "crawl"},
                    {
                        "$set": crawl.dict(
                            include={"filePageCount": True, "errorPageCount": True}
                        )
                    },
                )
            # pylint: disable=broad-exception-caught
            except Exception as err:
                # Best-effort: report and continue with the next crawl.
                crawl_id = crawl_dict.get("_id")
                print(
                    f"Error updating page counts and pages for crawl {crawl_id}: {err}",
                    flush=True,
                )
 |