Backend work for #1859 - Remove file count from qa stats endpoint - Compute isFile or isError per page when page is added - Increment filePageCount and errorPageCount per crawl to count number of isFile or isError pages - Add file and error counts to crawl replay.json endpoint (filePageCount and errorPageCount) - Add migration 0028 to set isFile / isError for each page, aggregate filePageCount / errorPageCount per crawl - Determine if page is a file based on loadState == 2, mime type or status code and lack of title
		
			
				
	
	
		
			72 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			72 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
"""
 | 
						|
Migration 0028 - Page files and errors
 | 
						|
"""
 | 
						|
 | 
						|
from btrixcloud.migrations import BaseMigration
 | 
						|
from btrixcloud.models import Page, Crawl
 | 
						|
 | 
						|
 | 
						|
MIGRATION_VERSION = "0028"
 | 
						|
 | 
						|
 | 
						|
class Migration(BaseMigration):
    """Migration 0028: backfill per-crawl file and error page counts.

    For every crawl that has not yet been migrated (no ``filePageCount``),
    recompute ``Page.isFile`` / ``Page.isError`` for each of its pages and
    aggregate the totals onto the crawl document as ``filePageCount`` and
    ``errorPageCount``.
    """

    # pylint: disable=unused-argument
    def __init__(self, mdb, **kwargs):
        """Store the database handle and pin this migration's version."""
        super().__init__(mdb, migration_version=MIGRATION_VERSION)

    async def migrate_up(self):
        """Perform migration up.

        Update older crawls and their pages:
        - Add crawl.filePageCount and crawl.errorPageCount
        - Set Page.isFile and Page.isError
        """
        pages_db = self.mdb["pages"]
        crawls_db = self.mdb["crawls"]

        # Matches crawls where filePageCount is null OR missing (Mongo
        # treats both the same for a None equality query), i.e. not yet
        # migrated.
        crawl_cursor = crawls_db.find({"type": "crawl", "filePageCount": None})
        async for crawl_dict in crawl_cursor:
            try:
                crawl = Crawl.from_dict(crawl_dict)
                crawl.filePageCount = 0
                crawl.errorPageCount = 0

                # Use a distinct cursor name so the outer crawl cursor is
                # not shadowed while it is still being iterated.
                page_cursor = pages_db.find({"crawl_id": crawl.id})
                async for page_dict in page_cursor:
                    page = Page.from_dict(page_dict)

                    # Derives isFile / isError (per commit: from loadState,
                    # mime type / status code, and lack of title).
                    page.compute_page_type()
                    if page.isFile:
                        crawl.filePageCount += 1

                    if page.isError:
                        crawl.errorPageCount += 1

                    # Persist flags only when at least one is set; other
                    # pages are left untouched. update_one suffices since
                    # the returned document was never used.
                    if page.isFile or page.isError:
                        await pages_db.update_one(
                            {"_id": page.id},
                            {
                                "$set": page.dict(
                                    include={"isFile": True, "isError": True}
                                )
                            },
                        )

                # Write the aggregated counts back onto the crawl.
                await crawls_db.update_one(
                    {"_id": crawl.id, "type": "crawl"},
                    {
                        "$set": crawl.dict(
                            include={"filePageCount": True, "errorPageCount": True}
                        )
                    },
                )
            # pylint: disable=broad-exception-caught
            except Exception as err:
                # Best-effort: log and continue so one malformed crawl does
                # not abort the whole migration.
                crawl_id = crawl_dict.get("_id")
                print(
                    f"Error updating page counts and pages for crawl {crawl_id}: {err}",
                    flush=True,
                )
 |