Ensure QA run WACZs are deleted (#1715)

- When QA run is deleted
- When crawl is deleted

Also adds tests for WACZ deletion. Fixes #1713.
This commit is contained in:
parent 127315189f
commit b8caeb88e9
@@ -23,6 +23,7 @@ from .models import (
     StorageRef,
     RUNNING_AND_STARTING_STATES,
     SUCCESSFUL_STATES,
+    QARun,
 )
 from .pagination import paginated_format, DEFAULT_PAGE_SIZE
 from .utils import dt_now
@@ -317,6 +318,7 @@ class BaseCrawlOps:

             if type_ == "crawl":
                 await self.page_ops.delete_crawl_pages(crawl_id, org.id)
+                await self.delete_all_crawl_qa_files(crawl_id, org)

             crawl_size = await self._delete_crawl_files(crawl, org)
             size += crawl_size
@@ -351,16 +353,20 @@ class BaseCrawlOps:

         return res.deleted_count, cids_to_update, quota_reached

-    async def _delete_crawl_files(self, crawl: BaseCrawl, org: Organization):
+    async def _delete_crawl_files(
+        self, crawl: Union[BaseCrawl, QARun], org: Organization
+    ):
         """Delete files associated with crawl from storage."""
         size = 0
         for file_ in crawl.files:
             size += file_.size
             if not await self.storage_ops.delete_crawl_file_object(org, file_):
                 raise HTTPException(status_code=400, detail="file_deletion_error")
-            await self.background_job_ops.create_delete_replica_jobs(
-                org, file_, crawl.id, crawl.type
-            )
+            # Not replicating QA run WACZs yet
+            if not isinstance(crawl, QARun):
+                await self.background_job_ops.create_delete_replica_jobs(
+                    org, file_, crawl.id, crawl.type
+                )

         return size

@@ -370,6 +376,14 @@ class BaseCrawlOps:
         org = await self.orgs.get_org_by_id(oid)
         return await self._delete_crawl_files(crawl, org)

+    async def delete_all_crawl_qa_files(self, crawl_id: str, org: Organization):
+        """Delete files for all qa runs in a crawl"""
+        crawl_raw = await self.get_crawl_raw(crawl_id)
+        qa_finished = crawl_raw.get("qaFinished", {})
+        for qa_run_raw in qa_finished.values():
+            qa_run = QARun(**qa_run_raw)
+            await self._delete_crawl_files(qa_run, org)
+
     async def _resolve_crawl_refs(
         self,
         crawl: Union[CrawlOut, CrawlOutWithResources],
@@ -828,11 +828,15 @@ class CrawlOps(BaseCrawlOps):
                 status_code=404, detail=f"crawl_not_found, (details: {exc})"
             )

-    async def delete_crawl_qa_runs(self, crawl_id: str, delete_list: DeleteQARunList):
+    async def delete_crawl_qa_runs(
+        self, crawl_id: str, delete_list: DeleteQARunList, org: Organization
+    ):
         """delete specified finished QA run"""

         count = 0
         for qa_run_id in delete_list.qa_run_ids:
+            await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id)
+            await self.delete_crawl_qa_run_files(crawl_id, qa_run_id, org)
+
             res = await self.crawls.find_one_and_update(
                 {"_id": crawl_id, "type": "crawl"},
                 {"$unset": {f"qaFinished.{qa_run_id}": ""}},
@@ -841,10 +845,21 @@ class CrawlOps(BaseCrawlOps):
             if res:
                 count += 1

-            await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id)
-
         return {"deleted": count}

+    async def delete_crawl_qa_run_files(
+        self, crawl_id: str, qa_run_id: str, org: Organization
+    ):
+        """delete crawl qa wacz files"""
+        qa_run = await self.get_qa_run(crawl_id, qa_run_id, org)
+        for file_ in qa_run.files:
+            if not await self.storage_ops.delete_crawl_file_object(org, file_):
+                raise HTTPException(status_code=400, detail="file_deletion_error")
+            # Not replicating QA run WACZs yet
+            # await self.background_job_ops.create_delete_replica_jobs(
+            #     org, file_, qa_run_id, "qa"
+            # )
+
     async def qa_run_finished(self, crawl_id: str):
         """clear active qa, add qa run to finished list, if successful"""
         crawl = await self.get_crawl(crawl_id)
@@ -900,10 +915,10 @@ class CrawlOps(BaseCrawlOps):
         qa = crawl_data.get("qa")
         return QARunOut(**qa) if qa else None

-    async def get_qa_run_for_replay(
+    async def get_qa_run(
         self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None
-    ) -> QARunWithResources:
-        """Fetch QA runs with resources for replay.json"""
+    ):
+        """Get QARun by id"""
         crawl = await self.get_crawl(crawl_id, org)
         qa_finished = crawl.qaFinished or {}
         qa_run = qa_finished.get(qa_run_id)
@@ -911,6 +926,15 @@ class CrawlOps(BaseCrawlOps):
         if not qa_run:
             raise HTTPException(status_code=404, detail="crawl_qa_not_found")

+        return qa_run
+
+    async def get_qa_run_for_replay(
+        self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None
+    ) -> QARunWithResources:
+        """Fetch QA runs with resources for replay.json"""
+        crawl = await self.get_crawl(crawl_id, org)
+        qa_run = await self.get_qa_run(crawl_id, qa_run_id, org)
+
         if not org:
             org = await self.orgs.get_org_by_id(crawl.oid)
             if not org:
@@ -1212,8 +1236,7 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
         qa_run_ids: DeleteQARunList,
         org: Organization = Depends(org_crawl_dep),
     ):
-        # pylint: disable=unused-argument
-        return await ops.delete_crawl_qa_runs(crawl_id, qa_run_ids)
+        return await ops.delete_crawl_qa_runs(crawl_id, qa_run_ids, org)

     @app.get(
         "/orgs/{oid}/crawls/{crawl_id}/qa",
@@ -549,6 +549,16 @@ def test_delete_qa_runs(
     qa_run_pages_ready,
     failed_qa_run_id,
 ):
+    # Get download links for QA WACZs
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/replay.json",
+        headers=crawler_auth_headers,
+    )
+    data = r.json()
+    assert len(data["resources"]) == 1
+    qa_wacz_url = data["resources"][0]["path"]
+
     # Delete QA runs
     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/delete",
         json={"qa_run_ids": [qa_run_id, failed_qa_run_id]},
@@ -575,6 +585,10 @@ def test_delete_qa_runs(
         time.sleep(5)
         count += 1

+    # Ensure QA WACZ was deleted
+    r = requests.get(f"http://localhost:30870{qa_wacz_url}")
+    assert r.status_code == 404
+
     # Ensure associated qa run information in pages is also deleted
     for qa_run in (qa_run_id, failed_qa_run_id):
         count = 0
@@ -864,6 +864,18 @@ def test_delete_crawls_crawler(
     assert r.status_code == 200
     assert r.json()["total"] > 0

+    # Get WACZ presigned urls for crawl about to be deleted
+    wacz_presigned_urls = []
+    r = requests.get(
+        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
+        headers=crawler_auth_headers,
+    )
+    assert r.status_code == 200
+    data = r.json()
+    assert len(data["resources"]) >= 1
+    for resource in data["resources"]:
+        wacz_presigned_urls.append(resource["path"])
+
     # Test that crawler user can delete own crawl
     r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
@@ -884,6 +896,11 @@ def test_delete_crawls_crawler(
     )
     assert r.status_code == 404

+    # Test that WACZs are deleted
+    for wacz_url in wacz_presigned_urls:
+        r = requests.get(f"http://localhost:30870{wacz_url}")
+        assert r.status_code == 404
+
     # Test that associated pages are also deleted
     r = requests.get(
         f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",