Ensure QA run WACZs are deleted (#1715)
Deletes QA run WACZs in two cases: when a QA run is deleted, and when a crawl is deleted. Also adds tests for WACZ deletion. Fixes #1713
This commit is contained in:
parent
127315189f
commit
b8caeb88e9
@ -23,6 +23,7 @@ from .models import (
|
|||||||
StorageRef,
|
StorageRef,
|
||||||
RUNNING_AND_STARTING_STATES,
|
RUNNING_AND_STARTING_STATES,
|
||||||
SUCCESSFUL_STATES,
|
SUCCESSFUL_STATES,
|
||||||
|
QARun,
|
||||||
)
|
)
|
||||||
from .pagination import paginated_format, DEFAULT_PAGE_SIZE
|
from .pagination import paginated_format, DEFAULT_PAGE_SIZE
|
||||||
from .utils import dt_now
|
from .utils import dt_now
|
||||||
@ -317,6 +318,7 @@ class BaseCrawlOps:
|
|||||||
|
|
||||||
if type_ == "crawl":
|
if type_ == "crawl":
|
||||||
await self.page_ops.delete_crawl_pages(crawl_id, org.id)
|
await self.page_ops.delete_crawl_pages(crawl_id, org.id)
|
||||||
|
await self.delete_all_crawl_qa_files(crawl_id, org)
|
||||||
|
|
||||||
crawl_size = await self._delete_crawl_files(crawl, org)
|
crawl_size = await self._delete_crawl_files(crawl, org)
|
||||||
size += crawl_size
|
size += crawl_size
|
||||||
@ -351,13 +353,17 @@ class BaseCrawlOps:
|
|||||||
|
|
||||||
return res.deleted_count, cids_to_update, quota_reached
|
return res.deleted_count, cids_to_update, quota_reached
|
||||||
|
|
||||||
async def _delete_crawl_files(self, crawl: BaseCrawl, org: Organization):
|
async def _delete_crawl_files(
|
||||||
|
self, crawl: Union[BaseCrawl, QARun], org: Organization
|
||||||
|
):
|
||||||
"""Delete files associated with crawl from storage."""
|
"""Delete files associated with crawl from storage."""
|
||||||
size = 0
|
size = 0
|
||||||
for file_ in crawl.files:
|
for file_ in crawl.files:
|
||||||
size += file_.size
|
size += file_.size
|
||||||
if not await self.storage_ops.delete_crawl_file_object(org, file_):
|
if not await self.storage_ops.delete_crawl_file_object(org, file_):
|
||||||
raise HTTPException(status_code=400, detail="file_deletion_error")
|
raise HTTPException(status_code=400, detail="file_deletion_error")
|
||||||
|
# Not replicating QA run WACZs yet
|
||||||
|
if not isinstance(crawl, QARun):
|
||||||
await self.background_job_ops.create_delete_replica_jobs(
|
await self.background_job_ops.create_delete_replica_jobs(
|
||||||
org, file_, crawl.id, crawl.type
|
org, file_, crawl.id, crawl.type
|
||||||
)
|
)
|
||||||
@ -370,6 +376,14 @@ class BaseCrawlOps:
|
|||||||
org = await self.orgs.get_org_by_id(oid)
|
org = await self.orgs.get_org_by_id(oid)
|
||||||
return await self._delete_crawl_files(crawl, org)
|
return await self._delete_crawl_files(crawl, org)
|
||||||
|
|
||||||
|
async def delete_all_crawl_qa_files(self, crawl_id: str, org: Organization):
|
||||||
|
"""Delete files for all qa runs in a crawl"""
|
||||||
|
crawl_raw = await self.get_crawl_raw(crawl_id)
|
||||||
|
qa_finished = crawl_raw.get("qaFinished", {})
|
||||||
|
for qa_run_raw in qa_finished.values():
|
||||||
|
qa_run = QARun(**qa_run_raw)
|
||||||
|
await self._delete_crawl_files(qa_run, org)
|
||||||
|
|
||||||
async def _resolve_crawl_refs(
|
async def _resolve_crawl_refs(
|
||||||
self,
|
self,
|
||||||
crawl: Union[CrawlOut, CrawlOutWithResources],
|
crawl: Union[CrawlOut, CrawlOutWithResources],
|
||||||
|
@ -828,11 +828,15 @@ class CrawlOps(BaseCrawlOps):
|
|||||||
status_code=404, detail=f"crawl_not_found, (details: {exc})"
|
status_code=404, detail=f"crawl_not_found, (details: {exc})"
|
||||||
)
|
)
|
||||||
|
|
||||||
async def delete_crawl_qa_runs(self, crawl_id: str, delete_list: DeleteQARunList):
|
async def delete_crawl_qa_runs(
|
||||||
|
self, crawl_id: str, delete_list: DeleteQARunList, org: Organization
|
||||||
|
):
|
||||||
"""delete specified finished QA run"""
|
"""delete specified finished QA run"""
|
||||||
|
|
||||||
count = 0
|
count = 0
|
||||||
for qa_run_id in delete_list.qa_run_ids:
|
for qa_run_id in delete_list.qa_run_ids:
|
||||||
|
await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id)
|
||||||
|
await self.delete_crawl_qa_run_files(crawl_id, qa_run_id, org)
|
||||||
|
|
||||||
res = await self.crawls.find_one_and_update(
|
res = await self.crawls.find_one_and_update(
|
||||||
{"_id": crawl_id, "type": "crawl"},
|
{"_id": crawl_id, "type": "crawl"},
|
||||||
{"$unset": {f"qaFinished.{qa_run_id}": ""}},
|
{"$unset": {f"qaFinished.{qa_run_id}": ""}},
|
||||||
@ -841,10 +845,21 @@ class CrawlOps(BaseCrawlOps):
|
|||||||
if res:
|
if res:
|
||||||
count += 1
|
count += 1
|
||||||
|
|
||||||
await self.page_ops.delete_qa_run_from_pages(crawl_id, qa_run_id)
|
|
||||||
|
|
||||||
return {"deleted": count}
|
return {"deleted": count}
|
||||||
|
|
||||||
|
async def delete_crawl_qa_run_files(
|
||||||
|
self, crawl_id: str, qa_run_id: str, org: Organization
|
||||||
|
):
|
||||||
|
"""delete crawl qa wacz files"""
|
||||||
|
qa_run = await self.get_qa_run(crawl_id, qa_run_id, org)
|
||||||
|
for file_ in qa_run.files:
|
||||||
|
if not await self.storage_ops.delete_crawl_file_object(org, file_):
|
||||||
|
raise HTTPException(status_code=400, detail="file_deletion_error")
|
||||||
|
# Not replicating QA run WACZs yet
|
||||||
|
# await self.background_job_ops.create_delete_replica_jobs(
|
||||||
|
# org, file_, qa_run_id, "qa"
|
||||||
|
# )
|
||||||
|
|
||||||
async def qa_run_finished(self, crawl_id: str):
|
async def qa_run_finished(self, crawl_id: str):
|
||||||
"""clear active qa, add qa run to finished list, if successful"""
|
"""clear active qa, add qa run to finished list, if successful"""
|
||||||
crawl = await self.get_crawl(crawl_id)
|
crawl = await self.get_crawl(crawl_id)
|
||||||
@ -900,10 +915,10 @@ class CrawlOps(BaseCrawlOps):
|
|||||||
qa = crawl_data.get("qa")
|
qa = crawl_data.get("qa")
|
||||||
return QARunOut(**qa) if qa else None
|
return QARunOut(**qa) if qa else None
|
||||||
|
|
||||||
async def get_qa_run_for_replay(
|
async def get_qa_run(
|
||||||
self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None
|
self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None
|
||||||
) -> QARunWithResources:
|
):
|
||||||
"""Fetch QA runs with resources for replay.json"""
|
"""Get QARun by id"""
|
||||||
crawl = await self.get_crawl(crawl_id, org)
|
crawl = await self.get_crawl(crawl_id, org)
|
||||||
qa_finished = crawl.qaFinished or {}
|
qa_finished = crawl.qaFinished or {}
|
||||||
qa_run = qa_finished.get(qa_run_id)
|
qa_run = qa_finished.get(qa_run_id)
|
||||||
@ -911,6 +926,15 @@ class CrawlOps(BaseCrawlOps):
|
|||||||
if not qa_run:
|
if not qa_run:
|
||||||
raise HTTPException(status_code=404, detail="crawl_qa_not_found")
|
raise HTTPException(status_code=404, detail="crawl_qa_not_found")
|
||||||
|
|
||||||
|
return qa_run
|
||||||
|
|
||||||
|
async def get_qa_run_for_replay(
|
||||||
|
self, crawl_id: str, qa_run_id: str, org: Optional[Organization] = None
|
||||||
|
) -> QARunWithResources:
|
||||||
|
"""Fetch QA runs with resources for replay.json"""
|
||||||
|
crawl = await self.get_crawl(crawl_id, org)
|
||||||
|
qa_run = await self.get_qa_run(crawl_id, qa_run_id, org)
|
||||||
|
|
||||||
if not org:
|
if not org:
|
||||||
org = await self.orgs.get_org_by_id(crawl.oid)
|
org = await self.orgs.get_org_by_id(crawl.oid)
|
||||||
if not org:
|
if not org:
|
||||||
@ -1212,8 +1236,7 @@ def init_crawls_api(crawl_manager: CrawlManager, app, user_dep, *args):
|
|||||||
qa_run_ids: DeleteQARunList,
|
qa_run_ids: DeleteQARunList,
|
||||||
org: Organization = Depends(org_crawl_dep),
|
org: Organization = Depends(org_crawl_dep),
|
||||||
):
|
):
|
||||||
# pylint: disable=unused-argument
|
return await ops.delete_crawl_qa_runs(crawl_id, qa_run_ids, org)
|
||||||
return await ops.delete_crawl_qa_runs(crawl_id, qa_run_ids)
|
|
||||||
|
|
||||||
@app.get(
|
@app.get(
|
||||||
"/orgs/{oid}/crawls/{crawl_id}/qa",
|
"/orgs/{oid}/crawls/{crawl_id}/qa",
|
||||||
|
@ -549,6 +549,16 @@ def test_delete_qa_runs(
|
|||||||
qa_run_pages_ready,
|
qa_run_pages_ready,
|
||||||
failed_qa_run_id,
|
failed_qa_run_id,
|
||||||
):
|
):
|
||||||
|
# Get download links for QA WACZs
|
||||||
|
r = requests.get(
|
||||||
|
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/{qa_run_id}/replay.json",
|
||||||
|
headers=crawler_auth_headers,
|
||||||
|
)
|
||||||
|
data = r.json()
|
||||||
|
assert len(data["resources"]) == 1
|
||||||
|
qa_wacz_url = data["resources"][0]["path"]
|
||||||
|
|
||||||
|
# Delete QA runs
|
||||||
r = requests.post(
|
r = requests.post(
|
||||||
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/delete",
|
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/qa/delete",
|
||||||
json={"qa_run_ids": [qa_run_id, failed_qa_run_id]},
|
json={"qa_run_ids": [qa_run_id, failed_qa_run_id]},
|
||||||
@ -575,6 +585,10 @@ def test_delete_qa_runs(
|
|||||||
time.sleep(5)
|
time.sleep(5)
|
||||||
count += 1
|
count += 1
|
||||||
|
|
||||||
|
# Ensure QA WACZs were deleted
|
||||||
|
r = requests.get(f"http://localhost:30870{qa_wacz_url}")
|
||||||
|
assert r.status_code == 404
|
||||||
|
|
||||||
# Ensure associated qa run information in pages is also deleted
|
# Ensure associated qa run information in pages is also deleted
|
||||||
for qa_run in (qa_run_id, failed_qa_run_id):
|
for qa_run in (qa_run_id, failed_qa_run_id):
|
||||||
count = 0
|
count = 0
|
||||||
|
@ -864,6 +864,18 @@ def test_delete_crawls_crawler(
|
|||||||
assert r.status_code == 200
|
assert r.status_code == 200
|
||||||
assert r.json()["total"] > 0
|
assert r.json()["total"] > 0
|
||||||
|
|
||||||
|
# Get WACZ presigned URLs for the crawl that is about to be deleted
|
||||||
|
wacz_presigned_urls = []
|
||||||
|
r = requests.get(
|
||||||
|
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/replay.json",
|
||||||
|
headers=crawler_auth_headers,
|
||||||
|
)
|
||||||
|
assert r.status_code == 200
|
||||||
|
data = r.json()
|
||||||
|
assert len(data["resources"]) >= 1
|
||||||
|
for resource in data["resources"]:
|
||||||
|
wacz_presigned_urls.append(resource["path"])
|
||||||
|
|
||||||
# Test that crawler user can delete own crawl
|
# Test that crawler user can delete own crawl
|
||||||
r = requests.post(
|
r = requests.post(
|
||||||
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
|
f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
|
||||||
@ -884,6 +896,11 @@ def test_delete_crawls_crawler(
|
|||||||
)
|
)
|
||||||
assert r.status_code == 404
|
assert r.status_code == 404
|
||||||
|
|
||||||
|
# Test that WACZs are deleted
|
||||||
|
for wacz_url in wacz_presigned_urls:
|
||||||
|
r = requests.get(f"http://localhost:30870{wacz_url}")
|
||||||
|
assert r.status_code == 404
|
||||||
|
|
||||||
# Test that associated pages are also deleted
|
# Test that associated pages are also deleted
|
||||||
r = requests.get(
|
r = requests.get(
|
||||||
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
|
f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawler_crawl_id}/pages",
|
||||||
|
Loading…
Reference in New Issue
Block a user