* Fix POST /orgs/{oid}/crawls/delete
- Add permissions check to ensure crawler users can only delete their own crawls
- Fix broken delete_crawls endpoint (a sketch of the fixed handler follows this list)
- Delete files from storage as well as deleting the crawl from the db
- Add tests, including a nightly test that ensures crawl files are no longer accessible after the crawl is deleted
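
The handler itself isn't shown in this change, so as a rough illustration of the behavior the bullets describe, here is a minimal sketch assuming a FastAPI backend with an async MongoDB collection and an S3-compatible storage client. Every name in it (CrawlStore, delete_crawl_files, get_current_user, the userid/oid fields) is a hypothetical stand-in, not the repository's actual code.

from typing import List, Optional

from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel


class User(BaseModel):
    id: str
    is_crawler: bool  # True for crawler-role users, who may only delete their own crawls


class DeleteCrawlList(BaseModel):
    crawl_ids: List[str]


class CrawlStore:
    """Stand-in for the MongoDB crawls collection."""

    async def find_one(self, query: dict) -> Optional[dict]:
        raise NotImplementedError

    async def delete_one(self, query: dict) -> None:
        raise NotImplementedError


async def get_current_user() -> User:
    """Stand-in auth dependency; the real app resolves the user from the request."""
    raise NotImplementedError


async def delete_crawl_files(crawl: dict) -> None:
    """Stand-in for removing the crawl's files from S3/Minio storage."""
    raise NotImplementedError


router = APIRouter()
crawls = CrawlStore()


@router.post("/orgs/{oid}/crawls/delete")
async def delete_crawls(
    oid: str,
    delete_list: DeleteCrawlList,
    user: User = Depends(get_current_user),
):
    deleted = 0
    for crawl_id in delete_list.crawl_ids:
        crawl = await crawls.find_one({"_id": crawl_id, "oid": oid})
        if not crawl:
            raise HTTPException(status_code=404, detail="crawl_not_found")

        # Permission check: crawler users may only delete crawls they started.
        if user.is_crawler and crawl.get("userid") != user.id:
            raise HTTPException(status_code=403, detail="not_allowed")

        # Delete the files from storage before removing the crawl from the db,
        # so a mid-loop failure leaves a still-listed crawl rather than
        # invisible orphaned files.
        await delete_crawl_files(crawl)
        await crawls.delete_one({"_id": crawl_id, "oid": oid})
        deleted += 1

    # The test below asserts on this response shape: {"deleted": <count>}
    return {"deleted": deleted}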
Nightly test file (Python, 89 lines, 2.5 KiB):
import os
import time

import requests

from .conftest import API_PREFIX, HOST_PREFIX


def test_delete_crawls(
    tmp_path, admin_auth_headers, default_org_id, crawl_id_wr, crawl_id_wr_specs
):
    # Check that crawls have associated files
    crawl_resource_urls = []

    def _file_is_retrievable(url):
        """Attempt to retrieve file at url and return True or False."""
        file_path = str(tmp_path / "test_download")
        if os.path.exists(file_path):
            os.remove(file_path)

        r = requests.get(f"{HOST_PREFIX}{url}")
        if r.status_code != 200:
            return False

        with open(file_path, "wb") as fd:
            fd.write(r.content)

        if not (os.path.isfile(file_path) and os.path.getsize(file_path) > 0):
            return False

        os.remove(file_path)
        return True

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    resources = data["resources"]
    assert resources
    for resource in resources:
        crawl_resource_urls.append(resource["path"])

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    data = r.json()
    resources = data["resources"]
    assert resources
    for resource in resources:
        crawl_resource_urls.append(resource["path"])

    # Test retrieving resources
    for url in crawl_resource_urls:
        assert _file_is_retrievable(url)

    # Delete crawls
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/delete",
        headers=admin_auth_headers,
        json={"crawl_ids": [crawl_id_wr, crawl_id_wr_specs]},
    )
    assert r.status_code == 200
    data = r.json()

    assert data["deleted"] == 2

    # Verify that crawls don't exist in db
    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404

    r = requests.get(
        f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id_wr_specs}/replay.json",
        headers=admin_auth_headers,
    )
    assert r.status_code == 404

    # Give Minio time to delete the files
    time.sleep(120)

    # Verify that files are no longer retrievable from storage
    for url in crawl_resource_urls:
        assert not _file_is_retrievable(url)
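
A note on the fixed time.sleep(120): it gives Minio a generous margin to remove the objects, but the nightly run pays the full two minutes even when deletion finishes quickly. A polling variant such as the sketch below (an illustration, not part of the test; _wait_until_deleted and its defaults are hypothetical) would return as soon as the files are gone:

import time

import requests


def _wait_until_deleted(urls, host_prefix, timeout=180, interval=10):
    """Poll until none of the URLs return 200, or give up after `timeout` seconds."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        if all(requests.get(f"{host_prefix}{url}").status_code != 200 for url in urls):
            return True
        time.sleep(interval)
    return False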