Implement retry API endpoint for failed background jobs (#1356)
Fixes #1328 - Adds /retry endpoint for retrying failed background jobs.

- Returns a 400 error if the previous job is still running or has already succeeded.
- Keeps track of previous failed attempts in a previousAttempts array on the failed job.
- Also amends the similar webhook /retry endpoint to use `POST` for consistency.
- Removes the duplicate api tag for backgroundjobs.
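For reference, a minimal usage sketch of the new endpoint; the host, the org-scoped `/orgs/{oid}/jobs` route prefix, and the ids below are assumptions rather than values taken from this commit:

```python
import requests

API_PREFIX = "https://app.example.com/api"  # assumed deployment URL
headers = {"Authorization": "Bearer <access-token>"}  # assumed auth token

# Retry a failed background job; job ids come from the job-listing endpoint
r = requests.post(
    f"{API_PREFIX}/orgs/<org-id>/jobs/<job-id>/retry",
    headers=headers,
)
print(r.status_code, r.json())
# 200 -> {"success": True}
# 400 -> {"detail": "job_not_finished"} or {"detail": "job_already_succeeded"}
# 404 -> {"detail": "job_not_found"}
```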
This commit is contained in:

parent 82a5d1e4e4
commit 1afc411114
backend/btrixcloud/background_jobs.py

@@ -20,6 +20,7 @@ from .models import (
     DeleteReplicaJob,
     PaginatedResponse,
     AnyJob,
+    StorageRef,
 )
 from .pagination import DEFAULT_PAGE_SIZE, paginated_format
 
@@ -90,9 +91,8 @@ class BackgroundJobOps:
 
     async def create_replica_jobs(
         self, oid: UUID, file: BaseFile, object_id: str, object_type: str
-    ) -> Dict:
-        """Create k8s background job to replicate a file to another storage location."""
-
+    ) -> Dict[str, Union[bool, List[str]]]:
+        """Create k8s background job to replicate a file to all replica storage locations."""
         org = await self.org_ops.get_org_by_id(oid)
 
         primary_storage = self.storage_ops.get_org_storage_by_ref(org, file.storage)
@@ -105,23 +105,42 @@
         ids = []
 
         for replica_ref in self.storage_ops.get_org_replicas_storage_refs(org):
-            replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
-            replica_endpoint, bucket_suffix = self.strip_bucket(
-                replica_storage.endpoint_url
+            job_id = await self.create_replica_job(
+                org,
+                file,
+                object_id,
+                object_type,
+                replica_ref,
+                primary_file_path,
+                primary_endpoint,
             )
-            replica_file_path = bucket_suffix + file.filename
+            ids.append(job_id)
 
-            # print(f"primary: {file.storage.get_storage_secret_name(str(oid))}")
-            # print(f"  endpoint: {primary_endpoint}")
-            # print(f"  path: {primary_file_path}")
-            # print(f"replica: {replica_ref.get_storage_secret_name(str(oid))}")
-            # print(f"  endpoint: {replica_endpoint}")
-            # print(f"  path: {replica_file_path}")
+        return {"added": True, "ids": ids}
 
-            job_type = BgJobType.CREATE_REPLICA.value
+    async def create_replica_job(
+        self,
+        org: Organization,
+        file: BaseFile,
+        object_id: str,
+        object_type: str,
+        replica_ref: StorageRef,
+        primary_file_path: str,
+        primary_endpoint: str,
+        existing_job_id: Optional[str] = None,
+    ) -> str:
+        """Create k8s background job to replicate a file to a specific replica storage location."""
+        replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
+        replica_endpoint, bucket_suffix = self.strip_bucket(
+            replica_storage.endpoint_url
+        )
+        replica_file_path = bucket_suffix + file.filename
 
+        job_type = BgJobType.CREATE_REPLICA.value
+
+        try:
             job_id = await self.crawl_manager.run_replica_job(
-                oid=str(oid),
+                oid=str(org.id),
                 job_type=job_type,
                 primary_storage=file.storage,
                 primary_file_path=primary_file_path,
@@ -130,74 +149,122 @@
                 replica_file_path=replica_file_path,
                 replica_endpoint=replica_endpoint,
                 job_id_prefix=f"{job_type}-{object_id}",
+                existing_job_id=existing_job_id,
             )
-            replication_job = CreateReplicaJob(
-                id=job_id,
-                oid=oid,
-                started=datetime.now(),
-                file_path=file.filename,
-                object_type=object_type,
-                object_id=object_id,
-                primary=file.storage,
-                replica_storage=replica_ref,
-            )
-            # print(
-            #    f"File path written into replication_job: {file.filename}", flush=True
-            # )
+            if existing_job_id:
+                replication_job = await self.get_background_job(existing_job_id, org.id)
+                previous_attempt = {
+                    "started": replication_job.started,
+                    "finished": replication_job.finished,
+                }
+                if replication_job.previousAttempts:
+                    replication_job.previousAttempts.append(previous_attempt)
+                else:
+                    replication_job.previousAttempts = [previous_attempt]
+                replication_job.started = datetime.now()
+                replication_job.finished = None
+                replication_job.success = None
+            else:
+                replication_job = CreateReplicaJob(
+                    id=job_id,
+                    oid=org.id,
+                    started=datetime.now(),
+                    file_path=file.filename,
+                    object_type=object_type,
+                    object_id=object_id,
+                    primary=file.storage,
+                    replica_storage=replica_ref,
+                )
+
             await self.jobs.find_one_and_update(
                 {"_id": job_id}, {"$set": replication_job.to_dict()}, upsert=True
             )
-            ids.append(job_id)
 
-        return {"added": True, "ids": ids}
+            return job_id
+        except Exception as exc:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=500, detail=f"Error starting crawl: {exc}")
 
     async def create_delete_replica_jobs(
         self, org: Organization, file: BaseFile, object_id: str, object_type: str
     ) -> Dict[str, Union[bool, List[str]]]:
         """Create a job to delete each replica for the given file"""
-
         ids = []
-        oid = str(org.id)
 
         for replica_ref in file.replicas or []:
-            replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
-            replica_endpoint, bucket_suffix = self.strip_bucket(
-                replica_storage.endpoint_url
+            job_id = await self.create_delete_replica_job(
+                org, file, object_id, object_type, replica_ref
             )
-            replica_file_path = bucket_suffix + file.filename
+            ids.append(job_id)
 
-            # print(f"replica: {replica_ref.get_storage_secret_name(oid)}")
-            # print(f"  endpoint: {replica_endpoint}")
-            # print(f"  path: {replica_file_path}")
+        return {"added": True, "ids": ids}
 
-            job_type = BgJobType.DELETE_REPLICA.value
+    async def create_delete_replica_job(
+        self,
+        org: Organization,
+        file: BaseFile,
+        object_id: str,
+        object_type: str,
+        replica_ref: StorageRef,
+        existing_job_id: Optional[str] = None,
+    ) -> str:
+        """Create a job to delete one replica of a given file"""
+        replica_storage = self.storage_ops.get_org_storage_by_ref(org, replica_ref)
+        replica_endpoint, bucket_suffix = self.strip_bucket(
+            replica_storage.endpoint_url
+        )
+        replica_file_path = bucket_suffix + file.filename
 
+        job_type = BgJobType.DELETE_REPLICA.value
+
+        try:
             job_id = await self.crawl_manager.run_replica_job(
-                oid=oid,
+                oid=str(org.id),
                 job_type=job_type,
                 replica_storage=replica_ref,
                 replica_file_path=replica_file_path,
                 replica_endpoint=replica_endpoint,
                 job_id_prefix=f"{job_type}-{object_id}",
+                existing_job_id=existing_job_id,
            )
 
-            delete_replica_job = DeleteReplicaJob(
-                id=job_id,
-                oid=oid,
-                started=datetime.now(),
-                file_path=file.filename,
-                object_id=object_id,
-                object_type=object_type,
-                replica_storage=replica_ref,
-            )
+            if existing_job_id:
+                delete_replica_job = await self.get_background_job(
+                    existing_job_id, org.id
+                )
+                previous_attempt = {
+                    "started": delete_replica_job.started,
+                    "finished": delete_replica_job.finished,
+                }
+                if delete_replica_job.previousAttempts:
+                    delete_replica_job.previousAttempts.append(previous_attempt)
+                else:
+                    delete_replica_job.previousAttempts = [previous_attempt]
+                delete_replica_job.started = datetime.now()
+                delete_replica_job.finished = None
+                delete_replica_job.success = None
+            else:
+                delete_replica_job = DeleteReplicaJob(
+                    id=job_id,
+                    oid=org.id,
+                    started=datetime.now(),
+                    file_path=file.filename,
+                    object_id=object_id,
+                    object_type=object_type,
+                    replica_storage=replica_ref,
+                )
 
             await self.jobs.find_one_and_update(
                 {"_id": job_id}, {"$set": delete_replica_job.to_dict()}, upsert=True
             )
 
-            ids.append(job_id)
+            return job_id
 
-        return {"added": True, "ids": ids}
+        except Exception as exc:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(
+                status_code=400, detail=f"Error starting background job: {exc}"
+            )
 
     async def job_finished(
         self,
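The `if existing_job_id:` branches above archive the prior run before resetting run state. A standalone sketch of that bookkeeping pattern, assuming only a job object with the `BackgroundJob` fields used below:

```python
from datetime import datetime

def record_retry(job) -> None:
    """Archive a job's previous run, then reset it for the retry."""
    # Push the last {started, finished} pair onto previousAttempts...
    previous_attempt = {"started": job.started, "finished": job.finished}
    if job.previousAttempts:
        job.previousAttempts.append(previous_attempt)
    else:
        job.previousAttempts = [previous_attempt]
    # ...then clear run state so the job reads as in-progress again
    job.started = datetime.now()
    job.finished = None
    job.success = None
```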
@@ -241,7 +308,9 @@
             {"$set": {"success": success, "finished": finished}},
         )
 
-    async def get_background_job(self, job_id: str, oid: UUID) -> BackgroundJob:
+    async def get_background_job(
+        self, job_id: str, oid: UUID
+    ) -> Union[CreateReplicaJob, DeleteReplicaJob]:
         """Get background job"""
         query: dict[str, object] = {"_id": job_id, "oid": oid}
         res = await self.jobs.find_one(query)
 
@@ -255,10 +324,9 @@
         if data["type"] == BgJobType.CREATE_REPLICA:
             return CreateReplicaJob.from_dict(data)
 
-        if data["type"] == BgJobType.DELETE_REPLICA:
-            return DeleteReplicaJob.from_dict(data)
+        return DeleteReplicaJob.from_dict(data)
 
-        return BackgroundJob.from_dict(data)
+        # return BackgroundJob.from_dict(data)
 
     async def list_background_jobs(
         self,
@@ -324,6 +392,69 @@
 
         return jobs, total
 
+    async def get_replica_job_file(
+        self, job: Union[CreateReplicaJob, DeleteReplicaJob], org: Organization
+    ) -> BaseFile:
+        """Return file from replica job"""
+        try:
+            if job.object_type == "profile":
+                profile = await self.profile_ops.get_profile(UUID(job.object_id), org)
+                return BaseFile(**profile.resource.dict())
+
+            item_res = await self.base_crawl_ops.get_crawl_raw(job.object_id, org)
+            matching_file = [
+                f for f in item_res.get("files", []) if f["filename"] == job.file_path
+            ][0]
+            return BaseFile(**matching_file)
+        # pylint: disable=broad-exception-caught, raise-missing-from
+        except Exception:
+            raise HTTPException(status_code=404, detail="file_not_found")
+
+    async def retry_background_job(
+        self, job_id: str, org: Organization
+    ) -> Dict[str, Union[bool, Optional[str]]]:
+        """Retry background job and return new job id"""
+        job = await self.get_background_job(job_id, org.id)
+        if not job:
+            raise HTTPException(status_code=404, detail="job_not_found")
+
+        if not job.finished:
+            raise HTTPException(status_code=400, detail="job_not_finished")
+
+        if job.success:
+            raise HTTPException(status_code=400, detail="job_already_succeeded")
+
+        file = await self.get_replica_job_file(job, org)
+
+        if job.type == BgJobType.CREATE_REPLICA:
+            primary_storage = self.storage_ops.get_org_storage_by_ref(org, file.storage)
+            primary_endpoint, bucket_suffix = self.strip_bucket(
+                primary_storage.endpoint_url
+            )
+            primary_file_path = bucket_suffix + file.filename
+            await self.create_replica_job(
+                org,
+                file,
+                job.object_id,
+                job.object_type,
+                job.replica_storage,
+                primary_file_path,
+                primary_endpoint,
+                existing_job_id=job_id,
+            )
+
+        if job.type == BgJobType.DELETE_REPLICA:
+            await self.create_delete_replica_job(
+                org,
+                file,
+                job.object_id,
+                job.object_type,
+                job.replica_storage,
+                existing_job_id=job_id,
+            )
+
+        return {"success": True}
+
 
 # ============================================================================
 # pylint: disable=too-many-arguments, too-many-locals, invalid-name, fixme
@@ -344,7 +475,6 @@ def init_background_jobs_api(
 
     @router.get(
         "/{job_id}",
-        tags=["backgroundjobs"],
         response_model=AnyJob,
     )
     async def get_background_job(
@@ -354,7 +484,17 @@
         """Retrieve information for background job"""
         return await ops.get_background_job(job_id, org.id)
 
-    @router.get("", tags=["backgroundjobs"], response_model=PaginatedResponse)
+    @router.post(
+        "/{job_id}/retry",
+    )
+    async def retry_background_job(
+        job_id: str,
+        org: Organization = Depends(org_crawl_dep),
+    ):
+        """Retry background job"""
+        return await ops.retry_background_job(job_id, org)
+
+    @router.get("", response_model=PaginatedResponse)
     async def list_background_jobs(
         org: Organization = Depends(org_crawl_dep),
         pageSize: int = DEFAULT_PAGE_SIZE,
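A possible test for the new route, sketched in the same style as the webhook test updated below; the `/jobs` prefix, the fixtures, and `_failed_job_id` are assumptions, not part of this commit:

```python
import requests

def test_retry_background_job(admin_auth_headers, default_org_id):
    # API_PREFIX and _failed_job_id assumed defined as module globals,
    # as in the existing test modules; the job must have finished and failed
    r = requests.post(
        f"{API_PREFIX}/orgs/{default_org_id}/jobs/{_failed_job_id}/retry",
        headers=admin_auth_headers,
    )
    assert r.status_code == 200
    assert r.json()["success"]
```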
backend/btrixcloud/crawlmanager.py

@@ -74,14 +74,18 @@ class CrawlManager(K8sAPI):
         primary_file_path: Optional[str] = None,
         primary_endpoint: Optional[str] = None,
         job_id_prefix: Optional[str] = None,
+        existing_job_id: Optional[str] = None,
     ):
         """run job to replicate file from primary storage to replica storage"""
 
-        if not job_id_prefix:
-            job_id_prefix = job_type
+        if existing_job_id:
+            job_id = existing_job_id
+        else:
+            if not job_id_prefix:
+                job_id_prefix = job_type
 
-        # ensure name is <=63 characters
-        job_id = f"{job_id_prefix[:52]}-{secrets.token_hex(5)}"
+            # ensure name is <=63 characters
+            job_id = f"{job_id_prefix[:52]}-{secrets.token_hex(5)}"
 
         params = {
             "id": job_id,
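The id handling above is what lets a retry reuse the original Kubernetes job name instead of minting a new one. A self-contained sketch of the rule:

```python
import secrets
from typing import Optional

def make_replica_job_id(
    job_type: str,
    job_id_prefix: Optional[str] = None,
    existing_job_id: Optional[str] = None,
) -> str:
    """Reuse the id on retry; otherwise generate a fresh one.

    Fresh ids are "<prefix[:52]>-<10 hex chars>", which keeps them within
    the 63-character Kubernetes object-name limit.
    """
    if existing_job_id:
        return existing_job_id
    prefix = job_id_prefix or job_type
    return f"{prefix[:52]}-{secrets.token_hex(5)}"
```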
backend/btrixcloud/models.py

@@ -1237,6 +1237,8 @@ class BackgroundJob(BaseMongoModel):
     started: datetime
     finished: Optional[datetime] = None
 
+    previousAttempts: Optional[List[Dict[str, Optional[datetime]]]] = None
+
 
 # ============================================================================
 class CreateReplicaJob(BackgroundJob):
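Illustrative only: with the new field, a retried job document might carry entries like these (the values are made up; the field names come from the model above):

```python
from datetime import datetime

job_doc = {
    "id": "create-replica-abc123-0011223344",  # assumed job id
    "started": datetime(2023, 11, 20, 12, 5),  # current (retry) attempt
    "finished": None,                          # reset while the retry runs
    "previousAttempts": [
        {
            "started": datetime(2023, 11, 20, 11, 0),
            "finished": datetime(2023, 11, 20, 11, 2),
        },
    ],
}
```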
backend/btrixcloud/webhooks.py

@@ -400,7 +400,7 @@ def init_event_webhooks_api(mdb, org_ops, app):
     ):
         return await ops.get_notification(org, notificationid)
 
-    @router.get("/{notificationid}/retry")
+    @router.post("/{notificationid}/retry")
     async def retry_notification(
         notificationid: UUID,
         org: Organization = Depends(org_owner_dep),
backend/test/test_webhooks.py

@@ -99,7 +99,7 @@ def test_get_webhook_event(admin_auth_headers, default_org_id):
 
 def test_retry_webhook_event(admin_auth_headers, default_org_id):
     # Expect to fail because we haven't set up URLs that accept webhooks
-    r = requests.get(
+    r = requests.post(
         f"{API_PREFIX}/orgs/{default_org_id}/webhooks/{_webhook_event_id}/retry",
         headers=admin_auth_headers,
     )