diff --git a/backend/crawlconfigs.py b/backend/crawlconfigs.py
index 67d83c03..656b5ced 100644
--- a/backend/crawlconfigs.py
+++ b/backend/crawlconfigs.py
@@ -141,10 +141,11 @@ class CrawlOps:
     async def update_crawl_schedule(self, cid: str, update: UpdateSchedule):
         """ Update schedule for existing crawl config"""
+
         if not await self.crawl_configs.find_one_and_update(
             {"_id": cid}, {"$set": {"schedule": update.schedule}}
         ):
-            return None
+            return False
 
         await self.crawl_manager.update_crawl_schedule(cid, update.schedule)
         return True
 
@@ -222,11 +223,9 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
         cid: str,
     ):
+        success = False
         try:
-            if not await ops.update_crawl_schedule(cid, update):
-                raise HTTPException(
-                    status_code=404, detail=f"Crawl Config '{cid}' not found"
-                )
+            success = await ops.update_crawl_schedule(cid, update)
 
         except Exception as e:
             # pylint: disable=raise-missing-from
@@ -234,6 +233,11 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
                 status_code=403, detail=f"Error updating crawl config: {e}"
             )
 
+        if not success:
+            raise HTTPException(
+                status_code=404, detail=f"Crawl Config '{cid}' not found"
+            )
+
         return {"updated": cid}
 
     @router.post("/{cid}/run")
@@ -265,7 +269,9 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
     ):
         result = await ops.delete_crawl_config(cid, archive)
         if not result or not result.deleted_count:
-            raise HTTPException(status_code=404, detail="Crawl Config Not Found")
+            raise HTTPException(
+                status_code=404, detail=f"Crawl Config '{cid}' Not Found"
+            )
 
         return {"deleted": 1}
 
diff --git a/backend/crawls.py b/backend/crawls.py
index 1290e656..3508f4ab 100644
--- a/backend/crawls.py
+++ b/backend/crawls.py
@@ -125,7 +125,7 @@ def init_crawls_api(app, mdb, crawl_manager, crawl_config_ops, archives):
 
     archive_crawl_dep = archives.archive_crawl_dep
 
-    @app.post("/crawls/done", tags=["crawls"])
+    @app.post("/_crawls/done", tags=["_internal"])
     async def crawl_done(msg: CrawlCompleteIn):
         loop = asyncio.get_running_loop()
         loop.create_task(ops.on_handle_crawl_complete(msg))
@@ -152,25 +152,22 @@ def init_crawls_api(app, mdb, crawl_manager, crawl_config_ops, archives):
         "/archives/{aid}/crawls/{crawl_id}/cancel",
         tags=["crawls"],
     )
-    async def crawl_cancel_stop(
+    async def crawl_cancel_immediately(
         crawl_id, archive: Archive = Depends(archive_crawl_dep)
     ):
+        crawl = None
         try:
             crawl = await crawl_manager.stop_crawl(crawl_id, archive.id, graceful=False)
 
-            if not crawl:
-                raise HTTPException(
-                    status_code=404, detail=f"Crawl not found: {crawl_id}"
-                )
-
-            await ops.store_crawl(crawl)
-
-        except HTTPException as httpe:
-            raise httpe
-
         except Exception as exc:
             # pylint: disable=raise-missing-from
             raise HTTPException(status_code=400, detail=f"Error Canceling Crawl: {exc}")
 
+        if not crawl:
+            raise HTTPException(status_code=404, detail=f"Crawl not found: {crawl_id}")
+
+        await ops.store_crawl(crawl)
+
         return {"canceled": True}
 
     @app.post(
@@ -180,27 +177,33 @@ def init_crawls_api(app, mdb, crawl_manager, crawl_config_ops, archives):
     async def crawl_graceful_stop(
         crawl_id, archive: Archive = Depends(archive_crawl_dep)
     ):
+        canceled = False
         try:
             canceled = await crawl_manager.stop_crawl(
                 crawl_id, archive.id, graceful=True
             )
 
-            if not canceled:
-                raise HTTPException(
-                    status_code=404, detail=f"Crawl not found: {crawl_id}"
-                )
-
-        except HTTPException as httpe:
-            raise httpe
-
         except Exception as exc:
             # pylint: disable=raise-missing-from
             raise HTTPException(status_code=400, detail=f"Error Stopping Crawl: {exc}")
 
+        if not canceled:
+            raise HTTPException(status_code=404, detail=f"Crawl not found: {crawl_id}")
+
         return {"stopped_gracefully": True}
 
     @app.post("/archives/{aid}/crawls/delete", tags=["crawls"])
     async def delete_crawls(
         delete_list: DeleteCrawlList, archive: Archive = Depends(archive_crawl_dep)
     ):
+        try:
+            for crawl_id in delete_list:
+                await crawl_manager.stop_crawl(crawl_id, archive.id, graceful=False)
+
+        except Exception as exc:
+            # pylint: disable=raise-missing-from
+            raise HTTPException(status_code=400, detail=f"Error Stopping Crawl: {exc}")
+
         res = await ops.delete_crawls(archive.id, delete_list)
+
         return {"deleted": res}
diff --git a/backend/dockerman.py b/backend/dockerman.py
index 4c18c53b..ec1a6bb1 100644
--- a/backend/dockerman.py
+++ b/backend/dockerman.py
@@ -109,6 +109,7 @@ class DockerManager:
                     timeout = int(container["Labels"]["btrix.timeout"])
                     actual = int(time.time()) - int(container["Created"])
                     if actual >= timeout:
+                        # pylint: disable=line-too-long
                         print(
                             f"Crawl {container['Id']} running for {actual} seconds, exceeded timeout {timeout}, stopping..."
                         )
@@ -181,18 +182,27 @@ class DockerManager:
 
     async def stop_crawl(self, crawl_id, aid, graceful=True):
         """ Stop crawl, if not graceful, issue SIGUSR1 to indicate cancelation """
-        container = await self.client.containers.get(crawl_id)
 
-        if container["Config"]["Labels"]["btrix.archive"] != aid:
-            return None
+        result = None
 
-        if not graceful:
-            await container.kill(signal="SIGUSR1")
-            result = self._make_crawl_for_container(container, "canceled", True)
-        else:
-            result = True
+        try:
+            container = await self.client.containers.get(crawl_id)
 
-        await container.kill(signal="SIGTERM")
+            if container["Config"]["Labels"]["btrix.archive"] != aid:
+                return None
+
+            if not graceful:
+                await container.kill(signal="SIGUSR1")
+                result = self._make_crawl_for_container(container, "canceled", True)
+            else:
+                result = True
+
+            await container.kill(signal="SIGTERM")
+        except aiodocker.exceptions.DockerError as exc:
+            if exc.status == 404:
+                return None
+
+            raise exc
 
         return result
 
@@ -351,7 +361,7 @@ class DockerManager:
             f"STORE_ENDPOINT_URL={endpoint_with_coll_url}",
             f"STORE_ACCESS_KEY={storage.access_key}",
             f"STORE_SECRET_KEY={storage.secret_key}",
-            "WEBHOOK_URL=http://backend:8000/crawls/done",
+            "WEBHOOK_URL=http://backend:8000/_crawls/done",
         ]
 
         labels["btrix.run.schedule"] = schedule
diff --git a/backend/k8sman.py b/backend/k8sman.py
index 911f334a..af7ce214 100644
--- a/backend/k8sman.py
+++ b/backend/k8sman.py
@@ -123,7 +123,7 @@ class K8SManager:
                 "STORE_ENDPOINT_URL": endpoint_with_coll_url,
                 "STORE_ACCESS_KEY": storage.access_key,
                 "STORE_SECRET_KEY": storage.secret_key,
-                "WEBHOOK_URL": "http://browsertrix-cloud.default:8000/crawls/done",
+                "WEBHOOK_URL": "http://browsertrix-cloud.default:8000/_crawls/done",
             },
         )
 
@@ -169,7 +169,7 @@ class K8SManager:
 
         return cron_job
 
-    async def update_crawl_config(self, cid, schedule):
+    async def update_crawl_schedule(self, cid, schedule):
         """ Update the schedule for existing crawl config """
 
         cron_jobs = await self.batch_beta_api.list_namespaced_cron_job(
@@ -195,15 +195,12 @@ class K8SManager:
             name=cron_job.metadata.name, namespace=self.namespace, body=cron_job
         )
 
-    async def run_crawl_config(self, cid, manual=True, schedule=""):
+    async def run_crawl_config(self, cid):
         """ Run crawl job for cron job based on specified crawlconfig id (cid) """
         cron_jobs = await self.batch_beta_api.list_namespaced_cron_job(
             namespace=self.namespace, label_selector=f"btrix.crawlconfig={cid}"
         )
 
-        if not manual or schedule:
-            raise Exception("Manual trigger not supported")
-
         if len(cron_jobs.items) != 1:
             raise Exception("Crawl Config Not Found")
 
@@ -245,8 +242,8 @@ class K8SManager:
             return None
 
         manual = job.metadata.annotations.get("btrix.run.manual") == "1"
-        if not manual:
-            await self._delete_job(job.metadata.name)
+        if manual:
+            self.loop.create_task(self._delete_job(job.metadata.name))
 
         return self._make_crawl_for_job(
             job,
@@ -426,6 +423,7 @@ class K8SManager:
         """Create new job from cron job to run instantly"""
         annotations = cron_job.spec.job_template.metadata.annotations
         annotations["btrix.run.manual"] = "1"
+        annotations["btrix.run.schedule"] = ""
 
         # owner_ref = client.V1OwnerReference(
         #     kind="CronJob",
diff --git a/pylintrc b/pylintrc
new file mode 100644
index 00000000..ae9e18fc
--- /dev/null
+++ b/pylintrc
@@ -0,0 +1,2 @@
+[MESSAGE CONTROL]
+disable=duplicate-code