misc tweaks:

- better error handling for not found resources, ensure 404
- typo in k8smanager
- add pylintrc
- ensure manual jobs are deleted when complete
- fix typos, reformat
Ilya Kreymer 2021-08-25 18:20:17 -07:00
parent f1a816be48
commit 223658cfa2
5 changed files with 61 additions and 42 deletions

View File

@@ -141,10 +141,11 @@ class CrawlOps:
async def update_crawl_schedule(self, cid: str, update: UpdateSchedule):
""" Update schedule for existing crawl config"""
if not await self.crawl_configs.find_one_and_update(
{"_id": cid}, {"$set": {"schedule": update.schedule}}
):
return None
return False
await self.crawl_manager.update_crawl_schedule(cid, update.schedule)
return True
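
Reassembled from the hunk above, the ops-layer method now signals a missing config with a boolean instead of None, leaving the HTTP status decision to the API layer. A sketch; UpdateSchedule, the Motor collection, and the crawl manager come from the surrounding file:

async def update_crawl_schedule(self, cid: str, update: UpdateSchedule) -> bool:
    """ Update schedule for existing crawl config """
    # find_one_and_update returns the matched document, or None when no config has this id
    if not await self.crawl_configs.find_one_and_update(
        {"_id": cid}, {"$set": {"schedule": update.schedule}}
    ):
        return False

    # propagate the new schedule to the crawl manager (Docker or K8s backend)
    await self.crawl_manager.update_crawl_schedule(cid, update.schedule)
    return True
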
@@ -222,11 +223,9 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
cid: str,
):
success = False
try:
if not await ops.update_crawl_schedule(cid, update):
raise HTTPException(
status_code=404, detail=f"Crawl Config '{cid}' not found"
)
success = await ops.update_crawl_schedule(cid, update)
except Exception as e:
# pylint: disable=raise-missing-from
@@ -234,6 +233,11 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
status_code=403, detail=f"Error updating crawl config: {e}"
)
if not success:
raise HTTPException(
status_code=404, detail=f"Crawl Config '{cid}' not found"
)
return {"updated": cid}
@router.post("/{cid}/run")
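
On the API side, unexpected errors from the update are still mapped to a 403 inside the try block, while the not-found case is checked afterwards, so the 404 HTTPException is no longer re-wrapped by the broad except. A sketch of the resulting handler; the route decorator is hypothetical since the real one sits outside this hunk:

@router.patch("/{cid}/schedule")  # hypothetical path; the actual decorator is not shown in the hunk
async def update_crawl_schedule(update: UpdateSchedule, cid: str):
    success = False
    try:
        success = await ops.update_crawl_schedule(cid, update)
    except Exception as e:
        # pylint: disable=raise-missing-from
        raise HTTPException(
            status_code=403, detail=f"Error updating crawl config: {e}"
        )

    # raised outside the try block, so it reaches the client as a 404
    if not success:
        raise HTTPException(
            status_code=404, detail=f"Crawl Config '{cid}' not found"
        )

    return {"updated": cid}
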
@@ -265,7 +269,9 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
):
result = await ops.delete_crawl_config(cid, archive)
if not result or not result.deleted_count:
raise HTTPException(status_code=404, detail="Crawl Config Not Found")
raise HTTPException(
status_code=404, detail=f"Crawl Config '{cid}' Not Found"
)
return {"deleted": 1}

View File

@@ -125,7 +125,7 @@ def init_crawls_api(app, mdb, crawl_manager, crawl_config_ops, archives):
archive_crawl_dep = archives.archive_crawl_dep
@app.post("/crawls/done", tags=["crawls"])
@app.post("/_crawls/done", tags=["_internal"])
async def crawl_done(msg: CrawlCompleteIn):
loop = asyncio.get_running_loop()
loop.create_task(ops.on_handle_crawl_complete(msg))
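
The completion webhook is now also registered under an internal /_crawls/done path (matching the WEBHOOK_URL changes in the Docker and K8s managers further down), and processing is handed off to a background task so the crawler's callback returns immediately. A self-contained sketch of the pattern; the model field and response body are placeholders:

import asyncio

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class CrawlCompleteIn(BaseModel):
    # placeholder field; the real model is defined elsewhere in the backend
    id: str = ""

async def on_handle_crawl_complete(msg: CrawlCompleteIn):
    """ Stand-in for ops.on_handle_crawl_complete """

@app.post("/crawls/done", tags=["crawls"])
@app.post("/_crawls/done", tags=["_internal"])
async def crawl_done(msg: CrawlCompleteIn):
    # stacking the decorators registers the same handler on both paths;
    # scheduling a task keeps the webhook response fast
    loop = asyncio.get_running_loop()
    loop.create_task(on_handle_crawl_complete(msg))
    return {"success": True}
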
@@ -152,25 +152,22 @@ def init_crawls_api(app, mdb, crawl_manager, crawl_config_ops, archives):
"/archives/{aid}/crawls/{crawl_id}/cancel",
tags=["crawls"],
)
async def crawl_cancel_stop(
async def crawl_cancel_immediately(
crawl_id, archive: Archive = Depends(archive_crawl_dep)
):
crawl = None
try:
crawl = await crawl_manager.stop_crawl(crawl_id, archive.id, graceful=False)
if not crawl:
raise HTTPException(
status_code=404, detail=f"Crawl not found: {crawl_id}"
)
await ops.store_crawl(crawl)
except HTTPException as httpe:
raise httpe
except Exception as exc:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=400, detail=f"Error Canceling Crawl: {exc}")
if not crawl:
raise HTTPException(status_code=404, detail=f"Crawl not found: {crawl_id}")
await ops.store_crawl(crawl)
return {"canceled": True}
@app.post(
@@ -180,27 +177,33 @@ def init_crawls_api(app, mdb, crawl_manager, crawl_config_ops, archives):
async def crawl_graceful_stop(
crawl_id, archive: Archive = Depends(archive_crawl_dep)
):
canceled = False
try:
canceled = await crawl_manager.stop_crawl(
crawl_id, archive.id, graceful=True
)
if not canceled:
raise HTTPException(
status_code=404, detail=f"Crawl not found: {crawl_id}"
)
except HTTPException as httpe:
raise httpe
except Exception as exc:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=400, detail=f"Error Stopping Crawl: {exc}")
if not canceled:
raise HTTPException(status_code=404, detail=f"Crawl not found: {crawl_id}")
return {"stopped_gracefully": True}
@app.post("/archives/{aid}/crawls/delete", tags=["crawls"])
async def delete_crawls(
delete_list: DeleteCrawlList, archive: Archive = Depends(archive_crawl_dep)
):
try:
for crawl_id in delete_list:
await crawl_manager.stop_crawl(crawl_id, archive.id, graceful=False)
except Exception as exc:
# pylint: disable=raise-missing-from
raise HTTPException(status_code=400, detail=f"Error Stopping Crawl: {exc}")
res = await ops.delete_crawls(archive.id, delete_list)
return {"deleted": res}

View File

@@ -109,6 +109,7 @@ class DockerManager:
timeout = int(container["Labels"]["btrix.timeout"])
actual = int(time.time()) - int(container["Created"])
if actual >= timeout:
# pylint: disable=line-too-long
print(
f"Crawl {container['Id']} running for {actual} seconds, exceeded timeout {timeout}, stopping..."
)
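
The surrounding loop compares each running container's age against its btrix.timeout label before stopping it. The same arithmetic as a standalone helper (the function name is hypothetical):

import time

def crawl_timed_out(container: dict) -> bool:
    """ `Created` is the unix timestamp from the Docker container listing,
    `btrix.timeout` is the configured crawl timeout in seconds """
    timeout = int(container["Labels"]["btrix.timeout"])
    actual = int(time.time()) - int(container["Created"])
    return actual >= timeout
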
@@ -181,18 +182,27 @@ class DockerManager:
async def stop_crawl(self, crawl_id, aid, graceful=True):
""" Stop crawl, if not graceful, issue SIGUSR1 to indicate cancelation """
container = await self.client.containers.get(crawl_id)
if container["Config"]["Labels"]["btrix.archive"] != aid:
return None
result = None
if not graceful:
await container.kill(signal="SIGUSR1")
result = self._make_crawl_for_container(container, "canceled", True)
else:
result = True
try:
container = await self.client.containers.get(crawl_id)
await container.kill(signal="SIGTERM")
if container["Config"]["Labels"]["btrix.archive"] != aid:
return None
if not graceful:
await container.kill(signal="SIGUSR1")
result = self._make_crawl_for_container(container, "canceled", True)
else:
result = True
await container.kill(signal="SIGTERM")
except aiodocker.exceptions.DockerError as exc:
if exc.status == 404:
return None
raise exc
return result
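
Reassembled, the new DockerManager.stop_crawl wraps every interaction with the container in a try/except, so a container that has already disappeared (aiodocker raises DockerError with status 404) is reported as not found instead of surfacing as a server error. A sketch of the method as it reads after this change:

import aiodocker

async def stop_crawl(self, crawl_id, aid, graceful=True):
    """ Stop crawl, if not graceful, issue SIGUSR1 to indicate cancelation """
    result = None

    try:
        container = await self.client.containers.get(crawl_id)

        # refuse to touch containers belonging to a different archive
        if container["Config"]["Labels"]["btrix.archive"] != aid:
            return None

        if not graceful:
            # SIGUSR1 tells the crawler to record this crawl as canceled
            await container.kill(signal="SIGUSR1")
            result = self._make_crawl_for_container(container, "canceled", True)
        else:
            result = True

        await container.kill(signal="SIGTERM")

    except aiodocker.exceptions.DockerError as exc:
        # a 404 from the Docker API means the container is already gone
        if exc.status == 404:
            return None

        raise exc

    return result
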
@@ -351,7 +361,7 @@ class DockerManager:
f"STORE_ENDPOINT_URL={endpoint_with_coll_url}",
f"STORE_ACCESS_KEY={storage.access_key}",
f"STORE_SECRET_KEY={storage.secret_key}",
"WEBHOOK_URL=http://backend:8000/crawls/done",
"WEBHOOK_URL=http://backend:8000/_crawls/done",
]
labels["btrix.run.schedule"] = schedule

View File

@@ -123,7 +123,7 @@ class K8SManager:
"STORE_ENDPOINT_URL": endpoint_with_coll_url,
"STORE_ACCESS_KEY": storage.access_key,
"STORE_SECRET_KEY": storage.secret_key,
"WEBHOOK_URL": "http://browsertrix-cloud.default:8000/crawls/done",
"WEBHOOK_URL": "http://browsertrix-cloud.default:8000/_crawls/done",
},
)
@@ -169,7 +169,7 @@ class K8SManager:
return cron_job
async def update_crawl_config(self, cid, schedule):
async def update_crawl_schedule(self, cid, schedule):
""" Update the schedule for existing crawl config """
cron_jobs = await self.batch_beta_api.list_namespaced_cron_job(
@@ -195,15 +195,12 @@ class K8SManager:
name=cron_job.metadata.name, namespace=self.namespace, body=cron_job
)
async def run_crawl_config(self, cid, manual=True, schedule=""):
async def run_crawl_config(self, cid):
""" Run crawl job for cron job based on specified crawlconfig id (cid) """
cron_jobs = await self.batch_beta_api.list_namespaced_cron_job(
namespace=self.namespace, label_selector=f"btrix.crawlconfig={cid}"
)
if not manual or schedule:
raise Exception("Manual trigger not supported")
if len(cron_jobs.items) != 1:
raise Exception("Crawl Config Not Found")
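
Two related cleanups in the K8SManager: update_crawl_config is renamed to update_crawl_schedule, matching the name the ops layer calls, and run_crawl_config drops the manual/schedule parameters since this path only creates an immediate, manual job from the cron job template. Both methods locate that cron job by label; the same lookup as a hypothetical standalone helper:

async def find_crawlconfig_cron_job(batch_beta_api, cid, namespace):
    """ Find the single CronJob backing a crawl config via its btrix.crawlconfig label """
    cron_jobs = await batch_beta_api.list_namespaced_cron_job(
        namespace=namespace, label_selector=f"btrix.crawlconfig={cid}"
    )

    if len(cron_jobs.items) != 1:
        raise Exception("Crawl Config Not Found")

    return cron_jobs.items[0]
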
@@ -245,8 +242,8 @@ class K8SManager:
return None
manual = job.metadata.annotations.get("btrix.run.manual") == "1"
if not manual:
await self._delete_job(job.metadata.name)
if manual:
self.loop.create_task(self._delete_job(job.metadata.name))
return self._make_crawl_for_job(
job,
@@ -426,6 +423,7 @@ class K8SManager:
"""Create new job from cron job to run instantly"""
annotations = cron_job.spec.job_template.metadata.annotations
annotations["btrix.run.manual"] = "1"
annotations["btrix.run.schedule"] = ""
# owner_ref = client.V1OwnerReference(
# kind="CronJob",
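
Manual runs are created from the cron job's job template and annotated with btrix.run.manual = "1" (plus an empty btrix.run.schedule), and per the commit message they are now deleted once complete, scheduled as a fire-and-forget task so completion handling is not blocked. A hypothetical sketch of both halves, assuming kubernetes_asyncio and a _delete_job-style coroutine like the one in this class:

from kubernetes_asyncio import client

async def create_manual_job(batch_api, cron_job, namespace):
    """ Build a one-off Job from the cron job's template, annotated as a manual run """
    annotations = cron_job.spec.job_template.metadata.annotations or {}
    annotations["btrix.run.manual"] = "1"
    annotations["btrix.run.schedule"] = ""

    job = client.V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=client.V1ObjectMeta(
            generate_name=cron_job.metadata.name + "-manual-",
            annotations=annotations,
            labels=cron_job.metadata.labels,
        ),
        spec=cron_job.spec.job_template.spec,
    )

    return await batch_api.create_namespaced_job(namespace=namespace, body=job)

def schedule_manual_job_cleanup(loop, job, delete_job):
    """ On completion, delete manual jobs in the background instead of leaving them behind """
    if job.metadata.annotations.get("btrix.run.manual") == "1":
        loop.create_task(delete_job(job.metadata.name))
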

pylintrc Normal file
View File

@@ -0,0 +1,2 @@
[MESSAGES CONTROL]
disable=duplicate-code