make crawlTimeout a per-crawlconfig property

allow crawl complete / partial complete messages to update an existing crawl record, e.g. one already marked as timed out
handle BackoffLimitExceeded / DeadlineExceeded job failures, with a later success able to override the recorded failure state
list only jobs with active pods in the running crawls listing
Ilya Kreymer 2021-08-24 11:27:34 -07:00
parent ed27f3e3ee
commit 20b19f932f
4 changed files with 57 additions and 26 deletions


@@ -75,7 +75,7 @@ class CrawlConfigIn(BaseModel):
     schedule: Optional[str] = ""
     runNow: Optional[bool] = False

-    # storageName: Optional[str] = "default"
+    crawlTimeout: Optional[int] = 0

     config: RawCrawlConfig

@@ -93,6 +93,8 @@ class CrawlConfig(BaseMongoModel):
     config: RawCrawlConfig

+    crawlTimeout: Optional[int] = 0
+

 # ============================================================================
 class CrawlOps:
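
The new field appears on both the request model (CrawlConfigIn) and the stored model (CrawlConfig), so each config carries its own limit instead of relying on a global setting. A minimal sketch of a config payload that uses it — the field names mirror CrawlConfigIn above, while the seed list inside RawCrawlConfig is an assumption for illustration:

# Illustrative payload for creating a crawl config with its own timeout.
# schedule / runNow / crawlTimeout / config mirror CrawlConfigIn above;
# the "seeds" key inside the raw config is an assumption.
new_crawlconfig = {
    "schedule": "",                # no cron schedule
    "runNow": True,                # start a crawl immediately
    "crawlTimeout": 3600,          # per-config limit in seconds; 0 disables it
    "config": {"seeds": ["https://example.com/"]},
}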


@@ -1,13 +1,13 @@
 """ Crawl API """

 import asyncio
-import traceback
 from typing import Optional, List
 from datetime import datetime

 from fastapi import Depends, HTTPException
 from pydantic import BaseModel
+import pymongo

 from db import BaseMongoModel
 from archives import Archive

@@ -74,11 +74,20 @@ class CrawlOps:
             print("Not a valid crawl complete msg!", flush=True)
             return

-        await self.handle_finished(crawl)
+        await self.store_crawl(crawl, update_existing=True)

-    async def handle_finished(self, crawl: Crawl):
+    async def store_crawl(self, crawl: Crawl, update_existing=False):
         """ Add finished crawl to db, increment archive usage """
-        await self.crawls.insert_one(crawl.to_dict())
+        if update_existing:
+            await self.crawls.find_one_and_replace(
+                {"_id": crawl.id}, crawl.to_dict(), upsert=True
+            )
+        else:
+            try:
+                await self.crawls.insert_one(crawl.to_dict())
+            except pymongo.errors.DuplicateKeyError:
+                print(f"Crawl Already Added: {crawl.id}")
+                return False

         dura = int((crawl.finished - crawl.started).total_seconds())

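The update_existing flag is what lets a partial or timed-out crawl record be overwritten when the real completion message arrives later. A self-contained stand-in for the two modes, using an in-memory dict in place of the MongoDB collection:

# In-memory sketch of store_crawl's two modes: upsert-and-replace vs.
# insert-once (duplicates skipped, mirroring the DuplicateKeyError branch).
crawls = {}

def store_crawl(crawl: dict, update_existing: bool = False) -> bool:
    cid = crawl["_id"]
    if update_existing:
        crawls[cid] = crawl              # like find_one_and_replace(..., upsert=True)
        return True
    if cid in crawls:                    # like catching pymongo DuplicateKeyError
        print(f"Crawl Already Added: {cid}")
        return False
    crawls[cid] = crawl                  # like insert_one
    return True

# A timed-out crawl recorded first, then overridden by the actual completion:
store_crawl({"_id": "crawl-1", "state": "timed_out"})
store_crawl({"_id": "crawl-1", "state": "complete"}, update_existing=True)
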
@@ -150,7 +159,7 @@ def init_crawls_api(app, mdb, crawl_manager, archives):
                     status_code=404, detail=f"Crawl not found: {crawl_id}"
                 )

-            await ops.handle_finished(crawl)
+            await ops.store_crawl(crawl)

         except HTTPException as httpe:
             raise httpe

@@ -182,7 +191,6 @@ def init_crawls_api(app, mdb, crawl_manager, archives):
         except Exception as exc:
             # pylint: disable=raise-missing-from
-            traceback.print_exc()
             raise HTTPException(status_code=400, detail=f"Error Stopping Crawl: {exc}")

         return {"stopped_gracefully": True}


@@ -37,17 +37,26 @@ class K8SManager:
         self.crawler_image = os.environ.get("CRAWLER_IMAGE")
         self.crawler_image_pull_policy = "IfNotPresent"

-        self.crawl_timeout = int(os.environ.get("CRAWL_TIMEOUT", "1000000"))
         self.crawl_retries = int(os.environ.get("CRAWL_RETRIES", "3"))

         self.loop = asyncio.get_running_loop()
-        self.loop.create_task(self.watch_job_loop())
+        self.loop.create_task(self.run_event_loop())

     def set_crawl_ops(self, ops):
         """ Set crawl ops handler """
         self.crawl_ops = ops

-    async def watch_job_loop(self):
+    async def run_event_loop(self):
+        """ Run the job watch loop, retry in case of failure"""
+        while True:
+            try:
+                await self.watch_events()
+            # pylint: disable=broad-except
+            except Exception as exc:
+                print(f"Retrying job loop: {exc}")
+                await asyncio.sleep(10)
+
+    async def watch_events(self):
         """ Get events for completed jobs"""
         async with watch.Watch().stream(
             self.core_api.list_namespaced_event,

@@ -62,12 +71,12 @@
                             self.handle_crawl_failed(obj.involved_object.name, "failed")
                         )

-                    # elif obj.reason == "DeadlineExceeded":
-                    #    self.loop.create_task(
-                    #        self.handle_crawl_failed(
-                    #            obj.involved_object.name, "timed_out"
-                    #        )
-                    #    )
+                    elif obj.reason == "DeadlineExceeded":
+                        self.loop.create_task(
+                            self.handle_crawl_failed(
+                                obj.involved_object.name, "timed_out"
+                            )
+                        )

                 # pylint: disable=broad-except
                 except Exception as exc:

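For reference, the two Job event reasons handled in this watcher correspond to crawl end states roughly as follows (a sketch inferred from the handle_crawl_failed calls; other event reasons are simply ignored):

# Event reason -> crawl state, as used by the watcher above (sketch).
JOB_FAILURE_STATES = {
    "BackoffLimitExceeded": "failed",    # crawler pod retries exhausted
    "DeadlineExceeded": "timed_out",     # Job hit activeDeadlineSeconds (crawlTimeout)
}
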
@@ -131,7 +140,7 @@
         extra_crawl_params = extra_crawl_params or []

         job_template = self._get_job_template(
-            cid, labels, annotations, extra_crawl_params
+            cid, labels, annotations, crawlconfig.crawlTimeout, extra_crawl_params
         )

         spec = client.V1beta1CronJobSpec(

@@ -205,6 +214,15 @@
             cron_job.spec.suspend = suspend
             changed = True

+        if (
+            crawlconfig.crawlTimeout
+            != cron_job.spec.job_template.spec.active_deadline_seconds
+        ):
+            cron_job.spec.job_template.spec.active_deadline_seconds = (
+                crawlconfig.crawlTimeout
+            )
+            changed = True
+
         if changed:
             cron_job.spec.job_template.metadata.annotations[
                 "btrix.run.schedule"

@@ -248,7 +266,11 @@
             field_selector="status.successful=0",
         )

-        return [self._make_crawl_for_job(job, "running") for job in jobs.items]
+        return [
+            self._make_crawl_for_job(job, "running")
+            for job in jobs.items
+            if job.status.active
+        ]

     async def validate_crawl_complete(self, crawlcomplete):
         """Ensure the crawlcomplete data is valid (job exists and user matches)

@@ -332,7 +354,7 @@
         crawl = self._make_crawl_for_job(job, reason, True)

-        await self.crawl_ops.handle_finished(crawl)
+        await self.crawl_ops.store_crawl(crawl)

         await self._delete_job(job_name)

@@ -360,7 +382,7 @@
         await self.batch_api.delete_namespaced_job(
             name=name,
             namespace=self.namespace,
-            grace_period_seconds=120,
+            grace_period_seconds=60,
             propagation_policy="Foreground",
         )

@@ -474,7 +496,9 @@
             body=job, namespace=self.namespace
         )

-    def _get_job_template(self, uid, labels, annotations, extra_crawl_params):
+    def _get_job_template(
+        self, uid, labels, annotations, crawl_timeout, extra_crawl_params
+    ):
         """Return crawl job template for crawl job, including labels, adding optiona crawl params"""
         command = ["crawl", "--config", "/tmp/crawl-config.json"]

@@ -556,7 +580,7 @@
             },
         }

-        if self.crawl_timeout > 0:
-            job_template["spec"]["activeDeadlineSeconds"] = self.crawl_timeout
+        if crawl_timeout > 0:
+            job_template["spec"]["activeDeadlineSeconds"] = crawl_timeout

         return job_template
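
Setting activeDeadlineSeconds is what makes the per-config timeout enforceable: once the deadline passes, Kubernetes terminates the Job's pods and emits the DeadlineExceeded event handled by the watcher above. Rough shape of the resulting spec fragment — only the timeout key comes from the diff, everything else is elided:

# Fragment of the generated Job template when crawlTimeout > 0 (sketch).
job_template = {
    "spec": {
        "activeDeadlineSeconds": 3600,   # crawlconfig.crawlTimeout, in seconds
    },
}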


@@ -41,9 +41,6 @@ crawler_pull_policy: "Never"
 crawler_namespace: "crawlers"

-# set 0 to disable timeout
-crawl_timeout: 0
-
 # num retries
 crawl_retries: 1