docker manager: support scheduling with apscheduler and separate 'scheduler' process
parent 91e9fc8699
commit b417d7c185
@@ -96,6 +96,14 @@ class CrawlConfig(BaseMongoModel):
 
     crawlTimeout: Optional[int] = 0
 
+
+# ============================================================================
+class TriggerCrawl(BaseModel):
+    """ Crawl trigger from internal scheduler """
+
+    id: str
+    schedule: str
+
 # ============================================================================
 class CrawlOps:
     """Crawl Config Operations"""
@@ -170,8 +178,8 @@ class CrawlOps:
 
 
 # ============================================================================
-# pylint: disable=redefined-builtin,invalid-name
-def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
+# pylint: disable=redefined-builtin,invalid-name,too-many-locals
+def init_crawl_config_api(app, mdb, user_dep, archive_ops, crawl_manager):
     """Init /crawlconfigs api routes"""
     ops = CrawlOps(mdb, archive_ops, crawl_manager)
 
@@ -242,6 +250,13 @@ def init_crawl_config_api(mdb, user_dep, archive_ops, crawl_manager):
 
         return {"started": crawl_id}
 
+    @app.post("/crawls/trigger", tags=["crawlconfigs"])
+    async def trigger_crawl(trigger: TriggerCrawl):
+        await crawl_manager.run_crawl_config(
+            trigger.id, manual=False, schedule=trigger.schedule
+        )
+        return {}
+
     @router.delete("")
     async def delete_crawl_configs(archive: Archive = Depends(archive_crawl_dep)):
        result = await ops.delete_crawl_configs(archive)
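The new internal endpoint accepts the TriggerCrawl body and starts a scheduled (non-manual) crawl. A minimal sketch of how the scheduler process could call it, assuming aiohttp is available (it is pulled in by aiodocker); the client helper and the "backend:8000" host are assumptions, not part of this commit:

import aiohttp

async def fire_trigger(cid, schedule):
    # Hypothetical caller of the new /crawls/trigger endpoint.
    # "backend:8000" mirrors the WEBHOOK_URL host used elsewhere in this diff.
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "http://backend:8000/crawls/trigger",
            json={"id": cid, "schedule": schedule},
        ) as resp:
            return await resp.json()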
@@ -86,7 +86,7 @@ class CrawlOps:
         try:
             await self.crawls.insert_one(crawl.to_dict())
         except pymongo.errors.DuplicateKeyError:
-            print(f"Crawl Already Added: {crawl.id} - {crawl.state}")
+            # print(f"Crawl Already Added: {crawl.id} - {crawl.state}")
             return False
 
         dura = int((crawl.finished - crawl.started).total_seconds())
@@ -20,6 +20,7 @@ from crawls import Crawl
 class DockerManager:
     """ Docker Crawl Manager Interface"""
 
+    # pylint: disable=too-many-instance-attributes
     def __init__(self, archive_ops, extra_crawl_params=None):
         self.client = aiodocker.Docker()
 
@@ -66,23 +67,83 @@ class DockerManager:
             "btrix.user": userid,
             "btrix.archive": aid,
             "btrix.crawlconfig": cid,
-            "btrix.run.schedule": crawlconfig.schedule,
-            "btrix.run.manual": "1" if crawlconfig.runNow else "0",
             "btrix.coll": crawlconfig.config.collection,
         }
 
         # Create Config Volume
         volume = await self._create_volume(crawlconfig, labels)
 
-        await self._run_crawl_now(storage, labels, volume, self.extra_crawl_params)
+        if crawlconfig.schedule:
+            print("Scheduling...", flush=True)
+
+            await self._send_sched_msg(
+                {"type": "add", "id": crawlconfig.id, "schedule": crawlconfig.schedule}
+            )
+
+        if crawlconfig.runNow:
+            await self._run_crawl_now(
+                storage,
+                labels,
+                volume,
+            )
 
     async def update_crawl_config(self, crawlconfig):
-        """ Updating not supported for now (labels can not be altered) """
-        raise Exception("Unsupported")
-
-    async def run_crawl_config(self, cid):
+        """ Only updating the schedule + run now """
+
+        if crawlconfig.schedule:
+            print("Updating Schedule..", flush=True)
+
+            await self._send_sched_msg(
+                {"type": "add", "id": crawlconfig.id, "schedule": crawlconfig.schedule}
+            )
+        else:
+            await self._send_sched_msg(
+                {"type": "remove", "id": crawlconfig.id}
+            )
+
+        if crawlconfig.runNow:
+            await self.run_crawl_config(crawlconfig.id)
+
+    async def list_running_crawls(self, aid):
+        """ List running containers for this archive """
+        containers = await self._list_running_containers([f"btrix.archive={aid}"])
+
+        running = []
+
+        for container in containers:
+            full_container = await self.client.containers.get(container["Id"])
+            running.append(self._make_crawl_for_container(full_container, "running"))
+
+        return running
+
+    async def stop_crawl(self, crawl_id, aid, graceful=True):
+        """ Stop crawl, if not graceful, issue SIGUSR1 to indicate cancelation """
+        container = await self.client.containers.get(crawl_id)
+
+        if container["Config"]["Labels"]["btrix.archive"] != aid:
+            return None
+
+        if not graceful:
+            await container.kill(signal="SIGUSR1")
+            result = self._make_crawl_for_container(container, "canceled", True)
+        else:
+            result = True
+
+        await container.kill(signal="SIGTERM")
+
+        return result
+
+    async def run_crawl_config(self, cid, manual=True, schedule=""):
         """ Run crawl job for cron job based on specified crawlconfig id (cid) """
+
+        if not manual:
+            if await self._is_scheduled_crawl_for_config_running(cid):
+                print(
+                    f"Crawl for {cid} already running, not starting new crawl",
+                    flush=True,
+                )
+                return None
+
         volume_name = f"crawl-config-{cid}"
         volume_obj = aiodocker.docker.DockerVolume(self.client, volume_name)
 
@@ -95,15 +156,16 @@ class DockerManager:
         try:
             archive = await self.archive_ops.get_archive_by_id(labels["btrix.archive"])
             storage = archive.storage
 
         # pylint: disable=broad-except
         except Exception as exc:
             print(exc, flush=True)
             return None
 
         container = await self._run_crawl_now(
-            storage, labels, volume_name, self.extra_crawl_params
+            storage, labels, volume_name, schedule, manual
         )
-        return container["Id"]
+        return container["id"][:12]
 
     async def validate_crawl_complete(self, crawlcomplete):
         """Validate that crawl is valid by checking that container exists and label matches
@@ -189,16 +251,33 @@ class DockerManager:
         )
 
         for volume in resp["Volumes"]:
-            print(vol_obj, flush=True)
             vol_obj = aiodocker.docker.DockerVolume(self.client, volume["Name"])
-            await vol_obj.delete()
-
-    async def _run_crawl_now(self, storage, labels, volume, extra_crawl_params=None):
+
+            await self._send_sched_msg(
+                {"type": "remove", "id": volume["Labels"]["btrix.crawlconfig"]}
+            )
+
+            try:
+                await vol_obj.delete()
+            # pylint: disable=bare-except
+            except:
+                print("Warning: Volume Delete Failed, Container in Use", flush=True)
+
+    async def _send_sched_msg(self, msg):
+        reader, writer = await asyncio.open_connection("scheduler", 9017)
+        writer.write(json.dumps(msg).encode("utf-8") + b"\n")
+        await writer.drain()
+        await reader.readline()
+        writer.close()
+        await writer.wait_closed()
+
+    # pylint: disable=too-many-arguments
+    async def _run_crawl_now(self, storage, labels, volume, schedule="", manual=True):
         # Set Run Config
         command = ["crawl", "--config", "/tmp/crawlconfig/crawl-config.json"]
 
-        if extra_crawl_params:
-            command += extra_crawl_params
+        if self.extra_crawl_params:
+            command += self.extra_crawl_params
 
         endpoint_with_coll_url = os.path.join(
             storage.endpoint_url, "collections", labels["btrix.coll"] + "/"
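_send_sched_msg speaks a simple newline-delimited JSON protocol to the scheduler service on port 9017 and waits for a one-line acknowledgement before closing the connection. The two message shapes sent by this commit look like the following (the id and cron expression are only illustrative placeholders):

{"type": "add", "id": "<crawlconfig-id>", "schedule": "30 2 * * *"}
{"type": "remove", "id": "<crawlconfig-id>"}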
@@ -213,6 +292,9 @@ class DockerManager:
             "WEBHOOK_URL=http://backend:8000/crawls/done",
         ]
 
+        labels["btrix.run.schedule"] = schedule
+        labels["btrix.run.manual"] = "1" if manual else "0"
+
         run_config = {
             "Image": self.crawler_image,
             "Volumes": {volume: {}},
@@ -227,6 +309,18 @@ class DockerManager:
 
         return await self.client.containers.run(run_config)
 
+    async def _list_running_containers(self, labels):
+        results = await self.client.containers.list(
+            filters=json.dumps({"status": ["running"], "label": labels})
+        )
+        return results
+
+    async def _is_scheduled_crawl_for_config_running(self, cid):
+        results = await self._list_running_containers(
+            [f"btrix.crawlconfig={cid}", "btrix.run.manual=0"]
+        )
+        return len(results) > 0
+
     async def _handle_container_die(self, actor):
         """ Handle crawl container shutdown """
         container = await self.client.containers.get(actor["ID"])
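For a scheduled run, the dedup check above amounts to a Docker container-list query over the labels written by _run_crawl_now. A minimal sketch of the filter it builds, with a placeholder crawl-config id:

# Hypothetical illustration: filter built by _list_running_containers when
# _is_scheduled_crawl_for_config_running("abc123") checks for an active run.
filters = {
    "status": ["running"],
    "label": ["btrix.crawlconfig=abc123", "btrix.run.manual=0"],
}
# Passed to aiodocker as: client.containers.list(filters=json.dumps(filters))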
@@ -243,6 +337,7 @@ class DockerManager:
     ):
         """ Make a crawl object from a container data"""
         labels = container["Config"]["Labels"]
 
         return Crawl(
             id=container["Id"],
             state=state,
@@ -234,12 +234,15 @@ class K8SManager:
         if run_now:
             await self._create_run_now_job(cron_job)
 
-    async def run_crawl_config(self, cid):
+    async def run_crawl_config(self, cid, manual=True, schedule=""):
         """ Run crawl job for cron job based on specified crawlconfig id (cid) """
         cron_jobs = await self.batch_beta_api.list_namespaced_cron_job(
             namespace=self.namespace, label_selector=f"btrix.crawlconfig={cid}"
         )
 
+        if not manual or schedule:
+            raise Exception("Manual trigger not supported")
+
         if len(cron_jobs.items) != 1:
             raise Exception("Crawl Config Not Found")
@@ -73,9 +73,9 @@ class BrowsertrixAPI:
             self.crawl_manager = DockerManager(
                 self.archive_ops, self.default_crawl_params
             )
-            # raise Exception("Currently, only running in Kubernetes is supported")
 
         self.crawl_config_ops = init_crawl_config_api(
+            self.app,
             self.mdb,
             current_active_user,
             self.archive_ops,
@@ -4,3 +4,4 @@ loguru
 aiofiles
 kubernetes-asyncio
 aiodocker
+apscheduler
@@ -14,8 +14,20 @@ services:
       - ./config.env
 
     depends_on:
-      - mongo
       - minio
+      - mongo
+      - scheduler
+
+  scheduler:
+    build: ./backend
+    image: webrecorder/browsertrix-api
+    command: python -u scheduler.py
+
+    env_file:
+      - ./config.env
+
+    depends_on:
+      - mongo
 
   mongo:
     image: mongo
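The new scheduler service runs python -u scheduler.py, which is not included in the hunks shown here. Below is a minimal sketch of what such a process could look like, assuming it pairs APScheduler's AsyncIOScheduler with an asyncio socket server on port 9017 that consumes the add/remove messages from DockerManager._send_sched_msg and fires crawls through the new /crawls/trigger endpoint; the module layout, aiohttp client, and backend hostname are assumptions, not the committed implementation:

""" Hypothetical sketch of the scheduler process (not the committed scheduler.py). """
import asyncio
import json

import aiohttp
from apscheduler.schedulers.asyncio import AsyncIOScheduler
from apscheduler.triggers.cron import CronTrigger

scheduler = AsyncIOScheduler()


async def trigger_crawl(cid, schedule):
    # Call the /crawls/trigger endpoint added in this commit
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "http://backend:8000/crawls/trigger",
            json={"id": cid, "schedule": schedule},
        ) as resp:
            await resp.read()


async def handle_conn(reader, writer):
    # One JSON message per line, matching _send_sched_msg()
    line = await reader.readline()
    msg = json.loads(line)

    if msg["type"] == "add":
        # Register or replace a cron job for this crawl config
        scheduler.add_job(
            trigger_crawl,
            CronTrigger.from_crontab(msg["schedule"]),
            args=[msg["id"], msg["schedule"]],
            id=msg["id"],
            replace_existing=True,
        )
    elif msg["type"] == "remove":
        scheduler.remove_job(msg["id"])

    # Acknowledge so the caller's readline() returns
    writer.write(b"ok\n")
    await writer.drain()
    writer.close()


async def main():
    scheduler.start()
    server = await asyncio.start_server(handle_conn, "0.0.0.0", 9017)
    async with server:
        await server.serve_forever()


if __name__ == "__main__":
    asyncio.run(main())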