* k8s: add tolerations for the 'nodeType=crawling:NoSchedule' taint to the crawler and profile jobs and statefulsets, so that crawling can be scheduled on designated nodes (an equivalent is sketched below)
* add node affinity for 'nodeType=crawling' to the crawling and profile browser statefulsets
* refactor crawljob: combine the crawl_updater logic into the base crawl_job
* increment the new 'crawlAttemptCount' counter on the crawlconfig when a crawl is started, not only when it finishes, to avoid deleting configs that have attempted but unfinished crawls (see the sketch below)
* better external MongoDB support: use MONGO_DB_URL to set a custom connection URL directly; otherwise build the URL from the username, password, and mongo host (see the sketch below)
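For illustration only: the toleration and node affinity in the first two items are defined in the deployment's YAML templates rather than in Python, but the same scheduling intent can be expressed with the official kubernetes Python client as in the sketch below (whether the chart uses required or preferred node affinity is an assumption here):

```python
# Sketch only: the real settings live in the chart's YAML templates; this shows
# the equivalent objects via the official kubernetes Python client for clarity.
from kubernetes import client

# tolerate the 'nodeType=crawling:NoSchedule' taint so crawler/profile pods
# may be scheduled onto the tainted crawling nodes
crawling_toleration = client.V1Toleration(
    key="nodeType",
    operator="Equal",
    value="crawling",
    effect="NoSchedule",
)

# steer those pods toward nodes labeled 'nodeType=crawling'
# (required affinity assumed here; the chart may use preferred instead)
crawling_affinity = client.V1Affinity(
    node_affinity=client.V1NodeAffinity(
        required_during_scheduling_ignored_during_execution=client.V1NodeSelector(
            node_selector_terms=[
                client.V1NodeSelectorTerm(
                    match_expressions=[
                        client.V1NodeSelectorRequirement(
                            key="nodeType", operator="In", values=["crawling"]
                        )
                    ]
                )
            ]
        )
    )
)
```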
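A minimal sketch of the 'crawlAttemptCount' bump, assuming a Motor (async MongoDB) collection holding the crawl configs; the helper and collection names are hypothetical, only the field name comes from the notes above:

```python
async def inc_crawl_attempt(crawl_configs, cid):
    """Record that a crawl was started for config `cid`, even if it never finishes."""
    # $inc creates the counter at 1 on first use, then increments it; deletion
    # logic can check this field so attempted-but-unfinished configs are kept
    await crawl_configs.find_one_and_update(
        {"_id": cid}, {"$inc": {"crawlAttemptCount": 1}}
    )
```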
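The MongoDB URL fallback from the last item could look roughly like this; MONGO_DB_URL is named above, but the other environment variable names, the default port, and the function name are assumptions:

```python
import os
from urllib.parse import quote_plus


def resolve_mongo_url():
    """Use MONGO_DB_URL verbatim if set, otherwise assemble a URL from parts."""
    db_url = os.environ.get("MONGO_DB_URL")
    if db_url:
        return db_url

    # hypothetical names for the username/password/host settings
    user = quote_plus(os.environ.get("MONGO_INITDB_ROOT_USERNAME", "root"))
    password = quote_plus(os.environ.get("MONGO_INITDB_ROOT_PASSWORD", ""))
    host = os.environ.get("MONGO_HOST", "localhost")
    return f"mongodb://{user}:{password}@{host}:27017"
```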
""" entry point for K8s crawl job which manages the stateful crawl """
|
|
|
|
import asyncio
|
|
import os
|
|
|
|
from fastapi import FastAPI
|
|
|
|
from .utils import (
|
|
ping_containers,
|
|
get_service,
|
|
delete_swarm_stack,
|
|
run_swarm_stack,
|
|
)
|
|
|
|
from .base_job import SwarmJobMixin
|
|
from ..crawl_job import CrawlJob
|
|
|
|
|
|
app = FastAPI()
|
|
|
|
|
|
# =============================================================================
|
|
class SwarmCrawlJob(SwarmJobMixin, CrawlJob):
|
|
""" Crawl Job """
|
|
|
|
def _add_extra_crawl_template_params(self, params):
|
|
""" add extra params, if any, for crawl template """
|
|
params["userid"] = os.environ.get("USER_ID")
|
|
params["storage_filename"] = os.environ.get("STORE_FILENAME")
|
|
params["storage_path"] = os.environ.get("STORE_PATH")
|
|
|
|
async def _do_scale(self, new_scale):
|
|
loop = asyncio.get_running_loop()
|
|
|
|
# if making scale smaller, ensure existing crawlers saved their data
|
|
if new_scale < self.scale:
|
|
# ping for final exit
|
|
for num in range(self.scale, new_scale, -1):
|
|
num = num - 1
|
|
service_id = f"crawl-{self.job_id}-{num}_crawler"
|
|
await loop.run_in_executor(None, ping_containers, service_id, "SIGUSR1")
|
|
|
|
# delete
|
|
await self._do_delete_replicas(loop, new_scale, self.scale)
|
|
|
|
if new_scale > self.scale:
|
|
# create new stacks
|
|
params = {}
|
|
params.update(self._cached_params)
|
|
|
|
for num in range(self.scale, new_scale):
|
|
stack_id = f"{self.prefix}{self.job_id}-{num}"
|
|
params["index"] = num
|
|
data = self.templates.env.get_template("crawler.yaml").render(params)
|
|
await loop.run_in_executor(None, run_swarm_stack, stack_id, data)
|
|
|
|
return True
|
|
|
|
async def _get_crawl(self):
|
|
loop = asyncio.get_running_loop()
|
|
return await loop.run_in_executor(
|
|
None, get_service, f"crawl-{self.job_id}-0_crawler"
|
|
)
|
|
|
|
async def _send_shutdown_signal(self, graceful=True):
|
|
loop = asyncio.get_running_loop()
|
|
|
|
for num in range(0, self.scale):
|
|
name = f"crawl-{self.job_id}-{num}_crawler"
|
|
sig = "SIGABRT" if not graceful else "SIGINT"
|
|
print(f"Sending {sig} to {name}", flush=True)
|
|
await loop.run_in_executor(None, ping_containers, name, sig)
|
|
|
|
# pylint: disable=line-too-long
|
|
@property
|
|
def redis_url(self):
|
|
return f"redis://crawl-{self.job_id}-0_redis/0"
|
|
|
|
async def _do_create(self, loop, template, params):
|
|
scale = params.get("scale", 1)
|
|
|
|
self._cached_params = params
|
|
|
|
for num in range(0, scale):
|
|
stack_id = f"{self.prefix}{self.job_id}-{num}"
|
|
params["index"] = num
|
|
data = self.templates.env.get_template(template).render(params)
|
|
await loop.run_in_executor(None, run_swarm_stack, stack_id, data)
|
|
|
|
async def _do_delete(self, loop):
|
|
await self._do_delete_replicas(loop, 0, self.scale)
|
|
|
|
async def _do_delete_replicas(self, loop, start, end):
|
|
# volumes = []
|
|
|
|
for num in range(end, start, -1):
|
|
num = num - 1
|
|
stack_id = f"{self.prefix}{self.job_id}-{num}"
|
|
await loop.run_in_executor(None, delete_swarm_stack, stack_id)
|
|
|
|
# volumes.append(f"crawl-{self.job_id}-{num}")
|
|
|
|
# likely fails as containers still shutting down
|
|
# await loop.run_in_executor(None, delete_volumes, volumes)
|
|
|
|
|
|
# ============================================================================
|
|
@app.on_event("startup")
|
|
async def startup():
|
|
"""init on startup"""
|
|
job = SwarmCrawlJob()
|
|
job.register_handlers(app)
|