* backend: make crawlconfigs mutable! (#656)
- crawlconfig PATCH /{id} can now receive a new JSON config to replace the old one (in addition to scale, schedule, tags); see the sketch below
- exclusions: add / remove APIs mutate the current crawlconfig and do not result in a new crawlconfig being created
- exclusions: ensure crawl job 'config' is updated when exclusions are added/removed, unify add/remove exclusions on crawl
- k8s: crawlconfig json is updated along with scale
- k8s: stateful set is restarted by updating an annotation, instead of changing the template
- crawl object: now has 'config', as well as 'profileid', 'schedule', 'crawlTimeout', 'jobType' properties, to ensure anything that is changeable is stored on the crawl
- crawlconfigcore: store shared properties between crawl and crawlconfig in new crawlconfigcore (includes 'schedule', 'jobType', 'config', 'profileid', 'crawlTimeout', 'tags', 'oid')
- crawlconfig object: remove 'oldId', 'newId'; disallow deactivating/deleting while a crawl is running
- rename 'userid' -> 'createdBy'
- remove unused 'completions' field
- add missing return to fix /run response
- crawlout: ensure 'profileName' is resolved on CrawlOut from profileid
- crawlout: return 'name' instead of 'configName' for a consistent response
- update: 'modified', 'modifiedBy' fields set the modification date and the user modifying the config
- update: ensure PROFILE_FILENAME is updated in the configmap if profileid is provided, cleared if profileid == ""
- update: return 'settings_changed' and 'metadata_changed' indicating whether crawl settings or metadata changed
- tests: update tests to check settings_changed/metadata_changed return values

add revision tracking to crawlconfig:
- store each revision in a separate mongo db collection
- revisions accessible via /crawlconfigs/{cid}/revs
- store 'rev' int in crawlconfig and in crawljob
- only add revision history if the crawl config changed

migration:
- update to db v3
- copy fields from crawlconfig -> crawl
- rename userid -> createdBy
- copy userid -> modifiedBy, created -> modified
- skip invalid crawls (missing config), make createdBy optional (just in case)

frontend: Update crawl config keys with new API (#681)
- update frontend to use new PATCH endpoint
- load config from crawl object in details view

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Co-authored-by: sua yoo <sua@webrecorder.org>
Co-authored-by: sua yoo <sua@suayoo.com>
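A minimal sketch of how a client might exercise the new mutable-config flow. The base URL, route prefix, auth header, and config field names below are illustrative assumptions, not part of this change; only the PATCH-with-JSON-config semantics, the 'settings_changed'/'metadata_changed' response keys, and the /crawlconfigs/{cid}/revs route come from the changelog above.

import requests

# assumptions for illustration only: base URL, route prefix, and token
# are not defined by this change
API = "http://localhost:9871/api"
HEADERS = {"Authorization": "Bearer <access-token>"}
CID = "<crawlconfig-id>"

# PATCH now replaces the stored JSON config in place (no new crawlconfig
# is created), alongside other mutable fields such as schedule and tags
resp = requests.patch(
    f"{API}/crawlconfigs/{CID}",
    headers=HEADERS,
    json={
        "config": {  # example config body; field names are illustrative
            "seeds": ["https://example.com/"],
            "exclude": ["\\.pdf$"],
        },
        "schedule": "0 2 * * *",
        "tags": ["weekly"],
    },
)
result = resp.json()

# the response distinguishes changes to crawl settings from metadata-only edits
print(result.get("settings_changed"), result.get("metadata_changed"))

# settings changes are recorded as revisions and can be listed via the new endpoint
revs = requests.get(f"{API}/crawlconfigs/{CID}/revs", headers=HEADERS).json()
print(revs)

Since revision history is only recorded when the crawl config itself changes, a metadata-only edit would report 'metadata_changed' without adding a new entry under /revs.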
""" entry point for K8s crawl job which manages the stateful crawl """
 | 
						|
 | 
						|
import datetime
 | 
						|
from fastapi import FastAPI
 | 
						|
 | 
						|
from .utils import send_signal_to_pods
 | 
						|
from .base_job import K8SJobMixin
 | 
						|
 | 
						|
from ..crawl_job import CrawlJob
 | 
						|
 | 
						|
app = FastAPI()
 | 
						|
 | 
						|
 | 
						|
# =============================================================================
 | 
						|
class K8SCrawlJob(K8SJobMixin, CrawlJob):
 | 
						|
    """Crawl Job State"""
 | 
						|
 | 
						|
    async def _do_scale(self, new_scale):
 | 
						|
        crawl = await self._get_crawl()
 | 
						|
        if not crawl:
 | 
						|
            raise RuntimeError("crawl_not_found")
 | 
						|
 | 
						|
        # if making scale smaller, ensure existing crawlers saved their data
 | 
						|
        pods = []
 | 
						|
        for inx in range(new_scale, crawl.spec.replicas):
 | 
						|
            pods.append(
 | 
						|
                await self.core_api.read_namespaced_pod(
 | 
						|
                    name=f"crawl-{self.job_id}-{inx}",
 | 
						|
                    namespace=self.namespace,
 | 
						|
                )
 | 
						|
            )
 | 
						|
 | 
						|
        if pods:
 | 
						|
            await send_signal_to_pods(self.core_api_ws, self.namespace, pods, "SIGUSR2")
 | 
						|
 | 
						|
        crawl.spec.replicas = new_scale
 | 
						|
 | 
						|
        await self.apps_api.patch_namespaced_stateful_set(
 | 
						|
            name=crawl.metadata.name, namespace=self.namespace, body=crawl
 | 
						|
        )
 | 
						|
 | 
						|
    async def _rollover_restart(self):
 | 
						|
        """patch existing crawl statefulset with new timestamp to force restart"""
 | 
						|
        now = datetime.datetime.utcnow()
 | 
						|
        now = str(now.isoformat("T") + "Z")
 | 
						|
        patch_config = {
 | 
						|
            "spec": {
 | 
						|
                "template": {"metadata": {"annotations": {"btrix.restartedAt": now}}}
 | 
						|
            }
 | 
						|
        }
 | 
						|
 | 
						|
        await self.apps_api.patch_namespaced_stateful_set(
 | 
						|
            name=f"crawl-{self.job_id}", namespace=self.namespace, body=patch_config
 | 
						|
        )
 | 
						|
 | 
						|
        return {"success": True}
 | 
						|
 | 
						|
    async def _get_crawl(self):
 | 
						|
        try:
 | 
						|
            return await self.apps_api.read_namespaced_stateful_set(
 | 
						|
                name=f"crawl-{self.job_id}",
 | 
						|
                namespace=self.namespace,
 | 
						|
            )
 | 
						|
        # pylint: disable=bare-except
 | 
						|
        except:
 | 
						|
            return None
 | 
						|
 | 
						|
    def _get_crawl_scale(self, crawl):
 | 
						|
        """get scale from crawl, if any"""
 | 
						|
 | 
						|
        return crawl.spec.replicas if crawl else 0
 | 
						|
 | 
						|
    async def _send_shutdown_signal(self, signame):
 | 
						|
        pods = await self.core_api.list_namespaced_pod(
 | 
						|
            namespace=self.namespace,
 | 
						|
            label_selector=f"crawl={self.job_id},role=crawler",
 | 
						|
        )
 | 
						|
 | 
						|
        return await send_signal_to_pods(
 | 
						|
            self.core_api_ws, self.namespace, pods.items, signame
 | 
						|
        )
 | 
						|
 | 
						|
    # pylint: disable=line-too-long
 | 
						|
    @property
 | 
						|
    def redis_url(self):
 | 
						|
        return f"redis://redis-{self.job_id}-0.redis-{self.job_id}.{self.namespace}.svc.cluster.local/0"
 | 
						|
 | 
						|
 | 
						|
# ============================================================================
 | 
						|
@app.on_event("startup")
 | 
						|
async def startup():
 | 
						|
    """init on startup"""
 | 
						|
    job = K8SCrawlJob()
 | 
						|
    job.register_handlers(app)
 |