* add exclusion api, fixes #311

  Add new APIs: `POST crawls/{crawl_id}/exclusion?regex=...` and `DELETE crawls/{crawl_id}/exclusion?regex=...`, which will:

  - create a new config with 'regex' added as an exclusion (deleting or making inactive the previous config), OR remove 'regex' as an exclusion
  - update the crawl to point to the new config
  - update the statefulset to point to the new config, causing the crawler pods to restart
  - filter out URLs matching 'regex' from both the queue and the seen list (currently a bit slow; when adding only)
  - return 400 if the exclusion already exists when adding, or doesn't exist when removing
  - read the redis list in reverse, to match how the exclusion queue is used
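For illustration, here is a minimal sketch of how these two endpoints might be wired up with FastAPI (already a dependency of the backend, per the import in the manager below). The `CrawlOps` facade, its method names, and the error details are hypothetical stand-ins; only the routes, the `regex` query parameter, and the 400 behavior come from the description above:

```python
# Hypothetical sketch only: "CrawlOps" and its methods are illustrative
# stand-ins, not the actual Browsertrix handlers.
from typing import Optional

from fastapi import APIRouter, HTTPException

router = APIRouter()


class CrawlOps:
    """hypothetical facade over the config store + crawl manager"""

    async def add_exclusion(self, crawl_id: str, regex: str) -> Optional[str]:
        # 1. copy the crawl's config, adding 'regex' to its exclusions
        #    (return None if 'regex' is already an exclusion)
        # 2. delete or deactivate the old config; point the crawl at the new one
        # 3. repoint the statefulset, restarting the crawler pods
        # 4. filter 'regex' matches out of the queue and seen list, reading
        #    the redis list in reverse to match how the exclusion queue is used
        ...  # stubbed: returns None, so the route below 400s until implemented

    async def remove_exclusion(self, crawl_id: str, regex: str) -> Optional[str]:
        # same flow, with 'regex' removed from the exclusions instead
        # (return None if 'regex' is not currently an exclusion)
        ...


ops = CrawlOps()


@router.post("/crawls/{crawl_id}/exclusion")
async def add_exclusion(crawl_id: str, regex: str):
    new_cid = await ops.add_exclusion(crawl_id, regex)
    if not new_cid:
        raise HTTPException(status_code=400, detail="exclusion_already_exists")
    return {"new_cid": new_cid}


@router.delete("/crawls/{crawl_id}/exclusion")
async def remove_exclusion(crawl_id: str, regex: str):
    new_cid = await ops.remove_exclusion(crawl_id, regex)
    if not new_cid:
        raise HTTPException(status_code=400, detail="exclusion_not_found")
    return {"new_cid": new_cid}
```

The config swap in the middle steps maps onto `change_crawl_config()` in the manager below, which POSTs `/change_config/{new_cid}` to the running crawl job.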
| """ shared crawl manager implementation """
 | |
| 
 | |
| import os
 | |
| import asyncio
 | |
| import datetime
 | |
| 
 | |
| from abc import ABC, abstractmethod
 | |
| 
 | |
| from fastapi.templating import Jinja2Templates
 | |
| 
 | |
| from .utils import random_suffix
 | |
| from .db import resolve_db_url
 | |
| 
 | |
| 
 | |
| # ============================================================================
 | |
| class BaseCrawlManager(ABC):
 | |
|     """abstract crawl manager"""
 | |
| 
 | |
|     def __init__(self, templates):
 | |
|         super().__init__()
 | |
| 
 | |
|         self.job_image = os.environ["JOB_IMAGE"]
 | |
| 
 | |
|         self.no_delete_jobs = os.environ.get("NO_DELETE_JOBS", "0") != "0"
 | |
| 
 | |
|         self.crawler_node_type = os.environ.get("CRAWLER_NODE_TYPE", "")
 | |
| 
 | |
|         self.templates = Jinja2Templates(directory=templates)
 | |
| 
 | |
|         self.loop = asyncio.get_running_loop()
 | |
| 
 | |
|     # pylint: disable=too-many-arguments
 | |
|     async def run_profile_browser(
 | |
|         self,
 | |
|         userid,
 | |
|         aid,
 | |
|         url,
 | |
|         storage=None,
 | |
|         storage_name=None,
 | |
|         baseprofile=None,
 | |
|         profile_path=None,
 | |
|     ):
 | |
|         """run browser for profile creation"""
 | |
| 
 | |
|         # if default storage, use name and path + profiles/
 | |
|         if storage:
 | |
|             storage_name = storage.name
 | |
|             storage_path = storage.path + "profiles/"
 | |
|         # otherwise, use storage name and existing path from secret
 | |
|         else:
 | |
|             storage_path = ""
 | |
| 
 | |
|         await self.check_storage(storage_name)
 | |
| 
 | |
|         browserid = f"prf-{random_suffix()}"
 | |
| 
 | |
|         params = {
 | |
|             "id": browserid,
 | |
|             "userid": str(userid),
 | |
|             "aid": str(aid),
 | |
|             "job_image": self.job_image,
 | |
|             "storage_name": storage_name,
 | |
|             "storage_path": storage_path or "",
 | |
|             "baseprofile": baseprofile or "",
 | |
|             "profile_path": profile_path,
 | |
|             "url": url,
 | |
|             "env": os.environ,
 | |
|         }
 | |
| 
 | |
|         data = self.templates.env.get_template("profile_job.yaml").render(params)
 | |
| 
 | |
|         await self._create_from_yaml(f"job-{browserid}", data)
 | |
| 
 | |
|         return browserid
 | |
| 
 | |
|     async def add_crawl_config(
 | |
|         self,
 | |
|         crawlconfig,
 | |
|         storage,
 | |
|         run_now,
 | |
|         out_filename,
 | |
|         profile_filename,
 | |
|     ):
 | |
|         """add new crawl as cron job, store crawl config in configmap"""
 | |
| 
 | |
|         if storage.type == "default":
 | |
|             storage_name = storage.name
 | |
|             storage_path = storage.path
 | |
|         else:
 | |
|             storage_name = str(crawlconfig.aid)
 | |
|             storage_path = ""
 | |
| 
 | |
|         await self.check_storage(storage_name)
 | |
| 
 | |
|         # Create Config Map
 | |
|         await self._create_config_map(
 | |
|             crawlconfig,
 | |
|             STORE_PATH=storage_path,
 | |
|             STORE_FILENAME=out_filename,
 | |
|             STORAGE_NAME=storage_name,
 | |
|             USER_ID=str(crawlconfig.userid),
 | |
|             ARCHIVE_ID=str(crawlconfig.aid),
 | |
|             CRAWL_CONFIG_ID=str(crawlconfig.id),
 | |
|             PROFILE_FILENAME=profile_filename,
 | |
|         )
 | |
| 
 | |
|         crawl_id = None
 | |
| 
 | |
|         if run_now:
 | |
|             crawl_id = await self._create_manual_job(crawlconfig)
 | |
| 
 | |
|         await self._update_scheduled_job(crawlconfig)
 | |
| 
 | |
|         return crawl_id
 | |
| 
 | |
|     # pylint: disable=unused-argument
 | |
|     async def run_crawl_config(self, crawlconfig, userid=None):
 | |
|         """Run crawl job for cron job based on specified crawlconfig
 | |
|         optionally set different user"""
 | |
| 
 | |
|         return await self._create_manual_job(crawlconfig)
 | |
| 
 | |
|     async def update_crawlconfig_schedule_or_scale(
 | |
|         self, crawlconfig, scale=None, schedule=None
 | |
|     ):
 | |
|         """Update the schedule or scale for existing crawl config"""
 | |
| 
 | |
|         if schedule is not None:
 | |
|             await self._update_scheduled_job(crawlconfig)
 | |
| 
 | |
|         if scale is not None:
 | |
|             await self._update_config_initial_scale(crawlconfig, scale)
 | |
| 
 | |
|         return True
 | |
| 
 | |
|     async def shutdown_crawl(self, crawl_id, aid, graceful=True):
 | |
|         """Request a crawl cancelation or stop by calling an API
 | |
|         on the job pod/container, returning the result"""
 | |
|         return await self._post_to_job(
 | |
|             crawl_id, aid, "/stop" if graceful else "/cancel"
 | |
|         )
 | |
| 
 | |
|     async def scale_crawl(self, crawl_id, aid, scale=1):
 | |
|         """Set the crawl scale (job parallelism) on the specified job"""
 | |
| 
 | |
|         return await self._post_to_job(crawl_id, aid, f"/scale/{scale}")
 | |
| 
 | |
|     async def change_crawl_config(self, crawl_id, aid, new_cid):
 | |
|         """Change crawl config and restart"""
 | |
| 
 | |
|         return await self._post_to_job(crawl_id, aid, f"/change_config/{new_cid}")
 | |
| 
 | |
|     async def delete_crawl_configs_for_archive(self, archive):
 | |
|         """Delete all crawl configs for given archive"""
 | |
|         return await self._delete_crawl_configs(f"btrix.archive={archive}")
 | |
| 
 | |
|     async def delete_crawl_config_by_id(self, cid):
 | |
|         """Delete all crawl configs by id"""
 | |
|         return await self._delete_crawl_configs(f"btrix.crawlconfig={cid}")
 | |
| 
 | |
|     async def _create_manual_job(self, crawlconfig):
 | |
|         cid = str(crawlconfig.id)
 | |
|         ts_now = datetime.datetime.utcnow().strftime("%Y%m%d%H%M%S")
 | |
|         crawl_id = f"manual-{ts_now}-{cid[:12]}"
 | |
| 
 | |
|         data = await self._load_job_template(crawlconfig, crawl_id, manual=True)
 | |
| 
 | |
|         # create job directly
 | |
|         await self._create_from_yaml(f"job-{crawl_id}", data)
 | |
| 
 | |
|         return crawl_id
 | |
| 
 | |
|     async def _load_job_template(self, crawlconfig, job_id, manual, schedule=None):
 | |
|         params = {
 | |
|             "id": job_id,
 | |
|             "cid": str(crawlconfig.id),
 | |
|             "userid": str(crawlconfig.userid),
 | |
|             "aid": str(crawlconfig.aid),
 | |
|             "job_image": self.job_image,
 | |
|             "manual": "1" if manual else "0",
 | |
|             "crawler_node_type": self.crawler_node_type,
 | |
|             "schedule": schedule,
 | |
|             "env": os.environ,
 | |
|             "mongo_db_url": resolve_db_url(),
 | |
|         }
 | |
| 
 | |
|         return self.templates.env.get_template("crawl_job.yaml").render(params)
 | |
| 
 | |
|     async def _update_config_initial_scale(self, crawlconfig, scale):
 | |
|         """update initial scale in config, if needed (k8s only)"""
 | |
| 
 | |
|     @abstractmethod
 | |
|     async def check_storage(self, storage_name, is_default=False):
 | |
|         """check if given storage is valid"""
 | |
| 
 | |
|     @abstractmethod
 | |
|     async def _create_from_yaml(self, id_, yaml_data):
 | |
|         """check if given storage is valid"""
 | |
| 
 | |
|     @abstractmethod
 | |
|     async def _create_config_map(self, crawlconfig, **kwargs):
 | |
|         """create config map for config"""
 | |
| 
 | |
|     @abstractmethod
 | |
|     async def _update_scheduled_job(self, crawlconfig):
 | |
|         """update schedule on crawl job"""
 | |
| 
 | |
|     @abstractmethod
 | |
|     async def _post_to_job(self, crawl_id, aid, path, data=None):
 | |
|         """make a POST request to the container for specified crawl job"""
 | |
| 
 | |
|     @abstractmethod
 | |
|     async def _delete_crawl_configs(self, label):
 | |
|         """delete crawl configs by specified label"""
 |
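
All deployment-specific operations are abstract, so the class above carries only the shared template-rendering and naming logic, with `_update_config_initial_scale()` noting that at least one non-k8s backend exists. A minimal illustrative subclass (the class name and stub bodies are hypothetical; the real subclasses target an orchestrator such as Kubernetes) shows the contract a concrete manager must fulfill:

```python
# Illustrative stub subclass; instantiating it still requires the JOB_IMAGE
# env var, a templates directory, and a running asyncio event loop.
class LoggingCrawlManager(BaseCrawlManager):
    """hypothetical manager that just logs each operation"""

    async def check_storage(self, storage_name, is_default=False):
        return True

    async def _create_from_yaml(self, id_, yaml_data):
        print(f"create {id_} from rendered yaml ({len(yaml_data)} chars)")

    async def _create_config_map(self, crawlconfig, **kwargs):
        print(f"create config map for config {crawlconfig.id}")

    async def _update_scheduled_job(self, crawlconfig):
        print(f"update schedule for config {crawlconfig.id}")

    async def _post_to_job(self, crawl_id, aid, path, data=None):
        print(f"POST {path} to job for crawl {crawl_id}")

    async def _delete_crawl_configs(self, label):
        print(f"delete crawl configs matching label {label}")
```

In the exclusion flow from the commit message, the final step would run through `change_crawl_config(crawl_id, aid, new_cid)`, the hook by which the backend repoints a running crawl at the newly created config.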