support overriding crawler image pull policy per channel (#2523)

- add 'imagePullPolicy' field to each crawler channel declaration
- if unset, defaults to the setting in the existing 'crawler_image_pull_policy' field

fixes #2522

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>

commit 62e47a8817
parent df8c80f3cc
@@ -85,6 +85,7 @@ class CrawlConfigOps:
 
     crawler_channels: CrawlerChannels
     crawler_images_map: dict[str, str]
+    crawler_image_pull_policy_map: dict[str, str]
 
     def __init__(
         self,
@@ -108,6 +109,9 @@ class CrawlConfigOps:
         self.coll_ops = cast(CollectionOps, None)
 
         self.default_filename_template = os.environ["DEFAULT_CRAWL_FILENAME_TEMPLATE"]
+        self.default_crawler_image_pull_policy = os.environ.get(
+            "DEFAULT_CRAWLER_IMAGE_PULL_POLICY", "IfNotPresent"
+        )
 
         self.router = APIRouter(
             prefix="/crawlconfigs",
@@ -118,6 +122,7 @@ class CrawlConfigOps:
         self._file_rx = re.compile("\\W+")
 
         self.crawler_images_map = {}
+        self.crawler_image_pull_policy_map = {}
         channels = []
         with open(os.environ["CRAWLER_CHANNELS_JSON"], encoding="utf-8") as fh:
             crawler_list = json.loads(fh.read())
@@ -125,6 +130,10 @@ class CrawlConfigOps:
                 channel = CrawlerChannel(**channel_data)
                 channels.append(channel)
                 self.crawler_images_map[channel.id] = channel.image
+                if channel.imagePullPolicy:
+                    self.crawler_image_pull_policy_map[channel.id] = (
+                        channel.imagePullPolicy
+                    )
 
             self.crawler_channels = CrawlerChannels(channels=channels)
 
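
To illustrate the channel-loading loop above: a minimal, self-contained sketch, assuming a simplified `CrawlerChannel` model and hypothetical `CRAWLER_CHANNELS_JSON` contents. Only channels that set `imagePullPolicy` land in the map; the rest fall through to the deploy-wide default.

```python
import json
from typing import Optional

from pydantic import BaseModel


class CrawlerChannel(BaseModel):
    # Simplified stand-in for the model extended in this diff
    id: str
    image: str
    imagePullPolicy: Optional[str] = None


# Hypothetical CRAWLER_CHANNELS_JSON contents
crawler_list = json.loads("""
[
  {"id": "default", "image": "docker.io/webrecorder/browsertrix-crawler:latest",
   "imagePullPolicy": "Always"},
  {"id": "testing", "image": "docker.io/webrecorder/browsertrix-crawler:x.y.z"}
]
""")

crawler_image_pull_policy_map: dict[str, str] = {}
for channel_data in crawler_list:
    channel = CrawlerChannel(**channel_data)
    if channel.imagePullPolicy:
        crawler_image_pull_policy_map[channel.id] = channel.imagePullPolicy

# "testing" sets no policy, so it is absent from the map
assert crawler_image_pull_policy_map == {"default": "Always"}
```
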
@@ -960,6 +969,15 @@ class CrawlConfigOps:
         """Get crawler image name by id"""
         return self.crawler_images_map.get(crawler_channel or "")
 
+    def get_channel_crawler_image_pull_policy(
+        self, crawler_channel: Optional[str]
+    ) -> str:
+        """Get crawler image pull policy by id"""
+        return (
+            self.crawler_image_pull_policy_map.get(crawler_channel or "")
+            or self.default_crawler_image_pull_policy
+        )
+
     def get_crawler_proxies_map(self) -> dict[str, CrawlerProxy]:
         """Load CrawlerProxy mapping from config"""
         proxies_last_update_path = os.environ["CRAWLER_PROXIES_LAST_UPDATE"]
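
A minimal sketch of the new getter's fallback behavior, using a hypothetical map and the `"IfNotPresent"` default set up in the constructor above:

```python
from typing import Optional

# Hypothetical state mirroring the attributes set up in __init__
crawler_image_pull_policy_map: dict[str, str] = {"default": "Always"}
default_crawler_image_pull_policy = "IfNotPresent"


def get_channel_crawler_image_pull_policy(crawler_channel: Optional[str]) -> str:
    # A per-channel policy wins; otherwise fall back to the deploy-wide default
    return (
        crawler_image_pull_policy_map.get(crawler_channel or "")
        or default_crawler_image_pull_policy
    )


assert get_channel_crawler_image_pull_policy("default") == "Always"
assert get_channel_crawler_image_pull_policy("testing") == "IfNotPresent"
assert get_channel_crawler_image_pull_policy(None) == "IfNotPresent"
```
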
@@ -33,6 +33,7 @@ class CrawlManager(K8sAPI):
         url: str,
         storage: StorageRef,
         crawler_image: str,
+        image_pull_policy: str,
         baseprofile: str = "",
         profile_filename: str = "",
         proxy_id: str = "",
@@ -57,6 +58,7 @@ class CrawlManager(K8sAPI):
             "vnc_password": secrets.token_hex(16),
             "expire_time": date_to_str(dt_now() + timedelta(seconds=30)),
             "crawler_image": crawler_image,
+            "image_pull_policy": image_pull_policy,
             "proxy_id": proxy_id or DEFAULT_PROXY_ID,
         }
 
@@ -603,6 +603,7 @@ class CrawlerChannel(BaseModel):
 
     id: str
     image: str
+    imagePullPolicy: Optional[str] = None
 
 
 # ============================================================================
@@ -279,6 +279,11 @@ class CrawlOperator(BaseOperator):
         )
 
         params["crawler_image"] = status.crawlerImage
+        pull_policy = self.crawl_config_ops.get_channel_crawler_image_pull_policy(
+            crawl.crawler_channel
+        )
+        if pull_policy:
+            params["crawler_image_pull_policy"] = pull_policy
 
         if crawl.proxy_id and not crawl.is_qa:
             proxy = self.crawl_config_ops.get_crawler_proxy(crawl.proxy_id)
@@ -45,6 +45,9 @@ class ProfileOperator(BaseOperator):
         params["storage_secret"] = storage_secret
         params["profile_filename"] = spec.get("profileFilename", "")
         params["crawler_image"] = spec["crawlerImage"]
+        pull_policy = spec.get("imagePullPolicy")
+        if pull_policy:
+            params["crawler_image_pull_policy"] = pull_policy
 
         proxy_id = spec.get("proxyId")
         if proxy_id:
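
The operator hunk above mirrors the profile_job.yaml template change later in this diff: the rendered spec carries `imagePullPolicy`, and the operator forwards it into `params` only when present. A small sketch with a hypothetical rendered spec:

```python
# Hypothetical spec, as rendered from the profile_job.yaml template in this diff
spec = {
    "crawlerImage": "docker.io/webrecorder/browsertrix-crawler:latest",
    "imagePullPolicy": "IfNotPresent",
}

params: dict[str, str] = {}
params["crawler_image"] = spec["crawlerImage"]

# Only set the param when the spec carries a policy, matching the hunk above
pull_policy = spec.get("imagePullPolicy")
if pull_policy:
    params["crawler_image_pull_policy"] = pull_policy

assert params["crawler_image_pull_policy"] == "IfNotPresent"
```
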
@@ -110,6 +110,10 @@ class ProfileOps:
         if not crawler_image:
             raise HTTPException(status_code=404, detail="crawler_not_found")
 
+        image_pull_policy = self.crawlconfigs.get_channel_crawler_image_pull_policy(
+            profile_launch.crawlerChannel
+        )
+
         # use either specified proxyId or if none, use proxyId from existing profile
         proxy_id = profile_launch.proxyId or prev_proxy_id
 
@@ -122,6 +126,7 @@ class ProfileOps:
             url=str(profile_launch.url),
             storage=org.storage,
             crawler_image=crawler_image,
+            image_pull_policy=image_pull_policy,
             baseprofile=prev_profile_id,
             profile_filename=prev_profile_path,
             proxy_id=proxy_id,
@@ -23,6 +23,7 @@ spec:
 
   storageName: "{{ storage_name }}"
   crawlerImage: "{{ crawler_image }}"
+  imagePullPolicy: "{{ image_pull_policy }}"
 
   startUrl: "{{ url }}"
   profileFilename: "{{ profile_filename }}"
@@ -22,10 +22,12 @@
 # crawler_channels:
 #   - id: default
 #     image: "docker.io/webrecorder/browsertrix-crawler:latest"
+#     imagePullPolicy: Always
 #
 #   # Add, remove, or edit additional crawler release channels for example:
 #   - id: custom_version
 #     image: "<DOCKER IMAGE>"
+#     imagePullPolicy: IfNotPresent  # optional
 
 # overrides to use existing images in local Docker, otherwise will pull from repository
 # backend_pull_policy: "Never"
@@ -34,6 +34,8 @@ data:
 
   DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"
 
+  DEFAULT_CRAWLER_IMAGE_PULL_POLICY: "{{ .Values.crawler_pull_policy }}"
+
   MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
 
   IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"
@@ -104,7 +104,7 @@ replica_deletion_delay_days: 0
 # API Image
 # =========================================
 backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.7"
-backend_pull_policy: "Always"
+backend_pull_policy: "IfNotPresent"
 
 backend_password_secret: "PASSWORD!"
 
@@ -162,7 +162,7 @@ backend_avg_memory_threshold: 95
 # Nginx Image
 # =========================================
 frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.7"
-frontend_pull_policy: "Always"
+frontend_pull_policy: "IfNotPresent"
 
 frontend_cpu: "10m"
 
@@ -237,12 +237,15 @@ redis_storage: "3Gi"
 crawler_channels:
   - id: default
     image: "docker.io/webrecorder/browsertrix-crawler:latest"
+    imagePullPolicy: Always
 
   # Add, remove, or edit additional crawler versions below, for example:
   # - id: custom_version
   #   image: "<DOCKER IMAGE>"
+  #   imagePullPolicy: Always|IfNotPresent|Never (optional, defaults to crawler_pull_policy)
 
-crawler_pull_policy: "Always"
+# default crawler pull policy if not set per channel
+crawler_pull_policy: "IfNotPresent"
 
 crawler_namespace: "crawlers"
 
@@ -18,6 +18,7 @@ The `crawler_channels` setting is used to specify the [_Crawler Release Channel_
 crawler_channels:
   - id: default
     image: "docker.io/webrecorder/browsertrix-crawler:latest"
+    imagePullPolicy: Always # optional
 ```
 
 This can be extended with additional channels. For example, here is what the value would look like adding a new x.y.z release of Browsertrix Crawler with the id `testing`:
@@ -28,8 +29,11 @@ crawler_channels:
     image: "docker.io/webrecorder/browsertrix-crawler:latest"
   - id: testing
     image: "docker.io/webrecorder/browsertrix-crawler:x.y.z"
+    imagePullPolicy: IfNotPresent
 ```
 
+The `imagePullPolicy` per channel is optional. If not set, the value set in `crawler_pull_policy` is used as the default.
+
 ## Storage
 
 The `storage` setting is used to specify primary and replica storage for a Browsertrix deployment. All configured storage options must be S3-compatible buckets. At minimum, there must be one configured storage option, which includes a `is_default_primary: true`.