diff --git a/backend/btrixcloud/crawlconfigs.py b/backend/btrixcloud/crawlconfigs.py index ab449bd8..0f66f03e 100644 --- a/backend/btrixcloud/crawlconfigs.py +++ b/backend/btrixcloud/crawlconfigs.py @@ -85,6 +85,7 @@ class CrawlConfigOps: crawler_channels: CrawlerChannels crawler_images_map: dict[str, str] + crawler_image_pull_policy_map: dict[str, str] def __init__( self, @@ -108,6 +109,9 @@ class CrawlConfigOps: self.coll_ops = cast(CollectionOps, None) self.default_filename_template = os.environ["DEFAULT_CRAWL_FILENAME_TEMPLATE"] + self.default_crawler_image_pull_policy = os.environ.get( + "DEFAULT_CRAWLER_IMAGE_PULL_POLICY", "IfNotPresent" + ) self.router = APIRouter( prefix="/crawlconfigs", @@ -118,6 +122,7 @@ class CrawlConfigOps: self._file_rx = re.compile("\\W+") self.crawler_images_map = {} + self.crawler_image_pull_policy_map = {} channels = [] with open(os.environ["CRAWLER_CHANNELS_JSON"], encoding="utf-8") as fh: crawler_list = json.loads(fh.read()) @@ -125,6 +130,10 @@ class CrawlConfigOps: channel = CrawlerChannel(**channel_data) channels.append(channel) self.crawler_images_map[channel.id] = channel.image + if channel.imagePullPolicy: + self.crawler_image_pull_policy_map[channel.id] = ( + channel.imagePullPolicy + ) self.crawler_channels = CrawlerChannels(channels=channels) @@ -960,6 +969,15 @@ class CrawlConfigOps: """Get crawler image name by id""" return self.crawler_images_map.get(crawler_channel or "") + def get_channel_crawler_image_pull_policy( + self, crawler_channel: Optional[str] + ) -> str: + """Get crawler image pull policy for channel id, or default if not set""" + return ( + self.crawler_image_pull_policy_map.get(crawler_channel or "") + or self.default_crawler_image_pull_policy + ) + def get_crawler_proxies_map(self) -> dict[str, CrawlerProxy]: """Load CrawlerProxy mapping from config""" proxies_last_update_path = os.environ["CRAWLER_PROXIES_LAST_UPDATE"] diff --git a/backend/btrixcloud/crawlmanager.py b/backend/btrixcloud/crawlmanager.py index 9d4b190e..1412cd67 100644 --- 
a/backend/btrixcloud/crawlmanager.py +++ b/backend/btrixcloud/crawlmanager.py @@ -33,6 +33,7 @@ class CrawlManager(K8sAPI): url: str, storage: StorageRef, crawler_image: str, + image_pull_policy: str, baseprofile: str = "", profile_filename: str = "", proxy_id: str = "", @@ -57,6 +58,7 @@ class CrawlManager(K8sAPI): "vnc_password": secrets.token_hex(16), "expire_time": date_to_str(dt_now() + timedelta(seconds=30)), "crawler_image": crawler_image, + "image_pull_policy": image_pull_policy, "proxy_id": proxy_id or DEFAULT_PROXY_ID, } diff --git a/backend/btrixcloud/models.py b/backend/btrixcloud/models.py index 7af5d49c..25014e2a 100644 --- a/backend/btrixcloud/models.py +++ b/backend/btrixcloud/models.py @@ -603,6 +603,7 @@ class CrawlerChannel(BaseModel): id: str image: str + imagePullPolicy: Optional[str] = None # ============================================================================ diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index daebad02..aa8e152e 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -279,6 +279,11 @@ class CrawlOperator(BaseOperator): ) params["crawler_image"] = status.crawlerImage + pull_policy = self.crawl_config_ops.get_channel_crawler_image_pull_policy( + crawl.crawler_channel + ) + if pull_policy: + params["crawler_image_pull_policy"] = pull_policy if crawl.proxy_id and not crawl.is_qa: proxy = self.crawl_config_ops.get_crawler_proxy(crawl.proxy_id) diff --git a/backend/btrixcloud/operator/profiles.py b/backend/btrixcloud/operator/profiles.py index 071fbb38..5f1908f5 100644 --- a/backend/btrixcloud/operator/profiles.py +++ b/backend/btrixcloud/operator/profiles.py @@ -45,6 +45,9 @@ class ProfileOperator(BaseOperator): params["storage_secret"] = storage_secret params["profile_filename"] = spec.get("profileFilename", "") params["crawler_image"] = spec["crawlerImage"] + pull_policy = spec.get("imagePullPolicy") + if pull_policy: + 
params["crawler_image_pull_policy"] = pull_policy proxy_id = spec.get("proxyId") if proxy_id: diff --git a/backend/btrixcloud/profiles.py b/backend/btrixcloud/profiles.py index cd5391f2..06010551 100644 --- a/backend/btrixcloud/profiles.py +++ b/backend/btrixcloud/profiles.py @@ -110,6 +110,10 @@ class ProfileOps: if not crawler_image: raise HTTPException(status_code=404, detail="crawler_not_found") + image_pull_policy = self.crawlconfigs.get_channel_crawler_image_pull_policy( + profile_launch.crawlerChannel + ) + # use either specified proxyId or if none, use proxyId from existing profile proxy_id = profile_launch.proxyId or prev_proxy_id @@ -122,6 +126,7 @@ class ProfileOps: url=str(profile_launch.url), storage=org.storage, crawler_image=crawler_image, + image_pull_policy=image_pull_policy, baseprofile=prev_profile_id, profile_filename=prev_profile_path, proxy_id=proxy_id, diff --git a/chart/app-templates/profile_job.yaml b/chart/app-templates/profile_job.yaml index fc6f61fb..24d67cc9 100644 --- a/chart/app-templates/profile_job.yaml +++ b/chart/app-templates/profile_job.yaml @@ -23,6 +23,7 @@ spec: storageName: "{{ storage_name }}" crawlerImage: "{{ crawler_image }}" + imagePullPolicy: "{{ image_pull_policy }}" startUrl: "{{ url }}" profileFilename: "{{ profile_filename }}" diff --git a/chart/examples/local-config.yaml b/chart/examples/local-config.yaml index 8a1c0602..2d3b25a9 100644 --- a/chart/examples/local-config.yaml +++ b/chart/examples/local-config.yaml @@ -22,10 +22,12 @@ # crawler_channels: # - id: default # image: "docker.io/webrecorder/browsertrix-crawler:latest" +# imagePullPolicy: Always # # # Add, remove, or edit additional crawler release channels for example: # - id: custom_version # image: "" +# imagePullPolicy: IfNotPresent # optional # overrides to use existing images in local Docker, otherwise will pull from repository # backend_pull_policy: "Never" diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 
9fd4188e..6e8cb405 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -34,6 +34,8 @@ data: DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}" + DEFAULT_CRAWLER_IMAGE_PULL_POLICY: "{{ .Values.crawler_pull_policy }}" + MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}" IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}" diff --git a/chart/values.yaml b/chart/values.yaml index 4d3a5556..fa82dfee 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -104,7 +104,7 @@ replica_deletion_delay_days: 0 # API Image # ========================================= backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.7" -backend_pull_policy: "Always" +backend_pull_policy: "IfNotPresent" backend_password_secret: "PASSWORD!" @@ -162,7 +162,7 @@ backend_avg_memory_threshold: 95 # Nginx Image # ========================================= frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.7" -frontend_pull_policy: "Always" +frontend_pull_policy: "IfNotPresent" frontend_cpu: "10m" @@ -237,12 +237,15 @@ redis_storage: "3Gi" crawler_channels: - id: default image: "docker.io/webrecorder/browsertrix-crawler:latest" + imagePullPolicy: Always # Add, remove, or edit additional crawler versions below, for example: # - id: custom_version # image: "" + # imagePullPolicy: Always|IfNotPresent|Never (optional, defaults to crawler_pull_policy) -crawler_pull_policy: "Always" +# default crawler pull policy if not set per channel +crawler_pull_policy: "IfNotPresent" crawler_namespace: "crawlers" diff --git a/frontend/docs/docs/deploy/customization.md b/frontend/docs/docs/deploy/customization.md index 66260caa..b4753a48 100644 --- a/frontend/docs/docs/deploy/customization.md +++ b/frontend/docs/docs/deploy/customization.md @@ -18,6 +18,7 @@ The `crawler_channels` setting is used to specify the [_Crawler Release Channel_ crawler_channels: - id: default image: 
"docker.io/webrecorder/browsertrix-crawler:latest" + imagePullPolicy: Always # optional ``` This can be extended with additional channels. For example, here is what the value would look like adding a new x.y.z release of Browsertrix Crawler with the id `testing`: @@ -28,8 +29,11 @@ crawler_channels: image: "docker.io/webrecorder/browsertrix-crawler:latest" - id: testing image: "docker.io/webrecorder/browsertrix-crawler:x.y.z" + imagePullPolicy: IfNotPresent ``` +The `imagePullPolicy` per channel is optional. If not set, the value set in `crawler_pull_policy` is used as the default. + ## Storage The `storage` setting is used to specify primary and replica storage for a Browsertrix deployment. All configured storage options must be S3-compatible buckets. At minimum, there must be one configured storage option, which includes a `is_default_primary: true`.