support overriding crawler image pull policy per channel (#2523)

- add 'imagePullPolicy' field to each crawler channel declaration
- if unset, defaults to the setting in the existing 'crawler_image_pull_policy' field (see the sketch below)
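
For example, a channel can now carry its own pull policy in the Helm values, and channels without one fall back to the deployment-wide setting. A minimal sketch; the 'custom_version' id and placeholder image are illustrative:

```yaml
# deployment-wide default, used when a channel does not set imagePullPolicy
crawler_pull_policy: "IfNotPresent"

crawler_channels:
  - id: default
    image: "docker.io/webrecorder/browsertrix-crawler:latest"
    imagePullPolicy: Always        # per-channel override
  - id: custom_version
    image: "<DOCKER IMAGE>"
    # no imagePullPolicy here, so crawler_pull_policy applies
```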

fixes #2522

---------

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
Ilya Kreymer 2025-03-31 14:11:41 -07:00 committed by GitHub
parent df8c80f3cc
commit 62e47a8817
11 changed files with 49 additions and 3 deletions

View File

@@ -85,6 +85,7 @@ class CrawlConfigOps:
     crawler_channels: CrawlerChannels
     crawler_images_map: dict[str, str]
+    crawler_image_pull_policy_map: dict[str, str]

     def __init__(
         self,
@@ -108,6 +109,9 @@ class CrawlConfigOps:
         self.coll_ops = cast(CollectionOps, None)
         self.default_filename_template = os.environ["DEFAULT_CRAWL_FILENAME_TEMPLATE"]
+        self.default_crawler_image_pull_policy = os.environ.get(
+            "DEFAULT_CRAWLER_IMAGE_PULL_POLICY", "IfNotPresent"
+        )

         self.router = APIRouter(
             prefix="/crawlconfigs",
@@ -118,6 +122,7 @@ class CrawlConfigOps:
         self._file_rx = re.compile("\\W+")

         self.crawler_images_map = {}
+        self.crawler_image_pull_policy_map = {}
         channels = []
         with open(os.environ["CRAWLER_CHANNELS_JSON"], encoding="utf-8") as fh:
             crawler_list = json.loads(fh.read())
@@ -125,6 +130,10 @@ class CrawlConfigOps:
                 channel = CrawlerChannel(**channel_data)
                 channels.append(channel)
                 self.crawler_images_map[channel.id] = channel.image
+                if channel.imagePullPolicy:
+                    self.crawler_image_pull_policy_map[channel.id] = (
+                        channel.imagePullPolicy
+                    )

         self.crawler_channels = CrawlerChannels(channels=channels)
@@ -960,6 +969,15 @@ class CrawlConfigOps:
         """Get crawler image name by id"""
         return self.crawler_images_map.get(crawler_channel or "")

+    def get_channel_crawler_image_pull_policy(
+        self, crawler_channel: Optional[str]
+    ) -> str:
+        """Get crawler image pull policy by channel id"""
+        return (
+            self.crawler_image_pull_policy_map.get(crawler_channel or "")
+            or self.default_crawler_image_pull_policy
+        )
+
     def get_crawler_proxies_map(self) -> dict[str, CrawlerProxy]:
         """Load CrawlerProxy mapping from config"""
         proxies_last_update_path = os.environ["CRAWLER_PROXIES_LAST_UPDATE"]

View File

@@ -33,6 +33,7 @@ class CrawlManager(K8sAPI):
         url: str,
         storage: StorageRef,
         crawler_image: str,
+        image_pull_policy: str,
         baseprofile: str = "",
         profile_filename: str = "",
         proxy_id: str = "",
@@ -57,6 +58,7 @@
             "vnc_password": secrets.token_hex(16),
             "expire_time": date_to_str(dt_now() + timedelta(seconds=30)),
             "crawler_image": crawler_image,
+            "image_pull_policy": image_pull_policy,
             "proxy_id": proxy_id or DEFAULT_PROXY_ID,
         }

View File

@@ -603,6 +603,7 @@ class CrawlerChannel(BaseModel):
     id: str
     image: str
+    imagePullPolicy: Optional[str] = None

 # ============================================================================

View File

@@ -279,6 +279,11 @@ class CrawlOperator(BaseOperator):
             )
         params["crawler_image"] = status.crawlerImage
+        pull_policy = self.crawl_config_ops.get_channel_crawler_image_pull_policy(
+            crawl.crawler_channel
+        )
+        if pull_policy:
+            params["crawler_image_pull_policy"] = pull_policy

         if crawl.proxy_id and not crawl.is_qa:
             proxy = self.crawl_config_ops.get_crawler_proxy(crawl.proxy_id)

View File

@@ -45,6 +45,9 @@ class ProfileOperator(BaseOperator):
         params["storage_secret"] = storage_secret
         params["profile_filename"] = spec.get("profileFilename", "")
         params["crawler_image"] = spec["crawlerImage"]
+        pull_policy = spec.get("imagePullPolicy")
+        if pull_policy:
+            params["crawler_image_pull_policy"] = pull_policy

         proxy_id = spec.get("proxyId")
         if proxy_id:

View File

@@ -110,6 +110,10 @@ class ProfileOps:
         if not crawler_image:
             raise HTTPException(status_code=404, detail="crawler_not_found")

+        image_pull_policy = self.crawlconfigs.get_channel_crawler_image_pull_policy(
+            profile_launch.crawlerChannel
+        )
+
         # use either specified proxyId or if none, use proxyId from existing profile
         proxy_id = profile_launch.proxyId or prev_proxy_id
@@ -122,6 +126,7 @@ class ProfileOps:
             url=str(profile_launch.url),
             storage=org.storage,
             crawler_image=crawler_image,
+            image_pull_policy=image_pull_policy,
             baseprofile=prev_profile_id,
             profile_filename=prev_profile_path,
             proxy_id=proxy_id,

View File

@@ -23,6 +23,7 @@ spec:
   storageName: "{{ storage_name }}"
   crawlerImage: "{{ crawler_image }}"
+  imagePullPolicy: "{{ image_pull_policy }}"
   startUrl: "{{ url }}"
   profileFilename: "{{ profile_filename }}"

View File

@@ -22,10 +22,12 @@
 # crawler_channels:
 #   - id: default
 #     image: "docker.io/webrecorder/browsertrix-crawler:latest"
+#     imagePullPolicy: Always
 #
 # # Add, remove, or edit additional crawler release channels for example:
 #   - id: custom_version
 #     image: "<DOCKER IMAGE>"
+#     imagePullPolicy: IfNotPresent # optional

 # overrides to use existing images in local Docker, otherwise will pull from repository
 # backend_pull_policy: "Never"

View File

@@ -34,6 +34,8 @@ data:
   DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"

+  DEFAULT_CRAWLER_IMAGE_PULL_POLICY: "{{ .Values.crawler_pull_policy }}"
+
   MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"

   IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"

View File

@@ -104,7 +104,7 @@ replica_deletion_delay_days: 0
 # API Image
 # =========================================
 backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.7"
-backend_pull_policy: "Always"
+backend_pull_policy: "IfNotPresent"

 backend_password_secret: "PASSWORD!"
@@ -162,7 +162,7 @@ backend_avg_memory_threshold: 95
 # Nginx Image
 # =========================================
 frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.7"
-frontend_pull_policy: "Always"
+frontend_pull_policy: "IfNotPresent"

 frontend_cpu: "10m"
@@ -237,12 +237,15 @@ redis_storage: "3Gi"
 crawler_channels:
   - id: default
     image: "docker.io/webrecorder/browsertrix-crawler:latest"
+    imagePullPolicy: Always

 # Add, remove, or edit additional crawler versions below, for example:
 #   - id: custom_version
 #     image: "<DOCKER IMAGE>"
+#     imagePullPolicy: Always|IfNotPresent|Never (optional, defaults to crawler_pull_policy)

-crawler_pull_policy: "Always"
+# default crawler pull policy if not set per channel
+crawler_pull_policy: "IfNotPresent"

 crawler_namespace: "crawlers"

View File

@@ -18,6 +18,7 @@ The `crawler_channels` setting is used to specify the [_Crawler Release Channel_
 crawler_channels:
   - id: default
     image: "docker.io/webrecorder/browsertrix-crawler:latest"
+    imagePullPolicy: Always # optional
 ```

 This can be extended with additional channels. For example, here is what the value would look like adding a new x.y.z release of Browsertrix Crawler with the id `testing`:
@@ -28,8 +29,11 @@
     image: "docker.io/webrecorder/browsertrix-crawler:latest"
   - id: testing
     image: "docker.io/webrecorder/browsertrix-crawler:x.y.z"
+    imagePullPolicy: IfNotPresent
 ```

+The `imagePullPolicy` per channel is optional. If not set, the value set in `crawler_pull_policy` is used as the default.
+
 ## Storage

 The `storage` setting is used to specify primary and replica storage for a Browsertrix deployment. All configured storage options must be S3-compatible buckets. At minimum, there must be one configured storage option, which includes a `is_default_primary: true`.
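
To illustrate the fallback documented above, here is a minimal values sketch: only the `testing` channel sets `imagePullPolicy`, so `default` inherits the deployment-wide `crawler_pull_policy` (the `Always` value shown for it is illustrative):

```yaml
crawler_pull_policy: "Always"       # used by channels that set no imagePullPolicy

crawler_channels:
  - id: default
    image: "docker.io/webrecorder/browsertrix-crawler:latest"
    # no imagePullPolicy, so this channel pulls with "Always" via the fallback
  - id: testing
    image: "docker.io/webrecorder/browsertrix-crawler:x.y.z"
    imagePullPolicy: IfNotPresent   # per-channel override
```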