support overriding crawler image pull policy per channel (#2523)
- Add an `imagePullPolicy` field to each crawler channel declaration.
- If unset, defaults to the setting in the existing `crawler_image_pull_policy` field.

Fixes #2522

Co-authored-by: Tessa Walsh <tessa@bitarchivist.net>
parent df8c80f3cc
commit 62e47a8817
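In chart-values terms, the change boils down to the shape sketched below. This is a summary assembled from the `values.yaml` and docs hunks that follow, not an excerpt from either file; the `custom_version` id and `<DOCKER IMAGE>` placeholder are the examples those hunks use:

```yaml
crawler_channels:
  - id: default
    image: "docker.io/webrecorder/browsertrix-crawler:latest"
    imagePullPolicy: Always    # new: optional per-channel override
  # - id: custom_version
  #   image: "<DOCKER IMAGE>"
  #   # no imagePullPolicy set: falls back to crawler_pull_policy below

# deployment-wide default, used when a channel does not set imagePullPolicy
crawler_pull_policy: "IfNotPresent"
```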
@@ -85,6 +85,7 @@ class CrawlConfigOps:
 
     crawler_channels: CrawlerChannels
     crawler_images_map: dict[str, str]
+    crawler_image_pull_policy_map: dict[str, str]
 
     def __init__(
         self,
@@ -108,6 +109,9 @@ class CrawlConfigOps:
         self.coll_ops = cast(CollectionOps, None)
 
         self.default_filename_template = os.environ["DEFAULT_CRAWL_FILENAME_TEMPLATE"]
+        self.default_crawler_image_pull_policy = os.environ.get(
+            "DEFAULT_CRAWLER_IMAGE_PULL_POLICY", "IfNotPresent"
+        )
 
         self.router = APIRouter(
             prefix="/crawlconfigs",
@@ -118,6 +122,7 @@
         self._file_rx = re.compile("\\W+")
 
         self.crawler_images_map = {}
+        self.crawler_image_pull_policy_map = {}
         channels = []
         with open(os.environ["CRAWLER_CHANNELS_JSON"], encoding="utf-8") as fh:
             crawler_list = json.loads(fh.read())
@@ -125,6 +130,10 @@
             channel = CrawlerChannel(**channel_data)
             channels.append(channel)
             self.crawler_images_map[channel.id] = channel.image
+            if channel.imagePullPolicy:
+                self.crawler_image_pull_policy_map[channel.id] = (
+                    channel.imagePullPolicy
+                )
 
         self.crawler_channels = CrawlerChannels(channels=channels)
 
@@ -960,6 +969,15 @@
         """Get crawler image name by id"""
         return self.crawler_images_map.get(crawler_channel or "")
 
+    def get_channel_crawler_image_pull_policy(
+        self, crawler_channel: Optional[str]
+    ) -> str:
+        """Get crawler image pull policy by id"""
+        return (
+            self.crawler_image_pull_policy_map.get(crawler_channel or "")
+            or self.default_crawler_image_pull_policy
+        )
+
     def get_crawler_proxies_map(self) -> dict[str, CrawlerProxy]:
         """Load CrawlerProxy mapping from config"""
         proxies_last_update_path = os.environ["CRAWLER_PROXIES_LAST_UPDATE"]
@@ -33,6 +33,7 @@ class CrawlManager(K8sAPI):
         url: str,
         storage: StorageRef,
         crawler_image: str,
+        image_pull_policy: str,
         baseprofile: str = "",
         profile_filename: str = "",
         proxy_id: str = "",
@@ -57,6 +58,7 @@
             "vnc_password": secrets.token_hex(16),
             "expire_time": date_to_str(dt_now() + timedelta(seconds=30)),
             "crawler_image": crawler_image,
+            "image_pull_policy": image_pull_policy,
             "proxy_id": proxy_id or DEFAULT_PROXY_ID,
         }
 
@@ -603,6 +603,7 @@ class CrawlerChannel(BaseModel):
 
     id: str
     image: str
+    imagePullPolicy: Optional[str] = None
 
 
 # ============================================================================
@@ -279,6 +279,11 @@ class CrawlOperator(BaseOperator):
         )
 
         params["crawler_image"] = status.crawlerImage
+        pull_policy = self.crawl_config_ops.get_channel_crawler_image_pull_policy(
+            crawl.crawler_channel
+        )
+        if pull_policy:
+            params["crawler_image_pull_policy"] = pull_policy
 
         if crawl.proxy_id and not crawl.is_qa:
             proxy = self.crawl_config_ops.get_crawler_proxy(crawl.proxy_id)
@@ -45,6 +45,9 @@ class ProfileOperator(BaseOperator):
         params["storage_secret"] = storage_secret
         params["profile_filename"] = spec.get("profileFilename", "")
         params["crawler_image"] = spec["crawlerImage"]
+        pull_policy = spec.get("imagePullPolicy")
+        if pull_policy:
+            params["crawler_image_pull_policy"] = pull_policy
 
         proxy_id = spec.get("proxyId")
         if proxy_id:
@@ -110,6 +110,10 @@ class ProfileOps:
         if not crawler_image:
             raise HTTPException(status_code=404, detail="crawler_not_found")
 
+        image_pull_policy = self.crawlconfigs.get_channel_crawler_image_pull_policy(
+            profile_launch.crawlerChannel
+        )
+
         # use either specified proxyId or if none, use proxyId from existing profile
         proxy_id = profile_launch.proxyId or prev_proxy_id
 
@@ -122,6 +126,7 @@
             url=str(profile_launch.url),
             storage=org.storage,
             crawler_image=crawler_image,
+            image_pull_policy=image_pull_policy,
             baseprofile=prev_profile_id,
             profile_filename=prev_profile_path,
             proxy_id=proxy_id,
@@ -23,6 +23,7 @@ spec:
 
   storageName: "{{ storage_name }}"
   crawlerImage: "{{ crawler_image }}"
+  imagePullPolicy: "{{ image_pull_policy }}"
 
   startUrl: "{{ url }}"
   profileFilename: "{{ profile_filename }}"
@@ -22,10 +22,12 @@
 # crawler_channels:
 #   - id: default
 #     image: "docker.io/webrecorder/browsertrix-crawler:latest"
+#     imagePullPolicy: Always
 #
 #   # Add, remove, or edit additional crawler release channels for example:
 #   - id: custom_version
 #     image: "<DOCKER IMAGE>"
+#     imagePullPolicy: IfNotPresent # optional
 
 # overrides to use existing images in local Docker, otherwise will pull from repository
 # backend_pull_policy: "Never"
@@ -34,6 +34,8 @@ data:
 
   DEFAULT_CRAWL_FILENAME_TEMPLATE: "{{ .Values.default_crawl_filename_template }}"
 
+  DEFAULT_CRAWLER_IMAGE_PULL_POLICY: "{{ .Values.crawler_pull_policy }}"
+
   MAX_PAGES_PER_CRAWL: "{{ .Values.max_pages_per_crawl | default 0 }}"
 
   IDLE_TIMEOUT: "{{ .Values.profile_browser_idle_seconds | default 60 }}"
@@ -104,7 +104,7 @@ replica_deletion_delay_days: 0
 # API Image
 # =========================================
 backend_image: "docker.io/webrecorder/browsertrix-backend:1.14.7"
-backend_pull_policy: "Always"
+backend_pull_policy: "IfNotPresent"
 
 backend_password_secret: "PASSWORD!"
 
@@ -162,7 +162,7 @@ backend_avg_memory_threshold: 95
 # Nginx Image
 # =========================================
 frontend_image: "docker.io/webrecorder/browsertrix-frontend:1.14.7"
-frontend_pull_policy: "Always"
+frontend_pull_policy: "IfNotPresent"
 
 frontend_cpu: "10m"
 
@@ -237,12 +237,15 @@ redis_storage: "3Gi"
 crawler_channels:
   - id: default
     image: "docker.io/webrecorder/browsertrix-crawler:latest"
+    imagePullPolicy: Always
 
   # Add, remove, or edit additional crawler versions below, for example:
   # - id: custom_version
   #   image: "<DOCKER IMAGE>"
+  #   imagePullPolicy: Always|IfNotPresent|Never (optional, defaults to crawler_pull_policy)
 
-crawler_pull_policy: "Always"
+# default crawler pull policy if not set per channel
+crawler_pull_policy: "IfNotPresent"
 
 crawler_namespace: "crawlers"
 
@@ -18,6 +18,7 @@ The `crawler_channels` setting is used to specify the [_Crawler Release Channel_
 crawler_channels:
   - id: default
     image: "docker.io/webrecorder/browsertrix-crawler:latest"
+    imagePullPolicy: Always # optional
 ```
 
 This can be extended with additional channels. For example, here is what the value would look like adding a new x.y.z release of Browsertrix Crawler with the id `testing`:
@@ -28,8 +29,11 @@ crawler_channels:
     image: "docker.io/webrecorder/browsertrix-crawler:latest"
   - id: testing
     image: "docker.io/webrecorder/browsertrix-crawler:x.y.z"
+    imagePullPolicy: IfNotPresent
 ```
 
+The `imagePullPolicy` per channel is optional. If not set, the value set in `crawler_pull_policy` is used as the default.
+
 ## Storage
 
 The `storage` setting is used to specify primary and replica storage for a Browsertrix deployment. All configured storage options must be S3-compatible buckets. At minimum, there must be one configured storage option, which includes a `is_default_primary: true`.
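As for the `storage` setting mentioned in that last docs paragraph (not touched by this commit): a minimal sketch of one primary storage entry is shown below. The key names are assumptions drawn from the Browsertrix helm chart's `values.yaml` rather than from this diff, so verify them against the deployed chart version:

```yaml
storages:
  - name: "default"
    access_key: "ADMIN"        # example credentials only
    secret_key: "PASSW0RD"
    bucket_name: "btrix-data"
    endpoint_url: "http://local-minio.default:9000/"
    is_default_primary: true   # at least one storage must set this
```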