ensure running crawl configmap is updated when exclusions are added/removed (#2409)

exclusions are already updated dynamically if crawler pod is running,
but when crawler pod is restarted, this ensures new exclusions are also
picked up:
- mount configmap in separate path, avoiding subPath, to allow dynamic
updates of mounted volume
- adds a lastConfigUpdate timestamp to CrawlJob - if lastConfigUpdate in
spec differs from the value stored in status, the configmap is recreated by the operator
- operator: also update image from channel, to avoid any issues with
updating crawler in channel
- only updates for exclusion add/remove so far, can later be expanded to
other crawler settings (see: #2355 for broader running crawl config
updates)
- fixes #2408
This commit is contained in:
Ilya Kreymer 2025-02-19 11:42:19 -08:00 committed by GitHub
parent 905fe059a4
commit 88a9f3baf7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 31 additions and 12 deletions

View File

@ -220,6 +220,12 @@ class CrawlManager(K8sAPI):
proxy_id=crawlconfig.proxyId or DEFAULT_PROXY_ID, proxy_id=crawlconfig.proxyId or DEFAULT_PROXY_ID,
) )
async def reload_running_crawl_config(self, crawl_id: str):
    """Force a reload of the configmap for the given crawl.

    Patches the crawl job's ``lastConfigUpdate`` field with a freshly
    generated timestamp string and returns the result of the patch call.
    """
    timestamp = date_to_str(dt_now())
    patch = {"lastConfigUpdate": timestamp}
    return await self._patch_job(crawl_id, patch)
async def create_qa_crawl_job( async def create_qa_crawl_job(
self, self,
crawlconfig: CrawlConfig, crawlconfig: CrawlConfig,

View File

@ -544,6 +544,8 @@ class CrawlOps(BaseCrawlOps):
regex, cid, org, user, add regex, cid, org, user, add
) )
await self.crawl_manager.reload_running_crawl_config(crawl.id)
await self.crawls.find_one_and_update( await self.crawls.find_one_and_update(
{"_id": crawl_id, "type": "crawl", "oid": org.id}, {"_id": crawl_id, "type": "crawl", "oid": org.id},
{"$set": {"config": new_config.dict()}}, {"$set": {"config": new_config.dict()}},

View File

@ -274,12 +274,9 @@ class CrawlOperator(BaseOperator):
params["storage_path"] = storage_path params["storage_path"] = storage_path
params["storage_secret"] = storage_secret params["storage_secret"] = storage_secret
# only resolve if not already set status.crawlerImage = self.crawl_config_ops.get_channel_crawler_image(
# not automatically updating image for existing crawls crawl.crawler_channel
if not status.crawlerImage: )
status.crawlerImage = self.crawl_config_ops.get_channel_crawler_image(
crawl.crawler_channel
)
params["crawler_image"] = status.crawlerImage params["crawler_image"] = status.crawlerImage
@ -306,7 +303,16 @@ class CrawlOperator(BaseOperator):
else: else:
params["force_restart"] = False params["force_restart"] = False
children.extend(await self._load_crawl_configmap(crawl, data.children, params)) config_update_needed = (
spec.get("lastConfigUpdate", "") != status.lastConfigUpdate
)
status.lastConfigUpdate = spec.get("lastConfigUpdate", "")
children.extend(
await self._load_crawl_configmap(
crawl, data.children, params, config_update_needed
)
)
if crawl.qa_source_crawl_id: if crawl.qa_source_crawl_id:
params["qa_source_crawl_id"] = crawl.qa_source_crawl_id params["qa_source_crawl_id"] = crawl.qa_source_crawl_id
@ -364,11 +370,13 @@ class CrawlOperator(BaseOperator):
return behaviors return behaviors
async def _load_crawl_configmap(self, crawl: CrawlSpec, children, params): async def _load_crawl_configmap(
self, crawl: CrawlSpec, children, params, config_update_needed: bool
):
name = f"crawl-config-{crawl.id}" name = f"crawl-config-{crawl.id}"
configmap = children[CMAP].get(name) configmap = children[CMAP].get(name)
if configmap: if configmap and not config_update_needed:
metadata = configmap["metadata"] metadata = configmap["metadata"]
configmap["metadata"] = { configmap["metadata"] = {
"name": metadata["name"], "name": metadata["name"],
@ -390,6 +398,9 @@ class CrawlOperator(BaseOperator):
params["config"] = json.dumps(raw_config) params["config"] = json.dumps(raw_config)
if config_update_needed:
print(f"Updating config for {crawl.id}")
return self.load_from_yaml("crawl_configmap.yaml", params) return self.load_from_yaml("crawl_configmap.yaml", params)
async def _load_qa_configmap(self, params, children): async def _load_qa_configmap(self, params, children):

View File

@ -209,6 +209,7 @@ class CrawlStatus(BaseModel):
stopReason: Optional[StopReason] = None stopReason: Optional[StopReason] = None
initRedis: bool = False initRedis: bool = False
crawlerImage: Optional[str] = None crawlerImage: Optional[str] = None
lastConfigUpdate: str = ""
lastActiveTime: str = "" lastActiveTime: str = ""
podStatus: DefaultDict[str, Annotated[PodInfo, Field(default_factory=PodInfo)]] = ( podStatus: DefaultDict[str, Annotated[PodInfo, Field(default_factory=PodInfo)]] = (

View File

@ -127,7 +127,7 @@ spec:
command: command:
- {{ "crawl" if not qa_source_crawl_id else "qa" }} - {{ "crawl" if not qa_source_crawl_id else "qa" }}
- --config - --config
- /tmp/crawl-config.json - /tmp/config/crawl-config.json
- --workers - --workers
- "{{ workers }}" - "{{ workers }}"
- --redisStoreUrl - --redisStoreUrl
@ -153,8 +153,7 @@ spec:
{% endif %} {% endif %}
volumeMounts: volumeMounts:
- name: crawl-config - name: crawl-config
mountPath: /tmp/crawl-config.json mountPath: /tmp/config/
subPath: crawl-config.json
readOnly: True readOnly: True
{% if qa_source_crawl_id %} {% if qa_source_crawl_id %}