From 68bc053ba046e204b61c535a94300d4c9ce3703f Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 6 Sep 2023 17:53:02 -0700 Subject: [PATCH] Print crawl log to operator log (mostly for testing) (#1148) * log only if 'log_failed_crawl_lines' value is set to number of last lines to log from failed container --------- Co-authored-by: Tessa Walsh --- backend/btrixcloud/k8sapi.py | 9 +++++++++ backend/btrixcloud/operator.py | 23 +++++++++++++++++++++-- backend/test/test_stop_cancel_crawl.py | 1 + chart/templates/configmap.yaml | 1 + chart/test/test.yaml | 2 ++ chart/values.yaml | 5 +++++ 6 files changed, 39 insertions(+), 2 deletions(-) diff --git a/backend/btrixcloud/k8sapi.py b/backend/btrixcloud/k8sapi.py index 1483c48c..e896d039 100644 --- a/backend/btrixcloud/k8sapi.py +++ b/backend/btrixcloud/k8sapi.py @@ -205,3 +205,12 @@ class K8sAPI: self.api_client.set_default_header("Content-Type", content_type) else: del self.api_client.default_headers["Content-Type"] + + async def print_pod_logs(self, pod_names, container, lines=100): + """print pod logs""" + for pod in pod_names: + resp = await self.core_api.read_namespaced_pod_log( + pod, self.namespace, container=container, tail_lines=lines + ) + print(f"============== LOGS FOR POD: {pod} ==============") + print(resp) diff --git a/backend/btrixcloud/operator.py b/backend/btrixcloud/operator.py index 93d46640..a059d21f 100644 --- a/backend/btrixcloud/operator.py +++ b/backend/btrixcloud/operator.py @@ -136,6 +136,8 @@ class BtrixOperator(K8sAPI): self.fast_retry_secs = int(os.environ.get("FAST_RETRY_SECS") or 0) + self.log_failed_crawl_lines = int(os.environ.get("LOG_FAILED_CRAWL_LINES") or 0) + with open(self.config_file, encoding="utf-8") as fh_config: self.shared_params = yaml.safe_load(fh_config) @@ -580,8 +582,10 @@ class BtrixOperator(K8sAPI): status.filesAdded = int(await redis.get("filesAdded") or 0) status.filesAddedSize = int(await redis.get("filesAddedSize") or 0) + pod_names = list(pods.keys()) + # update stats and get status - return await self.update_crawl_state(redis, crawl, status) + return await self.update_crawl_state(redis, crawl, status, pod_names) # pylint: disable=broad-except except Exception as exc: @@ -674,7 +678,7 @@ class BtrixOperator(K8sAPI): return False - async def update_crawl_state(self, redis, crawl, status): + async def update_crawl_state(self, redis, crawl, status, pod_names): """update crawl state and check if crawl is now done""" results = await redis.hvals(f"{crawl.id}:status") stats = await get_redis_crawl_stats(redis, crawl.id) @@ -716,16 +720,31 @@ class BtrixOperator(K8sAPI): # check if all crawlers failed elif status_count.get("failed", 0) >= crawl.scale: + prev_state = None + # if stopping, and no pages finished, mark as canceled if status.stopping and not status.pagesDone: state = "canceled" else: state = "failed" + prev_state = status.state status = await self.mark_finished( redis, crawl.id, crawl.cid, status, state=state ) + if ( + self.log_failed_crawl_lines + and state == "failed" + and prev_state != "failed" + ): + print("crawl failed: ", pod_names, stats) + asyncio.create_task( + self.print_pod_logs( + pod_names, "crawler", self.log_failed_crawl_lines + ) + ) + # check for other statuses else: new_status = None diff --git a/backend/test/test_stop_cancel_crawl.py b/backend/test/test_stop_cancel_crawl.py index bbc114be..170fb9ff 100644 --- a/backend/test/test_stop_cancel_crawl.py +++ b/backend/test/test_stop_cancel_crawl.py @@ -144,6 +144,7 @@ def test_stop_crawl_partial( time.sleep(2) data = get_crawl(default_org_id, crawler_auth_headers, crawl_id) done = data.get("stats") and data.get("stats").get("done") > 0 + print("crawl stats", data) r = requests.post( f"{API_PREFIX}/orgs/{default_org_id}/crawls/{crawl_id}/stop", diff --git a/chart/templates/configmap.yaml b/chart/templates/configmap.yaml index 1c9a3002..fae1d0a9 100644 --- a/chart/templates/configmap.yaml +++ b/chart/templates/configmap.yaml @@ -43,6 +43,7 @@ data: FAST_RETRY_SECS: "{{ .Values.operator_fast_resync_secs | default 3 }}" + LOG_FAILED_CRAWL_LINES: "{{ .Values.log_failed_crawl_lines | default 0 }}" --- apiVersion: v1 diff --git a/chart/test/test.yaml b/chart/test/test.yaml index 86f8bea4..2a871a31 100644 --- a/chart/test/test.yaml +++ b/chart/test/test.yaml @@ -34,3 +34,5 @@ max_pages_per_crawl: 4 registration_enabled: "0" +# log failed crawl pods to operator backend +log_failed_crawl_lines: 200 diff --git a/chart/values.yaml b/chart/values.yaml index 5f58869b..e0aff8fc 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -111,6 +111,11 @@ job_memory: "70Mi" profile_browser_idle_seconds: 60 +# if set, print last 'log_failed_crawl_lines' of each failed +# crawl pod to backend operator stdout +# mostly intended for debugging / testing +# log_failed_crawl_lines: 200 + # Nginx Image # =========================================