From 02dd21018d95ab9b11a735c157d629c81761977d Mon Sep 17 00:00:00 2001 From: Jamil Date: Tue, 24 Jun 2025 21:47:17 -0700 Subject: [PATCH] fix(portal): log error when connected_nodes crossed (#9668) To avoid log spam, we only log an error when the threshold boundary is crossed. --- .../cluster/google_compute_labels_strategy.ex | 39 ++++++++++++++----- 1 file changed, 29 insertions(+), 10 deletions(-) diff --git a/elixir/apps/domain/lib/domain/cluster/google_compute_labels_strategy.ex b/elixir/apps/domain/lib/domain/cluster/google_compute_labels_strategy.ex index 29e52b517..08f5f507f 100644 --- a/elixir/apps/domain/lib/domain/cluster/google_compute_labels_strategy.ex +++ b/elixir/apps/domain/lib/domain/cluster/google_compute_labels_strategy.ex @@ -28,6 +28,8 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do unless Domain.GoogleCloudPlatform.enabled?(), do: "Google Cloud Platform clustering strategy requires GoogleCloudPlatform to be enabled" + state = Map.put(state, :below_threshold?, false) + {:ok, state, {:continue, :start}} end @@ -68,16 +70,7 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do problem_nodes: inspect(problem_nodes) ) - # Only log error if the number of connected nodes falls below the expected threshold - unless enough_nodes_connected?(state) do - Logger.error("Connected nodes count is below threshold", - connected_nodes: inspect(state.connected_nodes), - problem_nodes: inspect(problem_nodes), - config: inspect(state.config) - ) - end - - state + maybe_log_threshold_error(state, problem_nodes) end else {:error, fetch_failed_reason} -> @@ -130,6 +123,32 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do end end + # Logs only on threshold crossings to avoid flooding the logs: an error when connected nodes drop + # below the threshold, an info message on recovery. Returns state with below_threshold? updated. + defp maybe_log_threshold_error(state, problem_nodes) do + if enough_nodes_connected?(state) do + if state.below_threshold? 
do + Logger.info("Connected nodes count is back above threshold", + connected_nodes: inspect(state.connected_nodes), + problem_nodes: inspect(problem_nodes), + config: inspect(state.config) + ) + end + + %{state | below_threshold?: false} + else + unless state.below_threshold? do + Logger.error("Connected nodes count is below threshold", + connected_nodes: inspect(state.connected_nodes), + problem_nodes: inspect(problem_nodes), + config: inspect(state.config) + ) + end + + %{state | below_threshold?: true} + end + end + defp list_google_cloud_cluster_nodes(state) do project_id = Keyword.fetch!(state.config, :project_id) cluster_name = Keyword.fetch!(state.config, :cluster_name)