From bfca4e8411758622788260eed022658b54c632d4 Mon Sep 17 00:00:00 2001 From: Jamil Date: Sun, 1 Jun 2025 08:53:38 -0700 Subject: [PATCH] fix(portal): Use threshold-based logging for cluster errors (#9342) We periodically fetch a list of all `RUNNING` VMs in GCP and then try to connect to them for clustering. However, during deploys, it's expected that we won't be able to connect to new VMs until they are fully up. The fetch doesn't take health checks into account, so we need a threshold-based error logging. To address this, we do the following: - We only log an error when failing to connect to nodes if we are currently below the threshold for each of the `api`, `domain`, and `web` node counts - We silence node timeout errors, as these will happen during deploys --- .../cluster/google_compute_labels_strategy.ex | 49 +++++++++++++++++-- .../domain/lib/domain/telemetry/sentry.ex | 17 ++++--- terraform/environments | 2 +- 3 files changed, 58 insertions(+), 10 deletions(-) diff --git a/elixir/apps/domain/lib/domain/cluster/google_compute_labels_strategy.ex b/elixir/apps/domain/lib/domain/cluster/google_compute_labels_strategy.ex index 8f807274b..0eb637411 100644 --- a/elixir/apps/domain/lib/domain/cluster/google_compute_labels_strategy.ex +++ b/elixir/apps/domain/lib/domain/cluster/google_compute_labels_strategy.ex @@ -28,6 +28,8 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do unless Domain.GoogleCloudPlatform.enabled?(), do: "Google Cloud Platform clustering strategy requires GoogleCloudPlatform to be enabled" + state = Map.put(state, :connected_nodes, []) + {:ok, state, {:continue, :start}} end @@ -50,18 +52,31 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do end defp load(state) do - Process.send_after(self(), :load, polling_interval(state)) - with {:ok, nodes, state} <- fetch_nodes(state), :ok <- Cluster.Strategy.connect_nodes(state.topology, state.connect, state.list_nodes, nodes) do + Process.send_after(self(), :load, polling_interval(state)) + + state = Map.put(state, :connected_nodes, nodes) + state else {:error, reason} -> - Logger.error("Error fetching nodes or connecting to them", + Process.send_after(self(), :load, polling_interval(state)) + + # Expected during deploy as the list of nodes received does not factor in the health check + Logger.info("Error fetching nodes or connecting to them", reason: inspect(reason) ) + # Only log error if the number of connected nodes falls below the expected threshold + unless enough_nodes_connected?(state) do + Logger.error("Connected nodes count is below threshold", + connected_nodes: inspect(state.connected_nodes), + config: inspect(state.config) + ) + end + state end end @@ -145,4 +160,32 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do defp polling_interval(state) do Keyword.get(state.config, :polling_interval, @default_polling_interval) end + + defp enough_nodes_connected?(state) do + connected_nodes = state.connected_nodes + expected_api_node_count = Keyword.fetch!(state.config, :api_node_count) + expected_domain_node_count = Keyword.fetch!(state.config, :domain_node_count) + expected_web_node_count = Keyword.fetch!(state.config, :web_node_count) + connected_api_node_count = connected_node_count(connected_nodes, "api") + connected_domain_node_count = connected_node_count(connected_nodes, "domain") + connected_web_node_count = connected_node_count(connected_nodes, "web") + + api_nodes_connected? = connected_api_node_count >= expected_api_node_count + domain_nodes_connected? = connected_domain_node_count >= expected_domain_node_count + web_nodes_connected? = connected_web_node_count >= expected_web_node_count + + api_nodes_connected? and domain_nodes_connected? and web_nodes_connected? + end + + defp connected_node_count(nodes, target_app) do + Enum.count(nodes, fn node_name -> + app = + node_name + |> Atom.to_string() + |> String.split("@") + |> List.first() + + target_app == app + end) + end end diff --git a/elixir/apps/domain/lib/domain/telemetry/sentry.ex b/elixir/apps/domain/lib/domain/telemetry/sentry.ex index 0cccf88ba..6f5dd4042 100644 --- a/elixir/apps/domain/lib/domain/telemetry/sentry.ex +++ b/elixir/apps/domain/lib/domain/telemetry/sentry.ex @@ -1,13 +1,18 @@ defmodule Domain.Telemetry.Sentry do - def before_send(%{original_exception: %{skip_sentry: skip_sentry}} = event) do - if skip_sentry do + def before_send(%{original_exception: %{skip_sentry: skip_sentry}}) when skip_sentry do + nil + end + + def before_send(event) do + if String.contains?( + event.message, + "Node ~p not responding **~n** Removing (timedout) connection" + ) do + # This happens when libcluster loses connection to a node, which is normal during deploys. + # We have threshold-based error logging in Domain.Cluster.GoogleComputeLabelsStrategy to report those. nil else event end end - - def before_send(event) do - event - end end diff --git a/terraform/environments b/terraform/environments index b0c70d8fe..469fdf980 160000 --- a/terraform/environments +++ b/terraform/environments @@ -1 +1 @@ -Subproject commit b0c70d8fe3e5a92dc1432f0cb9a459cdaae6bc1f +Subproject commit 469fdf980152bd5d78571fa923d7ffc0e69136e9