fix(portal): Use threshold-based logging for cluster errors (#9342)

We periodically fetch a list of all `RUNNING` VMs in GCP and then try to
connect to them for clustering. However, during deploys, it's expected
that we won't be able to connect to new VMs until they are fully up. The
fetch doesn't take health checks into account, so we need
threshold-based error logging.

To address this, we do the following:

- We only log an error when failing to connect to nodes if we are
currently below the threshold for any of the `api`, `domain`, or `web`
node counts
- We silence node timeout errors, as these will happen during deploys
This commit is contained in:
Jamil
2025-06-01 08:53:38 -07:00
committed by GitHub
parent df500de47e
commit bfca4e8411
3 changed files with 58 additions and 10 deletions

View File

@@ -28,6 +28,8 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do
unless Domain.GoogleCloudPlatform.enabled?(),
do: "Google Cloud Platform clustering strategy requires GoogleCloudPlatform to be enabled"
state = Map.put(state, :connected_nodes, [])
{:ok, state, {:continue, :start}}
end
@@ -50,18 +52,31 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do
end
# Fetches the current node list from GCP and connects to each node,
# then schedules the next poll. Connection failures are expected while
# a deploy is in flight (the node list ignores health checks), so they
# are logged at info level; an error is only emitted when the count of
# already-connected nodes drops below the configured per-app thresholds.
defp load(state) do
  with {:ok, nodes, state} <- fetch_nodes(state),
       :ok <-
         Cluster.Strategy.connect_nodes(state.topology, state.connect, state.list_nodes, nodes) do
    Process.send_after(self(), :load, polling_interval(state))

    # Remember the successfully connected nodes for the threshold check
    # on subsequent failed polls.
    Map.put(state, :connected_nodes, nodes)
  else
    {:error, reason} ->
      # Keep polling even on failure so we recover once the deploy settles.
      Process.send_after(self(), :load, polling_interval(state))

      # Expected during deploy as the list of nodes received does not factor in the health check
      Logger.info("Error fetching nodes or connecting to them",
        reason: inspect(reason)
      )

      # Only log error if the number of connected nodes falls below the expected threshold
      unless enough_nodes_connected?(state) do
        Logger.error("Connected nodes count is below threshold",
          connected_nodes: inspect(state.connected_nodes),
          config: inspect(state.config)
        )
      end

      state
  end
end
@@ -145,4 +160,32 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do
# Milliseconds between node-list refreshes; overridable via the
# `:polling_interval` config key.
defp polling_interval(state),
  do: Keyword.get(state.config, :polling_interval, @default_polling_interval)
# Returns true when the tally of connected nodes meets or exceeds the
# configured count for every app tier (api, domain, web).
# All three config keys are read up front, so a missing key raises
# regardless of which tier is checked first.
defp enough_nodes_connected?(state) do
  expected_counts = [
    {"api", Keyword.fetch!(state.config, :api_node_count)},
    {"domain", Keyword.fetch!(state.config, :domain_node_count)},
    {"web", Keyword.fetch!(state.config, :web_node_count)}
  ]

  Enum.all?(expected_counts, fn {app, expected} ->
    connected_node_count(state.connected_nodes, app) >= expected
  end)
end
# Counts the nodes whose app name — the part of the node name before
# the "@" — equals `target_app` (e.g. :"api@10.0.0.1" counts for "api").
defp connected_node_count(nodes, target_app) do
  Enum.count(nodes, fn node ->
    # String.split/2 always yields a non-empty list, so the head match
    # cannot fail.
    [app | _host] =
      node
      |> Atom.to_string()
      |> String.split("@")

    app == target_app
  end)
end
end

View File

@@ -1,13 +1,18 @@
defmodule Domain.Telemetry.Sentry do
  # Sentry `before_send` callback: returning `nil` drops the event,
  # returning the event reports it.

  # Drop events whose originating exception explicitly opted out of
  # reporting via a `skip_sentry: true` field.
  def before_send(%{original_exception: %{skip_sentry: skip_sentry}}) when skip_sentry do
    nil
  end

  # Only inspect the message when it is actually a binary; events with a
  # nil/absent message fall through to the catch-all clause instead of
  # crashing in String.contains?/2.
  def before_send(%{message: message} = event) when is_binary(message) do
    if String.contains?(
         message,
         "Node ~p not responding **~n** Removing (timedout) connection"
       ) do
      # This happens when libcluster loses connection to a node, which is normal during deploys.
      # We have threshold-based error logging in Domain.Cluster.GoogleComputeLabelsStrategy to report those.
      nil
    else
      event
    end
  end

  # Everything else is reported unchanged.
  def before_send(event) do
    event
  end
end