diff --git a/elixir/apps/domain/lib/domain/cluster/google_compute_labels_strategy.ex b/elixir/apps/domain/lib/domain/cluster/google_compute_labels_strategy.ex index 8f807274b..0eb637411 100644 --- a/elixir/apps/domain/lib/domain/cluster/google_compute_labels_strategy.ex +++ b/elixir/apps/domain/lib/domain/cluster/google_compute_labels_strategy.ex @@ -28,6 +28,8 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do unless Domain.GoogleCloudPlatform.enabled?(), do: "Google Cloud Platform clustering strategy requires GoogleCloudPlatform to be enabled" + state = Map.put(state, :connected_nodes, []) + {:ok, state, {:continue, :start}} end @@ -50,18 +52,31 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do end defp load(state) do - Process.send_after(self(), :load, polling_interval(state)) - with {:ok, nodes, state} <- fetch_nodes(state), :ok <- Cluster.Strategy.connect_nodes(state.topology, state.connect, state.list_nodes, nodes) do + Process.send_after(self(), :load, polling_interval(state)) + + state = Map.put(state, :connected_nodes, nodes) + state else {:error, reason} -> - Logger.error("Error fetching nodes or connecting to them", + Process.send_after(self(), :load, polling_interval(state)) + + # Expected during deploy as the list of nodes received does not factor in the health check + Logger.info("Error fetching nodes or connecting to them", reason: inspect(reason) ) + # Only log error if the number of connected nodes falls below the expected threshold + unless enough_nodes_connected?(state) do + Logger.error("Connected nodes count is below threshold", + connected_nodes: inspect(state.connected_nodes), + config: inspect(state.config) + ) + end + state end end @@ -145,4 +160,32 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do defp polling_interval(state) do Keyword.get(state.config, :polling_interval, @default_polling_interval) end + + defp enough_nodes_connected?(state) do + connected_nodes = state.connected_nodes + expected_api_node_count = Keyword.fetch!(state.config, :api_node_count) + expected_domain_node_count = Keyword.fetch!(state.config, :domain_node_count) + expected_web_node_count = Keyword.fetch!(state.config, :web_node_count) + connected_api_node_count = connected_node_count(connected_nodes, "api") + connected_domain_node_count = connected_node_count(connected_nodes, "domain") + connected_web_node_count = connected_node_count(connected_nodes, "web") + + api_nodes_connected? = connected_api_node_count >= expected_api_node_count + domain_nodes_connected? = connected_domain_node_count >= expected_domain_node_count + web_nodes_connected? = connected_web_node_count >= expected_web_node_count + + api_nodes_connected? and domain_nodes_connected? and web_nodes_connected? + end + + defp connected_node_count(nodes, target_app) do + Enum.count(nodes, fn node_name -> + app = + node_name + |> Atom.to_string() + |> String.split("@") + |> List.first() + + target_app == app + end) + end end diff --git a/elixir/apps/domain/lib/domain/telemetry/sentry.ex b/elixir/apps/domain/lib/domain/telemetry/sentry.ex index 0cccf88ba..6f5dd4042 100644 --- a/elixir/apps/domain/lib/domain/telemetry/sentry.ex +++ b/elixir/apps/domain/lib/domain/telemetry/sentry.ex @@ -1,13 +1,18 @@ defmodule Domain.Telemetry.Sentry do - def before_send(%{original_exception: %{skip_sentry: skip_sentry}} = event) do - if skip_sentry do + def before_send(%{original_exception: %{skip_sentry: skip_sentry}}) when skip_sentry do + nil + end + + def before_send(event) do + if String.contains?( + event.message, + "Node ~p not responding **~n** Removing (timedout) connection" + ) do + # This happens when libcluster loses connection to a node, which is normal during deploys. + # We have threshold-based error logging in Domain.Cluster.GoogleComputeLabelsStrategy to report those. nil else event end end - - def before_send(event) do - event - end end diff --git a/terraform/environments b/terraform/environments index b0c70d8fe..469fdf980 160000 --- a/terraform/environments +++ b/terraform/environments @@ -1 +1 @@ -Subproject commit b0c70d8fe3e5a92dc1432f0cb9a459cdaae6bc1f +Subproject commit 469fdf980152bd5d78571fa923d7ffc0e69136e9