fix(portal): Use threshold-based logging for cluster errors (#9342)

We periodically fetch a list of all `RUNNING` VMs in GCP and then try to
connect to them for clustering. However, during deploys, it's expected
that we won't be able to connect to new VMs until they are fully up. The
fetch doesn't take health checks into account, so we need
threshold-based error logging.

To address this, we do the following:

- We only log an error when failing to connect to nodes if we are
currently below the threshold for any of the `api`, `domain`, or `web`
node counts
- We silence node timeout errors, as these will happen during deploys
This commit is contained in:
Jamil
2025-06-01 08:53:38 -07:00
committed by GitHub
parent df500de47e
commit bfca4e8411
3 changed files with 58 additions and 10 deletions

View File

@@ -28,6 +28,8 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do
unless Domain.GoogleCloudPlatform.enabled?(),
do: "Google Cloud Platform clustering strategy requires GoogleCloudPlatform to be enabled"
state = Map.put(state, :connected_nodes, [])
{:ok, state, {:continue, :start}}
end
@@ -50,18 +52,31 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do
end
# Fetches the current node list from GCP and connects to each node,
# then schedules the next poll. Connection failures are expected while
# a deploy is in flight (the node list ignores health checks), so they
# are logged at info level; an error is only emitted when the count of
# already-connected nodes drops below the configured per-app thresholds.
defp load(state) do
  with {:ok, nodes, state} <- fetch_nodes(state),
       :ok <-
         Cluster.Strategy.connect_nodes(state.topology, state.connect, state.list_nodes, nodes) do
    Process.send_after(self(), :load, polling_interval(state))

    # Remember the successfully connected nodes for the threshold check
    # on subsequent failed polls.
    Map.put(state, :connected_nodes, nodes)
  else
    {:error, reason} ->
      # Keep polling even on failure so we recover once the deploy settles.
      Process.send_after(self(), :load, polling_interval(state))

      # Expected during deploy as the list of nodes received does not factor in the health check
      Logger.info("Error fetching nodes or connecting to them",
        reason: inspect(reason)
      )

      # Only log error if the number of connected nodes falls below the expected threshold
      unless enough_nodes_connected?(state) do
        Logger.error("Connected nodes count is below threshold",
          connected_nodes: inspect(state.connected_nodes),
          config: inspect(state.config)
        )
      end

      state
  end
end
@@ -145,4 +160,32 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do
# Milliseconds between node-list refreshes; overridable via the
# `:polling_interval` config key.
defp polling_interval(state),
  do: Keyword.get(state.config, :polling_interval, @default_polling_interval)
# Returns true when the tally of connected nodes meets or exceeds the
# configured count for every app tier (api, domain, web).
# All three config keys are read up front, so a missing key raises
# regardless of which tier is checked first.
defp enough_nodes_connected?(state) do
  expected_counts = [
    {"api", Keyword.fetch!(state.config, :api_node_count)},
    {"domain", Keyword.fetch!(state.config, :domain_node_count)},
    {"web", Keyword.fetch!(state.config, :web_node_count)}
  ]

  Enum.all?(expected_counts, fn {app, expected} ->
    connected_node_count(state.connected_nodes, app) >= expected
  end)
end
# Counts the nodes whose app name — the part of the node name before
# the "@" — equals `target_app` (e.g. :"api@10.0.0.1" counts for "api").
defp connected_node_count(nodes, target_app) do
  Enum.count(nodes, fn node ->
    # String.split/2 always yields a non-empty list, so the head match
    # cannot fail.
    [app | _host] =
      node
      |> Atom.to_string()
      |> String.split("@")

    app == target_app
  end)
end
end

View File

@@ -1,13 +1,18 @@
defmodule Domain.Telemetry.Sentry do
  # Sentry `before_send` callback: returning `nil` drops the event,
  # returning the event reports it.

  # Drop events whose originating exception explicitly opted out of
  # reporting via a `skip_sentry: true` field.
  def before_send(%{original_exception: %{skip_sentry: skip_sentry}}) when skip_sentry do
    nil
  end

  # Only inspect the message when it is actually a binary; events with a
  # nil/absent message fall through to the catch-all clause instead of
  # crashing in String.contains?/2.
  def before_send(%{message: message} = event) when is_binary(message) do
    if String.contains?(
         message,
         "Node ~p not responding **~n** Removing (timedout) connection"
       ) do
      # This happens when libcluster loses connection to a node, which is normal during deploys.
      # We have threshold-based error logging in Domain.Cluster.GoogleComputeLabelsStrategy to report those.
      nil
    else
      event
    end
  end

  # Everything else is reported unchanged.
  def before_send(event) do
    event
  end
end