mirror of
https://github.com/outbackdingo/firezone.git
synced 2026-01-27 18:18:55 +00:00
fix(portal): Use threshold-based logging for cluster errors (#9342)
We periodically fetch a list of all `RUNNING` VMs in GCP and then try to connect to them for clustering. However, during deploys, it's expected that we won't be able to connect to new VMs until they are fully up. The fetch doesn't take health checks into account, so we need a threshold-based error logging. To address this, we do the following: - We only log an error when failing to connect to nodes if we are currently below the threshold for each of the `api`, `domain`, and `web` node counts - We silence node timeout errors, as these will happen during deploys
This commit is contained in:
@@ -28,6 +28,8 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do
|
||||
unless Domain.GoogleCloudPlatform.enabled?(),
|
||||
do: "Google Cloud Platform clustering strategy requires GoogleCloudPlatform to be enabled"
|
||||
|
||||
state = Map.put(state, :connected_nodes, [])
|
||||
|
||||
{:ok, state, {:continue, :start}}
|
||||
end
|
||||
|
||||
@@ -50,18 +52,31 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do
|
||||
end
|
||||
|
||||
defp load(state) do
|
||||
Process.send_after(self(), :load, polling_interval(state))
|
||||
|
||||
with {:ok, nodes, state} <- fetch_nodes(state),
|
||||
:ok <-
|
||||
Cluster.Strategy.connect_nodes(state.topology, state.connect, state.list_nodes, nodes) do
|
||||
Process.send_after(self(), :load, polling_interval(state))
|
||||
|
||||
state = Map.put(state, :connected_nodes, nodes)
|
||||
|
||||
state
|
||||
else
|
||||
{:error, reason} ->
|
||||
Logger.error("Error fetching nodes or connecting to them",
|
||||
Process.send_after(self(), :load, polling_interval(state))
|
||||
|
||||
# Expected during deploy as the list of nodes received does not factor in the health check
|
||||
Logger.info("Error fetching nodes or connecting to them",
|
||||
reason: inspect(reason)
|
||||
)
|
||||
|
||||
# Only log error if the number of connected nodes falls below the expected threshold
|
||||
unless enough_nodes_connected?(state) do
|
||||
Logger.error("Connected nodes count is below threshold",
|
||||
connected_nodes: inspect(state.connected_nodes),
|
||||
config: inspect(state.config)
|
||||
)
|
||||
end
|
||||
|
||||
state
|
||||
end
|
||||
end
|
||||
@@ -145,4 +160,32 @@ defmodule Domain.Cluster.GoogleComputeLabelsStrategy do
|
||||
defp polling_interval(state) do
|
||||
Keyword.get(state.config, :polling_interval, @default_polling_interval)
|
||||
end
|
||||
|
||||
defp enough_nodes_connected?(state) do
|
||||
connected_nodes = state.connected_nodes
|
||||
expected_api_node_count = Keyword.fetch!(state.config, :api_node_count)
|
||||
expected_domain_node_count = Keyword.fetch!(state.config, :domain_node_count)
|
||||
expected_web_node_count = Keyword.fetch!(state.config, :web_node_count)
|
||||
connected_api_node_count = connected_node_count(connected_nodes, "api")
|
||||
connected_domain_node_count = connected_node_count(connected_nodes, "domain")
|
||||
connected_web_node_count = connected_node_count(connected_nodes, "web")
|
||||
|
||||
api_nodes_connected? = connected_api_node_count >= expected_api_node_count
|
||||
domain_nodes_connected? = connected_domain_node_count >= expected_domain_node_count
|
||||
web_nodes_connected? = connected_web_node_count >= expected_web_node_count
|
||||
|
||||
api_nodes_connected? and domain_nodes_connected? and web_nodes_connected?
|
||||
end
|
||||
|
||||
defp connected_node_count(nodes, target_app) do
|
||||
Enum.count(nodes, fn node_name ->
|
||||
app =
|
||||
node_name
|
||||
|> Atom.to_string()
|
||||
|> String.split("@")
|
||||
|> List.first()
|
||||
|
||||
target_app == app
|
||||
end)
|
||||
end
|
||||
end
|
||||
|
||||
@@ -1,13 +1,18 @@
|
||||
defmodule Domain.Telemetry.Sentry do
|
||||
def before_send(%{original_exception: %{skip_sentry: skip_sentry}} = event) do
|
||||
if skip_sentry do
|
||||
def before_send(%{original_exception: %{skip_sentry: skip_sentry}}) when skip_sentry do
|
||||
nil
|
||||
end
|
||||
|
||||
def before_send(event) do
|
||||
if String.contains?(
|
||||
event.message,
|
||||
"Node ~p not responding **~n** Removing (timedout) connection"
|
||||
) do
|
||||
# This happens when libcluster loses connection to a node, which is normal during deploys.
|
||||
# We have threshold-based error logging in Domain.Cluster.GoogleComputeLabelsStrategy to report those.
|
||||
nil
|
||||
else
|
||||
event
|
||||
end
|
||||
end
|
||||
|
||||
def before_send(event) do
|
||||
event
|
||||
end
|
||||
end
|
||||
|
||||
Submodule terraform/environments updated: b0c70d8fe3...469fdf9801
Reference in New Issue
Block a user