From c0a670d94759d6416cf42eef46c1fd52b889ed74 Mon Sep 17 00:00:00 2001 From: Jamil Date: Wed, 30 Apr 2025 20:48:35 -0700 Subject: [PATCH] fix(portal): Restart ReplicationConnection using Supervisor (#8953) When deploying, the cluster state diverges temporarily, which allows more than one `ReplicationConnection` process to start on the new nodes. (One of) the old nodes still has an active slot, and we get an "object in use" error `(Postgrex.Error) ERROR 55006 (object_in_use) replication slot "events_slot" is active for PID 603037`. Rather than use ReplicationConnection's restart behavior (which logs tons of errors with Logger.error), we can use the Supervisor here instead, and keep trying to start the ReplicationConnection until successful. Note that if the process name is registered (globally) and running, ReplicationConnection.start_link/1 simply returns `{:ok, pid}` instead of erroring out with `{:error, {:already_started, pid}}`, so eventually one of the nodes will succeed and the remaining ones will get back the globally-registered pid. --- elixir/apps/domain/lib/domain/application.ex | 13 ++++++++----- elixir/config/config.exs | 2 -- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/elixir/apps/domain/lib/domain/application.ex b/elixir/apps/domain/lib/domain/application.ex index af5f7fdd9..8a75f8e53 100644 --- a/elixir/apps/domain/lib/domain/application.ex +++ b/elixir/apps/domain/lib/domain/application.ex @@ -22,9 +22,6 @@ defmodule Domain.Application do Domain.Repo, Domain.PubSub, - # WAL replication - replication_child_spec(), - # Infrastructure services # Note: only one of platform adapters will be actually started. 
Domain.GoogleCloudPlatform, @@ -44,7 +41,10 @@ defmodule Domain.Application do Domain.ComponentVersions, # Observability - Domain.Telemetry + Domain.Telemetry, + + # WAL replication + replication_child_spec() ] end @@ -61,7 +61,10 @@ defmodule Domain.Application do %{ id: Domain.Events.ReplicationConnection, start: {Domain.Events.ReplicationConnection, :start_link, [init_state]}, - restart: :transient + restart: :transient, + # Allow up to 240 restarts in 20 minutes - covers duration of a deploy + max_restarts: 240, + max_seconds: 1200 } end diff --git a/elixir/config/config.exs b/elixir/config/config.exs index 6f89d94df..4bee6860a 100644 --- a/elixir/config/config.exs +++ b/elixir/config/config.exs @@ -32,8 +32,6 @@ config :domain, Domain.Repo, config :domain, Domain.Events.ReplicationConnection, connection_opts: [ - # Automatically reconnect if we lose connection. - auto_reconnect: true, hostname: "localhost", port: 5432, ssl: false,