From f96cc3d58389be61c6f6c003846ccecc659d116b Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Wed, 10 Sep 2025 07:00:20 +0000 Subject: [PATCH] feat(relay): remove graceful shutdown (#10322) Initially, we added the graceful shutdown functionality to the relay to better deal with deploys and achieve as little downtime as possible. With the split of app and infrastructure that we now have, this functionality is no longer necessary as portal deploys don't touch the relay infra at all. Thus, we can remove this functionality, which will actually speed up deploys of the relays as systemd no longer has to time out after sending the SIGTERM to the binary. --- .github/workflows/_integration_tests.yml | 1 - rust/relay/server/src/main.rs | 40 +++--------------------- scripts/tests/relay-graceful-shutdown.sh | 34 -------------------- 3 files changed, 5 insertions(+), 70 deletions(-) delete mode 100755 scripts/tests/relay-graceful-shutdown.sh diff --git a/.github/workflows/_integration_tests.yml b/.github/workflows/_integration_tests.yml index cf5c46cdf..784c77315 100644 --- a/.github/workflows/_integration_tests.yml +++ b/.github/workflows/_integration_tests.yml @@ -112,7 +112,6 @@ jobs: rust_log: debug - name: dns-nm - name: tcp-dns - - name: relay-graceful-shutdown - name: systemd/dns-systemd-resolved steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 diff --git a/rust/relay/server/src/main.rs b/rust/relay/server/src/main.rs index 051caca64..ec8be3f90 100644 --- a/rust/relay/server/src/main.rs +++ b/rust/relay/server/src/main.rs @@ -3,7 +3,7 @@ use anyhow::{Context, Result, bail}; use backoff::ExponentialBackoffBuilder; use clap::Parser; -use firezone_bin_shared::http_health_check; +use firezone_bin_shared::{http_health_check, signals}; use firezone_logging::{FilterReloadHandle, err_with_src, sentry_layer}; use firezone_relay::sockets::Sockets; use firezone_relay::{ @@ -419,9 +419,7 @@ struct Eventloop { ebpf: Option, - #[cfg(unix)] - 
sigterm: tokio::signal::unix::Signal, - shutting_down: bool, + sigterm: signals::Terminate, stats_log_interval: tokio::time::Interval, last_num_bytes_relayed: u64, @@ -475,9 +473,7 @@ where ebpf, buffer: [0u8; MAX_UDP_SIZE], last_heartbeat_sent, - #[cfg(unix)] - sigterm: tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())?, - shutting_down: false, + sigterm: signals::Terminate::new()?, }) } @@ -485,10 +481,6 @@ where loop { let mut ready = false; - if self.shutting_down && self.channel.is_none() && self.server.num_allocations() == 0 { - return Poll::Ready(Ok(())); - } - ready!(self.sockets.flush(cx))?; // Priority 1: Execute the pending commands of the server. @@ -667,31 +659,9 @@ where Some(Poll::Pending) | None => {} } - #[cfg(unix)] match self.sigterm.poll_recv(cx) { - Poll::Ready(Some(())) => { - if self.shutting_down { - tracing::info!("Forcing shutdown on repeated SIGTERM"); - - return Poll::Ready(Ok(())); - } - - tracing::info!(active_allocations = %self.server.num_allocations(), "Received SIGTERM, initiating graceful shutdown"); - - self.shutting_down = true; - - if let Some(portal) = self.channel.as_mut() { - match portal.close() { - Ok(()) => {} - Err(phoenix_channel::Connecting) => { - self.channel = None; // If we are still connecting, just discard the websocket connection. 
- } - } - } - - ready = true; - } - Poll::Ready(None) | Poll::Pending => {} + Poll::Ready(()) => return Poll::Ready(Ok(())), + Poll::Pending => {} } if self.stats_log_interval.poll_tick(cx).is_ready() { diff --git a/scripts/tests/relay-graceful-shutdown.sh b/scripts/tests/relay-graceful-shutdown.sh deleted file mode 100755 index 72dd5fc85..000000000 --- a/scripts/tests/relay-graceful-shutdown.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/usr/bin/env bash - -source "./scripts/tests/lib.sh" - -# Arrange: Setup a relayed connection -force_relayed_connections -client_curl_resource "172.20.0.100/get" -client_curl_resource "[172:20:0::100]/get" - -# Act: Send SIGTERM -docker compose kill relay-1 --signal SIGTERM - -sleep 2 # Closing websocket isn't instant. - -# Assert: Dataplane still works -client_curl_resource "172.20.0.100/get" -client_curl_resource "[172:20:0::100]/get" - -# Assert: Websocket connection is cut -OPEN_SOCKETS=$(relay1 netstat -tn | grep "ESTABLISHED" | grep 8081 || true) # Portal listens on port 8081 -test -z "$OPEN_SOCKETS" - -# Act: Send 2nd SIGTERM -docker compose kill relay-1 --signal SIGTERM - -sleep 5 # Wait for container to be fully exited - -# Seems to be necessary to return the correct state -docker compose ps relay-1 --all -sleep 1 - -# Assert: Container exited -container_state=$(docker compose ps relay-1 --all --format json | jq --raw-output '.State') -assert_equals "$container_state" "exited"