feat(relay): remove graceful shutdown (#10322)

Initially, we added graceful shutdown to the relay to better handle
deploys and keep downtime to a minimum. Now that the app and the
infrastructure are split, this functionality is no longer necessary:
portal deploys don't touch the relay infra at all.

Removing it also speeds up relay deploys, because systemd no longer has
to wait for its stop timeout after sending SIGTERM to the binary.
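
For context: when stopping a unit, systemd sends the configured KillSignal (SIGTERM by default) and waits up to TimeoutStopSec (90 seconds by default) before escalating to SIGKILL. A hypothetical unit file illustrating the timeout the message refers to; the binary path and values are assumptions, not the actual deployment:

[Service]
# Assumed path, for illustration only.
ExecStart=/usr/local/bin/firezone-relay
# Sent on `systemctl stop`; SIGTERM is also systemd's default.
KillSignal=SIGTERM
# How long systemd waits for the process to exit before sending SIGKILL.
# With graceful shutdown removed, the relay exits on the first SIGTERM
# and a deploy never runs into this timeout.
TimeoutStopSec=90s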
Author: Thomas Eizinger
Date: 2025-09-10 07:00:20 +00:00
Committed by: GitHub
Parent: af7f4c9992
Commit: f96cc3d583
3 changed files with 5 additions and 70 deletions


@@ -112,7 +112,6 @@ jobs:
             rust_log: debug
           - name: dns-nm
           - name: tcp-dns
-          - name: relay-graceful-shutdown
           - name: systemd/dns-systemd-resolved
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2


@@ -3,7 +3,7 @@
 use anyhow::{Context, Result, bail};
 use backoff::ExponentialBackoffBuilder;
 use clap::Parser;
-use firezone_bin_shared::http_health_check;
+use firezone_bin_shared::{http_health_check, signals};
 use firezone_logging::{FilterReloadHandle, err_with_src, sentry_layer};
 use firezone_relay::sockets::Sockets;
 use firezone_relay::{
@@ -419,9 +419,7 @@ struct Eventloop<R> {
     ebpf: Option<ebpf::Program>,
-    #[cfg(unix)]
-    sigterm: tokio::signal::unix::Signal,
-    shutting_down: bool,
+    sigterm: signals::Terminate,
     stats_log_interval: tokio::time::Interval,
     last_num_bytes_relayed: u64,
@@ -475,9 +473,7 @@ where
             ebpf,
             buffer: [0u8; MAX_UDP_SIZE],
             last_heartbeat_sent,
-            #[cfg(unix)]
-            sigterm: tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())?,
-            shutting_down: false,
+            sigterm: signals::Terminate::new()?,
         })
     }
@@ -485,10 +481,6 @@ where
         loop {
             let mut ready = false;

-            if self.shutting_down && self.channel.is_none() && self.server.num_allocations() == 0 {
-                return Poll::Ready(Ok(()));
-            }
-
             ready!(self.sockets.flush(cx))?;

             // Priority 1: Execute the pending commands of the server.
@@ -667,31 +659,9 @@ where
                 Some(Poll::Pending) | None => {}
             }

-            #[cfg(unix)]
             match self.sigterm.poll_recv(cx) {
-                Poll::Ready(Some(())) => {
-                    if self.shutting_down {
-                        tracing::info!("Forcing shutdown on repeated SIGTERM");
-                        return Poll::Ready(Ok(()));
-                    }
-
-                    tracing::info!(active_allocations = %self.server.num_allocations(), "Received SIGTERM, initiating graceful shutdown");
-
-                    self.shutting_down = true;
-
-                    if let Some(portal) = self.channel.as_mut() {
-                        match portal.close() {
-                            Ok(()) => {}
-                            Err(phoenix_channel::Connecting) => {
-                                self.channel = None; // If we are still connecting, just discard the websocket connection.
-                            }
-                        }
-                    }
-
-                    ready = true;
-                }
-                Poll::Ready(None) | Poll::Pending => {}
+                Poll::Ready(()) => return Poll::Ready(Ok(())),
+                Poll::Pending => {}
             }

             if self.stats_log_interval.poll_tick(cx).is_ready() {

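The diff replaces the raw tokio signal handling with a `signals::Terminate` helper from `firezone_bin_shared`. Below is a minimal sketch of what such a helper could look like, inferred only from how the event loop uses it (a fallible `new()` and a `poll_recv` that yields `Poll<()>`); the actual implementation may differ:

// Sketch only: inferred from the call sites above, not the real
// `firezone_bin_shared::signals` module.
use std::task::{Context, Poll};

use tokio::signal::unix::{Signal, SignalKind, signal};

pub struct Terminate {
    sigterm: Signal,
}

impl Terminate {
    pub fn new() -> std::io::Result<Self> {
        Ok(Self {
            sigterm: signal(SignalKind::terminate())?,
        })
    }

    /// Resolves once SIGTERM has been received.
    pub fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<()> {
        match self.sigterm.poll_recv(cx) {
            // `None` means the signal stream closed; treat that as a
            // shutdown request too so the event loop always exits.
            Poll::Ready(Some(())) | Poll::Ready(None) => Poll::Ready(()),
            Poll::Pending => Poll::Pending,
        }
    }
}

Folding this into one type also removes the `#[cfg(unix)]` noise from the event loop; a non-Unix build could provide a stub with the same API.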

@@ -1,34 +0,0 @@
-#!/usr/bin/env bash
-
-source "./scripts/tests/lib.sh"
-
-# Arrange: Setup a relayed connection
-force_relayed_connections
-client_curl_resource "172.20.0.100/get"
-client_curl_resource "[172:20:0::100]/get"
-
-# Act: Send SIGTERM
-docker compose kill relay-1 --signal SIGTERM
-sleep 2 # Closing websocket isn't instant.
-
-# Assert: Dataplane still works
-client_curl_resource "172.20.0.100/get"
-client_curl_resource "[172:20:0::100]/get"
-
-# Assert: Websocket connection is cut
-OPEN_SOCKETS=$(relay1 netstat -tn | grep "ESTABLISHED" | grep 8081 || true) # Portal listens on port 8081
-test -z "$OPEN_SOCKETS"
-
-# Act: Send 2nd SIGTERM
-docker compose kill relay-1 --signal SIGTERM
-sleep 5 # Wait for container to be fully exited
-
-# Seems to be necessary to return the correct state
-docker compose ps relay-1 --all
-sleep 1
-
-# Assert: Container exited
-container_state=$(docker compose ps relay-1 --all --format json | jq --raw-output '.State')
-assert_equals "$container_state" "exited"