feat(relay): remove graceful shutdown (#10322)

Initially, we added graceful shutdown to the relay to better handle
deploys and keep downtime to a minimum. Now that the app and the
infrastructure are split, this functionality is no longer necessary:
portal deploys don't touch the relay infra at all.

Removing it also speeds up relay deploys, because systemd no longer has
to wait for its stop timeout after sending SIGTERM to the binary.
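
For context: when stopping a unit, systemd sends the configured KillSignal (SIGTERM by default) and waits up to TimeoutStopSec (90 seconds by default) before escalating to SIGKILL. A hypothetical unit file illustrating the timeout the message refers to; the binary path and values are assumptions, not the actual deployment:

[Service]
# Assumed path, for illustration only.
ExecStart=/usr/local/bin/firezone-relay
# Sent on `systemctl stop`; SIGTERM is also systemd's default.
KillSignal=SIGTERM
# How long systemd waits for the process to exit before sending SIGKILL.
# With graceful shutdown removed, the relay exits on the first SIGTERM
# and a deploy never runs into this timeout.
TimeoutStopSec=90s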
Author: Thomas Eizinger
Date: 2025-09-10 07:00:20 +00:00
Committed by: GitHub
Parent: af7f4c9992
Commit: f96cc3d583
3 changed files with 5 additions and 70 deletions


@@ -112,7 +112,6 @@ jobs:
             rust_log: debug
           - name: dns-nm
           - name: tcp-dns
-          - name: relay-graceful-shutdown
           - name: systemd/dns-systemd-resolved
     steps:
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2


@@ -3,7 +3,7 @@
 use anyhow::{Context, Result, bail};
 use backoff::ExponentialBackoffBuilder;
 use clap::Parser;
-use firezone_bin_shared::http_health_check;
+use firezone_bin_shared::{http_health_check, signals};
 use firezone_logging::{FilterReloadHandle, err_with_src, sentry_layer};
 use firezone_relay::sockets::Sockets;
 use firezone_relay::{
@@ -419,9 +419,7 @@ struct Eventloop<R> {
     ebpf: Option<ebpf::Program>,
-    #[cfg(unix)]
-    sigterm: tokio::signal::unix::Signal,
-    shutting_down: bool,
+    sigterm: signals::Terminate,
     stats_log_interval: tokio::time::Interval,
     last_num_bytes_relayed: u64,
@@ -475,9 +473,7 @@ where
             ebpf,
             buffer: [0u8; MAX_UDP_SIZE],
             last_heartbeat_sent,
-            #[cfg(unix)]
-            sigterm: tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())?,
-            shutting_down: false,
+            sigterm: signals::Terminate::new()?,
         })
     }
@@ -485,10 +481,6 @@ where
         loop {
             let mut ready = false;

-            if self.shutting_down && self.channel.is_none() && self.server.num_allocations() == 0 {
-                return Poll::Ready(Ok(()));
-            }
-
             ready!(self.sockets.flush(cx))?;

             // Priority 1: Execute the pending commands of the server.
@@ -667,31 +659,9 @@ where
                 Some(Poll::Pending) | None => {}
             }

-            #[cfg(unix)]
             match self.sigterm.poll_recv(cx) {
-                Poll::Ready(Some(())) => {
-                    if self.shutting_down {
-                        tracing::info!("Forcing shutdown on repeated SIGTERM");
-                        return Poll::Ready(Ok(()));
-                    }
-
-                    tracing::info!(active_allocations = %self.server.num_allocations(), "Received SIGTERM, initiating graceful shutdown");
-
-                    self.shutting_down = true;
-
-                    if let Some(portal) = self.channel.as_mut() {
-                        match portal.close() {
-                            Ok(()) => {}
-                            Err(phoenix_channel::Connecting) => {
-                                self.channel = None; // If we are still connecting, just discard the websocket connection.
-                            }
-                        }
-                    }
-
-                    ready = true;
-                }
-                Poll::Ready(None) | Poll::Pending => {}
+                Poll::Ready(()) => return Poll::Ready(Ok(())),
+                Poll::Pending => {}
             }

             if self.stats_log_interval.poll_tick(cx).is_ready() {

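The diff replaces the raw tokio signal handling with a `signals::Terminate` helper from `firezone_bin_shared`. Below is a minimal sketch of what such a helper could look like, inferred only from how the event loop uses it (a fallible `new()` and a `poll_recv` that yields `Poll<()>`); the actual implementation may differ:

// Sketch only: inferred from the call sites above, not the real
// `firezone_bin_shared::signals` module.
use std::task::{Context, Poll};

use tokio::signal::unix::{Signal, SignalKind, signal};

pub struct Terminate {
    sigterm: Signal,
}

impl Terminate {
    pub fn new() -> std::io::Result<Self> {
        Ok(Self {
            sigterm: signal(SignalKind::terminate())?,
        })
    }

    /// Resolves once SIGTERM has been received.
    pub fn poll_recv(&mut self, cx: &mut Context<'_>) -> Poll<()> {
        match self.sigterm.poll_recv(cx) {
            // `None` means the signal stream closed; treat that as a
            // shutdown request too so the event loop always exits.
            Poll::Ready(Some(())) | Poll::Ready(None) => Poll::Ready(()),
            Poll::Pending => Poll::Pending,
        }
    }
}

Folding this into one type also removes the `#[cfg(unix)]` noise from the event loop; a non-Unix build could provide a stub with the same API.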

@@ -1,34 +0,0 @@
-#!/usr/bin/env bash
-
-source "./scripts/tests/lib.sh"
-
-# Arrange: Setup a relayed connection
-force_relayed_connections
-client_curl_resource "172.20.0.100/get"
-client_curl_resource "[172:20:0::100]/get"
-
-# Act: Send SIGTERM
-docker compose kill relay-1 --signal SIGTERM
-sleep 2 # Closing websocket isn't instant.
-
-# Assert: Dataplane still works
-client_curl_resource "172.20.0.100/get"
-client_curl_resource "[172:20:0::100]/get"
-
-# Assert: Websocket connection is cut
-OPEN_SOCKETS=$(relay1 netstat -tn | grep "ESTABLISHED" | grep 8081 || true) # Portal listens on port 8081
-test -z "$OPEN_SOCKETS"
-
-# Act: Send 2nd SIGTERM
-docker compose kill relay-1 --signal SIGTERM
-sleep 5 # Wait for container to be fully exited
-
-# Seems to be necessary to return the correct state
-docker compose ps relay-1 --all
-sleep 1
-
-# Assert: Container exited
-container_state=$(docker compose ps relay-1 --all --format json | jq --raw-output '.State')
-assert_equals "$container_state" "exited"