fix(gateway): polish error handling in main (#7500)

Currently, the Gateway logs all errors that happen when the event-loop
exits on ERROR level. This creates Sentry alerts for things like
"Unauthorized" errors or "404 Not found".

That isn't useful to us. To mitigate this, we polish the code a bit to
only log an ERROR when we actually fail to setup something during
startup (like the TUN device). In all other cases, we now log a more
user-friendly message on INFO but still exit with the appropriate exit
code (0 on CTRL+C, 1 on any other error).
This commit is contained in:
Thomas Eizinger
2024-12-13 15:51:58 +11:00
committed by GitHub
parent db2dd4a618
commit 30376cd79a
4 changed files with 68 additions and 59 deletions

View File

@@ -56,8 +56,8 @@ jobs:
with:
tool: cargo-udeps,cargo-deny
- run: |
rustup install --no-self-update nightly-2024-09-01 --profile minimal # The exact nightly version doesn't matter, just pin a random one.
cargo +nightly-2024-09-01 udeps --all-targets --all-features ${{ steps.setup-rust.outputs.packages }}
rustup install --no-self-update nightly-2024-12-13 --profile minimal # The exact nightly version doesn't matter, just pin a random one.
cargo +nightly-2024-12-13 udeps --all-targets --all-features ${{ steps.setup-rust.outputs.packages }}
name: Check for unused dependencies
- run: cargo fmt -- --check
- run: cargo doc --all-features --no-deps --document-private-items ${{ steps.setup-rust.outputs.packages }}

View File

@@ -66,7 +66,10 @@ impl Eventloop {
}
impl Eventloop {
pub fn poll(&mut self, cx: &mut Context<'_>) -> Poll<Result<Infallible>> {
pub fn poll(
&mut self,
cx: &mut Context<'_>,
) -> Poll<Result<Infallible, phoenix_channel::Error>> {
loop {
match self.tunnel.poll_next_event(cx) {
Poll::Ready(Ok(event)) => {

View File

@@ -16,11 +16,11 @@ use phoenix_channel::LoginUrl;
use futures::channel::mpsc;
use futures::{future, StreamExt, TryFutureExt};
use phoenix_channel::{PhoenixChannel, PublicKeyParam};
use phoenix_channel::PhoenixChannel;
use secrecy::{Secret, SecretString};
use std::convert::Infallible;
use std::path::Path;
use std::pin::pin;
use std::process::ExitCode;
use std::sync::Arc;
use tokio::io::AsyncWriteExt;
use tokio::signal::ctrl_c;
@@ -32,7 +32,7 @@ mod eventloop;
const ID_PATH: &str = "/var/lib/firezone/gateway_id";
fn main() {
fn main() -> ExitCode {
rustls::crypto::ring::default_provider()
.install_default()
.expect("Calling `install_default` only once per process should always succeed");
@@ -52,23 +52,30 @@ fn main() {
.build()
.expect("Failed to create tokio runtime");
match runtime.block_on(try_main(cli, &mut telemetry)) {
Ok(()) => runtime.block_on(telemetry.stop()),
match runtime
.block_on(try_main(cli, &mut telemetry))
.context("Failed to start Gateway")
{
Ok(ExitCode::SUCCESS) => {
runtime.block_on(telemetry.stop());
ExitCode::SUCCESS
}
Ok(_) => {
runtime.block_on(telemetry.stop_on_crash());
ExitCode::FAILURE
}
Err(e) => {
// Enforce errors only being printed on a single line using the technique recommended in the anyhow docs:
// https://docs.rs/anyhow/latest/anyhow/struct.Error.html#display-representations
//
// By default, `anyhow` prints a stacktrace when it exits.
// That looks like a "crash" but we "just" exit with a fatal error.
tracing::error!(error = anyhow_dyn_err(&e));
runtime.block_on(telemetry.stop_on_crash());
std::process::exit(1);
ExitCode::FAILURE
}
}
}
async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result<()> {
async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result<ExitCode> {
firezone_logging::setup_global_subscriber(layer::Identity::default())?;
let firezone_id = get_firezone_id(cli.firezone_id).await
@@ -82,8 +89,35 @@ async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result<()> {
cli.firezone_name,
)?;
let task = tokio::spawn(run(login, cli.tun_threads)).err_into();
let mut tunnel = GatewayTunnel::new(Arc::new(tcp_socket_factory), Arc::new(udp_socket_factory));
let portal = PhoenixChannel::disconnected(
Secret::new(login),
get_user_agent(None, env!("CARGO_PKG_VERSION")),
PHOENIX_TOPIC,
(),
|| {
ExponentialBackoffBuilder::default()
.with_max_elapsed_time(None)
.build()
},
Arc::new(tcp_socket_factory),
)
.context("Failed to resolve portal URL")?;
let (sender, receiver) = mpsc::channel::<Interface>(10);
let mut tun_device_manager = TunDeviceManager::new(ip_packet::PACKET_SIZE, cli.tun_threads)?;
let tun = tun_device_manager.make_tun()?;
tunnel.set_tun(Box::new(tun));
tokio::spawn(update_device_task(tun_device_manager, receiver));
let task = tokio::spawn(future::poll_fn({
let mut eventloop = Eventloop::new(tunnel, portal, sender);
move |cx| eventloop.poll(cx)
}))
.err_into();
let ctrl_c = pin!(ctrl_c().map_err(anyhow::Error::new));
tokio::spawn(http_health_check::serve(
@@ -95,13 +129,17 @@ async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result<()> {
.await
.map_err(|e| e.factor_first().0)?
{
future::Either::Left((res, _)) => {
res?;
}
future::Either::Right(_) => {}
};
future::Either::Left((Err(e), _)) => {
tracing::info!("Failed to login to portal: {e}");
Ok(())
Ok(ExitCode::FAILURE)
}
future::Either::Right(((), _)) => {
tracing::info!("Received CTRL+C, goodbye!");
Ok(ExitCode::SUCCESS)
}
}
}
async fn get_firezone_id(env_id: Option<String>) -> Result<String> {
@@ -125,38 +163,6 @@ async fn get_firezone_id(env_id: Option<String>) -> Result<String> {
Ok(id)
}
async fn run(login: LoginUrl<PublicKeyParam>, num_tun_threads: usize) -> Result<Infallible> {
let mut tunnel = GatewayTunnel::new(Arc::new(tcp_socket_factory), Arc::new(udp_socket_factory));
let portal = PhoenixChannel::disconnected(
Secret::new(login),
get_user_agent(None, env!("CARGO_PKG_VERSION")),
PHOENIX_TOPIC,
(),
|| {
ExponentialBackoffBuilder::default()
.with_max_elapsed_time(None)
.build()
},
Arc::new(tcp_socket_factory),
)?;
let (sender, receiver) = mpsc::channel::<Interface>(10);
let mut tun_device_manager = TunDeviceManager::new(ip_packet::PACKET_SIZE, num_tun_threads)?;
let tun = tun_device_manager.make_tun()?;
tunnel.set_tun(Box::new(tun));
let update_device_task = update_device_task(tun_device_manager, receiver);
let mut eventloop = Eventloop::new(tunnel, portal, sender);
let eventloop_task = future::poll_fn(move |cx| eventloop.poll(cx));
let ((), result) = futures::join!(update_device_task, eventloop_task);
result.context("Eventloop failed")?;
unreachable!()
}
async fn update_device_task(
mut tun_device: TunDeviceManager,
mut receiver: mpsc::Receiver<Interface>,

View File

@@ -25,7 +25,6 @@ use serde::{de::DeserializeOwned, Deserialize, Serialize};
use socket_factory::{SocketFactory, TcpSocket, TcpStream};
use std::task::{Context, Poll, Waker};
use tokio_tungstenite::client_async_tls;
use tokio_tungstenite::tungstenite::http::StatusCode;
use tokio_tungstenite::{
tungstenite::{handshake::client::Request, Message},
MaybeTlsStream, WebSocketStream,
@@ -34,6 +33,7 @@ use url::Url;
pub use get_user_agent::get_user_agent;
pub use login_url::{DeviceInfo, LoginUrl, LoginUrlError, NoParams, PublicKeyParam};
pub use tokio_tungstenite::tungstenite::http::StatusCode;
const MAX_BUFFERED_MESSAGES: usize = 32; // Chosen pretty arbitrarily. If we are connected, these should never build up.
@@ -132,13 +132,13 @@ async fn connect(
#[derive(Debug, thiserror::Error)]
pub enum Error {
#[error("client error: {0}")]
#[error("Failed to establish WebSocket connection: {0}")]
Client(StatusCode),
#[error("token expired")]
#[error("Authentication token expired")]
TokenExpired,
#[error("max retries reached: {final_error}")]
#[error("Got disconnected from portal and hit the max-retry limit. Last connection error: {final_error}")]
MaxRetriesReached { final_error: String },
#[error("login failed: {0}")]
#[error("Failed to login with portal: {0}")]
LoginFailed(ErrorReply),
}