From 30376cd79ae3c39d5477e02c3d878906e9ea2901 Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Fri, 13 Dec 2024 15:51:58 +1100 Subject: [PATCH] fix(gateway): polish error handling in `main` (#7500) Currently, the Gateway logs all errors that happen when the event-loop exits on ERROR level. This creates Sentry alerts for things like "Unauthorized" errors or "404 Not found". That isn't useful to us. To mitigate this, we polish the code a bit to only log an ERROR when we actually fail to setup something during startup (like the TUN device). In all other cases, we now log a more user-friendly message on INFO but still exit with the appropriate exit code (0 on CTRL+C, 1 on any other error). --- .github/workflows/_rust.yml | 4 +- rust/gateway/src/eventloop.rs | 5 +- rust/gateway/src/main.rs | 108 +++++++++++++++++--------------- rust/phoenix-channel/src/lib.rs | 10 +-- 4 files changed, 68 insertions(+), 59 deletions(-) diff --git a/.github/workflows/_rust.yml b/.github/workflows/_rust.yml index 8414e2822..d7c1d4581 100644 --- a/.github/workflows/_rust.yml +++ b/.github/workflows/_rust.yml @@ -56,8 +56,8 @@ jobs: with: tool: cargo-udeps,cargo-deny - run: | - rustup install --no-self-update nightly-2024-09-01 --profile minimal # The exact nightly version doesn't matter, just pin a random one. - cargo +nightly-2024-09-01 udeps --all-targets --all-features ${{ steps.setup-rust.outputs.packages }} + rustup install --no-self-update nightly-2024-12-13 --profile minimal # The exact nightly version doesn't matter, just pin a random one. + cargo +nightly-2024-12-13 udeps --all-targets --all-features ${{ steps.setup-rust.outputs.packages }} name: Check for unused dependencies - run: cargo fmt -- --check - run: cargo doc --all-features --no-deps --document-private-items ${{ steps.setup-rust.outputs.packages }} diff --git a/rust/gateway/src/eventloop.rs b/rust/gateway/src/eventloop.rs index 48d8243a8..6f396e954 100644 --- a/rust/gateway/src/eventloop.rs +++ b/rust/gateway/src/eventloop.rs @@ -66,7 +66,10 @@ impl Eventloop { } impl Eventloop { - pub fn poll(&mut self, cx: &mut Context<'_>) -> Poll> { + pub fn poll( + &mut self, + cx: &mut Context<'_>, + ) -> Poll> { loop { match self.tunnel.poll_next_event(cx) { Poll::Ready(Ok(event)) => { diff --git a/rust/gateway/src/main.rs b/rust/gateway/src/main.rs index a13c94e4a..ed70cf0d4 100644 --- a/rust/gateway/src/main.rs +++ b/rust/gateway/src/main.rs @@ -16,11 +16,11 @@ use phoenix_channel::LoginUrl; use futures::channel::mpsc; use futures::{future, StreamExt, TryFutureExt}; -use phoenix_channel::{PhoenixChannel, PublicKeyParam}; +use phoenix_channel::PhoenixChannel; use secrecy::{Secret, SecretString}; -use std::convert::Infallible; use std::path::Path; use std::pin::pin; +use std::process::ExitCode; use std::sync::Arc; use tokio::io::AsyncWriteExt; use tokio::signal::ctrl_c; @@ -32,7 +32,7 @@ mod eventloop; const ID_PATH: &str = "/var/lib/firezone/gateway_id"; -fn main() { +fn main() -> ExitCode { rustls::crypto::ring::default_provider() .install_default() .expect("Calling `install_default` only once per process should always succeed"); @@ -52,23 +52,30 @@ fn main() { .build() .expect("Failed to create tokio runtime"); - match runtime.block_on(try_main(cli, &mut telemetry)) { - Ok(()) => runtime.block_on(telemetry.stop()), + match runtime + .block_on(try_main(cli, &mut telemetry)) + .context("Failed to start Gateway") + { + Ok(ExitCode::SUCCESS) => { + runtime.block_on(telemetry.stop()); + + ExitCode::SUCCESS + } + Ok(_) => { + runtime.block_on(telemetry.stop_on_crash()); + + ExitCode::FAILURE + } Err(e) => { - // Enforce errors only being printed on a single line using the technique recommended in the anyhow docs: - // https://docs.rs/anyhow/latest/anyhow/struct.Error.html#display-representations - // - // By default, `anyhow` prints a stacktrace when it exits. - // That looks like a "crash" but we "just" exit with a fatal error. tracing::error!(error = anyhow_dyn_err(&e)); runtime.block_on(telemetry.stop_on_crash()); - std::process::exit(1); + ExitCode::FAILURE } } } -async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result<()> { +async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result { firezone_logging::setup_global_subscriber(layer::Identity::default())?; let firezone_id = get_firezone_id(cli.firezone_id).await @@ -82,8 +89,35 @@ async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result<()> { cli.firezone_name, )?; - let task = tokio::spawn(run(login, cli.tun_threads)).err_into(); + let mut tunnel = GatewayTunnel::new(Arc::new(tcp_socket_factory), Arc::new(udp_socket_factory)); + let portal = PhoenixChannel::disconnected( + Secret::new(login), + get_user_agent(None, env!("CARGO_PKG_VERSION")), + PHOENIX_TOPIC, + (), + || { + ExponentialBackoffBuilder::default() + .with_max_elapsed_time(None) + .build() + }, + Arc::new(tcp_socket_factory), + ) + .context("Failed to resolve portal URL")?; + let (sender, receiver) = mpsc::channel::(10); + + let mut tun_device_manager = TunDeviceManager::new(ip_packet::PACKET_SIZE, cli.tun_threads)?; + let tun = tun_device_manager.make_tun()?; + tunnel.set_tun(Box::new(tun)); + + tokio::spawn(update_device_task(tun_device_manager, receiver)); + + let task = tokio::spawn(future::poll_fn({ + let mut eventloop = Eventloop::new(tunnel, portal, sender); + + move |cx| eventloop.poll(cx) + })) + .err_into(); let ctrl_c = pin!(ctrl_c().map_err(anyhow::Error::new)); tokio::spawn(http_health_check::serve( @@ -95,13 +129,17 @@ async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result<()> { .await .map_err(|e| e.factor_first().0)? { - future::Either::Left((res, _)) => { - res?; - } - future::Either::Right(_) => {} - }; + future::Either::Left((Err(e), _)) => { + tracing::info!("Failed to login to portal: {e}"); - Ok(()) + Ok(ExitCode::FAILURE) + } + future::Either::Right(((), _)) => { + tracing::info!("Received CTRL+C, goodbye!"); + + Ok(ExitCode::SUCCESS) + } + } } async fn get_firezone_id(env_id: Option) -> Result { @@ -125,38 +163,6 @@ async fn get_firezone_id(env_id: Option) -> Result { Ok(id) } -async fn run(login: LoginUrl, num_tun_threads: usize) -> Result { - let mut tunnel = GatewayTunnel::new(Arc::new(tcp_socket_factory), Arc::new(udp_socket_factory)); - let portal = PhoenixChannel::disconnected( - Secret::new(login), - get_user_agent(None, env!("CARGO_PKG_VERSION")), - PHOENIX_TOPIC, - (), - || { - ExponentialBackoffBuilder::default() - .with_max_elapsed_time(None) - .build() - }, - Arc::new(tcp_socket_factory), - )?; - - let (sender, receiver) = mpsc::channel::(10); - let mut tun_device_manager = TunDeviceManager::new(ip_packet::PACKET_SIZE, num_tun_threads)?; - let tun = tun_device_manager.make_tun()?; - tunnel.set_tun(Box::new(tun)); - - let update_device_task = update_device_task(tun_device_manager, receiver); - - let mut eventloop = Eventloop::new(tunnel, portal, sender); - let eventloop_task = future::poll_fn(move |cx| eventloop.poll(cx)); - - let ((), result) = futures::join!(update_device_task, eventloop_task); - - result.context("Eventloop failed")?; - - unreachable!() -} - async fn update_device_task( mut tun_device: TunDeviceManager, mut receiver: mpsc::Receiver, diff --git a/rust/phoenix-channel/src/lib.rs b/rust/phoenix-channel/src/lib.rs index 3d11cb31f..7a3efc125 100644 --- a/rust/phoenix-channel/src/lib.rs +++ b/rust/phoenix-channel/src/lib.rs @@ -25,7 +25,6 @@ use serde::{de::DeserializeOwned, Deserialize, Serialize}; use socket_factory::{SocketFactory, TcpSocket, TcpStream}; use std::task::{Context, Poll, Waker}; use tokio_tungstenite::client_async_tls; -use tokio_tungstenite::tungstenite::http::StatusCode; use tokio_tungstenite::{ tungstenite::{handshake::client::Request, Message}, MaybeTlsStream, WebSocketStream, @@ -34,6 +33,7 @@ use url::Url; pub use get_user_agent::get_user_agent; pub use login_url::{DeviceInfo, LoginUrl, LoginUrlError, NoParams, PublicKeyParam}; +pub use tokio_tungstenite::tungstenite::http::StatusCode; const MAX_BUFFERED_MESSAGES: usize = 32; // Chosen pretty arbitrarily. If we are connected, these should never build up. @@ -132,13 +132,13 @@ async fn connect( #[derive(Debug, thiserror::Error)] pub enum Error { - #[error("client error: {0}")] + #[error("Failed to establish WebSocket connection: {0}")] Client(StatusCode), - #[error("token expired")] + #[error("Authentication token expired")] TokenExpired, - #[error("max retries reached: {final_error}")] + #[error("Got disconnected from portal and hit the max-retry limit. Last connection error: {final_error}")] MaxRetriesReached { final_error: String }, - #[error("login failed: {0}")] + #[error("Failed to login with portal: {0}")] LoginFailed(ErrorReply), }