From de35bb067eab07438f8ea5ca4dfec447cce3bb18 Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Mon, 18 Nov 2024 18:17:08 +0000 Subject: [PATCH] fix(telemetry): don't embed errors values in `telemetry_event!` (#7366) Due to https://github.com/getsentry/sentry-rust/issues/702, errors which are embedded as `tracing::Value` unfortunately get silently discarded when reported as part of Sentry "Event"s and not "Exception"s. The design idea of these telemetry events is that they aren't fatal errors so we don't need to treat them with the highest priority. They may also appear quite often, so to save performance and bandwidth, we sample them at a rate of 1% at creation time. In order to not lose the context of these errors, we instead format them into the message. This makes them completely identical to the `debug!` logs which we have on every call-site of `telemetry_event!` which prompted me to make that implicit as part of creating the `telemetry_event!`. Resolves: #7343. --- rust/connlib/clients/shared/src/eventloop.rs | 5 ++--- rust/connlib/snownet/src/allocation.rs | 6 ++---- rust/connlib/snownet/src/node.rs | 6 +++--- rust/connlib/tunnel/src/client.rs | 16 +++++----------- rust/connlib/tunnel/src/io.rs | 7 +++---- rust/gateway/src/eventloop.rs | 11 +++++------ rust/logging/src/err_with_sources.rs | 4 ++-- rust/logging/src/lib.rs | 5 ++++- rust/logging/src/unwrap_or.rs | 4 ++-- rust/phoenix-channel/src/lib.rs | 4 ++-- 10 files changed, 30 insertions(+), 38 deletions(-) diff --git a/rust/connlib/clients/shared/src/eventloop.rs b/rust/connlib/clients/shared/src/eventloop.rs index 7e3a8f36a..9e4925174 100644 --- a/rust/connlib/clients/shared/src/eventloop.rs +++ b/rust/connlib/clients/shared/src/eventloop.rs @@ -1,7 +1,7 @@ use crate::{callbacks::Callbacks, PHOENIX_TOPIC}; use anyhow::Result; use connlib_model::ResourceId; -use firezone_logging::{anyhow_dyn_err, err_with_sources, std_dyn_err, telemetry_event}; +use firezone_logging::{anyhow_dyn_err, err_with_src, telemetry_event}; use firezone_tunnel::messages::{client::*, *}; use firezone_tunnel::ClientTunnel; use phoenix_channel::{ErrorReply, OutboundRequestId, PhoenixChannel, PublicKeyParam}; @@ -92,8 +92,7 @@ where continue; } Poll::Ready(Err(e)) => { - tracing::debug!("Tunnel error: {}", err_with_sources(&e)); - telemetry_event!(error = std_dyn_err(&e), "Tunnel error"); + telemetry_event!("Tunnel error: {}", err_with_src(&e)); continue; } Poll::Pending => {} diff --git a/rust/connlib/snownet/src/allocation.rs b/rust/connlib/snownet/src/allocation.rs index 58e2a593b..602fd8677 100644 --- a/rust/connlib/snownet/src/allocation.rs +++ b/rust/connlib/snownet/src/allocation.rs @@ -7,7 +7,7 @@ use crate::{ }; use ::backoff::backoff::Backoff; use bytecodec::{DecodeExt as _, EncodeExt as _}; -use firezone_logging::{err_with_sources, std_dyn_err}; +use firezone_logging::{err_with_src, std_dyn_err}; use hex_display::HexDisplayExt as _; use rand::random; use std::{ @@ -1248,9 +1248,7 @@ impl ChannelBindings { fn try_decode<'p>(&mut self, packet: &'p [u8], now: Instant) -> Option<(SocketAddr, &'p [u8])> { let (channel_number, payload) = crate::channel_data::decode(packet) - .inspect_err(|e| { - tracing::debug!("Malformed channel data message: {}", err_with_sources(e)) - }) + .inspect_err(|e| tracing::debug!("Malformed channel data message: {}", err_with_src(e))) .ok()?; let Some(channel) = self.inner.get_mut(&channel_number) else { tracing::debug!(%channel_number, "Unknown channel"); diff --git a/rust/connlib/snownet/src/node.rs b/rust/connlib/snownet/src/node.rs index d041787e8..3d9925b64 100644 --- a/rust/connlib/snownet/src/node.rs +++ b/rust/connlib/snownet/src/node.rs @@ -9,7 +9,7 @@ use boringtun::noise::{Tunn, TunnResult}; use boringtun::x25519::PublicKey; use boringtun::{noise::rate_limiter::RateLimiter, x25519::StaticSecret}; use core::fmt; -use firezone_logging::err_with_sources; +use firezone_logging::err_with_src; use hex_display::HexDisplayExt; use ip_packet::{ ConvertibleIpv4Packet, ConvertibleIpv6Packet, IpPacket, IpPacketBuf, MAX_DATAGRAM_PAYLOAD, @@ -336,7 +336,7 @@ where let candidate = match Candidate::from_sdp_string(&candidate) { Ok(c) => c, Err(e) => { - tracing::debug!("Failed to parse candidate: {}", err_with_sources(&e)); + tracing::debug!("Failed to parse candidate: {}", err_with_src(&e)); return; } }; @@ -372,7 +372,7 @@ where let candidate = match Candidate::from_sdp_string(&candidate) { Ok(c) => c, Err(e) => { - tracing::debug!("Failed to parse candidate: {}", err_with_sources(&e)); + tracing::debug!("Failed to parse candidate: {}", err_with_src(&e)); return; } }; diff --git a/rust/connlib/tunnel/src/client.rs b/rust/connlib/tunnel/src/client.rs index 9c8fc5884..53e9a2e7c 100644 --- a/rust/connlib/tunnel/src/client.rs +++ b/rust/connlib/tunnel/src/client.rs @@ -15,7 +15,7 @@ use connlib_model::PublicKey; use connlib_model::{GatewayId, RelayId, ResourceId, ResourceStatus, ResourceView}; use connlib_model::{Site, SiteId}; use firezone_logging::{ - anyhow_dyn_err, err_with_sources, std_dyn_err, telemetry_event, unwrap_or_debug, unwrap_or_warn, + anyhow_dyn_err, err_with_src, telemetry_event, unwrap_or_debug, unwrap_or_warn, }; use ip_network::{IpNetwork, Ipv4Network, Ipv6Network}; use ip_network_table::IpNetworkTable; @@ -327,7 +327,7 @@ impl ClientState { packet.as_ref(), now, ) - .inspect_err(|e| tracing::debug!(%local, num_bytes = %packet.len(), "Failed to decapsulate incoming packet: {}", err_with_sources(e))) + .inspect_err(|e| tracing::debug!(%local, num_bytes = %packet.len(), "Failed to decapsulate incoming packet: {}", err_with_src(e))) .ok()??; if self.tcp_dns_client.accepts(&packet) { @@ -381,9 +381,7 @@ impl ClientState { } }) .unwrap_or_else(|e| { - let error = std_dyn_err(&e); - tracing::debug!("Recursive UDP DNS query failed: {}", err_with_sources(&e)); - telemetry_event!(error, "Recursive UDP DNS query failed"); + telemetry_event!("Recursive UDP DNS query failed: {}", err_with_src(&e)); dns::servfail(response.query.for_slice_ref()) }); @@ -399,9 +397,7 @@ impl ClientState { tracing::trace!("Received recursive TCP DNS response"); }) .unwrap_or_else(|e| { - let error = std_dyn_err(&e); - tracing::debug!("Recursive TCP DNS query failed: {}", err_with_sources(&e)); - telemetry_event!(error, "Recursive TCP DNS query failed"); + telemetry_event!("Recursive TCP DNS query failed: {}", err_with_src(&e)); dns::servfail(response.query.for_slice_ref()) }); @@ -453,9 +449,7 @@ impl ClientState { let transmit = self .node .encapsulate(gid, packet, now, buffer) - .inspect_err( - |e| tracing::debug!(%gid, "Failed to encapsulate: {}", err_with_sources(e)), - ) + .inspect_err(|e| tracing::debug!(%gid, "Failed to encapsulate: {}", err_with_src(e))) .ok()??; Some(transmit) diff --git a/rust/connlib/tunnel/src/io.rs b/rust/connlib/tunnel/src/io.rs index 42516c00d..c31b7a9ce 100644 --- a/rust/connlib/tunnel/src/io.rs +++ b/rust/connlib/tunnel/src/io.rs @@ -1,6 +1,6 @@ use crate::{device_channel::Device, dns, sockets::Sockets}; use domain::base::Message; -use firezone_logging::{err_with_sources, telemetry_event, telemetry_span}; +use firezone_logging::{err_with_src, telemetry_event, telemetry_span}; use futures::{ future::{self, Either}, stream, Stream, StreamExt, @@ -361,7 +361,7 @@ async fn tun_send_recv( { Either::Left((Some(Command::SendPacket(p)), _)) => { if let Err(e) = device.write(p) { - tracing::debug!("Failed to write TUN packet: {}", err_with_sources(&e)); + tracing::debug!("Failed to write TUN packet: {}", err_with_src(&e)); }; } Either::Left((Some(Command::UpdateTun(tun)), _)) => { @@ -380,7 +380,7 @@ async fn tun_send_recv( Either::Right((Err(e), _)) => { tracing::debug!( "Failed to read packet from TUN device: {}", - err_with_sources(&e) + err_with_src(&e) ); } }; @@ -419,7 +419,6 @@ fn outgoing_packet_stream( fn is_max_wg_packet_size(d: &DatagramIn) -> bool { let len = d.packet.len(); if len > MAX_DATAGRAM_PAYLOAD { - tracing::debug!(from = %d.from, %len, "Dropping too large datagram (max allowed: {MAX_DATAGRAM_PAYLOAD} bytes)"); telemetry_event!(from = %d.from, %len, "Dropping too large datagram (max allowed: {MAX_DATAGRAM_PAYLOAD} bytes)"); return false; diff --git a/rust/gateway/src/eventloop.rs b/rust/gateway/src/eventloop.rs index cb49d51e4..43af26e22 100644 --- a/rust/gateway/src/eventloop.rs +++ b/rust/gateway/src/eventloop.rs @@ -5,7 +5,7 @@ use connlib_model::{ClientId, ResourceId}; #[cfg(not(target_os = "windows"))] use dns_lookup::{AddrInfoHints, AddrInfoIter, LookupError}; use firezone_logging::{ - anyhow_dyn_err, err_with_sources, std_dyn_err, telemetry_event, telemetry_span, + anyhow_dyn_err, err_with_src, std_dyn_err, telemetry_event, telemetry_span, }; use firezone_tunnel::messages::gateway::{ AllowAccess, ClientIceCandidates, ClientsIceCandidates, ConnectionReady, EgressMessages, @@ -80,8 +80,7 @@ impl Eventloop { continue; } Poll::Ready(Err(e)) => { - tracing::debug!("Tunnel error: {}", err_with_sources(&e)); - telemetry_event!(error = std_dyn_err(&e), "Tunnel error"); + telemetry_event!("Tunnel error: {}", err_with_src(&e)); continue; } Poll::Pending => {} @@ -355,7 +354,7 @@ impl Eventloop { req: RequestConnection, ) { let addresses = result - .inspect_err(|e| tracing::debug!(client = %req.client.id, reference = %req.reference, "DNS resolution timed out as part of connection request: {}", err_with_sources(e))) + .inspect_err(|e| tracing::debug!(client = %req.client.id, reference = %req.reference, "DNS resolution timed out as part of connection request: {}", err_with_src(e))) .unwrap_or_default(); let answer = match self.tunnel.state_mut().accept( @@ -411,7 +410,7 @@ impl Eventloop { pub fn allow_access(&mut self, result: Result, Timeout>, req: AllowAccess) { let addresses = result - .inspect_err(|e| tracing::debug!(client = %req.client_id, reference = %req.reference, "DNS resolution timed out as part of allow access request: {}", err_with_sources(e))) + .inspect_err(|e| tracing::debug!(client = %req.client_id, reference = %req.reference, "DNS resolution timed out as part of allow access request: {}", err_with_src(e))) .unwrap_or_default(); if let Err(e) = self.tunnel.state_mut().allow_access( @@ -435,7 +434,7 @@ impl Eventloop { name: DomainName, ) { let addresses = result - .inspect_err(|e| tracing::debug!(%conn_id, "DNS resolution timed out as part of allow access request: {}", err_with_sources(e))) + .inspect_err(|e| tracing::debug!(%conn_id, "DNS resolution timed out as part of allow access request: {}", err_with_src(e))) .unwrap_or_default(); self.tunnel.state_mut().refresh_translation( diff --git a/rust/logging/src/err_with_sources.rs b/rust/logging/src/err_with_sources.rs index 61a9c0f0d..88e9a4401 100644 --- a/rust/logging/src/err_with_sources.rs +++ b/rust/logging/src/err_with_sources.rs @@ -2,7 +2,7 @@ use core::fmt; use std::error::Error; /// Returns a [`fmt::Display`] adapter that prints the error and all its sources. -pub fn err_with_sources<'a>(e: &'a (dyn Error + 'static)) -> ErrorWithSources<'a> { +pub fn err_with_src<'a>(e: &'a (dyn Error + 'static)) -> ErrorWithSources<'a> { ErrorWithSources { e } } @@ -30,7 +30,7 @@ mod tests { fn prints_errors_with_sources() { let error = Error3(Error2(Error1)); - let display = err_with_sources(&error); + let display = err_with_src(&error); assert_eq!(display.to_string(), "Argh: Failed to do the thing: oh no!"); } diff --git a/rust/logging/src/lib.rs b/rust/logging/src/lib.rs index 0e8c47bf7..8320faa24 100644 --- a/rust/logging/src/lib.rs +++ b/rust/logging/src/lib.rs @@ -17,7 +17,7 @@ use tracing_subscriber::{ }; pub use dyn_err::{anyhow_dyn_err, std_dyn_err}; -pub use err_with_sources::{err_with_sources, ErrorWithSources}; +pub use err_with_sources::{err_with_src, ErrorWithSources}; pub use format::Format; /// Registers a global subscriber with stdout logging and `additional_layer` @@ -152,12 +152,15 @@ macro_rules! telemetry_span { /// Creates a `telemetry` event. /// /// In order to save CPU power, `telemetry` events are sampled at a rate of 1% at creation time. +/// In addition, all telemetry events are logged at the `DEBUG` level. #[macro_export] macro_rules! telemetry_event { ($($arg:tt)*) => { if $crate::__export::rand::random::() < $crate::TELEMETRY_SAMPLE_RATE { $crate::__export::tracing::trace!(target: $crate::TELEMETRY_TARGET, $($arg)*); } + + $crate::__export::tracing::debug!($($arg)*); }; } diff --git a/rust/logging/src/unwrap_or.rs b/rust/logging/src/unwrap_or.rs index 087d12c5d..23eedb3a6 100644 --- a/rust/logging/src/unwrap_or.rs +++ b/rust/logging/src/unwrap_or.rs @@ -26,7 +26,7 @@ macro_rules! unwrap_or_debug { Err(e) => { let error: &dyn ::std::error::Error = e.as_ref(); - ::tracing::debug!($($arg)*, $crate::err_with_sources(error)) + ::tracing::debug!($($arg)*, $crate::err_with_src(error)) } } }; @@ -43,7 +43,7 @@ macro_rules! unwrap_or_trace { Err(e) => { let error: &dyn ::std::error::Error = e.as_ref(); - ::tracing::debug!($($arg)*, $crate::err_with_sources(error)) + ::tracing::debug!($($arg)*, $crate::err_with_src(error)) } } }; diff --git a/rust/phoenix-channel/src/lib.rs b/rust/phoenix-channel/src/lib.rs index e2add7be0..25f86bd2e 100644 --- a/rust/phoenix-channel/src/lib.rs +++ b/rust/phoenix-channel/src/lib.rs @@ -15,7 +15,7 @@ use std::{io, mem}; use backoff::backoff::Backoff; use backoff::ExponentialBackoff; use base64::Engine; -use firezone_logging::{err_with_sources, std_dyn_err, telemetry_span}; +use firezone_logging::{err_with_src, std_dyn_err, telemetry_span}; use futures::future::BoxFuture; use futures::{FutureExt, SinkExt, StreamExt}; use heartbeat::{Heartbeat, MissedLastHeartbeat}; @@ -422,7 +422,7 @@ where let socket_factory = self.socket_factory.clone(); let socket_addresses = self.socket_addresses(); - tracing::debug!(?backoff, max_elapsed_time = ?self.reconnect_backoff.max_elapsed_time, "Reconnecting to portal on transient client error: {}", err_with_sources(&e)); + tracing::debug!(?backoff, max_elapsed_time = ?self.reconnect_backoff.max_elapsed_time, "Reconnecting to portal on transient client error: {}", err_with_src(&e)); self.state = State::Connecting(Box::pin(async move { tokio::time::sleep(backoff).await;