fix(telemetry): don't embed errors values in telemetry_event! (#7366)

Due to https://github.com/getsentry/sentry-rust/issues/702, errors which
are embedded as `tracing::Value` unfortunately get silently discarded
when reported as part of Sentry "Event"s and not "Exception"s.

The design idea of these telemetry events is that they aren't fatal
errors so we don't need to treat them with the highest priority. They
may also appear quite often, so to save performance and bandwidth, we
sample them at a rate of 1% at creation time.

In order to not lose the context of these errors, we instead format them
into the message. This makes them completely identical to the `debug!`
logs which we have on every call-site of `telemetry_event!` which
prompted me to make that implicit as part of creating the
`telemetry_event!`.

Resolves: #7343.
This commit is contained in:
Thomas Eizinger
2024-11-18 18:17:08 +00:00
committed by GitHub
parent 24f7ba530d
commit de35bb067e
10 changed files with 30 additions and 38 deletions

View File

@@ -1,7 +1,7 @@
use crate::{callbacks::Callbacks, PHOENIX_TOPIC};
use anyhow::Result;
use connlib_model::ResourceId;
use firezone_logging::{anyhow_dyn_err, err_with_sources, std_dyn_err, telemetry_event};
use firezone_logging::{anyhow_dyn_err, err_with_src, telemetry_event};
use firezone_tunnel::messages::{client::*, *};
use firezone_tunnel::ClientTunnel;
use phoenix_channel::{ErrorReply, OutboundRequestId, PhoenixChannel, PublicKeyParam};
@@ -92,8 +92,7 @@ where
continue;
}
Poll::Ready(Err(e)) => {
tracing::debug!("Tunnel error: {}", err_with_sources(&e));
telemetry_event!(error = std_dyn_err(&e), "Tunnel error");
telemetry_event!("Tunnel error: {}", err_with_src(&e));
continue;
}
Poll::Pending => {}

View File

@@ -7,7 +7,7 @@ use crate::{
};
use ::backoff::backoff::Backoff;
use bytecodec::{DecodeExt as _, EncodeExt as _};
use firezone_logging::{err_with_sources, std_dyn_err};
use firezone_logging::{err_with_src, std_dyn_err};
use hex_display::HexDisplayExt as _;
use rand::random;
use std::{
@@ -1248,9 +1248,7 @@ impl ChannelBindings {
fn try_decode<'p>(&mut self, packet: &'p [u8], now: Instant) -> Option<(SocketAddr, &'p [u8])> {
let (channel_number, payload) = crate::channel_data::decode(packet)
.inspect_err(|e| {
tracing::debug!("Malformed channel data message: {}", err_with_sources(e))
})
.inspect_err(|e| tracing::debug!("Malformed channel data message: {}", err_with_src(e)))
.ok()?;
let Some(channel) = self.inner.get_mut(&channel_number) else {
tracing::debug!(%channel_number, "Unknown channel");

View File

@@ -9,7 +9,7 @@ use boringtun::noise::{Tunn, TunnResult};
use boringtun::x25519::PublicKey;
use boringtun::{noise::rate_limiter::RateLimiter, x25519::StaticSecret};
use core::fmt;
use firezone_logging::err_with_sources;
use firezone_logging::err_with_src;
use hex_display::HexDisplayExt;
use ip_packet::{
ConvertibleIpv4Packet, ConvertibleIpv6Packet, IpPacket, IpPacketBuf, MAX_DATAGRAM_PAYLOAD,
@@ -336,7 +336,7 @@ where
let candidate = match Candidate::from_sdp_string(&candidate) {
Ok(c) => c,
Err(e) => {
tracing::debug!("Failed to parse candidate: {}", err_with_sources(&e));
tracing::debug!("Failed to parse candidate: {}", err_with_src(&e));
return;
}
};
@@ -372,7 +372,7 @@ where
let candidate = match Candidate::from_sdp_string(&candidate) {
Ok(c) => c,
Err(e) => {
tracing::debug!("Failed to parse candidate: {}", err_with_sources(&e));
tracing::debug!("Failed to parse candidate: {}", err_with_src(&e));
return;
}
};

View File

@@ -15,7 +15,7 @@ use connlib_model::PublicKey;
use connlib_model::{GatewayId, RelayId, ResourceId, ResourceStatus, ResourceView};
use connlib_model::{Site, SiteId};
use firezone_logging::{
anyhow_dyn_err, err_with_sources, std_dyn_err, telemetry_event, unwrap_or_debug, unwrap_or_warn,
anyhow_dyn_err, err_with_src, telemetry_event, unwrap_or_debug, unwrap_or_warn,
};
use ip_network::{IpNetwork, Ipv4Network, Ipv6Network};
use ip_network_table::IpNetworkTable;
@@ -327,7 +327,7 @@ impl ClientState {
packet.as_ref(),
now,
)
.inspect_err(|e| tracing::debug!(%local, num_bytes = %packet.len(), "Failed to decapsulate incoming packet: {}", err_with_sources(e)))
.inspect_err(|e| tracing::debug!(%local, num_bytes = %packet.len(), "Failed to decapsulate incoming packet: {}", err_with_src(e)))
.ok()??;
if self.tcp_dns_client.accepts(&packet) {
@@ -381,9 +381,7 @@ impl ClientState {
}
})
.unwrap_or_else(|e| {
let error = std_dyn_err(&e);
tracing::debug!("Recursive UDP DNS query failed: {}", err_with_sources(&e));
telemetry_event!(error, "Recursive UDP DNS query failed");
telemetry_event!("Recursive UDP DNS query failed: {}", err_with_src(&e));
dns::servfail(response.query.for_slice_ref())
});
@@ -399,9 +397,7 @@ impl ClientState {
tracing::trace!("Received recursive TCP DNS response");
})
.unwrap_or_else(|e| {
let error = std_dyn_err(&e);
tracing::debug!("Recursive TCP DNS query failed: {}", err_with_sources(&e));
telemetry_event!(error, "Recursive TCP DNS query failed");
telemetry_event!("Recursive TCP DNS query failed: {}", err_with_src(&e));
dns::servfail(response.query.for_slice_ref())
});
@@ -453,9 +449,7 @@ impl ClientState {
let transmit = self
.node
.encapsulate(gid, packet, now, buffer)
.inspect_err(
|e| tracing::debug!(%gid, "Failed to encapsulate: {}", err_with_sources(e)),
)
.inspect_err(|e| tracing::debug!(%gid, "Failed to encapsulate: {}", err_with_src(e)))
.ok()??;
Some(transmit)

View File

@@ -1,6 +1,6 @@
use crate::{device_channel::Device, dns, sockets::Sockets};
use domain::base::Message;
use firezone_logging::{err_with_sources, telemetry_event, telemetry_span};
use firezone_logging::{err_with_src, telemetry_event, telemetry_span};
use futures::{
future::{self, Either},
stream, Stream, StreamExt,
@@ -361,7 +361,7 @@ async fn tun_send_recv(
{
Either::Left((Some(Command::SendPacket(p)), _)) => {
if let Err(e) = device.write(p) {
tracing::debug!("Failed to write TUN packet: {}", err_with_sources(&e));
tracing::debug!("Failed to write TUN packet: {}", err_with_src(&e));
};
}
Either::Left((Some(Command::UpdateTun(tun)), _)) => {
@@ -380,7 +380,7 @@ async fn tun_send_recv(
Either::Right((Err(e), _)) => {
tracing::debug!(
"Failed to read packet from TUN device: {}",
err_with_sources(&e)
err_with_src(&e)
);
}
};
@@ -419,7 +419,6 @@ fn outgoing_packet_stream(
fn is_max_wg_packet_size(d: &DatagramIn) -> bool {
let len = d.packet.len();
if len > MAX_DATAGRAM_PAYLOAD {
tracing::debug!(from = %d.from, %len, "Dropping too large datagram (max allowed: {MAX_DATAGRAM_PAYLOAD} bytes)");
telemetry_event!(from = %d.from, %len, "Dropping too large datagram (max allowed: {MAX_DATAGRAM_PAYLOAD} bytes)");
return false;

View File

@@ -5,7 +5,7 @@ use connlib_model::{ClientId, ResourceId};
#[cfg(not(target_os = "windows"))]
use dns_lookup::{AddrInfoHints, AddrInfoIter, LookupError};
use firezone_logging::{
anyhow_dyn_err, err_with_sources, std_dyn_err, telemetry_event, telemetry_span,
anyhow_dyn_err, err_with_src, std_dyn_err, telemetry_event, telemetry_span,
};
use firezone_tunnel::messages::gateway::{
AllowAccess, ClientIceCandidates, ClientsIceCandidates, ConnectionReady, EgressMessages,
@@ -80,8 +80,7 @@ impl Eventloop {
continue;
}
Poll::Ready(Err(e)) => {
tracing::debug!("Tunnel error: {}", err_with_sources(&e));
telemetry_event!(error = std_dyn_err(&e), "Tunnel error");
telemetry_event!("Tunnel error: {}", err_with_src(&e));
continue;
}
Poll::Pending => {}
@@ -355,7 +354,7 @@ impl Eventloop {
req: RequestConnection,
) {
let addresses = result
.inspect_err(|e| tracing::debug!(client = %req.client.id, reference = %req.reference, "DNS resolution timed out as part of connection request: {}", err_with_sources(e)))
.inspect_err(|e| tracing::debug!(client = %req.client.id, reference = %req.reference, "DNS resolution timed out as part of connection request: {}", err_with_src(e)))
.unwrap_or_default();
let answer = match self.tunnel.state_mut().accept(
@@ -411,7 +410,7 @@ impl Eventloop {
pub fn allow_access(&mut self, result: Result<Vec<IpAddr>, Timeout>, req: AllowAccess) {
let addresses = result
.inspect_err(|e| tracing::debug!(client = %req.client_id, reference = %req.reference, "DNS resolution timed out as part of allow access request: {}", err_with_sources(e)))
.inspect_err(|e| tracing::debug!(client = %req.client_id, reference = %req.reference, "DNS resolution timed out as part of allow access request: {}", err_with_src(e)))
.unwrap_or_default();
if let Err(e) = self.tunnel.state_mut().allow_access(
@@ -435,7 +434,7 @@ impl Eventloop {
name: DomainName,
) {
let addresses = result
.inspect_err(|e| tracing::debug!(%conn_id, "DNS resolution timed out as part of allow access request: {}", err_with_sources(e)))
.inspect_err(|e| tracing::debug!(%conn_id, "DNS resolution timed out as part of allow access request: {}", err_with_src(e)))
.unwrap_or_default();
self.tunnel.state_mut().refresh_translation(

View File

@@ -2,7 +2,7 @@ use core::fmt;
use std::error::Error;
/// Returns a [`fmt::Display`] adapter that prints the error and all its sources.
pub fn err_with_sources<'a>(e: &'a (dyn Error + 'static)) -> ErrorWithSources<'a> {
pub fn err_with_src<'a>(e: &'a (dyn Error + 'static)) -> ErrorWithSources<'a> {
ErrorWithSources { e }
}
@@ -30,7 +30,7 @@ mod tests {
fn prints_errors_with_sources() {
let error = Error3(Error2(Error1));
let display = err_with_sources(&error);
let display = err_with_src(&error);
assert_eq!(display.to_string(), "Argh: Failed to do the thing: oh no!");
}

View File

@@ -17,7 +17,7 @@ use tracing_subscriber::{
};
pub use dyn_err::{anyhow_dyn_err, std_dyn_err};
pub use err_with_sources::{err_with_sources, ErrorWithSources};
pub use err_with_sources::{err_with_src, ErrorWithSources};
pub use format::Format;
/// Registers a global subscriber with stdout logging and `additional_layer`
@@ -152,12 +152,15 @@ macro_rules! telemetry_span {
/// Creates a `telemetry` event.
///
/// In order to save CPU power, `telemetry` events are sampled at a rate of 1% at creation time.
/// In addition, all telemetry events are logged at the `DEBUG` level.
#[macro_export]
macro_rules! telemetry_event {
($($arg:tt)*) => {
if $crate::__export::rand::random::<f32>() < $crate::TELEMETRY_SAMPLE_RATE {
$crate::__export::tracing::trace!(target: $crate::TELEMETRY_TARGET, $($arg)*);
}
$crate::__export::tracing::debug!($($arg)*);
};
}

View File

@@ -26,7 +26,7 @@ macro_rules! unwrap_or_debug {
Err(e) => {
let error: &dyn ::std::error::Error = e.as_ref();
::tracing::debug!($($arg)*, $crate::err_with_sources(error))
::tracing::debug!($($arg)*, $crate::err_with_src(error))
}
}
};
@@ -43,7 +43,7 @@ macro_rules! unwrap_or_trace {
Err(e) => {
let error: &dyn ::std::error::Error = e.as_ref();
::tracing::debug!($($arg)*, $crate::err_with_sources(error))
::tracing::debug!($($arg)*, $crate::err_with_src(error))
}
}
};

View File

@@ -15,7 +15,7 @@ use std::{io, mem};
use backoff::backoff::Backoff;
use backoff::ExponentialBackoff;
use base64::Engine;
use firezone_logging::{err_with_sources, std_dyn_err, telemetry_span};
use firezone_logging::{err_with_src, std_dyn_err, telemetry_span};
use futures::future::BoxFuture;
use futures::{FutureExt, SinkExt, StreamExt};
use heartbeat::{Heartbeat, MissedLastHeartbeat};
@@ -422,7 +422,7 @@ where
let socket_factory = self.socket_factory.clone();
let socket_addresses = self.socket_addresses();
tracing::debug!(?backoff, max_elapsed_time = ?self.reconnect_backoff.max_elapsed_time, "Reconnecting to portal on transient client error: {}", err_with_sources(&e));
tracing::debug!(?backoff, max_elapsed_time = ?self.reconnect_backoff.max_elapsed_time, "Reconnecting to portal on transient client error: {}", err_with_src(&e));
self.state = State::Connecting(Box::pin(async move {
tokio::time::sleep(backoff).await;