diff --git a/rust/connlib/tunnel/src/client.rs b/rust/connlib/tunnel/src/client.rs index d4d33c273..7ac54e551 100644 --- a/rust/connlib/tunnel/src/client.rs +++ b/rust/connlib/tunnel/src/client.rs @@ -14,7 +14,9 @@ use bimap::BiMap; use connlib_model::PublicKey; use connlib_model::{GatewayId, RelayId, ResourceId, ResourceStatus, ResourceView}; use connlib_model::{Site, SiteId}; -use firezone_logging::{anyhow_dyn_err, std_dyn_err, unwrap_or_debug, unwrap_or_warn}; +use firezone_logging::{ + anyhow_dyn_err, std_dyn_err, telemetry_event, unwrap_or_debug, unwrap_or_warn, +}; use ip_network::{IpNetwork, Ipv4Network, Ipv6Network}; use ip_network_table::IpNetworkTable; use ip_packet::{IpPacket, UdpSlice, MAX_DATAGRAM_PAYLOAD}; @@ -381,7 +383,7 @@ impl ClientState { .unwrap_or_else(|e| { let error = std_dyn_err(&e); tracing::debug!(error, "Recursive UDP DNS query failed"); - tracing::trace!(target: "telemetry", error, "Recursive UDP DNS query failed"); + telemetry_event!(error, "Recursive UDP DNS query failed"); dns::servfail(response.query.for_slice_ref()) }); @@ -399,7 +401,7 @@ impl ClientState { .unwrap_or_else(|e| { let error = std_dyn_err(&e); tracing::debug!(error, "Recursive TCP DNS query failed"); - tracing::trace!(target: "telemetry", error, "Recursive TCP DNS query failed"); + telemetry_event!(error, "Recursive TCP DNS query failed"); dns::servfail(response.query.for_slice_ref()) }); diff --git a/rust/connlib/tunnel/src/dns.rs b/rust/connlib/tunnel/src/dns.rs index 6760fac6d..992656ffd 100644 --- a/rust/connlib/tunnel/src/dns.rs +++ b/rust/connlib/tunnel/src/dns.rs @@ -10,7 +10,7 @@ use domain::{ }, dep::octseq::OctetsInto, }; -use firezone_logging::{anyhow_dyn_err, std_dyn_err}; +use firezone_logging::{anyhow_dyn_err, std_dyn_err, telemetry_span}; use itertools::Itertools; use pattern::{Candidate, Pattern}; use std::io; @@ -233,6 +233,8 @@ impl StubResolver { /// /// This performs a linear search and is thus O(N) and **must not** be called in the hot-path of packet routing. fn match_resource_linear(&self, domain: &DomainName) -> Option { + let _span = telemetry_span!("match_resource_linear").entered(); + let name = Candidate::from_domain(domain); for (pattern, id) in &self.dns_resources { diff --git a/rust/connlib/tunnel/src/gateway.rs b/rust/connlib/tunnel/src/gateway.rs index 4c59ddfe9..245c55c0c 100644 --- a/rust/connlib/tunnel/src/gateway.rs +++ b/rust/connlib/tunnel/src/gateway.rs @@ -8,7 +8,7 @@ use anyhow::Context; use boringtun::x25519::PublicKey; use chrono::{DateTime, Utc}; use connlib_model::{ClientId, DomainName, RelayId, ResourceId}; -use firezone_logging::{anyhow_dyn_err, std_dyn_err}; +use firezone_logging::{anyhow_dyn_err, std_dyn_err, telemetry_span}; use ip_network::{Ipv4Network, Ipv6Network}; use ip_packet::{FzP2pControlSlice, IpPacket}; use secrecy::{ExposeSecret as _, Secret}; @@ -259,6 +259,8 @@ impl GatewayState { resolved_ips: Vec, now: Instant, ) { + let _span = telemetry_span!("refresh_translation").entered(); + let Some(peer) = self.peers.get_mut(&client) else { return; }; diff --git a/rust/connlib/tunnel/src/io.rs b/rust/connlib/tunnel/src/io.rs index f9489c6bb..c86f3cc67 100644 --- a/rust/connlib/tunnel/src/io.rs +++ b/rust/connlib/tunnel/src/io.rs @@ -1,6 +1,6 @@ use crate::{device_channel::Device, dns, sockets::Sockets}; use domain::base::Message; -use firezone_logging::std_dyn_err; +use firezone_logging::{std_dyn_err, telemetry_span}; use futures::{ future::{self, Either}, stream, Stream, StreamExt, @@ -22,6 +22,7 @@ use tokio::{ io::{AsyncReadExt, AsyncWriteExt}, sync::mpsc, }; +use tracing::Instrument; use tun::Tun; /// Bundles together all side-effects that connlib needs to have access to. @@ -252,7 +253,8 @@ impl Io { .map_err(|_| io::Error::other("Failed to parse DNS message"))?; Ok(message) - }, + } + .instrument(telemetry_span!("recursive_udp_dns_query")), meta, ) .is_err() @@ -294,7 +296,8 @@ impl Io { .map_err(|_| io::Error::other("Failed to parse DNS message"))?; Ok(message) - }, + } + .instrument(telemetry_span!("recursive_tcp_dns_query")), meta, ) .is_err() diff --git a/rust/gateway/src/eventloop.rs b/rust/gateway/src/eventloop.rs index 7244a5fd6..c4b5fa326 100644 --- a/rust/gateway/src/eventloop.rs +++ b/rust/gateway/src/eventloop.rs @@ -4,7 +4,7 @@ use connlib_model::DomainName; use connlib_model::{ClientId, ResourceId}; #[cfg(not(target_os = "windows"))] use dns_lookup::{AddrInfoHints, AddrInfoIter, LookupError}; -use firezone_logging::{anyhow_dyn_err, std_dyn_err}; +use firezone_logging::{anyhow_dyn_err, std_dyn_err, telemetry_span}; use firezone_tunnel::messages::gateway::{ AllowAccess, ClientIceCandidates, ClientsIceCandidates, ConnectionReady, EgressMessages, IngressMessages, RejectAccess, RequestConnection, @@ -20,6 +20,7 @@ use std::io; use std::net::IpAddr; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; +use tracing::Instrument; pub const PHOENIX_TOPIC: &str = "gateway"; @@ -451,7 +452,10 @@ async fn resolve(domain: Option) -> Vec { let dname = domain.to_string(); - match tokio::task::spawn_blocking(move || resolve_addresses(&dname)).await { + match tokio::task::spawn_blocking(move || resolve_addresses(&dname)) + .instrument(telemetry_span!("resolve_dns_resource")) + .await + { Ok(Ok(addresses)) => addresses, Ok(Err(e)) => { tracing::warn!(error = std_dyn_err(&e), %domain, "DNS resolution failed"); diff --git a/rust/headless-client/src/ipc_service.rs b/rust/headless-client/src/ipc_service.rs index 17f3695d3..61c6657af 100644 --- a/rust/headless-client/src/ipc_service.rs +++ b/rust/headless-client/src/ipc_service.rs @@ -9,7 +9,7 @@ use firezone_bin_shared::{ platform::{tcp_socket_factory, udp_socket_factory, DnsControlMethod}, TunDeviceManager, TOKEN_ENV_KEY, }; -use firezone_logging::{anyhow_dyn_err, std_dyn_err}; +use firezone_logging::{anyhow_dyn_err, std_dyn_err, telemetry_span}; use firezone_telemetry::Telemetry; use futures::{ future::poll_fn, @@ -558,8 +558,7 @@ impl<'a> Handler<'a> { /// /// Throws matchable errors for bad URLs, unable to reach the portal, or unable to create the tunnel device fn connect_to_firezone(&mut self, api_url: &str, token: SecretString) -> Result<(), Error> { - let _connect_span = - tracing::trace_span!(target: "telemetry", "connect_to_firezone").entered(); + let _connect_span = telemetry_span!("connect_to_firezone").entered(); assert!(self.session.is_none()); let device_id = device_id::get_or_create().map_err(|e| Error::DeviceId(e.to_string()))?; @@ -606,7 +605,7 @@ impl<'a> Handler<'a> { connlib.set_dns(dns); let tun = { - let _guard = tracing::trace_span!(target: "telemetry", "create_tun_device").entered(); + let _guard = telemetry_span!("create_tun_device").entered(); self.tun_device .make_tun() diff --git a/rust/headless-client/src/main.rs b/rust/headless-client/src/main.rs index cc59bc689..68e2a238b 100644 --- a/rust/headless-client/src/main.rs +++ b/rust/headless-client/src/main.rs @@ -12,7 +12,7 @@ use firezone_bin_shared::{ use firezone_headless_client::{ device_id, signals, CallbackHandler, CliCommon, ConnlibMsg, DnsController, }; -use firezone_logging::anyhow_dyn_err; +use firezone_logging::{anyhow_dyn_err, telemetry_span}; use firezone_telemetry::Telemetry; use futures::{FutureExt as _, StreamExt as _}; use phoenix_channel::get_user_agent; @@ -207,8 +207,7 @@ fn main() -> Result<()> { let mut last_connlib_start_instant = Some(Instant::now()); rt.block_on(async { - let connect_span = - tracing::trace_span!(target: "telemetry", "connect_to_firezone").entered(); + let connect_span = telemetry_span!("connect_to_firezone").entered(); // The Headless Client will bail out here if there's no Internet, because `PhoenixChannel` will try to // resolve the portal host and fail. This is intentional behavior. The Headless Client should always be running under a manager like `systemd` or Windows' Service Controller, @@ -250,7 +249,7 @@ fn main() -> Result<()> { drop(tokio_handle); let tun = { - let _guard = tracing::trace_span!(target: "telemetry", "create_tun_device").entered(); + let _guard = telemetry_span!("create_tun_device").entered(); tun_device.make_tun()? }; diff --git a/rust/logging/src/lib.rs b/rust/logging/src/lib.rs index 4b277d7fa..66625f746 100644 --- a/rust/logging/src/lib.rs +++ b/rust/logging/src/lib.rs @@ -24,7 +24,7 @@ where let subscriber = Registry::default() .with(additional_layer.with_filter(filter(&directives))) - .with(sentry_layer().with_filter(filter(""))) // Sentry layer has its own event filtering mechanism, so we only exclude the noisy crates. + .with(sentry_layer()) .with( fmt::layer() .event_format(Format::new()) @@ -87,7 +87,7 @@ pub fn test_global(directives: &str) { /// /// This layer configuration supports a special `telemetry` event. /// Telemetry events are events logged on the `TRACE` level for the `telemetry` target. -/// They are sampled at a rate of 1%. +/// These events SHOULD be created using [`telemetry_event`] to ensure that they are sampled correctly. /// The idea here is that some events logged via `tracing` should not necessarily end up in the users log file. /// Yet, if they happen a lot, we still want to know about them. /// Coupling the `telemetry` target to the `TRACE` level pretty much prevents these events from ever showing up in log files. @@ -96,8 +96,8 @@ pub fn test_global(directives: &str) { /// ## Telemetry spans /// /// Only spans with the `telemetry` target on level `TRACE` will be submitted to Sentry. -/// They are subject to the sampling rate defined in the Sentry client configuration. -pub fn sentry_layer() -> sentry_tracing::SentryLayer +/// Similar to telemetry events, these should be created with [`telemetry_span`] to ensure they are sampled correctly. +pub fn sentry_layer() -> impl Layer + Send + Sync where S: Subscriber + for<'a> LookupSpan<'a>, { @@ -105,16 +105,47 @@ where .event_filter(move |md| match *md.level() { tracing::Level::ERROR | tracing::Level::WARN => EventFilter::Exception, tracing::Level::INFO | tracing::Level::DEBUG => EventFilter::Breadcrumb, - tracing::Level::TRACE if md.target() == "telemetry" => { - // rand::random generates floats in the range of [0, 1). - if rand::random::() < 0.01 { - EventFilter::Event - } else { - EventFilter::Ignore - } - } + tracing::Level::TRACE if md.target() == TELEMETRY_TARGET => EventFilter::Event, _ => EventFilter::Ignore, }) - .span_filter(|md| *md.level() == tracing::Level::TRACE && md.target() == "telemetry") + .span_filter(|md| *md.level() == tracing::Level::TRACE && md.target() == TELEMETRY_TARGET) .enable_span_attributes() + .with_filter(filter("trace")) // Filter out noisy crates but pass all events otherwise. +} + +#[doc(hidden)] +pub const TELEMETRY_TARGET: &str = "telemetry"; +#[doc(hidden)] +pub const TELEMETRY_SAMPLE_RATE: f32 = 0.01; + +/// Creates a `telemetry` span that will be active until dropped. +/// +/// In order to save CPU power, `telemetry` spans are sampled at a rate of 1% at creation time. +#[macro_export] +macro_rules! telemetry_span { + ($($arg:tt)*) => { + if $crate::__export::rand::random::() < $crate::TELEMETRY_SAMPLE_RATE { + $crate::__export::tracing::trace_span!(target: $crate::TELEMETRY_TARGET, $($arg)*) + } else { + $crate::__export::tracing::Span::none() + } + }; +} + +/// Creates a `telemetry` event. +/// +/// In order to save CPU power, `telemetry` events are sampled at a rate of 1% at creation time. +#[macro_export] +macro_rules! telemetry_event { + ($($arg:tt)*) => { + if $crate::__export::rand::random::() < $crate::TELEMETRY_SAMPLE_RATE { + $crate::__export::tracing::trace!(target: $crate::TELEMETRY_TARGET, $($arg)*); + } + }; +} + +#[doc(hidden)] +pub mod __export { + pub use rand; + pub use tracing; } diff --git a/rust/phoenix-channel/src/lib.rs b/rust/phoenix-channel/src/lib.rs index ce3751cb3..ea670ed20 100644 --- a/rust/phoenix-channel/src/lib.rs +++ b/rust/phoenix-channel/src/lib.rs @@ -13,7 +13,7 @@ use std::{io, mem}; use backoff::backoff::Backoff; use backoff::ExponentialBackoff; use base64::Engine; -use firezone_logging::std_dyn_err; +use firezone_logging::{std_dyn_err, telemetry_span}; use futures::future::BoxFuture; use futures::{FutureExt, SinkExt, StreamExt}; use heartbeat::{Heartbeat, MissedLastHeartbeat}; @@ -271,8 +271,7 @@ where let host_and_port = url.expose_secret().host_and_port(); - let _guard = - tracing::trace_span!(target: "telemetry", "resolve_portal_url", host = %host_and_port.0).entered(); + let _span = telemetry_span!("resolve_portal_url", host = %host_and_port.0).entered(); // Statically resolve the host in the URL to a set of addresses. // We don't use these directly because we need to connect to the domain via TLS which requires a hostname. diff --git a/rust/telemetry/src/lib.rs b/rust/telemetry/src/lib.rs index cf493685c..7f9730110 100644 --- a/rust/telemetry/src/lib.rs +++ b/rust/telemetry/src/lib.rs @@ -71,7 +71,9 @@ impl Telemetry { environment: Some(environment.into()), // We can't get the release number ourselves because we don't know if we're embedded in a GUI Client or a Headless Client. release: Some(release.into()), - traces_sample_rate: 0.1, + // We submit all spans but only send the ones with `target: telemetry`. + // Those spans are created further down and are throttled at creation time to save CPU. + traces_sample_rate: 1.0, max_breadcrumbs: 500, ..Default::default() },