chore(rust): remove telemetry spans and events (#9634)

Originally, we introduced these to gather some data from logs / warnings
that we considered to be too spammy. We've since merged a
burst-protection mechanism that submits the same event at most once
every 5 minutes.

The data from the telemetry spans themselves has not been used at all.
This commit is contained in:
Thomas Eizinger
2025-06-25 19:15:57 +02:00
committed by GitHub
parent 6972d4d62a
commit d5be185ae4
11 changed files with 14 additions and 103 deletions

1
rust/Cargo.lock generated
View File

@@ -2502,7 +2502,6 @@ dependencies = [
"nu-ansi-term 0.50.1",
"output_vt100",
"parking_lot",
"rand 0.8.5",
"sentry-tracing",
"supports-color",
"tempfile",

View File

@@ -13,7 +13,7 @@ use std::{io, mem};
use backoff::ExponentialBackoff;
use backoff::backoff::Backoff;
use base64::Engine;
use firezone_logging::{err_with_src, telemetry_span};
use firezone_logging::err_with_src;
use futures::future::BoxFuture;
use futures::{FutureExt, SinkExt, StreamExt};
use itertools::Itertools as _;
@@ -262,8 +262,6 @@ where
) -> io::Result<Self> {
let host_and_port = url.expose_secret().host_and_port();
let _span = telemetry_span!("resolve_portal_url", host = %host_and_port.0).entered();
// Statically resolve the host in the URL to a set of addresses.
// We use these when connecting the socket to avoid a dependency on DNS resolution later on.
let resolved_addresses = host_and_port

View File

@@ -19,7 +19,7 @@ use anyhow::Context;
use bimap::BiMap;
use connlib_model::{GatewayId, PublicKey, RelayId, ResourceId, ResourceStatus, ResourceView};
use connlib_model::{Site, SiteId};
use firezone_logging::{err_with_src, telemetry_event, unwrap_or_debug, unwrap_or_warn};
use firezone_logging::{err_with_src, unwrap_or_debug, unwrap_or_warn};
use ip_network::{IpNetwork, Ipv4Network, Ipv6Network};
use ip_network_table::IpNetworkTable;
use ip_packet::{IpPacket, MAX_UDP_PAYLOAD};
@@ -474,7 +474,7 @@ impl ClientState {
}
})
.unwrap_or_else(|e| {
telemetry_event!("Recursive UDP DNS query failed: {}", err_with_src(&e));
tracing::debug!("Recursive UDP DNS query failed: {}", err_with_src(&e));
dns_types::Response::servfail(&response.query)
});
@@ -490,7 +490,7 @@ impl ClientState {
tracing::trace!("Received recursive TCP DNS response");
})
.unwrap_or_else(|e| {
telemetry_event!("Recursive TCP DNS query failed: {}", err_with_src(&e));
tracing::debug!("Recursive TCP DNS query failed: {}", err_with_src(&e));
dns_types::Response::servfail(&response.query)
});

View File

@@ -5,7 +5,7 @@ use dns_types::{
DomainName, DomainNameRef, OwnedRecordData, Query, RecordType, Response, ResponseBuilder,
ResponseCode,
};
use firezone_logging::{err_with_src, telemetry_span};
use firezone_logging::err_with_src;
use itertools::Itertools;
use pattern::{Candidate, Pattern};
use std::io;
@@ -231,8 +231,6 @@ impl StubResolver {
///
/// This performs a linear search and is thus O(N) and **must not** be called in the hot-path of packet routing.
fn match_resource_linear(&self, domain: &dns_types::DomainName) -> Option<Resource> {
let _span = telemetry_span!("match_resource_linear").entered();
let name = Candidate::from_domain(domain);
for (pattern, r) in &self.dns_resources {

View File

@@ -5,7 +5,6 @@ mod udp_dns;
use crate::{device_channel::Device, dns, otel, sockets::Sockets};
use anyhow::{Context as _, Result};
use firezone_logging::{telemetry_event, telemetry_span};
use futures::FutureExt as _;
use futures_bounded::FuturesTupleSet;
use gat_lending_iterator::LendingIterator;
@@ -22,7 +21,6 @@ use std::{
task::{Context, Poll, ready},
time::{Duration, Instant},
};
use tracing::Instrument;
use tun::Tun;
/// How many IP packets we will at most read from the MPSC-channel connected to our TUN device thread.
@@ -371,8 +369,7 @@ impl Io {
if self
.dns_queries
.try_push(
udp_dns::send(self.udp_socket_factory.clone(), query.server, query.message)
.instrument(telemetry_span!("recursive_udp_dns_query")),
udp_dns::send(self.udp_socket_factory.clone(), query.server, query.message),
meta,
)
.is_err()
@@ -384,8 +381,7 @@ impl Io {
if self
.dns_queries
.try_push(
tcp_dns::send(self.tcp_socket_factory.clone(), query.server, query.message)
.instrument(telemetry_span!("recursive_tcp_dns_query")),
tcp_dns::send(self.tcp_socket_factory.clone(), query.server, query.message),
meta,
)
.is_err()
@@ -416,8 +412,6 @@ impl Io {
fn is_max_wg_packet_size(d: &DatagramIn) -> bool {
let len = d.packet.len();
if len > MAX_FZ_PAYLOAD {
telemetry_event!(from = %d.from, %len, "Dropping too large datagram (max allowed: {MAX_FZ_PAYLOAD} bytes)");
return false;
}

View File

@@ -4,8 +4,8 @@ use boringtun::x25519::PublicKey;
use dns_lookup::{AddrInfoHints, AddrInfoIter, LookupError};
use dns_types::DomainName;
use firezone_bin_shared::TunDeviceManager;
use firezone_logging::telemetry_span;
use firezone_telemetry::{Telemetry, analytics};
use firezone_tunnel::messages::gateway::{
AllowAccess, ClientIceCandidates, ClientsIceCandidates, ConnectionReady, EgressMessages,
IngressMessages, RejectAccess, RequestConnection,
@@ -24,7 +24,6 @@ use std::task::{Context, Poll};
use std::time::{Duration, Instant};
use std::{io, mem};
use tokio::sync::Mutex;
use tracing::Instrument;
use crate::RELEASE;
@@ -585,7 +584,6 @@ async fn resolve(domain: DomainName) -> Result<Vec<IpAddr>> {
let dname = domain.to_string();
let addresses = tokio::task::spawn_blocking(move || resolve_addresses(&dname))
.instrument(telemetry_span!("resolve_dns_resource"))
.await
.context("DNS resolution task failed")?
.context("DNS resolution failed")?;

View File

@@ -13,7 +13,7 @@ use firezone_bin_shared::{
platform::{tcp_socket_factory, udp_socket_factory},
signals,
};
use firezone_logging::{FilterReloadHandle, err_with_src, telemetry_span};
use firezone_logging::{FilterReloadHandle, err_with_src};
use firezone_telemetry::{Telemetry, analytics};
use futures::{
Future as _, SinkExt as _, Stream, StreamExt,
@@ -577,8 +577,6 @@ impl<'a> Handler<'a> {
fn try_connect(&mut self, api_url: &str, token: SecretString) -> Result<Session> {
let started_at = Instant::now();
let _connect_span = telemetry_span!("connect_to_firezone").entered();
assert!(self.session.is_none());
let device_id = device_id::get_or_create().context("Failed to get-or-create device ID")?;
@@ -624,13 +622,10 @@ impl<'a> Handler<'a> {
tracing::debug!(?dns, "Calling `set_dns`...");
connlib.set_dns(dns);
let tun = {
let _guard = telemetry_span!("create_tun_device").entered();
self.tun_device
.make_tun()
.context("Failed to create TUN device")?
};
let tun = self
.tun_device
.make_tun()
.context("Failed to create TUN device")?;
connlib.set_tun(tun);
Ok(Session::Creating {

View File

@@ -11,7 +11,6 @@ use firezone_bin_shared::{
platform::{tcp_socket_factory, udp_socket_factory},
signals,
};
use firezone_logging::telemetry_span;
use firezone_telemetry::{Telemetry, analytics, otel};
use opentelemetry_sdk::metrics::{PeriodicReader, SdkMeterProvider};
use phoenix_channel::PhoenixChannel;
@@ -245,8 +244,6 @@ fn main() -> Result<()> {
opentelemetry::global::set_meter_provider(provider);
}
let connect_span = telemetry_span!("connect_to_firezone").entered();
// The Headless Client will bail out here if there's no Internet, because `PhoenixChannel` will try to
// resolve the portal host and fail. This is intentional behavior. The Headless Client should always be running under a manager like `systemd` or Windows' Service Controller,
// so when it fails it will be restarted with backoff. `systemd` can additionally make us wait
@@ -288,16 +285,10 @@ fn main() -> Result<()> {
let mut telemetry_refresh = tokio::time::interval(Duration::from_secs(60));
let tun = {
let _guard = telemetry_span!("create_tun_device").entered();
tun_device.make_tun()?
};
let tun = tun_device.make_tun()?;
session.set_tun(tun);
session.set_dns(dns_controller.system_resolvers());
drop(connect_span);
let result = loop {
let event = tokio::select! {
() = terminate.recv() => {

View File

@@ -12,7 +12,6 @@ anyhow = { workspace = true }
nu-ansi-term = { workspace = true }
output_vt100 = { workspace = true }
parking_lot = { workspace = true }
rand = { workspace = true }
sentry-tracing = { workspace = true }
supports-color = { workspace = true }
time = { workspace = true, features = ["formatting"] }

View File

@@ -198,21 +198,6 @@ pub fn test_global(directives: &str) {
///
/// - error and warn events are reported as sentry exceptions
/// - info and debug events are captured as breadcrumbs (and submitted together with warns & errors)
///
/// ## Telemetry events
///
/// This layer configuration supports a special `telemetry` event.
/// Telemetry events are events logged on the `TRACE` level for the `telemetry` target.
/// These events SHOULD be created using [`telemetry_event`] to ensure that they are sampled correctly.
/// The idea here is that some events logged via `tracing` should not necessarily end up in the users log file.
/// Yet, if they happen a lot, we still want to know about them.
/// Coupling the `telemetry` target to the `TRACE` level pretty much prevents these events from ever showing up in log files.
/// By sampling them, we prevent flooding Sentry with lots of these logs.
///
/// ## Telemetry spans
///
/// Only spans with the `telemetry` target on level `TRACE` will be submitted to Sentry.
/// Similar to telemetry events, these should be created with [`telemetry_span`] to ensure they are sampled correctly.
pub fn sentry_layer<S>() -> impl Layer<S> + Send + Sync
where
S: Subscriber + for<'a> LookupSpan<'a>,
@@ -223,7 +208,6 @@ where
.event_filter(move |md| match *md.level() {
Level::ERROR | Level::WARN => EventFilter::Event | EventFilter::Breadcrumb | EventFilter::Log,
Level::INFO | Level::DEBUG => EventFilter::Breadcrumb | EventFilter::Log,
Level::TRACE if md.target() == TELEMETRY_TARGET => EventFilter::Event,
_ => EventFilter::Ignore,
})
.span_filter(|md| {
@@ -264,43 +248,3 @@ where
],
).not())
}
#[doc(hidden)]
pub const TELEMETRY_TARGET: &str = "telemetry";
#[doc(hidden)]
pub const TELEMETRY_SAMPLE_RATE: f32 = 0.01;
/// Creates a `telemetry` span that will be active until dropped.
///
/// In order to save CPU power, `telemetry` spans are sampled at a rate of 1% at creation time.
#[macro_export]
macro_rules! telemetry_span {
($($arg:tt)*) => {
if $crate::__export::rand::random::<f32>() < $crate::TELEMETRY_SAMPLE_RATE {
$crate::__export::tracing::trace_span!(target: $crate::TELEMETRY_TARGET, $($arg)*)
} else {
$crate::__export::tracing::Span::none()
}
};
}
/// Creates a `telemetry` event.
///
/// In order to save CPU power, `telemetry` events are sampled at a rate of 1% at creation time.
/// In addition, all telemetry events are logged at the `DEBUG` level.
#[macro_export]
macro_rules! telemetry_event {
($($arg:tt)*) => {
if $crate::__export::rand::random::<f32>() < $crate::TELEMETRY_SAMPLE_RATE {
$crate::__export::tracing::trace!(target: $crate::TELEMETRY_TARGET, $($arg)*);
}
$crate::__export::tracing::debug!($($arg)*);
};
}
#[doc(hidden)]
pub mod __export {
pub use rand;
pub use tracing;
}

View File

@@ -156,11 +156,6 @@ impl Telemetry {
environment: Some(Cow::Borrowed(environment.as_str())),
// We can't get the release number ourselves because we don't know if we're embedded in a GUI Client or a Headless Client.
release: Some(release.to_owned().into()),
traces_sampler: Some(Arc::new(|tx| {
// Only submit `telemetry` spans to Sentry.
// Those get sampled at creation time (to save CPU power) so we want to submit all of them.
if tx.name() == "telemetry" { 1.0 } else { 0.0 }
})),
max_breadcrumbs: 500,
before_send: Some(event_rate_limiter(Duration::from_secs(60 * 5))),
enable_logs,