mirror of
https://github.com/outbackdingo/firezone.git
synced 2026-01-27 10:18:54 +00:00
chore(rust): remove telemetry spans and events (#9634)
Originally, we introduced these to gather some data from logs / warnings that we considered to be too spammy. We've since merged a burst-protection that will at most submit the same event once every 5 minutes. The data from the telemetry spans themselves have not been used at all.
This commit is contained in:
1
rust/Cargo.lock
generated
1
rust/Cargo.lock
generated
@@ -2502,7 +2502,6 @@ dependencies = [
|
||||
"nu-ansi-term 0.50.1",
|
||||
"output_vt100",
|
||||
"parking_lot",
|
||||
"rand 0.8.5",
|
||||
"sentry-tracing",
|
||||
"supports-color",
|
||||
"tempfile",
|
||||
|
||||
@@ -13,7 +13,7 @@ use std::{io, mem};
|
||||
use backoff::ExponentialBackoff;
|
||||
use backoff::backoff::Backoff;
|
||||
use base64::Engine;
|
||||
use firezone_logging::{err_with_src, telemetry_span};
|
||||
use firezone_logging::err_with_src;
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{FutureExt, SinkExt, StreamExt};
|
||||
use itertools::Itertools as _;
|
||||
@@ -262,8 +262,6 @@ where
|
||||
) -> io::Result<Self> {
|
||||
let host_and_port = url.expose_secret().host_and_port();
|
||||
|
||||
let _span = telemetry_span!("resolve_portal_url", host = %host_and_port.0).entered();
|
||||
|
||||
// Statically resolve the host in the URL to a set of addresses.
|
||||
// We use these when connecting the socket to avoid a dependency on DNS resolution later on.
|
||||
let resolved_addresses = host_and_port
|
||||
|
||||
@@ -19,7 +19,7 @@ use anyhow::Context;
|
||||
use bimap::BiMap;
|
||||
use connlib_model::{GatewayId, PublicKey, RelayId, ResourceId, ResourceStatus, ResourceView};
|
||||
use connlib_model::{Site, SiteId};
|
||||
use firezone_logging::{err_with_src, telemetry_event, unwrap_or_debug, unwrap_or_warn};
|
||||
use firezone_logging::{err_with_src, unwrap_or_debug, unwrap_or_warn};
|
||||
use ip_network::{IpNetwork, Ipv4Network, Ipv6Network};
|
||||
use ip_network_table::IpNetworkTable;
|
||||
use ip_packet::{IpPacket, MAX_UDP_PAYLOAD};
|
||||
@@ -474,7 +474,7 @@ impl ClientState {
|
||||
}
|
||||
})
|
||||
.unwrap_or_else(|e| {
|
||||
telemetry_event!("Recursive UDP DNS query failed: {}", err_with_src(&e));
|
||||
tracing::debug!("Recursive UDP DNS query failed: {}", err_with_src(&e));
|
||||
|
||||
dns_types::Response::servfail(&response.query)
|
||||
});
|
||||
@@ -490,7 +490,7 @@ impl ClientState {
|
||||
tracing::trace!("Received recursive TCP DNS response");
|
||||
})
|
||||
.unwrap_or_else(|e| {
|
||||
telemetry_event!("Recursive TCP DNS query failed: {}", err_with_src(&e));
|
||||
tracing::debug!("Recursive TCP DNS query failed: {}", err_with_src(&e));
|
||||
|
||||
dns_types::Response::servfail(&response.query)
|
||||
});
|
||||
|
||||
@@ -5,7 +5,7 @@ use dns_types::{
|
||||
DomainName, DomainNameRef, OwnedRecordData, Query, RecordType, Response, ResponseBuilder,
|
||||
ResponseCode,
|
||||
};
|
||||
use firezone_logging::{err_with_src, telemetry_span};
|
||||
use firezone_logging::err_with_src;
|
||||
use itertools::Itertools;
|
||||
use pattern::{Candidate, Pattern};
|
||||
use std::io;
|
||||
@@ -231,8 +231,6 @@ impl StubResolver {
|
||||
///
|
||||
/// This performs a linear search and is thus O(N) and **must not** be called in the hot-path of packet routing.
|
||||
fn match_resource_linear(&self, domain: &dns_types::DomainName) -> Option<Resource> {
|
||||
let _span = telemetry_span!("match_resource_linear").entered();
|
||||
|
||||
let name = Candidate::from_domain(domain);
|
||||
|
||||
for (pattern, r) in &self.dns_resources {
|
||||
|
||||
@@ -5,7 +5,6 @@ mod udp_dns;
|
||||
|
||||
use crate::{device_channel::Device, dns, otel, sockets::Sockets};
|
||||
use anyhow::{Context as _, Result};
|
||||
use firezone_logging::{telemetry_event, telemetry_span};
|
||||
use futures::FutureExt as _;
|
||||
use futures_bounded::FuturesTupleSet;
|
||||
use gat_lending_iterator::LendingIterator;
|
||||
@@ -22,7 +21,6 @@ use std::{
|
||||
task::{Context, Poll, ready},
|
||||
time::{Duration, Instant},
|
||||
};
|
||||
use tracing::Instrument;
|
||||
use tun::Tun;
|
||||
|
||||
/// How many IP packets we will at most read from the MPSC-channel connected to our TUN device thread.
|
||||
@@ -371,8 +369,7 @@ impl Io {
|
||||
if self
|
||||
.dns_queries
|
||||
.try_push(
|
||||
udp_dns::send(self.udp_socket_factory.clone(), query.server, query.message)
|
||||
.instrument(telemetry_span!("recursive_udp_dns_query")),
|
||||
udp_dns::send(self.udp_socket_factory.clone(), query.server, query.message),
|
||||
meta,
|
||||
)
|
||||
.is_err()
|
||||
@@ -384,8 +381,7 @@ impl Io {
|
||||
if self
|
||||
.dns_queries
|
||||
.try_push(
|
||||
tcp_dns::send(self.tcp_socket_factory.clone(), query.server, query.message)
|
||||
.instrument(telemetry_span!("recursive_tcp_dns_query")),
|
||||
tcp_dns::send(self.tcp_socket_factory.clone(), query.server, query.message),
|
||||
meta,
|
||||
)
|
||||
.is_err()
|
||||
@@ -416,8 +412,6 @@ impl Io {
|
||||
fn is_max_wg_packet_size(d: &DatagramIn) -> bool {
|
||||
let len = d.packet.len();
|
||||
if len > MAX_FZ_PAYLOAD {
|
||||
telemetry_event!(from = %d.from, %len, "Dropping too large datagram (max allowed: {MAX_FZ_PAYLOAD} bytes)");
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
@@ -4,8 +4,8 @@ use boringtun::x25519::PublicKey;
|
||||
use dns_lookup::{AddrInfoHints, AddrInfoIter, LookupError};
|
||||
use dns_types::DomainName;
|
||||
use firezone_bin_shared::TunDeviceManager;
|
||||
use firezone_logging::telemetry_span;
|
||||
use firezone_telemetry::{Telemetry, analytics};
|
||||
|
||||
use firezone_tunnel::messages::gateway::{
|
||||
AllowAccess, ClientIceCandidates, ClientsIceCandidates, ConnectionReady, EgressMessages,
|
||||
IngressMessages, RejectAccess, RequestConnection,
|
||||
@@ -24,7 +24,6 @@ use std::task::{Context, Poll};
|
||||
use std::time::{Duration, Instant};
|
||||
use std::{io, mem};
|
||||
use tokio::sync::Mutex;
|
||||
use tracing::Instrument;
|
||||
|
||||
use crate::RELEASE;
|
||||
|
||||
@@ -585,7 +584,6 @@ async fn resolve(domain: DomainName) -> Result<Vec<IpAddr>> {
|
||||
let dname = domain.to_string();
|
||||
|
||||
let addresses = tokio::task::spawn_blocking(move || resolve_addresses(&dname))
|
||||
.instrument(telemetry_span!("resolve_dns_resource"))
|
||||
.await
|
||||
.context("DNS resolution task failed")?
|
||||
.context("DNS resolution failed")?;
|
||||
|
||||
@@ -13,7 +13,7 @@ use firezone_bin_shared::{
|
||||
platform::{tcp_socket_factory, udp_socket_factory},
|
||||
signals,
|
||||
};
|
||||
use firezone_logging::{FilterReloadHandle, err_with_src, telemetry_span};
|
||||
use firezone_logging::{FilterReloadHandle, err_with_src};
|
||||
use firezone_telemetry::{Telemetry, analytics};
|
||||
use futures::{
|
||||
Future as _, SinkExt as _, Stream, StreamExt,
|
||||
@@ -577,8 +577,6 @@ impl<'a> Handler<'a> {
|
||||
fn try_connect(&mut self, api_url: &str, token: SecretString) -> Result<Session> {
|
||||
let started_at = Instant::now();
|
||||
|
||||
let _connect_span = telemetry_span!("connect_to_firezone").entered();
|
||||
|
||||
assert!(self.session.is_none());
|
||||
let device_id = device_id::get_or_create().context("Failed to get-or-create device ID")?;
|
||||
|
||||
@@ -624,13 +622,10 @@ impl<'a> Handler<'a> {
|
||||
tracing::debug!(?dns, "Calling `set_dns`...");
|
||||
connlib.set_dns(dns);
|
||||
|
||||
let tun = {
|
||||
let _guard = telemetry_span!("create_tun_device").entered();
|
||||
|
||||
self.tun_device
|
||||
.make_tun()
|
||||
.context("Failed to create TUN device")?
|
||||
};
|
||||
let tun = self
|
||||
.tun_device
|
||||
.make_tun()
|
||||
.context("Failed to create TUN device")?;
|
||||
connlib.set_tun(tun);
|
||||
|
||||
Ok(Session::Creating {
|
||||
|
||||
@@ -11,7 +11,6 @@ use firezone_bin_shared::{
|
||||
platform::{tcp_socket_factory, udp_socket_factory},
|
||||
signals,
|
||||
};
|
||||
use firezone_logging::telemetry_span;
|
||||
use firezone_telemetry::{Telemetry, analytics, otel};
|
||||
use opentelemetry_sdk::metrics::{PeriodicReader, SdkMeterProvider};
|
||||
use phoenix_channel::PhoenixChannel;
|
||||
@@ -245,8 +244,6 @@ fn main() -> Result<()> {
|
||||
opentelemetry::global::set_meter_provider(provider);
|
||||
}
|
||||
|
||||
let connect_span = telemetry_span!("connect_to_firezone").entered();
|
||||
|
||||
// The Headless Client will bail out here if there's no Internet, because `PhoenixChannel` will try to
|
||||
// resolve the portal host and fail. This is intentional behavior. The Headless Client should always be running under a manager like `systemd` or Windows' Service Controller,
|
||||
// so when it fails it will be restarted with backoff. `systemd` can additionally make us wait
|
||||
@@ -288,16 +285,10 @@ fn main() -> Result<()> {
|
||||
|
||||
let mut telemetry_refresh = tokio::time::interval(Duration::from_secs(60));
|
||||
|
||||
let tun = {
|
||||
let _guard = telemetry_span!("create_tun_device").entered();
|
||||
|
||||
tun_device.make_tun()?
|
||||
};
|
||||
let tun = tun_device.make_tun()?;
|
||||
session.set_tun(tun);
|
||||
session.set_dns(dns_controller.system_resolvers());
|
||||
|
||||
drop(connect_span);
|
||||
|
||||
let result = loop {
|
||||
let event = tokio::select! {
|
||||
() = terminate.recv() => {
|
||||
|
||||
@@ -12,7 +12,6 @@ anyhow = { workspace = true }
|
||||
nu-ansi-term = { workspace = true }
|
||||
output_vt100 = { workspace = true }
|
||||
parking_lot = { workspace = true }
|
||||
rand = { workspace = true }
|
||||
sentry-tracing = { workspace = true }
|
||||
supports-color = { workspace = true }
|
||||
time = { workspace = true, features = ["formatting"] }
|
||||
|
||||
@@ -198,21 +198,6 @@ pub fn test_global(directives: &str) {
|
||||
///
|
||||
/// - error and warn events are reported as sentry exceptions
|
||||
/// - info and debug events are captured as breadcrumbs (and submitted together with warns & errors)
|
||||
///
|
||||
/// ## Telemetry events
|
||||
///
|
||||
/// This layer configuration supports a special `telemetry` event.
|
||||
/// Telemetry events are events logged on the `TRACE` level for the `telemetry` target.
|
||||
/// These events SHOULD be created using [`telemetry_event`] to ensure that they are sampled correctly.
|
||||
/// The idea here is that some events logged via `tracing` should not necessarily end up in the users log file.
|
||||
/// Yet, if they happen a lot, we still want to know about them.
|
||||
/// Coupling the `telemetry` target to the `TRACE` level pretty much prevents these events from ever showing up in log files.
|
||||
/// By sampling them, we prevent flooding Sentry with lots of these logs.
|
||||
///
|
||||
/// ## Telemetry spans
|
||||
///
|
||||
/// Only spans with the `telemetry` target on level `TRACE` will be submitted to Sentry.
|
||||
/// Similar to telemetry events, these should be created with [`telemetry_span`] to ensure they are sampled correctly.
|
||||
pub fn sentry_layer<S>() -> impl Layer<S> + Send + Sync
|
||||
where
|
||||
S: Subscriber + for<'a> LookupSpan<'a>,
|
||||
@@ -223,7 +208,6 @@ where
|
||||
.event_filter(move |md| match *md.level() {
|
||||
Level::ERROR | Level::WARN => EventFilter::Event | EventFilter::Breadcrumb | EventFilter::Log,
|
||||
Level::INFO | Level::DEBUG => EventFilter::Breadcrumb | EventFilter::Log,
|
||||
Level::TRACE if md.target() == TELEMETRY_TARGET => EventFilter::Event,
|
||||
_ => EventFilter::Ignore,
|
||||
})
|
||||
.span_filter(|md| {
|
||||
@@ -264,43 +248,3 @@ where
|
||||
],
|
||||
).not())
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub const TELEMETRY_TARGET: &str = "telemetry";
|
||||
#[doc(hidden)]
|
||||
pub const TELEMETRY_SAMPLE_RATE: f32 = 0.01;
|
||||
|
||||
/// Creates a `telemetry` span that will be active until dropped.
|
||||
///
|
||||
/// In order to save CPU power, `telemetry` spans are sampled at a rate of 1% at creation time.
|
||||
#[macro_export]
|
||||
macro_rules! telemetry_span {
|
||||
($($arg:tt)*) => {
|
||||
if $crate::__export::rand::random::<f32>() < $crate::TELEMETRY_SAMPLE_RATE {
|
||||
$crate::__export::tracing::trace_span!(target: $crate::TELEMETRY_TARGET, $($arg)*)
|
||||
} else {
|
||||
$crate::__export::tracing::Span::none()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/// Creates a `telemetry` event.
|
||||
///
|
||||
/// In order to save CPU power, `telemetry` events are sampled at a rate of 1% at creation time.
|
||||
/// In addition, all telemetry events are logged at the `DEBUG` level.
|
||||
#[macro_export]
|
||||
macro_rules! telemetry_event {
|
||||
($($arg:tt)*) => {
|
||||
if $crate::__export::rand::random::<f32>() < $crate::TELEMETRY_SAMPLE_RATE {
|
||||
$crate::__export::tracing::trace!(target: $crate::TELEMETRY_TARGET, $($arg)*);
|
||||
}
|
||||
|
||||
$crate::__export::tracing::debug!($($arg)*);
|
||||
};
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub mod __export {
|
||||
pub use rand;
|
||||
pub use tracing;
|
||||
}
|
||||
|
||||
@@ -156,11 +156,6 @@ impl Telemetry {
|
||||
environment: Some(Cow::Borrowed(environment.as_str())),
|
||||
// We can't get the release number ourselves because we don't know if we're embedded in a GUI Client or a Headless Client.
|
||||
release: Some(release.to_owned().into()),
|
||||
traces_sampler: Some(Arc::new(|tx| {
|
||||
// Only submit `telemetry` spans to Sentry.
|
||||
// Those get sampled at creation time (to save CPU power) so we want to submit all of them.
|
||||
if tx.name() == "telemetry" { 1.0 } else { 0.0 }
|
||||
})),
|
||||
max_breadcrumbs: 500,
|
||||
before_send: Some(event_rate_limiter(Duration::from_secs(60 * 5))),
|
||||
enable_logs,
|
||||
|
||||
Reference in New Issue
Block a user