mirror of
https://github.com/outbackdingo/firezone.git
synced 2026-01-27 10:18:54 +00:00
chore(telemetry): improve telemetry spans and events (#7206)
DNS resolution is a critical part of `connlib`. If it is slow for whatever reason, users will notice this. To make sure we notice as well, we add `telemetry` spans to the client's and gateway's DNS resolution. For the client, this applies to all DNS queries that we forward to the upstream servers. For the gateway, this applies to all DNS resources. In addition to those IO operations, we also instrument the `match_resource_linear` function. This function operates in `O(n)` of all defined DNS resources. It _should_ be fast enough to not create an impact but it can't hurt to measure this regardless. Lastly, we also instrument `refresh_translations` on the gateway. Refreshing the DNS resolution of a DNS resource should really only happen, when the previous IP addresses become stale yet the user is still trying to send traffic to them. We don't actually have any data on how often that happens. By instrumenting it, we can gather some of this data. To make sure that none of these telemetry events and spans hurt the end-user performance, we introduce macros to `firezone-logging` that sample the creation of these events and spans at a rate of 1%. I ran a flamegraph and none of these even showed up. The most critical one here is probably the `match_resource_linear` span because it happens on every DNS query. Resolves: #7198. --------- Signed-off-by: Thomas Eizinger <thomas@eizinger.io>
This commit is contained in:
@@ -14,7 +14,9 @@ use bimap::BiMap;
|
||||
use connlib_model::PublicKey;
|
||||
use connlib_model::{GatewayId, RelayId, ResourceId, ResourceStatus, ResourceView};
|
||||
use connlib_model::{Site, SiteId};
|
||||
use firezone_logging::{anyhow_dyn_err, std_dyn_err, unwrap_or_debug, unwrap_or_warn};
|
||||
use firezone_logging::{
|
||||
anyhow_dyn_err, std_dyn_err, telemetry_event, unwrap_or_debug, unwrap_or_warn,
|
||||
};
|
||||
use ip_network::{IpNetwork, Ipv4Network, Ipv6Network};
|
||||
use ip_network_table::IpNetworkTable;
|
||||
use ip_packet::{IpPacket, UdpSlice, MAX_DATAGRAM_PAYLOAD};
|
||||
@@ -381,7 +383,7 @@ impl ClientState {
|
||||
.unwrap_or_else(|e| {
|
||||
let error = std_dyn_err(&e);
|
||||
tracing::debug!(error, "Recursive UDP DNS query failed");
|
||||
tracing::trace!(target: "telemetry", error, "Recursive UDP DNS query failed");
|
||||
telemetry_event!(error, "Recursive UDP DNS query failed");
|
||||
|
||||
dns::servfail(response.query.for_slice_ref())
|
||||
});
|
||||
@@ -399,7 +401,7 @@ impl ClientState {
|
||||
.unwrap_or_else(|e| {
|
||||
let error = std_dyn_err(&e);
|
||||
tracing::debug!(error, "Recursive TCP DNS query failed");
|
||||
tracing::trace!(target: "telemetry", error, "Recursive TCP DNS query failed");
|
||||
telemetry_event!(error, "Recursive TCP DNS query failed");
|
||||
|
||||
dns::servfail(response.query.for_slice_ref())
|
||||
});
|
||||
|
||||
@@ -10,7 +10,7 @@ use domain::{
|
||||
},
|
||||
dep::octseq::OctetsInto,
|
||||
};
|
||||
use firezone_logging::{anyhow_dyn_err, std_dyn_err};
|
||||
use firezone_logging::{anyhow_dyn_err, std_dyn_err, telemetry_span};
|
||||
use itertools::Itertools;
|
||||
use pattern::{Candidate, Pattern};
|
||||
use std::io;
|
||||
@@ -233,6 +233,8 @@ impl StubResolver {
|
||||
///
|
||||
/// This performs a linear search and is thus O(N) and **must not** be called in the hot-path of packet routing.
|
||||
fn match_resource_linear(&self, domain: &DomainName) -> Option<ResourceId> {
|
||||
let _span = telemetry_span!("match_resource_linear").entered();
|
||||
|
||||
let name = Candidate::from_domain(domain);
|
||||
|
||||
for (pattern, id) in &self.dns_resources {
|
||||
|
||||
@@ -8,7 +8,7 @@ use anyhow::Context;
|
||||
use boringtun::x25519::PublicKey;
|
||||
use chrono::{DateTime, Utc};
|
||||
use connlib_model::{ClientId, DomainName, RelayId, ResourceId};
|
||||
use firezone_logging::{anyhow_dyn_err, std_dyn_err};
|
||||
use firezone_logging::{anyhow_dyn_err, std_dyn_err, telemetry_span};
|
||||
use ip_network::{Ipv4Network, Ipv6Network};
|
||||
use ip_packet::{FzP2pControlSlice, IpPacket};
|
||||
use secrecy::{ExposeSecret as _, Secret};
|
||||
@@ -259,6 +259,8 @@ impl GatewayState {
|
||||
resolved_ips: Vec<IpAddr>,
|
||||
now: Instant,
|
||||
) {
|
||||
let _span = telemetry_span!("refresh_translation").entered();
|
||||
|
||||
let Some(peer) = self.peers.get_mut(&client) else {
|
||||
return;
|
||||
};
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::{device_channel::Device, dns, sockets::Sockets};
|
||||
use domain::base::Message;
|
||||
use firezone_logging::std_dyn_err;
|
||||
use firezone_logging::{std_dyn_err, telemetry_span};
|
||||
use futures::{
|
||||
future::{self, Either},
|
||||
stream, Stream, StreamExt,
|
||||
@@ -22,6 +22,7 @@ use tokio::{
|
||||
io::{AsyncReadExt, AsyncWriteExt},
|
||||
sync::mpsc,
|
||||
};
|
||||
use tracing::Instrument;
|
||||
use tun::Tun;
|
||||
|
||||
/// Bundles together all side-effects that connlib needs to have access to.
|
||||
@@ -252,7 +253,8 @@ impl Io {
|
||||
.map_err(|_| io::Error::other("Failed to parse DNS message"))?;
|
||||
|
||||
Ok(message)
|
||||
},
|
||||
}
|
||||
.instrument(telemetry_span!("recursive_udp_dns_query")),
|
||||
meta,
|
||||
)
|
||||
.is_err()
|
||||
@@ -294,7 +296,8 @@ impl Io {
|
||||
.map_err(|_| io::Error::other("Failed to parse DNS message"))?;
|
||||
|
||||
Ok(message)
|
||||
},
|
||||
}
|
||||
.instrument(telemetry_span!("recursive_tcp_dns_query")),
|
||||
meta,
|
||||
)
|
||||
.is_err()
|
||||
|
||||
@@ -4,7 +4,7 @@ use connlib_model::DomainName;
|
||||
use connlib_model::{ClientId, ResourceId};
|
||||
#[cfg(not(target_os = "windows"))]
|
||||
use dns_lookup::{AddrInfoHints, AddrInfoIter, LookupError};
|
||||
use firezone_logging::{anyhow_dyn_err, std_dyn_err};
|
||||
use firezone_logging::{anyhow_dyn_err, std_dyn_err, telemetry_span};
|
||||
use firezone_tunnel::messages::gateway::{
|
||||
AllowAccess, ClientIceCandidates, ClientsIceCandidates, ConnectionReady, EgressMessages,
|
||||
IngressMessages, RejectAccess, RequestConnection,
|
||||
@@ -20,6 +20,7 @@ use std::io;
|
||||
use std::net::IpAddr;
|
||||
use std::task::{Context, Poll};
|
||||
use std::time::{Duration, Instant};
|
||||
use tracing::Instrument;
|
||||
|
||||
pub const PHOENIX_TOPIC: &str = "gateway";
|
||||
|
||||
@@ -451,7 +452,10 @@ async fn resolve(domain: Option<DomainName>) -> Vec<IpAddr> {
|
||||
|
||||
let dname = domain.to_string();
|
||||
|
||||
match tokio::task::spawn_blocking(move || resolve_addresses(&dname)).await {
|
||||
match tokio::task::spawn_blocking(move || resolve_addresses(&dname))
|
||||
.instrument(telemetry_span!("resolve_dns_resource"))
|
||||
.await
|
||||
{
|
||||
Ok(Ok(addresses)) => addresses,
|
||||
Ok(Err(e)) => {
|
||||
tracing::warn!(error = std_dyn_err(&e), %domain, "DNS resolution failed");
|
||||
|
||||
@@ -9,7 +9,7 @@ use firezone_bin_shared::{
|
||||
platform::{tcp_socket_factory, udp_socket_factory, DnsControlMethod},
|
||||
TunDeviceManager, TOKEN_ENV_KEY,
|
||||
};
|
||||
use firezone_logging::{anyhow_dyn_err, std_dyn_err};
|
||||
use firezone_logging::{anyhow_dyn_err, std_dyn_err, telemetry_span};
|
||||
use firezone_telemetry::Telemetry;
|
||||
use futures::{
|
||||
future::poll_fn,
|
||||
@@ -558,8 +558,7 @@ impl<'a> Handler<'a> {
|
||||
///
|
||||
/// Throws matchable errors for bad URLs, unable to reach the portal, or unable to create the tunnel device
|
||||
fn connect_to_firezone(&mut self, api_url: &str, token: SecretString) -> Result<(), Error> {
|
||||
let _connect_span =
|
||||
tracing::trace_span!(target: "telemetry", "connect_to_firezone").entered();
|
||||
let _connect_span = telemetry_span!("connect_to_firezone").entered();
|
||||
|
||||
assert!(self.session.is_none());
|
||||
let device_id = device_id::get_or_create().map_err(|e| Error::DeviceId(e.to_string()))?;
|
||||
@@ -606,7 +605,7 @@ impl<'a> Handler<'a> {
|
||||
connlib.set_dns(dns);
|
||||
|
||||
let tun = {
|
||||
let _guard = tracing::trace_span!(target: "telemetry", "create_tun_device").entered();
|
||||
let _guard = telemetry_span!("create_tun_device").entered();
|
||||
|
||||
self.tun_device
|
||||
.make_tun()
|
||||
|
||||
@@ -12,7 +12,7 @@ use firezone_bin_shared::{
|
||||
use firezone_headless_client::{
|
||||
device_id, signals, CallbackHandler, CliCommon, ConnlibMsg, DnsController,
|
||||
};
|
||||
use firezone_logging::anyhow_dyn_err;
|
||||
use firezone_logging::{anyhow_dyn_err, telemetry_span};
|
||||
use firezone_telemetry::Telemetry;
|
||||
use futures::{FutureExt as _, StreamExt as _};
|
||||
use phoenix_channel::get_user_agent;
|
||||
@@ -207,8 +207,7 @@ fn main() -> Result<()> {
|
||||
let mut last_connlib_start_instant = Some(Instant::now());
|
||||
|
||||
rt.block_on(async {
|
||||
let connect_span =
|
||||
tracing::trace_span!(target: "telemetry", "connect_to_firezone").entered();
|
||||
let connect_span = telemetry_span!("connect_to_firezone").entered();
|
||||
|
||||
// The Headless Client will bail out here if there's no Internet, because `PhoenixChannel` will try to
|
||||
// resolve the portal host and fail. This is intentional behavior. The Headless Client should always be running under a manager like `systemd` or Windows' Service Controller,
|
||||
@@ -250,7 +249,7 @@ fn main() -> Result<()> {
|
||||
drop(tokio_handle);
|
||||
|
||||
let tun = {
|
||||
let _guard = tracing::trace_span!(target: "telemetry", "create_tun_device").entered();
|
||||
let _guard = telemetry_span!("create_tun_device").entered();
|
||||
|
||||
tun_device.make_tun()?
|
||||
};
|
||||
|
||||
@@ -24,7 +24,7 @@ where
|
||||
|
||||
let subscriber = Registry::default()
|
||||
.with(additional_layer.with_filter(filter(&directives)))
|
||||
.with(sentry_layer().with_filter(filter(""))) // Sentry layer has its own event filtering mechanism, so we only exclude the noisy crates.
|
||||
.with(sentry_layer())
|
||||
.with(
|
||||
fmt::layer()
|
||||
.event_format(Format::new())
|
||||
@@ -87,7 +87,7 @@ pub fn test_global(directives: &str) {
|
||||
///
|
||||
/// This layer configuration supports a special `telemetry` event.
|
||||
/// Telemetry events are events logged on the `TRACE` level for the `telemetry` target.
|
||||
/// They are sampled at a rate of 1%.
|
||||
/// These events SHOULD be created using [`telemetry_event`] to ensure that they are sampled correctly.
|
||||
/// The idea here is that some events logged via `tracing` should not necessarily end up in the users log file.
|
||||
/// Yet, if they happen a lot, we still want to know about them.
|
||||
/// Coupling the `telemetry` target to the `TRACE` level pretty much prevents these events from ever showing up in log files.
|
||||
@@ -96,8 +96,8 @@ pub fn test_global(directives: &str) {
|
||||
/// ## Telemetry spans
|
||||
///
|
||||
/// Only spans with the `telemetry` target on level `TRACE` will be submitted to Sentry.
|
||||
/// They are subject to the sampling rate defined in the Sentry client configuration.
|
||||
pub fn sentry_layer<S>() -> sentry_tracing::SentryLayer<S>
|
||||
/// Similar to telemetry events, these should be created with [`telemetry_span`] to ensure they are sampled correctly.
|
||||
pub fn sentry_layer<S>() -> impl Layer<S> + Send + Sync
|
||||
where
|
||||
S: Subscriber + for<'a> LookupSpan<'a>,
|
||||
{
|
||||
@@ -105,16 +105,47 @@ where
|
||||
.event_filter(move |md| match *md.level() {
|
||||
tracing::Level::ERROR | tracing::Level::WARN => EventFilter::Exception,
|
||||
tracing::Level::INFO | tracing::Level::DEBUG => EventFilter::Breadcrumb,
|
||||
tracing::Level::TRACE if md.target() == "telemetry" => {
|
||||
// rand::random generates floats in the range of [0, 1).
|
||||
if rand::random::<f32>() < 0.01 {
|
||||
EventFilter::Event
|
||||
} else {
|
||||
EventFilter::Ignore
|
||||
}
|
||||
}
|
||||
tracing::Level::TRACE if md.target() == TELEMETRY_TARGET => EventFilter::Event,
|
||||
_ => EventFilter::Ignore,
|
||||
})
|
||||
.span_filter(|md| *md.level() == tracing::Level::TRACE && md.target() == "telemetry")
|
||||
.span_filter(|md| *md.level() == tracing::Level::TRACE && md.target() == TELEMETRY_TARGET)
|
||||
.enable_span_attributes()
|
||||
.with_filter(filter("trace")) // Filter out noisy crates but pass all events otherwise.
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub const TELEMETRY_TARGET: &str = "telemetry";
|
||||
#[doc(hidden)]
|
||||
pub const TELEMETRY_SAMPLE_RATE: f32 = 0.01;
|
||||
|
||||
/// Creates a `telemetry` span that will be active until dropped.
|
||||
///
|
||||
/// In order to save CPU power, `telemetry` spans are sampled at a rate of 1% at creation time.
|
||||
#[macro_export]
|
||||
macro_rules! telemetry_span {
|
||||
($($arg:tt)*) => {
|
||||
if $crate::__export::rand::random::<f32>() < $crate::TELEMETRY_SAMPLE_RATE {
|
||||
$crate::__export::tracing::trace_span!(target: $crate::TELEMETRY_TARGET, $($arg)*)
|
||||
} else {
|
||||
$crate::__export::tracing::Span::none()
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
/// Creates a `telemetry` event.
|
||||
///
|
||||
/// In order to save CPU power, `telemetry` events are sampled at a rate of 1% at creation time.
|
||||
#[macro_export]
|
||||
macro_rules! telemetry_event {
|
||||
($($arg:tt)*) => {
|
||||
if $crate::__export::rand::random::<f32>() < $crate::TELEMETRY_SAMPLE_RATE {
|
||||
$crate::__export::tracing::trace!(target: $crate::TELEMETRY_TARGET, $($arg)*);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub mod __export {
|
||||
pub use rand;
|
||||
pub use tracing;
|
||||
}
|
||||
|
||||
@@ -13,7 +13,7 @@ use std::{io, mem};
|
||||
use backoff::backoff::Backoff;
|
||||
use backoff::ExponentialBackoff;
|
||||
use base64::Engine;
|
||||
use firezone_logging::std_dyn_err;
|
||||
use firezone_logging::{std_dyn_err, telemetry_span};
|
||||
use futures::future::BoxFuture;
|
||||
use futures::{FutureExt, SinkExt, StreamExt};
|
||||
use heartbeat::{Heartbeat, MissedLastHeartbeat};
|
||||
@@ -271,8 +271,7 @@ where
|
||||
|
||||
let host_and_port = url.expose_secret().host_and_port();
|
||||
|
||||
let _guard =
|
||||
tracing::trace_span!(target: "telemetry", "resolve_portal_url", host = %host_and_port.0).entered();
|
||||
let _span = telemetry_span!("resolve_portal_url", host = %host_and_port.0).entered();
|
||||
|
||||
// Statically resolve the host in the URL to a set of addresses.
|
||||
// We don't use these directly because we need to connect to the domain via TLS which requires a hostname.
|
||||
|
||||
@@ -71,7 +71,9 @@ impl Telemetry {
|
||||
environment: Some(environment.into()),
|
||||
// We can't get the release number ourselves because we don't know if we're embedded in a GUI Client or a Headless Client.
|
||||
release: Some(release.into()),
|
||||
traces_sample_rate: 0.1,
|
||||
// We submit all spans but only send the ones with `target: telemetry`.
|
||||
// Those spans are created further down and are throttled at creation time to save CPU.
|
||||
traces_sample_rate: 1.0,
|
||||
max_breadcrumbs: 500,
|
||||
..Default::default()
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user