feat(telemetry): pre-resolve Sentry ingest host (#10206)

Our Sentry client needs to resolve DNS before being able to send logs or
errors to the backend. Currently, this DNS resolution happens on-demand
as we don't take any control of the underlying HTTP client.

In addition, this will use HTTP/1.1 by default which isn't as efficient
as it could be, especially with concurrent requests.

Finally, if we decide to ever proxy all Sentry for traffic through our
own domain, we have to take control of the underlying client anyway.

To resolve all of the above, we create a custom `TransportFactory` where
we reuse the existing `ReqwestHttpTransport` but provide an already
configured `reqwest::Client` that always uses HTTP/2 with a
pre-configured set of DNS records for the given ingest host.
This commit is contained in:
Thomas Eizinger
2025-08-21 03:28:05 +00:00
committed by GitHub
parent b4cbc4f33b
commit 46afa52f78
11 changed files with 161 additions and 50 deletions

1
rust/Cargo.lock generated
View File

@@ -6005,6 +6005,7 @@ dependencies = [
"futures-channel",
"futures-core",
"futures-util",
"h2",
"http 1.3.1",
"http-body",
"http-body-util",

View File

@@ -263,7 +263,7 @@ impl WrappedSession {
.enable_all()
.build()?;
let mut telemetry = Telemetry::default();
let mut telemetry = Telemetry::new().context("Failed to create telemetry client")?;
runtime.block_on(telemetry.start(&api_url, RELEASE, APPLE_DSN, device_id.clone()));
Telemetry::set_account_slug(account_slug.clone());

View File

@@ -242,7 +242,7 @@ fn connect(
.build()
.context("Failed to create tokio runtime")?;
let mut telemetry = Telemetry::default();
let mut telemetry = Telemetry::new().context("Failed to create telemetry client")?;
runtime.block_on(telemetry.start(&api_url, RELEASE, platform::DSN, device_id.clone()));
Telemetry::set_account_slug(account_slug.clone());

View File

@@ -50,7 +50,7 @@ fn main() -> ExitCode {
.install_default()
.expect("Calling `install_default` only once per process should always succeed");
let mut telemetry = Telemetry::default();
let mut telemetry = Telemetry::new().expect("Failed to create telemetry client");
let runtime = tokio::runtime::Builder::new_current_thread()
.enable_all()

View File

@@ -26,10 +26,17 @@ fn main() -> ExitCode {
std::env::set_var("GDK_BACKEND", "x11");
}
let mut telemetry = Telemetry::default();
let cli = Cli::parse();
let mut telemetry = if cli.is_telemetry_allowed() {
Telemetry::new().expect("Failed to create telemetry client")
} else {
Telemetry::disabled()
};
let rt = tokio::runtime::Runtime::new().expect("failed to build runtime");
match try_main(&rt, &mut bootstrap_log_guard, &mut telemetry) {
match try_main(cli, &rt, &mut bootstrap_log_guard, &mut telemetry) {
Ok(()) => {
rt.block_on(telemetry.stop());
@@ -46,12 +53,11 @@ fn main() -> ExitCode {
}
fn try_main(
cli: Cli,
rt: &Runtime,
bootstrap_log_guard: &mut Option<DefaultGuard>,
telemetry: &mut Telemetry,
) -> Result<()> {
let cli = Cli::parse();
let config = gui::RunConfig {
inject_faults: cli.inject_faults,
debug_update_check: cli.debug_update_check,

View File

@@ -281,6 +281,8 @@ impl<'a> Handler<'a> {
) -> Result<Self> {
dns_controller.deactivate()?;
let telemetry = Telemetry::new().context("Failed to create telemetry client")?;
tracing::info!(
server_pid = std::process::id(),
"Listening for GUI to connect over IPC..."
@@ -306,7 +308,7 @@ impl<'a> Handler<'a> {
ipc_tx,
log_filter_reloader,
session: Session::None,
telemetry: Telemetry::default(),
telemetry,
tun_device,
dns_notifier,
network_notifier,

View File

@@ -188,8 +188,9 @@ fn main() -> Result<()> {
None => device_id::get_or_create().context("Could not get `firezone_id` from CLI, could not read it from disk, could not generate it and save it to disk")?.id,
};
let mut telemetry = Telemetry::default();
if cli.is_telemetry_allowed() {
let mut telemetry = if cli.is_telemetry_allowed() {
let mut telemetry = Telemetry::new().context("Failed to create telemetry client")?;
rt.block_on(telemetry.start(
cli.api_url.as_ref(),
RELEASE,
@@ -198,7 +199,11 @@ fn main() -> Result<()> {
));
analytics::identify(RELEASE.to_owned(), None);
}
telemetry
} else {
Telemetry::disabled()
};
tracing::info!(arch = std::env::consts::ARCH, version = VERSION);

View File

@@ -112,7 +112,7 @@ fn parse_filter(directives: &str) -> Result<EnvFilter, ParseError> {
///
/// By prepending this directive to the active log filter, a simple directive like `debug` actually produces useful logs.
/// If necessary, you can still activate logs from these crates by restating them in your directive with a lower filter, i.e. `netlink_proto=debug`.
const IRRELEVANT_CRATES: &str = "netlink_proto=warn,os_info=warn,rustls=warn,opentelemetry_sdk=info,opentelemetry=info,hyper_util=info";
const IRRELEVANT_CRATES: &str = "netlink_proto=warn,os_info=warn,rustls=warn,opentelemetry_sdk=info,opentelemetry=info,hyper_util=info,h2=info";
let env_filter = if directives.is_empty() {
EnvFilter::try_new(IRRELEVANT_CRATES)?

View File

@@ -125,15 +125,20 @@ fn main() {
.build()
.expect("Failed to build tokio runtime");
let mut telemetry = Telemetry::default();
if args.telemetry {
let mut telemetry = if args.telemetry {
let mut telemetry = Telemetry::new().expect("Failed to create telemetry client");
runtime.block_on(telemetry.start(
args.api_url.as_str(),
VERSION.unwrap_or("unknown"),
RELAY_DSN,
String::new(), // Relays don't have a Firezone ID.
));
}
telemetry
} else {
Telemetry::disabled()
};
match runtime.block_on(try_main(args)) {
Ok(()) => runtime.block_on(telemetry.stop()),

View File

@@ -13,7 +13,7 @@ moka = { workspace = true, features = ["sync"] }
opentelemetry = { workspace = true }
opentelemetry_sdk = { workspace = true, features = ["metrics"] }
parking_lot = { workspace = true }
reqwest = { workspace = true }
reqwest = { workspace = true, features = ["http2"] }
sentry = { workspace = true, features = ["contexts", "backtrace", "debug-images", "panic", "reqwest", "rustls", "tracing", "release-health", "logs"] }
serde = { workspace = true }
serde_json = { workspace = true }

View File

@@ -1,12 +1,21 @@
#![cfg_attr(test, allow(clippy::unwrap_used))]
use std::{borrow::Cow, collections::BTreeMap, fmt, str::FromStr, sync::Arc, time::Duration};
use std::{
borrow::Cow,
collections::BTreeMap,
fmt,
net::{SocketAddr, ToSocketAddrs as _},
str::FromStr,
sync::Arc,
time::Duration,
};
use anyhow::{Context, Result, anyhow, bail};
use api_url::ApiUrl;
use sentry::{
BeforeCallback, User,
protocol::{Event, Log, LogAttribute, SessionStatus},
transports::ReqwestHttpTransport,
};
use sha2::Digest as _;
@@ -22,34 +31,56 @@ mod posthog;
pub use maybe_push_metrics_exporter::MaybePushMetricsExporter;
pub use noop_push_metrics_exporter::NoopPushMetricsExporter;
pub struct Dsn(&'static str);
pub struct Dsn {
public_key: &'static str,
project_id: u64,
}
// TODO: Dynamic DSN
// Sentry docs say this does not need to be protected:
// > DSNs are safe to keep public because they only allow submission of new events and related event data; they do not allow read access to any information.
// <https://docs.sentry.io/concepts/key-terms/dsn-explainer/#dsn-utilization>
pub const ANDROID_DSN: Dsn = Dsn(
"https://928a6ee1f6af9734100b8bc89b2dc87d@o4507971108339712.ingest.us.sentry.io/4508175126233088",
);
pub const APPLE_DSN: Dsn = Dsn(
"https://66c71f83675f01abfffa8eb977bcbbf7@o4507971108339712.ingest.us.sentry.io/4508175177023488",
);
pub const GATEWAY_DSN: Dsn = Dsn(
"https://f763102cc3937199ec483fbdae63dfdc@o4507971108339712.ingest.us.sentry.io/4508162914451456",
);
pub const GUI_DSN: Dsn = Dsn(
"https://2e17bf5ed24a78c0ac9e84a5de2bd6fc@o4507971108339712.ingest.us.sentry.io/4508008945549312",
);
pub const HEADLESS_DSN: Dsn = Dsn(
"https://bc27dca8bb37be0142c48c4f89647c13@o4507971108339712.ingest.us.sentry.io/4508010028728320",
);
pub const RELAY_DSN: Dsn = Dsn(
"https://9d5f664d8f8f7f1716d4b63a58bcafd5@o4507971108339712.ingest.us.sentry.io/4508373298970624",
);
pub const TESTING: Dsn = Dsn(
"https://55ef451fca9054179a11f5d132c02f45@o4507971108339712.ingest.us.sentry.io/4508792604852224",
);
const INGEST_HOST: &str = "o4507971108339712.ingest.us.sentry.io";
pub const ANDROID_DSN: Dsn = Dsn {
public_key: "928a6ee1f6af9734100b8bc89b2dc87d",
project_id: 4508175126233088,
};
pub const APPLE_DSN: Dsn = Dsn {
public_key: "66c71f83675f01abfffa8eb977bcbbf7",
project_id: 4508175177023488,
};
pub const GATEWAY_DSN: Dsn = Dsn {
public_key: "f763102cc3937199ec483fbdae63dfdc",
project_id: 4508162914451456,
};
pub const GUI_DSN: Dsn = Dsn {
public_key: "2e17bf5ed24a78c0ac9e84a5de2bd6fc",
project_id: 4508008945549312,
};
pub const HEADLESS_DSN: Dsn = Dsn {
public_key: "bc27dca8bb37be0142c48c4f89647c13",
project_id: 4508010028728320,
};
pub const RELAY_DSN: Dsn = Dsn {
public_key: "9d5f664d8f8f7f1716d4b63a58bcafd5",
project_id: 4508373298970624,
};
pub const TESTING: Dsn = Dsn {
public_key: "55ef451fca9054179a11f5d132c02f45",
project_id: 4508792604852224,
};
impl fmt::Display for Dsn {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(
f,
"https://{}@{INGEST_HOST}/{}",
self.public_key, self.project_id
)
}
}
#[derive(Debug, PartialEq, Clone, Copy)]
pub(crate) enum Env {
@@ -104,9 +135,9 @@ impl fmt::Display for Env {
}
}
#[derive(Default)]
pub struct Telemetry {
inner: Option<sentry::ClientInitGuard>,
transport: TransportFactory,
}
impl Drop for Telemetry {
@@ -121,6 +152,22 @@ impl Drop for Telemetry {
}
impl Telemetry {
pub fn new() -> Result<Self> {
Ok(Self {
inner: Default::default(),
// The instances for `Telemetry` get created at the very beginning of each program entry-point.
// Therefore, it is safe to perform DNS resolution there right away before the tunnel is up.
transport: TransportFactory::resolve_ingest_host()?,
})
}
pub fn disabled() -> Self {
Self {
inner: None,
transport: TransportFactory::without_addresses(),
}
}
pub async fn start(&mut self, api_url: &str, release: &str, dsn: Dsn, firezone_id: String) {
// Can't use URLs as `environment` directly, because Sentry doesn't allow slashes in environments.
// <https://docs.sentry.io/platforms/rust/configuration/environments/>
@@ -156,17 +203,21 @@ impl Telemetry {
feature_flags::evaluate_now(firezone_id.clone(), environment).await;
tracing::info!(%environment, "Starting telemetry");
let client_options = sentry::ClientOptions {
environment: Some(Cow::Borrowed(environment.as_str())),
// We can't get the release number ourselves because we don't know if we're embedded in a GUI Client or a Headless Client.
release: Some(release.to_owned().into()),
max_breadcrumbs: 500,
before_send: Some(event_rate_limiter(Duration::from_secs(60 * 5))),
enable_logs: true,
before_send_log: Some(Arc::new(append_tracing_fields_to_message)),
..Default::default()
};
let inner = sentry::init((
dsn.0,
dsn.to_string(),
sentry::ClientOptions {
environment: Some(Cow::Borrowed(environment.as_str())),
// We can't get the release number ourselves because we don't know if we're embedded in a GUI Client or a Headless Client.
release: Some(release.to_owned().into()),
max_breadcrumbs: 500,
before_send: Some(event_rate_limiter(Duration::from_secs(60 * 5))),
enable_logs: true,
before_send_log: Some(Arc::new(append_tracing_fields_to_message)),
..Default::default()
transport: Some(Arc::new(self.transport.clone())),
..client_options
},
));
// Configure scope on the main hub so that all threads will get the tags
@@ -345,6 +396,47 @@ fn set_current_user(user: Option<sentry::User>) {
sentry::Hub::main().configure_scope(|scope| scope.set_user(user));
}
#[derive(Debug, Clone)]
pub struct TransportFactory {
ingest_domain_addresses: Vec<SocketAddr>,
}
impl TransportFactory {
pub fn resolve_ingest_host() -> Result<Self> {
let resolved_addresses = (INGEST_HOST, 443u16)
.to_socket_addrs()
.with_context(|| format!("Failed to resolve {INGEST_HOST}"))?
.collect();
tracing::debug!(host = %INGEST_HOST, addresses = ?resolved_addresses, "Resolved ingest host IPs");
Ok(Self {
ingest_domain_addresses: resolved_addresses,
})
}
fn without_addresses() -> Self {
Self {
ingest_domain_addresses: Default::default(),
}
}
}
impl sentry::TransportFactory for TransportFactory {
fn create_transport(&self, options: &sentry::ClientOptions) -> Arc<dyn sentry::Transport> {
let client = reqwest::ClientBuilder::new()
.http2_prior_knowledge()
.http2_keep_alive_while_idle(true)
.http2_keep_alive_timeout(Duration::from_secs(1))
.http2_keep_alive_interval(Duration::from_secs(5)) // Ensure we detect broken connections, i.e. when enabling / disabling the Internet Resource.
.resolve_to_addrs(INGEST_HOST, &self.ingest_domain_addresses)
.build()
.expect("Failed to build HTTP client");
Arc::new(ReqwestHttpTransport::with_client(options, client))
}
}
#[cfg(test)]
mod tests {
use std::time::SystemTime;
@@ -353,7 +445,7 @@ mod tests {
#[tokio::test]
async fn starting_session_for_unsupported_env_disables_current_one() {
let mut telemetry = Telemetry::default();
let mut telemetry = Telemetry::new().unwrap();
telemetry
.start("wss://api.firez.one", "1.0.0", TESTING, String::new())
.await;