From 3e18fa8ca220ed1e5aa59f158245065e37d295db Mon Sep 17 00:00:00 2001
From: Thomas Eizinger
Date: Wed, 13 Nov 2024 00:16:44 +0000
Subject: [PATCH] chore(telemetry): misc. clean-up (#7326)

Bundles together several minor improvements around telemetry:

- Removes the obsolete "Firezone" context: This is now included in the user context as of #7310.
- Entirely encapsulates `sentry` within the `telemetry` module
- Concludes sessions that were not explicitly closed as "abnormal"
---
 rust/gui-client/src-tauri/src/client.rs     | 28 ++-------------------
 rust/gui-client/src-tauri/src/client/gui.rs | 17 ++++++-------
 rust/telemetry/src/lib.rs                   | 27 +++++++++++++++-----
 3 files changed, 30 insertions(+), 42 deletions(-)

diff --git a/rust/gui-client/src-tauri/src/client.rs b/rust/gui-client/src-tauri/src/client.rs
index 608870a6f..e1c19ff95 100644
--- a/rust/gui-client/src-tauri/src/client.rs
+++ b/rust/gui-client/src-tauri/src/client.rs
@@ -5,7 +5,6 @@ use firezone_gui_client_common::{
 };
 use firezone_logging::{anyhow_dyn_err, std_dyn_err};
 use firezone_telemetry as telemetry;
-use std::collections::BTreeMap;
 use tracing::instrument;
 use tracing_subscriber::EnvFilter;

@@ -101,22 +100,8 @@ fn run_gui(cli: Cli) -> Result<()> {
     );
     // Get the device ID before starting Tokio, so that all the worker threads will inherit the correct scope.
     // Technically this means we can fail to get the device ID on a newly-installed system, since the IPC service may not have fully started up when the GUI process reaches this point, but in practice it's unlikely.
-    match firezone_headless_client::device_id::get() {
-        Ok(id) => {
-            // We should still be in the main thread, but explicitly get the main thread hub anyway.
-            telemetry::Hub::main().configure_scope(|scope| {
-                scope.set_context(
-                    "firezone",
-                    firezone_telemetry::Context::Other(BTreeMap::from([(
-                        "id".to_string(),
-                        id.id.into(),
-                    )])),
-                )
-            });
-        }
-        Err(error) => {
-            telemetry::capture_anyhow(&error.context("Failed to read device ID"));
-        }
+    if let Ok(id) = firezone_headless_client::device_id::get() {
+        telemetry.set_firezone_id(id.id);
     }
     fix_log_filter(&mut settings)?;
     let common::logging::Handles {
@@ -165,15 +150,6 @@ fn start_logging(directives: &str) -> Result<common::logging::Handles> {
         ?system_uptime_seconds,
         "`gui-client` started logging"
     );
-    telemetry::add_breadcrumb(telemetry::Breadcrumb {
-        ty: "logging_start".into(),
-        category: None,
-        data: BTreeMap::from([
-            ("directives".into(), directives.into()),
-            ("system_uptime_seconds".into(), system_uptime_seconds.into()),
-        ]),
-        ..Default::default()
-    });

     Ok(logging_handles)
 }
diff --git a/rust/gui-client/src-tauri/src/client/gui.rs b/rust/gui-client/src-tauri/src/client/gui.rs
index c154f7d5d..db526ead8 100644
--- a/rust/gui-client/src-tauri/src/client/gui.rs
+++ b/rust/gui-client/src-tauri/src/client/gui.rs
@@ -204,11 +204,6 @@ pub(crate) fn run(
             tracing::warn!(
                 "Will crash / error / panic on purpose in {delay} seconds to test error handling."
             );
-            telemetry::add_breadcrumb(telemetry::Breadcrumb {
-                ty: "fail_on_purpose".into(),
-                message: Some("Will crash / error / panic on purpose to test error handling.".into()),
-                ..Default::default()
-            });
             tokio::time::sleep(Duration::from_secs(delay)).await;
             tracing::warn!("Crashing / erroring / panicking on purpose");
             ctlr_tx.send(ControllerRequest::Fail(failure)).await?;
@@ -259,25 +254,27 @@
     let exit_code = match result {
         Err(_panic) => {
             // The panic will have been recorded already by Sentry's panic hook.
-            telemetry::end_session_with_status(telemetry::SessionStatus::Crashed);
+            telemetry.stop_on_crash().await;
+
             1
         }
         Ok(Err(error)) => {
             tracing::error!(error = std_dyn_err(&error), "run_controller returned an error");
             errors::show_error_dialog(&error).unwrap();
-            telemetry::end_session_with_status(telemetry::SessionStatus::Crashed);
+
+            telemetry.stop_on_crash().await;
+
             1
         }
         Ok(Ok(_)) => {
-            telemetry::end_session();
+            telemetry.stop().await;
+
             0
         }
     };

-    // In a normal Rust application, Sentry's guard would flush on drop: https://docs.sentry.io/platforms/rust/configuration/draining/
     // But due to a limit in `tao` we cannot return from the event loop and must call `std::process::exit` (or Tauri's wrapper), so we explicitly flush here.
     // TODO: This limit may not exist in Tauri v2
-    telemetry.stop().await;

     tracing::info!(?exit_code);
     app_handle.exit(exit_code);
diff --git a/rust/telemetry/src/lib.rs b/rust/telemetry/src/lib.rs
index 80f7ba29b..e0f3a5ede 100644
--- a/rust/telemetry/src/lib.rs
+++ b/rust/telemetry/src/lib.rs
@@ -1,10 +1,6 @@
 use std::time::Duration;

-pub use sentry::{
-    add_breadcrumb, capture_error, capture_message, end_session, end_session_with_status,
-    types::protocol::v7::{Context, SessionStatus},
-    Breadcrumb, Hub, Level,
-};
+use sentry::protocol::SessionStatus;
 pub use sentry_anyhow::capture_anyhow;

 pub struct Dsn(&'static str);
@@ -40,6 +36,17 @@ pub struct Telemetry {
     firezone_id: Option<String>,
 }

+impl Drop for Telemetry {
+    fn drop(&mut self) {
+        if self.inner.is_none() {
+            return;
+        }
+
+        // Conclude telemetry session as "abnormal" if we get dropped without closing it properly first.
+        sentry::end_session_with_status(SessionStatus::Abnormal);
+    }
+}
+
 impl Telemetry {
     pub fn start(&mut self, api_url: &str, release: &str, dsn: Dsn) {
         // Since it's `arc_swap` and not `Option`, there is a TOCTOU here,
@@ -89,11 +96,19 @@ impl Telemetry {

     /// Flushes events to sentry.io and drops the guard
     pub async fn stop(&mut self) {
+        self.end_session(SessionStatus::Exited).await;
+    }
+
+    pub async fn stop_on_crash(&mut self) {
+        self.end_session(SessionStatus::Crashed).await;
+    }
+
+    async fn end_session(&mut self, status: SessionStatus) {
         let Some(inner) = self.inner.take() else {
             return;
         };
         tracing::info!("Stopping telemetry");
-        sentry::end_session();
+        sentry::end_session_with_status(status);

         // Sentry uses blocking IO for flushing ..
         let _ = tokio::task::spawn_blocking(move || {
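
Usage sketch (reviewer note, not part of the patch): with `sentry` fully encapsulated, call sites now drive everything through the owned `Telemetry` value instead of the previously re-exported `sentry` items. The lifecycle below is pieced together from the hunks above; `Telemetry::default()`, the `dsn` parameter, and all argument values are assumptions, since the struct's derives and any `Dsn` constructors sit outside the diff context.

    // Hypothetical caller, for illustration only.
    async fn telemetry_lifecycle(dsn: firezone_telemetry::Dsn, clean_shutdown: bool) {
        // Assumption: `Telemetry` implements `Default`; the fields shown in the
        // patch (`inner`, `firezone_id`) both start out as `None`.
        let mut telemetry = firezone_telemetry::Telemetry::default();

        // Initializes Sentry and (presumably) begins the release-health session
        // that `stop`, `stop_on_crash` and the new `Drop` impl conclude.
        telemetry.start("https://api.example.com", "gui-client@0.0.0", dsn);

        // Replaces the old hand-rolled "firezone" scope context from client.rs;
        // assumed to take a `String`, matching `id.id` above.
        telemetry.set_firezone_id("some-device-id".to_owned());

        if clean_shutdown {
            telemetry.stop().await; // concludes the session as `Exited` and flushes
        } else {
            telemetry.stop_on_crash().await; // concludes it as `Crashed` and flushes
        }

        // If neither ran before the value is dropped, the new `Drop` impl
        // concludes the session as `Abnormal` instead of leaving it dangling.
    }

The guard-on-drop is a no-op after a successful `stop` or `stop_on_crash`: both take `self.inner`, and `drop` returns early when `inner` is already `None`, so only sessions that were never concluded explicitly get reported as abnormal.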