From 3e18fa8ca220ed1e5aa59f158245065e37d295db Mon Sep 17 00:00:00 2001
From: Thomas Eizinger
Date: Wed, 13 Nov 2024 00:16:44 +0000
Subject: [PATCH] chore(telemetry): misc. clean-up (#7326)

Bundles together several minor improvements around telemetry:

- Removes the obsolete "Firezone" context: This is now included in the user context as of #7310.
- Entirely encapsulates `sentry` within the `telemetry` module
- Concludes sessions that were not explicitly closed as "abnormal"
---
 rust/gui-client/src-tauri/src/client.rs     | 28 ++-------------------
 rust/gui-client/src-tauri/src/client/gui.rs | 17 ++++++-------
 rust/telemetry/src/lib.rs                   | 27 +++++++++++++++-----
 3 files changed, 30 insertions(+), 42 deletions(-)

diff --git a/rust/gui-client/src-tauri/src/client.rs b/rust/gui-client/src-tauri/src/client.rs
index 608870a6f..e1c19ff95 100644
--- a/rust/gui-client/src-tauri/src/client.rs
+++ b/rust/gui-client/src-tauri/src/client.rs
@@ -5,7 +5,6 @@ use firezone_gui_client_common::{
 };
 use firezone_logging::{anyhow_dyn_err, std_dyn_err};
 use firezone_telemetry as telemetry;
-use std::collections::BTreeMap;
 use tracing::instrument;
 use tracing_subscriber::EnvFilter;

@@ -101,22 +100,8 @@ fn run_gui(cli: Cli) -> Result<()> {
     );
     // Get the device ID before starting Tokio, so that all the worker threads will inherit the correct scope.
     // Technically this means we can fail to get the device ID on a newly-installed system, since the IPC service may not have fully started up when the GUI process reaches this point, but in practice it's unlikely.
-    match firezone_headless_client::device_id::get() {
-        Ok(id) => {
-            // We should still be in the main thread, but explicitly get the main thread hub anyway.
-            telemetry::Hub::main().configure_scope(|scope| {
-                scope.set_context(
-                    "firezone",
-                    firezone_telemetry::Context::Other(BTreeMap::from([(
-                        "id".to_string(),
-                        id.id.into(),
-                    )])),
-                )
-            });
-        }
-        Err(error) => {
-            telemetry::capture_anyhow(&error.context("Failed to read device ID"));
-        }
+    if let Ok(id) = firezone_headless_client::device_id::get() {
+        telemetry.set_firezone_id(id.id);
     }
     fix_log_filter(&mut settings)?;
     let common::logging::Handles {
@@ -165,15 +150,6 @@ fn start_logging(directives: &str) -> Result<common::logging::Handles> {
         ?system_uptime_seconds,
         "`gui-client` started logging"
     );
-    telemetry::add_breadcrumb(telemetry::Breadcrumb {
-        ty: "logging_start".into(),
-        category: None,
-        data: BTreeMap::from([
-            ("directives".into(), directives.into()),
-            ("system_uptime_seconds".into(), system_uptime_seconds.into()),
-        ]),
-        ..Default::default()
-    });

     Ok(logging_handles)
 }
diff --git a/rust/gui-client/src-tauri/src/client/gui.rs b/rust/gui-client/src-tauri/src/client/gui.rs
index c154f7d5d..db526ead8 100644
--- a/rust/gui-client/src-tauri/src/client/gui.rs
+++ b/rust/gui-client/src-tauri/src/client/gui.rs
@@ -204,11 +204,6 @@ pub(crate) fn run(
             tracing::warn!(
                 "Will crash / error / panic on purpose in {delay} seconds to test error handling."
             );
-            telemetry::add_breadcrumb(telemetry::Breadcrumb {
-                ty: "fail_on_purpose".into(),
-                message: Some("Will crash / error / panic on purpose to test error handling.".into()),
-                ..Default::default()
-            });
             tokio::time::sleep(Duration::from_secs(delay)).await;
             tracing::warn!("Crashing / erroring / panicking on purpose");
             ctlr_tx.send(ControllerRequest::Fail(failure)).await?;
@@ -259,25 +254,27 @@
     let exit_code = match result {
         Err(_panic) => {
             // The panic will have been recorded already by Sentry's panic hook.
-            telemetry::end_session_with_status(telemetry::SessionStatus::Crashed);
+            telemetry.stop_on_crash().await;
+
             1
         }
         Ok(Err(error)) => {
             tracing::error!(error = std_dyn_err(&error), "run_controller returned an error");
             errors::show_error_dialog(&error).unwrap();
-            telemetry::end_session_with_status(telemetry::SessionStatus::Crashed);
+
+            telemetry.stop_on_crash().await;
+
             1
         }
         Ok(Ok(_)) => {
-            telemetry::end_session();
+            telemetry.stop().await;
+
             0
         }
     };

-    // In a normal Rust application, Sentry's guard would flush on drop: https://docs.sentry.io/platforms/rust/configuration/draining/
     // But due to a limit in `tao` we cannot return from the event loop and must call `std::process::exit` (or Tauri's wrapper), so we explicitly flush here.
     // TODO: This limit may not exist in Tauri v2
-    telemetry.stop().await;

     tracing::info!(?exit_code);
     app_handle.exit(exit_code);
diff --git a/rust/telemetry/src/lib.rs b/rust/telemetry/src/lib.rs
index 80f7ba29b..e0f3a5ede 100644
--- a/rust/telemetry/src/lib.rs
+++ b/rust/telemetry/src/lib.rs
@@ -1,10 +1,6 @@
 use std::time::Duration;

-pub use sentry::{
-    add_breadcrumb, capture_error, capture_message, end_session, end_session_with_status,
-    types::protocol::v7::{Context, SessionStatus},
-    Breadcrumb, Hub, Level,
-};
+use sentry::protocol::SessionStatus;
 pub use sentry_anyhow::capture_anyhow;

 pub struct Dsn(&'static str);
@@ -40,6 +36,17 @@ pub struct Telemetry {
     firezone_id: Option<String>,
 }

+impl Drop for Telemetry {
+    fn drop(&mut self) {
+        if self.inner.is_none() {
+            return;
+        }
+
+        // Conclude telemetry session as "abnormal" if we get dropped without closing it properly first.
+        sentry::end_session_with_status(SessionStatus::Abnormal);
+    }
+}
+
 impl Telemetry {
     pub fn start(&mut self, api_url: &str, release: &str, dsn: Dsn) {
         // Since it's `arc_swap` and not `Option`, there is a TOCTOU here,
@@ -89,11 +96,19 @@ impl Telemetry {

     /// Flushes events to sentry.io and drops the guard
     pub async fn stop(&mut self) {
+        self.end_session(SessionStatus::Exited).await;
+    }
+
+    pub async fn stop_on_crash(&mut self) {
+        self.end_session(SessionStatus::Crashed).await;
+    }
+
+    async fn end_session(&mut self, status: SessionStatus) {
         let Some(inner) = self.inner.take() else {
             return;
         };
         tracing::info!("Stopping telemetry");
-        sentry::end_session();
+        sentry::end_session_with_status(status);

         // Sentry uses blocking IO for flushing ..
         let _ = tokio::task::spawn_blocking(move || {
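
Usage sketch (reviewer note, not part of the patch): with `sentry` fully encapsulated, call sites now drive everything through the owned `Telemetry` value instead of the previously re-exported `sentry` items. The lifecycle below is pieced together from the hunks above; `Telemetry::default()`, the `dsn` parameter, and all argument values are assumptions, since the struct's derives and any `Dsn` constructors sit outside the diff context.

    // Hypothetical caller, for illustration only.
    async fn telemetry_lifecycle(dsn: firezone_telemetry::Dsn, clean_shutdown: bool) {
        // Assumption: `Telemetry` implements `Default`; the fields shown in the
        // patch (`inner`, `firezone_id`) both start out as `None`.
        let mut telemetry = firezone_telemetry::Telemetry::default();

        // Initializes Sentry and (presumably) begins the release-health session
        // that `stop`, `stop_on_crash` and the new `Drop` impl conclude.
        telemetry.start("https://api.example.com", "gui-client@0.0.0", dsn);

        // Replaces the old hand-rolled "firezone" scope context from client.rs;
        // assumed to take a `String`, matching `id.id` above.
        telemetry.set_firezone_id("some-device-id".to_owned());

        if clean_shutdown {
            telemetry.stop().await; // concludes the session as `Exited` and flushes
        } else {
            telemetry.stop_on_crash().await; // concludes it as `Crashed` and flushes
        }

        // If neither ran before the value is dropped, the new `Drop` impl
        // concludes the session as `Abnormal` instead of leaving it dangling.
    }

The guard-on-drop is a no-op after a successful `stop` or `stop_on_crash`: both take `self.inner`, and `drop` returns early when `inner` is already `None`, so only sessions that were never concluded explicitly get reported as abnormal.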