chore(telemetry): capture Firezone ID and account in user ctx (#7310)

Sentry has a feature called the "User context" which allows us to assign
events to individual users. This in turn will give us statistics in
Sentry, how many users are affected by a certain issue.

Unfortunately, Sentry's user context cannot be built-up step-by-step but
has to be set as a whole. To achieve this, we need to slightly refactor
`Telemetry` to not be `clone`d and instead passed around by mutable
reference.

Resolves: #7248.
Related: https://github.com/getsentry/sentry-rust/issues/706.
This commit is contained in:
Thomas Eizinger
2024-11-11 19:50:14 +00:00
committed by GitHub
parent 06a274c4e5
commit 488c599d5b
13 changed files with 107 additions and 104 deletions

8
rust/Cargo.lock generated
View File

@@ -181,12 +181,6 @@ dependencies = [
"x11rb",
]
[[package]]
name = "arc-swap"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457"
[[package]]
name = "arrayvec"
version = "0.7.6"
@@ -2013,6 +2007,7 @@ dependencies = [
"firezone-headless-client",
"firezone-logging",
"firezone-telemetry",
"futures",
"native-dialog",
"nix 0.29.0",
"rand 0.8.5",
@@ -2193,7 +2188,6 @@ dependencies = [
name = "firezone-telemetry"
version = "0.1.0"
dependencies = [
"arc-swap",
"sentry",
"sentry-anyhow",
"thiserror",

View File

@@ -348,8 +348,9 @@ fn connect(
let device_info = string_from_jstring!(env, device_info);
let device_info = serde_json::from_str(&device_info).unwrap();
let telemetry = Telemetry::default();
let mut telemetry = Telemetry::default();
telemetry.start(&api_url, env!("CARGO_PKG_VERSION"), ANDROID_DSN);
telemetry.set_firezone_id(device_id.clone());
let handle = init_logging(&PathBuf::from(log_dir), log_filter);
install_rustls_crypto_provider();
@@ -469,7 +470,7 @@ pub unsafe extern "system" fn Java_dev_firezone_android_tunnel_ConnlibSession_di
) {
let session = session_ptr as *mut SessionWrapper;
catch_and_throw(&mut env, |_| {
let session = Box::from_raw(session);
let mut session = Box::from_raw(session);
session.runtime.block_on(session.telemetry.stop());
session.inner.disconnect();

View File

@@ -193,8 +193,9 @@ impl WrappedSession {
callback_handler: ffi::CallbackHandler,
device_info: String,
) -> Result<Self> {
let telemetry = Telemetry::default();
let mut telemetry = Telemetry::default();
telemetry.start(&api_url, env!("CARGO_PKG_VERSION"), APPLE_DSN);
telemetry.set_firezone_id(device_id.clone());
let logger = init_logging(log_dir.into(), log_filter)?;
install_rustls_crypto_provider();
@@ -264,7 +265,7 @@ impl WrappedSession {
self.inner.set_disabled_resources(disabled_resources)
}
fn disconnect(self) {
fn disconnect(mut self) {
self.runtime.block_on(self.telemetry.stop());
self.inner.disconnect();
}

View File

@@ -39,7 +39,7 @@ async fn main() {
.expect("Calling `install_default` only once per process should always succeed");
let cli = Cli::parse();
let telemetry = Telemetry::default();
let mut telemetry = Telemetry::default();
if cli.is_telemetry_allowed() {
telemetry.start(
cli.api_url.as_str(),
@@ -53,17 +53,18 @@ async fn main() {
//
// By default, `anyhow` prints a stacktrace when it exits.
// That looks like a "crash" but we "just" exit with a fatal error.
if let Err(e) = try_main(cli).await {
if let Err(e) = try_main(cli, &mut telemetry).await {
tracing::error!(error = anyhow_dyn_err(&e));
std::process::exit(1);
}
}
async fn try_main(cli: Cli) -> Result<()> {
async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result<()> {
firezone_logging::setup_global_subscriber(layer::Identity::default());
let firezone_id = get_firezone_id(cli.firezone_id).await
.context("Couldn't read FIREZONE_ID or write it to disk: Please provide it through the env variable or provide rw access to /var/lib/firezone/")?;
telemetry.set_firezone_id(firezone_id.clone());
let login = LoginUrl::gateway(
cli.api_url,

View File

@@ -73,11 +73,17 @@ pub(crate) struct Response {
}
#[derive(Default, Deserialize, Serialize)]
pub(crate) struct Session {
pub struct Session {
pub(crate) account_slug: String,
pub(crate) actor_name: String,
}
impl Session {
pub fn account_slug(&self) -> &str {
&self.account_slug
}
}
struct SessionAndToken {
session: Session,
token: SecretString,
@@ -118,7 +124,7 @@ impl Auth {
}
/// Returns the session iff we are signed in.
pub(crate) fn session(&self) -> Option<&Session> {
pub fn session(&self) -> Option<&Session> {
match &self.state {
State::SignedIn(x) => Some(x),
State::NeedResponse(_) | State::SignedOut => None,
@@ -234,7 +240,6 @@ impl Auth {
match std::fs::read_to_string(session_data_path()?) {
Ok(x) => {
session = serde_json::from_str(&x).map_err(|_| Error::Serde)?;
firezone_telemetry::set_account_slug(session.account_slug.to_string());
}
Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
Err(e) => return Err(Error::ReadFile(e)),

View File

@@ -26,7 +26,7 @@ mod ran_before;
pub type CtlrTx = mpsc::Sender<ControllerRequest>;
pub struct Controller<I: GuiIntegration> {
pub struct Controller<'a, I: GuiIntegration> {
/// Debugging-only settings like API URL, auth URL, log filter
advanced_settings: AdvancedSettings,
// Sign-in state with the portal / deep links
@@ -41,23 +41,23 @@ pub struct Controller<I: GuiIntegration> {
release: Option<updates::Release>,
rx: mpsc::Receiver<ControllerRequest>,
status: Status,
telemetry: Telemetry,
telemetry: &'a mut Telemetry,
updates_rx: mpsc::Receiver<Option<updates::Notification>>,
uptime: crate::uptime::Tracker,
}
pub struct Builder<I: GuiIntegration> {
pub struct Builder<'a, I: GuiIntegration> {
pub advanced_settings: AdvancedSettings,
pub ctlr_tx: CtlrTx,
pub integration: I,
pub log_filter_reloader: LogFilterReloader,
pub rx: mpsc::Receiver<ControllerRequest>,
pub telemetry: Telemetry,
pub telemetry: &'a mut Telemetry,
pub updates_rx: mpsc::Receiver<Option<updates::Notification>>,
}
impl<I: GuiIntegration> Builder<I> {
pub async fn build(self) -> Result<Controller<I>> {
impl<'a, I: GuiIntegration> Builder<'a, I> {
pub async fn build(self) -> Result<Controller<'a, I>> {
let Builder {
advanced_settings,
ctlr_tx,
@@ -203,8 +203,10 @@ impl Status {
}
}
impl<I: GuiIntegration> Controller<I> {
impl<'a, I: GuiIntegration> Controller<'a, I> {
pub async fn main_loop(mut self) -> Result<(), Error> {
let account_slug = self.auth.session().map(|s| s.account_slug.to_owned());
// Start telemetry
{
const VERSION: &str = env!("CARGO_PKG_VERSION");
@@ -212,10 +214,15 @@ impl<I: GuiIntegration> Controller<I> {
let environment = self.advanced_settings.api_url.to_string();
self.telemetry
.start(&environment, VERSION, firezone_telemetry::GUI_DSN);
if let Some(account_slug) = account_slug.clone() {
self.telemetry.set_account_slug(account_slug);
}
self.ipc_client
.send_msg(&IpcClientMsg::StartTelemetry {
environment,
version: VERSION.to_owned(),
account_slug,
})
.await?;
}

View File

@@ -23,6 +23,7 @@ firezone-gui-client-common = { path = "../src-common" }
firezone-headless-client = { path = "../../headless-client" }
firezone-logging = { workspace = true }
firezone-telemetry = { workspace = true }
futures = "0.3.31"
native-dialog = "0.7.0"
rand = "0.8.5"
rustls = { workspace = true }

View File

@@ -61,7 +61,7 @@ pub(crate) fn run() -> Result<()> {
Some(Cmd::SmokeTest) => {
// Can't check elevation here because the Windows CI is always elevated
let settings = common::settings::load_advanced_settings().unwrap_or_default();
let telemetry = telemetry::Telemetry::default();
let mut telemetry = telemetry::Telemetry::default();
telemetry.start(
settings.api_url.as_ref(),
env!("CARGO_PKG_VERSION"),
@@ -92,7 +92,7 @@ pub(crate) fn run() -> Result<()> {
// Can't `instrument` this because logging isn't running when we enter it.
fn run_gui(cli: Cli) -> Result<()> {
let mut settings = common::settings::load_advanced_settings().unwrap_or_default();
let telemetry = telemetry::Telemetry::default();
let mut telemetry = telemetry::Telemetry::default();
// In the future telemetry will be opt-in per organization, that's why this isn't just at the top of `main`
telemetry.start(
settings.api_url.as_ref(),

View File

@@ -20,8 +20,9 @@ use firezone_gui_client_common::{
use firezone_headless_client::LogFilterReloader;
use firezone_logging::{anyhow_dyn_err, std_dyn_err};
use firezone_telemetry as telemetry;
use futures::FutureExt;
use secrecy::{ExposeSecret as _, SecretString};
use std::{str::FromStr, time::Duration};
use std::{panic::AssertUnwindSafe, str::FromStr, time::Duration};
use tauri::Manager;
use tauri_plugin_shell::ShellExt as _;
use tokio::sync::{mpsc, oneshot};
@@ -122,7 +123,7 @@ pub(crate) fn run(
cli: client::Cli,
advanced_settings: AdvancedSettings,
reloader: LogFilterReloader,
telemetry: telemetry::Telemetry,
mut telemetry: telemetry::Telemetry,
) -> Result<(), Error> {
// Needed for the deep link server
let rt = tokio::runtime::Runtime::new().context("Couldn't start Tokio runtime")?;
@@ -238,16 +239,15 @@ pub(crate) fn run(
let app_handle = app.handle().clone();
let _ctlr_task = tokio::spawn(async move {
// Spawn two nested Tasks so the outer can catch panics from the inner
let task = tokio::spawn(run_controller(
let result = AssertUnwindSafe(run_controller(
ctlr_tx,
integration,
ctlr_rx,
advanced_settings,
reloader,
telemetry.clone(),
&mut telemetry,
updates_rx,
));
)).catch_unwind().await;
// See <https://github.com/tauri-apps/tauri/issues/8631>
// This should be the ONLY place we call `app.exit` or `app_handle.exit`,
@@ -256,9 +256,9 @@ pub(crate) fn run(
// This seems to be a platform limitation that Tauri is unable to hide
// from us. It was the source of much consternation at time of writing.
let exit_code = match task.await {
Err(error) => {
tracing::error!(error = std_dyn_err(&error), "run_controller panicked");
let exit_code = match result {
Err(_panic) => {
// The panic will have been recorded already by Sentry's panic hook.
telemetry::end_session_with_status(telemetry::SessionStatus::Crashed);
1
}
@@ -476,7 +476,7 @@ async fn run_controller(
rx: mpsc::Receiver<ControllerRequest>,
advanced_settings: AdvancedSettings,
log_filter_reloader: LogFilterReloader,
telemetry: telemetry::Telemetry,
telemetry: &mut telemetry::Telemetry,
updates_rx: mpsc::Receiver<Option<updates::Notification>>,
) -> Result<(), Error> {
tracing::debug!("Entered `run_controller`");

View File

@@ -19,14 +19,7 @@ use futures::{
use phoenix_channel::LoginUrl;
use secrecy::SecretString;
use serde::{Deserialize, Serialize};
use std::{
collections::{BTreeMap, BTreeSet},
net::IpAddr,
path::PathBuf,
pin::pin,
sync::Arc,
time::Duration,
};
use std::{collections::BTreeSet, net::IpAddr, path::PathBuf, pin::pin, sync::Arc, time::Duration};
use tokio::{sync::mpsc, task::spawn_blocking, time::Instant};
use tracing::subscriber::set_global_default;
use tracing_subscriber::{layer::SubscriberExt, EnvFilter, Layer, Registry};
@@ -96,6 +89,7 @@ pub enum ClientMsg {
StartTelemetry {
environment: String,
version: String,
account_slug: Option<String>,
},
StopTelemetry,
}
@@ -201,7 +195,7 @@ fn run_smoke_test() -> Result<()> {
// and we need to recover. <https://github.com/firezone/firezone/issues/4899>
dns_controller.deactivate()?;
let mut signals = signals::Terminate::new()?;
let telemetry = Telemetry::default();
let mut telemetry = Telemetry::default();
// Couldn't get the loop to work here yet, so SIGHUP is not implemented
rt.block_on(async {
@@ -211,7 +205,7 @@ fn run_smoke_test() -> Result<()> {
&mut server,
&mut dns_controller,
&log_filter_reloader,
telemetry,
&mut telemetry,
)
.await?
.run(&mut signals)
@@ -234,26 +228,18 @@ async fn ipc_listen(
let firezone_id = device_id::get_or_create()
.context("Failed to read / create device ID")?
.id;
// TODO: Telemetry is initialized wrong here:
// Sentry's contexts must be set on the main thread hub before starting Tokio, so that other thread hubs will inherit the context.
firezone_telemetry::configure_scope(|scope| {
scope.set_context(
"firezone",
firezone_telemetry::Context::Other(BTreeMap::from([(
"id".to_string(),
firezone_id.into(),
)])),
)
});
let mut telemetry = Telemetry::default();
telemetry.set_firezone_id(firezone_id);
let mut server = IpcServer::new(ServiceId::Prod).await?;
let mut dns_controller = DnsController { dns_control_method };
let telemetry = Telemetry::default();
loop {
let mut handler_fut = pin!(Handler::new(
&mut server,
&mut dns_controller,
log_filter_reloader,
telemetry.clone(),
&mut telemetry,
));
let Some(handler) = poll_fn(|cx| {
if let Poll::Ready(()) = signals.poll_recv(cx) {
@@ -285,7 +271,7 @@ struct Handler<'a> {
last_connlib_start_instant: Option<Instant>,
log_filter_reloader: &'a LogFilterReloader,
session: Option<Session>,
telemetry: Telemetry, // Handle to the sentry.io telemetry module
telemetry: &'a mut Telemetry, // Handle to the sentry.io telemetry module
tun_device: TunDeviceManager,
}
@@ -316,7 +302,7 @@ impl<'a> Handler<'a> {
server: &mut IpcServer,
dns_controller: &'a mut DnsController,
log_filter_reloader: &'a LogFilterReloader,
telemetry: Telemetry,
telemetry: &'a mut Telemetry,
) -> Result<Self> {
dns_controller.deactivate()?;
let (ipc_rx, ipc_tx) = server
@@ -549,9 +535,15 @@ impl<'a> Handler<'a> {
ClientMsg::StartTelemetry {
environment,
version,
} => self
.telemetry
.start(&environment, &version, firezone_telemetry::IPC_SERVICE_DSN),
account_slug,
} => {
self.telemetry
.start(&environment, &version, firezone_telemetry::IPC_SERVICE_DSN);
if let Some(account_slug) = account_slug {
self.telemetry.set_account_slug(account_slug);
}
}
ClientMsg::StopTelemetry => {
self.telemetry.stop().await;
}
@@ -569,6 +561,7 @@ impl<'a> Handler<'a> {
assert!(self.session.is_none());
let device_id = device_id::get_or_create().map_err(|e| Error::DeviceId(e.to_string()))?;
self.telemetry.set_firezone_id(device_id.id.clone());
let url = LoginUrl::client(
Url::parse(api_url).map_err(|e| Error::UrlParse(e.to_string()))?,

View File

@@ -20,7 +20,6 @@ use phoenix_channel::LoginUrl;
use phoenix_channel::PhoenixChannel;
use secrecy::{Secret, SecretString};
use std::{
collections::BTreeMap,
path::{Path, PathBuf},
sync::Arc,
};
@@ -128,7 +127,7 @@ fn main() -> Result<()> {
// and we need to recover. <https://github.com/firezone/firezone/issues/4899>
dns_controller.deactivate()?;
let telemetry = Telemetry::default();
let mut telemetry = Telemetry::default();
telemetry.start(
cli.api_url.as_ref(),
VERSION,
@@ -175,15 +174,7 @@ fn main() -> Result<()> {
Some(id) => id,
None => device_id::get_or_create().context("Could not get `firezone_id` from CLI, could not read it from disk, could not generate it and save it to disk")?.id,
};
firezone_telemetry::configure_scope(|scope| {
scope.set_context(
"firezone",
firezone_telemetry::Context::Other(BTreeMap::from([(
"id".to_string(),
firezone_id.clone().into(),
)])),
)
});
telemetry.set_firezone_id(firezone_id.clone());
let url = LoginUrl::client(
cli.api_url,

View File

@@ -4,7 +4,6 @@ version = "0.1.0"
edition = "2021"
[dependencies]
arc-swap = "1.7.1"
sentry = { version = "0.34.0", default-features = false, features = ["contexts", "backtrace", "debug-images", "panic", "reqwest", "rustls", "tracing"] }
sentry-anyhow = "0.34.0"
tokio = { workspace = true, features = ["rt"] }

View File

@@ -1,9 +1,7 @@
use arc_swap::ArcSwapOption;
use std::time::Duration;
pub use sentry::{
add_breadcrumb, capture_error, capture_message, configure_scope, end_session,
end_session_with_status,
add_breadcrumb, capture_error, capture_message, end_session, end_session_with_status,
types::protocol::v7::{Context, SessionStatus},
Breadcrumb, Hub, Level,
};
@@ -36,22 +34,17 @@ pub struct Telemetry {
/// be able to gracefully close down Sentry, even though it won't have
/// a mutable handle to it, and even though `main` is actually what holds
/// the handle. Wrapping it all in `ArcSwap` makes it simpler.
inner: ArcSwapOption<sentry::ClientInitGuard>,
}
inner: Option<sentry::ClientInitGuard>,
impl Clone for Telemetry {
fn clone(&self) -> Self {
Self {
inner: ArcSwapOption::new(self.inner.load().clone()),
}
}
account_slug: Option<String>,
firezone_id: Option<String>,
}
impl Telemetry {
pub fn start(&self, api_url: &str, release: &str, dsn: Dsn) {
pub fn start(&mut self, api_url: &str, release: &str, dsn: Dsn) {
// Since it's `arc_swap` and not `Option`, there is a TOCTOU here,
// but in practice it should never trigger
if self.inner.load().is_some() {
if self.inner.is_some() {
return;
}
@@ -88,13 +81,15 @@ impl Telemetry {
let ctx = sentry::integrations::contexts::utils::rust_context();
scope.set_context("rust", ctx);
});
self.inner.swap(Some(inner.into()));
self.inner.replace(inner);
sentry::start_session();
self.update_user_context();
}
/// Flushes events to sentry.io and drops the guard
pub async fn stop(&self) {
let Some(inner) = self.inner.swap(None) else {
pub async fn stop(&mut self) {
let Some(inner) = self.inner.take() else {
return;
};
tracing::info!("Stopping telemetry");
@@ -113,16 +108,31 @@ impl Telemetry {
})
.await;
}
}
/// Sets the Firezone account slug on the Sentry hub
///
/// This sets the entire set of "user" info, so it will conflict if we set any other user ID later.
pub fn set_account_slug(account_slug: String) {
let mut user = sentry::User::default();
user.other
.insert("account_slug".to_string(), account_slug.into());
sentry::Hub::main().configure_scope(|scope| scope.set_user(Some(user)));
pub fn set_account_slug(&mut self, slug: String) {
self.account_slug = Some(slug);
self.update_user_context();
}
pub fn set_firezone_id(&mut self, id: String) {
self.firezone_id = Some(id);
self.update_user_context();
}
fn update_user_context(&self) {
let mut user = sentry::User {
id: self.firezone_id.clone(),
..Default::default()
};
user.other.extend(
self.account_slug
.clone()
.map(|slug| ("account_slug".to_owned(), slug.into())),
);
sentry::Hub::main().configure_scope(|scope| scope.set_user(Some(user)));
}
}
#[cfg(test)]
@@ -136,7 +146,7 @@ mod tests {
async fn sentry() {
// Smoke-test Sentry itself by turning it on and off a couple times
{
let tele = Telemetry::default();
let mut tele = Telemetry::default();
// Expect no telemetry because the telemetry module needs to be enabled before it can do anything
negative_error("X7X4CKH3");
@@ -166,13 +176,13 @@ mod tests {
// Test starting up with the choice opted-in
{
{
let tele = Telemetry::default();
let mut tele = Telemetry::default();
negative_error("4H7HFTNX");
tele.start("test", ENV, HEADLESS_DSN);
}
{
negative_error("GF46D6IL");
let tele = Telemetry::default();
let mut tele = Telemetry::default();
tele.start("test", ENV, HEADLESS_DSN);
error("OKOEUKSW");
}