feat(rust/gui-client): add sentry.io error reporting (#6782)

Refs #6138 

Sentry is always enabled for now. In the near future we'll make it
opt-out per device and opt-in per org (see #6138 for details)

- Replaces the `crash_handling` module
- Catches panics in GUI process, tunnel daemon, and Headless Client
- Added a couple "breadcrumbs" to play with that feature
- User ID is not set yet
- Environment is set to the API URL, e.g. `wss://api.firezone.dev`
- Reports panics from the connlib async task
- Release should be automatically pulled from the Cargo version which we
automatically set in the version Makefile

Example screenshot of sentry.io with a caught panic:

<img width="861" alt="image"
src="https://github.com/user-attachments/assets/c5188d86-10d0-4d94-b503-3fba51a21a90">
This commit is contained in:
Reactor Scram
2024-09-27 11:34:54 -05:00
committed by GitHub
parent d35a9c4615
commit 05a2b28d9f
21 changed files with 1414 additions and 1363 deletions

View File

@@ -28,7 +28,7 @@ outputs:
value: ${{
(runner.os == 'Linux' && '--workspace') ||
(runner.os == 'macOS' && '-p connlib-client-apple -p connlib-client-shared -p firezone-tunnel -p snownet') ||
(runner.os == 'Windows' && '-p connlib-client-shared -p firezone-bin-shared -p firezone-gui-client -p firezone-gui-client-common -p firezone-headless-client -p firezone-iced-client -p firezone-logging -p firezone-tunnel -p gui-smoke-test -p snownet') }}
(runner.os == 'Windows' && '-p connlib-client-shared -p firezone-bin-shared -p firezone-gui-client -p firezone-gui-client-common -p firezone-headless-client -p firezone-iced-client -p firezone-logging -p firezone-telemetry -p firezone-tunnel -p gui-smoke-test -p snownet') }}
runs:
using: "composite"

View File

@@ -161,7 +161,7 @@ jobs:
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
tool: dump_syms,minidump-stackwalk
tool: dump_syms
- name: Run smoke test
working-directory: ./rust
run: cargo run -p gui-smoke-test

2132
rust/Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@@ -17,6 +17,7 @@ members = [
"phoenix-channel",
"relay",
"socket-factory",
"telemetry",
"tests/gui-smoke-test",
"tests/http-test-server",
"tun"
@@ -47,11 +48,12 @@ rustls = { version = "0.23.10", default-features = false, features = ["ring"] }
connlib-client-android = { path = "connlib/clients/android" }
connlib-client-apple = { path = "connlib/clients/apple" }
connlib-client-shared = { path = "connlib/clients/shared" }
firezone-bin-shared = { path = "bin-shared" }
firezone-gateway = { path = "gateway" }
firezone-headless-client = { path = "headless-client" }
firezone-gui-client = { path = "gui-client/src-tauri" }
firezone-logging = { path = "logging" }
firezone-bin-shared = { path = "bin-shared" }
firezone-telemetry = { path = "telemetry" }
snownet = { path = "connlib/snownet" }
firezone-relay = { path = "relay" }
connlib-shared = { path = "connlib/shared" }

View File

@@ -11,6 +11,7 @@ anyhow = "1.0.82"
backoff = { workspace = true }
bimap = "0.6"
connlib-shared = { workspace = true }
firezone-telemetry = { workspace = true }
firezone-tunnel = { workspace = true }
ip_network = { version = "0.4", default-features = false }
phoenix-channel = { workspace = true }

View File

@@ -8,6 +8,7 @@ pub use firezone_tunnel::keypair;
use connlib_shared::messages::ResourceId;
use eventloop::Command;
use firezone_telemetry as telemetry;
use firezone_tunnel::ClientTunnel;
use messages::{IngressMessages, ReplyMessages};
use phoenix_channel::PhoenixChannel;
@@ -157,29 +158,33 @@ async fn connect_supervisor<CB>(
tracing::info!("connlib exited gracefully");
}
Ok(Err(e)) => {
telemetry::capture_error(&e);
tracing::error!("connlib failed: {e}");
callbacks.on_disconnect(&e);
}
Err(e) => match e.try_into_panic() {
Ok(panic) => {
if let Some(msg) = panic.downcast_ref::<&str>() {
tracing::error!("connlib panicked: {msg}");
callbacks.on_disconnect(&DisconnectError::Panic(msg.to_string()));
return;
}
if let Some(msg) = panic.downcast_ref::<String>() {
tracing::error!("connlib panicked: {msg}");
callbacks.on_disconnect(&DisconnectError::Panic(msg.to_string()));
return;
}
Err(e) => {
telemetry::capture_error(&e);
match e.try_into_panic() {
Ok(panic) => {
if let Some(msg) = panic.downcast_ref::<&str>() {
tracing::error!("connlib panicked: {msg}");
callbacks.on_disconnect(&DisconnectError::Panic(msg.to_string()));
return;
}
if let Some(msg) = panic.downcast_ref::<String>() {
tracing::error!("connlib panicked: {msg}");
callbacks.on_disconnect(&DisconnectError::Panic(msg.to_string()));
return;
}
tracing::error!("connlib panicked with a non-string payload");
callbacks.on_disconnect(&DisconnectError::PanicNonStringPayload);
tracing::error!("connlib panicked with a non-string payload");
callbacks.on_disconnect(&DisconnectError::PanicNonStringPayload);
}
Err(_) => {
tracing::error!("connlib task was cancelled");
callbacks.on_disconnect(&DisconnectError::Cancelled);
}
}
Err(_) => {
tracing::error!("connlib task was cancelled");
callbacks.on_disconnect(&DisconnectError::Cancelled);
}
},
}
}
}

View File

@@ -9,13 +9,12 @@ anyhow = { version = "1.0" }
arboard = { version = "3.4.0", default-features = false }
atomicwrites = { workspace = true }
connlib-shared = { workspace = true }
crash-handler = "0.6.2"
firezone-bin-shared = { workspace = true }
firezone-headless-client = { path = "../../headless-client" }
firezone-logging = { workspace = true }
firezone-telemetry = { workspace = true }
futures = { version = "0.3", default-features = false }
hex = "0.4.3"
minidumper = "0.8.3"
native-dialog = "0.7.0"
output_vt100 = "0.1"
png = "0.17.13" # `png` is mostly free since we already need it for Tauri

View File

@@ -13,6 +13,7 @@ use firezone_headless_client::{
IpcClientMsg::{self, SetDisabledResources},
IpcServerMsg, IpcServiceError, LogFilterReloader,
};
use firezone_telemetry::Telemetry;
use secrecy::{ExposeSecret as _, SecretString};
use std::{collections::BTreeSet, path::PathBuf, time::Instant};
use tokio::sync::{mpsc, oneshot};
@@ -39,6 +40,7 @@ pub struct Controller<I: GuiIntegration> {
release: Option<updates::Release>,
rx: mpsc::Receiver<ControllerRequest>,
status: Status,
telemetry: Telemetry,
updates_rx: mpsc::Receiver<Option<updates::Notification>>,
uptime: crate::uptime::Tracker,
}
@@ -49,6 +51,7 @@ pub struct Builder<I: GuiIntegration> {
pub integration: I,
pub log_filter_reloader: LogFilterReloader,
pub rx: mpsc::Receiver<ControllerRequest>,
pub telemetry: Telemetry,
pub updates_rx: mpsc::Receiver<Option<updates::Notification>>,
}
@@ -60,6 +63,7 @@ impl<I: GuiIntegration> Builder<I> {
integration,
log_filter_reloader,
rx,
telemetry,
updates_rx,
} = self;
@@ -77,6 +81,7 @@ impl<I: GuiIntegration> Builder<I> {
release: None,
rx,
status: Default::default(),
telemetry,
updates_rx,
uptime: Default::default(),
})
@@ -183,6 +188,16 @@ impl Status {
impl<I: GuiIntegration> Controller<I> {
pub async fn main_loop(mut self) -> Result<(), Error> {
// Start telemetry
{
let environment = self.advanced_settings.api_url.to_string();
self.telemetry
.start(environment.clone(), firezone_telemetry::GUI_DSN);
self.ipc_client
.send_msg(&IpcClientMsg::StartTelemetry { environment })
.await?;
}
if let Some(token) = self
.auth
.token()
@@ -267,6 +282,8 @@ impl<I: GuiIntegration> Controller<I> {
tracing::error!(?error, "ipc_client");
}
// Don't close telemetry here, `run` will close it.
Ok(())
}

View File

@@ -1,200 +0,0 @@
//! A module for handling crashes and writing minidump files
//!
//! Mostly copied from <https://github.com/EmbarkStudios/crash-handling/blob/main/minidumper/examples/diskwrite.rs>
//!
//! TODO: Capture crash dumps on panic. <https://github.com/firezone/firezone/issues/3520>
//!
//! To get human-usable stack traces out of a dump, do this:
//! (Copied from <https://github.com/firezone/firezone/issues/3111#issuecomment-1887975171>)
//!
//! - Get the pdb corresponding to the client exe
//! - `cargo install --locked dump_syms minidump-stackwalk`
//! - Use dump_syms to convert the pdb to a syms file
//! - `minidump-stackwalk --symbols-path firezone.syms crash.dmp`
use anyhow::{anyhow, bail, Context, Result};
use crash_handler::CrashHandler;
use firezone_headless_client::known_dirs;
use std::{fs::File, io::Write, path::PathBuf};
use time::OffsetDateTime;
/// Attaches a crash handler to the client process
///
/// Returns a CrashHandler that must be kept alive until the program exits.
/// Dropping the handler will detach it.
///
/// If you need this on non-Windows, re-visit
/// <https://github.com/EmbarkStudios/crash-handling/blob/main/minidumper/examples/diskwrite.rs>
/// Linux has a special `set_ptracer` call that is handy
/// MacOS needs a special `ping` call to flush messages inside the crash handler
pub fn attach_handler() -> Result<CrashHandler> {
// Attempt to connect to the server
let (client, _server) = start_server_and_connect()?;
// SAFETY: Unsafe is required here because this will run after the program
// has crashed. We should try to do as little as possible, basically just
// tell the crash handler process to get our minidump and then return.
// https://docs.rs/crash-handler/0.6.0/crash_handler/trait.CrashEvent.html#safety
let handler = CrashHandler::attach(unsafe {
crash_handler::make_crash_event(move |crash_context| {
let handled = client.request_dump(crash_context).is_ok();
tracing::error!("Firezone crashed and wrote a crash dump.");
crash_handler::CrashEventResult::Handled(handled)
})
})
.context("failed to attach signal handler")?;
Ok(handler)
}
/// Main function for the server process, for out-of-process crash handling.
///
/// The server process seems to be the preferred method,
/// since it's hard to run complex code in a process
/// that's already crashed and likely suffered memory corruption.
///
/// <https://jake-shadle.github.io/crash-reporting/#implementation>
/// <https://chromium.googlesource.com/breakpad/breakpad/+/master/docs/getting_started_with_breakpad.md#terminology>
pub fn server(socket_path: PathBuf) -> Result<()> {
let mut server = minidumper::Server::with_name(&*socket_path)?;
let ab = std::sync::atomic::AtomicBool::new(false);
server.run(Box::new(Handler::default()), &ab, None)?;
Ok(())
}
fn start_server_and_connect() -> Result<(minidumper::Client, std::process::Child)> {
let exe = std::env::current_exe().context("unable to find our own exe path")?;
// Path of a Unix domain socket for IPC with the crash handler server
// <https://github.com/EmbarkStudios/crash-handling/issues/10>
let socket_path = known_dirs::runtime()
.context("`known_dirs::runtime` failed")?
.join("crash_handler_pipe");
std::fs::create_dir_all(
socket_path
.parent()
.context("`known_dirs::runtime` should have a parent")?,
)
.context("Failed to create dir for crash_handler_pipe")?;
let mut server = None;
// I don't understand why there's a loop here. The original was an infinite loop,
// so I reduced it to 10 and it still worked.
// <https://github.com/EmbarkStudios/crash-handling/blob/16c2545f2a46b6b21d1e401cfeaf0d5b9a130b08/minidumper/examples/diskwrite.rs#L72>
for _ in 0..10 {
// Create the crash client first so we can error out if another instance of
// the Firezone client is already using this socket for crash handling.
if let Ok(client) = minidumper::Client::with_name(&*socket_path) {
return Ok((
client,
server.ok_or_else(|| {
anyhow!(
"should be impossible to make a client if we didn't make the server yet"
)
})?,
));
}
server = Some(
std::process::Command::new(&exe)
.arg("crash-handler-server")
.arg(&socket_path)
.spawn()
.context("unable to spawn server process")?,
);
// Give it time to start
std::thread::sleep(std::time::Duration::from_millis(100));
}
bail!("Couldn't set up crash handler server")
}
/// Crash handler that runs inside the crash handler process.
///
/// The minidumper docs call this the "server" process because it's an IPC server,
/// not to be confused with network servers for Firezone itself.
struct Handler {
start_time: OffsetDateTime,
}
impl Default for Handler {
fn default() -> Self {
// Capture the time at startup so that the crash dump file will have
// a similar timestamp to the log file
Self {
start_time: OffsetDateTime::now_utc(),
}
}
}
impl minidumper::ServerHandler for Handler {
/// Called when a crash has been received and a backing file needs to be
/// created to store it.
#[expect(clippy::print_stderr)]
fn create_minidump_file(&self) -> Result<(File, PathBuf), std::io::Error> {
let format = time::format_description::parse(firezone_logging::file::TIME_FORMAT)
.expect("static format description should always be parsable");
let date = self
.start_time
.format(&format)
.expect("formatting a timestamp should always be possible");
let dump_path = known_dirs::logs()
.expect("Should be able to find logs dir to put crash dump in")
.join(format!("crash.{date}.dmp"));
// `tracing` is unlikely to work inside the crash handler subprocess, so
// just print to stderr and it may show up on the terminal. This helps in CI / local dev.
eprintln!("Creating minidump at {}", dump_path.display());
let Some(dir) = dump_path.parent() else {
return Err(std::io::ErrorKind::NotFound.into());
};
std::fs::create_dir_all(dir)?;
let file = File::create(&dump_path)?;
Ok((file, dump_path))
}
/// Called when a crash has been fully written as a minidump to the provided
/// file. Also returns the full heap buffer as well.
fn on_minidump_created(
&self,
result: Result<minidumper::MinidumpBinary, minidumper::Error>,
) -> minidumper::LoopAction {
match result {
Ok(mut md_bin) => {
let _ = md_bin.file.flush();
// Copy the timestamped crash file to a well-known filename,
// this makes it easier for the smoke test to find it
std::fs::copy(
&md_bin.path,
known_dirs::logs()
.expect("Should be able to find logs dir")
.join("last_crash.dmp"),
)
.ok();
tracing::info!("wrote minidump to disk");
}
Err(e) => {
tracing::error!("failed to write minidump: {:#}", e);
}
}
// Tells the server to exit, which will in turn exit the process
minidumper::LoopAction::Exit
}
fn on_message(&self, kind: u32, buffer: Vec<u8>) {
tracing::info!(
"kind: {kind}, message: {}",
String::from_utf8(buffer).expect("message should be valid UTF-8")
);
}
fn on_client_disconnected(&self, num_clients: usize) -> minidumper::LoopAction {
if num_clients == 0 {
minidumper::LoopAction::Exit
} else {
minidumper::LoopAction::Continue
}
}
}

View File

@@ -1,7 +1,6 @@
pub mod auth;
pub mod compositor;
pub mod controller;
pub mod crash_handling;
pub mod deep_link;
pub mod errors;
pub mod ipc;

View File

@@ -22,6 +22,7 @@ firezone-bin-shared = { workspace = true }
firezone-gui-client-common = { path = "../src-common" }
firezone-headless-client = { path = "../../headless-client" }
firezone-logging = { workspace = true }
firezone-telemetry = { workspace = true }
native-dialog = "0.7.0"
rand = "0.8.5"
rustls = { workspace = true }
@@ -38,7 +39,6 @@ thiserror = { version = "1.0", default-features = false }
tokio = { workspace = true, features = ["signal", "time", "macros", "rt", "rt-multi-thread"] }
tokio-util = { version = "0.7.11", features = ["codec"] }
tracing = { workspace = true }
tracing-panic = "0.1.2"
tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
url = { version = "2.5.2", features = ["serde"] }
uuid = { version = "1.10.0", features = ["v4"] }

View File

@@ -1,9 +1,10 @@
use anyhow::{bail, Context as _, Result};
use clap::{Args, Parser};
use firezone_gui_client_common::{
self as common, controller::Failure, crash_handling, deep_link, settings::AdvancedSettings,
self as common, controller::Failure, deep_link, settings::AdvancedSettings,
};
use std::path::PathBuf;
use firezone_telemetry as telemetry;
use std::collections::BTreeMap;
use tracing::instrument;
use tracing_subscriber::EnvFilter;
@@ -24,7 +25,6 @@ pub(crate) enum Error {
/// The program's entry point, equivalent to `main`
#[instrument(skip_all)]
pub(crate) fn run() -> Result<()> {
std::panic::set_hook(Box::new(tracing_panic::panic_hook));
let cli = Cli::parse();
rustls::crypto::ring::default_provider()
@@ -46,7 +46,6 @@ pub(crate) fn run() -> Result<()> {
}
}
}
Some(Cmd::CrashHandlerServer { socket_path }) => crash_handling::server(socket_path),
Some(Cmd::Debug { command }) => debug_commands::run(command),
// If we already tried to elevate ourselves, don't try again
Some(Cmd::Elevated) => run_gui(cli),
@@ -60,12 +59,14 @@ pub(crate) fn run() -> Result<()> {
Some(Cmd::SmokeTest) => {
// Can't check elevation here because the Windows CI is always elevated
let settings = common::settings::load_advanced_settings().unwrap_or_default();
let telemetry = telemetry::Telemetry::default();
telemetry.start(settings.api_url.to_string(), telemetry::GUI_DSN);
// Don't fix the log filter for smoke tests
let common::logging::Handles {
logger: _logger,
reloader,
} = start_logging(&settings.log_filter)?;
let result = gui::run(cli, settings, reloader);
let result = gui::run(cli, settings, reloader, telemetry);
if let Err(error) = &result {
// In smoke-test mode, don't show the dialog, since it might be running
// unattended in CI and the dialog would hang forever
@@ -85,12 +86,15 @@ pub(crate) fn run() -> Result<()> {
// Can't `instrument` this because logging isn't running when we enter it.
fn run_gui(cli: Cli) -> Result<()> {
let mut settings = common::settings::load_advanced_settings().unwrap_or_default();
let telemetry = telemetry::Telemetry::default();
// In the future telemetry will be opt-in per organization, that's why this isn't just at the top of `main`
telemetry.start(settings.api_url.to_string(), telemetry::GUI_DSN);
fix_log_filter(&mut settings)?;
let common::logging::Handles {
logger: _logger,
reloader,
} = start_logging(&settings.log_filter)?;
let result = gui::run(cli, settings, reloader);
let result = gui::run(cli, settings, reloader, telemetry);
// Make sure errors get logged, at least to stderr
if let Err(error) = &result {
@@ -123,14 +127,26 @@ fn fix_log_filter(settings: &mut AdvancedSettings) -> Result<()> {
/// Don't drop the log handle or logging will stop.
fn start_logging(directives: &str) -> Result<common::logging::Handles> {
let logging_handles = common::logging::setup(directives)?;
let git_version = firezone_bin_shared::git_version!("gui-client-*");
let system_uptime_seconds = firezone_headless_client::uptime::get().map(|dur| dur.as_secs());
tracing::info!(
arch = std::env::consts::ARCH,
os = std::env::consts::OS,
?directives,
git_version = firezone_bin_shared::git_version!("gui-client-*"),
system_uptime_seconds = firezone_headless_client::uptime::get().map(|dur| dur.as_secs()),
?git_version,
?system_uptime_seconds,
"`gui-client` started logging"
);
telemetry::add_breadcrumb(telemetry::Breadcrumb {
ty: "logging_start".into(),
category: None,
data: BTreeMap::from([
("directives".into(), directives.into()),
("git_version".into(), git_version.into()),
("system_uptime_seconds".into(), system_uptime_seconds.into()),
]),
..Default::default()
});
Ok(logging_handles)
}
@@ -189,9 +205,6 @@ impl Cli {
#[derive(clap::Subcommand)]
pub enum Cmd {
CrashHandlerServer {
socket_path: PathBuf,
},
Debug {
#[command(subcommand)]
command: debug_commands::Cmd,

View File

@@ -11,13 +11,14 @@ use anyhow::{bail, Context, Result};
use common::system_tray::Event as TrayMenuEvent;
use firezone_gui_client_common::{
self as common,
controller::{Builder as ControllerBuilder, ControllerRequest, CtlrTx, GuiIntegration},
crash_handling, deep_link,
controller::{ControllerRequest, CtlrTx, GuiIntegration},
deep_link,
errors::{self, Error},
settings::AdvancedSettings,
updates,
};
use firezone_headless_client::LogFilterReloader;
use firezone_telemetry as telemetry;
use secrecy::{ExposeSecret as _, SecretString};
use std::{str::FromStr, time::Duration};
use tauri::{Manager, SystemTrayEvent};
@@ -119,18 +120,8 @@ pub(crate) fn run(
cli: client::Cli,
advanced_settings: AdvancedSettings,
reloader: LogFilterReloader,
telemetry: telemetry::Telemetry,
) -> Result<(), Error> {
// Need to keep this alive so crashes will be handled. Dropping detaches it.
let _crash_handler = match crash_handling::attach_handler() {
Ok(x) => Some(x),
Err(error) => {
// TODO: None of these logs are actually written yet
// <https://github.com/firezone/firezone/issues/3211>
tracing::warn!(?error, "Did not set up crash handler");
None
}
};
// Needed for the deep link server
let rt = tokio::runtime::Runtime::new().context("Couldn't start Tokio runtime")?;
let _guard = rt.enter();
@@ -225,6 +216,11 @@ pub(crate) fn run(
tracing::warn!(
"Will crash / error / panic on purpose in {delay} seconds to test error handling."
);
telemetry::add_breadcrumb(telemetry::Breadcrumb {
ty: "fail_on_purpose".into(),
message: Some("Will crash / error / panic on purpose to test error handling.".into()),
..Default::default()
});
tokio::time::sleep(Duration::from_secs(delay)).await;
tracing::warn!("Crashing / erroring / panicking on purpose");
ctlr_tx.send(ControllerRequest::Fail(failure)).await?;
@@ -251,19 +247,16 @@ pub(crate) fn run(
let app_handle = app.handle();
let _ctlr_task = tokio::spawn(async move {
let app_handle_2 = app_handle.clone();
// Spawn two nested Tasks so the outer can catch panics from the inner
let task = tokio::spawn(async move {
run_controller(
app_handle_2,
ctlr_tx,
ctlr_rx,
advanced_settings,
reloader,
updates_rx,
)
.await
});
let task = tokio::spawn(run_controller(
app_handle.clone(),
ctlr_tx,
ctlr_rx,
advanced_settings,
reloader,
telemetry.clone(),
updates_rx,
));
// See <https://github.com/tauri-apps/tauri/issues/8631>
// This should be the ONLY place we call `app.exit` or `app_handle.exit`,
@@ -274,17 +267,28 @@ pub(crate) fn run(
let exit_code = match task.await {
Err(error) => {
telemetry::capture_error(&error);
tracing::error!(?error, "run_controller panicked");
telemetry::end_session_with_status(telemetry::SessionStatus::Crashed);
1
}
Ok(Err(error)) => {
telemetry::capture_error(&error);
tracing::error!(?error, "run_controller returned an error");
errors::show_error_dialog(&error).unwrap();
telemetry::end_session_with_status(telemetry::SessionStatus::Crashed);
1
}
Ok(Ok(_)) => 0,
Ok(Ok(_)) => {
telemetry::end_session();
0
}
};
// In a normal Rust application, Sentry's guard would flush on drop: https://docs.sentry.io/platforms/rust/configuration/draining/
// But due to a limit in `tao` we cannot return from the event loop and must call `std::process::exit` (or Tauri's wrapper), so we explicitly flush here.
telemetry.stop();
tracing::info!(?exit_code);
app_handle.exit(exit_code);
});
@@ -442,17 +446,18 @@ async fn run_controller(
rx: mpsc::Receiver<ControllerRequest>,
advanced_settings: AdvancedSettings,
log_filter_reloader: LogFilterReloader,
telemetry: telemetry::Telemetry,
updates_rx: mpsc::Receiver<Option<updates::Notification>>,
) -> Result<(), Error> {
tracing::debug!("Entered `run_controller`");
let tray = system_tray::Tray::new(app.tray_handle());
let integration = TauriIntegration { app, tray };
let controller = ControllerBuilder {
let controller = firezone_gui_client_common::controller::Builder {
advanced_settings,
ctlr_tx,
integration,
integration: TauriIntegration { app, tray },
log_filter_reloader,
rx,
telemetry,
updates_rx,
}
.build()

View File

@@ -15,6 +15,7 @@ connlib-client-shared = { workspace = true }
connlib-shared = { workspace = true }
firezone-bin-shared = { workspace = true }
firezone-logging = { workspace = true }
firezone-telemetry = { workspace = true }
futures = "0.3.30"
humantime = "2.1"
ip-packet = { workspace = true }

View File

@@ -10,6 +10,7 @@ use firezone_bin_shared::{
platform::{tcp_socket_factory, udp_socket_factory, DnsControlMethod},
TunDeviceManager, TOKEN_ENV_KEY,
};
use firezone_telemetry::Telemetry;
use futures::{
future::poll_fn,
task::{Context, Poll},
@@ -81,6 +82,8 @@ pub enum ClientMsg {
Reset,
SetDns(Vec<IpAddr>),
SetDisabledResources(BTreeSet<ResourceId>),
StartTelemetry { environment: String },
StopTelemetry,
}
/// Messages that end up in the GUI, either forwarded from connlib or from the IPC service.
@@ -178,15 +181,21 @@ fn run_smoke_test() -> Result<()> {
// and we need to recover. <https://github.com/firezone/firezone/issues/4899>
dns_controller.deactivate()?;
let mut signals = signals::Terminate::new()?;
let telemetry = Telemetry::default();
// Couldn't get the loop to work here yet, so SIGHUP is not implemented
rt.block_on(async {
device_id::get_or_create().context("Failed to read / create device ID")?;
let mut server = IpcServer::new(ServiceId::Prod).await?;
let _ = Handler::new(&mut server, &mut dns_controller, &log_filter_reloader)
.await?
.run(&mut signals)
.await;
let _ = Handler::new(
&mut server,
&mut dns_controller,
&log_filter_reloader,
telemetry,
)
.await?
.run(&mut signals)
.await;
Ok::<_, anyhow::Error>(())
})
}
@@ -205,11 +214,13 @@ async fn ipc_listen(
device_id::get_or_create().context("Failed to read / create device ID")?;
let mut server = IpcServer::new(ServiceId::Prod).await?;
let mut dns_controller = DnsController { dns_control_method };
let telemetry = Telemetry::default();
loop {
let mut handler_fut = pin!(Handler::new(
&mut server,
&mut dns_controller,
log_filter_reloader
log_filter_reloader,
telemetry.clone(),
));
let Some(handler) = poll_fn(|cx| {
if let Poll::Ready(()) = signals.poll_recv(cx) {
@@ -241,6 +252,7 @@ struct Handler<'a> {
last_connlib_start_instant: Option<Instant>,
log_filter_reloader: &'a LogFilterReloader,
session: Option<Session>,
telemetry: Telemetry, // Handle to the sentry.io telemetry module
tun_device: TunDeviceManager,
}
@@ -271,6 +283,7 @@ impl<'a> Handler<'a> {
server: &mut IpcServer,
dns_controller: &'a mut DnsController,
log_filter_reloader: &'a LogFilterReloader,
telemetry: Telemetry,
) -> Result<Self> {
dns_controller.deactivate()?;
let (ipc_rx, ipc_tx) = server
@@ -286,6 +299,7 @@ impl<'a> Handler<'a> {
last_connlib_start_instant: None,
log_filter_reloader,
session: None,
telemetry,
tun_device,
})
}
@@ -484,6 +498,10 @@ impl<'a> Handler<'a> {
session.connlib.set_disabled_resources(disabled_resources);
}
ClientMsg::StartTelemetry { environment } => self
.telemetry
.start(environment, firezone_telemetry::IPC_SERVICE_DSN),
ClientMsg::StopTelemetry => self.telemetry.stop(),
}
Ok(())
}
@@ -494,6 +512,11 @@ impl<'a> Handler<'a> {
///
/// Throws matchable errors for bad URLs, unable to reach the portal, or unable to create the tunnel device
fn connect_to_firezone(&mut self, api_url: &str, token: SecretString) -> Result<(), Error> {
let ctx = firezone_telemetry::TransactionContext::new(
"connect_to_firezone",
"Connecting to Firezone",
);
let transaction = firezone_telemetry::start_transaction(ctx);
assert!(self.session.is_none());
let device_id = device_id::get_or_create().map_err(|e| Error::DeviceId(e.to_string()))?;
let (private_key, public_key) = keypair();
@@ -518,6 +541,7 @@ impl<'a> Handler<'a> {
};
// Synchronous DNS resolution here
let phoenix_span = transaction.start_child("phoenix", "Resolve DNS for PhoenixChannel");
let portal = PhoenixChannel::connect(
Secret::new(url),
get_user_agent(None, env!("CARGO_PKG_VERSION")),
@@ -529,6 +553,7 @@ impl<'a> Handler<'a> {
Arc::new(tcp_socket_factory),
)
.map_err(|e| Error::PortalConnection(e.to_string()))?;
phoenix_span.finish();
// Read the resolvers before starting connlib, in case connlib's startup interferes.
let dns = self.dns_controller.system_resolvers();
@@ -540,15 +565,18 @@ impl<'a> Handler<'a> {
// Call `set_dns` before `set_tun` so that the tunnel starts up with a valid list of resolvers.
tracing::debug!(?dns, "Calling `set_dns`...");
connlib.set_dns(dns);
let tun_span = transaction.start_child("tun", "Raise tunnel with `make_tun`");
let tun = self
.tun_device
.make_tun()
.map_err(|e| Error::TunnelDevice(e.to_string()))?;
connlib.set_tun(Box::new(tun));
tun_span.finish();
let session = Session { cb_rx, connlib };
self.session = Some(session);
transaction.finish();
Ok(())
}
}

View File

@@ -13,6 +13,7 @@ use firezone_bin_shared::{
use firezone_headless_client::{
device_id, signals, CallbackHandler, CliCommon, ConnlibMsg, DnsController,
};
use firezone_telemetry::Telemetry;
use futures::{FutureExt as _, StreamExt as _};
use phoenix_channel::PhoenixChannel;
use secrecy::{Secret, SecretString};
@@ -114,6 +115,8 @@ fn main() -> Result<()> {
let token_env_var = cli.token.take().map(SecretString::from);
let cli = cli;
let telemetry = Telemetry::default();
telemetry.start(cli.api_url.to_string(), firezone_telemetry::HEADLESS_DSN);
// Docs indicate that `remove_var` should actually be marked unsafe
// SAFETY: We haven't spawned any other threads, this code should be the first
@@ -186,11 +189,18 @@ fn main() -> Result<()> {
};
rt.block_on(async {
let ctx = firezone_telemetry::TransactionContext::new(
"connect_to_firezone",
"Connecting to Firezone",
);
let transaction = firezone_telemetry::start_transaction(ctx);
// The Headless Client will bail out here if there's no Internet, because `PhoenixChannel` will try to
// resolve the portal host and fail. This is intentional behavior. The Headless Client should always be running under a manager like `systemd` or Windows' Service Controller,
// so when it fails it will be restarted with backoff. `systemd` can additionally make us wait
// for an Internet connection if it launches us at startup.
// When running interactively, it is useful for the user to see that we can't reach the portal.
let phoenix_span = transaction.start_child("phoenix", "Connect PhoenixChannel");
let portal = PhoenixChannel::connect(
Secret::new(url),
get_user_agent(None, env!("CARGO_PKG_VERSION")),
@@ -201,6 +211,7 @@ fn main() -> Result<()> {
.build(),
Arc::new(tcp_socket_factory),
)?;
phoenix_span.finish();
let session = Session::connect(args, portal, rt.handle().clone());
let mut terminate = signals::Terminate::new()?;
@@ -223,10 +234,14 @@ fn main() -> Result<()> {
new_network_notifier(tokio_handle.clone(), dns_control_method).await?;
drop(tokio_handle);
let tun_span = transaction.start_child("tun", "Raise tunnel");
let tun = tun_device.make_tun()?;
session.set_tun(Box::new(tun));
tun_span.finish();
session.set_dns(dns_controller.system_resolvers());
transaction.finish();
let result = loop {
let mut dns_changed = pin!(dns_notifier.notified().fuse());
let mut network_changed = pin!(network_notifier.notified().fuse());

15
rust/telemetry/Cargo.toml Normal file
View File

@@ -0,0 +1,15 @@
[package]
name = "firezone-telemetry"
version = "0.1.0"
edition = "2021"
[dependencies]
arc-swap = "1.7.1"
sentry = { version = "0.34.0", default-features = false, features = ["panic", "reqwest", "rustls", "tracing"] }
tracing = { workspace = true }
[dev-dependencies]
thiserror = "1.0.63"
[lints]
workspace = true

166
rust/telemetry/src/lib.rs Normal file
View File

@@ -0,0 +1,166 @@
use arc_swap::ArcSwapOption;
use std::time::Duration;
pub use sentry::{
add_breadcrumb, capture_error, end_session, end_session_with_status, start_transaction,
types::protocol::v7::SessionStatus, Breadcrumb, TransactionContext,
};
pub struct Dsn(&'static str);
// TODO: Dynamic DSN
// Sentry docs say this does not need to be protected:
// > DSNs are safe to keep public because they only allow submission of new events and related event data; they do not allow read access to any information.
// <https://docs.sentry.io/concepts/key-terms/dsn-explainer/#dsn-utilization>
pub const GUI_DSN: Dsn = Dsn("https://2e17bf5ed24a78c0ac9e84a5de2bd6fc@o4507971108339712.ingest.us.sentry.io/4508008945549312");
pub const HEADLESS_DSN: Dsn = Dsn("https://bc27dca8bb37be0142c48c4f89647c13@o4507971108339712.ingest.us.sentry.io/4508010028728320");
pub const IPC_SERVICE_DSN: Dsn = Dsn("https://0590b89fd4479494a1e7ffa4dc705001@o4507971108339712.ingest.us.sentry.io/4508008896069632");
#[derive(Default)]
pub struct Telemetry {
/// The sentry guard itself
///
/// `arc_swap` is used here because the borrowing for the GUI process is
/// complex. If the user has already consented to using telemetry, we want
/// to start Sentry as soon as possible inside `main`, and keep the guard
/// alive there. But sentry's `Drop` hook will never get called during
/// normal GUI operation because Tauri internally calls `std::process::exit`
/// to bail out of its Tao event loop. So we want the GUI controller to
/// be able to gracefully close down Sentry, even though it won't have
/// a mutable handle to it, and even though `main` is actually what holds
/// the handle. Wrapping it all in `ArcSwap` makes it simpler.
inner: ArcSwapOption<sentry::ClientInitGuard>,
}
impl Clone for Telemetry {
fn clone(&self) -> Self {
Self {
inner: ArcSwapOption::new(self.inner.load().clone()),
}
}
}
impl Telemetry {
pub fn start(&self, environment: String, dsn: Dsn) {
// Since it's `arc_swap` and not `Option`, there is a TOCTOU here,
// but in practice it should never trigger
if self.inner.load().is_some() {
return;
}
tracing::info!("Starting telemetry");
let inner = sentry::init((
dsn.0,
sentry::ClientOptions {
environment: Some(environment.into()),
release: sentry::release_name!(),
traces_sample_rate: 1.0,
..Default::default()
},
));
self.inner.swap(Some(inner.into()));
sentry::start_session();
}
/// Flushes events to sentry.io and drops the guard
pub fn stop(&self) {
let Some(inner) = self.inner.swap(None) else {
return;
};
tracing::info!("Stopping telemetry");
sentry::end_session();
// `flush`'s return value is flipped from the docs
// <https://github.com/getsentry/sentry-rust/issues/677>
if inner.flush(Some(Duration::from_secs(5))) {
tracing::error!("Failed to flush telemetry events to sentry.io");
} else {
tracing::debug!("Flushed telemetry");
}
}
}
#[cfg(test)]
mod tests {
use super::*;
// To avoid problems with global mutable state, we run unrelated tests in the same test case.
#[test]
fn sentry() {
// Test this flush problem
{
let telemetry = sentry::init((
HEADLESS_DSN.0,
sentry::ClientOptions {
release: sentry::release_name!(),
traces_sample_rate: 1.0,
..Default::default()
},
));
sentry::start_session();
sentry::end_session();
// `flush`'s return value is flipped from the docs
// <https://github.com/getsentry/sentry-rust/issues/677>
assert!(!telemetry.flush(Some(Duration::from_secs(5))));
}
// Smoke-test Sentry itself by turning it on and off a couple times
{
let tele = Telemetry::default();
// Expect no telemetry because the telemetry module needs to be enabled before it can do anything
negative_error("X7X4CKH3");
tele.start("test".to_string(), HEADLESS_DSN);
// Expect telemetry because the user opted in.
sentry::add_breadcrumb(sentry::Breadcrumb {
ty: "test_crumb".into(),
message: Some("This breadcrumb may appear before error #QELADAGH".into()),
..Default::default()
});
error("QELADAGH");
tele.stop();
// Expect no telemetry because the user opted back out.
negative_error("2RSIYAPX");
tele.start("test".to_string(), HEADLESS_DSN);
// Cycle one more time to be sure.
error("S672IOBZ");
tele.stop();
// Expect no telemetry after the module is closed.
negative_error("W57GJKUO");
}
// Test starting up with the choice opted-in
{
{
let tele = Telemetry::default();
negative_error("4H7HFTNX");
tele.start("test".to_string(), HEADLESS_DSN);
}
{
negative_error("GF46D6IL");
let tele = Telemetry::default();
tele.start("test".to_string(), HEADLESS_DSN);
error("OKOEUKSW");
}
}
}
#[derive(Debug, thiserror::Error)]
enum Error {
#[error("Test error #{0}, this should appear in the sentry.io portal")]
Positive(&'static str),
#[error("Negative #{0} - You should NEVER see this")]
Negative(&'static str),
}
fn error(code: &'static str) {
sentry::capture_error(&Error::Positive(code));
}
fn negative_error(code: &'static str) {
sentry::capture_error(&Error::Negative(code));
}
}

View File

@@ -48,7 +48,7 @@ fn main() -> Result<()> {
gui.wait()?.fz_exit_ok().context("GUI process")?;
ipc_service.wait()?.fz_exit_ok().context("IPC service")?;
// Force the GUI to crash and then try to read the crash dump
// Force the GUI to crash
let mut ipc_service = ipc_service_command().arg("run-smoke-test").popen()?;
let mut gui = app.gui_command(&["--crash"])?.popen()?;
@@ -56,8 +56,6 @@ fn main() -> Result<()> {
gui.wait()?;
ipc_service.wait()?.fz_exit_ok().context("IPC service")?;
app.check_crash_dump()?;
if cli.manual_tests {
manual_tests(&app)?;
}
@@ -86,20 +84,6 @@ struct App {
username: String,
}
impl App {
fn check_crash_dump(&self) -> Result<()> {
Exec::cmd("minidump-stackwalk")
.args(&[
OsStr::new("--symbols-path"),
syms_path().as_os_str(),
self.crash_dump_path().as_os_str(),
])
.join()?
.fz_exit_ok()?;
Ok(())
}
}
#[cfg(target_os = "linux")]
impl App {
fn new() -> Result<Self> {
@@ -124,12 +108,6 @@ impl App {
Ok(Self { username })
}
fn crash_dump_path(&self) -> PathBuf {
Path::new("/home")
.join(&self.username)
.join(".cache/dev.firezone.client/data/logs/last_crash.dmp")
}
// `args` can't just be appended because of the `xvfb-run` wrapper
fn gui_command(&self, args: &[&str]) -> Result<Exec> {
let gui_path = gui_path().canonicalize()?;
@@ -168,13 +146,6 @@ impl App {
Ok(Self {})
}
fn crash_dump_path(&self) -> PathBuf {
let app_data = std::env::var("LOCALAPPDATA").expect("$LOCALAPPDATA should be set");
PathBuf::from(app_data)
.join("dev.firezone.client/data/logs")
.join("last_crash.dmp")
}
// Strange signature needed to match Linux
fn gui_command(&self, args: &[&str]) -> Result<Exec> {
Ok(Exec::cmd(gui_path()).arg("--no-deep-links").args(args))

View File

@@ -19,6 +19,9 @@ export default function GUI({ title }: { title: string }) {
Ensures Firefox doesn't attempt to use DNS over HTTPS when Firezone is
active.
</ChangeItem>
<ChangeItem pull="6782">
Adds always-on error reporting using sentry.io.
</ChangeItem>
</Unreleased>
<Entry version="1.3.6" date={new Date("2024-09-25")}>
<ChangeItem pull="6809">

View File

@@ -16,6 +16,9 @@ export default function Headless() {
Ensures Firefox doesn't attempt to use DNS over HTTPS when Firezone is
active.
</ChangeItem>
<ChangeItem pull="6782">
Adds always-on error reporting using sentry.io.
</ChangeItem>
</Unreleased>
<Entry version="1.3.3" date={new Date("2024-09-25")}>
<ChangeItem pull="6809">