refactor(gui-client): remove the heartbeat module (#5682)

We added this to diagnose a hang in the IPC service, #5441. That hang,
to the best of our knowledge, was caused by a deadlock which we fixed in
#5571. So the heartbeat task just adds a lot of noise to the stdout
which is annoying for debugging and won't be used in production logs.

The system uptime measuring is still useful, so we now log that just
once when logging starts, next to the git version and log directives.

If we see this pattern in either process' logs, we know something is
suspicious:
- Log file ends without a clean shutdown message
- Next log file starts with a high system uptime

Updates should always result in a clean shutdown message, and a sudden
power loss (mains power outage, or laptop battery dying) would result in
the system uptime being low for the 2nd log file.
This commit is contained in:
Reactor Scram
2024-07-02 18:33:47 +00:00
committed by GitHub
parent ee8b42af26
commit 4b6b706d46
7 changed files with 15 additions and 72 deletions

View File

@@ -29,7 +29,7 @@ use settings::AdvancedSettings;
/// * `g` doesn't mean anything
/// * `ed5437c88` is the Git commit hash
/// * `-modified` is present if the working dir has any changes from that commit number
pub const GIT_VERSION: &str = git_version::git_version!(
const GIT_VERSION: &str = git_version::git_version!(
args = ["--always", "--dirty=-modified", "--tags"],
fallback = "unknown"
);
@@ -157,7 +157,12 @@ fn fix_log_filter(settings: &mut AdvancedSettings) -> Result<()> {
/// Don't drop the log handle or logging will stop.
fn start_logging(directives: &str) -> Result<logging::Handles> {
let logging_handles = logging::setup(directives)?;
tracing::info!(?GIT_VERSION, "`gui-client` started logging");
tracing::info!(
?directives,
?GIT_VERSION,
system_uptime_seconds = firezone_headless_client::uptime::get().map(|dur| dur.as_secs()),
"`gui-client` started logging"
);
Ok(logging_handles)
}

View File

@@ -80,7 +80,6 @@ pub(crate) fn run(
// Needed for the deep link server
let rt = tokio::runtime::Runtime::new().context("Couldn't start Tokio runtime")?;
let _guard = rt.enter();
rt.spawn(firezone_headless_client::heartbeat::heartbeat());
let (ctlr_tx, ctlr_rx) = mpsc::channel(5);

View File

@@ -1,61 +0,0 @@
//! A heartbeat that logs to `INFO` at exponentially increasing intervals
//! so it won't take up much disk space
//!
//! The IPC service is quiet when signed out, and the GUI is quiet when it's in steady
//! state, so this heartbeat allows us to estimate roughly how long each process stayed
//! up when looking at user logs, using unlimited disk space per run of the app.
use crate::uptime;
use std::time::Duration;
use tokio::time::{sleep_until, Instant};
/// Logs a heartbeat to `tracing::info!`. Put this in a Tokio task and forget about it.
pub async fn heartbeat() {
let mut hb = Heartbeat::default();
loop {
sleep_until(hb.next_instant).await;
let system_uptime = uptime::get();
tracing::info!(?system_uptime, "Heartbeat");
hb.tick();
}
}
struct Heartbeat {
next_instant: Instant,
dur: Duration,
}
impl Default for Heartbeat {
fn default() -> Self {
Self {
next_instant: Instant::now(),
dur: Duration::from_secs(1),
}
}
}
impl Heartbeat {
fn tick(&mut self) {
self.next_instant += self.dur;
self.dur *= 2;
}
}
#[cfg(test)]
mod tests {
/// Make sure this can run for a few years with no issue
#[test]
fn years() {
let mut hb = super::Heartbeat::default();
const SECONDS_PER_DAY: u64 = 86_400;
const DAYS_PER_YEAR: u64 = 365;
let far_future =
hb.next_instant + std::time::Duration::from_secs(SECONDS_PER_DAY * DAYS_PER_YEAR * 50);
// It will only print 32 lines or so for the next 50+ years
for _ in 0..50 {
hb.tick();
}
assert!(hb.next_instant > far_future);
}
}

View File

@@ -91,7 +91,6 @@ fn run_debug_ipc_service() -> Result<()> {
crate::setup_stdout_logging()?;
let rt = tokio::runtime::Runtime::new()?;
let _guard = rt.enter();
rt.spawn(crate::heartbeat::heartbeat());
let mut signals = Signals::new()?;
// Couldn't get the loop to work here yet, so SIGHUP is not implemented
@@ -297,11 +296,15 @@ fn setup_logging(log_dir: Option<PathBuf>) -> Result<connlib_client_shared::file
std::fs::create_dir_all(&log_dir)
.context("We should have permissions to create our log dir")?;
let (layer, handle) = file_logger::layer(&log_dir);
let log_filter = get_log_filter().context("Couldn't read log filter")?;
let filter = EnvFilter::new(&log_filter);
let directives = get_log_filter().context("Couldn't read log filter")?;
let filter = EnvFilter::new(&directives);
let subscriber = Registry::default().with(layer.with_filter(filter));
set_global_default(subscriber).context("`set_global_default` should always work)")?;
tracing::info!(git_version = crate::GIT_VERSION, ?log_filter);
tracing::info!(
git_version = crate::GIT_VERSION,
system_uptime_seconds = crate::uptime::get().map(|dur| dur.as_secs()),
?directives
);
Ok(handle)
}

View File

@@ -10,7 +10,6 @@ pub(crate) fn run_ipc_service(cli: CliCommon) -> Result<()> {
anyhow::bail!("This is the IPC service binary, it's not meant to run interactively.");
}
let rt = tokio::runtime::Runtime::new()?;
rt.spawn(crate::heartbeat::heartbeat());
if let Err(error) = rt.block_on(super::ipc_listen()) {
tracing::error!(?error, "`ipc_listen` failed");
}

View File

@@ -76,7 +76,6 @@ fn fallible_windows_service_run(
tracing::info!(?arguments, "fallible_windows_service_run");
let rt = tokio::runtime::Runtime::new()?;
rt.spawn(crate::heartbeat::heartbeat());
let ipc_task = rt.spawn(super::ipc_listen());
let ipc_task_ah = ipc_task.abort_handle();

View File

@@ -29,11 +29,10 @@ use platform::Signals;
pub(crate) mod device_id;
// Pub because the GUI reads the system resolvers
pub mod dns_control;
pub mod heartbeat;
mod ipc_service;
pub mod known_dirs;
mod standalone;
mod uptime;
pub mod uptime;
#[cfg(target_os = "linux")]
#[path = "linux.rs"]