refactor(gui-client): improve error handling in Windows service (#9178)

The current error handling logic in the Windows service is a bit dodgy.
We first initialise the logger but then pass the logging handle into the
`try_run_service` function. This means that any failures in starting the
Windows service aren't actually logged because the handle has been
dropped by then already.
This commit is contained in:
Thomas Eizinger
2025-05-21 00:46:46 +10:00
committed by GitHub
parent 0d75600b93
commit 00b40ae267

View File

@@ -1,6 +1,5 @@
use anyhow::{Context as _, Result, bail};
use anyhow::{Context as _, Result};
use firezone_bin_shared::DnsControlMethod;
use firezone_logging::FilterReloadHandle;
use firezone_telemetry::Telemetry;
use futures::channel::mpsc;
use std::path::PathBuf;
@@ -194,40 +193,31 @@ fn uninstall_tunnel_service(
///
/// Linux uses the CLI args from here, Windows does not
pub fn run(_log_dir: Option<PathBuf>, _dns_control: DnsControlMethod) -> Result<()> {
windows_service::service_dispatcher::start(SERVICE_NAME, ffi_service_run).context("windows_service::service_dispatcher failed. This isn't running in an interactive terminal, right?")
windows_service::service_dispatcher::start(SERVICE_NAME, run_service_ffi).context("windows_service::service_dispatcher failed. This isn't running in an interactive terminal, right?")
}
// Generates `ffi_service_run` from `service_run`
windows_service::define_windows_service!(ffi_service_run, service_run);
// Generates `run_service_ffi` from `service_run`
windows_service::define_windows_service!(run_service_ffi, run_service);
fn service_run(arguments: Vec<OsString>) {
fn run_service(arguments: Vec<OsString>) {
// `arguments` doesn't seem to work right when running as a Windows service
// (even though it's meant for that) so just use the default log dir.
let (handle, log_filter_reloader) =
let (_handle, log_filter_reloader) =
crate::logging::setup_tunnel(None).expect("Should be able to set up logging");
if let Err(error) = fallible_service_run(arguments, handle, log_filter_reloader) {
tracing::error!("`fallible_windows_service_run` returned an error: {error:#}");
}
}
// Most of the Windows-specific service stuff should go here
//
// The arguments don't seem to match the ones passed to the main thread at all.
//
// If Windows stops us gracefully, this function may never return.
fn fallible_service_run(
arguments: Vec<OsString>,
logging_handle: firezone_logging::file::Handle,
log_filter_reloader: FilterReloadHandle,
) -> Result<()> {
tracing::info!(?arguments, "fallible_windows_service_run");
if !elevation_check()? {
bail!("Tunnel service failed its elevation check, try running as admin / root");
tracing::info!(?arguments, "run_service");
if !elevation_check().is_ok_and(|elevated| elevated) {
tracing::info!("Tunnel service failed its elevation check, try running as admin / root");
return;
}
let rt = tokio::runtime::Builder::new_current_thread()
.enable_all()
.build()?;
.build()
.expect("Failed to create tokio runtime");
let (mut shutdown_tx, shutdown_rx) = mpsc::channel(1);
let event_handler = move |control_event| -> ServiceControlHandlerResult {
@@ -265,9 +255,18 @@ fn fallible_service_run(
}
};
let status_handle = match service_control_handler::register(SERVICE_NAME, event_handler)
.context("Failed to register Windows service")
{
Ok(handle) => handle,
Err(e) => {
tracing::error!("{e:#}");
return;
}
};
// Tell Windows that we're running (equivalent to sd_notify in systemd)
let status_handle = service_control_handler::register(SERVICE_NAME, event_handler)?;
status_handle.set_service_status(ServiceStatus {
let _ = status_handle.set_service_status(ServiceStatus {
service_type: SERVICE_TYPE,
current_state: ServiceState::Running,
controls_accepted: ServiceControlAccept::POWER_EVENT
@@ -277,79 +276,43 @@ fn fallible_service_run(
checkpoint: 0,
wait_hint: Duration::default(),
process_id: None,
})?;
});
let mut signals = firezone_bin_shared::signals::Terminate::from_channel(shutdown_rx);
let mut telemetry = Telemetry::default();
// Add new features in `service_run_async` if possible.
// We don't want to bail out of `fallible_service_run` and forget to tell
// Windows that we're shutting down.
let result = rt
.block_on(service_run_async(
.block_on(super::ipc_listen(
DnsControlMethod::Nrpt,
&log_filter_reloader,
&mut signals,
&mut telemetry,
shutdown_rx,
))
.inspect(|_| rt.block_on(telemetry.stop()))
.inspect(|()| {
tracing::info!("Windows service exited gracefully");
rt.block_on(telemetry.stop())
})
.inspect_err(|e| {
tracing::error!("Tunnel service failed: {e:#}");
rt.block_on(telemetry.stop_on_crash())
});
// Drop the logging handle so it flushes the logs before we let Windows kill our process.
// There is no obvious and elegant way to do this, since the logging and `ServiceState`
// changes are interleaved, not nested:
// - Start logging
// - ServiceState::Running
// - Stop logging
// - ServiceState::Stopped
std::mem::drop(logging_handle);
// Tell Windows that we're stopping
// Per Windows docs, this will cause Windows to kill our process eventually.
status_handle
.set_service_status(ServiceStatus {
service_type: SERVICE_TYPE,
current_state: ServiceState::Stopped,
controls_accepted: ServiceControlAccept::empty(),
exit_code: ServiceExitCode::Win32(if result.is_ok() { 0 } else { 1 }),
checkpoint: 0,
wait_hint: Duration::default(),
process_id: None,
})
.context("Should be able to tell Windows we're stopping")?;
// Generally unreachable. Windows typically kills the process first,
// but doesn't guarantee it.
Ok(())
}
/// The main loop for the Windows service
///
/// This is split off from other functions because we don't want to accidentally
/// bail out of a fallible function and not tell Windows that we're stopping
/// the service. So it's okay to bail out of `service_run_async`, but not
/// out of its caller.
///
/// Logging must already be set up before calling this.
async fn service_run_async(
log_filter_reloader: &FilterReloadHandle,
telemetry: &mut Telemetry,
shutdown_rx: mpsc::Receiver<()>,
) -> Result<()> {
// Useless - Windows will never send us Ctrl+C when running as a service
// This just keeps the signatures simpler
let mut signals = firezone_bin_shared::signals::Terminate::from_channel(shutdown_rx);
super::ipc_listen(
DnsControlMethod::Nrpt,
log_filter_reloader,
&mut signals,
telemetry,
)
.await
.context("`ipc_listen` threw an error")?;
Ok(())
let _ = status_handle.set_service_status(ServiceStatus {
service_type: SERVICE_TYPE,
current_state: ServiceState::Stopped,
controls_accepted: ServiceControlAccept::empty(),
exit_code: ServiceExitCode::Win32(if result.is_ok() { 0 } else { 1 }),
checkpoint: 0,
wait_hint: Duration::default(),
process_id: None,
});
}
#[cfg(test)]