mirror of
https://github.com/outbackdingo/firezone.git
synced 2026-01-27 10:18:54 +00:00
refactor(gui-client): improve error handling in Windows service (#9178)
The current error handling logic in the Windows service is a bit dodgy. We first initialise the logger but then pass the logging handle into the `try_run_service` function. This means that any failures in starting the Windows service aren't actually logged because the handle has been dropped by then already.
This commit is contained in:
@@ -1,6 +1,5 @@
|
||||
use anyhow::{Context as _, Result, bail};
|
||||
use anyhow::{Context as _, Result};
|
||||
use firezone_bin_shared::DnsControlMethod;
|
||||
use firezone_logging::FilterReloadHandle;
|
||||
use firezone_telemetry::Telemetry;
|
||||
use futures::channel::mpsc;
|
||||
use std::path::PathBuf;
|
||||
@@ -194,40 +193,31 @@ fn uninstall_tunnel_service(
|
||||
///
|
||||
/// Linux uses the CLI args from here, Windows does not
|
||||
pub fn run(_log_dir: Option<PathBuf>, _dns_control: DnsControlMethod) -> Result<()> {
|
||||
windows_service::service_dispatcher::start(SERVICE_NAME, ffi_service_run).context("windows_service::service_dispatcher failed. This isn't running in an interactive terminal, right?")
|
||||
windows_service::service_dispatcher::start(SERVICE_NAME, run_service_ffi).context("windows_service::service_dispatcher failed. This isn't running in an interactive terminal, right?")
|
||||
}
|
||||
|
||||
// Generates `ffi_service_run` from `service_run`
|
||||
windows_service::define_windows_service!(ffi_service_run, service_run);
|
||||
// Generates `run_service_ffi` from `service_run`
|
||||
windows_service::define_windows_service!(run_service_ffi, run_service);
|
||||
|
||||
fn service_run(arguments: Vec<OsString>) {
|
||||
fn run_service(arguments: Vec<OsString>) {
|
||||
// `arguments` doesn't seem to work right when running as a Windows service
|
||||
// (even though it's meant for that) so just use the default log dir.
|
||||
let (handle, log_filter_reloader) =
|
||||
let (_handle, log_filter_reloader) =
|
||||
crate::logging::setup_tunnel(None).expect("Should be able to set up logging");
|
||||
if let Err(error) = fallible_service_run(arguments, handle, log_filter_reloader) {
|
||||
tracing::error!("`fallible_windows_service_run` returned an error: {error:#}");
|
||||
}
|
||||
}
|
||||
|
||||
// Most of the Windows-specific service stuff should go here
|
||||
//
|
||||
// The arguments don't seem to match the ones passed to the main thread at all.
|
||||
//
|
||||
// If Windows stops us gracefully, this function may never return.
|
||||
fn fallible_service_run(
|
||||
arguments: Vec<OsString>,
|
||||
logging_handle: firezone_logging::file::Handle,
|
||||
log_filter_reloader: FilterReloadHandle,
|
||||
) -> Result<()> {
|
||||
tracing::info!(?arguments, "fallible_windows_service_run");
|
||||
if !elevation_check()? {
|
||||
bail!("Tunnel service failed its elevation check, try running as admin / root");
|
||||
tracing::info!(?arguments, "run_service");
|
||||
|
||||
if !elevation_check().is_ok_and(|elevated| elevated) {
|
||||
tracing::info!("Tunnel service failed its elevation check, try running as admin / root");
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
let rt = tokio::runtime::Builder::new_current_thread()
|
||||
.enable_all()
|
||||
.build()?;
|
||||
.build()
|
||||
.expect("Failed to create tokio runtime");
|
||||
|
||||
let (mut shutdown_tx, shutdown_rx) = mpsc::channel(1);
|
||||
|
||||
let event_handler = move |control_event| -> ServiceControlHandlerResult {
|
||||
@@ -265,9 +255,18 @@ fn fallible_service_run(
|
||||
}
|
||||
};
|
||||
|
||||
let status_handle = match service_control_handler::register(SERVICE_NAME, event_handler)
|
||||
.context("Failed to register Windows service")
|
||||
{
|
||||
Ok(handle) => handle,
|
||||
Err(e) => {
|
||||
tracing::error!("{e:#}");
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// Tell Windows that we're running (equivalent to sd_notify in systemd)
|
||||
let status_handle = service_control_handler::register(SERVICE_NAME, event_handler)?;
|
||||
status_handle.set_service_status(ServiceStatus {
|
||||
let _ = status_handle.set_service_status(ServiceStatus {
|
||||
service_type: SERVICE_TYPE,
|
||||
current_state: ServiceState::Running,
|
||||
controls_accepted: ServiceControlAccept::POWER_EVENT
|
||||
@@ -277,79 +276,43 @@ fn fallible_service_run(
|
||||
checkpoint: 0,
|
||||
wait_hint: Duration::default(),
|
||||
process_id: None,
|
||||
})?;
|
||||
});
|
||||
|
||||
let mut signals = firezone_bin_shared::signals::Terminate::from_channel(shutdown_rx);
|
||||
let mut telemetry = Telemetry::default();
|
||||
|
||||
// Add new features in `service_run_async` if possible.
|
||||
// We don't want to bail out of `fallible_service_run` and forget to tell
|
||||
// Windows that we're shutting down.
|
||||
let result = rt
|
||||
.block_on(service_run_async(
|
||||
.block_on(super::ipc_listen(
|
||||
DnsControlMethod::Nrpt,
|
||||
&log_filter_reloader,
|
||||
&mut signals,
|
||||
&mut telemetry,
|
||||
shutdown_rx,
|
||||
))
|
||||
.inspect(|_| rt.block_on(telemetry.stop()))
|
||||
.inspect(|()| {
|
||||
tracing::info!("Windows service exited gracefully");
|
||||
|
||||
rt.block_on(telemetry.stop())
|
||||
})
|
||||
.inspect_err(|e| {
|
||||
tracing::error!("Tunnel service failed: {e:#}");
|
||||
|
||||
rt.block_on(telemetry.stop_on_crash())
|
||||
});
|
||||
|
||||
// Drop the logging handle so it flushes the logs before we let Windows kill our process.
|
||||
// There is no obvious and elegant way to do this, since the logging and `ServiceState`
|
||||
// changes are interleaved, not nested:
|
||||
// - Start logging
|
||||
// - ServiceState::Running
|
||||
// - Stop logging
|
||||
// - ServiceState::Stopped
|
||||
std::mem::drop(logging_handle);
|
||||
|
||||
// Tell Windows that we're stopping
|
||||
// Per Windows docs, this will cause Windows to kill our process eventually.
|
||||
status_handle
|
||||
.set_service_status(ServiceStatus {
|
||||
service_type: SERVICE_TYPE,
|
||||
current_state: ServiceState::Stopped,
|
||||
controls_accepted: ServiceControlAccept::empty(),
|
||||
exit_code: ServiceExitCode::Win32(if result.is_ok() { 0 } else { 1 }),
|
||||
checkpoint: 0,
|
||||
wait_hint: Duration::default(),
|
||||
process_id: None,
|
||||
})
|
||||
.context("Should be able to tell Windows we're stopping")?;
|
||||
// Generally unreachable. Windows typically kills the process first,
|
||||
// but doesn't guarantee it.
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// The main loop for the Windows service
|
||||
///
|
||||
/// This is split off from other functions because we don't want to accidentally
|
||||
/// bail out of a fallible function and not tell Windows that we're stopping
|
||||
/// the service. So it's okay to bail out of `service_run_async`, but not
|
||||
/// out of its caller.
|
||||
///
|
||||
/// Logging must already be set up before calling this.
|
||||
async fn service_run_async(
|
||||
log_filter_reloader: &FilterReloadHandle,
|
||||
telemetry: &mut Telemetry,
|
||||
shutdown_rx: mpsc::Receiver<()>,
|
||||
) -> Result<()> {
|
||||
// Useless - Windows will never send us Ctrl+C when running as a service
|
||||
// This just keeps the signatures simpler
|
||||
let mut signals = firezone_bin_shared::signals::Terminate::from_channel(shutdown_rx);
|
||||
super::ipc_listen(
|
||||
DnsControlMethod::Nrpt,
|
||||
log_filter_reloader,
|
||||
&mut signals,
|
||||
telemetry,
|
||||
)
|
||||
.await
|
||||
.context("`ipc_listen` threw an error")?;
|
||||
|
||||
Ok(())
|
||||
let _ = status_handle.set_service_status(ServiceStatus {
|
||||
service_type: SERVICE_TYPE,
|
||||
current_state: ServiceState::Stopped,
|
||||
controls_accepted: ServiceControlAccept::empty(),
|
||||
exit_code: ServiceExitCode::Win32(if result.is_ok() { 0 } else { 1 }),
|
||||
checkpoint: 0,
|
||||
wait_hint: Duration::default(),
|
||||
process_id: None,
|
||||
});
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
Reference in New Issue
Block a user