fix(headless-client/windows): improve Client startup times on Windows (#5375)

Closes #5026
Closes #5879 

On the resource-constrained Windows Server 2022 test VM, the median
sign-in time dropped from 5.0 seconds to 2.2 seconds.

# Changes

- Measure end-to-end connection time in the GUI process
- Use `ipconfig` instead of Powershell to flush DNS faster
- Activate DNS control by manipulating the Windows Registry directly
instead of calling Powershell
- Remove deactivate step when changing DNS servers (seals a DNS leak
when roaming networks)
- Remove completely redundant `Set-DnsClientServerAddress` step from
activating DNS control
- Remove `Remove-NetRoute` powershell cmdlet that seems to do nothing

# Benchmark 7

- Optimized release builds
- x86-64 constrained VM (1 CPU thread, 2 GB RAM)

Main with measurement added, `c1c99197e` from #5864 

- 6.0 s
- 5.5 s
- 4.1 s
- 5.0 s
- 4.1 s
- (Median = 5.0 s)

Main with speedups added, `2128329f9` from #5375, this PR

- 3.7 s
- 2.2 s
- 1.9 s
- 2.3 s
- 2.0 s
- (Median = 2.2 s)

```[tasklist]
### Next steps
- [x] Benchmark on the resource-constrained VM
- [x] Move raw benchmark data to a comment and summarize in the description
- [x] Clean up tasks that don't need to be in the commit
- [x] Merge
```

# Hypothetical further optimizations

- Ditch the `netsh` subprocess in `set_ips`

---------

Signed-off-by: Reactor Scram <ReactorScram@users.noreply.github.com>
This commit is contained in:
Reactor Scram
2024-07-16 16:55:29 -05:00
committed by GitHub
parent 58db5f0639
commit 63623346b9
11 changed files with 140 additions and 97 deletions

1
rust/Cargo.lock generated
View File

@@ -1978,6 +1978,7 @@ dependencies = [
"uuid",
"windows 0.57.0",
"windows-service",
"winreg 0.52.0",
]
[[package]]

View File

@@ -1,12 +1,7 @@
use crate::MTU;
use connlib_shared::{
windows::{CREATE_NO_WINDOW, TUNNEL_NAME},
Result,
};
use connlib_shared::{windows::TUNNEL_NAME, Result};
use std::{
io,
os::windows::process::CommandExt,
process::{Command, Stdio},
str::FromStr,
sync::Arc,
task::{ready, Context, Poll},
@@ -82,17 +77,6 @@ impl Tun {
let adapter = &Adapter::create(&wintun, ADAPTER_NAME, TUNNEL_NAME, Some(uuid))?;
let iface_idx = adapter.get_adapter_index()?;
// Remove any routes that were previously associated with us
// TODO: Pick a more elegant way to do this
Command::new("powershell")
.creation_flags(CREATE_NO_WINDOW)
.arg("-Command")
.arg(format!(
"Remove-NetRoute -InterfaceIndex {iface_idx} -Confirm:$false"
))
.stdout(Stdio::null())
.status()?;
set_iface_config(adapter.get_luid(), MTU as u32)?;
let session = Arc::new(adapter.start_session(RING_BUFFER_SIZE)?);

View File

@@ -491,10 +491,12 @@ impl Controller {
let api_url = self.advanced_settings.api_url.clone();
tracing::info!(api_url = api_url.to_string(), "Starting connlib...");
// Count the start instant from before we connect
let start_instant = Instant::now();
self.ipc_client
.connect_to_firezone(api_url.as_str(), token)
.await?;
// Change the status after we begin connecting
self.status = Status::Connecting { start_instant };
self.refresh_system_tray_menu()?;
@@ -804,6 +806,8 @@ async fn run_controller(
let mut dns_listener = network_changes::DnsListener::new()?;
loop {
// TODO: Add `ControllerRequest::NetworkChange` and `DnsChange` and replace
// `tokio::select!` with a `poll_*` function
tokio::select! {
() = com_worker.notified() => {
let new_have_internet = network_changes::check_internet().context("Failed to check for internet")?;
@@ -845,6 +849,7 @@ async fn run_controller(
}
},
}
// Code down here may not run because the `select` sometimes `continue`s.
}
if let Err(error) = com_worker.close() {

View File

@@ -56,6 +56,7 @@ ring = "0.17"
thiserror = { version = "1.0", default-features = false }
tracing-subscriber = { version = "0.3.17", features = ["env-filter"] }
windows-service = "0.7.0"
winreg = "0.52.0"
[target.'cfg(target_os = "windows")'.dependencies.windows]
version = "0.57.0"

View File

@@ -8,7 +8,7 @@ mod windows;
#[cfg(target_os = "windows")]
use windows as platform;
pub(crate) use platform::{deactivate, system_resolvers, DnsController};
pub(crate) use platform::{system_resolvers, DnsController};
// TODO: Move DNS and network change listening to the IPC service, so this won't
// need to be public.

View File

@@ -25,6 +25,11 @@ enum DnsControlMethod {
Systemd,
}
/// Controls system-wide DNS.
///
/// Always call `deactivate` when Firezone starts.
///
/// Only one of these should exist on the entire system at a time.
pub(crate) struct DnsController {
dns_control_method: Option<DnsControlMethod>,
}
@@ -41,15 +46,23 @@ impl Default for DnsController {
impl Drop for DnsController {
fn drop(&mut self) {
tracing::debug!("Reverting DNS control...");
if let Some(DnsControlMethod::EtcResolvConf) = self.dns_control_method {
// TODO: Check that nobody else modified the file while we were running.
etc_resolv_conf::revert().ok();
if let Err(error) = self.deactivate() {
tracing::error!(?error, "Failed to deactivate DNS control");
}
}
}
impl DnsController {
#[allow(clippy::unnecessary_wraps)]
pub(crate) fn deactivate(&mut self) -> Result<()> {
tracing::debug!("Deactivating DNS control...");
if let Some(DnsControlMethod::EtcResolvConf) = self.dns_control_method {
// TODO: Check that nobody else modified the file while we were running.
etc_resolv_conf::revert()?;
}
Ok(())
}
/// Set the computer's system-wide DNS servers
///
/// The `mut` in `&mut self` is not needed by Rust's rules, but
@@ -185,12 +198,6 @@ fn parse_resolvectl_output(s: &str) -> Vec<IpAddr> {
.collect()
}
// Does nothing on Linux, needed to match the Windows interface
#[allow(clippy::unnecessary_wraps)]
pub(crate) fn deactivate() -> Result<()> {
Ok(())
}
#[cfg(test)]
mod tests {
use std::net::IpAddr;

View File

@@ -1,5 +1,9 @@
use anyhow::{Context, Result};
use std::{io::Write, net::IpAddr, path::PathBuf};
use std::{
io::{self, Write},
net::IpAddr,
path::PathBuf,
};
pub(crate) const ETC_RESOLV_CONF: &str = "/etc/resolv.conf";
pub(crate) const ETC_RESOLV_CONF_BACKUP: &str = "/etc/resolv.conf.before-firezone";
@@ -36,7 +40,7 @@ pub(crate) async fn configure(dns_config: &[IpAddr]) -> Result<()> {
/// Revert changes Firezone made to `/etc/resolv.conf`
///
/// Must be sync because it's called in `Tun::drop`
/// Must be sync because it's called in `Drop` impls
#[cfg_attr(test, mutants::skip)] // Would modify system-wide `/etc/resolv.conf`
pub(crate) fn revert() -> Result<()> {
revert_at_paths(&ResolvPaths::default())
@@ -123,11 +127,18 @@ async fn configure_at_paths(dns_config: &[IpAddr], paths: &ResolvPaths) -> Resul
}
fn revert_at_paths(paths: &ResolvPaths) -> Result<()> {
std::fs::copy(&paths.backup, &paths.resolv).context("Failed to restore backup")?;
match std::fs::copy(&paths.backup, &paths.resolv) {
Err(e) if e.kind() == io::ErrorKind::NotFound => {
tracing::debug!("Didn't revert `/etc/resolv.conf`, no backup file found");
return Ok(());
}
Err(e) => Err(e).context("Failed to restore `/etc/resolv.conf` backup")?,
Ok(_) => {}
}
// Don't delete the backup file - If we lose power here, and the revert is rolled back,
// we may want it. Filesystems are not atomic by default, and have weak ordering,
// so we don't want to end up in a state where the backup is deleted and the revert was rolled back.
tracing::info!("Reverted resolv.conf file");
tracing::info!("Reverted `/etc/resolv.conf`l");
Ok(())
}

View File

@@ -14,13 +14,18 @@
//! <https://superuser.com/a/1752670>
use anyhow::{Context as _, Result};
use connlib_shared::windows::{CREATE_NO_WINDOW, TUNNEL_NAME};
use std::{net::IpAddr, os::windows::process::CommandExt, process::Command};
use connlib_shared::windows::CREATE_NO_WINDOW;
use std::{net::IpAddr, os::windows::process::CommandExt, path::Path, process::Command};
pub fn system_resolvers_for_gui() -> Result<Vec<IpAddr>> {
system_resolvers()
}
/// Controls system-wide DNS.
///
/// Always call `deactivate` when Firezone starts.
///
/// Only one of these should exist on the entire system at a time.
#[derive(Default)]
pub(crate) struct DnsController {}
@@ -30,16 +35,31 @@ const FZ_MAGIC: &str = "firezone-fd0020211111";
impl Drop for DnsController {
fn drop(&mut self) {
if let Err(error) = deactivate() {
if let Err(error) = self.deactivate() {
tracing::error!(?error, "Failed to deactivate DNS control");
}
}
}
impl DnsController {
/// Set the computer's system-wide DNS servers
/// Deactivate any control Firezone has over the computer's DNS
///
/// There's a gap in this because on Windows we deactivate and re-activate control.
/// Must be `sync` so we can call it from `Drop`
/// TODO: Replace this with manual registry twiddling one day if we feel safe.
pub(crate) fn deactivate(&mut self) -> Result<()> {
Command::new("powershell")
.creation_flags(CREATE_NO_WINDOW)
.args(["-Command", "Get-DnsClientNrptRule", "|"])
.args(["where", "Comment", "-eq", FZ_MAGIC, "|"])
.args(["foreach", "{"])
.args(["Remove-DnsClientNrptRule", "-Name", "$_.Name", "-Force"])
.args(["}"])
.status()?;
tracing::info!("Deactivated DNS control");
Ok(())
}
/// Set the computer's system-wide DNS servers
///
/// The `mut` in `&mut self` is not needed by Rust's rules, but
/// it would be bad if this was called from 2 threads at once.
@@ -47,7 +67,6 @@ impl DnsController {
/// Must be async to match the Linux signature
#[allow(clippy::unused_async)]
pub(crate) async fn set_dns(&mut self, dns_config: &[IpAddr]) -> Result<()> {
deactivate().context("Failed to deactivate DNS control")?;
activate(dns_config).context("Failed to activate DNS control")?;
Ok(())
}
@@ -57,9 +76,9 @@ impl DnsController {
/// `&self` is needed to match the Linux signature
pub(crate) fn flush(&self) -> Result<()> {
tracing::debug!("Flushing Windows DNS cache...");
Command::new("powershell")
Command::new("ipconfig")
.creation_flags(CREATE_NO_WINDOW)
.args(["-Command", "Clear-DnsClientCache"])
.args(["/flushdns"])
.status()?;
tracing::debug!("Flushed DNS.");
Ok(())
@@ -82,57 +101,48 @@ pub(crate) fn system_resolvers() -> Result<Vec<IpAddr>> {
Ok(resolvers)
}
/// A UUID for the Firezone Client NRPT rule, chosen randomly at dev time.
///
/// Our NRPT rule should always live in the registry at
/// `Computer\HKEY_LOCAL_MACHINE\SYSTEM\CurrentControlSet\Services\Dnscache\Parameters\DnsPolicyConfig\$NRPT_REG_KEY`
///
/// We can use this UUID as a handle to enable, disable, or modify the rule.
const NRPT_REG_KEY: &str = "{6C0507CB-C884-4A78-BC55-0ACEE21227F6}";
/// Tells Windows to send all DNS queries to our sentinels
///
/// Parameters:
/// - `dns_config_string`: Comma-separated IP addresses of DNS servers, e.g. "1.1.1.1,8.8.8.8"
pub(crate) fn activate(dns_config: &[IpAddr]) -> Result<()> {
let dns_config_string = dns_config
.iter()
.map(|ip| format!("\"{ip}\""))
.collect::<Vec<_>>()
.join(",");
// Set our DNS IP as the DNS server for our interface
fn activate(dns_config: &[IpAddr]) -> Result<()> {
// TODO: Known issue where web browsers will keep a connection open to a site,
// using QUIC, HTTP/2, or even HTTP/1.1, and so they won't resolve the DNS
// again unless you let that connection time out:
// <https://github.com/firezone/firezone/issues/3113#issuecomment-1882096111>
Command::new("powershell")
.creation_flags(CREATE_NO_WINDOW)
.arg("-Command")
.arg(format!(
"Set-DnsClientServerAddress {TUNNEL_NAME} -ServerAddresses({dns_config_string})"
))
.status()?;
tracing::info!("Activating DNS control");
Command::new("powershell")
.creation_flags(CREATE_NO_WINDOW)
.args([
"-Command",
"Add-DnsClientNrptRule",
"-Namespace",
".",
"-Comment",
FZ_MAGIC,
"-NameServers",
&dns_config_string,
])
.status()?;
Ok(())
}
let dns_config_string = dns_config
.iter()
.map(|ip| ip.to_string())
.collect::<Vec<_>>()
.join(";");
let hkcu = winreg::RegKey::predef(winreg::enums::HKEY_LOCAL_MACHINE);
let base = Path::new("SYSTEM")
.join("CurrentControlSet")
.join("Services")
.join("Dnscache")
.join("Parameters")
.join("DnsPolicyConfig")
.join(NRPT_REG_KEY);
let (key, _) = hkcu.create_subkey(base)?;
key.set_value("Comment", &FZ_MAGIC)?;
key.set_value("ConfigOptions", &0x8u32)?;
key.set_value("DisplayName", &"Firezone SplitDNS")?;
key.set_value("GenericDNSServers", &dns_config_string)?;
key.set_value("IPSECCARestriction", &"")?;
key.set_value("Name", &vec!["."])?;
key.set_value("Version", &0x2u32)?;
// Must be `sync` so we can call it from `Drop`
pub(crate) fn deactivate() -> Result<()> {
Command::new("powershell")
.creation_flags(CREATE_NO_WINDOW)
.args(["-Command", "Get-DnsClientNrptRule", "|"])
.args(["where", "Comment", "-eq", FZ_MAGIC, "|"])
.args(["foreach", "{"])
.args(["Remove-DnsClientNrptRule", "-Name", "$_.Name", "-Force"])
.args(["}"])
.status()?;
tracing::info!("Deactivated DNS control");
Ok(())
}

View File

@@ -120,13 +120,20 @@ fn run_smoke_test() -> Result<()> {
crate::setup_stdout_logging()?;
let rt = tokio::runtime::Runtime::new()?;
let _guard = rt.enter();
let mut dns_controller = DnsController::default();
// Deactivate Firezone DNS control in case the system or IPC service crashed
// and we need to recover. <https://github.com/firezone/firezone/issues/4899>
dns_controller.deactivate()?;
let mut signals = signals::Terminate::new()?;
// Couldn't get the loop to work here yet, so SIGHUP is not implemented
rt.block_on(async {
device_id::get_or_create().context("Failed to read / create device ID")?;
let mut server = IpcServer::new(ServiceId::Prod).await?;
let _ = Handler::new(&mut server).await?.run(&mut signals).await;
let _ = Handler::new(&mut server, &mut dns_controller)
.await?
.run(&mut signals)
.await;
Ok::<_, anyhow::Error>(())
})
}
@@ -140,8 +147,9 @@ async fn ipc_listen(signals: &mut signals::Terminate) -> Result<()> {
// This also gives the GUI a safe place to put the log filter config
device_id::get_or_create().context("Failed to read / create device ID")?;
let mut server = IpcServer::new(ServiceId::Prod).await?;
let mut dns_controller = DnsController::default();
loop {
let mut handler_fut = pin!(Handler::new(&mut server));
let mut handler_fut = pin!(Handler::new(&mut server, &mut dns_controller));
let Some(handler) = poll_fn(|cx| {
if let Poll::Ready(()) = signals.poll_recv(cx) {
Poll::Ready(None)
@@ -165,11 +173,11 @@ async fn ipc_listen(signals: &mut signals::Terminate) -> Result<()> {
}
/// Handles one IPC client
struct Handler {
struct Handler<'a> {
callback_handler: CallbackHandler,
cb_rx: mpsc::Receiver<InternalServerMsg>,
connlib: Option<connlib_client_shared::Session>,
dns_controller: DnsController,
dns_controller: &'a mut DnsController,
ipc_rx: ipc::ServerRead,
ipc_tx: ipc::ServerWrite,
last_connlib_start_instant: Option<Instant>,
@@ -193,9 +201,9 @@ enum HandlerOk {
ServiceTerminating,
}
impl Handler {
async fn new(server: &mut IpcServer) -> Result<Self> {
dns_control::deactivate()?;
impl<'a> Handler<'a> {
async fn new(server: &mut IpcServer, dns_controller: &'a mut DnsController) -> Result<Self> {
dns_controller.deactivate()?;
let (ipc_rx, ipc_tx) = server
.next_client_split()
.await
@@ -207,7 +215,7 @@ impl Handler {
callback_handler: CallbackHandler { cb_tx },
cb_rx,
connlib: None,
dns_controller: Default::default(),
dns_controller,
ipc_rx,
ipc_tx,
last_connlib_start_instant: None,
@@ -295,8 +303,7 @@ impl Handler {
// The first `OnUpdateResources` marks when connlib is fully initialized
if let IpcServerMsg::OnUpdateResources(_) = &msg {
if let Some(instant) = self.last_connlib_start_instant.take() {
let dur = instant.elapsed();
tracing::info!(?dur, "Connlib started");
tracing::info!(elapsed = ?instant.elapsed(), "Tunnel ready");
}
// On every resources update, flush DNS to mitigate <https://github.com/firezone/firezone/issues/5052>
@@ -357,7 +364,7 @@ impl Handler {
ClientMsg::Disconnect => {
if let Some(connlib) = self.connlib.take() {
connlib.disconnect();
dns_control::deactivate()?;
self.dns_controller.deactivate()?;
} else {
tracing::error!("Error - Got Disconnect when we're already not connected");
}

View File

@@ -15,7 +15,7 @@ use std::{
pin::pin,
sync::Arc,
};
use tokio::sync::mpsc;
use tokio::{sync::mpsc, time::Instant};
use tokio_stream::wrappers::ReceiverStream;
/// Command-line args for the headless Client
@@ -46,6 +46,12 @@ struct Cli {
#[arg(long)]
check: bool,
/// Connect to the Firezone network and initialize, then exit
///
/// Use this to check how fast you can connect.
#[arg(long)]
exit: bool,
/// Friendly name for this client to display in the UI.
#[arg(long, env = "FIREZONE_NAME")]
firezone_name: Option<String>,
@@ -127,7 +133,6 @@ pub fn run_only_headless_client() -> Result<()> {
cli.token_path.display()
)
})?;
tracing::info!("Running in headless / standalone mode");
// TODO: Should this default to 30 days?
let max_partition_time = cli.common.max_partition_time.map(|d| d.into());
@@ -154,6 +159,8 @@ pub fn run_only_headless_client() -> Result<()> {
let (cb_tx, cb_rx) = mpsc::channel(10);
let callbacks = CallbackHandler { cb_tx };
// The name matches that in `ipc_service.rs`
let mut last_connlib_start_instant = Some(Instant::now());
platform::setup_before_connlib()?;
let args = ConnectArgs {
url,
@@ -174,6 +181,9 @@ pub fn run_only_headless_client() -> Result<()> {
let mut terminate = pin!(terminate.recv().fuse());
let mut hangup = pin!(hangup.recv().fuse());
let mut dns_controller = DnsController::default();
// Deactivate Firezone DNS control in case the system or IPC service crashed
// and we need to recover. <https://github.com/firezone/firezone/issues/4899>
dns_controller.deactivate()?;
let mut tun_device = TunDeviceManager::new()?;
let mut cb_rx = ReceiverStream::new(cb_rx).fuse();
@@ -213,7 +223,14 @@ pub fn run_only_headless_client() -> Result<()> {
dns_controller.set_dns(&dns).await?;
}
InternalServerMsg::OnUpdateRoutes { ipv4, ipv6 } => {
tun_device.set_routes(ipv4, ipv6).await?
tun_device.set_routes(ipv4, ipv6).await?;
if let Some(instant) = last_connlib_start_instant.take() {
tracing::info!(elapsed = ?instant.elapsed(), "Tunnel ready");
}
if cli.exit {
tracing::info!("Exiting due to `--exit` CLI flag");
break Ok(());
}
}
}
}

View File

@@ -17,8 +17,8 @@ export default function GUI({ title }: { title: string }) {
<ChangeItem enable={title === "Linux GUI"} pull="5848">
If the GUI is open during an update, close it and prompt the user to restart the GUI.
</ChangeItem>
<ChangeItem enable={title === "Windows"}>
This is a maintenance release with no user-facing changes.
<ChangeItem enable={title === "Windows"} pull="5375">
Improves sign-in speed and fixes a DNS leak
</ChangeItem>
</ul>
</Entry>*/}