From a6ffdd26545942600655b30ec815693af513db44 Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Thu, 17 Jul 2025 08:50:31 +0800 Subject: [PATCH] feat(snownet): reduce rekey-attempt-time to 15s (#9891) From Sentry reports and user-submitted logs, we know that it is possible for Client and Gateway to de-sync in regards to what each other's public key is. In such a scenario, ICE will succeed to make a connection but `boringtun` will fail to handshake a tunnel. By default, `boringtun` tries for 90s to handshake a session before it gives up and expires it. In Firezone, the ICE agent takes care of establishing connectivity whereas `boringtun` itself just encrypts and decrypts packets. As such, if ICE is working, we know that packets aren't getting lost but instead, there must be some other issue as to why we cannot establish a session. To improve the UX in these error cases, we reduce the rekey-attempt-time to 15s. This roughly matches our ICE timeout. Those 15s count from the moment we send the first handshake which is just after ICE completes. Thus we can be sure that after at most 15s, we either have a working WireGuard session or the connection gets cleaned up. Related: #9890 Related: #9850 --- rust/Cargo.lock | 2 +- rust/connlib/snownet/src/node.rs | 46 ++++++++++--------- rust/telemetry/src/feature_flags.rs | 18 -------- website/src/components/Changelog/Android.tsx | 4 ++ website/src/components/Changelog/Apple.tsx | 7 ++- website/src/components/Changelog/GUI.tsx | 7 ++- website/src/components/Changelog/Gateway.tsx | 4 ++ website/src/components/Changelog/Headless.tsx | 7 ++- 8 files changed, 52 insertions(+), 43 deletions(-) diff --git a/rust/Cargo.lock b/rust/Cargo.lock index e649eed7e..41c826f0a 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -951,7 +951,7 @@ dependencies = [ [[package]] name = "boringtun" version = "0.6.1" -source = "git+https://github.com/firezone/boringtun?branch=master#5b1892f0616f97d82599b6e324356aa9ea629c33" +source = "git+https://github.com/firezone/boringtun?branch=master#3d5df9c2a6f55424e02671374f835cc7db1d7a44" dependencies = [ "aead", "base64 0.22.1", diff --git a/rust/connlib/snownet/src/node.rs b/rust/connlib/snownet/src/node.rs index 8a9959ba7..00d0aa45f 100644 --- a/rust/connlib/snownet/src/node.rs +++ b/rust/connlib/snownet/src/node.rs @@ -11,7 +11,6 @@ use boringtun::{noise::rate_limiter::RateLimiter, x25519::StaticSecret}; use bufferpool::{Buffer, BufferPool}; use core::fmt; use firezone_logging::err_with_src; -use firezone_telemetry::{analytics, feature_flags}; use hex_display::HexDisplayExt; use ip_packet::{ConvertibleIpv4Packet, ConvertibleIpv6Packet, IpPacket, IpPacketBuf}; use itertools::Itertools; @@ -747,18 +746,31 @@ where tracing::warn!("No TURN servers connected; connection may fail to establish"); } + let mut tunnel = Tunn::new_at( + self.private_key.clone(), + remote, + Some(key), + None, + self.index.next(), + Some(self.rate_limiter.clone()), + self.rng.next_u64(), + now, + ); + // By default, boringtun has a rekey attempt time of 90(!) seconds. + // In case of a state de-sync or other issues, this means we try for + // 90s to make a handshake, all whilst our ICE layer thinks the connection + // is working perfectly fine. + // This results in a bad UX as the user has to essentially wait for 90s + // before Firezone can fix the state and make a new connection. + // + // By aligning the rekey-attempt-time roughly with our ICE timeout, we ensure + // that even if the hole-punch was successful, it will take at most 15s + // until we have a WireGuard tunnel to send packets into. + tunnel.set_rekey_attempt_time(Duration::from_secs(15)); + Connection { agent, - tunnel: Tunn::new_at( - self.private_key.clone(), - remote, - Some(key), - None, - self.index.next(), - Some(self.rate_limiter.clone()), - self.rng.next_u64(), - now, - ), + tunnel, next_wg_timer_update: now, stats: Default::default(), buffer: vec![0; ip_packet::MAX_FZ_PAYLOAD], @@ -2214,16 +2226,8 @@ where .decapsulate_at(Some(src), packet, ip_packet.buf(), now) { TunnResult::Done => ControlFlow::Break(Ok(())), - TunnResult::Err(e @ WireGuardError::InvalidAeadTag) - if crate::is_handshake(packet) - && feature_flags::fail_handshake_on_decryption_errors() => - { - analytics::feature_flag_called("fail-handshake-on-decryption-errors"); - - tracing::info!("Connection handshake failed ({e})"); - self.state = ConnectionState::Failed; - - ControlFlow::Break(Ok(())) // Return `Ok` because we are already logging the error. + TunnResult::Err(e) if crate::is_handshake(packet) => { + ControlFlow::Break(Err(anyhow::Error::new(e).context("handshake packet"))) } TunnResult::Err(e) => ControlFlow::Break(Err(anyhow::Error::new(e))), diff --git a/rust/telemetry/src/feature_flags.rs b/rust/telemetry/src/feature_flags.rs index ff4190cc6..e3b9be0a4 100644 --- a/rust/telemetry/src/feature_flags.rs +++ b/rust/telemetry/src/feature_flags.rs @@ -43,10 +43,6 @@ pub fn export_metrics() -> bool { false // Placeholder until we actually deploy an OTEL collector. } -pub fn fail_handshake_on_decryption_errors() -> bool { - FEATURE_FLAGS.fail_handshake_on_decryption_errors() -} - pub(crate) async fn evaluate_now(user_id: String, env: Env) { if user_id.is_empty() { return; @@ -160,8 +156,6 @@ struct FeatureFlagsResponse { stream_logs: bool, #[serde(default)] map_enobufs_to_wouldblock: bool, - #[serde(default)] - fail_handshake_on_decryption_errors: bool, } #[derive(Debug, Default)] @@ -170,7 +164,6 @@ struct FeatureFlags { drop_llmnr_nxdomain_responses: AtomicBool, stream_logs: AtomicBool, map_enobufs_to_wouldblock: AtomicBool, - fail_handshake_on_decryption_errors: AtomicBool, } /// Accessors to the actual feature flags. @@ -187,7 +180,6 @@ impl FeatureFlags { drop_llmnr_nxdomain_responses, stream_logs, map_enobufs_to_wouldblock, - fail_handshake_on_decryption_errors, }: FeatureFlagsResponse, ) { self.icmp_unreachable_instead_of_nat64 @@ -197,8 +189,6 @@ impl FeatureFlags { self.stream_logs.store(stream_logs, Ordering::Relaxed); self.map_enobufs_to_wouldblock .store(map_enobufs_to_wouldblock, Ordering::Relaxed); - self.fail_handshake_on_decryption_errors - .store(fail_handshake_on_decryption_errors, Ordering::Relaxed); } fn icmp_unreachable_instead_of_nat64(&self) -> bool { @@ -217,11 +207,6 @@ impl FeatureFlags { fn map_enobufs_to_wouldblock(&self) -> bool { self.map_enobufs_to_wouldblock.load(Ordering::Relaxed) } - - fn fail_handshake_on_decryption_errors(&self) -> bool { - self.fail_handshake_on_decryption_errors - .load(Ordering::Relaxed) - } } fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context { @@ -232,7 +217,6 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context DropLlmnrNxdomainResponses { result: bool }, StreamLogs { result: bool }, MapENOBUFSToWouldBlock { result: bool }, - FailHandshakeOnDecryptionErrors { result: bool }, } // Exhaustive destruction so we don't forget to update this when we add a flag. @@ -241,7 +225,6 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context drop_llmnr_nxdomain_responses, stream_logs, map_enobufs_to_wouldblock, - fail_handshake_on_decryption_errors, } = flags; let value = serde_json::json!({ @@ -252,7 +235,6 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context SentryFlag::DropLlmnrNxdomainResponses { result: drop_llmnr_nxdomain_responses }, SentryFlag::StreamLogs { result: stream_logs }, SentryFlag::MapENOBUFSToWouldBlock { result: map_enobufs_to_wouldblock }, - SentryFlag::FailHandshakeOnDecryptionErrors { result: fail_handshake_on_decryption_errors }, ] }); diff --git a/website/src/components/Changelog/Android.tsx b/website/src/components/Changelog/Android.tsx index e69f30986..f0e164fb0 100644 --- a/website/src/components/Changelog/Android.tsx +++ b/website/src/components/Changelog/Android.tsx @@ -25,6 +25,10 @@ export default function Android() { Fixes an issue where Firezone failed to sign-in on systems with non-ASCII characters in their kernel build name. + + Fixes an issue where connections would sometimes take up to 90s to + establish. + diff --git a/website/src/components/Changelog/Apple.tsx b/website/src/components/Changelog/Apple.tsx index b6de6e506..d25091ad7 100644 --- a/website/src/components/Changelog/Apple.tsx +++ b/website/src/components/Changelog/Apple.tsx @@ -24,7 +24,12 @@ export default function Apple() { return ( {/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */} - + + + Fixes an issue where connections would sometimes take up to 90s to + establish. + + Fixes an issue where certain log files would not be recreated after diff --git a/website/src/components/Changelog/GUI.tsx b/website/src/components/Changelog/GUI.tsx index 300fd106a..23d722220 100644 --- a/website/src/components/Changelog/GUI.tsx +++ b/website/src/components/Changelog/GUI.tsx @@ -10,7 +10,12 @@ export default function GUI({ os }: { os: OS }) { return ( {/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */} - + + + Fixes an issue where connections would sometimes take up to 90s to + establish. + + Fixes a rare crash during sign-in. diff --git a/website/src/components/Changelog/Gateway.tsx b/website/src/components/Changelog/Gateway.tsx index 75be45c86..e4293b784 100644 --- a/website/src/components/Changelog/Gateway.tsx +++ b/website/src/components/Changelog/Gateway.tsx @@ -34,6 +34,10 @@ export default function Gateway() { Adds support for translating Time-Exceeded ICMP errors in the DNS resource NAT, allowing `tracepath` to work through a Firezone tunnel. + + Fixes an issue where connections would sometimes take up to 90s to + establish. + diff --git a/website/src/components/Changelog/Headless.tsx b/website/src/components/Changelog/Headless.tsx index 57e5933ca..5e3847109 100644 --- a/website/src/components/Changelog/Headless.tsx +++ b/website/src/components/Changelog/Headless.tsx @@ -9,7 +9,12 @@ export default function Headless({ os }: { os: OS }) { return ( {/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */} - + + + Fixes an issue where connections would sometimes take up to 90s to + establish. + + Fixes an issue where connections would fail to establish if both