diff --git a/rust/Cargo.lock b/rust/Cargo.lock index e649eed7e..41c826f0a 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -951,7 +951,7 @@ dependencies = [ [[package]] name = "boringtun" version = "0.6.1" -source = "git+https://github.com/firezone/boringtun?branch=master#5b1892f0616f97d82599b6e324356aa9ea629c33" +source = "git+https://github.com/firezone/boringtun?branch=master#3d5df9c2a6f55424e02671374f835cc7db1d7a44" dependencies = [ "aead", "base64 0.22.1", diff --git a/rust/connlib/snownet/src/node.rs b/rust/connlib/snownet/src/node.rs index 8a9959ba7..00d0aa45f 100644 --- a/rust/connlib/snownet/src/node.rs +++ b/rust/connlib/snownet/src/node.rs @@ -11,7 +11,6 @@ use boringtun::{noise::rate_limiter::RateLimiter, x25519::StaticSecret}; use bufferpool::{Buffer, BufferPool}; use core::fmt; use firezone_logging::err_with_src; -use firezone_telemetry::{analytics, feature_flags}; use hex_display::HexDisplayExt; use ip_packet::{ConvertibleIpv4Packet, ConvertibleIpv6Packet, IpPacket, IpPacketBuf}; use itertools::Itertools; @@ -747,18 +746,31 @@ where tracing::warn!("No TURN servers connected; connection may fail to establish"); } + let mut tunnel = Tunn::new_at( + self.private_key.clone(), + remote, + Some(key), + None, + self.index.next(), + Some(self.rate_limiter.clone()), + self.rng.next_u64(), + now, + ); + // By default, boringtun has a rekey attempt time of 90(!) seconds. + // In case of a state de-sync or other issues, this means we try for + // 90s to make a handshake, all whilst our ICE layer thinks the connection + // is working perfectly fine. + // This results in a bad UX as the user has to essentially wait for 90s + // before Firezone can fix the state and make a new connection. + // + // By aligning the rekey-attempt-time roughly with our ICE timeout, we ensure + // that even if the hole-punch was successful, it will take at most 15s + // until we have a WireGuard tunnel to send packets into. + tunnel.set_rekey_attempt_time(Duration::from_secs(15)); + Connection { agent, - tunnel: Tunn::new_at( - self.private_key.clone(), - remote, - Some(key), - None, - self.index.next(), - Some(self.rate_limiter.clone()), - self.rng.next_u64(), - now, - ), + tunnel, next_wg_timer_update: now, stats: Default::default(), buffer: vec![0; ip_packet::MAX_FZ_PAYLOAD], @@ -2214,16 +2226,8 @@ where .decapsulate_at(Some(src), packet, ip_packet.buf(), now) { TunnResult::Done => ControlFlow::Break(Ok(())), - TunnResult::Err(e @ WireGuardError::InvalidAeadTag) - if crate::is_handshake(packet) - && feature_flags::fail_handshake_on_decryption_errors() => - { - analytics::feature_flag_called("fail-handshake-on-decryption-errors"); - - tracing::info!("Connection handshake failed ({e})"); - self.state = ConnectionState::Failed; - - ControlFlow::Break(Ok(())) // Return `Ok` because we are already logging the error. + TunnResult::Err(e) if crate::is_handshake(packet) => { + ControlFlow::Break(Err(anyhow::Error::new(e).context("handshake packet"))) } TunnResult::Err(e) => ControlFlow::Break(Err(anyhow::Error::new(e))), diff --git a/rust/telemetry/src/feature_flags.rs b/rust/telemetry/src/feature_flags.rs index ff4190cc6..e3b9be0a4 100644 --- a/rust/telemetry/src/feature_flags.rs +++ b/rust/telemetry/src/feature_flags.rs @@ -43,10 +43,6 @@ pub fn export_metrics() -> bool { false // Placeholder until we actually deploy an OTEL collector. } -pub fn fail_handshake_on_decryption_errors() -> bool { - FEATURE_FLAGS.fail_handshake_on_decryption_errors() -} - pub(crate) async fn evaluate_now(user_id: String, env: Env) { if user_id.is_empty() { return; @@ -160,8 +156,6 @@ struct FeatureFlagsResponse { stream_logs: bool, #[serde(default)] map_enobufs_to_wouldblock: bool, - #[serde(default)] - fail_handshake_on_decryption_errors: bool, } #[derive(Debug, Default)] @@ -170,7 +164,6 @@ struct FeatureFlags { drop_llmnr_nxdomain_responses: AtomicBool, stream_logs: AtomicBool, map_enobufs_to_wouldblock: AtomicBool, - fail_handshake_on_decryption_errors: AtomicBool, } /// Accessors to the actual feature flags. @@ -187,7 +180,6 @@ impl FeatureFlags { drop_llmnr_nxdomain_responses, stream_logs, map_enobufs_to_wouldblock, - fail_handshake_on_decryption_errors, }: FeatureFlagsResponse, ) { self.icmp_unreachable_instead_of_nat64 @@ -197,8 +189,6 @@ impl FeatureFlags { self.stream_logs.store(stream_logs, Ordering::Relaxed); self.map_enobufs_to_wouldblock .store(map_enobufs_to_wouldblock, Ordering::Relaxed); - self.fail_handshake_on_decryption_errors - .store(fail_handshake_on_decryption_errors, Ordering::Relaxed); } fn icmp_unreachable_instead_of_nat64(&self) -> bool { @@ -217,11 +207,6 @@ impl FeatureFlags { fn map_enobufs_to_wouldblock(&self) -> bool { self.map_enobufs_to_wouldblock.load(Ordering::Relaxed) } - - fn fail_handshake_on_decryption_errors(&self) -> bool { - self.fail_handshake_on_decryption_errors - .load(Ordering::Relaxed) - } } fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context { @@ -232,7 +217,6 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context DropLlmnrNxdomainResponses { result: bool }, StreamLogs { result: bool }, MapENOBUFSToWouldBlock { result: bool }, - FailHandshakeOnDecryptionErrors { result: bool }, } // Exhaustive destruction so we don't forget to update this when we add a flag. @@ -241,7 +225,6 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context drop_llmnr_nxdomain_responses, stream_logs, map_enobufs_to_wouldblock, - fail_handshake_on_decryption_errors, } = flags; let value = serde_json::json!({ @@ -252,7 +235,6 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context SentryFlag::DropLlmnrNxdomainResponses { result: drop_llmnr_nxdomain_responses }, SentryFlag::StreamLogs { result: stream_logs }, SentryFlag::MapENOBUFSToWouldBlock { result: map_enobufs_to_wouldblock }, - SentryFlag::FailHandshakeOnDecryptionErrors { result: fail_handshake_on_decryption_errors }, ] }); diff --git a/website/src/components/Changelog/Android.tsx b/website/src/components/Changelog/Android.tsx index e69f30986..f0e164fb0 100644 --- a/website/src/components/Changelog/Android.tsx +++ b/website/src/components/Changelog/Android.tsx @@ -25,6 +25,10 @@ export default function Android() { Fixes an issue where Firezone failed to sign-in on systems with non-ASCII characters in their kernel build name. + + Fixes an issue where connections would sometimes take up to 90s to + establish. + diff --git a/website/src/components/Changelog/Apple.tsx b/website/src/components/Changelog/Apple.tsx index b6de6e506..d25091ad7 100644 --- a/website/src/components/Changelog/Apple.tsx +++ b/website/src/components/Changelog/Apple.tsx @@ -24,7 +24,12 @@ export default function Apple() { return ( {/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */} - + + + Fixes an issue where connections would sometimes take up to 90s to + establish. + + Fixes an issue where certain log files would not be recreated after diff --git a/website/src/components/Changelog/GUI.tsx b/website/src/components/Changelog/GUI.tsx index 300fd106a..23d722220 100644 --- a/website/src/components/Changelog/GUI.tsx +++ b/website/src/components/Changelog/GUI.tsx @@ -10,7 +10,12 @@ export default function GUI({ os }: { os: OS }) { return ( {/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */} - + + + Fixes an issue where connections would sometimes take up to 90s to + establish. + + Fixes a rare crash during sign-in. diff --git a/website/src/components/Changelog/Gateway.tsx b/website/src/components/Changelog/Gateway.tsx index 75be45c86..e4293b784 100644 --- a/website/src/components/Changelog/Gateway.tsx +++ b/website/src/components/Changelog/Gateway.tsx @@ -34,6 +34,10 @@ export default function Gateway() { Adds support for translating Time-Exceeded ICMP errors in the DNS resource NAT, allowing `tracepath` to work through a Firezone tunnel. + + Fixes an issue where connections would sometimes take up to 90s to + establish. + diff --git a/website/src/components/Changelog/Headless.tsx b/website/src/components/Changelog/Headless.tsx index 57e5933ca..5e3847109 100644 --- a/website/src/components/Changelog/Headless.tsx +++ b/website/src/components/Changelog/Headless.tsx @@ -9,7 +9,12 @@ export default function Headless({ os }: { os: OS }) { return ( {/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */} - + + + Fixes an issue where connections would sometimes take up to 90s to + establish. + + Fixes an issue where connections would fail to establish if both