From f5425ac8e4422f5ffb3d659cf98ec17f2b3feedd Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Mon, 14 Jul 2025 15:22:23 +0200 Subject: [PATCH] fix(snownet): fail connection on handshake decryption errors (#9850) As per the WireGuard paper, `boringtun` tries to handshake with the remote peer for 90s before it gives up. This timeout is important because when a session is discarded due to e.g. missing replies, WireGuard attempts to handshake a new session. Without this timeout, we would then try to handshake a session forever. Unfortunately, `boringtun` does not distinguish a missing handshake response from a bad one. Decryption errors whilst decoding a handshake response are simply passed up to the upper layer, in our case `snownet`. I am not sure how we can actually fail to decrypt a handshake but the pattern we are seeing in customer logs is that this happens over and over again, so there is no point in having `boringtun` retry the handshake. Therefore, we immediately fail the connection when this happens. Failed connections are immediately removed, triggering the client send a new connection-intent to the portal. Such a new connection intent will then sync-up the state between Client and Gateway so both of them use the most recent public key. Resolves: #9845 --- rust/Cargo.lock | 1 + rust/connlib/snownet/Cargo.toml | 1 + rust/connlib/snownet/src/lib.rs | 7 +++++++ rust/connlib/snownet/src/node.rs | 12 ++++++++++++ rust/telemetry/src/feature_flags.rs | 18 ++++++++++++++++++ 5 files changed, 39 insertions(+) diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 342ae5420..bc412e9e7 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -6826,6 +6826,7 @@ dependencies = [ "bytes", "derive_more 2.0.1", "firezone-logging", + "firezone-telemetry", "hex", "hex-display", "ip-packet", diff --git a/rust/connlib/snownet/Cargo.toml b/rust/connlib/snownet/Cargo.toml index 7672fe37c..f358888f0 100644 --- a/rust/connlib/snownet/Cargo.toml +++ b/rust/connlib/snownet/Cargo.toml @@ -12,6 +12,7 @@ bytecodec = { workspace = true } bytes = { workspace = true } derive_more = { workspace = true, features = ["debug"] } firezone-logging = { workspace = true } +firezone-telemetry = { workspace = true } hex = { workspace = true } hex-display = { workspace = true } ip-packet = { workspace = true } diff --git a/rust/connlib/snownet/src/lib.rs b/rust/connlib/snownet/src/lib.rs index 2b0fadedd..0257c75bc 100644 --- a/rust/connlib/snownet/src/lib.rs +++ b/rust/connlib/snownet/src/lib.rs @@ -24,3 +24,10 @@ pub use stats::{ConnectionStats, NodeStats}; pub fn is_wireguard(payload: &[u8]) -> bool { boringtun::noise::Tunn::parse_incoming_packet(payload).is_ok() } + +pub(crate) fn is_handshake(payload: &[u8]) -> bool { + use boringtun::noise::Packet; + + boringtun::noise::Tunn::parse_incoming_packet(payload) + .is_ok_and(|p| matches!(p, Packet::HandshakeInit(_) | Packet::HandshakeResponse(_))) +} diff --git a/rust/connlib/snownet/src/node.rs b/rust/connlib/snownet/src/node.rs index 9e353bb4e..11b23435e 100644 --- a/rust/connlib/snownet/src/node.rs +++ b/rust/connlib/snownet/src/node.rs @@ -11,6 +11,7 @@ use boringtun::{noise::rate_limiter::RateLimiter, x25519::StaticSecret}; use bufferpool::{Buffer, BufferPool}; use core::fmt; use firezone_logging::err_with_src; +use firezone_telemetry::{analytics, feature_flags}; use hex_display::HexDisplayExt; use ip_packet::{ConvertibleIpv4Packet, ConvertibleIpv6Packet, IpPacket, IpPacketBuf}; use itertools::Itertools; @@ -2178,6 +2179,17 @@ where .decapsulate_at(Some(src), packet, ip_packet.buf(), now) { TunnResult::Done => ControlFlow::Break(Ok(())), + TunnResult::Err(e @ WireGuardError::InvalidAeadTag) + if crate::is_handshake(packet) + && feature_flags::fail_handshake_on_decryption_errors() => + { + analytics::feature_flag_called("fail-handshake-on-decryption-errors"); + + tracing::info!("Connection handshake failed ({e})"); + self.state = ConnectionState::Failed; + + ControlFlow::Break(Ok(())) // Return `Ok` because we are already logging the error. + } TunnResult::Err(e) => ControlFlow::Break(Err(anyhow::Error::new(e))), // For WriteToTunnel{V4,V6}, boringtun returns the source IP of the packet that was tunneled to us. diff --git a/rust/telemetry/src/feature_flags.rs b/rust/telemetry/src/feature_flags.rs index e3b9be0a4..ff4190cc6 100644 --- a/rust/telemetry/src/feature_flags.rs +++ b/rust/telemetry/src/feature_flags.rs @@ -43,6 +43,10 @@ pub fn export_metrics() -> bool { false // Placeholder until we actually deploy an OTEL collector. } +pub fn fail_handshake_on_decryption_errors() -> bool { + FEATURE_FLAGS.fail_handshake_on_decryption_errors() +} + pub(crate) async fn evaluate_now(user_id: String, env: Env) { if user_id.is_empty() { return; @@ -156,6 +160,8 @@ struct FeatureFlagsResponse { stream_logs: bool, #[serde(default)] map_enobufs_to_wouldblock: bool, + #[serde(default)] + fail_handshake_on_decryption_errors: bool, } #[derive(Debug, Default)] @@ -164,6 +170,7 @@ struct FeatureFlags { drop_llmnr_nxdomain_responses: AtomicBool, stream_logs: AtomicBool, map_enobufs_to_wouldblock: AtomicBool, + fail_handshake_on_decryption_errors: AtomicBool, } /// Accessors to the actual feature flags. @@ -180,6 +187,7 @@ impl FeatureFlags { drop_llmnr_nxdomain_responses, stream_logs, map_enobufs_to_wouldblock, + fail_handshake_on_decryption_errors, }: FeatureFlagsResponse, ) { self.icmp_unreachable_instead_of_nat64 @@ -189,6 +197,8 @@ impl FeatureFlags { self.stream_logs.store(stream_logs, Ordering::Relaxed); self.map_enobufs_to_wouldblock .store(map_enobufs_to_wouldblock, Ordering::Relaxed); + self.fail_handshake_on_decryption_errors + .store(fail_handshake_on_decryption_errors, Ordering::Relaxed); } fn icmp_unreachable_instead_of_nat64(&self) -> bool { @@ -207,6 +217,11 @@ impl FeatureFlags { fn map_enobufs_to_wouldblock(&self) -> bool { self.map_enobufs_to_wouldblock.load(Ordering::Relaxed) } + + fn fail_handshake_on_decryption_errors(&self) -> bool { + self.fail_handshake_on_decryption_errors + .load(Ordering::Relaxed) + } } fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context { @@ -217,6 +232,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context DropLlmnrNxdomainResponses { result: bool }, StreamLogs { result: bool }, MapENOBUFSToWouldBlock { result: bool }, + FailHandshakeOnDecryptionErrors { result: bool }, } // Exhaustive destruction so we don't forget to update this when we add a flag. @@ -225,6 +241,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context drop_llmnr_nxdomain_responses, stream_logs, map_enobufs_to_wouldblock, + fail_handshake_on_decryption_errors, } = flags; let value = serde_json::json!({ @@ -235,6 +252,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context SentryFlag::DropLlmnrNxdomainResponses { result: drop_llmnr_nxdomain_responses }, SentryFlag::StreamLogs { result: stream_logs }, SentryFlag::MapENOBUFSToWouldBlock { result: map_enobufs_to_wouldblock }, + SentryFlag::FailHandshakeOnDecryptionErrors { result: fail_handshake_on_decryption_errors }, ] });