fix(snownet): fail connection on handshake decryption errors (#9850)

As per the WireGuard paper, `boringtun` tries to handshake with the
remote peer for 90s before it gives up. This timeout is important
because when a session is discarded due to e.g. missing replies,
WireGuard attempts to handshake a new session. Without this timeout, we
would then try to handshake a session forever.

Unfortunately, `boringtun` does not distinguish a missing handshake
response from a bad one. Decryption errors whilst decoding a handshake
response are simply passed up to the upper layer, in our case `snownet`.

I am not sure how we can actually fail to decrypt a handshake but the
pattern we are seeing in customer logs is that this happens over and
over again, so there is no point in having `boringtun` retry the
handshake. Therefore, we immediately fail the connection when this
happens.

Failed connections are immediately removed, triggering the client send a
new connection-intent to the portal. Such a new connection intent will
then sync-up the state between Client and Gateway so both of them use
the most recent public key.

Resolves: #9845
This commit is contained in:
Thomas Eizinger
2025-07-14 15:22:23 +02:00
committed by GitHub
parent cecca37073
commit f5425ac8e4
5 changed files with 39 additions and 0 deletions

1
rust/Cargo.lock generated
View File

@@ -6826,6 +6826,7 @@ dependencies = [
"bytes",
"derive_more 2.0.1",
"firezone-logging",
"firezone-telemetry",
"hex",
"hex-display",
"ip-packet",

View File

@@ -12,6 +12,7 @@ bytecodec = { workspace = true }
bytes = { workspace = true }
derive_more = { workspace = true, features = ["debug"] }
firezone-logging = { workspace = true }
firezone-telemetry = { workspace = true }
hex = { workspace = true }
hex-display = { workspace = true }
ip-packet = { workspace = true }

View File

@@ -24,3 +24,10 @@ pub use stats::{ConnectionStats, NodeStats};
pub fn is_wireguard(payload: &[u8]) -> bool {
boringtun::noise::Tunn::parse_incoming_packet(payload).is_ok()
}
pub(crate) fn is_handshake(payload: &[u8]) -> bool {
use boringtun::noise::Packet;
boringtun::noise::Tunn::parse_incoming_packet(payload)
.is_ok_and(|p| matches!(p, Packet::HandshakeInit(_) | Packet::HandshakeResponse(_)))
}

View File

@@ -11,6 +11,7 @@ use boringtun::{noise::rate_limiter::RateLimiter, x25519::StaticSecret};
use bufferpool::{Buffer, BufferPool};
use core::fmt;
use firezone_logging::err_with_src;
use firezone_telemetry::{analytics, feature_flags};
use hex_display::HexDisplayExt;
use ip_packet::{ConvertibleIpv4Packet, ConvertibleIpv6Packet, IpPacket, IpPacketBuf};
use itertools::Itertools;
@@ -2178,6 +2179,17 @@ where
.decapsulate_at(Some(src), packet, ip_packet.buf(), now)
{
TunnResult::Done => ControlFlow::Break(Ok(())),
TunnResult::Err(e @ WireGuardError::InvalidAeadTag)
if crate::is_handshake(packet)
&& feature_flags::fail_handshake_on_decryption_errors() =>
{
analytics::feature_flag_called("fail-handshake-on-decryption-errors");
tracing::info!("Connection handshake failed ({e})");
self.state = ConnectionState::Failed;
ControlFlow::Break(Ok(())) // Return `Ok` because we are already logging the error.
}
TunnResult::Err(e) => ControlFlow::Break(Err(anyhow::Error::new(e))),
// For WriteToTunnel{V4,V6}, boringtun returns the source IP of the packet that was tunneled to us.

View File

@@ -43,6 +43,10 @@ pub fn export_metrics() -> bool {
false // Placeholder until we actually deploy an OTEL collector.
}
pub fn fail_handshake_on_decryption_errors() -> bool {
FEATURE_FLAGS.fail_handshake_on_decryption_errors()
}
pub(crate) async fn evaluate_now(user_id: String, env: Env) {
if user_id.is_empty() {
return;
@@ -156,6 +160,8 @@ struct FeatureFlagsResponse {
stream_logs: bool,
#[serde(default)]
map_enobufs_to_wouldblock: bool,
#[serde(default)]
fail_handshake_on_decryption_errors: bool,
}
#[derive(Debug, Default)]
@@ -164,6 +170,7 @@ struct FeatureFlags {
drop_llmnr_nxdomain_responses: AtomicBool,
stream_logs: AtomicBool,
map_enobufs_to_wouldblock: AtomicBool,
fail_handshake_on_decryption_errors: AtomicBool,
}
/// Accessors to the actual feature flags.
@@ -180,6 +187,7 @@ impl FeatureFlags {
drop_llmnr_nxdomain_responses,
stream_logs,
map_enobufs_to_wouldblock,
fail_handshake_on_decryption_errors,
}: FeatureFlagsResponse,
) {
self.icmp_unreachable_instead_of_nat64
@@ -189,6 +197,8 @@ impl FeatureFlags {
self.stream_logs.store(stream_logs, Ordering::Relaxed);
self.map_enobufs_to_wouldblock
.store(map_enobufs_to_wouldblock, Ordering::Relaxed);
self.fail_handshake_on_decryption_errors
.store(fail_handshake_on_decryption_errors, Ordering::Relaxed);
}
fn icmp_unreachable_instead_of_nat64(&self) -> bool {
@@ -207,6 +217,11 @@ impl FeatureFlags {
fn map_enobufs_to_wouldblock(&self) -> bool {
self.map_enobufs_to_wouldblock.load(Ordering::Relaxed)
}
fn fail_handshake_on_decryption_errors(&self) -> bool {
self.fail_handshake_on_decryption_errors
.load(Ordering::Relaxed)
}
}
fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context {
@@ -217,6 +232,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context
DropLlmnrNxdomainResponses { result: bool },
StreamLogs { result: bool },
MapENOBUFSToWouldBlock { result: bool },
FailHandshakeOnDecryptionErrors { result: bool },
}
// Exhaustive destruction so we don't forget to update this when we add a flag.
@@ -225,6 +241,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context
drop_llmnr_nxdomain_responses,
stream_logs,
map_enobufs_to_wouldblock,
fail_handshake_on_decryption_errors,
} = flags;
let value = serde_json::json!({
@@ -235,6 +252,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context
SentryFlag::DropLlmnrNxdomainResponses { result: drop_llmnr_nxdomain_responses },
SentryFlag::StreamLogs { result: stream_logs },
SentryFlag::MapENOBUFSToWouldBlock { result: map_enobufs_to_wouldblock },
SentryFlag::FailHandshakeOnDecryptionErrors { result: fail_handshake_on_decryption_errors },
]
});