feat(snownet): reduce rekey-attempt-time to 15s (#9891)

From Sentry reports and user-submitted logs, we know that it is possible
for Client and Gateway to de-sync in regards to what each other's public
key is. In such a scenario, ICE will succeed to make a connection but
`boringtun` will fail to handshake a tunnel. By default, `boringtun`
tries for 90s to handshake a session before it gives up and expires it.

In Firezone, the ICE agent takes care of establishing connectivity
whereas `boringtun` itself just encrypts and decrypts packets. As such,
if ICE is working, we know that packets aren't getting lost but instead,
there must be some other issue as to why we cannot establish a session.

To improve the UX in these error cases, we reduce the rekey-attempt-time
to 15s. This roughly matches our ICE timeout. Those 15s count from the
moment we send the first handshake which is just after ICE completes.
Thus we can be sure that after at most 15s, we either have a working
WireGuard session or the connection gets cleaned up.

Related: #9890
Related: #9850
This commit is contained in:
Thomas Eizinger
2025-07-17 08:50:31 +08:00
committed by GitHub
parent 72fbe306b6
commit a6ffdd2654
8 changed files with 52 additions and 43 deletions

2
rust/Cargo.lock generated
View File

@@ -951,7 +951,7 @@ dependencies = [
[[package]]
name = "boringtun"
version = "0.6.1"
source = "git+https://github.com/firezone/boringtun?branch=master#5b1892f0616f97d82599b6e324356aa9ea629c33"
source = "git+https://github.com/firezone/boringtun?branch=master#3d5df9c2a6f55424e02671374f835cc7db1d7a44"
dependencies = [
"aead",
"base64 0.22.1",

View File

@@ -11,7 +11,6 @@ use boringtun::{noise::rate_limiter::RateLimiter, x25519::StaticSecret};
use bufferpool::{Buffer, BufferPool};
use core::fmt;
use firezone_logging::err_with_src;
use firezone_telemetry::{analytics, feature_flags};
use hex_display::HexDisplayExt;
use ip_packet::{ConvertibleIpv4Packet, ConvertibleIpv6Packet, IpPacket, IpPacketBuf};
use itertools::Itertools;
@@ -747,18 +746,31 @@ where
tracing::warn!("No TURN servers connected; connection may fail to establish");
}
let mut tunnel = Tunn::new_at(
self.private_key.clone(),
remote,
Some(key),
None,
self.index.next(),
Some(self.rate_limiter.clone()),
self.rng.next_u64(),
now,
);
// By default, boringtun has a rekey attempt time of 90(!) seconds.
// In case of a state de-sync or other issues, this means we try for
// 90s to make a handshake, all whilst our ICE layer thinks the connection
// is working perfectly fine.
// This results in a bad UX as the user has to essentially wait for 90s
// before Firezone can fix the state and make a new connection.
//
// By aligning the rekey-attempt-time roughly with our ICE timeout, we ensure
// that even if the hole-punch was successful, it will take at most 15s
// until we have a WireGuard tunnel to send packets into.
tunnel.set_rekey_attempt_time(Duration::from_secs(15));
Connection {
agent,
tunnel: Tunn::new_at(
self.private_key.clone(),
remote,
Some(key),
None,
self.index.next(),
Some(self.rate_limiter.clone()),
self.rng.next_u64(),
now,
),
tunnel,
next_wg_timer_update: now,
stats: Default::default(),
buffer: vec![0; ip_packet::MAX_FZ_PAYLOAD],
@@ -2214,16 +2226,8 @@ where
.decapsulate_at(Some(src), packet, ip_packet.buf(), now)
{
TunnResult::Done => ControlFlow::Break(Ok(())),
TunnResult::Err(e @ WireGuardError::InvalidAeadTag)
if crate::is_handshake(packet)
&& feature_flags::fail_handshake_on_decryption_errors() =>
{
analytics::feature_flag_called("fail-handshake-on-decryption-errors");
tracing::info!("Connection handshake failed ({e})");
self.state = ConnectionState::Failed;
ControlFlow::Break(Ok(())) // Return `Ok` because we are already logging the error.
TunnResult::Err(e) if crate::is_handshake(packet) => {
ControlFlow::Break(Err(anyhow::Error::new(e).context("handshake packet")))
}
TunnResult::Err(e) => ControlFlow::Break(Err(anyhow::Error::new(e))),

View File

@@ -43,10 +43,6 @@ pub fn export_metrics() -> bool {
false // Placeholder until we actually deploy an OTEL collector.
}
pub fn fail_handshake_on_decryption_errors() -> bool {
FEATURE_FLAGS.fail_handshake_on_decryption_errors()
}
pub(crate) async fn evaluate_now(user_id: String, env: Env) {
if user_id.is_empty() {
return;
@@ -160,8 +156,6 @@ struct FeatureFlagsResponse {
stream_logs: bool,
#[serde(default)]
map_enobufs_to_wouldblock: bool,
#[serde(default)]
fail_handshake_on_decryption_errors: bool,
}
#[derive(Debug, Default)]
@@ -170,7 +164,6 @@ struct FeatureFlags {
drop_llmnr_nxdomain_responses: AtomicBool,
stream_logs: AtomicBool,
map_enobufs_to_wouldblock: AtomicBool,
fail_handshake_on_decryption_errors: AtomicBool,
}
/// Accessors to the actual feature flags.
@@ -187,7 +180,6 @@ impl FeatureFlags {
drop_llmnr_nxdomain_responses,
stream_logs,
map_enobufs_to_wouldblock,
fail_handshake_on_decryption_errors,
}: FeatureFlagsResponse,
) {
self.icmp_unreachable_instead_of_nat64
@@ -197,8 +189,6 @@ impl FeatureFlags {
self.stream_logs.store(stream_logs, Ordering::Relaxed);
self.map_enobufs_to_wouldblock
.store(map_enobufs_to_wouldblock, Ordering::Relaxed);
self.fail_handshake_on_decryption_errors
.store(fail_handshake_on_decryption_errors, Ordering::Relaxed);
}
fn icmp_unreachable_instead_of_nat64(&self) -> bool {
@@ -217,11 +207,6 @@ impl FeatureFlags {
fn map_enobufs_to_wouldblock(&self) -> bool {
self.map_enobufs_to_wouldblock.load(Ordering::Relaxed)
}
fn fail_handshake_on_decryption_errors(&self) -> bool {
self.fail_handshake_on_decryption_errors
.load(Ordering::Relaxed)
}
}
fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context {
@@ -232,7 +217,6 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context
DropLlmnrNxdomainResponses { result: bool },
StreamLogs { result: bool },
MapENOBUFSToWouldBlock { result: bool },
FailHandshakeOnDecryptionErrors { result: bool },
}
// Exhaustive destruction so we don't forget to update this when we add a flag.
@@ -241,7 +225,6 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context
drop_llmnr_nxdomain_responses,
stream_logs,
map_enobufs_to_wouldblock,
fail_handshake_on_decryption_errors,
} = flags;
let value = serde_json::json!({
@@ -252,7 +235,6 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context
SentryFlag::DropLlmnrNxdomainResponses { result: drop_llmnr_nxdomain_responses },
SentryFlag::StreamLogs { result: stream_logs },
SentryFlag::MapENOBUFSToWouldBlock { result: map_enobufs_to_wouldblock },
SentryFlag::FailHandshakeOnDecryptionErrors { result: fail_handshake_on_decryption_errors },
]
});

View File

@@ -25,6 +25,10 @@ export default function Android() {
Fixes an issue where Firezone failed to sign-in on systems with
non-ASCII characters in their kernel build name.
</ChangeItem>
<ChangeItem pull="9891">
Fixes an issue where connections would sometimes take up to 90s to
establish.
</ChangeItem>
</Unreleased>
<Entry version="1.5.2" date={new Date("2025-06-30")}>
<ChangeItem pull="9621">

View File

@@ -24,7 +24,12 @@ export default function Apple() {
return (
<Entries downloadLinks={downloadLinks} title="macOS / iOS">
{/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */}
<Unreleased></Unreleased>
<Unreleased>
<ChangeItem pull="9891">
Fixes an issue where connections would sometimes take up to 90s to
establish.
</ChangeItem>
</Unreleased>
<Entry version="1.5.4" date={new Date("2025-07-11")}>
<ChangeItem pull="9597">
Fixes an issue where certain log files would not be recreated after

View File

@@ -10,7 +10,12 @@ export default function GUI({ os }: { os: OS }) {
return (
<Entries downloadLinks={downloadLinks(os)} title={title(os)}>
{/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */}
<Unreleased></Unreleased>
<Unreleased>
<ChangeItem pull="9891">
Fixes an issue where connections would sometimes take up to 90s to
establish.
</ChangeItem>
</Unreleased>
<Entry version="1.5.5" date={new Date("2025-07-09")}>
<ChangeItem pull="9779">Fixes a rare crash during sign-in.</ChangeItem>
<ChangeItem pull="9725">

View File

@@ -34,6 +34,10 @@ export default function Gateway() {
Adds support for translating Time-Exceeded ICMP errors in the DNS
resource NAT, allowing `tracepath` to work through a Firezone tunnel.
</ChangeItem>
<ChangeItem pull="9891">
Fixes an issue where connections would sometimes take up to 90s to
establish.
</ChangeItem>
</Unreleased>
<Entry version="1.4.12" date={new Date("2025-06-30")}>
<ChangeItem pull="9657">

View File

@@ -9,7 +9,12 @@ export default function Headless({ os }: { os: OS }) {
return (
<Entries downloadLinks={downloadLinks(os)} title={title(os)}>
{/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */}
<Unreleased></Unreleased>
<Unreleased>
<ChangeItem pull="9891">
Fixes an issue where connections would sometimes take up to 90s to
establish.
</ChangeItem>
</Unreleased>
<Entry version="1.5.1" date={new Date("2025-07-04")}>
<ChangeItem pull="9564">
Fixes an issue where connections would fail to establish if both