mirror of
https://github.com/outbackdingo/firezone.git
synced 2026-01-27 10:18:54 +00:00
feat(snownet): reduce rekey-attempt-time to 15s (#9891)
From Sentry reports and user-submitted logs, we know that it is possible for Client and Gateway to de-sync in regards to what each other's public key is. In such a scenario, ICE will succeed to make a connection but `boringtun` will fail to handshake a tunnel. By default, `boringtun` tries for 90s to handshake a session before it gives up and expires it. In Firezone, the ICE agent takes care of establishing connectivity whereas `boringtun` itself just encrypts and decrypts packets. As such, if ICE is working, we know that packets aren't getting lost but instead, there must be some other issue as to why we cannot establish a session. To improve the UX in these error cases, we reduce the rekey-attempt-time to 15s. This roughly matches our ICE timeout. Those 15s count from the moment we send the first handshake which is just after ICE completes. Thus we can be sure that after at most 15s, we either have a working WireGuard session or the connection gets cleaned up. Related: #9890 Related: #9850
This commit is contained in:
2
rust/Cargo.lock
generated
2
rust/Cargo.lock
generated
@@ -951,7 +951,7 @@ dependencies = [
|
||||
[[package]]
|
||||
name = "boringtun"
|
||||
version = "0.6.1"
|
||||
source = "git+https://github.com/firezone/boringtun?branch=master#5b1892f0616f97d82599b6e324356aa9ea629c33"
|
||||
source = "git+https://github.com/firezone/boringtun?branch=master#3d5df9c2a6f55424e02671374f835cc7db1d7a44"
|
||||
dependencies = [
|
||||
"aead",
|
||||
"base64 0.22.1",
|
||||
|
||||
@@ -11,7 +11,6 @@ use boringtun::{noise::rate_limiter::RateLimiter, x25519::StaticSecret};
|
||||
use bufferpool::{Buffer, BufferPool};
|
||||
use core::fmt;
|
||||
use firezone_logging::err_with_src;
|
||||
use firezone_telemetry::{analytics, feature_flags};
|
||||
use hex_display::HexDisplayExt;
|
||||
use ip_packet::{ConvertibleIpv4Packet, ConvertibleIpv6Packet, IpPacket, IpPacketBuf};
|
||||
use itertools::Itertools;
|
||||
@@ -747,18 +746,31 @@ where
|
||||
tracing::warn!("No TURN servers connected; connection may fail to establish");
|
||||
}
|
||||
|
||||
let mut tunnel = Tunn::new_at(
|
||||
self.private_key.clone(),
|
||||
remote,
|
||||
Some(key),
|
||||
None,
|
||||
self.index.next(),
|
||||
Some(self.rate_limiter.clone()),
|
||||
self.rng.next_u64(),
|
||||
now,
|
||||
);
|
||||
// By default, boringtun has a rekey attempt time of 90(!) seconds.
|
||||
// In case of a state de-sync or other issues, this means we try for
|
||||
// 90s to make a handshake, all whilst our ICE layer thinks the connection
|
||||
// is working perfectly fine.
|
||||
// This results in a bad UX as the user has to essentially wait for 90s
|
||||
// before Firezone can fix the state and make a new connection.
|
||||
//
|
||||
// By aligning the rekey-attempt-time roughly with our ICE timeout, we ensure
|
||||
// that even if the hole-punch was successful, it will take at most 15s
|
||||
// until we have a WireGuard tunnel to send packets into.
|
||||
tunnel.set_rekey_attempt_time(Duration::from_secs(15));
|
||||
|
||||
Connection {
|
||||
agent,
|
||||
tunnel: Tunn::new_at(
|
||||
self.private_key.clone(),
|
||||
remote,
|
||||
Some(key),
|
||||
None,
|
||||
self.index.next(),
|
||||
Some(self.rate_limiter.clone()),
|
||||
self.rng.next_u64(),
|
||||
now,
|
||||
),
|
||||
tunnel,
|
||||
next_wg_timer_update: now,
|
||||
stats: Default::default(),
|
||||
buffer: vec![0; ip_packet::MAX_FZ_PAYLOAD],
|
||||
@@ -2214,16 +2226,8 @@ where
|
||||
.decapsulate_at(Some(src), packet, ip_packet.buf(), now)
|
||||
{
|
||||
TunnResult::Done => ControlFlow::Break(Ok(())),
|
||||
TunnResult::Err(e @ WireGuardError::InvalidAeadTag)
|
||||
if crate::is_handshake(packet)
|
||||
&& feature_flags::fail_handshake_on_decryption_errors() =>
|
||||
{
|
||||
analytics::feature_flag_called("fail-handshake-on-decryption-errors");
|
||||
|
||||
tracing::info!("Connection handshake failed ({e})");
|
||||
self.state = ConnectionState::Failed;
|
||||
|
||||
ControlFlow::Break(Ok(())) // Return `Ok` because we are already logging the error.
|
||||
TunnResult::Err(e) if crate::is_handshake(packet) => {
|
||||
ControlFlow::Break(Err(anyhow::Error::new(e).context("handshake packet")))
|
||||
}
|
||||
TunnResult::Err(e) => ControlFlow::Break(Err(anyhow::Error::new(e))),
|
||||
|
||||
|
||||
@@ -43,10 +43,6 @@ pub fn export_metrics() -> bool {
|
||||
false // Placeholder until we actually deploy an OTEL collector.
|
||||
}
|
||||
|
||||
pub fn fail_handshake_on_decryption_errors() -> bool {
|
||||
FEATURE_FLAGS.fail_handshake_on_decryption_errors()
|
||||
}
|
||||
|
||||
pub(crate) async fn evaluate_now(user_id: String, env: Env) {
|
||||
if user_id.is_empty() {
|
||||
return;
|
||||
@@ -160,8 +156,6 @@ struct FeatureFlagsResponse {
|
||||
stream_logs: bool,
|
||||
#[serde(default)]
|
||||
map_enobufs_to_wouldblock: bool,
|
||||
#[serde(default)]
|
||||
fail_handshake_on_decryption_errors: bool,
|
||||
}
|
||||
|
||||
#[derive(Debug, Default)]
|
||||
@@ -170,7 +164,6 @@ struct FeatureFlags {
|
||||
drop_llmnr_nxdomain_responses: AtomicBool,
|
||||
stream_logs: AtomicBool,
|
||||
map_enobufs_to_wouldblock: AtomicBool,
|
||||
fail_handshake_on_decryption_errors: AtomicBool,
|
||||
}
|
||||
|
||||
/// Accessors to the actual feature flags.
|
||||
@@ -187,7 +180,6 @@ impl FeatureFlags {
|
||||
drop_llmnr_nxdomain_responses,
|
||||
stream_logs,
|
||||
map_enobufs_to_wouldblock,
|
||||
fail_handshake_on_decryption_errors,
|
||||
}: FeatureFlagsResponse,
|
||||
) {
|
||||
self.icmp_unreachable_instead_of_nat64
|
||||
@@ -197,8 +189,6 @@ impl FeatureFlags {
|
||||
self.stream_logs.store(stream_logs, Ordering::Relaxed);
|
||||
self.map_enobufs_to_wouldblock
|
||||
.store(map_enobufs_to_wouldblock, Ordering::Relaxed);
|
||||
self.fail_handshake_on_decryption_errors
|
||||
.store(fail_handshake_on_decryption_errors, Ordering::Relaxed);
|
||||
}
|
||||
|
||||
fn icmp_unreachable_instead_of_nat64(&self) -> bool {
|
||||
@@ -217,11 +207,6 @@ impl FeatureFlags {
|
||||
fn map_enobufs_to_wouldblock(&self) -> bool {
|
||||
self.map_enobufs_to_wouldblock.load(Ordering::Relaxed)
|
||||
}
|
||||
|
||||
fn fail_handshake_on_decryption_errors(&self) -> bool {
|
||||
self.fail_handshake_on_decryption_errors
|
||||
.load(Ordering::Relaxed)
|
||||
}
|
||||
}
|
||||
|
||||
fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context {
|
||||
@@ -232,7 +217,6 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context
|
||||
DropLlmnrNxdomainResponses { result: bool },
|
||||
StreamLogs { result: bool },
|
||||
MapENOBUFSToWouldBlock { result: bool },
|
||||
FailHandshakeOnDecryptionErrors { result: bool },
|
||||
}
|
||||
|
||||
// Exhaustive destruction so we don't forget to update this when we add a flag.
|
||||
@@ -241,7 +225,6 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context
|
||||
drop_llmnr_nxdomain_responses,
|
||||
stream_logs,
|
||||
map_enobufs_to_wouldblock,
|
||||
fail_handshake_on_decryption_errors,
|
||||
} = flags;
|
||||
|
||||
let value = serde_json::json!({
|
||||
@@ -252,7 +235,6 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context
|
||||
SentryFlag::DropLlmnrNxdomainResponses { result: drop_llmnr_nxdomain_responses },
|
||||
SentryFlag::StreamLogs { result: stream_logs },
|
||||
SentryFlag::MapENOBUFSToWouldBlock { result: map_enobufs_to_wouldblock },
|
||||
SentryFlag::FailHandshakeOnDecryptionErrors { result: fail_handshake_on_decryption_errors },
|
||||
]
|
||||
});
|
||||
|
||||
|
||||
@@ -25,6 +25,10 @@ export default function Android() {
|
||||
Fixes an issue where Firezone failed to sign-in on systems with
|
||||
non-ASCII characters in their kernel build name.
|
||||
</ChangeItem>
|
||||
<ChangeItem pull="9891">
|
||||
Fixes an issue where connections would sometimes take up to 90s to
|
||||
establish.
|
||||
</ChangeItem>
|
||||
</Unreleased>
|
||||
<Entry version="1.5.2" date={new Date("2025-06-30")}>
|
||||
<ChangeItem pull="9621">
|
||||
|
||||
@@ -24,7 +24,12 @@ export default function Apple() {
|
||||
return (
|
||||
<Entries downloadLinks={downloadLinks} title="macOS / iOS">
|
||||
{/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */}
|
||||
<Unreleased></Unreleased>
|
||||
<Unreleased>
|
||||
<ChangeItem pull="9891">
|
||||
Fixes an issue where connections would sometimes take up to 90s to
|
||||
establish.
|
||||
</ChangeItem>
|
||||
</Unreleased>
|
||||
<Entry version="1.5.4" date={new Date("2025-07-11")}>
|
||||
<ChangeItem pull="9597">
|
||||
Fixes an issue where certain log files would not be recreated after
|
||||
|
||||
@@ -10,7 +10,12 @@ export default function GUI({ os }: { os: OS }) {
|
||||
return (
|
||||
<Entries downloadLinks={downloadLinks(os)} title={title(os)}>
|
||||
{/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */}
|
||||
<Unreleased></Unreleased>
|
||||
<Unreleased>
|
||||
<ChangeItem pull="9891">
|
||||
Fixes an issue where connections would sometimes take up to 90s to
|
||||
establish.
|
||||
</ChangeItem>
|
||||
</Unreleased>
|
||||
<Entry version="1.5.5" date={new Date("2025-07-09")}>
|
||||
<ChangeItem pull="9779">Fixes a rare crash during sign-in.</ChangeItem>
|
||||
<ChangeItem pull="9725">
|
||||
|
||||
@@ -34,6 +34,10 @@ export default function Gateway() {
|
||||
Adds support for translating Time-Exceeded ICMP errors in the DNS
|
||||
resource NAT, allowing `tracepath` to work through a Firezone tunnel.
|
||||
</ChangeItem>
|
||||
<ChangeItem pull="9891">
|
||||
Fixes an issue where connections would sometimes take up to 90s to
|
||||
establish.
|
||||
</ChangeItem>
|
||||
</Unreleased>
|
||||
<Entry version="1.4.12" date={new Date("2025-06-30")}>
|
||||
<ChangeItem pull="9657">
|
||||
|
||||
@@ -9,7 +9,12 @@ export default function Headless({ os }: { os: OS }) {
|
||||
return (
|
||||
<Entries downloadLinks={downloadLinks(os)} title={title(os)}>
|
||||
{/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */}
|
||||
<Unreleased></Unreleased>
|
||||
<Unreleased>
|
||||
<ChangeItem pull="9891">
|
||||
Fixes an issue where connections would sometimes take up to 90s to
|
||||
establish.
|
||||
</ChangeItem>
|
||||
</Unreleased>
|
||||
<Entry version="1.5.1" date={new Date("2025-07-04")}>
|
||||
<ChangeItem pull="9564">
|
||||
Fixes an issue where connections would fail to establish if both
|
||||
|
||||
Reference in New Issue
Block a user