feat(gateway): extend ICE timeout (#10887)

Currently, a `snownet` Client and Server always have the same ICE
timeout configuration. This doesn't necessarily have to be the case. A
Gateway cannot establish connections to a Client anyway and thus, we can
have much laxer requirements on when we detect that a Client has
disappeared (without saying "goodbye").

Extending the idle and default ICE timeout values should hopefully
reduce the number of false-positive disconnects that users may
experience where a Gateway cuts a connection because it believes the
Client is gone when in reality, perhaps a few STUN packets just got lost
or backed up.

Changing the ICE timeout exposes a few corner-cases in how we track and
use time within `snownet`. In particular, it is now obviously possible
for a Gateway to still retain the connection state of a Client whilst
the Client has long disconnected but now reconnects using the same ICE
credentials and private key.

Our proptests uncovered some state misalignment in that scenario due to
some remaining time impurity within `boringtun` (see
https://github.com/firezone/boringtun/pull/126 for details). In
addition, our idle state transitions needed to be updated to also take
into account candidate changes on both sides in order to achieve a
deterministic outcome.
This commit is contained in:
Thomas Eizinger
2025-11-19 14:02:13 +11:00
committed by GitHub
parent ccee476daa
commit 9b0ae92b29
9 changed files with 177 additions and 73 deletions

2
rust/Cargo.lock generated
View File

@@ -915,7 +915,7 @@ checksum = "119771309b95163ec7aaf79810da82f7cd0599c19722d48b9c03894dca833966"
[[package]]
name = "boringtun"
version = "0.6.1"
source = "git+https://github.com/firezone/boringtun?branch=master#069a483bb185422feaeddf37d3af62beb650803c"
source = "git+https://github.com/firezone/boringtun?branch=master#8300b7fe57e66c051d5632a5434c30010b523ea9"
dependencies = [
"aead",
"base64 0.22.1",

View File

@@ -60,33 +60,34 @@ pub struct Server {}
#[non_exhaustive]
pub struct Client {}
trait Mode {
enum RoleKind {
Client,
Server,
}
trait Role {
fn new() -> Self;
fn is_client(&self) -> bool;
fn is_server(&self) -> bool {
!self.is_client()
}
fn kind(&self) -> RoleKind;
}
impl Mode for Server {
fn is_client(&self) -> bool {
false
}
impl Role for Server {
fn new() -> Self {
Self {}
}
fn kind(&self) -> RoleKind {
RoleKind::Server
}
}
impl Mode for Client {
fn is_client(&self) -> bool {
true
}
impl Role for Client {
fn new() -> Self {
Self {}
}
fn kind(&self) -> RoleKind {
RoleKind::Client
}
}
/// A node within a `snownet` network maintains connections to several other nodes.
@@ -113,7 +114,7 @@ impl Mode for Client {
/// 3. Call [`Node::handle_timeout`] once that time is reached
///
/// A [`Node`] is generic over three things:
/// - `T`: The mode it is operating in, either [`Client`] or [`Server`].
/// - `T`: The role it is operating in, either [`Client`] or [`Server`].
/// - `TId`: The type to use for uniquely identifying connections.
/// - `RId`: The type to use for uniquely identifying relays.
///
@@ -138,22 +139,27 @@ pub struct Node<T, TId, RId> {
stats: NodeStats,
buffer_pool: BufferPool<Vec<u8>>,
mode: T,
role: T,
rng: StdRng,
/// The number of seconds since the UNIX epoch.
unix_ts: Duration,
/// The [`Instant`] at the time we read the UNIX epoch above.
unix_now: Instant,
}
#[derive(thiserror::Error, Debug)]
#[error("No TURN servers available")]
pub struct NoTurnServers {}
#[expect(private_bounds, reason = "We don't want `Mode` to be public API")]
#[expect(private_bounds, reason = "We don't want `Role` to be public API")]
impl<T, TId, RId> Node<T, TId, RId>
where
TId: Eq + Hash + Copy + Ord + fmt::Display,
RId: Copy + Eq + Hash + PartialEq + Ord + fmt::Debug + fmt::Display,
T: Mode,
T: Role,
{
pub fn new(seed: [u8; 32], now: Instant) -> Self {
pub fn new(seed: [u8; 32], now: Instant, unix_ts: Duration) -> Self {
let mut rng = StdRng::from_seed(seed);
let private_key = StaticSecret::random_from_rng(&mut rng);
let public_key = &(&private_key).into();
@@ -164,7 +170,7 @@ where
session_id: SessionId::new(*public_key),
private_key,
public_key: *public_key,
mode: T::new(),
role: T::new(),
index,
rate_limiter: Arc::new(RateLimiter::new_at(public_key, HANDSHAKE_RATE_LIMIT, now)),
buffered_transmits: VecDeque::default(),
@@ -174,6 +180,8 @@ where
connections: Default::default(),
stats: Default::default(),
buffer_pool: BufferPool::new(ip_packet::MAX_FZ_PAYLOAD, "snownet"),
unix_now: now,
unix_ts,
}
}
@@ -302,8 +310,10 @@ where
let selected_relay = self.sample_relay()?;
let mut agent = new_agent();
agent.set_controlling(self.mode.is_client());
let mut agent = match self.role.kind() {
RoleKind::Client => new_client_agent(),
RoleKind::Server => new_server_agent(),
};
agent.set_local_credentials(local_creds);
agent.set_remote_credentials(remote_creds);
@@ -404,7 +414,7 @@ where
#[tracing::instrument(level = "info", skip_all, fields(%cid))]
pub fn add_remote_candidate(&mut self, cid: TId, candidate: Candidate, now: Instant) {
let Some((agent, relay)) = self.connections.agent_mut(cid) else {
let Some((agent, maybe_state, relay)) = self.connections.agent_and_state_mut(cid) else {
tracing::debug!(ignored_candidate = %candidate, "Unknown connection");
return;
};
@@ -433,21 +443,21 @@ where
allocation.bind_channel(candidate.addr(), now);
let Ok(connection) = self.connections.get_established_mut(&cid, now) else {
return;
if let Some(state) = maybe_state {
// Make sure we move out of idle mode when we add new candidates.
state.on_candidate(cid, agent, now);
};
// Make sure we move out of idle mode when we add new candidates.
connection
.state
.on_candidate(cid, &mut connection.agent, now);
}
#[tracing::instrument(level = "info", skip_all, fields(%cid))]
pub fn remove_remote_candidate(&mut self, cid: TId, candidate: Candidate, now: Instant) {
if let Some((agent, _)) = self.connections.agent_mut(cid) {
if let Some((agent, maybe_state, _)) = self.connections.agent_and_state_mut(cid) {
agent.invalidate_candidate(&candidate);
agent.handle_timeout(now); // We may have invalidated the last candidate, ensure we check our nomination state.
if let Some(state) = maybe_state {
state.on_candidate(cid, agent, now);
}
}
}
@@ -501,7 +511,7 @@ where
) -> Result<Option<Transmit>> {
let conn = self.connections.get_established_mut(&cid, now)?;
if self.mode.is_server() && !conn.state.has_nominated_socket() {
if matches!(self.role.kind(), RoleKind::Server) && !conn.state.has_nominated_socket() {
tracing::debug!(
?packet,
"ICE is still in progress; dropping packet because server should not initiate WireGuard sessions"
@@ -574,7 +584,7 @@ where
pub fn handle_timeout(&mut self, now: Instant) {
self.allocations.handle_timeout(now);
self.allocations_drain_events();
self.allocations_drain_events(now);
for (id, connection) in self.connections.iter_established_mut() {
connection.handle_timeout(id, now, &mut self.allocations, &mut self.buffered_transmits);
@@ -601,6 +611,7 @@ where
&self.allocations,
&mut self.pending_events,
&mut self.rng,
now,
);
self.connections
.handle_timeout(&mut self.pending_events, now);
@@ -726,6 +737,8 @@ where
Some(self.rate_limiter.clone()),
self.rng.next_u64(),
now,
self.unix_now,
self.unix_ts,
);
// By default, boringtun has a rekey attempt time of 90(!) seconds.
// In case of a state de-sync or other issues, this means we try for
@@ -979,14 +992,24 @@ where
.map_break(|b| b.with_context(|| format!("cid={cid} length={}", packet.len())))
}
fn allocations_drain_events(&mut self) {
fn allocations_drain_events(&mut self, now: Instant) {
while let Some((rid, event)) = self.allocations.poll_event() {
tracing::trace!(%rid, ?event);
match event {
allocation::Event::New(candidate) => {
for (cid, agent) in self.connections.agents_by_relay_mut(rid) {
add_local_candidate(cid, agent, candidate.clone(), &mut self.pending_events)
for (cid, agent, maybe_state) in
self.connections.agents_and_state_by_relay_mut(rid)
{
add_local_candidate(
cid,
agent,
candidate.clone(),
&mut self.pending_events,
);
if let Some(state) = maybe_state {
state.on_candidate(cid, agent, now);
}
}
}
allocation::Event::Invalid(candidate) => {
@@ -1038,8 +1061,7 @@ where
tracing::info!("Replacing existing established connection");
};
let mut agent = new_agent();
agent.set_controlling(true);
let agent = new_client_agent();
let session_key = x25519::StaticSecret::random_from_rng(rand::thread_rng());
let ice_creds = agent.local_credentials();
@@ -1146,8 +1168,7 @@ where
tracing::info!("Replacing existing established connection");
};
let mut agent = new_agent();
agent.set_controlling(false);
let mut agent = new_server_agent();
agent.set_remote_credentials(IceCreds {
ufrag: offer.credentials.username,
pass: offer.credentials.password,
@@ -1628,7 +1649,7 @@ impl ConnectionState {
Self::Failed | Self::Connecting { .. } => return,
};
self.transition_to_connected(cid, peer_socket, agent, "new candidate", now);
self.transition_to_connected(cid, peer_socket, agent, "candidates changed", now);
}
fn on_outgoing<TId>(&mut self, cid: TId, agent: &mut IceAgent, packet: &IpPacket, now: Instant)
@@ -2380,22 +2401,39 @@ where
Some(transmit)
}
fn new_agent() -> IceAgent {
fn new_client_agent() -> IceAgent {
let mut agent = IceAgent::new();
agent.set_controlling(true);
agent.set_timing_advance(Duration::ZERO);
apply_default_stun_timings(&mut agent);
agent
}
fn new_server_agent() -> IceAgent {
let mut agent = IceAgent::new();
agent.set_controlling(false);
agent.set_timing_advance(Duration::ZERO);
apply_default_stun_timings(&mut agent);
agent
}
fn apply_default_stun_timings(agent: &mut IceAgent) {
agent.set_max_stun_retransmits(12);
agent.set_max_stun_rto(Duration::from_millis(1500));
let retrans = if agent.controlling() { 12 } else { 45 };
let max_stun_rto = if agent.controlling() { 1500 } else { 15_000 };
agent.set_max_stun_retransmits(retrans);
agent.set_max_stun_rto(Duration::from_millis(max_stun_rto));
agent.set_initial_stun_rto(Duration::from_millis(250))
}
fn apply_idle_stun_timings(agent: &mut IceAgent) {
agent.set_max_stun_retransmits(4);
let retrans = if agent.controlling() { 4 } else { 40 };
agent.set_max_stun_retransmits(retrans);
agent.set_max_stun_rto(Duration::from_secs(25));
agent.set_initial_stun_rto(Duration::from_secs(25));
}
@@ -2429,8 +2467,8 @@ mod tests {
use super::*;
#[test]
fn default_ice_timeout() {
let mut agent = IceAgent::new();
fn client_default_ice_timeout() {
let mut agent = new_client_agent();
apply_default_stun_timings(&mut agent);
@@ -2438,14 +2476,32 @@ mod tests {
}
#[test]
fn idle_ice_timeout() {
let mut agent = IceAgent::new();
fn client_idle_ice_timeout() {
let mut agent = new_client_agent();
apply_idle_stun_timings(&mut agent);
assert_eq!(agent.ice_timeout(), Duration::from_secs(100))
}
#[test]
fn server_default_ice_timeout() {
let mut agent = new_server_agent();
apply_default_stun_timings(&mut agent);
assert_eq!(agent.ice_timeout(), Duration::from_millis(600_750))
}
#[test]
fn server_idle_ice_timeout() {
let mut agent = new_server_agent();
apply_idle_stun_timings(&mut agent);
assert_eq!(agent.ice_timeout(), Duration::from_secs(1000))
}
#[test]
fn generates_correct_optimistic_candidates() {
let base = SocketAddr::V4(SocketAddrV4::new(Ipv4Addr::new(10, 0, 0, 1), 52625));

View File

@@ -13,7 +13,10 @@ use str0m::ice::IceAgent;
use crate::{
ConnectionStats, Event,
node::{Connection, InitialConnection, add_local_candidate, allocations::Allocations},
node::{
Connection, ConnectionState, InitialConnection, add_local_candidate,
allocations::Allocations,
},
};
pub struct Connections<TId, RId> {
@@ -102,6 +105,7 @@ where
allocations: &Allocations<RId>,
pending_events: &mut VecDeque<Event<TId>>,
rng: &mut impl Rng,
now: Instant,
) {
for (_, c) in self.iter_initial_mut() {
if allocations.contains(&c.relay) {
@@ -136,6 +140,7 @@ where
for candidate in allocation.current_relay_candidates() {
add_local_candidate(cid, &mut c.agent, candidate, pending_events);
c.state.on_candidate(cid, &mut c.agent, now);
}
}
}
@@ -169,28 +174,33 @@ where
existing
}
pub(crate) fn agent_mut(&mut self, id: TId) -> Option<(&mut IceAgent, RId)> {
let maybe_initial_connection = self.initial.get_mut(&id).map(|i| (&mut i.agent, i.relay));
pub(crate) fn agent_and_state_mut(
&mut self,
id: TId,
) -> Option<(&mut IceAgent, Option<&mut ConnectionState>, RId)> {
let maybe_initial_connection = self
.initial
.get_mut(&id)
.map(|i| (&mut i.agent, None, i.relay));
let maybe_established_connection = self
.established
.get_mut(&id)
.map(|c| (&mut c.agent, c.relay.id));
.map(|c| (&mut c.agent, Some(&mut c.state), c.relay.id));
maybe_initial_connection.or(maybe_established_connection)
}
pub(crate) fn agents_by_relay_mut(
pub(crate) fn agents_and_state_by_relay_mut(
&mut self,
id: RId,
) -> impl Iterator<Item = (TId, &mut IceAgent)> + '_ {
) -> impl Iterator<Item = (TId, &mut IceAgent, Option<&mut ConnectionState>)> + '_ {
let initial_connections = self
.initial
.iter_mut()
.filter_map(move |(cid, i)| (i.relay == id).then_some((*cid, &mut i.agent)));
let established_connections = self
.established
.iter_mut()
.filter_map(move |(cid, c)| (c.relay.id == id).then_some((*cid, &mut c.agent)));
.filter_map(move |(cid, i)| (i.relay == id).then_some((*cid, &mut i.agent, None)));
let established_connections = self.established.iter_mut().filter_map(move |(cid, c)| {
(c.relay.id == id).then_some((*cid, &mut c.agent, Some(&mut c.state)))
});
initial_connections.chain(established_connections)
}
@@ -522,6 +532,8 @@ mod tests {
None,
0,
Instant::now(),
Instant::now(),
Duration::ZERO,
),
remote_pub_key: PublicKey::from(rand::random::<[u8; 32]>()),
next_wg_timer_update: Instant::now(),

View File

@@ -188,6 +188,7 @@ impl ClientState {
records: BTreeSet<DnsResourceRecord>,
is_internet_resource_active: bool,
now: Instant,
unix_ts: Duration,
) -> Self {
Self {
resources_gateways: Default::default(),
@@ -198,7 +199,7 @@ impl ClientState {
buffered_events: Default::default(),
tun_config: Default::default(),
buffered_packets: Default::default(),
node: ClientNode::new(seed, now),
node: ClientNode::new(seed, now, unix_ts),
sites_status: Default::default(),
gateways_site: Default::default(),
stub_resolver: StubResolver::new(records),
@@ -2033,7 +2034,13 @@ mod tests {
impl ClientState {
pub fn for_test() -> ClientState {
ClientState::new(rand::random(), Default::default(), false, Instant::now())
ClientState::new(
rand::random(),
Default::default(),
false,
Instant::now(),
Duration::ZERO,
)
}
}

View File

@@ -73,10 +73,10 @@ impl DnsResourceNatEntry {
}
impl GatewayState {
pub(crate) fn new(seed: [u8; 32], now: Instant) -> Self {
pub(crate) fn new(seed: [u8; 32], now: Instant, unix_ts: Duration) -> Self {
Self {
peers: Default::default(),
node: ServerNode::new(seed, now),
node: ServerNode::new(seed, now, unix_ts),
next_expiry_resources_check: Default::default(),
buffered_events: VecDeque::default(),
buffered_transmits: VecDeque::default(),

View File

@@ -21,7 +21,7 @@ use std::{
net::{IpAddr, Ipv4Addr, Ipv6Addr, SocketAddr},
sync::Arc,
task::{Context, Poll, ready},
time::{Duration, Instant},
time::{Duration, Instant, SystemTime},
};
use tun::Tun;
@@ -124,6 +124,9 @@ impl ClientTunnel {
records,
is_internet_resource_active,
Instant::now(),
SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.expect("Should be able to compute UNIX timestamp"),
),
buffers: Buffers::default(),
packet_counter: opentelemetry::global::meter("connlib")
@@ -300,7 +303,13 @@ impl GatewayTunnel {
) -> Self {
Self {
io: Io::new(tcp_socket_factory, udp_socket_factory.clone(), nameservers),
role_state: GatewayState::new(rand::random(), Instant::now()),
role_state: GatewayState::new(
rand::random(),
Instant::now(),
SystemTime::now()
.duration_since(SystemTime::UNIX_EPOCH)
.expect("Should be able to compute UNIX timestamp"),
),
buffers: Buffers::default(),
packet_counter: opentelemetry::global::meter("connlib")
.u64_counter("system.network.packets")

View File

@@ -16,6 +16,7 @@ use crate::{
client::{CidrResource, DnsResource, InternetResource, Resource},
messages::Interface,
};
use chrono::{DateTime, Utc};
use connlib_model::{ClientId, GatewayId, RelayId, ResourceId, ResourceStatus, Site, SiteId};
use dns_types::{DomainName, Query, RecordData, RecordType};
use ip_network::{IpNetwork, Ipv4Network, Ipv6Network};
@@ -110,6 +111,7 @@ impl SimClient {
key: PrivateKey,
is_internet_resource_active: bool,
now: Instant,
utc_now: DateTime<Utc>,
) {
let dns_resource_records = self.dns_resource_record_cache.clone();
@@ -123,6 +125,10 @@ impl SimClient {
dns_resource_records,
is_internet_resource_active,
now,
utc_now
.signed_duration_since(DateTime::UNIX_EPOCH)
.to_std()
.unwrap(),
);
self.search_domain = None;
@@ -497,12 +503,16 @@ impl RefClient {
/// Initialize the [`ClientState`].
///
/// This simulates receiving the `init` message from the portal.
pub(crate) fn init(self, now: Instant) -> SimClient {
pub(crate) fn init(self, now: Instant, utc_now: DateTime<Utc>) -> SimClient {
let mut client_state = ClientState::new(
self.key.0,
Default::default(),
self.internet_resource_active,
now,
utc_now
.signed_duration_since(DateTime::UNIX_EPOCH)
.to_std()
.unwrap(),
); // Cheating a bit here by reusing the key as seed.
client_state.update_interface_config(Interface {
ipv4: self.tunnel_ip4,

View File

@@ -315,8 +315,16 @@ impl RefGateway {
id: GatewayId,
tcp_resources: BTreeSet<SocketAddr>,
now: Instant,
utc_now: DateTime<Utc>,
) -> SimGateway {
let mut sut = GatewayState::new(self.key.0, now); // Cheating a bit here by reusing the key as seed.
let mut sut = GatewayState::new(
self.key.0,
now,
utc_now
.signed_duration_since(DateTime::UNIX_EPOCH)
.to_std()
.unwrap(),
); // Cheating a bit here by reusing the key as seed.
sut.update_tun_device(IpConfig {
v4: self.tunnel_ip4,
v6: self.tunnel_ip6,

View File

@@ -55,7 +55,7 @@ impl TunnelTest {
pub(crate) fn init_test(ref_state: &ReferenceState, flux_capacitor: FluxCapacitor) -> Self {
// Construct client, gateway and relay from the initial state.
let mut client = ref_state.client.map(
|ref_client, _, _| ref_client.init(flux_capacitor.now()),
|ref_client, _, _| ref_client.init(flux_capacitor.now(), flux_capacitor.now()),
debug_span!("client"),
);
@@ -74,6 +74,7 @@ impl TunnelTest {
.copied()
.collect(),
flux_capacitor.now(),
flux_capacitor.now(),
)
},
debug_span!("gateway", %gid),
@@ -124,6 +125,7 @@ impl TunnelTest {
) -> Self {
let mut buffered_transmits = BufferedTransmits::default();
let now = state.flux_capacitor.now();
let utc_now = state.flux_capacitor.now();
// Act: Apply the transition
match transition {
@@ -426,7 +428,7 @@ impl TunnelTest {
let internet_resource_state = ref_state.client.inner().internet_resource_active;
state.client.exec_mut(|c| {
c.restart(key, internet_resource_state, now);
c.restart(key, internet_resource_state, now, utc_now);
// Apply to new instance.
c.sut.update_interface_config(Interface {