From cd177a64485b8b76c757243a18cab99589d72db1 Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Sat, 2 Aug 2025 08:27:49 +0000 Subject: [PATCH] fix(gateway): don't remove peer state on disconnect (#10040) When the connection to a Client disappears, the Gateway currently clears all state related to this peer. Whilst eagerly cleaning up memory can be good, in this case, it may lead to the Client thinking it has access to a resource when in reality it doesn't. Just because the connection to a Client failed doesn't mean their access authorizations are invalid. In case the Client reconnects, it should be able to just continue sending traffic. At the moment, this only works if the connection also failed on the Client and therefore, its view of the world in regards to "which resources do I have access to" was also reset. What we are seeing in Sentry reports though is that Clients are attempting to access these resources, thinking they have access but the Gateway denies it because it has lost the access authorization state. --- rust/connlib/snownet/src/lib.rs | 2 +- rust/connlib/snownet/src/node.rs | 6 +++++- rust/connlib/tunnel/src/gateway.rs | 6 ++++-- rust/gateway/src/eventloop.rs | 6 ++++++ 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/rust/connlib/snownet/src/lib.rs b/rust/connlib/snownet/src/lib.rs index b8a7bbc7e..5b2f5142e 100644 --- a/rust/connlib/snownet/src/lib.rs +++ b/rust/connlib/snownet/src/lib.rs @@ -16,7 +16,7 @@ pub use allocation::RelaySocket; pub use node::{Answer, Offer}; pub use node::{ Client, ClientNode, Credentials, Event, HANDSHAKE_TIMEOUT, NoTurnServers, Node, Server, - ServerNode, Transmit, + ServerNode, Transmit, UnknownConnection, }; pub use stats::{ConnectionStats, NodeStats}; diff --git a/rust/connlib/snownet/src/node.rs b/rust/connlib/snownet/src/node.rs index 50ea8fb45..c1f62190f 100644 --- a/rust/connlib/snownet/src/node.rs +++ b/rust/connlib/snownet/src/node.rs @@ -426,7 +426,7 @@ where let conn = self .connections .get_established_mut(&cid) - .with_context(|| format!("Unknown connection {cid}"))?; + .context(UnknownConnection(cid.to_string()))?; if self.mode.is_server() && !conn.state.has_nominated_socket() { tracing::debug!( @@ -1440,6 +1440,10 @@ fn remove_local_candidate( } } +#[derive(thiserror::Error, Debug)] +#[error("Unknown connection: {0}")] +pub struct UnknownConnection(String); + #[deprecated] pub struct Offer { /// The Wireguard session key for a connection. diff --git a/rust/connlib/tunnel/src/gateway.rs b/rust/connlib/tunnel/src/gateway.rs index 8f4d32af2..3e8c7fb98 100644 --- a/rust/connlib/tunnel/src/gateway.rs +++ b/rust/connlib/tunnel/src/gateway.rs @@ -400,8 +400,10 @@ impl GatewayState { while let Some(event) = self.node.poll_event() { match event { - snownet::Event::ConnectionFailed(id) | snownet::Event::ConnectionClosed(id) => { - self.peers.remove(&id); + snownet::Event::ConnectionFailed(_) | snownet::Event::ConnectionClosed(_) => { + // We purposely don't clear the peer-state here. + // The Client might re-establish the connection but if it hasn't cleared its local state too, + // it will consider all its access authorizations to be still valid. } snownet::Event::NewIceCandidate { connection, diff --git a/rust/gateway/src/eventloop.rs b/rust/gateway/src/eventloop.rs index 0718f4f81..2a65efdd7 100644 --- a/rust/gateway/src/eventloop.rs +++ b/rust/gateway/src/eventloop.rs @@ -115,6 +115,12 @@ impl Eventloop { continue; } + // Unknown connection just means packets are bouncing on the TUN device because the Client disconnected. + if e.root_cause().is::() { + tracing::debug!("{e:#}"); + continue; + } + if e.root_cause() .downcast_ref::() .is_some_and(|e| e.kind() == io::ErrorKind::PermissionDenied)