From 28cddc830488d17fc25daabffd8d59f40bab9278 Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Wed, 12 Jun 2024 08:37:33 +1000 Subject: [PATCH] chore(snownet): improve logs on blocked STUN traffic (#5305) Detecting blocked STUN traffic is somewhat tricky. What we can observe is not receiving any responses from a relay (neither on IPv4 nor IPv6). Once an `Allocation` gives up retrying requests with a relay (after 60s), we now de-allocate the `Allocation` and print the following message: > INFO snownet::node: Disconnecting from relay; no response received. Is STUN blocked? id=613f68ac-483e-4e9d-bf87-457fd7223bf6 I chose to go with the wording of "disconnecting from relay" as sysdamins likely don't have any clue of what an "Allocation" is. The error message is specific to a relay though so it could also be emitted if a relay is down for > 60s or not responding for whatever reason. Resolves: #5281. --- rust/connlib/snownet/src/allocation.rs | 67 ++++++++++++++++++++++++-- rust/connlib/snownet/src/node.rs | 16 +++--- 2 files changed, 70 insertions(+), 13 deletions(-) diff --git a/rust/connlib/snownet/src/allocation.rs b/rust/connlib/snownet/src/allocation.rs index 3cec115c2..a76acfe4e 100644 --- a/rust/connlib/snownet/src/allocation.rs +++ b/rust/connlib/snownet/src/allocation.rs @@ -174,6 +174,14 @@ impl Socket { } } +#[derive(Debug, thiserror::Error, PartialEq)] +pub enum FreeReason { + #[error("authentication error")] + AuthenticationError, + #[error("no response received. Is STUN blocked?")] + NoResponseReceived, +} + impl Allocation { pub fn new( server: RelaySocket, @@ -715,9 +723,32 @@ impl Allocation { /// Whether this [`Allocation`] can be freed. /// - /// This is tied to having our credentials cleared (i.e due to an authentication error) and having emitted all events. - pub fn can_be_freed(&self) -> bool { - self.credentials.is_none() && self.events.is_empty() + /// This is tied to having our credentials cleared (i.e due to an authentication error) and having emitted all events or not having received a single response. + pub fn can_be_freed(&self) -> Option { + let pending_work = !self.events.is_empty() + || !self.buffered_transmits.is_empty() + || !self.sent_requests.is_empty(); + + let no_responses = !self.received_any_response(); + let auth_failure = !self.has_credentials(); + + if !pending_work && no_responses { + return Some(FreeReason::NoResponseReceived); + } + + if !pending_work && auth_failure { + return Some(FreeReason::AuthenticationError); + } + + None + } + + pub fn received_any_response(&self) -> bool { + self.active_socket.is_some() + } + + pub fn has_credentials(&self) -> bool { + self.credentials.is_some() } fn log_update(&self) { @@ -2155,7 +2186,10 @@ mod tests { CandidateEvent::Invalid(Candidate::relayed(RELAY_ADDR_IP6, Protocol::Udp).unwrap()), ] ); - assert!(allocation.can_be_freed()); + assert_eq!( + allocation.can_be_freed(), + Some(FreeReason::AuthenticationError) + ); } #[test] @@ -2183,7 +2217,7 @@ mod tests { fn allocation_is_not_freed_on_startup() { let allocation = Allocation::for_test_ip4(Instant::now()); - assert!(!allocation.can_be_freed()); + assert_eq!(allocation.can_be_freed(), None); } #[test] @@ -2325,6 +2359,29 @@ mod tests { assert_eq!(allocation.poll_transmit(), None); } + #[test] + fn allocation_can_be_freed_after_all_requests_time_out() { + let mut allocation = Allocation::for_test_dual(Instant::now()); + + loop { + let Some(timeout) = allocation.poll_timeout() else { + break; + }; + + allocation.handle_timeout(timeout); + + // We expect two transmits. + // The order is not deterministic because internally it is a `HashMap`. + let _ = allocation.poll_transmit().unwrap(); + let _ = allocation.poll_transmit().unwrap(); + } + + assert_eq!( + allocation.can_be_freed(), + Some(FreeReason::NoResponseReceived) + ); + } + fn ch(peer: SocketAddr, now: Instant) -> Channel { Channel { peer, diff --git a/rust/connlib/snownet/src/node.rs b/rust/connlib/snownet/src/node.rs index 21b6601c8..1ef3dd532 100644 --- a/rust/connlib/snownet/src/node.rs +++ b/rust/connlib/snownet/src/node.rs @@ -451,15 +451,15 @@ where self.next_rate_limiter_reset = Some(now + Duration::from_secs(1)); } - self.allocations.retain(|id, allocation| { - if allocation.can_be_freed() { - tracing::info!(%id, "Freeing memory of allocation"); + self.allocations + .retain(|id, allocation| match allocation.can_be_freed() { + Some(e) => { + tracing::info!(%id, "Disconnecting from relay; {e}"); - return false; - } - - true - }); + false + } + None => true, + }); self.connections.remove_failed(&mut self.pending_events); }