From 52a9079d6a0baa687e123fd8746eae54bb91e479 Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Thu, 31 Jul 2025 04:10:38 +0000 Subject: [PATCH] feat(snownet): use in-flight channels to relay data (#10062) In #7548, we added a feature to Firezone where TURN channels get bound on-demand as they are needed. To ensure many communication paths work, we also proactively bind them as soon as we receive a candidate from a remote. When a new remote candidate gets added, str0m forms pairs with all the existing local candidates and starts testing these candidate pairs. For local relay candidates, this means sending a channel data message from the allocation. At the moment, this results in the following pattern in the logs: ``` Received candidate from remote cid=20af9d29-c973-4d77-909a-abed5d7a0234 candidate=Candidate(relay=[3231E680683CFC98E69A12A60F426AA5E5F110CB]:62759/udp raddr=[59A533B0D4D3CB3717FD3D655E1D419E1C9C0772]:0 prio=37492735) No channel to peer, binding new one active_socket=462A7A508E3C99875E69C2519CA020330A6004EC:3478 peer=[3231E680683CFC98E69A12A60F426AA5E5F110CB]:62759 Already binding a channel to peer active_socket=Some(462A7A508E3C99875E69C2519CA020330A6004EC:3478) peer=[3231E680683CFC98E69A12A60F426AA5E5F110CB]:62759 class=success response from=462A7A508E3C99875E69C2519CA020330A6004EC:3478 method=channel bind rtt=9.928424ms tid=042F52145848D6C1574BB997 ``` What happens here is: 1. We receive a new candidate and proactively bind a channel (this is a silent operation and therefore not visible in the logs). 2. str0m formed new pairs for these candidates and starts testing them, triggering a new channel binding because the previous one isn't completed yet. 3. We refuse to make another channel binding because we see that we already have one in-flight. 4. The channel binding succeeds. What we do now is: If we want to send data to a peer through a channel, we check whether we have a connected OR an in-flight channel and send it in both cases. 
If the channel binding is still in-flight, we therefore pipeline the channel data message right after it. Chances are that - assuming no packet re-orderings on the network - by the time our channel data message arrives at the relay, the binding is active and the message can be relayed. This allows the very first binding attempt from str0m to already succeed instead of waiting for the timeout and sending another binding request. In addition, it makes these logs less confusing. --- rust/connlib/snownet/src/allocation.rs | 40 ++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/rust/connlib/snownet/src/allocation.rs b/rust/connlib/snownet/src/allocation.rs index 15342a065..e36018d01 100644 --- a/rust/connlib/snownet/src/allocation.rs +++ b/rust/connlib/snownet/src/allocation.rs @@ -851,6 +851,8 @@ impl Allocation { return; }; + tracing::debug!(number = %channel, "Binding new channel"); + self.authenticate_and_queue( make_channel_bind_request(peer, channel, self.software.clone()), None, @@ -867,10 +869,15 @@ impl Allocation { let active_socket = self.active_socket?.addr; let payload_length = buffer.len() - 4; - let channel_number = match self.channel_bindings.connected_channel_to_peer(peer, now) { + let connected_channel_to_peer = self.channel_bindings.connected_channel_to_peer(peer, now); + let inflight_channel_to_peer = self.channel_bindings.inflight_channel_to_peer(peer, now); + + // We use connected and in-flight channels in order to optimistically send data. + // Chances are, by the time the channel data message arrives, the channel will have been bound already. + // Whether or not we drop the packet here or on the relay if happened to not be bound does not matter. 
+ let channel_number = match connected_channel_to_peer.or(inflight_channel_to_peer) { Some(cn) => cn, None => { - tracing::debug!(%peer, %active_socket, "No channel to peer, binding new one"); self.bind_channel(peer, now); return None; @@ -1498,6 +1505,13 @@ impl ChannelBindings { .map(|(n, _)| *n) } + fn inflight_channel_to_peer(&self, peer: SocketAddr, now: Instant) -> Option { + self.inner + .iter() + .find(|(_, c)| c.inflight_to_peer(peer, now)) + .map(|(n, _)| *n) + } + fn bound_channel_to_peer(&self, peer: SocketAddr, now: Instant) -> Option { self.inner .iter() @@ -1553,6 +1567,13 @@ impl Channel { self.peer == peer && self.age(now) < Self::CHANNEL_LIFETIME && self.bound } + /// Check if this channel is to-be-bound to the given peer. + /// + /// In case the channel is older than its lifetime (10 minutes), this returns false because the relay will have de-allocated the channel. + fn inflight_to_peer(&self, peer: SocketAddr, now: Instant) -> bool { + self.peer == peer && self.age(now) < Self::CHANNEL_LIFETIME + } + /// Check if this channel is bound to the given peer. 
fn bound_to_peer(&self, peer: SocketAddr, now: Instant) -> bool { self.peer == peer @@ -1934,12 +1955,25 @@ mod tests { } #[test] - fn does_not_relay_to_with_unbound_channel() { + fn relays_to_inflight_channel() { let mut allocation = Allocation::for_test_ip4(Instant::now()) .with_binding_response(PEER1, Instant::now()) .with_allocate_response(&[RELAY_ADDR_IP4], Instant::now()); allocation.bind_channel(PEER2_IP4, Instant::now()); + let mut buffer = channel_data_packet_buffer(b"foobar"); + let encode_ok = + allocation.encode_channel_data_header(PEER2_IP4, &mut buffer, Instant::now()); + + assert!(encode_ok.is_some()) + } + + #[test] + fn does_not_relay_to_with_unbound_channel() { + let mut allocation = Allocation::for_test_ip4(Instant::now()) + .with_binding_response(PEER1, Instant::now()) + .with_allocate_response(&[RELAY_ADDR_IP4], Instant::now()); + let mut buffer = channel_data_packet_buffer(b"foobar"); let encode_ok = allocation.encode_channel_data_header(PEER2_IP4, &mut buffer, Instant::now());