fix(connlib): signal all local candidates on upsert (#10920)

Firezone's UDP connection setup is designed to be idempotent. If a Client
discards its "half" of the connection but the Gateway still keeps the
state around, a subsequent connection setup by the Client will reuse the
existing connection state on the Gateway. To fully support this,
`snownet` re-sends all of its local candidates to the remote peer
whenever a connection gets upserted.
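
In code, the upsert path now amounts to the following (condensed from the diff below; `c` is the connection being upserted):

    // Collect what the agent already knows about ...
    let current_candidates = c.agent.local_candidates().to_vec();

    // ... gather any candidates it is still missing ...
    let new_candidates =
        seed_agent_with_local_candidates(c.relay.id, &mut c.agent, &self.allocations);

    // ... and signal the union of both to the remote peer.
    self.pending_events.extend(
        current_candidates
            .into_iter()
            .chain(new_candidates)
            .map(|candidate| new_ice_candidate_event(cid, candidate)),
    );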

The current `seed_agent_with_local_candidates` function attempts to do
this job, but its design overlooks a crucial detail: re-adding a
candidate that the `IceAgent` already knows about is considered
redundant, and a redundant candidate is not re-signalled to the remote!
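
For reference, here is the previous helper, reassembled from the removed lines further down; the comment on the early return is added to mark where the signal gets lost:

    fn add_local_candidate<TId>(
        id: TId,
        agent: &mut IceAgent,
        candidate: Candidate,
        pending_events: &mut VecDeque<Event<TId>>,
    ) where
        TId: fmt::Display,
    {
        // srflx candidates don't need to be added to the local agent because we always send from the `base` anyway.
        if candidate.kind() == CandidateKind::ServerReflexive {
            signal_candidate_to_remote(id, &candidate, pending_events);
            return;
        }

        let Some(candidate) = agent.add_local_candidate(candidate) else {
            return; // An already-known candidate lands here: nothing is signalled to the remote.
        };

        signal_candidate_to_remote(id, candidate, pending_events);
    }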

The real-world consequences of this are subtle. `str0m`'s support for
peer-reflexive candidates means that incoming STUN binding requests are
still answered, even if they come from an address that the agent doesn't
know anything about, i.e. it has never been told about that candidate.
Thus, what happens right now is that when a Client re-creates a
connection that is still present on the Gateway, it starts receiving
STUN binding requests from candidates it has never been told about and
creates peer-reflexive candidates for them.
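
As a rough illustration of why the missing signalling goes unnoticed (toy types only, not `str0m`'s actual API):

    use std::net::SocketAddr;

    // Toy stand-ins for illustration; the real types live in `str0m`.
    #[derive(Clone, Copy, PartialEq)]
    enum Kind { Host, ServerReflexive, Relayed, PeerReflexive }

    #[derive(Clone, Copy, PartialEq)]
    struct RemoteCandidate { addr: SocketAddr, kind: Kind }

    /// A binding request from an unknown source address is still answered; the
    /// agent merely invents a peer-reflexive placeholder for it.
    fn on_binding_request(remotes: &mut Vec<RemoteCandidate>, source: SocketAddr) {
        if !remotes.iter().any(|c| c.addr == source) {
            remotes.push(RemoteCandidate { addr: source, kind: Kind::PeerReflexive });
        }

        // ... answer the request as usual, so the connection keeps working
        // even though the real candidate was never signalled ...
    }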

Where this does show up is in our test suite, which has fairly strict
timing constraints. When we simulate the re-deploy of relays, we expect
connections to be migrated to a new relay immediately. To support this,
the current relay candidates are invalidated on both sides. This only
works, however, if the current candidate is correctly recognised by the
local ICE agent. Peer-reflexive candidates are created on demand and
typically only serve a placeholder-like role until we learn about the
real candidate that is being used. Due to the behaviour of
`seed_agent_with_local_candidates` described above, that may never
happen. As a result, attempting to invalidate the relay candidate fails:
we don't recognise it because all we have is a peer-reflexive one.
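
The matching problem boils down to the following (deliberately generic sketch, not connlib's actual invalidation path):

    /// An invalidation for a candidate we were never told about matches nothing
    /// and is effectively a no-op.
    fn recognises_candidate<C: PartialEq>(known_candidates: &[C], to_invalidate: &C) -> bool {
        // With only a peer-reflexive placeholder in `known_candidates`, the real
        // relay candidate is never found and therefore never invalidated, so the
        // stale pair keeps being used instead of failing over quickly.
        known_candidates.iter().any(|c| c == to_invalidate)
    }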

Putting all of this together: whilst not re-sending all candidates
doesn't cause immediate issues for a connection, it can cause problems
later, when we try to invalidate the currently active candidate to
achieve a speedy failover to a new one.
Thomas Eizinger, 2025-11-21 15:01:42 +11:00, committed by GitHub
parent ef24617e2c
commit aab779e68b
5 changed files with 89 additions and 96 deletions


@@ -278,12 +278,19 @@ where
c.state.on_upsert(cid, &mut c.agent, now);
seed_agent_with_local_candidates(
cid,
c.relay.id,
&mut c.agent,
&self.allocations,
&mut self.pending_events,
// Take all current candidates.
let current_candidates = c.agent.local_candidates().to_vec();
// Re-seed connection with all candidates.
let new_candidates =
seed_agent_with_local_candidates(c.relay.id, &mut c.agent, &self.allocations);
// Tell the remote about all of them.
self.pending_events.extend(
std::iter::empty()
.chain(current_candidates)
.chain(new_candidates)
.map(|candidate| new_ice_candidate_event(cid, candidate)),
);
// Initiate a new WG session.
@@ -317,12 +324,13 @@ where
agent.set_local_credentials(local_creds);
agent.set_remote_credentials(remote_creds);
seed_agent_with_local_candidates(
cid,
selected_relay,
&mut agent,
&self.allocations,
&mut self.pending_events,
self.pending_events.extend(
self.allocations
.candidates_for_relay(&selected_relay)
.filter_map(|candidate| {
add_local_candidate(&mut agent, candidate)
.map(|c| new_ice_candidate_event(cid, c))
}),
);
let connection = self.init_connection(
@@ -1001,12 +1009,11 @@ where
for (cid, agent, maybe_state) in
self.connections.agents_and_state_by_relay_mut(rid)
{
add_local_candidate(
cid,
agent,
candidate.clone(),
&mut self.pending_events,
);
if let Some(candidate) = add_local_candidate(agent, candidate.clone()) {
self.pending_events
.push_back(new_ice_candidate_event(cid, candidate));
}
if let Some(state) = maybe_state {
state.on_candidate(cid, agent, now);
}
@@ -1110,13 +1117,12 @@ where
let selected_relay = initial.relay;
seed_agent_with_local_candidates(
cid,
selected_relay,
&mut agent,
&self.allocations,
&mut self.pending_events,
);
for candidate in
seed_agent_with_local_candidates(selected_relay, &mut agent, &self.allocations)
{
self.pending_events
.push_back(new_ice_candidate_event(cid, candidate));
}
let index = self.index.next();
let connection = self.init_connection(
@@ -1182,13 +1188,13 @@ where
};
let selected_relay = self.sample_relay()?;
seed_agent_with_local_candidates(
cid,
selected_relay,
&mut agent,
&self.allocations,
&mut self.pending_events,
);
for candidate in
seed_agent_with_local_candidates(selected_relay, &mut agent, &self.allocations)
{
self.pending_events
.push_back(new_ice_candidate_event(cid, candidate));
}
let index = self.index.next();
let connection = self.init_connection(
@@ -1211,30 +1217,18 @@ where
}
}
fn seed_agent_with_local_candidates<TId, RId>(
connection: TId,
/// Seeds the agent with all local candidates, returning an iterator of all candidates that should be signalled to the remote.
fn seed_agent_with_local_candidates<'a, RId>(
selected_relay: RId,
agent: &mut IceAgent,
agent: &'a mut IceAgent,
allocations: &Allocations<RId>,
pending_events: &mut VecDeque<Event<TId>>,
) where
) -> impl Iterator<Item = Candidate> + use<'a, RId>
where
RId: Ord + fmt::Display + Copy,
TId: fmt::Display + Copy,
{
let shared_candidates = allocations.shared_candidates();
let relay_candidates = allocations
.get_by_id(&selected_relay)
.into_iter()
.flat_map(|allocation| allocation.current_relay_candidates());
// Candidates with a higher priority are better, therefore: Reverse the ordering by priority.
for candidate in shared_candidates
.chain(relay_candidates)
.sorted_by_key(|c| c.prio())
.rev()
{
add_local_candidate(connection, agent, candidate, pending_events);
}
allocations
.candidates_for_relay(&selected_relay)
.filter_map(move |c| add_local_candidate(agent, c))
}
/// Generate optimistic candidates based on the ones we have already received.
@@ -1299,38 +1293,25 @@ fn generate_optimistic_candidates(agent: &mut IceAgent) {
}
}
fn add_local_candidate<TId>(
id: TId,
agent: &mut IceAgent,
candidate: Candidate,
pending_events: &mut VecDeque<Event<TId>>,
) where
TId: fmt::Display,
{
/// Attempts to add the candidate to the agent, returning back the candidate if it should be signalled to the remote.
fn add_local_candidate(agent: &mut IceAgent, candidate: Candidate) -> Option<Candidate> {
// srflx candidates don't need to be added to the local agent because we always send from the `base` anyway.
if candidate.kind() == CandidateKind::ServerReflexive {
signal_candidate_to_remote(id, &candidate, pending_events);
return;
return Some(candidate);
}
let Some(candidate) = agent.add_local_candidate(candidate) else {
return;
};
let candidate = agent.add_local_candidate(candidate)?;
signal_candidate_to_remote(id, candidate, pending_events);
Some(candidate.clone())
}
fn signal_candidate_to_remote<TId>(
id: TId,
candidate: &Candidate,
pending_events: &mut VecDeque<Event<TId>>,
) {
fn new_ice_candidate_event<TId>(id: TId, candidate: Candidate) -> Event<TId> {
tracing::debug!(?candidate, "Signalling candidate to remote");
pending_events.push_back(Event::NewIceCandidate {
Event::NewIceCandidate {
connection: id,
candidate: candidate.clone(),
})
candidate,
}
}
fn invalidate_allocation_candidates<TId, RId>(


@@ -74,6 +74,23 @@ where
.unwrap_or(MutAllocationRef::Unknown)
}
pub(crate) fn candidates_for_relay(
&self,
id: &RId,
) -> impl Iterator<Item = Candidate> + use<RId> {
let shared_candidates = self.shared_candidates();
let relay_candidates = self
.get_by_id(id)
.into_iter()
.flat_map(|allocation| allocation.current_relay_candidates());
// Candidates with a higher priority are better, therefore: Reverse the ordering by priority.
shared_candidates
.chain(relay_candidates)
.sorted_by_key(|c| c.prio())
.rev()
}
pub(crate) fn iter_mut(&mut self) -> impl Iterator<Item = (&RId, &mut Allocation)> {
self.inner.iter_mut()
}
@@ -144,13 +161,6 @@ where
Some((*id, a))
}
pub(crate) fn shared_candidates(&self) -> impl Iterator<Item = Candidate> {
self.inner
.values()
.flat_map(|allocation| allocation.host_and_server_reflexive_candidates())
.unique()
}
pub(crate) fn poll_timeout(&mut self) -> Option<(Instant, &'static str)> {
self.inner
.values_mut()
@@ -192,6 +202,13 @@ where
None => true,
});
}
fn shared_candidates(&self) -> impl Iterator<Item = Candidate> {
self.inner
.values()
.flat_map(|allocation| allocation.host_and_server_reflexive_candidates())
.unique()
}
}
pub(crate) enum MutAllocationRef<'a, RId> {


@@ -15,7 +15,7 @@ use crate::{
ConnectionStats, Event,
node::{
Connection, ConnectionState, InitialConnection, add_local_candidate,
allocations::Allocations,
allocations::Allocations, new_ice_candidate_event,
},
};
@@ -138,10 +138,14 @@ where
c.relay.id = rid;
for candidate in allocation.current_relay_candidates() {
add_local_candidate(cid, &mut c.agent, candidate, pending_events);
c.state.on_candidate(cid, &mut c.agent, now);
for candidate in allocation
.current_relay_candidates()
.filter_map(|candidate| add_local_candidate(&mut c.agent, candidate))
{
pending_events.push_back(new_ice_candidate_event(cid, candidate));
}
c.state.on_candidate(cid, &mut c.agent, now);
}
}


@@ -251,3 +251,5 @@ cc cd931decdb174867cc8067e66287a7fd6fe85f0490c25a408431f405987ec5e4
cc 208cbcc2338fd8f325abc5be770c51b97296729bf09bd4472be5734789c1d37b
cc 6c01d07f2820e879d267944e063a55d0f4f07777a925a532f9da312b9b398db1
cc 9fac3117f02193b4b1dfbbfd02fa6062c8f62eb5c94d36d9bd9c694dcda5ddab
cc 659d967c73dae7e617a63ff86978636ef8aac193b70236adb823354067523629
cc 1c531f3260ae1f80cf5ab4fb8c0cef5dd1d468ed4da1ff7170a12d203b5546a5


@@ -643,17 +643,6 @@ impl RefClient {
for status in self.site_status.values_mut() {
*status = ResourceStatus::Unknown;
}
// TCP connections will automatically re-create connections to Gateways.
for ((_, dst, _, _), r) in self
.expected_tcp_connections
.clone()
.into_iter()
.collect::<Vec<_>>()
{
self.connect_to_resource(r, dst);
self.set_resource_online(r);
}
}
pub(crate) fn add_internet_resource(&mut self, resource: InternetResource) {