Connlib: reduce failover timeout (#2897)

This reduces the failover time by depending on webrtc's keepalive
instead of wireguard's.

We have much more control over that, since boringtun doesn't bubble up
any of the keepalive timeouts (only a trace warning).

In a next commit, when things are more stable, we should just get
rid of wireguard's keepalive. When we remove webrtc we will build our
own.

Events based on `keepalive` timeouts are key to our failover system, so
we **need** it.

Draft because it's built on top of #2891 (which is completely separate
code but without that the failover just doesn't work correctly)
This commit is contained in:
Gabi
2023-12-13 19:40:04 -03:00
committed by GitHub
parent 8499580388
commit 186ceb18e6
5 changed files with 29 additions and 4 deletions

View File

@@ -12,7 +12,7 @@ use connlib_shared::{
};
use webrtc::ice_transport::{
ice_candidate::RTCIceCandidate, ice_gatherer::RTCIceGatherOptions,
ice_parameters::RTCIceParameters,
ice_parameters::RTCIceParameters, ice_transport_state::RTCIceTransportState,
};
use webrtc::ice_transport::{ice_candidate_type::RTCIceCandidateType, RTCIceTransport};
use webrtc::ice_transport::{ice_credential_type::RTCIceCredentialType, ice_server::RTCIceServer};
@@ -151,7 +151,8 @@ fn insert_peers<TId: Copy, TTransform>(
}
}
fn start_handlers<TId, TTransform>(
fn start_handlers<TId, TTransform, TRoleState>(
tunnel: Arc<Tunnel<impl Callbacks + 'static, TRoleState>>,
device: Arc<ArcSwapOption<Device>>,
callbacks: impl Callbacks + 'static,
peer: Arc<Peer<TId, TTransform>>,
@@ -160,8 +161,17 @@ fn start_handlers<TId, TTransform>(
) where
TId: Copy + Send + Sync + fmt::Debug + 'static,
TTransform: Send + Sync + PacketTransform + 'static,
TRoleState: RoleState<Id = TId>,
{
ice.on_connection_state_change(Box::new(|_| Box::pin(async {})));
let conn_id = peer.conn_id;
ice.on_connection_state_change(Box::new(move |state| {
let tunnel = tunnel.clone();
Box::pin(async move {
if state == RTCIceTransportState::Failed {
tunnel.peers_to_stop.lock().push_back(conn_id);
}
})
}));
tokio::spawn({
async move {
// If this fails receiver will be dropped and the connection will expire at some point

View File

@@ -165,6 +165,7 @@ where
let (peer_sender, peer_receiver) = tokio::sync::mpsc::channel(PEER_QUEUE_SIZE);
start_handlers(
Arc::clone(self),
Arc::clone(&self.device),
self.callbacks.clone(),
peer.clone(),

View File

@@ -205,7 +205,7 @@ where
}
fn new_tunnel(
&self,
self: &Arc<Self>,
peer_config: PeerConfig,
client_id: ClientId,
resource: ResourceDescription,
@@ -239,6 +239,7 @@ where
let (peer_sender, peer_receiver) = tokio::sync::mpsc::channel(PEER_QUEUE_SIZE);
start_handlers(
Arc::clone(self),
Arc::clone(&self.device),
self.callbacks.clone(),
peer.clone(),

View File

@@ -213,6 +213,7 @@ where
};
let mut answer_builder = msg_builder.start_answer(message, Rcode::NoError).ok()?;
answer_builder.header_mut().set_ra(true);
// W/O object-safety there's no other way to access the inner type
// we could as well implement the ComposeRecordData trait for RecordData
@@ -227,6 +228,7 @@ where
RecordData::Ptr(r) => answer_builder.push((qname, Class::In, DNS_TTL, r)),
}
.ok()?;
Some(answer_builder.finish())
}

View File

@@ -87,6 +87,12 @@ const MAX_CONCURRENT_ICE_GATHERING: usize = 100;
// Note: Taken from boringtun
const HANDSHAKE_RATE_LIMIT: u64 = 100;
// These 2 are the default timeouts
const ICE_DISCONNECTED_TIMEOUT: Duration = Duration::from_secs(5);
const ICE_KEEPALIVE: Duration = Duration::from_secs(2);
// This is approximately how long failover will take :)
const ICE_FAILED_TIMEOUT: Duration = Duration::from_secs(10);
pub(crate) fn get_v4(ip: IpAddr) -> Option<Ipv4Addr> {
match ip {
IpAddr::V4(v4) => Some(v4),
@@ -471,6 +477,11 @@ where
registry = register_default_interceptors(registry, &mut media_engine)?;
let mut setting_engine = SettingEngine::default();
setting_engine.set_interface_filter(Box::new(|name| !name.contains("tun")));
setting_engine.set_ice_timeouts(
Some(ICE_DISCONNECTED_TIMEOUT),
Some(ICE_FAILED_TIMEOUT),
Some(ICE_KEEPALIVE),
);
let webrtc_api = APIBuilder::new()
.with_media_engine(media_engine)