From 186ceb18e6fc59502952d2e229dcfbee0d01d62c Mon Sep 17 00:00:00 2001 From: Gabi Date: Wed, 13 Dec 2023 19:40:04 -0300 Subject: [PATCH] Connlib: reduce failover timeout (#2897) This reduces the failover time by depending on webrtc's keepalive instead of wireguard's. We have much more control over that, since boringtun doesn't bubble up any of the keepalives timeout(only a trace warning). In the a next commit, when things are more stable, we should just get rid of wireguard's keep alive. When we remove webrtc we will build our own. Events based on `keepalive` timeouts are key to our failover system, so we **need** it. Draft because it's built on top of #2891 (which is completely separate code but without that the failover just doesn't work correctly) --- rust/connlib/tunnel/src/control_protocol.rs | 16 +++++++++++++--- .../tunnel/src/control_protocol/client.rs | 1 + .../tunnel/src/control_protocol/gateway.rs | 3 ++- rust/connlib/tunnel/src/dns.rs | 2 ++ rust/connlib/tunnel/src/lib.rs | 11 +++++++++++ 5 files changed, 29 insertions(+), 4 deletions(-) diff --git a/rust/connlib/tunnel/src/control_protocol.rs b/rust/connlib/tunnel/src/control_protocol.rs index 29455b979..29858ad27 100644 --- a/rust/connlib/tunnel/src/control_protocol.rs +++ b/rust/connlib/tunnel/src/control_protocol.rs @@ -12,7 +12,7 @@ use connlib_shared::{ }; use webrtc::ice_transport::{ ice_candidate::RTCIceCandidate, ice_gatherer::RTCIceGatherOptions, - ice_parameters::RTCIceParameters, + ice_parameters::RTCIceParameters, ice_transport_state::RTCIceTransportState, }; use webrtc::ice_transport::{ice_candidate_type::RTCIceCandidateType, RTCIceTransport}; use webrtc::ice_transport::{ice_credential_type::RTCIceCredentialType, ice_server::RTCIceServer}; @@ -151,7 +151,8 @@ fn insert_peers( } } -fn start_handlers( +fn start_handlers( + tunnel: Arc>, device: Arc>, callbacks: impl Callbacks + 'static, peer: Arc>, @@ -160,8 +161,17 @@ fn start_handlers( ) where TId: Copy + Send + Sync + fmt::Debug + 'static, TTransform: Send + Sync + PacketTransform + 'static, + TRoleState: RoleState, { - ice.on_connection_state_change(Box::new(|_| Box::pin(async {}))); + let conn_id = peer.conn_id; + ice.on_connection_state_change(Box::new(move |state| { + let tunnel = tunnel.clone(); + Box::pin(async move { + if state == RTCIceTransportState::Failed { + tunnel.peers_to_stop.lock().push_back(conn_id); + } + }) + })); tokio::spawn({ async move { // If this fails receiver will be dropped and the connection will expire at some point diff --git a/rust/connlib/tunnel/src/control_protocol/client.rs b/rust/connlib/tunnel/src/control_protocol/client.rs index 5fadde738..822f31e7e 100644 --- a/rust/connlib/tunnel/src/control_protocol/client.rs +++ b/rust/connlib/tunnel/src/control_protocol/client.rs @@ -165,6 +165,7 @@ where let (peer_sender, peer_receiver) = tokio::sync::mpsc::channel(PEER_QUEUE_SIZE); start_handlers( + Arc::clone(self), Arc::clone(&self.device), self.callbacks.clone(), peer.clone(), diff --git a/rust/connlib/tunnel/src/control_protocol/gateway.rs b/rust/connlib/tunnel/src/control_protocol/gateway.rs index 4b823fbe5..c41827982 100644 --- a/rust/connlib/tunnel/src/control_protocol/gateway.rs +++ b/rust/connlib/tunnel/src/control_protocol/gateway.rs @@ -205,7 +205,7 @@ where } fn new_tunnel( - &self, + self: &Arc, peer_config: PeerConfig, client_id: ClientId, resource: ResourceDescription, @@ -239,6 +239,7 @@ where let (peer_sender, peer_receiver) = tokio::sync::mpsc::channel(PEER_QUEUE_SIZE); start_handlers( + Arc::clone(self), Arc::clone(&self.device), self.callbacks.clone(), peer.clone(), diff --git a/rust/connlib/tunnel/src/dns.rs b/rust/connlib/tunnel/src/dns.rs index f94391e29..70f5cc4af 100644 --- a/rust/connlib/tunnel/src/dns.rs +++ b/rust/connlib/tunnel/src/dns.rs @@ -213,6 +213,7 @@ where }; let mut answer_builder = msg_builder.start_answer(message, Rcode::NoError).ok()?; + answer_builder.header_mut().set_ra(true); // W/O object-safety there's no other way to access the inner type // we could as well implement the ComposeRecordData trait for RecordData @@ -227,6 +228,7 @@ where RecordData::Ptr(r) => answer_builder.push((qname, Class::In, DNS_TTL, r)), } .ok()?; + Some(answer_builder.finish()) } diff --git a/rust/connlib/tunnel/src/lib.rs b/rust/connlib/tunnel/src/lib.rs index bd7774fca..72b99e7ac 100644 --- a/rust/connlib/tunnel/src/lib.rs +++ b/rust/connlib/tunnel/src/lib.rs @@ -87,6 +87,12 @@ const MAX_CONCURRENT_ICE_GATHERING: usize = 100; // Note: Taken from boringtun const HANDSHAKE_RATE_LIMIT: u64 = 100; +// These 2 are the default timeouts +const ICE_DISCONNECTED_TIMEOUT: Duration = Duration::from_secs(5); +const ICE_KEEPALIVE: Duration = Duration::from_secs(2); +// This is approximately how long failoever will take :) +const ICE_FAILED_TIMEOUT: Duration = Duration::from_secs(10); + pub(crate) fn get_v4(ip: IpAddr) -> Option { match ip { IpAddr::V4(v4) => Some(v4), @@ -471,6 +477,11 @@ where registry = register_default_interceptors(registry, &mut media_engine)?; let mut setting_engine = SettingEngine::default(); setting_engine.set_interface_filter(Box::new(|name| !name.contains("tun"))); + setting_engine.set_ice_timeouts( + Some(ICE_DISCONNECTED_TIMEOUT), + Some(ICE_FAILED_TIMEOUT), + Some(ICE_KEEPALIVE), + ); let webrtc_api = APIBuilder::new() .with_media_engine(media_engine)