From e8f2320d082712a0cf2c9dcae83ad3bdf49a2de2 Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Sat, 23 Mar 2024 09:16:38 +1100 Subject: [PATCH] fix(gateway): answer with empty list of addresses on DNS resolution failure (#4266) Currently, a failure during DNS resolution results in the client hanging during the connection setup. Instead, we fall back to an empty list which results in an empty DNS query result for the client. That in turn will make most application consider the DNS request failed. As far as I know, we don't currently retry these DNS requests, meaning a user would have to sign-in and out again to fix this state. Whilst not ideal, I think this is a better behaviour and what we currently have where the initial connection just hangs. --- rust/connlib/shared/src/messages.rs | 33 +++++ rust/connlib/tunnel/src/gateway.rs | 23 +--- rust/connlib/tunnel/src/lib.rs | 2 +- rust/gateway/src/eventloop.rs | 201 +++++++++++++--------------- 4 files changed, 132 insertions(+), 127 deletions(-) diff --git a/rust/connlib/shared/src/messages.rs b/rust/connlib/shared/src/messages.rs index 3368b94ed..e31255780 100644 --- a/rust/connlib/shared/src/messages.rs +++ b/rust/connlib/shared/src/messages.rs @@ -147,6 +147,25 @@ pub enum ResourceDescription { Cidr(ResourceDescriptionCidr), } +impl ResourceDescription { + pub fn into_resolved( + self, + addresses: Vec, + ) -> ResourceDescription { + match self { + ResourceDescription::Dns(ResourceDescriptionDns { id, address, name }) => { + ResourceDescription::Dns(ResolvedResourceDescriptionDns { + id, + domain: address, + name, + addresses, + }) + } + ResourceDescription::Cidr(c) => ResourceDescription::Cidr(c), + } + } +} + impl PartialOrd for ResourceDescription { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) @@ -207,6 +226,20 @@ pub struct ResourceDescriptionDns { pub name: String, } +/// Description of a resource that maps to a DNS record which had its domain already resolved. +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ResolvedResourceDescriptionDns { + pub id: ResourceId, + /// Internal resource's domain name. + pub domain: String, + /// Name of the resource. + /// + /// Used only for display. + pub name: String, + + pub addresses: Vec, +} + impl ResourceDescription { pub fn dns_name(&self) -> Option<&str> { match self { diff --git a/rust/connlib/tunnel/src/gateway.rs b/rust/connlib/tunnel/src/gateway.rs index 798025d57..0e068203c 100644 --- a/rust/connlib/tunnel/src/gateway.rs +++ b/rust/connlib/tunnel/src/gateway.rs @@ -7,7 +7,7 @@ use boringtun::x25519::PublicKey; use chrono::{DateTime, Utc}; use connlib_shared::messages::{ Answer, ClientId, ConnectionAccepted, DomainResponse, Interface as InterfaceConfig, Key, Offer, - Relay, ResourceId, + Relay, ResolvedResourceDescriptionDns, ResourceDescription, ResourceId, }; use connlib_shared::{Callbacks, Dname, Error, Result, StaticSecret}; use ip_network::IpNetwork; @@ -22,23 +22,6 @@ const PEERS_IPV6: &str = "fd00:2021:1111::/107"; const EXPIRE_RESOURCES_INTERVAL: Duration = Duration::from_secs(1); -/// Description of a resource that maps to a DNS record which had its domain already resolved. -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct ResolvedResourceDescriptionDns { - pub id: ResourceId, - /// Internal resource's domain name. - pub domain: String, - /// Name of the resource. - /// - /// Used only for display. - pub name: String, - - pub addresses: Vec, -} - -pub type ResourceDescription = - connlib_shared::messages::ResourceDescription; - impl GatewayTunnel where CB: Callbacks + 'static, @@ -81,7 +64,7 @@ where relays: Vec, domain: Option, expires_at: Option>, - resource: ResourceDescription, + resource: ResourceDescription, ) -> Result { let (resource_addresses, id) = match &resource { ResourceDescription::Dns(r) => { @@ -137,7 +120,7 @@ where pub fn allow_access( &mut self, - resource: ResourceDescription, + resource: ResourceDescription, client: ClientId, expires_at: Option>, domain: Option, diff --git a/rust/connlib/tunnel/src/lib.rs b/rust/connlib/tunnel/src/lib.rs index 3f32b17b5..5ed6bac76 100644 --- a/rust/connlib/tunnel/src/lib.rs +++ b/rust/connlib/tunnel/src/lib.rs @@ -16,7 +16,7 @@ use std::{ }; pub use client::{ClientState, Request}; -pub use gateway::{GatewayState, ResolvedResourceDescriptionDns}; +pub use gateway::GatewayState; mod client; mod device_channel; diff --git a/rust/gateway/src/eventloop.rs b/rust/gateway/src/eventloop.rs index 213f92b30..003cd0144 100644 --- a/rust/gateway/src/eventloop.rs +++ b/rust/gateway/src/eventloop.rs @@ -3,16 +3,17 @@ use crate::messages::{ EgressMessages, IngressMessages, RejectAccess, RequestConnection, }; use crate::CallbackHandler; -use anyhow::{bail, Result}; +use anyhow::Result; use boringtun::x25519::PublicKey; use connlib_shared::{ - messages::{GatewayResponse, ResourceAccepted, ResourceDescription}, + messages::{GatewayResponse, ResourceAccepted}, Dname, }; #[cfg(not(target_os = "windows"))] use dns_lookup::{AddrInfoHints, AddrInfoIter, LookupError}; use either::Either; -use firezone_tunnel::{GatewayTunnel, ResolvedResourceDescriptionDns}; +use firezone_tunnel::GatewayTunnel; +use futures_bounded::Timeout; use ip_network::IpNetwork; use phoenix_channel::PhoenixChannel; use std::convert::Infallible; @@ -25,10 +26,8 @@ pub struct Eventloop { tunnel: GatewayTunnel, portal: PhoenixChannel<(), IngressMessages, EgressMessages>, - resolve_tasks: futures_bounded::FuturesTupleSet< - Result>, - Either, - >, + resolve_tasks: + futures_bounded::FuturesTupleSet, Either>, } impl Eventloop { @@ -73,77 +72,13 @@ impl Eventloop { } match self.resolve_tasks.poll_unpin(cx) { - Poll::Ready((Ok(Ok(resource)), Either::Left(req))) => { - let ips = req.client.peer.ips(); - - match self.tunnel.accept( - req.client.id, - req.client.peer.preshared_key, - req.client.payload.ice_parameters, - PublicKey::from(req.client.peer.public_key.0), - ips, - req.relays, - req.client.payload.domain, - req.expires_at, - resource, - ) { - Ok(accepted) => { - self.portal.send( - PHOENIX_TOPIC, - EgressMessages::ConnectionReady(ConnectionReady { - reference: req.reference, - gateway_payload: GatewayResponse::ConnectionAccepted(accepted), - }), - ); - - // TODO: If outbound request fails, cleanup connection. - } - Err(e) => { - let client = req.client.id; - - self.tunnel.cleanup_connection(&client); - tracing::debug!(%client, "Connection request failed: {:#}", anyhow::Error::new(e)); - } - } - + Poll::Ready((result, Either::Left(req))) => { + self.accept_connection(result, req); continue; } - Poll::Ready((Ok(Ok(resource)), Either::Right(req))) => { - let maybe_domain_response = self.tunnel.allow_access( - resource, - req.client_id, - req.expires_at, - req.payload, - ); + Poll::Ready((result, Either::Right(req))) => { + self.allow_access(result, req); - if let Some(domain_response) = maybe_domain_response { - self.portal.send( - PHOENIX_TOPIC, - EgressMessages::ConnectionReady(ConnectionReady { - reference: req.reference, - gateway_payload: GatewayResponse::ResourceAccepted( - ResourceAccepted { domain_response }, - ), - }), - ); - } - - continue; - } - Poll::Ready((Ok(Err(dns_error)), Either::Left(req))) => { - tracing::debug!(client = %req.client.id, reference = %req.reference, "Failed to resolve domains as part of connection request: {dns_error}"); - continue; - } - Poll::Ready((Ok(Err(dns_error)), Either::Right(req))) => { - tracing::debug!(client = %req.client_id, reference = %req.reference, "Failed to resolve domains as part of allow access request: {dns_error}"); - continue; - } - Poll::Ready((Err(dns_timeout), Either::Left(req))) => { - tracing::debug!(client = %req.client.id, reference = %req.reference, "DNS resolution timed out as part of connection request: {dns_timeout}"); - continue; - } - Poll::Ready((Err(dns_timeout), Either::Right(req))) => { - tracing::debug!(client = %req.client_id, reference = %req.reference, "DNS resolution timed out as part of allow access request: {dns_timeout}"); continue; } Poll::Pending => {} @@ -156,10 +91,7 @@ impl Eventloop { if self .resolve_tasks .try_push( - resolve_resource_description( - req.resource.clone(), - req.client.payload.domain.clone(), - ), + resolve(req.client.payload.domain.clone()), Either::Left(req), ) .is_err() @@ -175,10 +107,7 @@ impl Eventloop { }) => { if self .resolve_tasks - .try_push( - resolve_resource_description(req.resource.clone(), req.payload.clone()), - Either::Right(req), - ) + .try_push(resolve(req.payload.clone()), Either::Right(req)) .is_err() { tracing::warn!("Too many allow access requests, dropping existing one"); @@ -228,34 +157,94 @@ impl Eventloop { return Poll::Pending; } } + + pub fn accept_connection( + &mut self, + result: Result, Timeout>, + req: RequestConnection, + ) { + let addresses = result + .inspect_err(|e| tracing::debug!(client = %req.client.id, reference = %req.reference, "DNS resolution timed out as part of connection request: {e}")) + .unwrap_or_default(); + + let ips = req.client.peer.ips(); + + match self.tunnel.accept( + req.client.id, + req.client.peer.preshared_key, + req.client.payload.ice_parameters, + PublicKey::from(req.client.peer.public_key.0), + ips, + req.relays, + req.client.payload.domain, + req.expires_at, + req.resource.into_resolved(addresses), + ) { + Ok(accepted) => { + self.portal.send( + PHOENIX_TOPIC, + EgressMessages::ConnectionReady(ConnectionReady { + reference: req.reference, + gateway_payload: GatewayResponse::ConnectionAccepted(accepted), + }), + ); + + // TODO: If outbound request fails, cleanup connection. + } + Err(e) => { + let client = req.client.id; + + self.tunnel.cleanup_connection(&client); + tracing::debug!(%client, "Connection request failed: {:#}", anyhow::Error::new(e)); + } + } + } + + pub fn allow_access(&mut self, result: Result, Timeout>, req: AllowAccess) { + let addresses = result + .inspect_err(|e| tracing::debug!(client = %req.client_id, reference = %req.reference, "DNS resolution timed out as part of allow access request: {e}")) + .unwrap_or_default(); + + let maybe_domain_response = self.tunnel.allow_access( + req.resource.into_resolved(addresses), + req.client_id, + req.expires_at, + req.payload, + ); + + if let Some(domain_response) = maybe_domain_response { + self.portal.send( + PHOENIX_TOPIC, + EgressMessages::ConnectionReady(ConnectionReady { + reference: req.reference, + gateway_payload: GatewayResponse::ResourceAccepted(ResourceAccepted { + domain_response, + }), + }), + ); + } + } } -async fn resolve_resource_description( - resource: ResourceDescription, - domain: Option, -) -> Result> { - match resource { - ResourceDescription::Dns(dns) => { - let Some(domain) = domain.clone() else { - debug_assert!( - false, - "We should never get a DNS resource access request without the subdomain" - ); - bail!("Protocol error: Request for DNS resource without the subdomain being tried to access.") - }; +async fn resolve(domain: Option) -> Vec { + let Some(domain) = domain.clone() else { + return vec![]; + }; - let addresses = - tokio::task::spawn_blocking(move || resolve_addresses(&domain.to_string())) - .await??; + let dname = domain.to_string(); - Ok(ResourceDescription::Dns(ResolvedResourceDescriptionDns { - id: dns.id, - domain: dns.address, - name: dns.name, - addresses, - })) + match tokio::task::spawn_blocking(move || resolve_addresses(&dname)).await { + Ok(Ok(addresses)) => addresses, + Ok(Err(e)) => { + tracing::warn!("Failed to resolve '{domain}': {e}"); + + vec![] + } + Err(e) => { + tracing::warn!("Failed to resolve '{domain}': {e}"); + + vec![] } - ResourceDescription::Cidr(cdir) => Ok(ResourceDescription::Cidr(cdir)), } }