From ed2bc0bd25f7fd089f593414a142cbd4a5285d8b Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Thu, 23 Oct 2025 06:14:45 +1100 Subject: [PATCH] feat(gateway): revise handling of DNS resolution errors (#10623) Even prior to #10373, failures in resolving a name on the Gateway for a DNS resource resulted in a failure of setting up the DNS resource NAT. Without the DNS resource NAT, packets for that resource bounced on the Gateway because we didn't have any traffic filters. A non-existent filter is being treated as a "traffic not allowed" error and we respond with an ICMP permission denied error. For domains where both the A and AAAA query result in NXDOMAIN, that isn't necessarily appropriate. Instead, I am proposing that for such cases, we want to return a regular "address/host unreachable" ICMP error instead of the more specific "permission denied" variant. To achieve that, we refactor the Gateway's peer state to be able to hold an `Option` inside the `TranslationState`. This allows us to always insert an entry for each proxy IP, even if we did not resolve any IPs for it. Then, when receiving traffic for a proxy IP where the resolved IP is `None`, we reply with the appropriate ICMP error. As part of this, we also simplify the assignment of the proxy IPs. With the NAT64 module removed, there is no more reason to cross-assign IPv4 and IPv6 addresses. We can simply leave the mappings for e.g. IPv6 proxy addresses empty if the AAAA query didn't resolve anything. From the Client's perspective, not much changes. The DNS resource NAT setup will now succeed, even for domains that don't resolve to anything. This doesn't change any behaviour though as we are currently already passing packets through for failed DNS resource NAT setups. The main change is that we now send back a different ICMP error. Most importantly, the "address/host unreachable variant" does not trigger #10462. --- .github/workflows/_rust.yml | 1 + docker-compose.yml | 1 + .../tunnel/proptest-regressions/tests.txt | 2 + .../tunnel/src/gateway/client_on_gateway.rs | 90 +++++++++---------- rust/connlib/tunnel/src/proptest.rs | 9 +- rust/connlib/tunnel/src/tests/dns_records.rs | 4 - rust/connlib/tunnel/src/tests/reference.rs | 48 ++++++++-- rust/connlib/tunnel/src/tests/strategies.rs | 12 +-- rust/gateway/src/eventloop.rs | 35 ++++++-- 9 files changed, 125 insertions(+), 77 deletions(-) diff --git a/.github/workflows/_rust.yml b/.github/workflows/_rust.yml index 2dab9b842..70078bd3a 100644 --- a/.github/workflows/_rust.yml +++ b/.github/workflows/_rust.yml @@ -116,6 +116,7 @@ jobs: rg --count --no-ignore "Revoking resource authorization" "$TESTCASES_DIR" rg --count --no-ignore "Re-seeding records for DNS resources" "$TESTCASES_DIR" rg --count --no-ignore "Resource is known but its addressability changed" "$TESTCASES_DIR" + rg --count --no-ignore "No A / AAAA records for domain" "$TESTCASES_DIR" # Make sure we are recovering from ICE disconnect rg --count --no-ignore "State change \(got new possible\): Disconnected -> Checking" "$TESTCASES_DIR" diff --git a/docker-compose.yml b/docker-compose.yml index e3fd3b19c..9e8356a62 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -245,6 +245,7 @@ services: RUST_LOG: ${RUST_LOG:-wire=trace,debug,flow_logs=trace} FIREZONE_API_URL: ws://api:8081 FIREZONE_ID: 4694E56C-7643-4A15-9DF3-638E5B05F570 + FZFF_GATEWAY_USERSPACE_DNS_A_AAAA_RECORDS: true command: - sh - -c diff --git a/rust/connlib/tunnel/proptest-regressions/tests.txt b/rust/connlib/tunnel/proptest-regressions/tests.txt index dda981536..4e3a09edf 100644 --- a/rust/connlib/tunnel/proptest-regressions/tests.txt +++ b/rust/connlib/tunnel/proptest-regressions/tests.txt @@ -220,3 +220,5 @@ cc 3bdd819cda2577278b0372cb7598418227ecab83271c48f5b28dc192f766061e cc 764c22e664da06820cd02cba259196edeec94cce45e450959ce9354be7bc9f1c cc 04193ee1047f542c469aa0893bf636df9c317943022d922e231de3e821b39486 cc e8520f159df085f7dbe6dce8b121336d33708af9f804a8a14bf6b5a3eb3a9d4d +cc fd95c0fcd3af20d73849004cb642b09c5bacfa7ca25781d0268441d49fe3b6cf +cc 4f65d01188bb870aa8f4893530f77ace3434c133cf24735619196df6d043f4cf diff --git a/rust/connlib/tunnel/src/gateway/client_on_gateway.rs b/rust/connlib/tunnel/src/gateway/client_on_gateway.rs index 4a3a32e64..7f0eca9a8 100644 --- a/rust/connlib/tunnel/src/gateway/client_on_gateway.rs +++ b/rust/connlib/tunnel/src/gateway/client_on_gateway.rs @@ -10,6 +10,7 @@ use dns_types::DomainName; use ip_network::{IpNetwork, Ipv4Network, Ipv6Network}; use ip_network_table::IpNetworkTable; use ip_packet::{IpPacket, Protocol, UnsupportedProtocol}; +use itertools::{EitherOrBoth, Itertools}; use crate::client::{IPV4_RESOURCES, IPV6_RESOURCES}; use crate::gateway::filter_engine::FilterEngine; @@ -105,35 +106,51 @@ impl ClientOnGateway { anyhow::ensure!(crate::dns::is_subdomain(&name, address)); - let mapped_ipv4 = mapped_ipv4(&resolved_ips); - let mapped_ipv6 = mapped_ipv6(&resolved_ips); + if resolved_ips.is_empty() { + tracing::debug!(domain = %name, %resource_id, "No A / AAAA records for domain") + } - let ipv4_maps = proxy_ips - .iter() - .filter(|ip| ip.is_ipv4()) - .zip(mapped_ipv4.iter().cycle().copied()); + let ipv4_maps = proxy_ips.iter().filter(|ip| ip.is_ipv4()).zip_longest( + resolved_ips + .iter() + .filter(|ip| ip.is_ipv4()) + .copied() + .cycle(), + ); + let ipv6_maps = proxy_ips.iter().filter(|ip| ip.is_ipv6()).zip_longest( + resolved_ips + .iter() + .filter(|ip| ip.is_ipv6()) + .copied() + .cycle(), + ); - let ipv6_maps = proxy_ips - .iter() - .filter(|ip| ip.is_ipv6()) - .zip(mapped_ipv6.iter().cycle().copied()); + let mut ip_maps = ipv4_maps.chain(ipv6_maps); - let ip_maps = ipv4_maps.chain(ipv6_maps); + loop { + let Some(either_or_both) = ip_maps.next() else { + break; + }; + + let (proxy_ip, maybe_real_ip) = match either_or_both { + EitherOrBoth::Both(proxy_ip, real_ip) => (proxy_ip, Some(real_ip)), + EitherOrBoth::Left(proxy_ip) => (proxy_ip, None), + EitherOrBoth::Right(_) => break, + }; - for (proxy_ip, real_ip) in ip_maps { if let Some(state) = self.permanent_translations.get(proxy_ip) && self.nat_table.has_entry_for_inside(*proxy_ip) - && state.resolved_ip != real_ip + && state.resolved_ip != maybe_real_ip { - tracing::debug!(%name, %proxy_ip, new_real_ip = %real_ip, current_real_ip = %state.resolved_ip, "Skipping DNS resource NAT entry because we have open NAT sessions for it"); + tracing::debug!(%name, %proxy_ip, new_real_ip = ?maybe_real_ip, current_real_ip = ?state.resolved_ip, "Skipping DNS resource NAT entry because we have open NAT sessions for it"); continue; } - tracing::debug!(%name, %proxy_ip, %real_ip); + tracing::debug!(%name, %proxy_ip, real_ip = ?maybe_real_ip); self.permanent_translations.insert( *proxy_ip, - TranslationState::new(resource_id, real_ip, name.clone()), + TranslationState::new(resource_id, maybe_real_ip, name.clone()), ); } @@ -404,10 +421,16 @@ impl ClientOnGateway { )); }; - if state.resolved_ip.is_ipv4() != dst.is_ipv4() { + let Some(resolved_ip) = state.resolved_ip else { + return Ok(TranslateOutboundResult::DestinationUnreachable( + ip_packet::make::icmp_dest_unreachable_network(&packet)?, + )); + }; + + if resolved_ip.is_ipv4() != dst.is_ipv4() { tracing::debug!( %dst, - resolved = %state.resolved_ip, + resolved = %resolved_ip, "Cannot translate between IP versions" ); @@ -418,7 +441,7 @@ impl ClientOnGateway { let (source_protocol, real_ip) = self.nat_table - .translate_outgoing(&packet, state.resolved_ip, now)?; + .translate_outgoing(&packet, resolved_ip, now)?; packet .translate_destination(source_protocol, real_ip) @@ -651,13 +674,13 @@ struct TranslationState { /// Which (DNS) resource we belong to. resource_id: ResourceId, /// The IP we have resolved for the domain. - resolved_ip: IpAddr, + resolved_ip: Option, /// The domain we have resolved. domain: DomainName, } impl TranslationState { - fn new(resource_id: ResourceId, resolved_ip: IpAddr, domain: DomainName) -> Self { + fn new(resource_id: ResourceId, resolved_ip: Option, domain: DomainName) -> Self { Self { resource_id, resolved_ip, @@ -666,30 +689,6 @@ impl TranslationState { } } -fn ipv4_addresses(ip: &BTreeSet) -> BTreeSet { - ip.iter().filter(|ip| ip.is_ipv4()).copied().collect() -} - -fn ipv6_addresses(ip: &BTreeSet) -> BTreeSet { - ip.iter().filter(|ip| ip.is_ipv6()).copied().collect() -} - -fn mapped_ipv4(ips: &BTreeSet) -> BTreeSet { - if !ipv4_addresses(ips).is_empty() { - ipv4_addresses(ips) - } else { - ipv6_addresses(ips) - } -} - -fn mapped_ipv6(ips: &BTreeSet) -> BTreeSet { - if !ipv6_addresses(ips).is_empty() { - ipv6_addresses(ips) - } else { - ipv4_addresses(ips) - } -} - fn is_dns_addr(addr: IpAddr) -> bool { IpNetwork::from(IPV4_RESOURCES).contains(addr) || IpNetwork::from(IPV6_RESOURCES).contains(addr) } @@ -1280,7 +1279,6 @@ mod proptests { }; use crate::proptest::*; use ip_packet::make::{TcpFlags, icmp_request_packet, tcp_packet, udp_packet}; - use itertools::Itertools as _; use proptest::{ arbitrary::any, collection, prop_oneof, diff --git a/rust/connlib/tunnel/src/proptest.rs b/rust/connlib/tunnel/src/proptest.rs index a06f0dd81..21934bf46 100644 --- a/rust/connlib/tunnel/src/proptest.rs +++ b/rust/connlib/tunnel/src/proptest.rs @@ -1,4 +1,5 @@ use connlib_model::{ClientId, GatewayId, IpStack, RelayId, ResourceId, Site, SiteId}; +use dns_types::DomainName; use ip_network::{IpNetwork, Ipv4Network, Ipv6Network}; use proptest::{ arbitrary::{any, any_with}, @@ -38,7 +39,7 @@ pub fn dns_resource(sites: impl Strategy>) -> impl Strategy impl Strategy { any_with::("[a-z]{3,6}".into()) } -pub fn domain_name(depth: Range) -> impl Strategy { - collection::vec(domain_label(), depth).prop_map(|labels| labels.join(".")) +pub fn domain_name(depth: Range) -> impl Strategy { + collection::vec(domain_label(), depth) + .prop_map(|labels| labels.join(".")) + .prop_map(|d| d.parse().unwrap()) } /// A strategy of IP networks, configurable by the size of the host mask. diff --git a/rust/connlib/tunnel/src/tests/dns_records.rs b/rust/connlib/tunnel/src/tests/dns_records.rs index 564423199..dc41eb468 100644 --- a/rust/connlib/tunnel/src/tests/dns_records.rs +++ b/rust/connlib/tunnel/src/tests/dns_records.rs @@ -42,10 +42,6 @@ impl DnsRecords { self.inner.keys().cloned() } - pub(crate) fn contains_domain(&self, name: &DomainName) -> bool { - self.inner.contains_key(name) - } - pub(crate) fn merge(&mut self, other: Self) { self.inner.extend(other.inner); } diff --git a/rust/connlib/tunnel/src/tests/reference.rs b/rust/connlib/tunnel/src/tests/reference.rs index fa795e376..845d7015a 100644 --- a/rust/connlib/tunnel/src/tests/reference.rs +++ b/rust/connlib/tunnel/src/tests/reference.rs @@ -5,6 +5,7 @@ use super::{ strategies::*, stub_portal::StubPortal, transition::*, }; use crate::client; +use crate::proptest::domain_label; use crate::{dns::is_subdomain, proptest::relay_id}; use connlib_model::{GatewayId, RelayId, Site, StaticSecret}; use dns_types::{DomainName, RecordType}; @@ -351,6 +352,33 @@ impl ReferenceState { .prop_map(Transition::SendDnsQueries) }, ) + .with_if_not_empty( + 2, + ( + state.wildcard_dns_resources_on_client(), + state.reachable_dns_servers(), + ), + |(wildcard_dns_resources, dns_servers)| { + dns_queries( + ( + sample::select(wildcard_dns_resources).prop_flat_map(|r| { + let base = r.address.trim_start_matches("*.").to_owned(); + + domain_label().prop_map(move |label| { + format!("{label}.{base}").parse().unwrap() + }) + }), + prop_oneof![ + Just(vec![RecordType::A]), + Just(vec![RecordType::AAAA]), + Just(vec![RecordType::A, RecordType::AAAA]) + ], + ), + sample::select(dns_servers), + ) + .prop_map(Transition::SendDnsQueries) + }, + ) .with_if_not_empty( 1, state @@ -683,12 +711,6 @@ impl ReferenceState { .sending_socket_for(query.dns_server.ip()) .is_some(); - let is_ptr_query = matches!(query.r_type, RecordType::PTR); - let is_known_domain = state.global_dns_records.contains_domain(&query.domain); - - // In case we sampled a PTR query, the domain doesn't have to exist. - let ptr_or_known_domain = is_ptr_query || is_known_domain; - let has_dns_server = state .client .inner() @@ -707,7 +729,6 @@ impl ReferenceState { }; has_socket_for_server - && ptr_or_known_domain && has_dns_server && gateway_is_present_in_case_dns_server_is_cidr_resource }), @@ -873,6 +894,19 @@ impl ReferenceState { .collect() } + fn wildcard_dns_resources_on_client(&self) -> Vec { + self.portal + .all_resources() + .into_iter() + .flat_map(|r| match r { + client::Resource::Dns(r) => Some(r), + client::Resource::Cidr(_) | client::Resource::Internet(_) => None, + }) + .filter(|r| self.client.inner().has_resource(r.id)) + .filter(|r| r.address.starts_with("*.")) + .collect() + } + fn regular_sites(&self) -> Vec { let all_sites = self .portal diff --git a/rust/connlib/tunnel/src/tests/strategies.rs b/rust/connlib/tunnel/src/tests/strategies.rs index bb15ba47b..bbefaf310 100644 --- a/rust/connlib/tunnel/src/tests/strategies.rs +++ b/rust/connlib/tunnel/src/tests/strategies.rs @@ -23,7 +23,7 @@ use std::{ pub(crate) fn global_dns_records() -> impl Strategy { collection::btree_map( - domain_name(2..4).prop_map(|d| d.parse().unwrap()), + domain_name(2..4), collection::btree_set(dns_record(), 1..6), 0..5, ) @@ -62,15 +62,9 @@ fn txt_record() -> impl Strategy> { } fn srv_record() -> impl Strategy { - ( - any::(), - any::(), - any::(), - domain_name(2..4).prop_map(|d| d.parse().unwrap()), + (any::(), any::(), any::(), domain_name(2..4)).prop_map( + |(priority, weight, port, target)| dns_types::records::srv(priority, weight, port, target), ) - .prop_map(|(priority, weight, port, target)| { - dns_types::records::srv(priority, weight, port, target) - }) } pub(crate) fn latency(max: u64) -> impl Strategy { diff --git a/rust/gateway/src/eventloop.rs b/rust/gateway/src/eventloop.rs index be103d04c..81a2375dc 100644 --- a/rust/gateway/src/eventloop.rs +++ b/rust/gateway/src/eventloop.rs @@ -16,7 +16,7 @@ use firezone_tunnel::{ DnsResourceNatEntry, GatewayEvent, GatewayTunnel, IPV4_TUNNEL, IPV6_TUNNEL, IpConfig, ResolveDnsRequest, TunnelError, }; -use futures::FutureExt as _; +use futures::{FutureExt as _, TryFutureExt}; use hickory_resolver::TokioResolver; use phoenix_channel::{PhoenixChannel, PublicKeyParam}; use std::collections::{BTreeMap, BTreeSet}; @@ -27,7 +27,7 @@ use std::pin::pin; use std::sync::Arc; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; -use std::{io, mem}; +use std::{io, iter, mem}; use tokio::sync::mpsc; use crate::RELEASE; @@ -664,12 +664,31 @@ impl Eventloop { let resolver = self.resolver.clone(); async move { - let ips = resolver - .lookup_ip(domain.to_string()) - .await - .with_context(|| format!("Failed to lookup domain '{domain}'"))? - .iter() - .collect::>(); + let ipv4_lookup = resolver + .ipv4_lookup(domain.to_string()) + .map_ok(|ipv4| ipv4.into_iter().map(|r| IpAddr::V4(r.0))); + let ipv6_lookup = resolver + .ipv6_lookup(domain.to_string()) + .map_ok(|ipv6| ipv6.into_iter().map(|r| IpAddr::V6(r.0))); + + let ips = match futures::future::join(ipv4_lookup, ipv6_lookup).await { + (Ok(ipv4), Ok(ipv6)) => iter::empty().chain(ipv4).chain(ipv6).collect(), + (Ok(ipv4), Err(e)) => { + tracing::debug!(%domain, "AAAA lookup failed: {e}"); + + ipv4.collect() + } + (Err(e), Ok(ipv6)) => { + tracing::debug!(%domain, "A lookup failed: {e}"); + + ipv6.collect() + } + (Err(e1), Err(e2)) => { + tracing::debug!(%domain, "A and AAAA lookup failed: [{e1}; {e2}]"); + + vec![] + } + }; Ok(ips) }