From aa68029a33406792625390735378e44fd1828b4e Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Tue, 23 Sep 2025 06:00:16 +0000 Subject: [PATCH] feat(gateway): use hickory resolver to resolve A/AAAA queries (#10373) At present, the Gateway performs DNS resolution for A & AAAA queries via `libc`. The `resolve` system call only provides us with the resolved IPs but not any of the metadata around the query such as TTL. As a result, we can only cache DNS queries for a static amount of time, currently 30s. It would be more correct to cache them for their TTL instead. To do so, we re-introduce `hickory-resolver` to our codebase. Deliberately, we only use it for resolving A and AAAA records on the Gateway for now. DNS resolution for SRV & TXT records happens one layer below and uses the same infrastructure as DNS resolution on the Client. Merging the two code paths is difficult, however, because the Gateway still supports the control protocol of 1.3.x clients. That protocol requires DNS resolution prior to setting up the connection of DNS resources, which means it needs to happen in the event-loop of the Gateway binary and cannot be moved into the `Tunnel` where DNS resolution for Client and SRV/TXT records happens. Once we can drop support for 1.3.x clients, the Gateway's event-loop will simplify drastically, which will allow us to refactor this to a more unified approach of DNS resolution. Until then, we can at least fix the hardcoded TTL by using `hickory-resolver` in the event-loop. The functionality is guarded behind a feature-flag which - as usual - is off by default (i.e. for as long as we haven't fetched the flags). The feature flag is already configured to `true` for staging and production so we can test the new behaviour. 
Resolves: #8232 Related: #10385 --- rust/Cargo.lock | 93 ++++++++++++++++++-- rust/Cargo.toml | 1 + rust/gateway/Cargo.toml | 1 + rust/gateway/src/eventloop.rs | 29 +++++- rust/gateway/src/main.rs | 9 +- rust/telemetry/src/feature_flags.rs | 18 ++++ website/src/components/Changelog/Gateway.tsx | 4 + 7 files changed, 143 insertions(+), 12 deletions(-) diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 543782994..63ff8adc2 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -1578,6 +1578,12 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "critical-section" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b" + [[package]] name = "crossbeam-channel" version = "0.5.15" @@ -2035,14 +2041,14 @@ dependencies = [ [[package]] name = "dns-lookup" -version = "2.0.4" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5766087c2235fec47fafa4cfecc81e494ee679d0fd4a59887ea0919bfb0e4fc" +checksum = "cf5597a4b7fe5275fc9dcf88ce26326bc8e4cb87d0130f33752d4c5f717793cf" dependencies = [ "cfg-if", "libc", - "socket2 0.5.10", - "windows-sys 0.48.0", + "socket2 0.6.0", + "windows-sys 0.60.2", ] [[package]] @@ -2197,6 +2203,18 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a3d8a32ae18130a3c84dd492d4215c3d913c3b07c6b63c2eb3eb7ff1101ab7bf" +[[package]] +name = "enum-as-inner" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc" +dependencies = [ + "heck 0.5.0", + "proc-macro2", + "quote", + "syn 2.0.106", +] + [[package]] name = "enumflags2" version = "0.7.11" @@ -2402,6 +2420,7 @@ dependencies = [ "firezone-tunnel", "futures", "futures-bounded", + "hickory-resolver", "ip-packet", "ip_network", "libc", @@ -3361,6 +3380,52 @@ version = "0.4.1" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46" +[[package]] +name = "hickory-proto" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8a6fe56c0038198998a6f217ca4e7ef3a5e51f46163bd6dd60b5c71ca6c6502" +dependencies = [ + "async-trait", + "cfg-if", + "data-encoding", + "enum-as-inner", + "futures-channel", + "futures-io", + "futures-util", + "idna", + "ipnet", + "once_cell", + "rand 0.9.1", + "ring", + "thiserror 2.0.16", + "tinyvec", + "tokio", + "tracing", + "url", +] + +[[package]] +name = "hickory-resolver" +version = "0.25.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc62a9a99b0bfb44d2ab95a7208ac952d31060efc16241c87eaf36406fecf87a" +dependencies = [ + "cfg-if", + "futures-util", + "hickory-proto", + "ipconfig", + "moka", + "once_cell", + "parking_lot", + "rand 0.9.1", + "resolv-conf", + "smallvec", + "thiserror 2.0.16", + "tokio", + "tracing", +] + [[package]] name = "hkdf" version = "0.12.4" @@ -4161,7 +4226,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6a793df0d7afeac54f95b471d3af7f0d4fb975699f972341a4b76988d49cdf0c" dependencies = [ "cfg-if", - "windows-targets 0.53.0", + "windows-targets 0.52.6", ] [[package]] @@ -4975,6 +5040,10 @@ name = "once_cell" version = "1.21.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" +dependencies = [ + "critical-section", + "portable-atomic", +] [[package]] name = "opaque-debug" @@ -9314,6 +9383,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.3", +] + [[package]] name = 
"windows-targets" version = "0.42.2" @@ -9362,10 +9440,11 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.53.0" +version = "0.53.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b" +checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91" dependencies = [ + "windows-link 0.1.3", "windows_aarch64_gnullvm 0.53.0", "windows_aarch64_msvc 0.53.0", "windows_i686_gnu 0.53.0", diff --git a/rust/Cargo.toml b/rust/Cargo.toml index bbe02c344..1faae9880 100644 --- a/rust/Cargo.toml +++ b/rust/Cargo.toml @@ -93,6 +93,7 @@ glob = "0.3.3" hex = "0.4.3" hex-display = "0.3.0" hex-literal = "0.4.1" +hickory-resolver = "0.25.2" humantime = "2.3" ip-packet = { path = "connlib/ip-packet" } ip_network = { version = "0.4", default-features = false } diff --git a/rust/gateway/Cargo.toml b/rust/gateway/Cargo.toml index 5db1492ae..b673a8a93 100644 --- a/rust/gateway/Cargo.toml +++ b/rust/gateway/Cargo.toml @@ -21,6 +21,7 @@ firezone-telemetry = { workspace = true } firezone-tunnel = { workspace = true } futures = { workspace = true } futures-bounded = { workspace = true } +hickory-resolver = { workspace = true } ip-packet = { workspace = true } ip_network = { workspace = true } libc = { workspace = true, features = ["std", "const-extern-fn", "extra_traits"] } diff --git a/rust/gateway/src/eventloop.rs b/rust/gateway/src/eventloop.rs index f214e69cd..2b6ae7122 100644 --- a/rust/gateway/src/eventloop.rs +++ b/rust/gateway/src/eventloop.rs @@ -16,6 +16,8 @@ use firezone_tunnel::{ DnsResourceNatEntry, GatewayEvent, GatewayTunnel, IPV4_TUNNEL, IPV6_TUNNEL, IpConfig, ResolveDnsRequest, TunnelError, }; +use futures::FutureExt as _; +use hickory_resolver::TokioResolver; use phoenix_channel::{PhoenixChannel, PublicKeyParam}; use std::collections::{BTreeMap, BTreeSet}; use std::future::{self, Future, poll_fn}; @@ -32,7 +34,7 @@ use crate::RELEASE; pub const 
PHOENIX_TOPIC: &str = "gateway"; -/// How long we allow a DNS resolution via `libc::get_addr_info`. +/// How long we allow a DNS resolution via hickory. const DNS_RESOLUTION_TIMEOUT: Duration = Duration::from_secs(10); /// Cache DNS responses for 30 seconds. @@ -55,6 +57,7 @@ pub struct Eventloop { // Tunnel is `Option` because we need to take ownership on shutdown. tunnel: Option, tun_device_manager: TunDeviceManager, + resolver: TokioResolver, resolve_tasks: futures_bounded::FuturesTupleSet, Arc>, ResolveTrigger>, @@ -79,6 +82,7 @@ impl Eventloop { tunnel: GatewayTunnel, mut portal: PhoenixChannel<(), IngressMessages, PublicKeyParam>, tun_device_manager: TunDeviceManager, + resolver: TokioResolver, ) -> Result { portal.connect(PublicKeyParam(tunnel.public_key().to_bytes())); @@ -94,6 +98,7 @@ impl Eventloop { Ok(Self { tunnel: Some(tunnel), tun_device_manager, + resolver, resolve_tasks: futures_bounded::FuturesTupleSet::new(DNS_RESOLUTION_TIMEOUT, 1000), logged_permission_denied: false, dns_cache: moka::future::Cache::builder() @@ -651,10 +656,26 @@ impl Eventloop { &self, domain: DomainName, ) -> impl Future, Arc>> + use<> { - let do_resolve = resolve(domain.clone()); - let cache = self.dns_cache.clone(); + if firezone_telemetry::feature_flags::gateway_userspace_dns_a_aaaa_records() { + let resolver = self.resolver.clone(); - async move { cache.try_get_with(domain, do_resolve).await } + async move { + let ips = resolver + .lookup_ip(domain.to_string()) + .await + .with_context(|| format!("Failed to lookup domain '{domain}'"))? 
+ .iter() + .collect::>(); + + Ok(ips) + } + .boxed() + } else { + let do_resolve = resolve(domain.clone()); + let cache = self.dns_cache.clone(); + + async move { cache.try_get_with(domain, do_resolve).await }.boxed() + } } } diff --git a/rust/gateway/src/main.rs b/rust/gateway/src/main.rs index d61fed4b1..167fa920d 100644 --- a/rust/gateway/src/main.rs +++ b/rust/gateway/src/main.rs @@ -11,6 +11,7 @@ use firezone_telemetry::{ MaybePushMetricsExporter, NoopPushMetricsExporter, Telemetry, feature_flags, otel, }; use firezone_tunnel::GatewayTunnel; +use hickory_resolver::config::ResolveHosts; use ip_packet::IpPacket; use opentelemetry_otlp::WithExportConfig; use opentelemetry_sdk::metrics::SdkMeterProvider; @@ -210,7 +211,13 @@ async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result<()> { || true, )); - Eventloop::new(tunnel, portal, tun_device_manager)? + let mut resolver_builder = hickory_resolver::TokioResolver::builder_tokio()?; + resolver_builder.options_mut().cache_size = 512; + resolver_builder.options_mut().use_hosts_file = ResolveHosts::Always; + + let resolver = resolver_builder.build(); + + Eventloop::new(tunnel, portal, tun_device_manager, resolver)? .run() .await?; diff --git a/rust/telemetry/src/feature_flags.rs b/rust/telemetry/src/feature_flags.rs index 9a26a60bd..a10fb021a 100644 --- a/rust/telemetry/src/feature_flags.rs +++ b/rust/telemetry/src/feature_flags.rs @@ -40,6 +40,10 @@ pub fn map_enobufs_to_would_block() -> bool { FEATURE_FLAGS.map_enobufs_to_wouldblock() } +pub fn gateway_userspace_dns_a_aaaa_records() -> bool { + FEATURE_FLAGS.gateway_userspace_dns_a_aaaa_records() +} + pub fn export_metrics() -> bool { false // Placeholder until we actually deploy an OTEL collector. 
} @@ -160,6 +164,8 @@ struct FeatureFlagsResponse { stream_logs: bool, #[serde(default)] map_enobufs_to_wouldblock: bool, + #[serde(default)] + gateway_userspace_dns_a_aaaa_records: bool, } #[derive(Debug, Deserialize, Default, Clone)] @@ -175,6 +181,7 @@ struct FeatureFlags { drop_llmnr_nxdomain_responses: AtomicBool, stream_logs: RwLock, map_enobufs_to_wouldblock: AtomicBool, + gateway_userspace_dns_a_aaaa_records: AtomicBool, } /// Accessors to the actual feature flags. @@ -191,6 +198,7 @@ impl FeatureFlags { drop_llmnr_nxdomain_responses, stream_logs, map_enobufs_to_wouldblock, + gateway_userspace_dns_a_aaaa_records, }: FeatureFlagsResponse, payloads: FeatureFlagPayloadsResponse, ) { @@ -200,6 +208,8 @@ impl FeatureFlags { .store(drop_llmnr_nxdomain_responses, Ordering::Relaxed); self.map_enobufs_to_wouldblock .store(map_enobufs_to_wouldblock, Ordering::Relaxed); + self.gateway_userspace_dns_a_aaaa_records + .store(gateway_userspace_dns_a_aaaa_records, Ordering::Relaxed); let log_filter = if stream_logs { LogFilter::parse(payloads.stream_logs) @@ -226,6 +236,11 @@ impl FeatureFlags { fn map_enobufs_to_wouldblock(&self) -> bool { self.map_enobufs_to_wouldblock.load(Ordering::Relaxed) } + + fn gateway_userspace_dns_a_aaaa_records(&self) -> bool { + self.gateway_userspace_dns_a_aaaa_records + .load(Ordering::Relaxed) + } } fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context { @@ -236,6 +251,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context DropLlmnrNxdomainResponses { result: bool }, StreamLogs { result: bool }, MapENOBUFSToWouldBlock { result: bool }, + GatewayUserspaceDnsAAaaaRecords { result: bool }, } // Exhaustive destruction so we don't forget to update this when we add a flag. 
@@ -244,6 +260,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context drop_llmnr_nxdomain_responses, stream_logs, map_enobufs_to_wouldblock, + gateway_userspace_dns_a_aaaa_records, } = flags; let value = serde_json::json!({ @@ -254,6 +271,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context SentryFlag::DropLlmnrNxdomainResponses { result: drop_llmnr_nxdomain_responses }, SentryFlag::StreamLogs { result: stream_logs }, SentryFlag::MapENOBUFSToWouldBlock { result: map_enobufs_to_wouldblock }, + SentryFlag::GatewayUserspaceDnsAAaaaRecords { result: gateway_userspace_dns_a_aaaa_records }, ] }); diff --git a/website/src/components/Changelog/Gateway.tsx b/website/src/components/Changelog/Gateway.tsx index 2a488f601..ed5cdfaaf 100644 --- a/website/src/components/Changelog/Gateway.tsx +++ b/website/src/components/Changelog/Gateway.tsx @@ -38,6 +38,10 @@ export default function Gateway() { section for details. + + Switches to user-space DNS resolution, allowing for accurate caching + based on the TTL in the DNS response. +