feat(gateway): use hickory resolver to resolve A/AAAA queries (#10373)

At present, the Gateway performs DNS resolution for A & AAAA queries via
`libc`. The underlying `getaddrinfo` call only provides us with the resolved IPs
but not any of the metadata around the query, such as the TTL. As a result,
we can only cache DNS queries for a static amount of time, currently
30s. It would be more correct to cache them for their TTL instead.

To do so, we re-introduce `hickory-resolver` to our codebase.
Deliberately, we only use it for resolving A and AAAA records on the
Gateway for now. DNS resolution for SRV & TXT records happens one layer
below and uses the same infrastructure as DNS resolution on the Client.

Merging these two paths is difficult, however, because the Gateway still
supports the control protocol of 1.3.x clients. That protocol requires DNS
resolution prior to setting up the connection of DNS resources, which means
it needs to happen in the event-loop of the Gateway binary and cannot be moved
into the `Tunnel`, where DNS resolution for the Client and for SRV/TXT records
happens.

Once we can drop support for 1.3.x clients, the Gateway's event-loop
will simplify drastically, which will allow us to refactor this into a more
unified approach to DNS resolution. Until then, we can at least fix the
hardcoded TTL by using `hickory-resolver` in the event-loop.

The functionality is guarded behind a feature-flag which - as usual - is
off by default (i.e. for as long as we haven't fetched the flags). The
feature flag is already configured to `true` for staging and production
so we can test the new behaviour.

Resolves: #8232
Related: #10385
This commit is contained in:
Thomas Eizinger
2025-09-23 06:00:16 +00:00
committed by GitHub
parent c692efa2de
commit aa68029a33
7 changed files with 143 additions and 12 deletions

93
rust/Cargo.lock generated
View File

@@ -1578,6 +1578,12 @@ dependencies = [
"cfg-if",
]
[[package]]
name = "critical-section"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "790eea4361631c5e7d22598ecd5723ff611904e3344ce8720784c93e3d83d40b"
[[package]]
name = "crossbeam-channel"
version = "0.5.15"
@@ -2035,14 +2041,14 @@ dependencies = [
[[package]]
name = "dns-lookup"
version = "2.0.4"
version = "2.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e5766087c2235fec47fafa4cfecc81e494ee679d0fd4a59887ea0919bfb0e4fc"
checksum = "cf5597a4b7fe5275fc9dcf88ce26326bc8e4cb87d0130f33752d4c5f717793cf"
dependencies = [
"cfg-if",
"libc",
"socket2 0.5.10",
"windows-sys 0.48.0",
"socket2 0.6.0",
"windows-sys 0.60.2",
]
[[package]]
@@ -2197,6 +2203,18 @@ version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a3d8a32ae18130a3c84dd492d4215c3d913c3b07c6b63c2eb3eb7ff1101ab7bf"
[[package]]
name = "enum-as-inner"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1e6a265c649f3f5979b601d26f1d05ada116434c87741c9493cb56218f76cbc"
dependencies = [
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.106",
]
[[package]]
name = "enumflags2"
version = "0.7.11"
@@ -2402,6 +2420,7 @@ dependencies = [
"firezone-tunnel",
"futures",
"futures-bounded",
"hickory-resolver",
"ip-packet",
"ip_network",
"libc",
@@ -3361,6 +3380,52 @@ version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6fe2267d4ed49bc07b63801559be28c718ea06c4738b7a03c94df7386d2cde46"
[[package]]
name = "hickory-proto"
version = "0.25.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8a6fe56c0038198998a6f217ca4e7ef3a5e51f46163bd6dd60b5c71ca6c6502"
dependencies = [
"async-trait",
"cfg-if",
"data-encoding",
"enum-as-inner",
"futures-channel",
"futures-io",
"futures-util",
"idna",
"ipnet",
"once_cell",
"rand 0.9.1",
"ring",
"thiserror 2.0.16",
"tinyvec",
"tokio",
"tracing",
"url",
]
[[package]]
name = "hickory-resolver"
version = "0.25.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc62a9a99b0bfb44d2ab95a7208ac952d31060efc16241c87eaf36406fecf87a"
dependencies = [
"cfg-if",
"futures-util",
"hickory-proto",
"ipconfig",
"moka",
"once_cell",
"parking_lot",
"rand 0.9.1",
"resolv-conf",
"smallvec",
"thiserror 2.0.16",
"tokio",
"tracing",
]
[[package]]
name = "hkdf"
version = "0.12.4"
@@ -4161,7 +4226,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a793df0d7afeac54f95b471d3af7f0d4fb975699f972341a4b76988d49cdf0c"
dependencies = [
"cfg-if",
"windows-targets 0.53.0",
"windows-targets 0.52.6",
]
[[package]]
@@ -4975,6 +5040,10 @@ name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
dependencies = [
"critical-section",
"portable-atomic",
]
[[package]]
name = "opaque-debug"
@@ -9314,6 +9383,15 @@ dependencies = [
"windows-targets 0.52.6",
]
[[package]]
name = "windows-sys"
version = "0.60.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb"
dependencies = [
"windows-targets 0.53.3",
]
[[package]]
name = "windows-targets"
version = "0.42.2"
@@ -9362,10 +9440,11 @@ dependencies = [
[[package]]
name = "windows-targets"
version = "0.53.0"
version = "0.53.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b1e4c7e8ceaaf9cb7d7507c974735728ab453b67ef8f18febdd7c11fe59dca8b"
checksum = "d5fe6031c4041849d7c496a8ded650796e7b6ecc19df1a431c1a363342e5dc91"
dependencies = [
"windows-link 0.1.3",
"windows_aarch64_gnullvm 0.53.0",
"windows_aarch64_msvc 0.53.0",
"windows_i686_gnu 0.53.0",

View File

@@ -93,6 +93,7 @@ glob = "0.3.3"
hex = "0.4.3"
hex-display = "0.3.0"
hex-literal = "0.4.1"
hickory-resolver = "0.25.2"
humantime = "2.3"
ip-packet = { path = "connlib/ip-packet" }
ip_network = { version = "0.4", default-features = false }

View File

@@ -21,6 +21,7 @@ firezone-telemetry = { workspace = true }
firezone-tunnel = { workspace = true }
futures = { workspace = true }
futures-bounded = { workspace = true }
hickory-resolver = { workspace = true }
ip-packet = { workspace = true }
ip_network = { workspace = true }
libc = { workspace = true, features = ["std", "const-extern-fn", "extra_traits"] }

View File

@@ -16,6 +16,8 @@ use firezone_tunnel::{
DnsResourceNatEntry, GatewayEvent, GatewayTunnel, IPV4_TUNNEL, IPV6_TUNNEL, IpConfig,
ResolveDnsRequest, TunnelError,
};
use futures::FutureExt as _;
use hickory_resolver::TokioResolver;
use phoenix_channel::{PhoenixChannel, PublicKeyParam};
use std::collections::{BTreeMap, BTreeSet};
use std::future::{self, Future, poll_fn};
@@ -32,7 +34,7 @@ use crate::RELEASE;
pub const PHOENIX_TOPIC: &str = "gateway";
/// How long we allow a DNS resolution via `libc::get_addr_info`.
/// How long we allow a DNS resolution via hickory.
const DNS_RESOLUTION_TIMEOUT: Duration = Duration::from_secs(10);
/// Cache DNS responses for 30 seconds.
@@ -55,6 +57,7 @@ pub struct Eventloop {
// Tunnel is `Option` because we need to take ownership on shutdown.
tunnel: Option<GatewayTunnel>,
tun_device_manager: TunDeviceManager,
resolver: TokioResolver,
resolve_tasks:
futures_bounded::FuturesTupleSet<Result<Vec<IpAddr>, Arc<anyhow::Error>>, ResolveTrigger>,
@@ -79,6 +82,7 @@ impl Eventloop {
tunnel: GatewayTunnel,
mut portal: PhoenixChannel<(), IngressMessages, PublicKeyParam>,
tun_device_manager: TunDeviceManager,
resolver: TokioResolver,
) -> Result<Self> {
portal.connect(PublicKeyParam(tunnel.public_key().to_bytes()));
@@ -94,6 +98,7 @@ impl Eventloop {
Ok(Self {
tunnel: Some(tunnel),
tun_device_manager,
resolver,
resolve_tasks: futures_bounded::FuturesTupleSet::new(DNS_RESOLUTION_TIMEOUT, 1000),
logged_permission_denied: false,
dns_cache: moka::future::Cache::builder()
@@ -651,10 +656,26 @@ impl Eventloop {
&self,
domain: DomainName,
) -> impl Future<Output = Result<Vec<IpAddr>, Arc<anyhow::Error>>> + use<> {
let do_resolve = resolve(domain.clone());
let cache = self.dns_cache.clone();
if firezone_telemetry::feature_flags::gateway_userspace_dns_a_aaaa_records() {
let resolver = self.resolver.clone();
async move { cache.try_get_with(domain, do_resolve).await }
async move {
let ips = resolver
.lookup_ip(domain.to_string())
.await
.with_context(|| format!("Failed to lookup domain '{domain}'"))?
.iter()
.collect::<Vec<_>>();
Ok(ips)
}
.boxed()
} else {
let do_resolve = resolve(domain.clone());
let cache = self.dns_cache.clone();
async move { cache.try_get_with(domain, do_resolve).await }.boxed()
}
}
}

View File

@@ -11,6 +11,7 @@ use firezone_telemetry::{
MaybePushMetricsExporter, NoopPushMetricsExporter, Telemetry, feature_flags, otel,
};
use firezone_tunnel::GatewayTunnel;
use hickory_resolver::config::ResolveHosts;
use ip_packet::IpPacket;
use opentelemetry_otlp::WithExportConfig;
use opentelemetry_sdk::metrics::SdkMeterProvider;
@@ -210,7 +211,13 @@ async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result<()> {
|| true,
));
Eventloop::new(tunnel, portal, tun_device_manager)?
let mut resolver_builder = hickory_resolver::TokioResolver::builder_tokio()?;
resolver_builder.options_mut().cache_size = 512;
resolver_builder.options_mut().use_hosts_file = ResolveHosts::Always;
let resolver = resolver_builder.build();
Eventloop::new(tunnel, portal, tun_device_manager, resolver)?
.run()
.await?;

View File

@@ -40,6 +40,10 @@ pub fn map_enobufs_to_would_block() -> bool {
FEATURE_FLAGS.map_enobufs_to_wouldblock()
}
/// Returns the current value of the `gateway_userspace_dns_a_aaaa_records` feature flag.
///
/// Delegates to the accessor of the same name on the global `FEATURE_FLAGS`.
/// The Gateway event-loop checks this flag to decide whether A/AAAA lookups go
/// through its `hickory-resolver` instance or through the cached `resolve` path.
pub fn gateway_userspace_dns_a_aaaa_records() -> bool {
FEATURE_FLAGS.gateway_userspace_dns_a_aaaa_records()
}
/// Reports whether metrics should be exported.
///
/// Currently hard-coded to `false`; it does not consult `FEATURE_FLAGS`.
pub fn export_metrics() -> bool {
false // Placeholder until we actually deploy an OTEL collector.
}
@@ -160,6 +164,8 @@ struct FeatureFlagsResponse {
stream_logs: bool,
#[serde(default)]
map_enobufs_to_wouldblock: bool,
#[serde(default)]
gateway_userspace_dns_a_aaaa_records: bool,
}
#[derive(Debug, Deserialize, Default, Clone)]
@@ -175,6 +181,7 @@ struct FeatureFlags {
drop_llmnr_nxdomain_responses: AtomicBool,
stream_logs: RwLock<LogFilter>,
map_enobufs_to_wouldblock: AtomicBool,
gateway_userspace_dns_a_aaaa_records: AtomicBool,
}
/// Accessors to the actual feature flags.
@@ -191,6 +198,7 @@ impl FeatureFlags {
drop_llmnr_nxdomain_responses,
stream_logs,
map_enobufs_to_wouldblock,
gateway_userspace_dns_a_aaaa_records,
}: FeatureFlagsResponse,
payloads: FeatureFlagPayloadsResponse,
) {
@@ -200,6 +208,8 @@ impl FeatureFlags {
.store(drop_llmnr_nxdomain_responses, Ordering::Relaxed);
self.map_enobufs_to_wouldblock
.store(map_enobufs_to_wouldblock, Ordering::Relaxed);
self.gateway_userspace_dns_a_aaaa_records
.store(gateway_userspace_dns_a_aaaa_records, Ordering::Relaxed);
let log_filter = if stream_logs {
LogFilter::parse(payloads.stream_logs)
@@ -226,6 +236,11 @@ impl FeatureFlags {
fn map_enobufs_to_wouldblock(&self) -> bool {
self.map_enobufs_to_wouldblock.load(Ordering::Relaxed)
}
/// Loads the `gateway_userspace_dns_a_aaaa_records` flag from its `AtomicBool`
/// using `Ordering::Relaxed`.
fn gateway_userspace_dns_a_aaaa_records(&self) -> bool {
self.gateway_userspace_dns_a_aaaa_records
.load(Ordering::Relaxed)
}
}
fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context {
@@ -236,6 +251,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context
DropLlmnrNxdomainResponses { result: bool },
StreamLogs { result: bool },
MapENOBUFSToWouldBlock { result: bool },
GatewayUserspaceDnsAAaaaRecords { result: bool },
}
// Exhaustive destruction so we don't forget to update this when we add a flag.
@@ -244,6 +260,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context
drop_llmnr_nxdomain_responses,
stream_logs,
map_enobufs_to_wouldblock,
gateway_userspace_dns_a_aaaa_records,
} = flags;
let value = serde_json::json!({
@@ -254,6 +271,7 @@ fn sentry_flag_context(flags: FeatureFlagsResponse) -> sentry::protocol::Context
SentryFlag::DropLlmnrNxdomainResponses { result: drop_llmnr_nxdomain_responses },
SentryFlag::StreamLogs { result: stream_logs },
SentryFlag::MapENOBUFSToWouldBlock { result: map_enobufs_to_wouldblock },
SentryFlag::GatewayUserspaceDnsAAaaaRecords { result: gateway_userspace_dns_a_aaaa_records },
]
});