fix(gateway): evaluate fastest nameserver every 60s (#9060)

Currently, the Gateway reads all nameservers from `/etc/resolv.conf` on
startup and evaluates the fastest one to use for SRV and TXT DNS queries
that are forwarded by the Client. If the machine just booted and we do
not have Internet connectivity just yet, this fails, which leaves the
Gateway in a state where it cannot fulfill those queries.

In order to ensure we always use the fastest one and to self-heal from
such situations, we add a 60s timer that refreshes this state.
Currently, this will **not** re-read the nameservers from
`/etc/resolv.conf` but still use the same IPs read on startup.
This commit is contained in:
Thomas Eizinger
2025-05-09 13:08:35 +09:30
committed by GitHub
parent 20d8246ce8
commit ac339ff63b
3 changed files with 25 additions and 10 deletions

View File

@@ -47,6 +47,7 @@ pub struct Io {
gso_queue: GsoQueue,
nameservers: NameserverSet,
reval_nameserver_interval: tokio::time::Interval,
udp_dns_server: l4_udp_dns_server::Server,
tcp_dns_server: l4_tcp_dns_server::Server,
@@ -92,6 +93,7 @@ pub enum Input<D, I> {
}
const DNS_QUERY_TIMEOUT: Duration = Duration::from_secs(5);
const RE_EVALUATE_NAMESERVER_INTERVAL: Duration = Duration::from_secs(60);
impl Io {
/// Creates a new I/O abstraction
@@ -105,18 +107,16 @@ impl Io {
let mut sockets = Sockets::default();
sockets.rebind(udp_socket_factory.clone()); // Bind sockets on startup.
let mut nameservers = NameserverSet::new(
nameservers,
tcp_socket_factory.clone(),
udp_socket_factory.clone(),
);
nameservers.evaluate();
Self {
outbound_packet_buffer: VecDeque::default(),
timeout: None,
sockets,
nameservers,
nameservers: NameserverSet::new(
nameservers,
tcp_socket_factory.clone(),
udp_socket_factory.clone(),
),
reval_nameserver_interval: tokio::time::interval(RE_EVALUATE_NAMESERVER_INTERVAL),
tcp_socket_factory,
udp_socket_factory,
dns_queries: FuturesTupleSet::new(DNS_QUERY_TIMEOUT, 1000),
@@ -171,7 +171,13 @@ impl Io {
>,
> {
ready!(self.flush(cx)?);
ready!(self.nameservers.poll(cx));
if self.reval_nameserver_interval.poll_tick(cx).is_ready() {
self.nameservers.evaluate();
}
// We purposely don't want to block the event loop here because we can do plenty of other work while this is running.
let _ = self.nameservers.poll(cx);
if let Poll::Ready(network) = self.sockets.poll_recv_from(cx) {
return Poll::Ready(Ok(Input::Network(

View File

@@ -51,7 +51,12 @@ impl NameserverSet {
}
pub fn evaluate(&mut self) {
self.nameserver_by_rtt.clear();
if self.inner.is_empty() {
return;
}
tracing::info!(ips = ?self.inner, "Evaluating fastest nameserver");
let start = Instant::now();
for nameserver in self.inner.iter().copied() {

View File

@@ -27,6 +27,10 @@ export default function Gateway() {
Fixes an issue where ICMP unreachable errors for large packets would
not be sent.
</ChangeItem>
<ChangeItem pull="9060">
Fixes an issue where service discovery for DNS resources would fail
in case the Gateway's started up with no network connectivity.
</ChangeItem>
</Unreleased>
<Entry version="1.4.8" date={new Date("2025-05-02")}>
<ChangeItem pull="9009">