fix(gateway): don't invalidate active NAT sessions (#8937)

Whenever the Gateway is instructed to (re)create the NAT for a DNS
resource, it performs a DNS query and then overwrite the existing
entries in the NAT table. Depending on how the DNS records are defined,
this may lead to a very bad user experience where connections are cut
regularly.

In particular, if a service utilises round-robin DNS where a DNS query
only ever returns a single entry yet that entry may change as soon as
the TTL expires, all connections for this particular DNS resource for a
Client get cut.

To fix this, we now first check for active NAT sessions for a given
proxy IP and only replace it if we don't have an open NAT session. The
NAT sessions have a TTL of 1 minute, meaning there needs to be at least
1 outgoing packet from the Client every minute to keep it open.
This commit is contained in:
Thomas Eizinger
2025-04-30 16:58:58 +10:00
committed by GitHub
parent 968db2ae39
commit f7df445924
3 changed files with 93 additions and 16 deletions

View File

@@ -147,6 +147,11 @@ impl ClientOnGateway {
for (proxy_ip, real_ip) in ip_maps {
tracing::debug!(%name, %proxy_ip, %real_ip);
if self.nat_table.has_entry_for_inside(*proxy_ip) {
tracing::debug!(%name, %proxy_ip, %real_ip, "Skipping DNS resource NAT entry because we have open NAT sessions for it");
continue;
}
self.permanent_translations
.insert(*proxy_ip, TranslationState::new(resource_id, real_ip));
}
@@ -852,12 +857,12 @@ mod tests {
peer.setup_nat(
foo_name().parse().unwrap(),
resource_id(),
BTreeSet::from([foo_real_ip().into()]),
BTreeSet::from([foo_proxy_ip().into()]),
BTreeSet::from([foo_real_ip1().into()]),
BTreeSet::from([foo_proxy_ip1().into()]),
)
.unwrap();
assert_eq!(bar_contained_ip(), foo_real_ip());
assert_eq!(bar_contained_ip(), foo_real_ip1());
let pkt = ip_packet::make::udp_packet(
client_tun_ipv4(),
@@ -886,7 +891,7 @@ mod tests {
let pkt = ip_packet::make::udp_packet(
client_tun_ipv4(),
foo_proxy_ip(),
foo_proxy_ip1(),
1,
bar_allowed_port(),
vec![0, 0, 0, 0, 0, 0, 0, 0],
@@ -900,7 +905,7 @@ mod tests {
let pkt = ip_packet::make::udp_packet(
client_tun_ipv4(),
foo_proxy_ip(),
foo_proxy_ip1(),
1,
foo_allowed_port(),
vec![0, 0, 0, 0, 0, 0, 0, 0],
@@ -918,14 +923,14 @@ mod tests {
peer.setup_nat(
foo_name().parse().unwrap(),
resource_id(),
BTreeSet::from([foo_real_ip().into()]),
BTreeSet::from([foo_proxy_ip().into()]),
BTreeSet::from([foo_real_ip1().into()]),
BTreeSet::from([foo_proxy_ip1().into()]),
)
.unwrap();
let pkt = ip_packet::make::udp_packet(
client_tun_ipv4(),
foo_proxy_ip(),
foo_proxy_ip1(),
1,
foo_allowed_port(),
vec![0, 0, 0, 0, 0, 0, 0, 0],
@@ -936,7 +941,7 @@ mod tests {
let pkt = ip_packet::make::udp_packet(
client_tun_ipv4(),
foo_proxy_ip(),
foo_proxy_ip1(),
1,
600,
vec![0, 0, 0, 0, 0, 0, 0, 0],
@@ -969,14 +974,14 @@ mod tests {
peer.setup_nat(
foo_name().parse().unwrap(),
resource_id(),
BTreeSet::from([foo_real_ip().into()]),
BTreeSet::from([foo_proxy_ip().into()]),
BTreeSet::from([foo_real_ip1().into()]),
BTreeSet::from([foo_proxy_ip1().into()]),
)
.unwrap();
let request = ip_packet::make::udp_packet(
client_tun_ipv4(),
foo_proxy_ip(),
foo_proxy_ip1(),
1,
foo_allowed_port(),
vec![0, 0, 0, 0, 0, 0, 0, 0],
@@ -991,7 +996,7 @@ mod tests {
));
let response = ip_packet::make::udp_packet(
foo_real_ip(),
foo_real_ip1(),
client_tun_ipv4(),
foo_allowed_port(),
1,
@@ -1008,7 +1013,7 @@ mod tests {
);
let response = ip_packet::make::udp_packet(
foo_real_ip(),
foo_real_ip1(),
client_tun_ipv4(),
foo_allowed_port(),
1,
@@ -1025,6 +1030,61 @@ mod tests {
);
}
#[test]
fn setting_up_dns_resource_nat_does_not_clear_existing_nat_session() {
let _guard = firezone_logging::test("trace");
let now = Instant::now();
let mut peer = ClientOnGateway::new(client_id(), client_tun(), gateway_tun());
peer.add_resource(foo_dns_resource(), None);
peer.setup_nat(
foo_name().parse().unwrap(),
resource_id(),
BTreeSet::from([foo_real_ip1().into()]),
BTreeSet::from([foo_proxy_ip1().into(), foo_proxy_ip2().into()]),
)
.unwrap();
let request = ip_packet::make::udp_packet(
client_tun_ipv4(),
foo_proxy_ip1(),
1,
foo_allowed_port(),
vec![0, 0, 0, 0, 0, 0, 0, 0],
)
.unwrap();
let result = peer.translate_outbound(request.clone(), now).unwrap();
assert!(matches!(result, TranslateOutboundResult::Send(_)));
peer.setup_nat(
foo_name().parse().unwrap(),
resource_id(),
BTreeSet::from([foo_real_ip2().into()]), // Setting up with a new IP!
BTreeSet::from([foo_proxy_ip1().into(), foo_proxy_ip2().into()]),
)
.unwrap();
let result = peer.translate_outbound(request, now).unwrap();
assert!(matches!(result, TranslateOutboundResult::Send(_)));
let response = ip_packet::make::udp_packet(
foo_real_ip1(),
client_tun_ipv4(),
foo_allowed_port(),
1,
vec![0, 0, 0, 0, 0, 0, 0, 0],
)
.unwrap();
let response = peer.translate_inbound(response, now).unwrap();
assert!(response.is_some());
}
fn foo_dns_resource() -> crate::messages::gateway::ResourceDescription {
crate::messages::gateway::ResourceDescription::Dns(
crate::messages::gateway::ResourceDescriptionDns {
@@ -1069,18 +1129,26 @@ mod tests {
443
}
fn foo_real_ip() -> Ipv4Addr {
fn foo_real_ip1() -> Ipv4Addr {
"10.0.0.1".parse().unwrap()
}
fn foo_real_ip2() -> Ipv4Addr {
"10.0.0.2".parse().unwrap()
}
fn bar_contained_ip() -> Ipv4Addr {
"10.0.0.1".parse().unwrap()
}
fn foo_proxy_ip() -> Ipv4Addr {
fn foo_proxy_ip1() -> Ipv4Addr {
"100.96.0.1".parse().unwrap()
}
fn foo_proxy_ip2() -> Ipv4Addr {
"100.96.0.2".parse().unwrap()
}
fn foo_name() -> String {
"foo.com".to_string()
}

View File

@@ -39,6 +39,11 @@ impl NatTable {
}
}
/// Returns true if the NAT table has any entries with the given "inside" IP address.
pub(crate) fn has_entry_for_inside(&self, ip: IpAddr) -> bool {
self.table.left_values().any(|(_, c)| c == &ip)
}
pub(crate) fn translate_outgoing(
&mut self,
packet: &IpPacket,

View File

@@ -36,6 +36,10 @@ export default function Gateway() {
Improves connection reliability by maintaining the order of IP packets
across GSO batches.
</ChangeItem>
<ChangeItem pull="8937">
Fixes an issue where connections to DNS resources which utilise round-robin
DNS may be interrupted whenever the Client re-queried the DNS name.
</ChangeItem>
</Unreleased>
<Entry version="1.4.6" date={new Date("2025-04-15")}>
<ChangeItem pull="8383">