From 6e2be658b03b965321e82424fc47fae293793617 Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Tue, 18 Nov 2025 15:23:14 +1100 Subject: [PATCH] chore(gateway): log unroutable packets only on DEBUG (#10897) Currently, the Gateway logs all kinds of errors during packet processing on WARN. Whilst it is generally good to be aware of warnings / errors, some of these scenarios are particularly noisy. For various reasons, we may not be able to route a packet arriving from the TUN device. In such cases, we now return an `UnroutablePacket` error to the event-loop which is special-cased to only log on DEBUG. It also includes the 5 tuple as variables, which should make log analysis a bit easier if we want to filter on specific parts of the 5 tuple. --- rust/connlib/snownet/src/node.rs | 12 +- rust/connlib/tunnel/src/client.rs | 4 +- rust/connlib/tunnel/src/gateway.rs | 39 ++--- .../tunnel/src/gateway/client_on_gateway.rs | 100 ++++------- .../tunnel/src/gateway/unroutable_packet.rs | 156 ++++++++++++++++++ rust/connlib/tunnel/src/io.rs | 6 + rust/connlib/tunnel/src/lib.rs | 17 +- rust/gateway/src/eventloop.rs | 6 +- 8 files changed, 237 insertions(+), 103 deletions(-) create mode 100644 rust/connlib/tunnel/src/gateway/unroutable_packet.rs diff --git a/rust/connlib/snownet/src/node.rs b/rust/connlib/snownet/src/node.rs index 2206ef4e7..b52dd431b 100644 --- a/rust/connlib/snownet/src/node.rs +++ b/rust/connlib/snownet/src/node.rs @@ -366,7 +366,7 @@ where self.pending_events.push_back(Event::ConnectionClosed(cid)); - match connection.encapsulate(cid, peer_socket, goodbye, now, &mut self.allocations) { + match connection.encapsulate(cid, peer_socket, &goodbye, now, &mut self.allocations) { Ok(Some(transmit)) => { tracing::info!("Connection closed proactively (sent goodbye)"); @@ -496,7 +496,7 @@ where pub fn encapsulate( &mut self, cid: TId, - packet: IpPacket, + packet: &IpPacket, now: Instant, ) -> Result> { let conn = self.connections.get_established_mut(&cid, now)?; @@ -512,7 +512,7 @@ where let socket = match &mut conn.state { ConnectionState::Connecting { ip_buffer, .. } => { - ip_buffer.enqueue(packet); + ip_buffer.enqueue(packet.clone()); let num_buffered = ip_buffer.len(); tracing::debug!(%num_buffered, %cid, "ICE is still in progress, buffering IP packet"); @@ -1970,7 +1970,7 @@ where ); transmits.extend(ip_buffer.into_iter().flat_map(|packet| { let transmit = self - .encapsulate(cid, remote_socket, packet, now, allocations) + .encapsulate(cid, remote_socket, &packet, now, allocations) .inspect_err(|e| { tracing::debug!( "Failed to encapsulate buffered IP packet: {e:#}" @@ -2128,14 +2128,14 @@ where &mut self, cid: TId, socket: PeerSocket, - packet: IpPacket, + packet: &IpPacket, now: Instant, allocations: &mut BTreeMap, ) -> Result> where TId: fmt::Display, { - self.state.on_outgoing(cid, &mut self.agent, &packet, now); + self.state.on_outgoing(cid, &mut self.agent, packet, now); let packet_start = if socket.send_from_relay() { 4 } else { 0 }; diff --git a/rust/connlib/tunnel/src/client.rs b/rust/connlib/tunnel/src/client.rs index 3c51ed798..5a9bee258 100644 --- a/rust/connlib/tunnel/src/client.rs +++ b/rust/connlib/tunnel/src/client.rs @@ -601,7 +601,7 @@ impl ClientState { let transmit = self .node - .encapsulate(gid, packet, now) + .encapsulate(gid, &packet, now) .inspect_err(|e| tracing::debug!(%gid, "Failed to encapsulate: {e:#}")) .ok()??; @@ -1870,7 +1870,7 @@ fn encapsulate_and_buffer( buffered_transmits: &mut VecDeque, ) { let Some(transmit) = node - .encapsulate(gid, packet, now) + .encapsulate(gid, &packet, now) .inspect_err(|e| tracing::debug!(%gid, "Failed to encapsulate: {e}")) .ok() .flatten() diff --git a/rust/connlib/tunnel/src/gateway.rs b/rust/connlib/tunnel/src/gateway.rs index 089f3dc38..7eed42a45 100644 --- a/rust/connlib/tunnel/src/gateway.rs +++ b/rust/connlib/tunnel/src/gateway.rs @@ -2,8 +2,12 @@ mod client_on_gateway; mod filter_engine; mod flow_tracker; mod nat_table; +mod unroutable_packet; + +pub use crate::gateway::unroutable_packet::UnroutablePacket; pub(crate) use crate::gateway::client_on_gateway::ClientOnGateway; +pub(crate) use crate::gateway::unroutable_packet::RoutingError; use crate::gateway::client_on_gateway::TranslateOutboundResult; use crate::gateway::flow_tracker::FlowTracker; @@ -111,31 +115,28 @@ impl GatewayState { let dst = packet.destination(); - if !crate::is_peer(dst) { - return Ok(None); - } + anyhow::ensure!(crate::is_peer(dst), UnroutablePacket::not_a_peer(&packet)); + + let peer = self + .peers + .peer_by_ip_mut(dst) + .with_context(|| UnroutablePacket::no_peer_state(&packet))?; - let Some(peer) = self.peers.peer_by_ip_mut(dst) else { - tracing::debug!(%dst, "Unknown client, perhaps already disconnected?"); - return Ok(None); - }; let cid = peer.id(); flow_tracker::inbound_tun::record_client(cid); - let Some(packet) = peer + let packet = peer .translate_inbound(packet, now) - .context("Failed to translate inbound packet")? - else { - return Ok(None); - }; + .context("Failed to translate inbound packet")?; - let Some(encrypted_packet) = self - .node - .encapsulate(cid, packet, now) - .context("Failed to encapsulate")? - else { - return Ok(None); + let encrypted_packet = match self.node.encapsulate(cid, &packet, now) { + Ok(Some(encrypted_packet)) => encrypted_packet, + Ok(None) => return Ok(None), + Err(e) if e.is::() => { + return Err(e.context(UnroutablePacket::not_connected(&packet))); + } + Err(e) => return Err(e), }; flow_tracker::inbound_tun::record_wireguard_packet( @@ -719,7 +720,7 @@ fn encrypt_packet( now: Instant, ) -> Result> { let transmit = node - .encapsulate(cid, packet, now) + .encapsulate(cid, &packet, now) .context("Failed to encapsulate")?; Ok(transmit) diff --git a/rust/connlib/tunnel/src/gateway/client_on_gateway.rs b/rust/connlib/tunnel/src/gateway/client_on_gateway.rs index a94e61abe..8ddd8a40e 100644 --- a/rust/connlib/tunnel/src/gateway/client_on_gateway.rs +++ b/rust/connlib/tunnel/src/gateway/client_on_gateway.rs @@ -16,10 +16,11 @@ use crate::client::{IPV4_RESOURCES, IPV6_RESOURCES}; use crate::gateway::filter_engine::FilterEngine; use crate::gateway::flow_tracker; use crate::gateway::nat_table::{NatTable, TranslateIncomingResult}; +use crate::gateway::unroutable_packet::UnroutablePacket; use crate::messages::gateway::Filters; use crate::messages::gateway::ResourceDescription; use crate::utils::network_contains_network; -use crate::{GatewayEvent, IpConfig, NotAllowedResource, NotClientIp, otel}; +use crate::{GatewayEvent, IpConfig, NotAllowedResource, NotClientIp}; /// The state of one client on a gateway. pub struct ClientOnGateway { @@ -37,8 +38,6 @@ pub struct ClientOnGateway { permanent_translations: BTreeMap, nat_table: NatTable, buffered_events: VecDeque, - - num_dropped_packets: opentelemetry::metrics::Counter, } #[derive(Debug, PartialEq)] @@ -66,7 +65,6 @@ impl ClientOnGateway { nat_table: Default::default(), buffered_events: Default::default(), internet_resource_enabled: None, - num_dropped_packets: otel::metrics::network_packet_dropped(), } } @@ -339,46 +337,30 @@ impl ClientOnGateway { &mut self, packet: IpPacket, now: Instant, - ) -> anyhow::Result> { + ) -> anyhow::Result { // Traffic from our own IP is allowed. match packet.source() { - IpAddr::V4(src) if src == self.gateway_tun.v4 => return Ok(Some(packet)), - IpAddr::V6(src) if src == self.gateway_tun.v6 => return Ok(Some(packet)), + IpAddr::V4(src) if src == self.gateway_tun.v4 => return Ok(packet), + IpAddr::V6(src) if src == self.gateway_tun.v6 => return Ok(packet), IpAddr::V4(_) | IpAddr::V6(_) => {} } - let Some(packet) = self.transform_tun_to_network(packet, now)? else { - return Ok(None); - }; + let packet = self.transform_tun_to_network(packet, now)?; self.ensure_client_ip(packet.destination())?; // Always allow ICMP errors to pass through, even in the presence of filters that don't allow ICMP. if packet.icmp_error().is_ok_and(|e| e.is_some()) { - return Ok(Some(packet)); + return Ok(packet); } - match self.classify_resource(packet.source(), packet.source_protocol()) { - Ok(rid) => { - flow_tracker::inbound_tun::record_resource(rid); - } - Err(e) => { - tracing::debug!( - "Inbound packet is not allowed, perhaps from an old client session? error = {e:#}" - ); - self.num_dropped_packets.add( - 1, - &[ - otel::attr::network_type_for_packet(&packet), - otel::attr::network_io_direction_receive(), - otel::attr::error_type(e.root_cause().to_string()), - ], - ); - return Ok(None); - } - } + let rid = self + .classify_resource(packet.source(), packet.source_protocol()) + .with_context(|| UnroutablePacket::not_allowed(&packet))?; - Ok(Some(packet)) + flow_tracker::inbound_tun::record_resource(rid); + + Ok(packet) } fn transform_network_to_tun( @@ -442,7 +424,7 @@ impl ClientOnGateway { &mut self, mut packet: IpPacket, now: Instant, - ) -> anyhow::Result> { + ) -> anyhow::Result { let (proto, ip) = match self.nat_table.translate_incoming(&packet, now)? { TranslateIncomingResult::Ok { proto, src } => (proto, src), TranslateIncomingResult::IcmpError(prototype) => { @@ -452,29 +434,15 @@ impl ClientOnGateway { .into_packet(self.client_tun.v4, self.client_tun.v6) .context("Failed to create ICMP error")?; - return Ok(Some(icmp_error)); + return Ok(icmp_error); } TranslateIncomingResult::ExpiredNatSession => { - tracing::debug!( - ?packet, - "Expired NAT session for inbound packet of DNS resource; dropping" - ); - - self.num_dropped_packets.add( - 1, - &[ - otel::attr::network_type_for_packet(&packet), - otel::attr::network_io_direction_receive(), - otel::attr::error_type("ExpiredNatSession"), - ], - ); - - return Ok(None); + bail!(UnroutablePacket::expired_nat_session(&packet)) } TranslateIncomingResult::NoNatSession => { // No NAT session means packet is likely for Internet Resource or a CIDR resource. - return Ok(Some(packet)); + return Ok(packet); } }; @@ -483,7 +451,7 @@ impl ClientOnGateway { .context("Failed to translate packet to new source")?; packet.update_checksum(); - Ok(Some(packet)) + Ok(packet) } pub(crate) fn is_allowed(&self, resource: ResourceId) -> bool { @@ -740,7 +708,7 @@ mod tests { use ip_packet::make::TcpFlags; use crate::{ - gateway::nat_table, + gateway::{RoutingError, nat_table}, messages::gateway::{Filter, PortRange, ResourceDescriptionCidr}, }; @@ -866,11 +834,7 @@ mod tests { peer.translate_outbound(request, Instant::now()).unwrap(), TranslateOutboundResult::Send(_) )); - assert!( - peer.translate_inbound(response, Instant::now()) - .unwrap() - .is_some() - ); + peer.translate_inbound(response, Instant::now()).unwrap(); } #[test] @@ -1046,10 +1010,7 @@ mod tests { now += Duration::from_secs(30); peer.handle_timeout(now); - assert!( - matches!(peer.translate_inbound(response, now), Ok(Some(_))), - "After 30s remote should still be able to send a packet back" - ); + peer.translate_inbound(response, now).unwrap(); let response = ip_packet::make::udp_packet( foo_real_ip1(), @@ -1063,10 +1024,13 @@ mod tests { now += nat_table::UDP_TTL; peer.handle_timeout(now); - assert!( - matches!(peer.translate_inbound(response, now), Ok(None)), - "After 1 minute of inactivity, NAT session should be freed" - ); + let err = peer + .translate_inbound(response, now) + .unwrap_err() + .downcast::() + .unwrap(); + + assert_eq!(err.reason(), RoutingError::ExpiredNatSession); } #[test] @@ -1125,9 +1089,7 @@ mod tests { ) .unwrap(); - let response = peer.translate_inbound(response, now).unwrap(); - - assert!(response.is_some()); + peer.translate_inbound(response, now).unwrap(); } { @@ -1161,9 +1123,7 @@ mod tests { ) .unwrap(); - let response = peer.translate_inbound(response, now).unwrap(); - - assert!(response.is_some()); + peer.translate_inbound(response, now).unwrap(); } } diff --git a/rust/connlib/tunnel/src/gateway/unroutable_packet.rs b/rust/connlib/tunnel/src/gateway/unroutable_packet.rs new file mode 100644 index 000000000..11a6bda6e --- /dev/null +++ b/rust/connlib/tunnel/src/gateway/unroutable_packet.rs @@ -0,0 +1,156 @@ +use std::{ + fmt::Display, + net::{IpAddr, SocketAddr}, +}; + +use ip_packet::{IpPacket, Protocol}; + +#[derive(Debug, thiserror::Error)] +#[error("Unroutable packet: {error}")] +pub struct UnroutablePacket { + five_tuple: FiveTuple, + error: RoutingError, +} + +impl UnroutablePacket { + pub fn not_allowed(packet: &IpPacket) -> Self { + Self { + five_tuple: FiveTuple::for_packet(packet), + error: RoutingError::NotAllowed, + } + } + + pub fn expired_nat_session(packet: &IpPacket) -> Self { + Self { + five_tuple: FiveTuple::for_packet(packet), + error: RoutingError::ExpiredNatSession, + } + } + + pub fn not_a_peer(packet: &IpPacket) -> Self { + Self { + five_tuple: FiveTuple::for_packet(packet), + error: RoutingError::NotAPeer, + } + } + + pub fn no_peer_state(packet: &IpPacket) -> Self { + Self { + five_tuple: FiveTuple::for_packet(packet), + error: RoutingError::NoPeerState, + } + } + + pub fn not_connected(packet: &IpPacket) -> Self { + Self { + five_tuple: FiveTuple::for_packet(packet), + error: RoutingError::NotConnected, + } + } + + pub fn reason(&self) -> RoutingError { + self.error + } + + pub fn source(&self) -> impl Display { + self.five_tuple.src + } + + pub fn destination(&self) -> impl Display { + self.five_tuple.dst + } + + pub fn proto(&self) -> impl Display { + self.five_tuple.proto + } +} + +#[derive(Debug, derive_more::Display, Clone, Copy)] +enum MaybeIpOrSocket { + #[display("{_0}")] + Ip(IpAddr), + #[display("{_0}")] + Socket(SocketAddr), + #[display("unknown")] + Unknown, +} + +#[derive(Debug, derive_more::Display, Clone, Copy)] +enum MaybeProto { + #[display("TCP")] + Tcp, + #[display("UDP")] + Udp, + #[display("ICMP")] + Icmp, + #[display("unknown")] + Unknown, +} + +#[derive(Debug, Clone, Copy)] +struct FiveTuple { + src: MaybeIpOrSocket, + dst: MaybeIpOrSocket, + proto: MaybeProto, +} + +impl FiveTuple { + fn for_packet(p: &IpPacket) -> Self { + let src_ip = p.source(); + let dst_ip = p.destination(); + let src_proto = p.source_protocol(); + let dst_proto = p.destination_protocol(); + + match (src_proto, dst_proto) { + (Ok(Protocol::Tcp(src_port)), Ok(Protocol::Tcp(dst_port))) => Self { + src: MaybeIpOrSocket::Socket(SocketAddr::new(src_ip, src_port)), + dst: MaybeIpOrSocket::Socket(SocketAddr::new(dst_ip, dst_port)), + proto: MaybeProto::Tcp, + }, + (Ok(Protocol::Udp(src_port)), Ok(Protocol::Udp(dst_port))) => Self { + src: MaybeIpOrSocket::Socket(SocketAddr::new(src_ip, src_port)), + dst: MaybeIpOrSocket::Socket(SocketAddr::new(dst_ip, dst_port)), + proto: MaybeProto::Udp, + }, + (Ok(Protocol::Icmp(_)), Ok(Protocol::Icmp(_))) => Self { + src: MaybeIpOrSocket::Ip(src_ip), + dst: MaybeIpOrSocket::Ip(dst_ip), + proto: MaybeProto::Icmp, + }, + _ => Self { + src: MaybeIpOrSocket::Unknown, + dst: MaybeIpOrSocket::Unknown, + proto: MaybeProto::Unknown, + }, + } + } +} + +#[derive(Debug, derive_more::Display, Clone, Copy, PartialEq, Eq)] +pub enum RoutingError { + #[display("Not allowed")] + NotAllowed, + #[display("Expired NAT session")] + ExpiredNatSession, + #[display("Not a Firezone peer")] + NotAPeer, + #[display("No peer state")] + NoPeerState, + #[display("No connection")] + NotConnected, + #[display("Other")] + Other, +} + +impl From for opentelemetry::Value { + fn from(value: RoutingError) -> Self { + match value { + RoutingError::NotAllowed => opentelemetry::Value::from("NotAllowed"), + RoutingError::ExpiredNatSession => opentelemetry::Value::from("ExpiredNatSession"), + RoutingError::NotAPeer => opentelemetry::Value::from("NotAPeer"), + RoutingError::NoPeerState => opentelemetry::Value::from("NoPeerState"), + RoutingError::NotConnected => opentelemetry::Value::from("NotConnected"), + RoutingError::Other => opentelemetry::Value::from("Other"), + } + } +} diff --git a/rust/connlib/tunnel/src/io.rs b/rust/connlib/tunnel/src/io.rs index b6d64acea..63c64a925 100644 --- a/rust/connlib/tunnel/src/io.rs +++ b/rust/connlib/tunnel/src/io.rs @@ -62,6 +62,7 @@ pub struct Io { tun: Device, outbound_packet_buffer: VecDeque, packet_counter: opentelemetry::metrics::Counter, + dropped_packets: opentelemetry::metrics::Counter, } #[derive(Debug)] @@ -178,6 +179,7 @@ impl Io { .u64_counter("system.network.packets") .with_description("The number of packets processed.") .build(), + dropped_packets: otel::metrics::network_packet_dropped(), } } @@ -525,6 +527,10 @@ impl Io { .ok_or(io::Error::other("No DNS server"))? .send_response(to, message) } + + pub(crate) fn inc_dropped_packet(&self, attrs: &[opentelemetry::KeyValue]) { + self.dropped_packets.add(1, attrs); + } } fn is_max_wg_packet_size(d: &DatagramIn) -> bool { diff --git a/rust/connlib/tunnel/src/lib.rs b/rust/connlib/tunnel/src/lib.rs index 139e13832..c67ad8cc4 100644 --- a/rust/connlib/tunnel/src/lib.rs +++ b/rust/connlib/tunnel/src/lib.rs @@ -71,7 +71,7 @@ pub type ClientTunnel = Tunnel; pub use client::ClientState; pub use client::dns_config::DnsMapping; pub use dns::DnsResourceRecord; -pub use gateway::{DnsResourceNatEntry, GatewayState, ResolveDnsRequest}; +pub use gateway::{DnsResourceNatEntry, GatewayState, ResolveDnsRequest, UnroutablePacket}; pub use sockets::UdpSocketThreadStopped; pub use utils::turn; @@ -420,7 +420,20 @@ impl GatewayTunnel { Ok(None) => { self.role_state.handle_timeout(now, Utc::now()); } - Err(e) => error.push(e), + Err(e) => { + let routing_error = e + .downcast_ref::() + .map(|e| e.reason()) + .unwrap_or(gateway::RoutingError::Other); + + // TODO: Include more attributes here like IPv4/IPv6? + self.io.inc_dropped_packet(&[ + otel::attr::error_type(routing_error), + otel::attr::network_io_direction_receive(), + ]); + + error.push(e); + } } } diff --git a/rust/gateway/src/eventloop.rs b/rust/gateway/src/eventloop.rs index 187ad9d4d..91419d201 100644 --- a/rust/gateway/src/eventloop.rs +++ b/rust/gateway/src/eventloop.rs @@ -324,10 +324,8 @@ impl Eventloop { continue; } - if e.downcast_ref::() - .is_some_and(|e| e.recently_disconnected()) - { - tracing::debug!("{e:#}"); + if let Some(e) = e.downcast_ref::() { + tracing::debug!(src = %e.source(), dst = %e.destination(), proto = %e.proto(), "{e:#}"); continue; }