From 8dd794d8c820ecd1f25f1786582679516f92bf63 Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Thu, 1 May 2025 04:24:10 +1000 Subject: [PATCH] chore(gateway): record metrics about dropped packets (#8942) When a NAT session expires or other unallowed traffic is routed to the Gateway, we drop these packets. It will be useful to learn how often that actually happens and why the packets were dropped. To do so, we add a counter metric for these packets. --------- Signed-off-by: Thomas Eizinger Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- rust/connlib/tunnel/src/otel.rs | 20 ++++++++++++++++-- rust/connlib/tunnel/src/peer.rs | 21 +++++++++++++++++++ .../tunnel/src/unique_packet_buffer.rs | 9 ++------ 3 files changed, 41 insertions(+), 9 deletions(-) diff --git a/rust/connlib/tunnel/src/otel.rs b/rust/connlib/tunnel/src/otel.rs index 2baed3ea9..7acd9bf29 100644 --- a/rust/connlib/tunnel/src/otel.rs +++ b/rust/connlib/tunnel/src/otel.rs @@ -1,7 +1,7 @@ use std::{io, net::SocketAddr}; use ip_packet::IpPacket; -use opentelemetry::KeyValue; +use opentelemetry::{KeyValue, Value}; pub fn network_transport_udp() -> KeyValue { KeyValue::new("network.transport", "udp") @@ -58,7 +58,23 @@ pub fn io_error_code(e: &io::Error) -> KeyValue { } pub fn io_error_type(e: &io::Error) -> KeyValue { - KeyValue::new("error.type", format!("io::ErrorKind::{:?}", e.kind())) + error_type(format!("io::ErrorKind::{:?}", e.kind())) +} + +pub fn error_type(ty: impl Into) -> KeyValue { + KeyValue::new("error.type", ty) +} + +pub mod metrics { + use opentelemetry::metrics::Counter; + + pub fn network_packet_dropped() -> Counter { + opentelemetry::global::meter("connlib") + .u64_counter("network.packet.dropped") + .with_description("Count of packets that are dropped or discarded") + .with_unit("{packet}") + .init() + } } #[cfg(test)] diff --git a/rust/connlib/tunnel/src/peer.rs b/rust/connlib/tunnel/src/peer.rs index eedae2d9b..b32c84797 100644 --- 
a/rust/connlib/tunnel/src/peer.rs +++ b/rust/connlib/tunnel/src/peer.rs @@ -75,6 +75,8 @@ pub struct ClientOnGateway { permanent_translations: BTreeMap, nat_table: NatTable, buffered_events: VecDeque, + + num_dropped_packets: opentelemetry::metrics::Counter, } impl ClientOnGateway { @@ -93,6 +95,7 @@ impl ClientOnGateway { nat_table: Default::default(), buffered_events: Default::default(), internet_resource_enabled: false, + num_dropped_packets: crate::otel::metrics::network_packet_dropped(), } } @@ -358,6 +361,15 @@ impl ClientOnGateway { "Inbound packet is not allowed, perhaps from an old client session? error = {e:#}" ); + self.num_dropped_packets.add( + 1, + &[ + crate::otel::network_type_for_packet(&packet), + crate::otel::network_io_direction_receive(), + crate::otel::error_type(e.root_cause().to_string()), + ], + ); + return Ok(None); } @@ -386,6 +398,15 @@ impl ClientOnGateway { "Expired NAT session for inbound packet of DNS resource; dropping" ); + self.num_dropped_packets.add( + 1, + &[ + crate::otel::network_type_for_packet(&packet), + crate::otel::network_io_direction_receive(), + crate::otel::error_type("ExpiredNatSession"), + ], + ); + return Ok(None); } TranslateIncomingResult::NoNatSession => { diff --git a/rust/connlib/tunnel/src/unique_packet_buffer.rs b/rust/connlib/tunnel/src/unique_packet_buffer.rs index 180d2653f..f3a50ea6c 100644 --- a/rust/connlib/tunnel/src/unique_packet_buffer.rs +++ b/rust/connlib/tunnel/src/unique_packet_buffer.rs @@ -14,13 +14,7 @@ impl UniquePacketBuffer { Self { buffer: AllocRingBuffer::with_capacity_power_of_2(capacity), tag, - num_dropped_packets: opentelemetry::global::meter("connlib") - .u64_counter("system.network.packet.dropped") - .with_description( - "The number of packets which have been dropped due to buffer overflows.", - ) - .with_unit("{packet}") - .init(), + num_dropped_packets: crate::otel::metrics::network_packet_dropped(), } } @@ -50,6 +44,7 @@ impl UniquePacketBuffer { 
crate::otel::network_type_for_packet(&new), crate::otel::network_io_direction_transmit(), KeyValue::new("system.buffer.pool.name", self.tag), + crate::otel::error_type("BufferFull"), ], ); }