fix(relay): increase page count for metrics to 4096 (#8646)

The default here is 2 pages, which is nowhere near a large enough batch
size for us to read all perf events generated by the kernel while it is
actually relaying data via eBPF (we generate 1 perf event per relayed
packet). If we don't read them fast enough, the kernel has to drop some,
which skews our metrics on how much data we've relayed via eBPF.

This has been tested in my local setup, where I've seen north of 500
events being read in a single batch.
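
For context on why drops are observable at all: aya's perf-buffer read API
reports dropped samples alongside each batch. Below is a minimal sketch
(not part of this commit; the helper name and error handling are mine),
assuming aya's async feature and the bytes crate:

use aya::maps::perf::{AsyncPerfEventArrayBuffer, Events, PerfBufferError};
use aya::maps::MapData;
use bytes::BytesMut;

// Hypothetical helper: drain one per-CPU ring once and surface any drops.
async fn drain_once(
    ring: &mut AsyncPerfEventArrayBuffer<MapData>,
    buffers: &mut [BytesMut],
) -> Result<usize, PerfBufferError> {
    let Events { read, lost } = ring.read_events(buffers).await?;
    if lost > 0 {
        // The kernel overwrote samples before we could read them; our
        // relayed-data metrics now undercount by `lost` events.
        tracing::warn!(read, lost, "Dropped perf events; eBPF metrics are skewed");
    }
    Ok(read)
}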

---------

Signed-off-by: Thomas Eizinger <thomas@eizinger.io>
Author: Thomas Eizinger
Date:   2025-04-04 01:28:22 +00:00
Committed by: GitHub
Parent: 818b97a3a6
Commit: ebb71e0f54

@@ -15,6 +15,13 @@ use stun_codec::rfc5766::attributes::ChannelNumber;
 use crate::{AllocationPort, ClientSocket, PeerSocket};
 
+/// How many [`StatsEvent`]s we will at most read in one batch.
+///
+/// Must be a power of two, hence it is defined as a hex value.
+/// Must be sufficiently large to read large batches from the kernel every time we get scheduled.
+/// Otherwise the kernel has to drop some and we skew our metrics.
+const PAGE_COUNT: usize = 0x1000;
+
 pub struct Program {
     ebpf: aya::Ebpf,
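
The power-of-two requirement noted above comes from how perf ring buffers
are mmap'd; aya rejects other page counts when the buffer is opened. Not
part of this commit, but a compile-time guard is a one-liner (a sketch
relying only on const evaluation in stable Rust):

// Fails the build, instead of erroring at runtime, if PAGE_COUNT is ever
// edited to a non-power-of-two value.
const _: () = assert!(PAGE_COUNT.is_power_of_two());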
@@ -54,7 +61,7 @@ impl Program {
             .context("Failed to determine number of CPUs")?
         {
             // open a separate perf buffer for each cpu
-            let mut stats_array_buf = stats.open(cpu_id, None)?;
+            let mut stats_array_buf = stats.open(cpu_id, Some(PAGE_COUNT))?;
 
             tracing::debug!(%cpu_id, "Subscribing to stats events from eBPF kernel");
@@ -63,7 +70,7 @@ impl Program {
                 let data_relayed = data_relayed.clone();
 
                 async move {
-                    let mut buffers = (0..1000)
+                    let mut buffers = (0..PAGE_COUNT)
                         .map(|_| BytesMut::with_capacity(std::mem::size_of::<StatsEvent>()))
                         .collect::<Vec<_>>();
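
Taken together, the hunks size each per-CPU ring to PAGE_COUNT pages and
size the scratch-buffer batch to match, so a full ring can be drained in a
single call. Below is a condensed sketch of the resulting subscribe loop;
`stats`, `StatsEvent`, and the error handling are approximated from the
diff context rather than taken verbatim from the repo:

use anyhow::Context as _;
use aya::maps::{perf::AsyncPerfEventArray, MapData};
use bytes::BytesMut;

/// Stand-in for the repo's real event type; its fields are not shown here.
#[repr(C)]
struct StatsEvent {
    bytes_relayed: u64,
}

const PAGE_COUNT: usize = 0x1000;

fn subscribe(mut stats: AsyncPerfEventArray<MapData>) -> anyhow::Result<()> {
    for cpu_id in aya::util::online_cpus().context("Failed to determine number of CPUs")? {
        // Ring sized to PAGE_COUNT pages instead of aya's 2-page default.
        let mut stats_array_buf = stats.open(cpu_id, Some(PAGE_COUNT))?;

        tokio::spawn(async move {
            // One scratch buffer per potential event in a batch.
            let mut buffers = (0..PAGE_COUNT)
                .map(|_| BytesMut::with_capacity(std::mem::size_of::<StatsEvent>()))
                .collect::<Vec<_>>();

            loop {
                match stats_array_buf.read_events(&mut buffers).await {
                    Ok(events) => {
                        for buf in buffers.iter().take(events.read) {
                            let _ = buf; // parse a StatsEvent and update counters here
                        }
                    }
                    Err(e) => tracing::warn!("Reading stats events failed: {e}"),
                }
            }
        });
    }

    Ok(())
}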