feat(gateway): allow exporting metrics to an OTEL collector (#9838)

As a first step in preparation for sending OTEL metrics from Clients and
Gateways to a cloud-hosted OTEL collector, we extend the CLI of the
Gateway with configuration options to provide a gRPC endpoint to an OTEL
collector.

If `FIREZONE_METRICS` is set to `otel-collector` and an endpoint is
configured via `OTLP_GRPC_ENDPOINT`, we will report our metrics to that
collector.

The plan for extending this in the future is as follows: if
`FIREZONE_METRICS` is set to `otel-collector` (which will likely be the
default) and no `OTLP_GRPC_ENDPOINT` is set, we will use our own hosted
OTEL collector and report metrics only if the `export-metrics`
feature flag is set to `true`.

This integration is similar to the one we built for streaming logs to
Sentry. We can therefore enable it at the same granularity as we do
for the logs and, e.g., only enable it for the `firezone` account to
start with.

In the meantime, customers can already make use of these metrics, if
they'd like, by using the current integration.

Resolves: #1550
Related: #7419

---------

Co-authored-by: Antoine Labarussias <antoinelabarussias@gmail.com>
This commit is contained in:
Thomas Eizinger
2025-07-14 05:54:38 +02:00
committed by GitHub
parent 70e4b6572f
commit cecca37073
10 changed files with 148 additions and 21 deletions

2
rust/Cargo.lock generated
View File

@@ -2386,6 +2386,7 @@ dependencies = [
"nix 0.30.1",
"num_cpus",
"opentelemetry",
"opentelemetry-otlp",
"opentelemetry-stdout",
"opentelemetry_sdk",
"phoenix-channel",
@@ -2583,6 +2584,7 @@ name = "firezone-telemetry"
version = "0.1.0"
dependencies = [
"anyhow",
"futures",
"hex",
"ip-packet",
"moka",

View File

@@ -27,6 +27,7 @@ libc = { workspace = true, features = ["std", "const-extern-fn", "extra_traits"]
moka = { workspace = true, features = ["future"] }
num_cpus = { workspace = true }
opentelemetry = { workspace = true, features = ["metrics"] }
opentelemetry-otlp = { workspace = true, features = ["metrics", "grpc-tonic"] }
opentelemetry-stdout = { workspace = true, features = ["metrics"] }
opentelemetry_sdk = { workspace = true, features = ["rt-tokio"] }
phoenix-channel = { workspace = true }

View File

@@ -7,10 +7,13 @@ use firezone_bin_shared::{
platform::{tcp_socket_factory, udp_socket_factory},
};
use firezone_telemetry::{Telemetry, otel};
use firezone_telemetry::{
MaybePushMetricsExporter, NoopPushMetricsExporter, Telemetry, feature_flags, otel,
};
use firezone_tunnel::GatewayTunnel;
use ip_packet::IpPacket;
use opentelemetry_sdk::metrics::{PeriodicReader, SdkMeterProvider};
use opentelemetry_otlp::WithExportConfig;
use opentelemetry_sdk::metrics::SdkMeterProvider;
use phoenix_channel::LoginUrl;
use phoenix_channel::get_user_agent;
@@ -115,17 +118,34 @@ async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result<()> {
.await;
}
if cli.metrics {
let exporter = opentelemetry_stdout::MetricExporter::default();
let reader = PeriodicReader::builder(exporter).build();
let provider = SdkMeterProvider::builder()
.with_reader(reader)
.with_resource(otel::default_resource_with([
otel::attr::service_name!(),
otel::attr::service_version!(),
otel::attr::service_instance_id(firezone_id.clone()),
]))
.build();
if let Some(backend) = cli.metrics {
let resource = otel::default_resource_with([
otel::attr::service_name!(),
otel::attr::service_version!(),
otel::attr::service_instance_id(firezone_id.clone()),
]);
let provider = match (backend, cli.otlp_grpc_endpoint) {
(MetricsExporter::Stdout, _) => SdkMeterProvider::builder()
.with_periodic_exporter(opentelemetry_stdout::MetricExporter::default())
.with_resource(resource)
.build(),
(MetricsExporter::OtelCollector, Some(endpoint)) => SdkMeterProvider::builder()
.with_periodic_exporter(tonic_otlp_exporter(endpoint)?)
.with_resource(resource)
.build(),
(MetricsExporter::OtelCollector, None) => SdkMeterProvider::builder()
.with_periodic_exporter(MaybePushMetricsExporter {
inner: {
// TODO: Once Firezone has a hosted OTLP exporter, it will go here.
NoopPushMetricsExporter
},
should_export: feature_flags::export_metrics,
})
.with_resource(resource)
.build(),
};
opentelemetry::global::set_meter_provider(provider);
}
@@ -195,6 +215,18 @@ async fn try_main(cli: Cli, telemetry: &mut Telemetry) -> Result<()> {
}
}
/// Builds an OTLP metric exporter that pushes to `endpoint` over gRPC (tonic).
///
/// `endpoint` may be a bare `host:port` authority, in which case `http://` is prepended.
/// An endpoint that already carries a URL scheme (i.e. contains `://`) is passed through
/// unchanged instead of being turned into an invalid `http://http://…` URL.
///
/// # Errors
///
/// Returns an error if the underlying exporter builder fails.
fn tonic_otlp_exporter(
    endpoint: String,
) -> Result<opentelemetry_otlp::MetricExporter, anyhow::Error> {
    // Only add the default scheme when the caller did not specify one.
    let url = if endpoint.contains("://") {
        endpoint
    } else {
        format!("http://{endpoint}")
    };

    opentelemetry_otlp::MetricExporter::builder()
        .with_tonic()
        .with_endpoint(url)
        .build()
        .context("Failed to build OTLP metric exporter")
}
async fn get_firezone_id(env_id: Option<String>) -> Result<String> {
if let Some(id) = env_id
&& !id.is_empty()
@@ -250,9 +282,15 @@ struct Cli {
#[arg(long, env = "FIREZONE_NUM_TUN_THREADS", default_value_t)]
tun_threads: NumThreads,
/// Dump internal metrics to stdout every 60s.
#[arg(long, hide = true, env = "FIREZONE_METRICS", default_value_t = false)]
metrics: bool,
/// Where to export metrics to.
#[arg(long, hide = true, env = "FIREZONE_METRICS")]
metrics: Option<MetricsExporter>,
/// Send metrics to a custom OTLP collector.
///
/// By default, Firezone's hosted OTLP collector is used.
#[arg(long, env, hide = true)]
otlp_grpc_endpoint: Option<String>,
/// Validates the checksums of all packets leaving the TUN device.
#[arg(
@@ -264,6 +302,12 @@ struct Cli {
validate_checksums: bool,
}
/// Backends that metrics can be exported to, selected via `--metrics` / `FIREZONE_METRICS`.
///
/// - `Stdout`: metrics are periodically exported to stdout.
/// - `OtelCollector`: metrics are pushed to an OTLP collector. With `--otlp-grpc-endpoint`
///   set, that endpoint is used; without it, exporting is currently a no-op that is
///   additionally gated on the `export_metrics` feature flag.
#[derive(Debug, Clone, Copy, clap::ValueEnum)]
enum MetricsExporter {
    Stdout,
    OtelCollector,
}
impl Cli {
fn is_telemetry_allowed(&self) -> bool {
!self.no_telemetry

View File

@@ -102,8 +102,8 @@ struct Cli {
no_telemetry: bool,
/// Dump internal metrics to stdout every 60s.
#[arg(long, env = "FIREZONE_METRICS", default_value_t = false)]
metrics: bool,
#[arg(long, hide = true, env = "FIREZONE_METRICS")]
metrics: Option<MetricsExporter>,
/// A filesystem path where the token can be found
// Apparently passing secrets through stdin is the most secure method, but
@@ -114,6 +114,11 @@ struct Cli {
token_path: PathBuf,
}
/// Backends that metrics can be exported to, selected via `--metrics` / `FIREZONE_METRICS`.
///
/// Only `Stdout` is available here: metrics are periodically exported to stdout.
#[derive(Debug, Clone, Copy, clap::ValueEnum)]
enum MetricsExporter {
    Stdout,
}
impl Cli {
fn is_telemetry_allowed(&self) -> bool {
!self.no_telemetry
@@ -223,7 +228,7 @@ fn main() -> Result<()> {
let mut last_connlib_start_instant = Some(Instant::now());
rt.block_on(async {
if cli.metrics {
if let Some(MetricsExporter::Stdout) = cli.metrics {
let exporter = opentelemetry_stdout::MetricExporter::default();
let reader = PeriodicReader::builder(exporter).build();
let provider = SdkMeterProvider::builder()

View File

@@ -6,11 +6,12 @@ license = { workspace = true }
[dependencies]
anyhow = { workspace = true }
futures = { workspace = true }
hex = { workspace = true }
ip-packet = { workspace = true }
moka = { workspace = true, features = ["sync"] }
opentelemetry = { workspace = true }
opentelemetry_sdk = { workspace = true }
opentelemetry_sdk = { workspace = true, features = ["metrics"] }
parking_lot = { workspace = true }
reqwest = { workspace = true }
sentry = { workspace = true, features = ["contexts", "backtrace", "debug-images", "panic", "reqwest", "rustls", "tracing", "release-health", "logs"] }

View File

@@ -39,6 +39,10 @@ pub fn map_enobufs_to_would_block() -> bool {
FEATURE_FLAGS.map_enobufs_to_wouldblock()
}
/// Whether metrics should be exported to an OTEL collector.
///
/// Currently always returns `false`.
pub fn export_metrics() -> bool {
    // Placeholder until we actually deploy an OTEL collector.
    const EXPORT_METRICS: bool = false;

    EXPORT_METRICS
}
pub(crate) async fn evaluate_now(user_id: String, env: Env) {
if user_id.is_empty() {
return;

View File

@@ -15,8 +15,13 @@ pub mod feature_flags;
pub mod otel;
mod api_url;
mod maybe_push_metrics_exporter;
mod noop_push_metrics_exporter;
mod posthog;
pub use maybe_push_metrics_exporter::MaybePushMetricsExporter;
pub use noop_push_metrics_exporter::NoopPushMetricsExporter;
pub struct Dsn(&'static str);
// TODO: Dynamic DSN

View File

@@ -0,0 +1,38 @@
use std::future::Future;
use futures::future::Either;
use opentelemetry_sdk::{
error::OTelSdkResult,
metrics::{Temporality, data::ResourceMetrics, exporter::PushMetricExporter},
};
/// A [`PushMetricExporter`] wrapper that forwards exports to `inner` only while
/// `should_export` returns `true`.
pub struct MaybePushMetricsExporter<E, F> {
    /// The wrapped exporter that receives the metrics when exporting is enabled.
    pub inner: E,
    /// Evaluated on every export; when it returns `false`, the metrics are not forwarded.
    pub should_export: F,
}
impl<E, F> PushMetricExporter for MaybePushMetricsExporter<E, F>
where
    E: PushMetricExporter,
    F: Fn() -> bool + Send + Sync + 'static,
{
    /// Forwards `metrics` to the inner exporter if `should_export` returns `true`;
    /// otherwise resolves immediately with `Ok(())` without exporting anything.
    fn export(&self, metrics: &mut ResourceMetrics) -> impl Future<Output = OTelSdkResult> + Send {
        let enabled = (self.should_export)();

        if enabled {
            Either::Left(self.inner.export(metrics))
        } else {
            Either::Right(std::future::ready(Ok(())))
        }
    }

    /// Always delegates to the inner exporter, regardless of `should_export`.
    fn force_flush(&self) -> OTelSdkResult {
        let inner = &self.inner;
        inner.force_flush()
    }

    /// Always delegates to the inner exporter, regardless of `should_export`.
    fn shutdown(&self) -> OTelSdkResult {
        let inner = &self.inner;
        inner.shutdown()
    }

    /// Reports the temporality of the inner exporter.
    fn temporality(&self) -> Temporality {
        let inner = &self.inner;
        inner.temporality()
    }
}

View File

@@ -0,0 +1,26 @@
use std::future::Future;
use opentelemetry_sdk::{
error::OTelSdkResult,
metrics::{Temporality, data::ResourceMetrics, exporter::PushMetricExporter},
};
/// A metrics exporter that discards everything it is given.
///
/// Derives the common traits so it can be logged, copied and default-constructed
/// like any other public unit type.
#[derive(Debug, Clone, Copy, Default)]
pub struct NoopPushMetricsExporter;
impl PushMetricExporter for NoopPushMetricsExporter {
    /// Ignores the given metrics and resolves immediately with `Ok(())`.
    fn export(&self, _metrics: &mut ResourceMetrics) -> impl Future<Output = OTelSdkResult> + Send {
        let done: OTelSdkResult = Ok(());
        std::future::ready(done)
    }

    /// Nothing is buffered, so flushing always succeeds.
    fn force_flush(&self) -> OTelSdkResult {
        Ok(())
    }

    /// There are no resources to release, so shutdown always succeeds.
    fn shutdown(&self) -> OTelSdkResult {
        Ok(())
    }

    /// Reports the default temporality.
    fn temporality(&self) -> Temporality {
        Default::default()
    }
}

View File

@@ -1,7 +1,7 @@
use opentelemetry::KeyValue;
use opentelemetry_sdk::{
Resource,
resource::{ResourceDetector, TelemetryResourceDetector},
resource::{EnvResourceDetector, ResourceDetector, TelemetryResourceDetector},
};
pub mod attr {
@@ -117,6 +117,7 @@ pub fn default_resource_with<const N: usize>(attributes: [KeyValue; N]) -> Resou
Resource::builder_empty()
.with_detector(Box::new(TelemetryResourceDetector))
.with_detector(Box::new(OsResourceDetector))
.with_detector(Box::new(EnvResourceDetector::new()))
.with_attributes(attributes)
.build()
}