From f800875affcbb300cd4cea8d3f5061a2ec31e8c8 Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Fri, 26 Jul 2024 02:36:42 +1000 Subject: [PATCH] fix(relay): don't hang when connecting to OTLP exporter (#6034) The dependency update in #6003 introduced a regression: Connecting to the OTLP exporter was hanging forever and thus the relay failed to start up. The hang seems to be related to _dropping_ the `meter_provider`. Looking at the changelog update, this change was actually called out: https://github.com/open-telemetry/opentelemetry-rust/blob/main/opentelemetry-otlp/CHANGELOG.md#v0170. By setting these providers globally, the relay starts up just fine. To ensure this doesn't regress again, we add an OTEL collector to our `docker-compose.yml` and configure the `relay-1` to connect to it. --- docker-compose.yml | 5 +++++ rust/relay/src/main.rs | 27 +++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) diff --git a/docker-compose.yml b/docker-compose.yml index 256dc629e..5f079ef05 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -429,6 +429,7 @@ services: RUST_LOG: ${RUST_LOG:-debug} RUST_BACKTRACE: 1 FIREZONE_API_URL: ws://api:8081 + OTLP_GRPC_ENDPOINT: otel:4317 build: target: dev context: rust @@ -491,6 +492,10 @@ services: app: ipv4_address: ${RELAY_2_PUBLIC_IP4_ADDR:-172.28.0.201} + otel: + image: otel/opentelemetry-collector:latest + networks: + app: # IPv6 is currently causing flakiness with GH actions and on our testbed. # Disabling until there's more time to debug. 
diff --git a/rust/relay/src/main.rs b/rust/relay/src/main.rs index f6d905b53..89a091b2e 100644 --- a/rust/relay/src/main.rs +++ b/rust/relay/src/main.rs @@ -7,8 +7,6 @@ use firezone_relay::{ PeerSocket, Server, Sleep, }; use futures::{future, FutureExt}; -use opentelemetry::KeyValue; -use opentelemetry_otlp::WithExportConfig; use phoenix_channel::{Event, LoginUrl, PhoenixChannel}; use rand::rngs::StdRng; use rand::{Rng, SeedableRng}; @@ -175,7 +173,9 @@ async fn main() -> Result<()> { /// /// If the user has specified [`TraceCollector::Otlp`], we will set up an OTLP-exporter that connects to an OTLP collector specified at `Args.otlp_grpc_endpoint`. fn setup_tracing(args: &Args) -> Result<()> { - use opentelemetry::trace::TracerProvider as _; + use opentelemetry::{global, trace::TracerProvider as _, KeyValue}; + use opentelemetry_otlp::WithExportConfig; + use opentelemetry_sdk::{runtime::Tokio, trace::Config, Resource}; // Use `tracing_core` directly for the temp logger because that one does not initialize a `log` logger. // A `log` Logger cannot be unset once set, so we can't use that for our temp logger during the setup. 
@@ -196,14 +196,16 @@ fn setup_tracing(args: &Args) -> Result<()> { .tonic() .with_endpoint(grpc_endpoint.clone()); - let provider = opentelemetry_otlp::new_pipeline() + let tracer_provider = opentelemetry_otlp::new_pipeline() .tracing() .with_exporter(exporter) - .with_trace_config(opentelemetry_sdk::trace::Config::default().with_resource( - opentelemetry_sdk::Resource::new(vec![KeyValue::new("service.name", "relay")]), - )) - .install_batch(opentelemetry_sdk::runtime::Tokio) + .with_trace_config( + Config::default() + .with_resource(Resource::new(vec![KeyValue::new("service.name", "relay")])), + ) + .install_batch(Tokio) .context("Failed to create OTLP trace pipeline")?; + global::set_tracer_provider(tracer_provider.clone()); tracing::trace!(target: "relay", "Successfully initialized trace provider on tokio runtime"); @@ -211,19 +213,20 @@ fn setup_tracing(args: &Args) -> Result<()> { .tonic() .with_endpoint(grpc_endpoint); - opentelemetry_otlp::new_pipeline() - .metrics(opentelemetry_sdk::runtime::Tokio) + let meter_provider = opentelemetry_otlp::new_pipeline() + .metrics(Tokio) .with_exporter(exporter) .build() .context("Failed to create OTLP metrics pipeline")?; + global::set_meter_provider(meter_provider); - tracing::trace!(target: "relay", "Successfully initialized metric controller on tokio runtime"); + tracing::trace!(target: "relay", "Successfully initialized metric provider on tokio runtime"); tracing_subscriber::registry() .with(log_layer(args).with_filter(env_filter())) .with( tracing_opentelemetry::layer() - .with_tracer(provider.tracer("relay")) + .with_tracer(tracer_provider.tracer("relay")) .with_filter(env_filter()), ) .into()