From cdfe086ca9bd461bb829260e0acef048f3f10a3b Mon Sep 17 00:00:00 2001 From: Thomas Eizinger Date: Fri, 8 Sep 2023 15:45:44 +1000 Subject: [PATCH] feat(relay): push spans to OTLP-receiver (#1995) When I built https://github.com/firezone/firezone/pull/1994, I hadn't yet fully understood how tracing works on Google Cloud. Logs and traces are separated in Google Cloud. Most importantly, traces need to be _pushed_ to Google Cloud whereas logs are scraped automatically. Logs can _reference_ traces via particular fields, in particular `logging.googleapis.com/spanId` and `projects/{project_id}/traces/{trace_id}`. Within the container-optimised OS that we are running on, we are already authenticated to all the Google APIs. Thus, we can utilize the `GcpAuthorizer` from the `opentelemetry_stackdriver` module which will automatically obtain a token from the internal metadata endpoint. Thus no external configuration is necessary. We split the configuration for logs / traces into two components: - `LOG_FORMAT`: Specifies how the logs are formatted. Can be `human`, `json` or `google-cloud`. - `TRACE_RECEIVER`: Optional. Specifies where the traces are sent to. If specified, we will also slightly tweak the log configuration to embed the project ID which allows Google Cloud Trace to cross-reference log entries with traces. --- rust/Cargo.lock | 308 +++++++++++++++++++++++++++- rust/relay/Cargo.toml | 5 +- rust/relay/src/main.rs | 124 +++++++++-- terraform/modules/relay-app/main.tf | 8 +- 4 files changed, 416 insertions(+), 29 deletions(-) diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 00d075db1..3f9988659 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -302,6 +302,28 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "async-stream" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd56dd203fef61ac097dd65721a419ddccb106b2d2b70ba60a6b529f03961a51" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.28", +] + [[package]] name = "async-trait" version = "0.1.73" @@ -855,6 +877,15 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9cace84e55f07e7301bae1c519df89cdad8cc3cd868413d3fdbdeca9ff3db484" +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if 1.0.0", +] + [[package]] name = "crossbeam-channel" version = "0.5.8" @@ -1040,6 +1071,27 @@ dependencies = [ "subtle", ] +[[package]] +name = "dirs-next" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b98cf8ebf19c3d1b223e151f99a4f9f0690dca41414773390fc824184ac833e1" +dependencies = [ + "cfg-if 1.0.0", + "dirs-sys-next", +] + +[[package]] +name = "dirs-sys-next" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ebda144c4fe02d1f7ea1a7d9641b6fc6b580adcfa024ae48797ecdeb6825b4d" +dependencies = [ + "libc", + "redox_users", + "winapi", +] + [[package]] name = "displaydoc" version = "0.2.4" @@ -1277,6 +1329,16 @@ dependencies = [ "wintun", ] +[[package]] +name = "flate2" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c6c98ee8095e9d1dcbf2fcc6d95acccb90d1c81db1e44725c6a984b1dbdfb010" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "fnv" version = "1.0.7" @@ -1394,6 +1456,31 @@ dependencies = [ "url", ] +[[package]] +name = "gcp_auth" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ab5724fa45095bf1965ff491e75182818ba13f2a8755b04d484a9ec6408b622" +dependencies = [ + "async-trait", + "base64 0.21.3", + "dirs-next", + "hyper", + "hyper-rustls", + "ring", + "rustls 0.20.9", + "rustls-pemfile", + "serde", + "serde_json", + "thiserror", + "time 0.3.22", + "tokio", + "tracing", + "tracing-futures", + "url", + "which", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -1457,6 +1544,25 @@ dependencies = [ "subtle", ] +[[package]] +name = "h2" +version = "0.3.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91fc23aa11be92976ef4729127f1a74adf36d8436f7816b185d18df956790833" +dependencies = [ + "bytes", + "fnv", + "futures-core", + "futures-sink", + "futures-util", + "http", + "indexmap", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -1585,6 +1691,7 @@ dependencies = [ "futures-channel", "futures-core", "futures-util", + "h2", "http", "http-body", "httparse", @@ -1598,6 +1705,32 @@ dependencies = [ "want", ] +[[package]] +name = "hyper-rustls" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1788965e61b367cd03a62950836d5cd41560c3577d90e40e0819373194d1661c" +dependencies = [ + "http", + "hyper", + "rustls 0.20.9", + "rustls-native-certs", + "tokio", + "tokio-rustls 0.23.4", +] + +[[package]] +name = "hyper-timeout" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb958482e8c7be4bc3cf272a766a2b0bf1a6755e7a6ae777f017a31d11b13b1" +dependencies = [ + "hyper", + "pin-project-lite", + "tokio", + "tokio-io-timeout", +] + [[package]] name = "iana-time-zone" version = "0.1.57" @@ -2243,6 +2376,35 @@ dependencies = [ "opentelemetry_sdk", ] +[[package]] +name = "opentelemetry-semantic-conventions" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24e33428e6bf08c6f7fcea4ddb8e358fab0fe48ab877a87c70c6ebe20f673ce5" +dependencies = [ + "opentelemetry", +] + +[[package]] +name = "opentelemetry-stackdriver" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff78d1abfa634182471924e542004d1b46996a31a481da114eda4abaad9d5b61" +dependencies = [ + "async-trait", + "futures", + "gcp_auth", + "hex", + "http", + "hyper", + "opentelemetry", + "opentelemetry-semantic-conventions", + "prost", + "prost-types", + "thiserror", + "tonic", +] + [[package]] name = "opentelemetry_api" version = "0.19.0" @@ -2277,6 +2439,8 @@ dependencies = [ "percent-encoding", "rand", "thiserror", + "tokio", + "tokio-stream", ] [[package]] @@ -2335,7 +2499,7 @@ checksum = "93f00c865fe7cabf650081affecd3871070f26767e7b2070a3ffae14c654b447" dependencies = [ "cfg-if 1.0.0", "libc", - "redox_syscall", + "redox_syscall 0.3.5", "smallvec", "windows-targets 0.48.1", ] @@ -2563,6 +2727,38 @@ dependencies = [ "unarray", ] +[[package]] +name = "prost" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b82eaa1d779e9a4bc1c3217db8ffbeabaae1dca241bf70183242128d48681cd" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-derive" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5d2d8d10f3c6ded6da8b05b5fb3b8a5082514344d56c9f871412d29b4e075b4" +dependencies = [ + "anyhow", + "itertools 0.10.5", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "prost-types" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "213622a1460818959ac1181aaeb2dc9c7f63df720db7d788b3e24eacd1983e13" +dependencies = [ + "prost", +] + [[package]] name = "quick-error" version = "1.2.3" @@ -2649,6 +2845,15 @@ dependencies = [ "url", ] +[[package]] +name = "redox_syscall" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb5a58c1855b4b6819d59012155603f0b22ad30cad752600aadfcb695265519a" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "redox_syscall" version = "0.3.5" @@ -2658,6 +2863,17 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_users" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b033d837a7cf162d7993aded9304e30a83213c648b6e389db233191f891e5c2b" +dependencies = [ + "getrandom", + "redox_syscall 0.2.16", + "thiserror", +] + [[package]] name = "regex" version = "1.9.3" @@ -2719,7 +2935,8 @@ dependencies = [ "hex", "hex-literal", "once_cell", - "opentelemetry_sdk", + "opentelemetry", + "opentelemetry-stackdriver", "phoenix-channel", "prometheus-client", "proptest", @@ -2876,6 +3093,18 @@ dependencies = [ "webpki 0.21.4", ] +[[package]] +name = "rustls" +version = "0.20.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b80e3dec595989ea8510028f30c408a4630db12c9cbb8de34203b89d6577e99" +dependencies = [ + "log", + "ring", + "sct 0.7.0", + "webpki 0.22.0", +] + [[package]] name = "rustls" version = "0.21.2" @@ -3386,7 +3615,7 @@ dependencies = [ "autocfg", "cfg-if 1.0.0", "fastrand", - "redox_syscall", + "redox_syscall 0.3.5", "rustix 0.37.22", "windows-sys 0.48.0", ] @@ -3514,6 +3743,16 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "tokio-io-timeout" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30b74022ada614a1b4834de765f9bb43877f910cc8ce4be40e89042c9223a8bf" +dependencies = [ + "pin-project-lite", + "tokio", +] + [[package]] name = "tokio-macros" version = "2.1.0" @@ -3525,6 +3764,17 @@ dependencies = [ "syn 2.0.28", ] +[[package]] +name = "tokio-rustls" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c43ee83903113e03984cb9e5cebe6c04a5116269e900e3ddba8f068a62adda59" +dependencies = [ + "rustls 0.20.9", + "tokio", + "webpki 0.22.0", +] + [[package]] name = "tokio-rustls" version = "0.24.1" @@ -3557,7 +3807,7 @@ dependencies = [ "rustls 0.21.2", "rustls-native-certs", "tokio", - "tokio-rustls", + "tokio-rustls 0.24.1", "tungstenite", "webpki-roots", ] @@ -3576,6 +3826,41 @@ dependencies = [ "tracing", ] +[[package]] +name = "tonic" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f219fad3b929bef19b1f86fbc0358d35daed8f2cac972037ac0dc10bbb8d5fb" +dependencies = [ + "async-stream", + "async-trait", + "axum", + "base64 0.13.1", + "bytes", + "flate2", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "hyper", + "hyper-timeout", + "percent-encoding", + "pin-project", + "prost", + "prost-derive", + "rustls-pemfile", + "tokio", + "tokio-rustls 0.23.4", + "tokio-stream", + "tokio-util", + "tower", + "tower-layer", + "tower-service", + "tracing", + "tracing-futures", +] + [[package]] name = "tower" version = "0.4.13" @@ -3584,11 +3869,16 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" dependencies = [ "futures-core", "futures-util", + "indexmap", "pin-project", "pin-project-lite", + "rand", + "slab", "tokio", + "tokio-util", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -3637,6 +3927,16 @@ dependencies = [ "valuable", ] +[[package]] +name = "tracing-futures" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97d095ae15e245a057c8e8451bab9b3ee1e1f68e9ba2b4fbc18d0ac5237835f2" +dependencies = [ + "pin-project", + "tracing", +] + [[package]] name = "tracing-log" version = "0.1.3" diff --git a/rust/relay/Cargo.toml b/rust/relay/Cargo.toml index 7d785bdbc..627e06c30 100644 --- a/rust/relay/Cargo.toml +++ b/rust/relay/Cargo.toml @@ -14,10 +14,11 @@ rand = "0.8.5" stun_codec = "0.3.1" tokio = { version = "1.32.0", features = ["macros", "rt-multi-thread", "net", "time"] } tracing = { version = "0.1.37", features = ["log"] } -tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } +tracing-subscriber = { version = "0.3", features = ["env-filter", "json", "fmt"] } tracing-stackdriver = { version = "0.7.2", features = ["opentelemetry"] } +opentelemetry-stackdriver = { version = "0.16.0", default-features = false, features = ["gcp_auth"] } tracing-opentelemetry = "0.19.0" -opentelemetry_sdk = "0.19.0" +opentelemetry = { version = "0.19.0", features = ["rt-tokio"] } env_logger = "0.10.0" bytes = "1.4.0" sha2 = "0.10.6" diff --git a/rust/relay/src/main.rs b/rust/relay/src/main.rs index 26caf8233..d30347fa6 100644 --- a/rust/relay/src/main.rs +++ b/rust/relay/src/main.rs @@ -2,6 +2,9 @@ use anyhow::{anyhow, bail, Context, Result}; use clap::Parser; use futures::channel::mpsc; use futures::{future, FutureExt, SinkExt, StreamExt}; +use opentelemetry::sdk::trace::TracerProvider; +use opentelemetry::trace::TracerProvider as _; +use opentelemetry_stackdriver::Authorizer; use phoenix_channel::{Error, Event, PhoenixChannel}; use prometheus_client::registry::Registry; use rand::rngs::StdRng; @@ -18,6 +21,7 @@ use std::pin::Pin; use std::task::Poll; use std::time::SystemTime; use tracing::level_filters::LevelFilter; +use tracing::Subscriber; use tracing_stackdriver::CloudTraceConfiguration; use tracing_subscriber::layer::SubscriberExt; use tracing_subscriber::util::SubscriberInitExt; @@ -57,35 +61,38 @@ struct Args { #[arg(long, env)] rng_seed: Option, - /// The Google Cloud `project-id`. - /// - /// If specified, logs will be tuned for Google Cloud Trace. + /// How to format the logs. + #[arg(long, env, default_value = "human")] + log_format: LogFormat, + + /// Where to send trace data to. #[arg(long, env)] - google_cloud_project_id: Option, + trace_collector: Option, +} + +#[derive(clap::ValueEnum, Debug, Clone, Copy)] +enum LogFormat { + Human, + Json, + GoogleCloud, +} + +#[derive(clap::ValueEnum, Debug, Clone, Copy)] +enum TraceCollector { + /// Sends traces to Google Cloud Trace. + GoogleCloudTrace, + // TODO: Extend with OTLP receiver } #[tokio::main] async fn main() -> Result<()> { let args = Args::parse(); - let env_filter = EnvFilter::builder() - .with_default_directive(LevelFilter::INFO.into()) - .from_env_lossy(); + setup_tracing(&args).await?; - if let Some(project_id) = args.google_cloud_project_id { - tracing_subscriber::registry() - .with( - tracing_stackdriver::layer() - .with_cloud_trace(CloudTraceConfiguration { project_id }) - .with_filter(env_filter), - ) - .with(tracing_opentelemetry::layer().with_tracer( - opentelemetry_sdk::export::trace::stdout::new_pipeline().install_simple(), - )) - .init() - } else { - tracing_subscriber::fmt().with_env_filter(env_filter).init() - } + // Must create a root span for traces to be sampled. + let root = tracing::error_span!("root"); + let _root = root.enter(); let public_addr = match (args.public_ip4_addr, args.public_ip6_addr) { (Some(ip4), Some(ip6)) => IpStack::Dual { ip4, ip6 }, @@ -180,6 +187,81 @@ async fn main() -> Result<()> { Ok(()) } +/// Sets up our tracing infrastructure. +/// +/// See [`log_layer`] for details on the base log layer. +/// +/// If the user has specified [`TraceCollector::GoogleCloudTrace`], we will attempt to connec to Google Cloud Trace. +/// This requires authentication. +/// Here is how we will attempt to obtain those, for details see . +/// +/// 1. Check if the `GOOGLE_APPLICATION_CREDENTIALS` environment variable if set; if so, use a custom service account as the token source. +/// 2. Look for credentials in `.config/gcloud/application_default_credentials.json`; if found, use these credentials to request refresh tokens. +/// 3. Send a HTTP request to the internal metadata server to retrieve a token; if it succeeds, use the default service account as the token source. +/// 4. Check if the `gcloud` tool is available on the PATH; if so, use the `gcloud auth print-access-token` command as the token source. +async fn setup_tracing(args: &Args) -> Result<()> { + let registry = tracing_subscriber::registry(); + + match args.trace_collector { + None => registry.with(log_layer(args, None)).try_init(), + Some(TraceCollector::GoogleCloudTrace) => { + let authorizer = opentelemetry_stackdriver::GcpAuthorizer::new() + .await + .context("Failed to find GCP credentials")?; + + let project_id = authorizer.project_id().to_owned(); + + let (exporter, driver) = opentelemetry_stackdriver::Builder::default() + .build(authorizer) + .await?; + tokio::spawn(driver); + + let tracer = TracerProvider::builder() + .with_batch_exporter(exporter, opentelemetry::runtime::Tokio) + .build() + .tracer("relay"); + + registry + .with(log_layer(args, Some(project_id))) + .with(tracing_opentelemetry::layer().with_tracer(tracer)) + .try_init() + } + } + .context("Failed to init tracing")?; + + Ok(()) +} + +/// Constructs the base log layer. +/// +/// The user has a choice between: +/// +/// - human-centered formatting +/// - JSON-formatting +/// - Google Cloud optimised formatting +fn log_layer( + args: &Args, + google_cloud_trace_project_id: Option, +) -> Box + Send + Sync> +where + T: Subscriber + for<'a> tracing_subscriber::registry::LookupSpan<'a>, +{ + let env_filter = EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .from_env_lossy(); + + let log_layer = match (args.log_format, google_cloud_trace_project_id) { + (LogFormat::Human, _) => tracing_subscriber::fmt::layer().boxed(), + (LogFormat::Json, _) => tracing_subscriber::fmt::layer().json().boxed(), + (LogFormat::GoogleCloud, None) => tracing_stackdriver::layer().boxed(), + (LogFormat::GoogleCloud, Some(project_id)) => tracing_stackdriver::layer() + .with_cloud_trace(CloudTraceConfiguration { project_id }) + .boxed(), + }; + + log_layer.with_filter(env_filter).boxed() +} + #[derive(serde::Serialize, PartialEq, Debug)] struct JoinMessage { stamp_secret: String, diff --git a/terraform/modules/relay-app/main.tf b/terraform/modules/relay-app/main.tf index b6dc8b4ad..8790fbd8e 100644 --- a/terraform/modules/relay-app/main.tf +++ b/terraform/modules/relay-app/main.tf @@ -23,8 +23,12 @@ locals { value = var.observability_log_level }, { - name = "GOOGLE_CLOUD_PROJECT_ID" - value = var.project_id + name = "LOG_FORMAT" + value = "google-cloud" + }, + { + name = "TRACE_RECEIVER" + value = "google-cloud-trace" }, { name = "METRICS_ADDR"