From d1761e5a5d05b7cdf7640bf17e029e80934d2032 Mon Sep 17 00:00:00 2001 From: Jamil Date: Wed, 5 Feb 2025 10:30:04 -0800 Subject: [PATCH] fix(infra): Bust cache for already-reported min/max metrics (#8026) On staging (only), we see these logs spammed: ``` {400, "{\n \"error\": {\n \"code\": 400,\n \"message\": \"One or more TimeSeries could not be written: timeSeries[22]: Metric kind for metric custom.googleapis.com/elixir/domain/repo/query/queue_time/min must be CUMULATIVE, but is GAUGE.; timeSeries[11]: Metric kind for metric custom.googleapis.com/elixir/domain/repo/query/query_time/max must be CUMULATIVE, but is GAUGE.; timeSeries[8]: Metric kind for metric custom.googleapis.com/elixir/domain/repo/query/idle_time/max must be CUMULATIVE, but is GAUGE.; timeSeries[7]: Metric kind for metric custom.googleapis.com/elixir/domain/repo/query/idle_time/min must be CUMULATIVE, but is GAUGE.; timeSeries[10]: Metric kind for metric custom.googleapis.com/elixir/domain/repo/query/query_time/min must be CUMULATIVE, but is GAUGE.; timeSeries[14]: Metric kind for metric custom.googleapis.com/elixir/domain/repo/query/query_time/max must be CUMULATIVE, but is GAUGE.; timeSeries[13]: Metric kind for metric custom.googleapis.com/elixir/domain/repo/query/query_time/min must be CUMULATIVE, but is GAUGE.; timeSeries[16]: Metric kind for metric custom.googleapis.com/elixir/domain/repo/query/query_time/min must be CUMULATIVE, but is GAUGE.; timeSeries[23]: Metric kind for metric custom.googleapis.com/elixir/domain/repo/query/queue_time/max must be CUMULATIVE, but is GAUGE.; timeSeries[20]: Metric kind for metric custom.googleapis.com/elixir/domain/repo/query/query_time/max must be CUMULATIVE, but is GAUGE.; timeSeries[19]: Metric kind for metric custom.googleapis.com/elixir/domain/repo/query/query_time/min must be CUMULATIVE, but is GAUGE.; timeSeries[17]: Metric kind for metric custom.googleapis.com/elixir/domain/repo/query/query_time/max must be CUMULATIVE, but is GAUGE.\",\n \"status\": \"INVALID_ARGUMENT\",\n \"details\": [\n {\n \"@type\": \"type.googleapis.com/google.monitoring.v3.CreateTimeSeriesSummary\",\n \"totalPointCount\": 36,\n \"successPointCount\": 24,\n \"errors\": [\n {\n \"status\": {\n \"code\": 3\n },\n \"pointCount\": 12\n }\n ]\n }\n ]\n }\n}\n"} ``` This does not happen on prod. As far as I can tell, GCP will cache the initial metric kind used when the metric ID was first used, which appears to be `CUMULATIVE` here. The correct metric kind for these is `GAUGE` since they're reporting a min/max value. Since GCP doesn't support resetting the auto-defined metric kind of a particular metric ID, we need to alter the ID to "bust" the cache and create a new definition. --- .../lib/domain/telemetry/google_cloud_metrics_reporter.ex | 4 ++-- .../domain/telemetry/google_cloud_metrics_reporter_test.exs | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/elixir/apps/domain/lib/domain/telemetry/google_cloud_metrics_reporter.ex b/elixir/apps/domain/lib/domain/telemetry/google_cloud_metrics_reporter.ex index dad8042a9..8f5a6b8e3 100644 --- a/elixir/apps/domain/lib/domain/telemetry/google_cloud_metrics_reporter.ex +++ b/elixir/apps/domain/lib/domain/telemetry/google_cloud_metrics_reporter.ex @@ -376,7 +376,7 @@ defmodule Domain.Telemetry.GoogleCloudMetricsReporter do }, %{ metric: %{ - type: "custom.googleapis.com/elixir/#{Enum.join(name, "/")}/min", + type: "custom.googleapis.com/elixir/#{Enum.join(name, "/")}/min_val", labels: labels }, resource: resource, @@ -392,7 +392,7 @@ defmodule Domain.Telemetry.GoogleCloudMetricsReporter do }, %{ metric: %{ - type: "custom.googleapis.com/elixir/#{Enum.join(name, "/")}/max", + type: "custom.googleapis.com/elixir/#{Enum.join(name, "/")}/max_val", labels: labels }, resource: resource, diff --git a/elixir/apps/domain/test/domain/telemetry/google_cloud_metrics_reporter_test.exs b/elixir/apps/domain/test/domain/telemetry/google_cloud_metrics_reporter_test.exs index 599572db9..4d59e78cf 100644 --- a/elixir/apps/domain/test/domain/telemetry/google_cloud_metrics_reporter_test.exs +++ b/elixir/apps/domain/test/domain/telemetry/google_cloud_metrics_reporter_test.exs @@ -347,7 +347,7 @@ defmodule Domain.Telemetry.GoogleCloudMetricsReporterTest do %{ "metric" => %{ "labels" => %{"app" => "myapp", "foo" => "bar"}, - "type" => "custom.googleapis.com/elixir/foo/min" + "type" => "custom.googleapis.com/elixir/foo/min_val" }, "metricKind" => "GAUGE", "points" => [ @@ -365,7 +365,7 @@ defmodule Domain.Telemetry.GoogleCloudMetricsReporterTest do %{ "metric" => %{ "labels" => %{"app" => "myapp", "foo" => "bar"}, - "type" => "custom.googleapis.com/elixir/foo/max" + "type" => "custom.googleapis.com/elixir/foo/max_val" }, "metricKind" => "GAUGE", "points" => [