From c93137d9a3ed237bc47765bd63c79968358da64c Mon Sep 17 00:00:00 2001 From: "Luis (LT) Carbonell" Date: Fri, 22 Sep 2023 10:53:26 -0400 Subject: [PATCH] Add Reindex In Progress Metric (#23160) * Add a telemetry metric to track if a reindex is in progress or not * changelog * Add other reindex related metrics * cleanup types * Add docs for these metrics * check for nil values --- changelog/23160.txt | 3 +++ vault/core_metrics.go | 23 +++++++++++++++++++ vault/core_util.go | 4 ++++ .../docs/internals/telemetry/metrics/all.mdx | 6 +++++ .../vault/core/replication/build_progress.mdx | 5 ++++ .../vault/core/replication/build_total.mdx | 5 ++++ .../vault/core/replication/reindex_stage.mdx | 11 +++++++++ 7 files changed, 57 insertions(+) create mode 100644 changelog/23160.txt create mode 100644 website/content/partials/telemetry-metrics/vault/core/replication/build_progress.mdx create mode 100644 website/content/partials/telemetry-metrics/vault/core/replication/build_total.mdx create mode 100644 website/content/partials/telemetry-metrics/vault/core/replication/reindex_stage.mdx diff --git a/changelog/23160.txt b/changelog/23160.txt new file mode 100644 index 0000000000..66e97bfaf6 --- /dev/null +++ b/changelog/23160.txt @@ -0,0 +1,3 @@ +```release-note:improvement +replication: Add re-index status metric to telemetry +``` diff --git a/vault/core_metrics.go b/vault/core_metrics.go index 11946bfeb5..022e2fcb7b 100644 --- a/vault/core_metrics.go +++ b/vault/core_metrics.go @@ -116,6 +116,29 @@ func (c *Core) metricsLoop(stopCh chan struct{}) { c.metricSink.SetGaugeWithLabels([]string{"core", "replication", "dr", "secondary"}, 0, nil) } + if haState == consts.Active { + reindexState := c.ReindexStage() + if reindexState != nil { + c.metricSink.SetGaugeWithLabels([]string{"core", "replication", "reindex_stage"}, float32(*reindexState), nil) + } else { + c.metricSink.SetGaugeWithLabels([]string{"core", "replication", "reindex_stage"}, 0, nil) + } + + buildProgress := 
c.BuildProgress() + if buildProgress != nil { + c.metricSink.SetGaugeWithLabels([]string{"core", "replication", "build_progress"}, float32(*buildProgress), nil) + } else { + c.metricSink.SetGaugeWithLabels([]string{"core", "replication", "build_progress"}, 0, nil) + } + + buildTotal := c.BuildTotal() + if buildTotal != nil { + c.metricSink.SetGaugeWithLabels([]string{"core", "replication", "build_total"}, float32(*buildTotal), nil) + } else { + c.metricSink.SetGaugeWithLabels([]string{"core", "replication", "build_total"}, 0, nil) + } + } + // If we're using a raft backend, emit raft metrics if rb, ok := c.underlyingPhysical.(*raft.RaftBackend); ok { rb.CollectMetrics(c.MetricSink()) diff --git a/vault/core_util.go b/vault/core_util.go index 18e12de23b..7b93e1d674 100644 --- a/vault/core_util.go +++ b/vault/core_util.go @@ -76,6 +76,10 @@ func (c *Core) UndoLogsEnabled() bool { return false } func (c *Core) UndoLogsPersisted() (bool, error) { return false, nil } func (c *Core) PersistUndoLogs() error { return nil } +func (c *Core) ReindexStage() *uint32 { return nil } +func (c *Core) BuildProgress() *uint32 { return nil } +func (c *Core) BuildTotal() *uint32 { return nil } + func (c *Core) teardownReplicationResolverHandler() {} func createSecondaries(*Core, *CoreConfig) {} diff --git a/website/content/docs/internals/telemetry/metrics/all.mdx b/website/content/docs/internals/telemetry/metrics/all.mdx index 7fd38ea1ac..f3258afd57 100644 --- a/website/content/docs/internals/telemetry/metrics/all.mdx +++ b/website/content/docs/internals/telemetry/metrics/all.mdx @@ -210,6 +210,12 @@ alphabetic order by name. 
@include 'telemetry-metrics/vault/core/replication/write_undo_logs.mdx' +@include 'telemetry-metrics/vault/core/replication/build_progress.mdx' + +@include 'telemetry-metrics/vault/core/replication/build_total.mdx' + +@include 'telemetry-metrics/vault/core/replication/reindex_stage.mdx' + @include 'telemetry-metrics/vault/core/seal_internal.mdx' @include 'telemetry-metrics/vault/core/seal_with_request.mdx' diff --git a/website/content/partials/telemetry-metrics/vault/core/replication/build_progress.mdx b/website/content/partials/telemetry-metrics/vault/core/replication/build_progress.mdx new file mode 100644 index 0000000000..44a00a2b95 --- /dev/null +++ b/website/content/partials/telemetry-metrics/vault/core/replication/build_progress.mdx @@ -0,0 +1,5 @@ +### vault.core.replication.build_progress ((#vault-core-replication-build_progress)) + +Metric type | Value | Description +----------- | ------- | ----------- +gauge | keys | Number of keys that have been inserted into the new tree diff --git a/website/content/partials/telemetry-metrics/vault/core/replication/build_total.mdx b/website/content/partials/telemetry-metrics/vault/core/replication/build_total.mdx new file mode 100644 index 0000000000..f6d10a2730 --- /dev/null +++ b/website/content/partials/telemetry-metrics/vault/core/replication/build_total.mdx @@ -0,0 +1,5 @@ +### vault.core.replication.build_total ((#vault-core-replication-build_total)) + +Metric type | Value | Description +----------- | ------- | ----------- +gauge | keys | Total number of keys that have to be inserted into the new tree diff --git a/website/content/partials/telemetry-metrics/vault/core/replication/reindex_stage.mdx b/website/content/partials/telemetry-metrics/vault/core/replication/reindex_stage.mdx new file mode 100644 index 0000000000..315ff5a3f7 --- /dev/null +++ b/website/content/partials/telemetry-metrics/vault/core/replication/reindex_stage.mdx @@ -0,0 +1,11 @@ +### vault.core.replication.reindex_stage 
((#vault-core-replication-reindex_stage)) + +Metric type | Value | Description +----------- | ------- | ----------- +gauge | stage | Current stage of the reindexing process + +- A value of `4` indicates the reindex process is committing any differences between the newly created tree and the old tree. +- A value of `3` indicates the reindex process is replaying WALs to ensure no updates were missed while scanning and building. +- A value of `2` indicates the reindex process is currently building a new merkle tree based on the values for the keys obtained in the scanning stage. +- A value of `1` indicates the reindex process is currently creating a list of all known storage keys. +- A value of `0` indicates that a reindex is not in progress.