From a67e062254fbfea73baae649c626a7fe6f32b289 Mon Sep 17 00:00:00 2001 From: Mike Palmiotto Date: Wed, 4 Dec 2024 13:23:55 -0500 Subject: [PATCH] Fix unlocked mounts read (#29091) This PR fixes a copy-paste error in the product usage code where we were taking out the authLock to access the mount table. While we're at it, we can remove the existing lock grabbing in the product usage goroutine in favor of a serialized startup/teardown of censusManager and its core dependency which requires the lock. This requires some minor test edits, so this PR adds a test helper for that. By moving the censusManager teardown before expirationManager teardown, we can effectively ensure the goroutine is completely stopped outside of any expirationManager change. We are already guaranteed serial startup, so this should free us of any complex lock semantics. --- changelog/29091.txt | 3 +++ vault/core.go | 11 +++++++---- vault/core_metrics.go | 26 ++++++-------------------- vault/expiration.go | 7 +++++-- vault/expiration_test.go | 32 ++++++++++++++++++++------------ vault/token_store_test.go | 5 +---- 6 files changed, 42 insertions(+), 42 deletions(-) create mode 100644 changelog/29091.txt diff --git a/changelog/29091.txt b/changelog/29091.txt new file mode 100644 index 0000000000..409d8ddf4a --- /dev/null +++ b/changelog/29091.txt @@ -0,0 +1,3 @@ +```release-note:bug +core/metrics: Fix unlocked mounts read for usage reporting. +``` diff --git a/vault/core.go b/vault/core.go index bad9a45e96..d9e7ad62de 100644 --- a/vault/core.go +++ b/vault/core.go @@ -2898,14 +2898,17 @@ func (c *Core) preSeal() error { if err := c.teardownAudits(); err != nil { result = multierror.Append(result, fmt.Errorf("error tearing down audits: %w", err)) } - if err := c.stopExpiration(); err != nil { - result = multierror.Append(result, fmt.Errorf("error stopping expiration: %w", err)) - } + // Ensure that the ActivityLog and CensusManager are both completely torn + // down before stopping the ExpirationManager. 
This ordering is critical, + // due to a tight coupling between the ActivityLog, CensusManager, and + // ExpirationManager for product usage reporting. c.stopActivityLog() - // Clean up census on seal if err := c.teardownCensusManager(); err != nil { result = multierror.Append(result, fmt.Errorf("error tearing down reporting agent: %w", err)) } + if err := c.stopExpiration(); err != nil { + result = multierror.Append(result, fmt.Errorf("error stopping expiration: %w", err)) + } if err := c.teardownCredentials(context.Background()); err != nil { result = multierror.Append(result, fmt.Errorf("error tearing down credentials: %w", err)) } diff --git a/vault/core_metrics.go b/vault/core_metrics.go index 0ca9a0e094..60f0295d14 100644 --- a/vault/core_metrics.go +++ b/vault/core_metrics.go @@ -540,22 +540,15 @@ func getMeanNamespaceSecrets(mapOfNamespacesToSecrets map[string]int) int { func (c *Core) GetSecretEngineUsageMetrics() map[string]int { mounts := make(map[string]int) - c.authLock.RLock() - defer c.authLock.RUnlock() - - // we don't grab the statelock, so this code might run during or after the seal process. - // Therefore, we need to check if c.auth is nil. If we do not, this will panic when - // run after seal. - if c.auth == nil { - return mounts - } + c.mountsLock.RLock() + defer c.mountsLock.RUnlock() for _, entry := range c.mounts.Entries { - authType := entry.Type - if _, ok := mounts[authType]; !ok { - mounts[authType] = 1 + mountType := entry.Type + if _, ok := mounts[mountType]; !ok { + mounts[mountType] = 1 } else { - mounts[authType] += 1 + mounts[mountType] += 1 } } return mounts @@ -568,13 +561,6 @@ func (c *Core) GetAuthMethodUsageMetrics() map[string]int { c.authLock.RLock() defer c.authLock.RUnlock() - // we don't grab the statelock, so this code might run during or after the seal process. - // Therefore, we need to check if c.auth is nil. If we do not, this will panic when - // run after seal. 
- if c.auth == nil { - return mounts - } - for _, entry := range c.auth.Entries { authType := entry.Type if _, ok := mounts[authType]; !ok { diff --git a/vault/expiration.go b/vault/expiration.go index c2f652020c..37cb26c559 100644 --- a/vault/expiration.go +++ b/vault/expiration.go @@ -435,8 +435,11 @@ func (c *Core) setupExpiration(e ExpireLeaseStrategy) error { return nil } -// stopExpiration is used to stop the expiration manager before -// sealing the Vault. +// stopExpiration is used to stop the expiration manager before sealing Vault. +// This *must* be called after shutting down the ActivityLog and +// CensusManager to prevent Core's expirationManager reference from +// changing while being accessed by product usage reporting. This is +// an unfortunate side-effect of tight coupling between ActivityLog and Core. func (c *Core) stopExpiration() error { if c.expiration != nil { if err := c.expiration.Stop(); err != nil { diff --git a/vault/expiration_test.go b/vault/expiration_test.go index 9e0da07bdb..2ef6315625 100644 --- a/vault/expiration_test.go +++ b/vault/expiration_test.go @@ -855,10 +855,7 @@ func TestExpiration_Restore(t *testing.T) { } // Stop everything - err = c.stopExpiration() - if err != nil { - t.Fatalf("err: %v", err) - } + stopExpiration(t, c) if exp.leaseCount != 0 { t.Fatalf("expected %v leases, got %v", 0, exp.leaseCount) @@ -3008,6 +3005,23 @@ func registerOneLease(t *testing.T, ctx context.Context, exp *ExpirationManager) return leaseID } +// stopExpiration is a test helper which allows us to safely teardown the +// expiration manager. This preserves the shutdown order of Core for these few +// outlier tests that (previously) directly called [Core].stopExpiration(). 
+func stopExpiration(t *testing.T, core *Core) { + t.Helper() + core.stopActivityLog() + err := core.teardownCensusManager() + if err != nil { + t.Fatalf("error stopping census manager: %v", err) + } + + err = core.stopExpiration() + if err != nil { + t.Fatalf("error stopping expiration manager: %v", err) + } +} + func TestExpiration_MarkIrrevocable(t *testing.T) { c, _, _ := TestCoreUnsealed(t) exp := c.expiration @@ -3060,10 +3074,7 @@ func TestExpiration_MarkIrrevocable(t *testing.T) { } // stop and restore to verify that irrevocable leases are properly loaded from storage - err = c.stopExpiration() - if err != nil { - t.Fatalf("error stopping expiration manager: %v", err) - } + stopExpiration(t, c) err = exp.Restore(nil) if err != nil { @@ -3153,10 +3164,7 @@ func TestExpiration_StopClearsIrrevocableCache(t *testing.T) { exp.markLeaseIrrevocable(ctx, le, fmt.Errorf("test irrevocable error")) exp.pendingLock.Unlock() - err = c.stopExpiration() - if err != nil { - t.Fatalf("error stopping expiration manager: %v", err) - } + stopExpiration(t, c) if _, ok := exp.irrevocable.Load(leaseID); ok { t.Error("expiration manager irrevocable cache should be cleared on stop") diff --git a/vault/token_store_test.go b/vault/token_store_test.go index 8223108e98..6d58616966 100644 --- a/vault/token_store_test.go +++ b/vault/token_store_test.go @@ -1170,10 +1170,7 @@ func TestTokenStore_CreateLookup_ExpirationInRestoreMode(t *testing.T) { t.Fatalf("err: %v", err) } - err = c.stopExpiration() - if err != nil { - t.Fatal(err) - } + stopExpiration(t, c) // Reset expiration manager to restore mode ts.expiration.restoreModeLock.Lock()