From 66494c8129cddf33eb0cf435b6cb2f76bc47416f Mon Sep 17 00:00:00 2001 From: Jason O'Donnell <2160810+jasonodonnell@users.noreply.github.com> Date: Mon, 30 Oct 2023 12:49:46 -0400 Subject: [PATCH] core: fix bug where deadlock detection was always on for expiration and quotas (#23902) * server: fix bug where deadlock detection was on for expiration and quotas * trim spaces * Add tests * Use trimspace and lower * Update test * changelog * fix config parsing --- changelog/23902.txt | 5 ++ vault/core.go | 39 ++++++++++++---- vault/core_test.go | 48 ++++++++++++++++++++ vault/expiration.go | 22 +++++++-- vault/quotas/quotas.go | 28 +++++++++--- vault/quotas/quotas_rate_limit_test.go | 2 +- vault/quotas/quotas_test.go | 6 +-- vault/testing.go | 14 ++++++ website/content/docs/configuration/index.mdx | 9 ++-- 9 files changed, 147 insertions(+), 26 deletions(-) create mode 100644 changelog/23902.txt diff --git a/changelog/23902.txt b/changelog/23902.txt new file mode 100644 index 0000000000..cbfec65096 --- /dev/null +++ b/changelog/23902.txt @@ -0,0 +1,5 @@ +```release-note:bug +core: fix bug where deadlock detection was always on for expiration and quotas. +These can now be configured individually with `detect_deadlocks`. +``` + diff --git a/vault/core.go b/vault/core.go index 68661f963d..07c9d3965a 100644 --- a/vault/core.go +++ b/vault/core.go @@ -20,6 +20,7 @@ import ( "os" "path/filepath" "runtime" + "slices" "strconv" "strings" "sync" @@ -690,6 +691,9 @@ type Core struct { // If any role based quota (LCQ or RLQ) is enabled, don't track lease counts by role impreciseLeaseRoleTracking bool + + // Config value for "detect_deadlocks". + detectDeadlocks []string } // c.stateLock needs to be held in read mode before calling this function. 
@@ -947,19 +951,28 @@ func CreateCore(conf *CoreConfig) (*Core, error) { if conf.NumRollbackWorkers == 0 { conf.NumRollbackWorkers = RollbackDefaultNumWorkers } - // Use imported logging deadlock if requested - var stateLock locking.RWMutex - if strings.Contains(conf.DetectDeadlocks, "statelock") { - stateLock = &locking.DeadlockRWMutex{} - } else { - stateLock = &locking.SyncRWMutex{} - } effectiveSDKVersion := conf.EffectiveSDKVersion if effectiveSDKVersion == "" { effectiveSDKVersion = version.GetVersion().Version } + var detectDeadlocks []string + if conf.DetectDeadlocks != "" { + detectDeadlocks = strings.Split(conf.DetectDeadlocks, ",") + for k, v := range detectDeadlocks { + detectDeadlocks[k] = strings.ToLower(strings.TrimSpace(v)) + } + } + + // Use imported logging deadlock if requested + var stateLock locking.RWMutex + stateLock = &locking.SyncRWMutex{} + + if slices.Contains(detectDeadlocks, "statelock") { + stateLock = &locking.DeadlockRWMutex{} + } + // Setup the core c := &Core{ entCore: entCore{}, @@ -1033,6 +1046,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) { rollbackMountPathMetrics: conf.MetricSink.TelemetryConsts.RollbackMetricsIncludeMountPoint, numRollbackWorkers: conf.NumRollbackWorkers, impreciseLeaseRoleTracking: conf.ImpreciseLeaseRoleTracking, + detectDeadlocks: detectDeadlocks, } c.standbyStopCh.Store(make(chan struct{})) @@ -1219,7 +1233,9 @@ func NewCore(conf *CoreConfig) (*Core, error) { // Quotas quotasLogger := conf.Logger.Named("quotas") - c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink) + + detectDeadlocks := slices.Contains(c.detectDeadlocks, "quotas") + c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink, detectDeadlocks) if err != nil { return nil, err } @@ -4188,3 +4204,10 @@ func (c *Core) GetRaftAutopilotState(ctx context.Context) (*raft.AutopilotState, func (c *Core) Events() *eventbus.EventBus { return c.events } + +func (c *Core) 
DetectStateLockDeadlocks() bool { + if _, ok := c.stateLock.(*locking.DeadlockRWMutex); ok { + return true + } + return false +} diff --git a/vault/core_test.go b/vault/core_test.go index 14e51ad3fa..5e777fe1de 100644 --- a/vault/core_test.go +++ b/vault/core_test.go @@ -3361,3 +3361,51 @@ func InduceDeadlock(t *testing.T, vaultcore *Core, expected uint32) { t.Fatalf("expected 1 deadlock, detected %d", deadlocks) } } + +func TestExpiration_DeadlockDetection(t *testing.T) { + testCore := TestCore(t) + testCoreUnsealed(t, testCore) + + if testCore.expiration.DetectDeadlocks() { + t.Fatal("expiration has deadlock detection enabled, it shouldn't") + } + + testCore = TestCoreWithDeadlockDetection(t, nil, false) + testCoreUnsealed(t, testCore) + + if !testCore.expiration.DetectDeadlocks() { + t.Fatal("expiration doesn't have deadlock detection enabled, it should") + } +} + +func TestQuotas_DeadlockDetection(t *testing.T) { + testCore := TestCore(t) + testCoreUnsealed(t, testCore) + + if testCore.quotaManager.DetectDeadlocks() { + t.Fatal("quotas has deadlock detection enabled, it shouldn't") + } + + testCore = TestCoreWithDeadlockDetection(t, nil, false) + testCoreUnsealed(t, testCore) + + if !testCore.quotaManager.DetectDeadlocks() { + t.Fatal("quotas doesn't have deadlock detection enabled, it should") + } +} + +func TestStatelock_DeadlockDetection(t *testing.T) { + testCore := TestCore(t) + testCoreUnsealed(t, testCore) + + if testCore.DetectStateLockDeadlocks() { + t.Fatal("statelock has deadlock detection enabled, it shouldn't") + } + + testCore = TestCoreWithDeadlockDetection(t, nil, false) + testCoreUnsealed(t, testCore) + + if !testCore.DetectStateLockDeadlocks() { + t.Fatal("statelock doesn't have deadlock detection enabled, it should") + } +} diff --git a/vault/expiration.go b/vault/expiration.go index c004629093..00680eaa88 100644 --- a/vault/expiration.go +++ b/vault/expiration.go @@ -11,6 +11,7 @@ import ( "math/rand" "os" "path" + "slices" "sort" "strconv" 
"strings" @@ -114,7 +115,7 @@ type ExpirationManager struct { pending sync.Map nonexpiring sync.Map leaseCount int - pendingLock locking.DeadlockRWMutex + pendingLock locking.RWMutex // A sync.Lock for every active leaseID lockPerLease sync.Map @@ -327,7 +328,7 @@ func getNumExpirationWorkers(c *Core, l log.Logger) int { // NewExpirationManager creates a new ExpirationManager that is backed // using a given view, and uses the provided router for revocation. -func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger) *ExpirationManager { +func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger, detectDeadlocks bool) *ExpirationManager { managerLogger := logger.Named("job-manager") jobManager := fairshare.NewJobManager("expire", getNumExpirationWorkers(c, logger), managerLogger, c.metricSink) jobManager.Start() @@ -340,6 +341,7 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log tokenStore: c.tokenStore, logger: logger, pending: sync.Map{}, + pendingLock: &locking.SyncRWMutex{}, nonexpiring: sync.Map{}, leaseCount: 0, tidyLock: new(int32), @@ -375,6 +377,11 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log exp.logger = log.New(&opts) } + if detectDeadlocks { + managerLogger.Debug("enabling deadlock detection") + exp.pendingLock = &locking.DeadlockRWMutex{} + } + go exp.uniquePoliciesGc() return exp @@ -390,7 +397,9 @@ func (c *Core) setupExpiration(e ExpireLeaseStrategy) error { // Create the manager expLogger := c.baseLogger.Named("expiration") - mgr := NewExpirationManager(c, view, e, expLogger) + + detectDeadlocks := slices.Contains(c.detectDeadlocks, "expiration") + mgr := NewExpirationManager(c, view, e, expLogger, detectDeadlocks) c.expiration = mgr // Link the token store to this @@ -2821,3 +2830,10 @@ func decodeLeaseEntry(buf []byte) (*leaseEntry, error) { out := new(leaseEntry) return out, jsonutil.DecodeJSON(buf, 
out) } + +func (e *ExpirationManager) DetectDeadlocks() bool { + if _, ok := e.pendingLock.(*locking.DeadlockRWMutex); ok { + return true + } + return false +} diff --git a/vault/quotas/quotas.go b/vault/quotas/quotas.go index 18e69346ad..cefa3d13cf 100644 --- a/vault/quotas/quotas.go +++ b/vault/quotas/quotas.go @@ -170,13 +170,13 @@ type Manager struct { metricSink *metricsutil.ClusterMetricSink // quotaLock is a lock for manipulating quotas and anything not covered by a more specific lock - quotaLock *locking.DeadlockRWMutex + quotaLock locking.RWMutex // quotaConfigLock is a lock for accessing config items, such as RateLimitExemptPaths - quotaConfigLock *locking.DeadlockRWMutex + quotaConfigLock locking.RWMutex // dbAndCacheLock is a lock for db and path caches that need to be reset during Reset() - dbAndCacheLock *locking.DeadlockRWMutex + dbAndCacheLock locking.RWMutex } // QuotaLeaseInformation contains all of the information lease-count quotas require @@ -275,7 +275,7 @@ type Request struct { // NewManager creates and initializes a new quota manager to hold all the quota // rules and to process incoming requests. 
-func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink) (*Manager, error) { +func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink, detectDeadlocks bool) (*Manager, error) { db, err := memdb.NewMemDB(dbSchema()) if err != nil { return nil, err @@ -287,9 +287,16 @@ func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.Clust metricSink: ms, rateLimitPathManager: pathmanager.New(), config: new(Config), - quotaLock: new(locking.DeadlockRWMutex), - quotaConfigLock: new(locking.DeadlockRWMutex), - dbAndCacheLock: new(locking.DeadlockRWMutex), + quotaLock: &locking.SyncRWMutex{}, + quotaConfigLock: &locking.SyncRWMutex{}, + dbAndCacheLock: &locking.SyncRWMutex{}, + } + + if detectDeadlocks { + logger.Debug("enabling deadlock detection") + manager.quotaLock = &locking.DeadlockRWMutex{} + manager.quotaConfigLock = &locking.DeadlockRWMutex{} + manager.dbAndCacheLock = &locking.DeadlockRWMutex{} } manager.init(walkFunc) @@ -1319,3 +1326,10 @@ func (m *Manager) HandleBackendDisabling(ctx context.Context, nsPath, mountPath return nil } + +func (m *Manager) DetectDeadlocks() bool { + if _, ok := m.quotaLock.(*locking.DeadlockRWMutex); ok { + return true + } + return false +} diff --git a/vault/quotas/quotas_rate_limit_test.go b/vault/quotas/quotas_rate_limit_test.go index 940a92c973..62d79051f2 100644 --- a/vault/quotas/quotas_rate_limit_test.go +++ b/vault/quotas/quotas_rate_limit_test.go @@ -218,7 +218,7 @@ func TestRateLimitQuota_Allow_WithBlock(t *testing.T) { func TestRateLimitQuota_Update(t *testing.T) { defer goleak.VerifyNone(t) - qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink()) + qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true) require.NoError(t, err) quota := NewRateLimitQuota("quota1", "", "", "", "", false, time.Second, 0, 10) diff --git a/vault/quotas/quotas_test.go 
b/vault/quotas/quotas_test.go index 45afed533c..58380c5922 100644 --- a/vault/quotas/quotas_test.go +++ b/vault/quotas/quotas_test.go @@ -16,7 +16,7 @@ import ( ) func TestQuotas_MountPathOverwrite(t *testing.T) { - qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink()) + qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true) require.NoError(t, err) quota := NewRateLimitQuota("tq", "", "kv1/", "", "", false, time.Second, 0, 10) @@ -43,7 +43,7 @@ func TestQuotas_MountPathOverwrite(t *testing.T) { } func TestQuotas_Precedence(t *testing.T) { - qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink()) + qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true) require.NoError(t, err) setQuotaFunc := func(t *testing.T, name, nsPath, mountPath, pathSuffix, role string, inheritable bool) Quota { @@ -142,7 +142,7 @@ func TestQuotas_QueryResolveRole_RateLimitQuotas(t *testing.T) { leaseWalkFunc := func(context.Context, func(request *Request) bool) error { return nil } - qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink()) + qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink(), true) require.NoError(t, err) rlqReq := &Request{ diff --git a/vault/testing.go b/vault/testing.go index 1989d321f4..c0947ea6d8 100644 --- a/vault/testing.go +++ b/vault/testing.go @@ -142,6 +142,20 @@ func TestCoreWithSeal(t testing.T, testSeal Seal, enableRaw bool) *Core { return TestCoreWithSealAndUI(t, conf) } +func TestCoreWithDeadlockDetection(t testing.T, testSeal Seal, enableRaw bool) *Core { + conf := &CoreConfig{ + Seal: testSeal, + EnableUI: false, + EnableRaw: enableRaw, + BuiltinRegistry: corehelpers.NewMockBuiltinRegistry(), + AuditBackends: map[string]audit.Factory{ + "file": auditFile.Factory, + }, + DetectDeadlocks: 
"expiration,quotas,statelock", + } + return TestCoreWithSealAndUI(t, conf) + } + func TestCoreWithCustomResponseHeaderAndUI(t testing.T, CustomResponseHeaders map[string]map[string]string, enableUI bool) (*Core, [][]byte, string) { confRaw := &server.Config{ SharedConfig: &configutil.SharedConfig{ diff --git a/website/content/docs/configuration/index.mdx b/website/content/docs/configuration/index.mdx index 1ae7f8d89e..cd414e606b 100644 --- a/website/content/docs/configuration/index.mdx +++ b/website/content/docs/configuration/index.mdx @@ -159,10 +159,11 @@ to specify where the configuration is. maximum request duration allowed before Vault cancels the request. This can be overridden per listener via the `max_request_duration` value. -- `detect_deadlocks` `(string: "")` - Specifies the internal mutex locks that should be monitored for -potential deadlocks. Currently supported value is `statelock`, which will cause "POTENTIAL DEADLOCK:" -to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this can have -a negative effect on performance due to the tracking of each lock attempt. +- `detect_deadlocks` `(string: "")` - A comma-separated string that specifies the internal +mutex locks that should be monitored for potential deadlocks. Currently supported values +include `statelock`, `quotas`, and `expiration`, which will cause "POTENTIAL DEADLOCK:" +to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this +can have a negative effect on performance due to the tracking of each lock attempt. - `raw_storage_endpoint` `(bool: false)` – Enables the `sys/raw` endpoint which allows the decryption/encryption of raw data into and out of the security