core: fix bug where deadlock detection was always on for expiration and quotas (#23902)

* server: fix bug where deadlock detection was on for expiration and quotas

* trim spaces

* Add tests

* Use trimspace and lower

* Update test

* changelog

* fix config parsing
This commit is contained in:
Jason O'Donnell
2023-10-30 12:49:46 -04:00
committed by GitHub
parent 26bae55997
commit 66494c8129
9 changed files with 147 additions and 26 deletions

5
changelog/23902.txt Normal file
View File

@@ -0,0 +1,5 @@
```release-note:bug
core: fix bug where deadlock detection was always on for expiration and quotas.
These can now be configured individually with `detect_deadlocks`.
```

View File

@@ -20,6 +20,7 @@ import (
"os" "os"
"path/filepath" "path/filepath"
"runtime" "runtime"
"slices"
"strconv" "strconv"
"strings" "strings"
"sync" "sync"
@@ -690,6 +691,9 @@ type Core struct {
// If any role based quota (LCQ or RLQ) is enabled, don't track lease counts by role // If any role based quota (LCQ or RLQ) is enabled, don't track lease counts by role
impreciseLeaseRoleTracking bool impreciseLeaseRoleTracking bool
// Config value for "detect_deadlocks".
detectDeadlocks []string
} }
// c.stateLock needs to be held in read mode before calling this function. // c.stateLock needs to be held in read mode before calling this function.
@@ -947,19 +951,28 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
if conf.NumRollbackWorkers == 0 { if conf.NumRollbackWorkers == 0 {
conf.NumRollbackWorkers = RollbackDefaultNumWorkers conf.NumRollbackWorkers = RollbackDefaultNumWorkers
} }
// Use imported logging deadlock if requested
var stateLock locking.RWMutex
if strings.Contains(conf.DetectDeadlocks, "statelock") {
stateLock = &locking.DeadlockRWMutex{}
} else {
stateLock = &locking.SyncRWMutex{}
}
effectiveSDKVersion := conf.EffectiveSDKVersion effectiveSDKVersion := conf.EffectiveSDKVersion
if effectiveSDKVersion == "" { if effectiveSDKVersion == "" {
effectiveSDKVersion = version.GetVersion().Version effectiveSDKVersion = version.GetVersion().Version
} }
var detectDeadlocks []string
if conf.DetectDeadlocks != "" {
detectDeadlocks = strings.Split(conf.DetectDeadlocks, ",")
for k, v := range detectDeadlocks {
detectDeadlocks[k] = strings.ToLower(strings.TrimSpace(v))
}
}
// Use imported logging deadlock if requested
var stateLock locking.RWMutex
stateLock = &locking.SyncRWMutex{}
if slices.Contains(detectDeadlocks, "statelock") {
stateLock = &locking.DeadlockRWMutex{}
}
// Setup the core // Setup the core
c := &Core{ c := &Core{
entCore: entCore{}, entCore: entCore{},
@@ -1033,6 +1046,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
rollbackMountPathMetrics: conf.MetricSink.TelemetryConsts.RollbackMetricsIncludeMountPoint, rollbackMountPathMetrics: conf.MetricSink.TelemetryConsts.RollbackMetricsIncludeMountPoint,
numRollbackWorkers: conf.NumRollbackWorkers, numRollbackWorkers: conf.NumRollbackWorkers,
impreciseLeaseRoleTracking: conf.ImpreciseLeaseRoleTracking, impreciseLeaseRoleTracking: conf.ImpreciseLeaseRoleTracking,
detectDeadlocks: detectDeadlocks,
} }
c.standbyStopCh.Store(make(chan struct{})) c.standbyStopCh.Store(make(chan struct{}))
@@ -1219,7 +1233,9 @@ func NewCore(conf *CoreConfig) (*Core, error) {
// Quotas // Quotas
quotasLogger := conf.Logger.Named("quotas") quotasLogger := conf.Logger.Named("quotas")
c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink)
detectDeadlocks := slices.Contains(c.detectDeadlocks, "quotas")
c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink, detectDeadlocks)
if err != nil { if err != nil {
return nil, err return nil, err
} }
@@ -4188,3 +4204,10 @@ func (c *Core) GetRaftAutopilotState(ctx context.Context) (*raft.AutopilotState,
func (c *Core) Events() *eventbus.EventBus { func (c *Core) Events() *eventbus.EventBus {
return c.events return c.events
} }
// DetectStateLockDeadlocks reports whether the core state lock was
// configured with deadlock detection, i.e. whether "statelock" was
// present in the detect_deadlocks config and c.stateLock is therefore
// a *locking.DeadlockRWMutex rather than a plain sync-based mutex.
func (c *Core) DetectStateLockDeadlocks() bool {
	// Comma-ok type assertion collapses the verbose if/return-true/return-false form.
	_, ok := c.stateLock.(*locking.DeadlockRWMutex)
	return ok
}

View File

@@ -3361,3 +3361,51 @@ func InduceDeadlock(t *testing.T, vaultcore *Core, expected uint32) {
t.Fatalf("expected 1 deadlock, detected %d", deadlocks) t.Fatalf("expected 1 deadlock, detected %d", deadlocks)
} }
} }
// TestExpiration_DeadlockDetection verifies that the expiration manager's
// pendingLock deadlock detection is off for a default core and on for a
// core configured with detect_deadlocks="expiration,quotas,statelock".
func TestExpiration_DeadlockDetection(t *testing.T) {
	core := TestCore(t)
	testCoreUnsealed(t, core)
	if enabled := core.expiration.DetectDeadlocks(); enabled {
		t.Fatal("expiration has deadlock detection enabled, it shouldn't")
	}

	core = TestCoreWithDeadlockDetection(t, nil, false)
	testCoreUnsealed(t, core)
	if enabled := core.expiration.DetectDeadlocks(); !enabled {
		t.Fatal("expiration doesn't have deadlock detection enabled, it should")
	}
}
// TestQuotas_DeadlockDetection verifies that the quota manager's lock
// deadlock detection is off for a default core and on for a core
// configured with detect_deadlocks="expiration,quotas,statelock".
func TestQuotas_DeadlockDetection(t *testing.T) {
	core := TestCore(t)
	testCoreUnsealed(t, core)
	if enabled := core.quotaManager.DetectDeadlocks(); enabled {
		t.Fatal("quotas has deadlock detection enabled, it shouldn't")
	}

	core = TestCoreWithDeadlockDetection(t, nil, false)
	testCoreUnsealed(t, core)
	if enabled := core.quotaManager.DetectDeadlocks(); !enabled {
		t.Fatal("quotas doesn't have deadlock detection enabled, it should")
	}
}
// TestStatelock_DeadlockDetection verifies that the core state lock's
// deadlock detection is off for a default core and on for a core
// configured with detect_deadlocks="expiration,quotas,statelock".
func TestStatelock_DeadlockDetection(t *testing.T) {
	core := TestCore(t)
	testCoreUnsealed(t, core)
	if enabled := core.DetectStateLockDeadlocks(); enabled {
		t.Fatal("statelock has deadlock detection enabled, it shouldn't")
	}

	core = TestCoreWithDeadlockDetection(t, nil, false)
	testCoreUnsealed(t, core)
	if enabled := core.DetectStateLockDeadlocks(); !enabled {
		t.Fatal("statelock doesn't have deadlock detection enabled, it should")
	}
}

View File

@@ -11,6 +11,7 @@ import (
"math/rand" "math/rand"
"os" "os"
"path" "path"
"slices"
"sort" "sort"
"strconv" "strconv"
"strings" "strings"
@@ -114,7 +115,7 @@ type ExpirationManager struct {
pending sync.Map pending sync.Map
nonexpiring sync.Map nonexpiring sync.Map
leaseCount int leaseCount int
pendingLock locking.DeadlockRWMutex pendingLock locking.RWMutex
// A sync.Lock for every active leaseID // A sync.Lock for every active leaseID
lockPerLease sync.Map lockPerLease sync.Map
@@ -327,7 +328,7 @@ func getNumExpirationWorkers(c *Core, l log.Logger) int {
// NewExpirationManager creates a new ExpirationManager that is backed // NewExpirationManager creates a new ExpirationManager that is backed
// using a given view, and uses the provided router for revocation. // using a given view, and uses the provided router for revocation.
func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger) *ExpirationManager { func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger, detectDeadlocks bool) *ExpirationManager {
managerLogger := logger.Named("job-manager") managerLogger := logger.Named("job-manager")
jobManager := fairshare.NewJobManager("expire", getNumExpirationWorkers(c, logger), managerLogger, c.metricSink) jobManager := fairshare.NewJobManager("expire", getNumExpirationWorkers(c, logger), managerLogger, c.metricSink)
jobManager.Start() jobManager.Start()
@@ -340,6 +341,7 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
tokenStore: c.tokenStore, tokenStore: c.tokenStore,
logger: logger, logger: logger,
pending: sync.Map{}, pending: sync.Map{},
pendingLock: &locking.SyncRWMutex{},
nonexpiring: sync.Map{}, nonexpiring: sync.Map{},
leaseCount: 0, leaseCount: 0,
tidyLock: new(int32), tidyLock: new(int32),
@@ -375,6 +377,11 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
exp.logger = log.New(&opts) exp.logger = log.New(&opts)
} }
if detectDeadlocks {
managerLogger.Debug("enabling deadlock detection")
exp.pendingLock = &locking.DeadlockRWMutex{}
}
go exp.uniquePoliciesGc() go exp.uniquePoliciesGc()
return exp return exp
@@ -390,7 +397,9 @@ func (c *Core) setupExpiration(e ExpireLeaseStrategy) error {
// Create the manager // Create the manager
expLogger := c.baseLogger.Named("expiration") expLogger := c.baseLogger.Named("expiration")
mgr := NewExpirationManager(c, view, e, expLogger)
detectDeadlocks := slices.Contains(c.detectDeadlocks, "expiration")
mgr := NewExpirationManager(c, view, e, expLogger, detectDeadlocks)
c.expiration = mgr c.expiration = mgr
// Link the token store to this // Link the token store to this
@@ -2821,3 +2830,10 @@ func decodeLeaseEntry(buf []byte) (*leaseEntry, error) {
out := new(leaseEntry) out := new(leaseEntry)
return out, jsonutil.DecodeJSON(buf, out) return out, jsonutil.DecodeJSON(buf, out)
} }
// DetectDeadlocks reports whether this expiration manager was built with
// deadlock detection enabled, i.e. whether pendingLock is a
// *locking.DeadlockRWMutex (set when "expiration" appears in the
// detect_deadlocks config) rather than the default sync-based mutex.
func (e *ExpirationManager) DetectDeadlocks() bool {
	// Comma-ok type assertion collapses the verbose if/return-true/return-false form.
	_, ok := e.pendingLock.(*locking.DeadlockRWMutex)
	return ok
}

View File

@@ -170,13 +170,13 @@ type Manager struct {
metricSink *metricsutil.ClusterMetricSink metricSink *metricsutil.ClusterMetricSink
// quotaLock is a lock for manipulating quotas and anything not covered by a more specific lock // quotaLock is a lock for manipulating quotas and anything not covered by a more specific lock
quotaLock *locking.DeadlockRWMutex quotaLock locking.RWMutex
// quotaConfigLock is a lock for accessing config items, such as RateLimitExemptPaths // quotaConfigLock is a lock for accessing config items, such as RateLimitExemptPaths
quotaConfigLock *locking.DeadlockRWMutex quotaConfigLock locking.RWMutex
// dbAndCacheLock is a lock for db and path caches that need to be reset during Reset() // dbAndCacheLock is a lock for db and path caches that need to be reset during Reset()
dbAndCacheLock *locking.DeadlockRWMutex dbAndCacheLock locking.RWMutex
} }
// QuotaLeaseInformation contains all of the information lease-count quotas require // QuotaLeaseInformation contains all of the information lease-count quotas require
@@ -275,7 +275,7 @@ type Request struct {
// NewManager creates and initializes a new quota manager to hold all the quota // NewManager creates and initializes a new quota manager to hold all the quota
// rules and to process incoming requests. // rules and to process incoming requests.
func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink) (*Manager, error) { func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink, detectDeadlocks bool) (*Manager, error) {
db, err := memdb.NewMemDB(dbSchema()) db, err := memdb.NewMemDB(dbSchema())
if err != nil { if err != nil {
return nil, err return nil, err
@@ -287,9 +287,16 @@ func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.Clust
metricSink: ms, metricSink: ms,
rateLimitPathManager: pathmanager.New(), rateLimitPathManager: pathmanager.New(),
config: new(Config), config: new(Config),
quotaLock: new(locking.DeadlockRWMutex), quotaLock: &locking.SyncRWMutex{},
quotaConfigLock: new(locking.DeadlockRWMutex), quotaConfigLock: &locking.SyncRWMutex{},
dbAndCacheLock: new(locking.DeadlockRWMutex), dbAndCacheLock: &locking.SyncRWMutex{},
}
if detectDeadlocks {
logger.Debug("enabling deadlock detection")
manager.quotaLock = &locking.DeadlockRWMutex{}
manager.quotaConfigLock = &locking.DeadlockRWMutex{}
manager.dbAndCacheLock = &locking.DeadlockRWMutex{}
} }
manager.init(walkFunc) manager.init(walkFunc)
@@ -1319,3 +1326,10 @@ func (m *Manager) HandleBackendDisabling(ctx context.Context, nsPath, mountPath
return nil return nil
} }
// DetectDeadlocks reports whether this quota manager was built with
// deadlock detection enabled. Checking quotaLock suffices: NewManager
// switches quotaLock, quotaConfigLock, and dbAndCacheLock to
// *locking.DeadlockRWMutex together, so they are always consistent.
func (m *Manager) DetectDeadlocks() bool {
	// Comma-ok type assertion collapses the verbose if/return-true/return-false form.
	_, ok := m.quotaLock.(*locking.DeadlockRWMutex)
	return ok
}

View File

@@ -218,7 +218,7 @@ func TestRateLimitQuota_Allow_WithBlock(t *testing.T) {
func TestRateLimitQuota_Update(t *testing.T) { func TestRateLimitQuota_Update(t *testing.T) {
defer goleak.VerifyNone(t) defer goleak.VerifyNone(t)
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink()) qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
require.NoError(t, err) require.NoError(t, err)
quota := NewRateLimitQuota("quota1", "", "", "", "", false, time.Second, 0, 10) quota := NewRateLimitQuota("quota1", "", "", "", "", false, time.Second, 0, 10)

View File

@@ -16,7 +16,7 @@ import (
) )
func TestQuotas_MountPathOverwrite(t *testing.T) { func TestQuotas_MountPathOverwrite(t *testing.T) {
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink()) qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
require.NoError(t, err) require.NoError(t, err)
quota := NewRateLimitQuota("tq", "", "kv1/", "", "", false, time.Second, 0, 10) quota := NewRateLimitQuota("tq", "", "kv1/", "", "", false, time.Second, 0, 10)
@@ -43,7 +43,7 @@ func TestQuotas_MountPathOverwrite(t *testing.T) {
} }
func TestQuotas_Precedence(t *testing.T) { func TestQuotas_Precedence(t *testing.T) {
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink()) qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
require.NoError(t, err) require.NoError(t, err)
setQuotaFunc := func(t *testing.T, name, nsPath, mountPath, pathSuffix, role string, inheritable bool) Quota { setQuotaFunc := func(t *testing.T, name, nsPath, mountPath, pathSuffix, role string, inheritable bool) Quota {
@@ -142,7 +142,7 @@ func TestQuotas_QueryResolveRole_RateLimitQuotas(t *testing.T) {
leaseWalkFunc := func(context.Context, func(request *Request) bool) error { leaseWalkFunc := func(context.Context, func(request *Request) bool) error {
return nil return nil
} }
qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink()) qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink(), true)
require.NoError(t, err) require.NoError(t, err)
rlqReq := &Request{ rlqReq := &Request{

View File

@@ -142,6 +142,20 @@ func TestCoreWithSeal(t testing.T, testSeal Seal, enableRaw bool) *Core {
return TestCoreWithSealAndUI(t, conf) return TestCoreWithSealAndUI(t, conf)
} }
// TestCoreWithDeadlockDetection returns a test core with
// detect_deadlocks set to "expiration,quotas,statelock", so the state
// lock, expiration manager, and quota manager all use deadlock-detecting
// mutexes. The UI is disabled; testSeal and enableRaw are passed through
// to the core config.
func TestCoreWithDeadlockDetection(t testing.T, testSeal Seal, enableRaw bool) *Core {
	conf := &CoreConfig{
		Seal:            testSeal,
		EnableUI:        false,
		EnableRaw:       enableRaw,
		BuiltinRegistry: corehelpers.NewMockBuiltinRegistry(),
		AuditBackends: map[string]audit.Factory{
			"file": auditFile.Factory,
		},
		// Enable every supported detect_deadlocks value at once.
		DetectDeadlocks: "expiration,quotas,statelock",
	}
	return TestCoreWithSealAndUI(t, conf)
}
func TestCoreWithCustomResponseHeaderAndUI(t testing.T, CustomResponseHeaders map[string]map[string]string, enableUI bool) (*Core, [][]byte, string) { func TestCoreWithCustomResponseHeaderAndUI(t testing.T, CustomResponseHeaders map[string]map[string]string, enableUI bool) (*Core, [][]byte, string) {
confRaw := &server.Config{ confRaw := &server.Config{
SharedConfig: &configutil.SharedConfig{ SharedConfig: &configutil.SharedConfig{

View File

@@ -159,10 +159,11 @@ to specify where the configuration is.
maximum request duration allowed before Vault cancels the request. This can maximum request duration allowed before Vault cancels the request. This can
be overridden per listener via the `max_request_duration` value. be overridden per listener via the `max_request_duration` value.
- `detect_deadlocks` `(string: "")` - Specifies the internal mutex locks that should be monitored for - `detect_deadlocks` `(string: "")` - A comma separated string that specifies the internal
potential deadlocks. Currently supported value is `statelock`, which will cause "POTENTIAL DEADLOCK:" mutex locks that should be monitored for potential deadlocks. Currently supported values
to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this can have include `statelock`, `quotas` and `expiration` which will cause "POTENTIAL DEADLOCK:"
a negative effect on performance due to the tracking of each lock attempt. to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this
can have a negative effect on performance due to the tracking of each lock attempt.
- `raw_storage_endpoint` `(bool: false)` Enables the `sys/raw` endpoint which - `raw_storage_endpoint` `(bool: false)` Enables the `sys/raw` endpoint which
allows the decryption/encryption of raw data into and out of the security allows the decryption/encryption of raw data into and out of the security