core: fix bug where deadlock detection was always on for expiration and quotas (#23902)

* server: fix bug where deadlock detection was on for expiration and quotas

* trim spaces

* Add tests

* Use trimspace and lower

* Update test

* changelog

* fix config parsing
Jason O'Donnell
2023-10-30 12:49:46 -04:00
committed by GitHub
parent 26bae55997
commit 66494c8129
9 changed files with 147 additions and 26 deletions

changelog/23902.txt (new file)
View File

@@ -0,0 +1,5 @@
```release-note:bug
core: fix bug where deadlock detection was always on for expiration and quotas.
These can now be configured individually with `detect_deadlocks`.
```
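For instance (illustrative, not part of the change itself): setting `detect_deadlocks = "expiration"` now enables detection only on the expiration manager's pending lock, while the core state lock and the quota manager's locks keep the plain sync mutexes.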

View File

@@ -20,6 +20,7 @@ import (
"os"
"path/filepath"
"runtime"
"slices"
"strconv"
"strings"
"sync"
@@ -690,6 +691,9 @@ type Core struct {
// If any role based quota (LCQ or RLQ) is enabled, don't track lease counts by role
impreciseLeaseRoleTracking bool
// Config value for "detect_deadlocks".
detectDeadlocks []string
}
// c.stateLock needs to be held in read mode before calling this function.
@@ -947,19 +951,28 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
if conf.NumRollbackWorkers == 0 {
conf.NumRollbackWorkers = RollbackDefaultNumWorkers
}
// Use imported logging deadlock if requested
var stateLock locking.RWMutex
if strings.Contains(conf.DetectDeadlocks, "statelock") {
stateLock = &locking.DeadlockRWMutex{}
} else {
stateLock = &locking.SyncRWMutex{}
}
effectiveSDKVersion := conf.EffectiveSDKVersion
if effectiveSDKVersion == "" {
effectiveSDKVersion = version.GetVersion().Version
}
var detectDeadlocks []string
if conf.DetectDeadlocks != "" {
detectDeadlocks = strings.Split(conf.DetectDeadlocks, ",")
for k, v := range detectDeadlocks {
detectDeadlocks[k] = strings.ToLower(strings.TrimSpace(v))
}
}
// Use imported logging deadlock if requested
var stateLock locking.RWMutex
stateLock = &locking.SyncRWMutex{}
if slices.Contains(detectDeadlocks, "statelock") {
stateLock = &locking.DeadlockRWMutex{}
}
// Setup the core
c := &Core{
entCore: entCore{},
@@ -1033,6 +1046,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
rollbackMountPathMetrics: conf.MetricSink.TelemetryConsts.RollbackMetricsIncludeMountPoint,
numRollbackWorkers: conf.NumRollbackWorkers,
impreciseLeaseRoleTracking: conf.ImpreciseLeaseRoleTracking,
detectDeadlocks: detectDeadlocks,
}
c.standbyStopCh.Store(make(chan struct{}))
@@ -1219,7 +1233,9 @@ func NewCore(conf *CoreConfig) (*Core, error) {
// Quotas
quotasLogger := conf.Logger.Named("quotas")
c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink)
detectDeadlocks := slices.Contains(c.detectDeadlocks, "quotas")
c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink, detectDeadlocks)
if err != nil {
return nil, err
}
@@ -4188,3 +4204,10 @@ func (c *Core) GetRaftAutopilotState(ctx context.Context) (*raft.AutopilotState,
func (c *Core) Events() *eventbus.EventBus {
return c.events
}
func (c *Core) DetectStateLockDeadlocks() bool {
if _, ok := c.stateLock.(*locking.DeadlockRWMutex); ok {
return true
}
return false
}
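For reference, a minimal standalone sketch of the normalization the hunk above applies to `detect_deadlocks` (assuming Go 1.21+ for the `slices` package; the `parseDetectDeadlocks` helper is illustrative, not part of the change):

```go
package main

import (
	"fmt"
	"slices"
	"strings"
)

// parseDetectDeadlocks mirrors the parsing added to CreateCore: split the
// comma separated config value, then trim whitespace and lowercase each entry.
func parseDetectDeadlocks(raw string) []string {
	if raw == "" {
		return nil
	}
	parts := strings.Split(raw, ",")
	for i, p := range parts {
		parts[i] = strings.ToLower(strings.TrimSpace(p))
	}
	return parts
}

func main() {
	// Mixed case and stray spaces are tolerated after the TrimSpace/ToLower fix.
	detect := parseDetectDeadlocks(" StateLock, Quotas ")
	fmt.Println(detect)                                // [statelock quotas]
	fmt.Println(slices.Contains(detect, "statelock"))  // true
	fmt.Println(slices.Contains(detect, "expiration")) // false
}
```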

View File

@@ -3361,3 +3361,51 @@ func InduceDeadlock(t *testing.T, vaultcore *Core, expected uint32) {
t.Fatalf("expected 1 deadlock, detected %d", deadlocks)
}
}
func TestExpiration_DeadlockDetection(t *testing.T) {
testCore := TestCore(t)
testCoreUnsealed(t, testCore)
if testCore.expiration.DetectDeadlocks() {
t.Fatal("expiration has deadlock detection enabled, it shouldn't")
}
testCore = TestCoreWithDeadlockDetection(t, nil, false)
testCoreUnsealed(t, testCore)
if !testCore.expiration.DetectDeadlocks() {
t.Fatal("expiration doesn't have deadlock detection enabled, it should")
}
}
func TestQuotas_DeadlockDetection(t *testing.T) {
testCore := TestCore(t)
testCoreUnsealed(t, testCore)
if testCore.quotaManager.DetectDeadlocks() {
t.Fatal("quotas has deadlock detection enabled, it shouldn't")
}
testCore = TestCoreWithDeadlockDetection(t, nil, false)
testCoreUnsealed(t, testCore)
if !testCore.quotaManager.DetectDeadlocks() {
t.Fatal("quotas doesn't have deadlock detection enabled, it should")
}
}
func TestStatelock_DeadlockDetection(t *testing.T) {
testCore := TestCore(t)
testCoreUnsealed(t, testCore)
if testCore.DetectStateLockDeadlocks() {
t.Fatal("statelock has deadlock detection enabled, it shouldn't")
}
testCore = TestCoreWithDeadlockDetection(t, nil, false)
testCoreUnsealed(t, testCore)
if !testCore.DetectStateLockDeadlocks() {
t.Fatal("statelock doesn't have deadlock detection enabled, it should")
}
}
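All three tests live in the `vault` package, so something like `go test ./vault -run DeadlockDetection` should exercise them together (command shown as a convenience; adjust the package path to your checkout).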

View File

@@ -11,6 +11,7 @@ import (
"math/rand"
"os"
"path"
"slices"
"sort"
"strconv"
"strings"
@@ -114,7 +115,7 @@ type ExpirationManager struct {
pending sync.Map
nonexpiring sync.Map
leaseCount int
pendingLock locking.DeadlockRWMutex
pendingLock locking.RWMutex
// A sync.Lock for every active leaseID
lockPerLease sync.Map
@@ -327,7 +328,7 @@ func getNumExpirationWorkers(c *Core, l log.Logger) int {
// NewExpirationManager creates a new ExpirationManager that is backed
// using a given view, and uses the provided router for revocation.
func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger) *ExpirationManager {
func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger, detectDeadlocks bool) *ExpirationManager {
managerLogger := logger.Named("job-manager")
jobManager := fairshare.NewJobManager("expire", getNumExpirationWorkers(c, logger), managerLogger, c.metricSink)
jobManager.Start()
@@ -340,6 +341,7 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
tokenStore: c.tokenStore,
logger: logger,
pending: sync.Map{},
pendingLock: &locking.SyncRWMutex{},
nonexpiring: sync.Map{},
leaseCount: 0,
tidyLock: new(int32),
@@ -375,6 +377,11 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
exp.logger = log.New(&opts)
}
if detectDeadlocks {
managerLogger.Debug("enabling deadlock detection")
exp.pendingLock = &locking.DeadlockRWMutex{}
}
go exp.uniquePoliciesGc()
return exp
@@ -390,7 +397,9 @@ func (c *Core) setupExpiration(e ExpireLeaseStrategy) error {
// Create the manager
expLogger := c.baseLogger.Named("expiration")
mgr := NewExpirationManager(c, view, e, expLogger)
detectDeadlocks := slices.Contains(c.detectDeadlocks, "expiration")
mgr := NewExpirationManager(c, view, e, expLogger, detectDeadlocks)
c.expiration = mgr
// Link the token store to this
@@ -2821,3 +2830,10 @@ func decodeLeaseEntry(buf []byte) (*leaseEntry, error) {
out := new(leaseEntry)
return out, jsonutil.DecodeJSON(buf, out)
}
func (e *ExpirationManager) DetectDeadlocks() bool {
if _, ok := e.pendingLock.(*locking.DeadlockRWMutex); ok {
return true
}
return false
}
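The same pattern recurs in the quotas manager below: the lock field is typed as the `locking.RWMutex` interface, a plain sync mutex is the default, the deadlock-detecting variant is swapped in on request, and the accessor reports which implementation is active via a type assertion. A self-contained sketch of that pattern, using stand-in types rather than the real `sdk/helper/locking` package:

```go
package main

import (
	"fmt"
	"sync"
)

// rwMutex stands in for the locking.RWMutex interface.
type rwMutex interface {
	Lock()
	Unlock()
	RLock()
	RUnlock()
}

// syncRWMutex is the default, zero-overhead implementation.
type syncRWMutex struct{ sync.RWMutex }

// deadlockRWMutex stands in for a wrapper that tracks lock ordering and
// logs "POTENTIAL DEADLOCK" when an acquisition appears stuck.
type deadlockRWMutex struct{ sync.RWMutex }

type manager struct {
	pendingLock rwMutex
}

func newManager(detectDeadlocks bool) *manager {
	m := &manager{pendingLock: &syncRWMutex{}}
	if detectDeadlocks {
		m.pendingLock = &deadlockRWMutex{}
	}
	return m
}

// DetectDeadlocks mirrors the accessor added above: a type assertion reveals
// whether the deadlock-detecting mutex is in use.
func (m *manager) DetectDeadlocks() bool {
	_, ok := m.pendingLock.(*deadlockRWMutex)
	return ok
}

func main() {
	fmt.Println(newManager(false).DetectDeadlocks()) // false
	fmt.Println(newManager(true).DetectDeadlocks())  // true
}
```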

View File

@@ -170,13 +170,13 @@ type Manager struct {
metricSink *metricsutil.ClusterMetricSink
// quotaLock is a lock for manipulating quotas and anything not covered by a more specific lock
quotaLock *locking.DeadlockRWMutex
quotaLock locking.RWMutex
// quotaConfigLock is a lock for accessing config items, such as RateLimitExemptPaths
quotaConfigLock *locking.DeadlockRWMutex
quotaConfigLock locking.RWMutex
// dbAndCacheLock is a lock for db and path caches that need to be reset during Reset()
dbAndCacheLock *locking.DeadlockRWMutex
dbAndCacheLock locking.RWMutex
}
// QuotaLeaseInformation contains all of the information lease-count quotas require
@@ -275,7 +275,7 @@ type Request struct {
// NewManager creates and initializes a new quota manager to hold all the quota
// rules and to process incoming requests.
func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink) (*Manager, error) {
func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink, detectDeadlocks bool) (*Manager, error) {
db, err := memdb.NewMemDB(dbSchema())
if err != nil {
return nil, err
@@ -287,9 +287,16 @@ func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.Clust
metricSink: ms,
rateLimitPathManager: pathmanager.New(),
config: new(Config),
quotaLock: new(locking.DeadlockRWMutex),
quotaConfigLock: new(locking.DeadlockRWMutex),
dbAndCacheLock: new(locking.DeadlockRWMutex),
quotaLock: &locking.SyncRWMutex{},
quotaConfigLock: &locking.SyncRWMutex{},
dbAndCacheLock: &locking.SyncRWMutex{},
}
if detectDeadlocks {
logger.Debug("enabling deadlock detection")
manager.quotaLock = &locking.DeadlockRWMutex{}
manager.quotaConfigLock = &locking.DeadlockRWMutex{}
manager.dbAndCacheLock = &locking.DeadlockRWMutex{}
}
manager.init(walkFunc)
@@ -1319,3 +1326,10 @@ func (m *Manager) HandleBackendDisabling(ctx context.Context, nsPath, mountPath
return nil
}
func (m *Manager) DetectDeadlocks() bool {
if _, ok := m.quotaLock.(*locking.DeadlockRWMutex); ok {
return true
}
return false
}

View File

@@ -218,7 +218,7 @@ func TestRateLimitQuota_Allow_WithBlock(t *testing.T) {
func TestRateLimitQuota_Update(t *testing.T) {
defer goleak.VerifyNone(t)
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
require.NoError(t, err)
quota := NewRateLimitQuota("quota1", "", "", "", "", false, time.Second, 0, 10)

View File

@@ -16,7 +16,7 @@ import (
)
func TestQuotas_MountPathOverwrite(t *testing.T) {
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
require.NoError(t, err)
quota := NewRateLimitQuota("tq", "", "kv1/", "", "", false, time.Second, 0, 10)
@@ -43,7 +43,7 @@ func TestQuotas_MountPathOverwrite(t *testing.T) {
}
func TestQuotas_Precedence(t *testing.T) {
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
require.NoError(t, err)
setQuotaFunc := func(t *testing.T, name, nsPath, mountPath, pathSuffix, role string, inheritable bool) Quota {
@@ -142,7 +142,7 @@ func TestQuotas_QueryResolveRole_RateLimitQuotas(t *testing.T) {
leaseWalkFunc := func(context.Context, func(request *Request) bool) error {
return nil
}
qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink())
qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink(), true)
require.NoError(t, err)
rlqReq := &Request{

View File

@@ -142,6 +142,20 @@ func TestCoreWithSeal(t testing.T, testSeal Seal, enableRaw bool) *Core {
return TestCoreWithSealAndUI(t, conf)
}
func TestCoreWithDeadlockDetection(t testing.T, testSeal Seal, enableRaw bool) *Core {
conf := &CoreConfig{
Seal: testSeal,
EnableUI: false,
EnableRaw: enableRaw,
BuiltinRegistry: corehelpers.NewMockBuiltinRegistry(),
AuditBackends: map[string]audit.Factory{
"file": auditFile.Factory,
},
DetectDeadlocks: "expiration,quotas,statelock",
}
return TestCoreWithSealAndUI(t, conf)
}
func TestCoreWithCustomResponseHeaderAndUI(t testing.T, CustomResponseHeaders map[string]map[string]string, enableUI bool) (*Core, [][]byte, string) {
confRaw := &server.Config{
SharedConfig: &configutil.SharedConfig{

View File

@@ -159,10 +159,11 @@ to specify where the configuration is.
maximum request duration allowed before Vault cancels the request. This can
be overridden per listener via the `max_request_duration` value.
- `detect_deadlocks` `(string: "")` - Specifies the internal mutex locks that should be monitored for
potential deadlocks. Currently supported value is `statelock`, which will cause "POTENTIAL DEADLOCK:"
to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this can have
a negative effect on performance due to the tracking of each lock attempt.
- `detect_deadlocks` `(string: "")` - A comma separated string that specifies the internal
mutex locks that should be monitored for potential deadlocks. Currently supported values
include `statelock`, `quotas`, and `expiration`, which will cause "POTENTIAL DEADLOCK:"
to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this
can have a negative effect on performance due to the tracking of each lock attempt.
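
As an example (not part of the docs text itself): a server configuration containing `detect_deadlocks = "statelock,quotas,expiration"` enables monitoring for all three subsystems; per the parsing added above, entries are comma separated and surrounding whitespace and letter case are ignored.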
- `raw_storage_endpoint` `(bool: false)` Enables the `sys/raw` endpoint which
allows the decryption/encryption of raw data into and out of the security