mirror of
https://github.com/optim-enterprises-bv/vault.git
synced 2025-10-29 17:52:32 +00:00
core: fix bug where deadlock detection was always on for expiration and quotas (#23902)
* server: fix bug where deadlock detection was on for expiration and quotas * trim spaces * Add tests * Use trimspace and lower * Update test * changelog * fix config parsing
This commit is contained in:
5
changelog/23902.txt
Normal file
5
changelog/23902.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
```release-note:bug
|
||||
core: fix bug where deadlock detection was always on for expiration and quotas.
|
||||
These can now be configured individually with `detect_deadlocks`.
|
||||
```
|
||||
|
||||
@@ -20,6 +20,7 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
@@ -690,6 +691,9 @@ type Core struct {
|
||||
|
||||
// If any role based quota (LCQ or RLQ) is enabled, don't track lease counts by role
|
||||
impreciseLeaseRoleTracking bool
|
||||
|
||||
// Config value for "detect_deadlocks".
|
||||
detectDeadlocks []string
|
||||
}
|
||||
|
||||
// c.stateLock needs to be held in read mode before calling this function.
|
||||
@@ -947,19 +951,28 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
|
||||
if conf.NumRollbackWorkers == 0 {
|
||||
conf.NumRollbackWorkers = RollbackDefaultNumWorkers
|
||||
}
|
||||
// Use imported logging deadlock if requested
|
||||
var stateLock locking.RWMutex
|
||||
if strings.Contains(conf.DetectDeadlocks, "statelock") {
|
||||
stateLock = &locking.DeadlockRWMutex{}
|
||||
} else {
|
||||
stateLock = &locking.SyncRWMutex{}
|
||||
}
|
||||
|
||||
effectiveSDKVersion := conf.EffectiveSDKVersion
|
||||
if effectiveSDKVersion == "" {
|
||||
effectiveSDKVersion = version.GetVersion().Version
|
||||
}
|
||||
|
||||
var detectDeadlocks []string
|
||||
if conf.DetectDeadlocks != "" {
|
||||
detectDeadlocks = strings.Split(conf.DetectDeadlocks, ",")
|
||||
for k, v := range detectDeadlocks {
|
||||
detectDeadlocks[k] = strings.ToLower(strings.TrimSpace(v))
|
||||
}
|
||||
}
|
||||
|
||||
// Use imported logging deadlock if requested
|
||||
var stateLock locking.RWMutex
|
||||
stateLock = &locking.SyncRWMutex{}
|
||||
|
||||
if slices.Contains(detectDeadlocks, "statelock") {
|
||||
stateLock = &locking.DeadlockRWMutex{}
|
||||
}
|
||||
|
||||
// Setup the core
|
||||
c := &Core{
|
||||
entCore: entCore{},
|
||||
@@ -1033,6 +1046,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
|
||||
rollbackMountPathMetrics: conf.MetricSink.TelemetryConsts.RollbackMetricsIncludeMountPoint,
|
||||
numRollbackWorkers: conf.NumRollbackWorkers,
|
||||
impreciseLeaseRoleTracking: conf.ImpreciseLeaseRoleTracking,
|
||||
detectDeadlocks: detectDeadlocks,
|
||||
}
|
||||
|
||||
c.standbyStopCh.Store(make(chan struct{}))
|
||||
@@ -1219,7 +1233,9 @@ func NewCore(conf *CoreConfig) (*Core, error) {
|
||||
|
||||
// Quotas
|
||||
quotasLogger := conf.Logger.Named("quotas")
|
||||
c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink)
|
||||
|
||||
detectDeadlocks := slices.Contains(c.detectDeadlocks, "quotas")
|
||||
c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink, detectDeadlocks)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -4188,3 +4204,10 @@ func (c *Core) GetRaftAutopilotState(ctx context.Context) (*raft.AutopilotState,
|
||||
func (c *Core) Events() *eventbus.EventBus {
|
||||
return c.events
|
||||
}
|
||||
|
||||
func (c *Core) DetectStateLockDeadlocks() bool {
|
||||
if _, ok := c.stateLock.(*locking.DeadlockRWMutex); ok {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -3361,3 +3361,51 @@ func InduceDeadlock(t *testing.T, vaultcore *Core, expected uint32) {
|
||||
t.Fatalf("expected 1 deadlock, detected %d", deadlocks)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExpiration_DeadlockDetection(t *testing.T) {
|
||||
testCore := TestCore(t)
|
||||
testCoreUnsealed(t, testCore)
|
||||
|
||||
if testCore.expiration.DetectDeadlocks() {
|
||||
t.Fatal("expiration has deadlock detection enabled, it shouldn't")
|
||||
}
|
||||
|
||||
testCore = TestCoreWithDeadlockDetection(t, nil, false)
|
||||
testCoreUnsealed(t, testCore)
|
||||
|
||||
if !testCore.expiration.DetectDeadlocks() {
|
||||
t.Fatal("expiration doesn't have deadlock detection enabled, it should")
|
||||
}
|
||||
}
|
||||
|
||||
func TestQuotas_DeadlockDetection(t *testing.T) {
|
||||
testCore := TestCore(t)
|
||||
testCoreUnsealed(t, testCore)
|
||||
|
||||
if testCore.quotaManager.DetectDeadlocks() {
|
||||
t.Fatal("quotas has deadlock detection enabled, it shouldn't")
|
||||
}
|
||||
|
||||
testCore = TestCoreWithDeadlockDetection(t, nil, false)
|
||||
testCoreUnsealed(t, testCore)
|
||||
|
||||
if !testCore.quotaManager.DetectDeadlocks() {
|
||||
t.Fatal("quotas doesn't have deadlock detection enabled, it should")
|
||||
}
|
||||
}
|
||||
|
||||
func TestStatelock_DeadlockDetection(t *testing.T) {
|
||||
testCore := TestCore(t)
|
||||
testCoreUnsealed(t, testCore)
|
||||
|
||||
if testCore.DetectStateLockDeadlocks() {
|
||||
t.Fatal("statelock has deadlock detection enabled, it shouldn't")
|
||||
}
|
||||
|
||||
testCore = TestCoreWithDeadlockDetection(t, nil, false)
|
||||
testCoreUnsealed(t, testCore)
|
||||
|
||||
if !testCore.DetectStateLockDeadlocks() {
|
||||
t.Fatal("statelock doesn't have deadlock detection enabled, it should")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,6 +11,7 @@ import (
|
||||
"math/rand"
|
||||
"os"
|
||||
"path"
|
||||
"slices"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
@@ -114,7 +115,7 @@ type ExpirationManager struct {
|
||||
pending sync.Map
|
||||
nonexpiring sync.Map
|
||||
leaseCount int
|
||||
pendingLock locking.DeadlockRWMutex
|
||||
pendingLock locking.RWMutex
|
||||
|
||||
// A sync.Lock for every active leaseID
|
||||
lockPerLease sync.Map
|
||||
@@ -327,7 +328,7 @@ func getNumExpirationWorkers(c *Core, l log.Logger) int {
|
||||
|
||||
// NewExpirationManager creates a new ExpirationManager that is backed
|
||||
// using a given view, and uses the provided router for revocation.
|
||||
func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger) *ExpirationManager {
|
||||
func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger, detectDeadlocks bool) *ExpirationManager {
|
||||
managerLogger := logger.Named("job-manager")
|
||||
jobManager := fairshare.NewJobManager("expire", getNumExpirationWorkers(c, logger), managerLogger, c.metricSink)
|
||||
jobManager.Start()
|
||||
@@ -340,6 +341,7 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
|
||||
tokenStore: c.tokenStore,
|
||||
logger: logger,
|
||||
pending: sync.Map{},
|
||||
pendingLock: &locking.SyncRWMutex{},
|
||||
nonexpiring: sync.Map{},
|
||||
leaseCount: 0,
|
||||
tidyLock: new(int32),
|
||||
@@ -375,6 +377,11 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
|
||||
exp.logger = log.New(&opts)
|
||||
}
|
||||
|
||||
if detectDeadlocks {
|
||||
managerLogger.Debug("enabling deadlock detection")
|
||||
exp.pendingLock = &locking.DeadlockRWMutex{}
|
||||
}
|
||||
|
||||
go exp.uniquePoliciesGc()
|
||||
|
||||
return exp
|
||||
@@ -390,7 +397,9 @@ func (c *Core) setupExpiration(e ExpireLeaseStrategy) error {
|
||||
|
||||
// Create the manager
|
||||
expLogger := c.baseLogger.Named("expiration")
|
||||
mgr := NewExpirationManager(c, view, e, expLogger)
|
||||
|
||||
detectDeadlocks := slices.Contains(c.detectDeadlocks, "expiration")
|
||||
mgr := NewExpirationManager(c, view, e, expLogger, detectDeadlocks)
|
||||
c.expiration = mgr
|
||||
|
||||
// Link the token store to this
|
||||
@@ -2821,3 +2830,10 @@ func decodeLeaseEntry(buf []byte) (*leaseEntry, error) {
|
||||
out := new(leaseEntry)
|
||||
return out, jsonutil.DecodeJSON(buf, out)
|
||||
}
|
||||
|
||||
func (e *ExpirationManager) DetectDeadlocks() bool {
|
||||
if _, ok := e.pendingLock.(*locking.DeadlockRWMutex); ok {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -170,13 +170,13 @@ type Manager struct {
|
||||
metricSink *metricsutil.ClusterMetricSink
|
||||
|
||||
// quotaLock is a lock for manipulating quotas and anything not covered by a more specific lock
|
||||
quotaLock *locking.DeadlockRWMutex
|
||||
quotaLock locking.RWMutex
|
||||
|
||||
// quotaConfigLock is a lock for accessing config items, such as RateLimitExemptPaths
|
||||
quotaConfigLock *locking.DeadlockRWMutex
|
||||
quotaConfigLock locking.RWMutex
|
||||
|
||||
// dbAndCacheLock is a lock for db and path caches that need to be reset during Reset()
|
||||
dbAndCacheLock *locking.DeadlockRWMutex
|
||||
dbAndCacheLock locking.RWMutex
|
||||
}
|
||||
|
||||
// QuotaLeaseInformation contains all of the information lease-count quotas require
|
||||
@@ -275,7 +275,7 @@ type Request struct {
|
||||
|
||||
// NewManager creates and initializes a new quota manager to hold all the quota
|
||||
// rules and to process incoming requests.
|
||||
func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink) (*Manager, error) {
|
||||
func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink, detectDeadlocks bool) (*Manager, error) {
|
||||
db, err := memdb.NewMemDB(dbSchema())
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -287,9 +287,16 @@ func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.Clust
|
||||
metricSink: ms,
|
||||
rateLimitPathManager: pathmanager.New(),
|
||||
config: new(Config),
|
||||
quotaLock: new(locking.DeadlockRWMutex),
|
||||
quotaConfigLock: new(locking.DeadlockRWMutex),
|
||||
dbAndCacheLock: new(locking.DeadlockRWMutex),
|
||||
quotaLock: &locking.SyncRWMutex{},
|
||||
quotaConfigLock: &locking.SyncRWMutex{},
|
||||
dbAndCacheLock: &locking.SyncRWMutex{},
|
||||
}
|
||||
|
||||
if detectDeadlocks {
|
||||
logger.Debug("enabling deadlock detection")
|
||||
manager.quotaLock = &locking.DeadlockRWMutex{}
|
||||
manager.quotaConfigLock = &locking.DeadlockRWMutex{}
|
||||
manager.dbAndCacheLock = &locking.DeadlockRWMutex{}
|
||||
}
|
||||
|
||||
manager.init(walkFunc)
|
||||
@@ -1319,3 +1326,10 @@ func (m *Manager) HandleBackendDisabling(ctx context.Context, nsPath, mountPath
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (m *Manager) DetectDeadlocks() bool {
|
||||
if _, ok := m.quotaLock.(*locking.DeadlockRWMutex); ok {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
@@ -218,7 +218,7 @@ func TestRateLimitQuota_Allow_WithBlock(t *testing.T) {
|
||||
|
||||
func TestRateLimitQuota_Update(t *testing.T) {
|
||||
defer goleak.VerifyNone(t)
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
|
||||
require.NoError(t, err)
|
||||
|
||||
quota := NewRateLimitQuota("quota1", "", "", "", "", false, time.Second, 0, 10)
|
||||
|
||||
@@ -16,7 +16,7 @@ import (
|
||||
)
|
||||
|
||||
func TestQuotas_MountPathOverwrite(t *testing.T) {
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
|
||||
require.NoError(t, err)
|
||||
|
||||
quota := NewRateLimitQuota("tq", "", "kv1/", "", "", false, time.Second, 0, 10)
|
||||
@@ -43,7 +43,7 @@ func TestQuotas_MountPathOverwrite(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestQuotas_Precedence(t *testing.T) {
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
|
||||
require.NoError(t, err)
|
||||
|
||||
setQuotaFunc := func(t *testing.T, name, nsPath, mountPath, pathSuffix, role string, inheritable bool) Quota {
|
||||
@@ -142,7 +142,7 @@ func TestQuotas_QueryResolveRole_RateLimitQuotas(t *testing.T) {
|
||||
leaseWalkFunc := func(context.Context, func(request *Request) bool) error {
|
||||
return nil
|
||||
}
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink())
|
||||
qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink(), true)
|
||||
require.NoError(t, err)
|
||||
|
||||
rlqReq := &Request{
|
||||
|
||||
@@ -142,6 +142,20 @@ func TestCoreWithSeal(t testing.T, testSeal Seal, enableRaw bool) *Core {
|
||||
return TestCoreWithSealAndUI(t, conf)
|
||||
}
|
||||
|
||||
func TestCoreWithDeadlockDetection(t testing.T, testSeal Seal, enableRaw bool) *Core {
|
||||
conf := &CoreConfig{
|
||||
Seal: testSeal,
|
||||
EnableUI: false,
|
||||
EnableRaw: enableRaw,
|
||||
BuiltinRegistry: corehelpers.NewMockBuiltinRegistry(),
|
||||
AuditBackends: map[string]audit.Factory{
|
||||
"file": auditFile.Factory,
|
||||
},
|
||||
DetectDeadlocks: "expiration,quotas,statelock",
|
||||
}
|
||||
return TestCoreWithSealAndUI(t, conf)
|
||||
}
|
||||
|
||||
func TestCoreWithCustomResponseHeaderAndUI(t testing.T, CustomResponseHeaders map[string]map[string]string, enableUI bool) (*Core, [][]byte, string) {
|
||||
confRaw := &server.Config{
|
||||
SharedConfig: &configutil.SharedConfig{
|
||||
|
||||
@@ -159,10 +159,11 @@ to specify where the configuration is.
|
||||
maximum request duration allowed before Vault cancels the request. This can
|
||||
be overridden per listener via the `max_request_duration` value.
|
||||
|
||||
- `detect_deadlocks` `(string: "")` - Specifies the internal mutex locks that should be monitored for
|
||||
potential deadlocks. Currently supported value is `statelock`, which will cause "POTENTIAL DEADLOCK:"
|
||||
to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this can have
|
||||
a negative effect on performance due to the tracking of each lock attempt.
|
||||
- `detect_deadlocks` `(string: "")` - A comma-separated string that specifies the internal
|
||||
mutex locks that should be monitored for potential deadlocks. Currently supported values
|
||||
include `statelock`, `quotas`, and `expiration`, which will cause "POTENTIAL DEADLOCK:"
|
||||
to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this
|
||||
can have a negative effect on performance due to the tracking of each lock attempt.
|
||||
|
||||
- `raw_storage_endpoint` `(bool: false)` – Enables the `sys/raw` endpoint which
|
||||
allows the decryption/encryption of raw data into and out of the security
|
||||
|
||||
Reference in New Issue
Block a user