	core: fix bug where deadlock detection was always on for expiration and quotas (#23902)
* server: fix bug where deadlock detection was always on for expiration and quotas
* trim spaces
* Add tests
* Use trimspace and lower
* Update test
* changelog
* fix config parsing
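The substance of the change: the `detect_deadlocks` config string is now parsed once into a normalized list, and each subsystem (state lock, quota manager, expiration manager) opts in independently. A minimal standalone sketch of that normalize-then-match pattern, mirroring the parsing the patch adds to `CreateCore` (the helper name `parseDetectDeadlocks` is illustrative, not part of the patch):

```go
package main

import (
	"fmt"
	"slices"
	"strings"
)

// parseDetectDeadlocks mirrors the patch's handling of the "detect_deadlocks"
// config value: split on commas, then trim whitespace and lowercase each
// entry so values like "Statelock, quotas" still match.
func parseDetectDeadlocks(raw string) []string {
	if raw == "" {
		return nil
	}
	targets := strings.Split(raw, ",")
	for i, t := range targets {
		targets[i] = strings.ToLower(strings.TrimSpace(t))
	}
	return targets
}

func main() {
	targets := parseDetectDeadlocks("Statelock, quotas")

	// Each subsystem checks only for its own key, so detection can be
	// enabled for one lock without paying the tracking cost on the others.
	fmt.Println(slices.Contains(targets, "statelock"))  // true
	fmt.Println(slices.Contains(targets, "quotas"))     // true
	fmt.Println(slices.Contains(targets, "expiration")) // false
}
```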

changelog/23902.txt (new file):
@@ -0,0 +1,5 @@
+```release-note:bug
+core: fix bug where deadlock detection was always on for expiration and quotas.
+These can now be configured individually with `detect_deadlocks`.
+```
+

vault/core.go:
@@ -20,6 +20,7 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
+	"slices"
 	"strconv"
 	"strings"
 	"sync"
@@ -690,6 +691,9 @@ type Core struct {
 
 	// If any role based quota (LCQ or RLQ) is enabled, don't track lease counts by role
 	impreciseLeaseRoleTracking bool
+
+	// Config value for "detect_deadlocks".
+	detectDeadlocks []string
 }
 
 // c.stateLock needs to be held in read mode before calling this function.
@@ -947,19 +951,28 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
 	if conf.NumRollbackWorkers == 0 {
 		conf.NumRollbackWorkers = RollbackDefaultNumWorkers
 	}
-	// Use imported logging deadlock if requested
-	var stateLock locking.RWMutex
-	if strings.Contains(conf.DetectDeadlocks, "statelock") {
-		stateLock = &locking.DeadlockRWMutex{}
-	} else {
-		stateLock = &locking.SyncRWMutex{}
-	}
 
 	effectiveSDKVersion := conf.EffectiveSDKVersion
 	if effectiveSDKVersion == "" {
 		effectiveSDKVersion = version.GetVersion().Version
 	}
 
+	var detectDeadlocks []string
+	if conf.DetectDeadlocks != "" {
+		detectDeadlocks = strings.Split(conf.DetectDeadlocks, ",")
+		for k, v := range detectDeadlocks {
+			detectDeadlocks[k] = strings.ToLower(strings.TrimSpace(v))
+		}
+	}
+
+	// Use imported logging deadlock if requested
+	var stateLock locking.RWMutex
+	stateLock = &locking.SyncRWMutex{}
+
+	if slices.Contains(detectDeadlocks, "statelock") {
+		stateLock = &locking.DeadlockRWMutex{}
+	}
+
 	// Setup the core
 	c := &Core{
 		entCore:              entCore{},
@@ -1033,6 +1046,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
 		rollbackMountPathMetrics:       conf.MetricSink.TelemetryConsts.RollbackMetricsIncludeMountPoint,
 		numRollbackWorkers:             conf.NumRollbackWorkers,
 		impreciseLeaseRoleTracking:     conf.ImpreciseLeaseRoleTracking,
+		detectDeadlocks:                detectDeadlocks,
 	}
 
 	c.standbyStopCh.Store(make(chan struct{}))
@@ -1219,7 +1233,9 @@ func NewCore(conf *CoreConfig) (*Core, error) {
 
 	// Quotas
 	quotasLogger := conf.Logger.Named("quotas")
-	c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink)
+
+	detectDeadlocks := slices.Contains(c.detectDeadlocks, "quotas")
+	c.quotaManager, err = quotas.NewManager(quotasLogger, c.quotaLeaseWalker, c.metricSink, detectDeadlocks)
 	if err != nil {
 		return nil, err
 	}
@@ -4188,3 +4204,10 @@ func (c *Core) GetRaftAutopilotState(ctx context.Context) (*raft.AutopilotState,
 func (c *Core) Events() *eventbus.EventBus {
 	return c.events
 }
+
+func (c *Core) DetectStateLockDeadlocks() bool {
+	if _, ok := c.stateLock.(*locking.DeadlockRWMutex); ok {
+		return true
+	}
+	return false
+}

vault/core_test.go:
@@ -3361,3 +3361,51 @@ func InduceDeadlock(t *testing.T, vaultcore *Core, expected uint32) {
 		t.Fatalf("expected 1 deadlock, detected %d", deadlocks)
 	}
 }
+
+func TestExpiration_DeadlockDetection(t *testing.T) {
+	testCore := TestCore(t)
+	testCoreUnsealed(t, testCore)
+
+	if testCore.expiration.DetectDeadlocks() {
+		t.Fatal("expiration has deadlock detection enabled, it shouldn't")
+	}
+
+	testCore = TestCoreWithDeadlockDetection(t, nil, false)
+	testCoreUnsealed(t, testCore)
+
+	if !testCore.expiration.DetectDeadlocks() {
+		t.Fatal("expiration doesn't have deadlock detection enabled, it should")
+	}
+}
+
+func TestQuotas_DeadlockDetection(t *testing.T) {
+	testCore := TestCore(t)
+	testCoreUnsealed(t, testCore)
+
+	if testCore.quotaManager.DetectDeadlocks() {
+		t.Fatal("quotas has deadlock detection enabled, it shouldn't")
+	}
+
+	testCore = TestCoreWithDeadlockDetection(t, nil, false)
+	testCoreUnsealed(t, testCore)
+
+	if !testCore.quotaManager.DetectDeadlocks() {
+		t.Fatal("quotas doesn't have deadlock detection enabled, it should")
+	}
+}
+
+func TestStatelock_DeadlockDetection(t *testing.T) {
+	testCore := TestCore(t)
+	testCoreUnsealed(t, testCore)
+
+	if testCore.DetectStateLockDeadlocks() {
+		t.Fatal("statelock has deadlock detection enabled, it shouldn't")
+	}
+
+	testCore = TestCoreWithDeadlockDetection(t, nil, false)
+	testCoreUnsealed(t, testCore)
+
+	if !testCore.DetectStateLockDeadlocks() {
+		t.Fatal("statelock doesn't have deadlock detection enabled, it should")
+	}
+}

vault/expiration.go:
@@ -11,6 +11,7 @@ import (
 	"math/rand"
 	"os"
 	"path"
+	"slices"
 	"sort"
 	"strconv"
 	"strings"
@@ -114,7 +115,7 @@ type ExpirationManager struct {
 	pending     sync.Map
 	nonexpiring sync.Map
 	leaseCount  int
-	pendingLock locking.DeadlockRWMutex
+	pendingLock locking.RWMutex
 
 	// A sync.Lock for every active leaseID
 	lockPerLease sync.Map
@@ -327,7 +328,7 @@ func getNumExpirationWorkers(c *Core, l log.Logger) int {
 
 // NewExpirationManager creates a new ExpirationManager that is backed
 // using a given view, and uses the provided router for revocation.
-func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger) *ExpirationManager {
+func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, logger log.Logger, detectDeadlocks bool) *ExpirationManager {
 	managerLogger := logger.Named("job-manager")
 	jobManager := fairshare.NewJobManager("expire", getNumExpirationWorkers(c, logger), managerLogger, c.metricSink)
 	jobManager.Start()
@@ -340,6 +341,7 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
 		tokenStore:  c.tokenStore,
 		logger:      logger,
 		pending:     sync.Map{},
+		pendingLock: &locking.SyncRWMutex{},
 		nonexpiring: sync.Map{},
 		leaseCount:  0,
 		tidyLock:    new(int32),
@@ -375,6 +377,11 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
 		exp.logger = log.New(&opts)
 	}
 
+	if detectDeadlocks {
+		managerLogger.Debug("enabling deadlock detection")
+		exp.pendingLock = &locking.DeadlockRWMutex{}
+	}
+
 	go exp.uniquePoliciesGc()
 
 	return exp
@@ -390,7 +397,9 @@ func (c *Core) setupExpiration(e ExpireLeaseStrategy) error {
 
 	// Create the manager
 	expLogger := c.baseLogger.Named("expiration")
-	mgr := NewExpirationManager(c, view, e, expLogger)
+
+	detectDeadlocks := slices.Contains(c.detectDeadlocks, "expiration")
+	mgr := NewExpirationManager(c, view, e, expLogger, detectDeadlocks)
 	c.expiration = mgr
 
 	// Link the token store to this
@@ -2821,3 +2830,10 @@ func decodeLeaseEntry(buf []byte) (*leaseEntry, error) {
 	out := new(leaseEntry)
 	return out, jsonutil.DecodeJSON(buf, out)
 }
+
+func (e *ExpirationManager) DetectDeadlocks() bool {
+	if _, ok := e.pendingLock.(*locking.DeadlockRWMutex); ok {
+		return true
+	}
+	return false
+}

vault/quotas/quotas.go:
@@ -170,13 +170,13 @@
 	metricSink *metricsutil.ClusterMetricSink
 
 	// quotaLock is a lock for manipulating quotas and anything not covered by a more specific lock
-	quotaLock *locking.DeadlockRWMutex
+	quotaLock locking.RWMutex
 
 	// quotaConfigLock is a lock for accessing config items, such as RateLimitExemptPaths
-	quotaConfigLock *locking.DeadlockRWMutex
+	quotaConfigLock locking.RWMutex
 
 	// dbAndCacheLock is a lock for db and path caches that need to be reset during Reset()
-	dbAndCacheLock *locking.DeadlockRWMutex
+	dbAndCacheLock locking.RWMutex
 }
 
 // QuotaLeaseInformation contains all of the information lease-count quotas require
@@ -275,7 +275,7 @@ type Request struct {
 
 // NewManager creates and initializes a new quota manager to hold all the quota
 // rules and to process incoming requests.
-func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink) (*Manager, error) {
+func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.ClusterMetricSink, detectDeadlocks bool) (*Manager, error) {
 	db, err := memdb.NewMemDB(dbSchema())
 	if err != nil {
 		return nil, err
@@ -287,9 +287,16 @@ func NewManager(logger log.Logger, walkFunc leaseWalkFunc, ms *metricsutil.Clust
 		metricSink:           ms,
 		rateLimitPathManager: pathmanager.New(),
 		config:               new(Config),
-		quotaLock:            new(locking.DeadlockRWMutex),
-		quotaConfigLock:      new(locking.DeadlockRWMutex),
-		dbAndCacheLock:       new(locking.DeadlockRWMutex),
+		quotaLock:            &locking.SyncRWMutex{},
+		quotaConfigLock:      &locking.SyncRWMutex{},
+		dbAndCacheLock:       &locking.SyncRWMutex{},
 	}
 
+	if detectDeadlocks {
+		logger.Debug("enabling deadlock detection")
+		manager.quotaLock = &locking.DeadlockRWMutex{}
+		manager.quotaConfigLock = &locking.DeadlockRWMutex{}
+		manager.dbAndCacheLock = &locking.DeadlockRWMutex{}
+	}
+
 	manager.init(walkFunc)
@@ -1319,3 +1326,10 @@ func (m *Manager) HandleBackendDisabling(ctx context.Context, nsPath, mountPath
 
 	return nil
 }
+
+func (m *Manager) DetectDeadlocks() bool {
+	if _, ok := m.quotaLock.(*locking.DeadlockRWMutex); ok {
+		return true
+	}
+	return false
+}

vault/quotas/quotas_rate_limit_test.go:
@@ -218,7 +218,7 @@ func TestRateLimitQuota_Allow_WithBlock(t *testing.T) {
 
 func TestRateLimitQuota_Update(t *testing.T) {
 	defer goleak.VerifyNone(t)
-	qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
+	qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
 	require.NoError(t, err)
 
 	quota := NewRateLimitQuota("quota1", "", "", "", "", false, time.Second, 0, 10)

vault/quotas/quotas_test.go:
@@ -16,7 +16,7 @@ import (
 )
 
 func TestQuotas_MountPathOverwrite(t *testing.T) {
-	qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
+	qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
 	require.NoError(t, err)
 
 	quota := NewRateLimitQuota("tq", "", "kv1/", "", "", false, time.Second, 0, 10)
@@ -43,7 +43,7 @@ func TestQuotas_MountPathOverwrite(t *testing.T) {
 }
 
 func TestQuotas_Precedence(t *testing.T) {
-	qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink())
+	qm, err := NewManager(logging.NewVaultLogger(log.Trace), nil, metricsutil.BlackholeSink(), true)
 	require.NoError(t, err)
 
 	setQuotaFunc := func(t *testing.T, name, nsPath, mountPath, pathSuffix, role string, inheritable bool) Quota {
@@ -142,7 +142,7 @@ func TestQuotas_QueryResolveRole_RateLimitQuotas(t *testing.T) {
 	leaseWalkFunc := func(context.Context, func(request *Request) bool) error {
 		return nil
 	}
-	qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink())
+	qm, err := NewManager(logging.NewVaultLogger(log.Trace), leaseWalkFunc, metricsutil.BlackholeSink(), true)
 	require.NoError(t, err)
 
 	rlqReq := &Request{

vault/testing.go:
@@ -142,6 +142,20 @@ func TestCoreWithSeal(t testing.T, testSeal Seal, enableRaw bool) *Core {
 	return TestCoreWithSealAndUI(t, conf)
 }
 
+func TestCoreWithDeadlockDetection(t testing.T, testSeal Seal, enableRaw bool) *Core {
+	conf := &CoreConfig{
+		Seal:            testSeal,
+		EnableUI:        false,
+		EnableRaw:       enableRaw,
+		BuiltinRegistry: corehelpers.NewMockBuiltinRegistry(),
+		AuditBackends: map[string]audit.Factory{
+			"file": auditFile.Factory,
+		},
+		DetectDeadlocks: "expiration,quotas,statelock",
+	}
+	return TestCoreWithSealAndUI(t, conf)
+}
+
 func TestCoreWithCustomResponseHeaderAndUI(t testing.T, CustomResponseHeaders map[string]map[string]string, enableUI bool) (*Core, [][]byte, string) {
 	confRaw := &server.Config{
 		SharedConfig: &configutil.SharedConfig{

website/content/docs/configuration/index.mdx:
@@ -159,10 +159,11 @@ to specify where the configuration is.
   maximum request duration allowed before Vault cancels the request. This can
   be overridden per listener via the `max_request_duration` value.
 
-- `detect_deadlocks` `(string: "")` - Specifies the internal mutex locks that should be monitored for
-potential deadlocks. Currently supported value is `statelock`, which will cause "POTENTIAL DEADLOCK:"
-to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this can have
-a negative effect on performance due to the tracking of each lock attempt.
+- `detect_deadlocks` `(string: "")` - A comma-separated string that specifies the internal
+mutex locks that should be monitored for potential deadlocks. Currently supported values
+include `statelock`, `quotas`, and `expiration`, which will cause "POTENTIAL DEADLOCK:"
+to be logged when an attempt to acquire one of these locks appears to be deadlocked.
+Enabling this can have a negative effect on performance due to the tracking of each lock attempt.
 
 - `raw_storage_endpoint` `(bool: false)` – Enables the `sys/raw` endpoint which
   allows the decryption/encryption of raw data into and out of the security
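For operators, the net effect is that deadlock detection for quotas and expiration is now opt-in through the same `detect_deadlocks` setting that previously only controlled the state lock. A minimal sketch of the relevant server-configuration line, covering the same three values exercised by the `TestCoreWithDeadlockDetection` helper above (the rest of the config file is omitted):

```hcl
# Enable deadlock detection for the core state lock, the quota manager,
# and the expiration manager. Entries are comma-separated; surrounding
# whitespace is trimmed and matching is case-insensitive.
# Note: tracking every lock attempt has a performance cost.
detect_deadlocks = "statelock,quotas,expiration"
```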
Author: Jason O'Donnell