add core state lock deadlock detection config option v2 (#18604)

* add core state lockd eadlock detection config option v2

* add changelog

* split out NewTestCluster function to maintain build flag

* replace long func with constant

* remove line

* rename file, and move where detect deadlock flag is set
This commit is contained in:
Ellie
2023-01-11 13:32:05 -06:00
committed by GitHub
parent 5564be70ea
commit 49da2544ce
14 changed files with 131 additions and 30 deletions

3
changelog/18604.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:improvement
core: add `detect_deadlocks` config to optionally detect core state deadlocks
```

View File

@@ -2619,6 +2619,7 @@ func createCoreConfig(c *ServerCommand, config *server.Config, backend physical.
CredentialBackends: c.CredentialBackends, CredentialBackends: c.CredentialBackends,
LogicalBackends: c.LogicalBackends, LogicalBackends: c.LogicalBackends,
Logger: c.logger, Logger: c.logger,
DetectDeadlocks: config.DetectDeadlocks,
DisableSentinelTrace: config.DisableSentinelTrace, DisableSentinelTrace: config.DisableSentinelTrace,
DisableCache: config.DisableCache, DisableCache: config.DisableCache,
DisableMlock: config.DisableMlock, DisableMlock: config.DisableMlock,

View File

@@ -97,6 +97,8 @@ type Config struct {
LogRequestsLevel string `hcl:"-"` LogRequestsLevel string `hcl:"-"`
LogRequestsLevelRaw interface{} `hcl:"log_requests_level"` LogRequestsLevelRaw interface{} `hcl:"log_requests_level"`
DetectDeadlocks string `hcl:"detect_deadlocks"`
EnableResponseHeaderRaftNodeID bool `hcl:"-"` EnableResponseHeaderRaftNodeID bool `hcl:"-"`
EnableResponseHeaderRaftNodeIDRaw interface{} `hcl:"enable_response_header_raft_node_id"` EnableResponseHeaderRaftNodeIDRaw interface{} `hcl:"enable_response_header_raft_node_id"`
@@ -389,6 +391,11 @@ func (c *Config) Merge(c2 *Config) *Config {
result.LogRequestsLevel = c2.LogRequestsLevel result.LogRequestsLevel = c2.LogRequestsLevel
} }
result.DetectDeadlocks = c.DetectDeadlocks
if c2.DetectDeadlocks != "" {
result.DetectDeadlocks = c2.DetectDeadlocks
}
result.EnableResponseHeaderRaftNodeID = c.EnableResponseHeaderRaftNodeID result.EnableResponseHeaderRaftNodeID = c.EnableResponseHeaderRaftNodeID
if c2.EnableResponseHeaderRaftNodeID { if c2.EnableResponseHeaderRaftNodeID {
result.EnableResponseHeaderRaftNodeID = c2.EnableResponseHeaderRaftNodeID result.EnableResponseHeaderRaftNodeID = c2.EnableResponseHeaderRaftNodeID
@@ -1025,6 +1032,8 @@ func (c *Config) Sanitized() map[string]interface{} {
"enable_response_header_raft_node_id": c.EnableResponseHeaderRaftNodeID, "enable_response_header_raft_node_id": c.EnableResponseHeaderRaftNodeID,
"log_requests_level": c.LogRequestsLevel, "log_requests_level": c.LogRequestsLevel,
"detect_deadlocks": c.DetectDeadlocks,
} }
for k, v := range sharedResult { for k, v := range sharedResult {
result[k] = v result[k] = v

View File

@@ -745,6 +745,7 @@ func testConfig_Sanitized(t *testing.T) {
"raw_storage_endpoint": true, "raw_storage_endpoint": true,
"introspection_endpoint": false, "introspection_endpoint": false,
"disable_sentinel_trace": true, "disable_sentinel_trace": true,
"detect_deadlocks": "",
"enable_ui": true, "enable_ui": true,
"enable_response_header_hostname": false, "enable_response_header_hostname": false,
"enable_response_header_raft_node_id": false, "enable_response_header_raft_node_id": false,

View File

@@ -1,21 +0,0 @@
//go:build deadlock
package locking
import (
"github.com/sasha-s/go-deadlock"
)
// DeadlockMutex, when the build tag `deadlock` is present, behaves like a
// sync.Mutex but does periodic checking to see if outstanding locks and requests
// look like a deadlock. If it finds a deadlock candidate it will output it
// prefixed with "POTENTIAL DEADLOCK", as described at
// https://github.com/sasha-s/go-deadlock
type DeadlockMutex struct {
deadlock.Mutex
}
// DeadlockRWMutex is the RW version of DeadlockMutex.
type DeadlockRWMutex struct {
deadlock.RWMutex
}

View File

@@ -1,19 +1,46 @@
//go:build !deadlock
package locking package locking
import ( import (
"sync" "sync"
"github.com/sasha-s/go-deadlock"
) )
// DeadlockMutex is just a sync.Mutex when the build tag `deadlock` is absent. // Common mutex interface to allow either built-in or imported deadlock use
// See its other definition in the corresponding deadlock-build-tag-constrained type Mutex interface {
// file for more details. Lock()
Unlock()
}
// Common r/w mutex interface to allow either built-in or imported deadlock use
type RWMutex interface {
Lock()
RLock()
RLocker() sync.Locker
RUnlock()
Unlock()
}
// DeadlockMutex (used when requested via config option `detact_deadlocks`),
// behaves like a sync.Mutex but does periodic checking to see if outstanding
// locks and requests look like a deadlock. If it finds a deadlock candidate it
// will output it prefixed with "POTENTIAL DEADLOCK", as described at
// https://github.com/sasha-s/go-deadlock
type DeadlockMutex struct { type DeadlockMutex struct {
sync.Mutex deadlock.Mutex
} }
// DeadlockRWMutex is the RW version of DeadlockMutex. // DeadlockRWMutex is the RW version of DeadlockMutex.
type DeadlockRWMutex struct { type DeadlockRWMutex struct {
deadlock.RWMutex
}
// Regular sync/mutex.
type SyncMutex struct {
sync.Mutex
}
// DeadlockRWMutex is the RW version of SyncMutex.
type SyncRWMutex struct {
sync.RWMutex sync.RWMutex
} }

View File

@@ -39,6 +39,7 @@ func TestSysConfigState_Sanitized(t *testing.T) {
"disable_printable_check": false, "disable_printable_check": false,
"disable_sealwrap": false, "disable_sealwrap": false,
"raw_storage_endpoint": false, "raw_storage_endpoint": false,
"detect_deadlocks": "",
"introspection_endpoint": false, "introspection_endpoint": false,
"disable_sentinel_trace": false, "disable_sentinel_trace": false,
"enable_ui": false, "enable_ui": false,

View File

@@ -316,7 +316,7 @@ type Core struct {
auditBackends map[string]audit.Factory auditBackends map[string]audit.Factory
// stateLock protects mutable state // stateLock protects mutable state
stateLock locking.DeadlockRWMutex stateLock locking.RWMutex
sealed *uint32 sealed *uint32
standby bool standby bool
@@ -732,6 +732,9 @@ type CoreConfig struct {
Logger log.Logger Logger log.Logger
// Use the deadlocks library to detect deadlocks
DetectDeadlocks string
// Disables the trace display for Sentinel checks // Disables the trace display for Sentinel checks
DisableSentinelTrace bool DisableSentinelTrace bool
@@ -904,6 +907,14 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
conf.NumExpirationWorkers = numExpirationWorkersDefault conf.NumExpirationWorkers = numExpirationWorkersDefault
} }
// Use imported logging deadlock if requested
var stateLock locking.RWMutex
if strings.Contains(conf.DetectDeadlocks, "statelock") {
stateLock = &locking.DeadlockRWMutex{}
} else {
stateLock = &locking.SyncRWMutex{}
}
effectiveSDKVersion := conf.EffectiveSDKVersion effectiveSDKVersion := conf.EffectiveSDKVersion
if effectiveSDKVersion == "" { if effectiveSDKVersion == "" {
effectiveSDKVersion = version.GetVersion().Version effectiveSDKVersion = version.GetVersion().Version
@@ -922,6 +933,7 @@ func CreateCore(conf *CoreConfig) (*Core, error) {
clusterListener: new(atomic.Value), clusterListener: new(atomic.Value),
customListenerHeader: new(atomic.Value), customListenerHeader: new(atomic.Value),
seal: conf.Seal, seal: conf.Seal,
stateLock: stateLock,
router: NewRouter(), router: NewRouter(),
sealed: new(uint32), sealed: new(uint32),
sealMigrationDone: new(uint32), sealMigrationDone: new(uint32),

View File

@@ -6,6 +6,7 @@ import (
"reflect" "reflect"
"strings" "strings"
"sync" "sync"
"sync/atomic"
"testing" "testing"
"time" "time"
@@ -21,6 +22,7 @@ import (
"github.com/hashicorp/vault/sdk/physical" "github.com/hashicorp/vault/sdk/physical"
"github.com/hashicorp/vault/sdk/physical/inmem" "github.com/hashicorp/vault/sdk/physical/inmem"
"github.com/hashicorp/vault/version" "github.com/hashicorp/vault/version"
"github.com/sasha-s/go-deadlock"
) )
// invalidKey is used to test Unseal // invalidKey is used to test Unseal
@@ -2836,3 +2838,51 @@ func TestCore_ServiceRegistration(t *testing.T) {
t.Fatal(diff) t.Fatal(diff)
} }
} }
func TestDetectedDeadlock(t *testing.T) {
testCore, _, _ := TestCoreUnsealedWithConfig(t, &CoreConfig{DetectDeadlocks: "statelock"})
InduceDeadlock(t, testCore, 1)
}
func TestDefaultDeadlock(t *testing.T) {
testCore, _, _ := TestCoreUnsealed(t)
InduceDeadlock(t, testCore, 0)
}
func RestoreDeadlockOpts() func() {
opts := deadlock.Opts
return func() {
deadlock.Opts = opts
}
}
func InduceDeadlock(t *testing.T, vaultcore *Core, expected uint32) {
defer RestoreDeadlockOpts()()
var deadlocks uint32
deadlock.Opts.OnPotentialDeadlock = func() {
atomic.AddUint32(&deadlocks, 1)
}
var mtx deadlock.Mutex
var wg sync.WaitGroup
wg.Add(1)
go func() {
defer wg.Done()
vaultcore.expiration.coreStateLock.Lock()
mtx.Lock()
mtx.Unlock()
vaultcore.expiration.coreStateLock.Unlock()
}()
wg.Wait()
wg.Add(1)
go func() {
defer wg.Done()
mtx.Lock()
vaultcore.expiration.coreStateLock.RLock()
vaultcore.expiration.coreStateLock.RUnlock()
mtx.Unlock()
}()
wg.Wait()
if atomic.LoadUint32(&deadlocks) != expected {
t.Fatalf("expected 1 deadlock, detected %d", deadlocks)
}
}

View File

@@ -139,7 +139,7 @@ type ExpirationManager struct {
quitCh chan struct{} quitCh chan struct{}
// do not hold coreStateLock in any API handler code - it is already held // do not hold coreStateLock in any API handler code - it is already held
coreStateLock *locking.DeadlockRWMutex coreStateLock locking.RWMutex
quitContext context.Context quitContext context.Context
leaseCheckCounter *uint32 leaseCheckCounter *uint32
@@ -350,7 +350,7 @@ func NewExpirationManager(c *Core, view *BarrierView, e ExpireLeaseStrategy, log
restoreLocks: locksutil.CreateLocks(), restoreLocks: locksutil.CreateLocks(),
quitCh: make(chan struct{}), quitCh: make(chan struct{}),
coreStateLock: &c.stateLock, coreStateLock: c.stateLock,
quitContext: c.activeContext, quitContext: c.activeContext,
leaseCheckCounter: new(uint32), leaseCheckCounter: new(uint32),

View File

@@ -0,0 +1,5 @@
//go:build deadlock
package vault
const TestDeadlockDetection = "statelock"

View File

@@ -0,0 +1,5 @@
//go:build !deadlock
package vault
const TestDeadlockDetection = ""

View File

@@ -212,6 +212,7 @@ func TestCoreWithSealAndUINoCleanup(t testing.T, opts *CoreConfig) *Core {
conf.EnableResponseHeaderHostname = opts.EnableResponseHeaderHostname conf.EnableResponseHeaderHostname = opts.EnableResponseHeaderHostname
conf.DisableSSCTokens = opts.DisableSSCTokens conf.DisableSSCTokens = opts.DisableSSCTokens
conf.PluginDirectory = opts.PluginDirectory conf.PluginDirectory = opts.PluginDirectory
conf.DetectDeadlocks = opts.DetectDeadlocks
if opts.Logger != nil { if opts.Logger != nil {
conf.Logger = opts.Logger conf.Logger = opts.Logger
@@ -1382,6 +1383,7 @@ func NewTestCluster(t testing.T, base *CoreConfig, opts *TestClusterOptions) *Te
var baseAddr *net.TCPAddr var baseAddr *net.TCPAddr
if opts != nil && opts.BaseListenAddress != "" { if opts != nil && opts.BaseListenAddress != "" {
baseAddr, err = net.ResolveTCPAddr("tcp", opts.BaseListenAddress) baseAddr, err = net.ResolveTCPAddr("tcp", opts.BaseListenAddress)
if err != nil { if err != nil {
t.Fatal("could not parse given base IP") t.Fatal("could not parse given base IP")
} }
@@ -1633,6 +1635,7 @@ func NewTestCluster(t testing.T, base *CoreConfig, opts *TestClusterOptions) *Te
} }
if base != nil { if base != nil {
coreConfig.DetectDeadlocks = TestDeadlockDetection
coreConfig.RawConfig = base.RawConfig coreConfig.RawConfig = base.RawConfig
coreConfig.DisableCache = base.DisableCache coreConfig.DisableCache = base.DisableCache
coreConfig.EnableUI = base.EnableUI coreConfig.EnableUI = base.EnableUI

View File

@@ -149,6 +149,11 @@ to specify where the configuration is.
maximum request duration allowed before Vault cancels the request. This can maximum request duration allowed before Vault cancels the request. This can
be overridden per listener via the `max_request_duration` value. be overridden per listener via the `max_request_duration` value.
- `detect_deadlocks` `(string: "")` - Specifies the internal mutex locks that should be monitored for
potential deadlocks. Currently supported value is `statelock`, which will cause "POTENTIAL DEADLOCK:"
to be logged when an attempt at a core state lock appears to be deadlocked. Enabling this can have
a negative effect on performance due to the tracking of each lock attempt.
- `raw_storage_endpoint` `(bool: false)` Enables the `sys/raw` endpoint which - `raw_storage_endpoint` `(bool: false)` Enables the `sys/raw` endpoint which
allows the decryption/encryption of raw data into and out of the security allows the decryption/encryption of raw data into and out of the security
barrier. This is a highly privileged endpoint. barrier. This is a highly privileged endpoint.