Reduce blocking in approle tidy by grabbing only one lock at a time. (#8418)

Revamp race test to make it aware of when tidies start/stop, and to generate dangling accessors that need cleaning up.  Also run for longer but with slightly less concurrency to ensure the writes and the cleanup actually overlap.
This commit is contained in:
ncabatoff
2020-03-03 16:43:24 -05:00
committed by GitHub
parent 152b4a8447
commit 6c92f1e40b
3 changed files with 252 additions and 215 deletions

View File

@@ -3,7 +3,6 @@ package approle
import (
"context"
"sync"
"time"
"github.com/hashicorp/vault/sdk/framework"
"github.com/hashicorp/vault/sdk/helper/consts"
@@ -57,8 +56,6 @@ type backend struct {
// secretIDListingLock is a dedicated lock for listing SecretIDAccessors
// for all the SecretIDs issued against an approle
secretIDListingLock sync.RWMutex
testTidyDelay time.Duration
}
func Factory(ctx context.Context, conf *logical.BackendConfig) (logical.Backend, error) {

View File

@@ -10,6 +10,7 @@ import (
"github.com/hashicorp/errwrap"
"github.com/hashicorp/vault/sdk/framework"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/hashicorp/vault/sdk/helper/locksutil"
"github.com/hashicorp/vault/sdk/logical"
)
@@ -39,9 +40,20 @@ func (b *backend) tidySecretID(ctx context.Context, req *logical.Request) (*logi
return resp, nil
}
s := req.Storage
go b.tidySecretIDinternal(req.Storage)
go func() {
resp := &logical.Response{}
resp.AddWarning("Tidy operation successfully started. Any information from the operation will be printed to Vault's server logs.")
return logical.RespondWithStatusCode(resp, req, http.StatusAccepted)
}
type tidyHelperSecretIDAccessor struct {
secretIDAccessorStorageEntry
saltedSecretIDAccessor string
}
func (b *backend) tidySecretIDinternal(s logical.Storage) {
defer atomic.StoreUint32(b.tidySecretIDCASGuard, 0)
logger := b.Logger().Named("tidy")
@@ -49,35 +61,50 @@ func (b *backend) tidySecretID(ctx context.Context, req *logical.Request) (*logi
checkCount := 0
defer func() {
if b.testTidyDelay > 0 {
logger.Trace("done checking entries", "num_entries", checkCount)
}
}()
// Don't cancel when the original client request goes away
ctx = context.Background()
ctx := context.Background()
tidyFunc := func(secretIDPrefixToUse, accessorIDPrefixToUse string) error {
logger.Trace("listing role HMACs", "prefix", secretIDPrefixToUse)
roleNameHMACs, err := s.List(ctx, secretIDPrefixToUse)
salt, err := b.Salt(ctx)
if err != nil {
return err
logger.Error("error tidying secret IDs", "error", err)
return
}
tidyFunc := func(secretIDPrefixToUse, accessorIDPrefixToUse string) error {
logger.Trace("listing accessors", "prefix", accessorIDPrefixToUse)
// List all the accessors and add them all to a map
// These hashes are the result of salting the accessor id.
accessorHashes, err := s.List(ctx, accessorIDPrefixToUse)
if err != nil {
return err
}
accessorMap := make(map[string]bool, len(accessorHashes))
skipHashes := make(map[string]bool, len(accessorHashes))
accHashesByLockID := make([][]tidyHelperSecretIDAccessor, 256)
for _, accessorHash := range accessorHashes {
accessorMap[accessorHash] = true
var entry secretIDAccessorStorageEntry
entryIndex := accessorIDPrefixToUse + accessorHash
se, err := s.Get(ctx, entryIndex)
if err != nil {
return err
}
if se == nil {
continue
}
err = se.DecodeJSON(&entry)
if err != nil {
return err
}
time.Sleep(b.testTidyDelay)
lockIdx := locksutil.LockIndexForKey(entry.SecretIDHMAC)
accHashesByLockID[lockIdx] = append(accHashesByLockID[lockIdx], tidyHelperSecretIDAccessor{
secretIDAccessorStorageEntry: entry,
saltedSecretIDAccessor: accessorHash,
})
}
secretIDCleanupFunc := func(secretIDHMAC, roleNameHMAC, secretIDPrefixToUse string) error {
checkCount++
@@ -135,19 +162,20 @@ func (b *backend) tidySecretID(ctx context.Context, req *logical.Request) (*logi
return nil
}
// At this point, the secret ID is not expired and is valid. Delete
// the corresponding accessor from the accessorMap. This will leave
// only the dangling accessors in the map which can then be cleaned
// up later.
salt, err := b.Salt(ctx)
if err != nil {
return err
}
delete(accessorMap, salt.SaltID(result.SecretIDAccessor))
// At this point, the secret ID is not expired and is valid. Flag
// the corresponding accessor as not needing attention.
skipHashes[salt.SaltID(result.SecretIDAccessor)] = true
return nil
}
logger.Trace("listing role HMACs", "prefix", secretIDPrefixToUse)
roleNameHMACs, err := s.List(ctx, secretIDPrefixToUse)
if err != nil {
return err
}
for _, roleNameHMAC := range roleNameHMACs {
logger.Trace("listing secret ID HMACs", "role_hmac", roleNameHMAC)
secretIDHMACs, err := s.List(ctx, fmt.Sprintf("%s%s", secretIDPrefixToUse, roleNameHMAC))
@@ -164,59 +192,55 @@ func (b *backend) tidySecretID(ctx context.Context, req *logical.Request) (*logi
// Accessor indexes were not getting cleaned up until 0.9.3. This is a fix
// to clean up the dangling accessor entries.
if len(accessorMap) > 0 {
for _, lock := range b.secretIDLocks {
lock.Lock()
defer lock.Unlock()
}
for accessorHash, _ := range accessorMap {
logger.Trace("found dangling accessor, verifying")
// Ideally, locking on accessors should be performed here too
// but for that, accessors are required in plaintext, which are
// not available. The code above helps but it may still be
// racy.
// ...
// Look up the secret again now that we have all the locks. The
// lock is held when writing accessor/secret so if we have the
// lock we know we're not in a
// wrote-accessor-but-not-yet-secret case, which can be racy.
var entry secretIDAccessorStorageEntry
entryIndex := accessorIDPrefixToUse + accessorHash
se, err := s.Get(ctx, entryIndex)
if err != nil {
return err
}
if se != nil {
err = se.DecodeJSON(&entry)
if err != nil {
return err
}
// The storage entry doesn't store the role ID, so we have
// to go about this the long way; fortunately we shouldn't
// actually hit this very often
var found bool
searchloop:
if len(accessorHashes) > len(skipHashes) {
// There is some raciness here because we're querying secretids for
// roles without having a lock while doing so. Because
// accHashesByLockID was populated previously, at worst this may
// mean that we fail to clean up something we ought to.
var allSecretIDHMACs = make(map[string]struct{})
for _, roleNameHMAC := range roleNameHMACs {
secretIDHMACs, err := s.List(ctx, fmt.Sprintf("%s%s", secretIDPrefixToUse, roleNameHMAC))
secretIDHMACs, err := s.List(ctx, secretIDPrefixToUse+roleNameHMAC)
if err != nil {
return err
}
for _, v := range secretIDHMACs {
if v == entry.SecretIDHMAC {
found = true
logger.Trace("accessor verified, not removing")
break searchloop
allSecretIDHMACs[v] = struct{}{}
}
}
tidyEntries := func(entries []tidyHelperSecretIDAccessor) error {
for _, entry := range entries {
// Don't clean up accessor index entry if secretid cleanup func
// determined that it should stay.
if _, ok := skipHashes[entry.saltedSecretIDAccessor]; ok {
continue
}
if !found {
logger.Trace("could not verify dangling accessor, removing")
err = s.Delete(ctx, entryIndex)
if err != nil {
// Don't clean up accessor index entry if referenced in role.
if _, ok := allSecretIDHMACs[entry.SecretIDHMAC]; ok {
continue
}
if err := s.Delete(context.Background(), accessorIDPrefixToUse+entry.saltedSecretIDAccessor); err != nil {
return err
}
}
return nil
}
for lockIdx, entries := range accHashesByLockID {
// Ideally, locking on accessors should be performed here too
// but for that, accessors are required in plaintext, which are
// not available.
// ...
// The lock is held when writing accessor/secret so if we have
// the lock we know we're not in a
// wrote-accessor-but-not-yet-secret case, which can be racy.
b.secretIDLocks[lockIdx].Lock()
err = tidyEntries(entries)
b.secretIDLocks[lockIdx].Unlock()
if err != nil {
return err
}
}
}
@@ -224,7 +248,7 @@ func (b *backend) tidySecretID(ctx context.Context, req *logical.Request) (*logi
return nil
}
err := tidyFunc(secretIDPrefix, secretIDAccessorPrefix)
err = tidyFunc(secretIDPrefix, secretIDAccessorPrefix)
if err != nil {
logger.Error("error tidying global secret IDs", "error", err)
return
@@ -234,11 +258,6 @@ func (b *backend) tidySecretID(ctx context.Context, req *logical.Request) (*logi
logger.Error("error tidying local secret IDs", "error", err)
return
}
}()
resp := &logical.Response{}
resp.AddWarning("Tidy operation successfully started. Any information from the operation will be printed to Vault's server logs.")
return logical.RespondWithStatusCode(resp, req, http.StatusAccepted)
}
// pathTidySecretIDUpdate is used to delete the expired SecretID entries

View File

@@ -4,6 +4,7 @@ import (
"context"
"fmt"
"sync"
"sync/atomic"
"testing"
"time"
@@ -91,8 +92,6 @@ func TestAppRole_TidyDanglingAccessors_RaceTest(t *testing.T) {
var err error
b, storage := createBackendWithStorage(t)
b.testTidyDelay = 300 * time.Millisecond
// Create a role
createRole(t, b, storage, "role1", "a,b,c")
@@ -109,14 +108,9 @@ func TestAppRole_TidyDanglingAccessors_RaceTest(t *testing.T) {
count := 1
wg := &sync.WaitGroup{}
now := time.Now()
started := false
for {
if time.Now().Sub(now) > 700*time.Millisecond {
break
}
if time.Now().Sub(now) > 100*time.Millisecond && !started {
started = true
start := time.Now()
for time.Now().Sub(start) < 10*time.Second {
if time.Now().Sub(start) > 100*time.Millisecond && atomic.LoadUint32(b.tidySecretIDCASGuard) == 0 {
_, err = b.tidySecretID(context.Background(), &logical.Request{
Storage: storage,
})
@@ -137,23 +131,50 @@ func TestAppRole_TidyDanglingAccessors_RaceTest(t *testing.T) {
t.Fatalf("err:%v resp:%#v", err, resp)
}
}()
count++
}
t.Logf("wrote %d entries", count)
wg.Wait()
// Let tidy finish
time.Sleep(1 * time.Second)
// Run tidy again
_, err = b.tidySecretID(context.Background(), &logical.Request{
Storage: storage,
})
entry, err := logical.StorageEntryJSON(
fmt.Sprintf("accessor/invalid%d", count),
&secretIDAccessorStorageEntry{
SecretIDHMAC: "samplesecretidhmac",
},
)
err = storage.Put(context.Background(), entry)
if err != nil {
t.Fatal(err)
}
time.Sleep(2 * time.Second)
count++
time.Sleep(100 * time.Microsecond)
}
logger := b.Logger().Named(t.Name())
logger.Info("wrote entries", "count", count)
wg.Wait()
// Let tidy finish
for atomic.LoadUint32(b.tidySecretIDCASGuard) != 0 {
time.Sleep(100 * time.Millisecond)
}
logger.Info("running tidy again")
// Run tidy again
secret, err := b.tidySecretID(context.Background(), &logical.Request{
Storage: storage,
})
if err != nil || len(secret.Warnings) > 0 {
t.Fatal(err, secret.Warnings)
}
// Wait for tidy to start
for atomic.LoadUint32(b.tidySecretIDCASGuard) == 0 {
time.Sleep(100 * time.Millisecond)
}
// Let tidy finish
for atomic.LoadUint32(b.tidySecretIDCASGuard) != 0 {
time.Sleep(100 * time.Millisecond)
}
accessorHashes, err := storage.List(context.Background(), "accessor/")
if err != nil {