backport of commit dbfaa6f81a (#23286)

Co-authored-by: Steven Clark <steven.clark@hashicorp.com>
hc-github-team-secure-vault-core committed 2023-09-26 14:18:03 -04:00 (committed by GitHub)
parent b4d07277a6
commit 0943b82368
4 changed files with 51 additions and 14 deletions


@@ -10,6 +10,7 @@ import (
 	"sync"
 	"time"
+	"github.com/hashicorp/vault/sdk/helper/consts"
 	"github.com/hashicorp/vault/sdk/logical"
 )
@@ -42,6 +43,7 @@ type ChallengeValidation struct {
 type ChallengeQueueEntry struct {
 	Identifier string
 	RetryAfter time.Time
+	NumRetries int // Track if we are spinning on a corrupted challenge
 }

 type ACMEChallengeEngine struct {
@@ -97,7 +99,7 @@ func (ace *ACMEChallengeEngine) Run(b *backend, state *acmeState, sc *storageCon
 		b.Logger().Error("failed loading existing ACME challenge validations:", "err", err)
 	}

-	for true {
+	for {
 		// err == nil on shutdown.
 		b.Logger().Debug("Starting ACME challenge validation engine")
 		err := ace._run(b, state)
@@ -119,7 +121,7 @@ func (ace *ACMEChallengeEngine) _run(b *backend, state *acmeState) error {
 	// We want at most a certain number of workers operating to verify
 	// challenges.
 	var finishedWorkersChannels []chan bool
-	for true {
+	for {
 		// Wait until we've got more work to do.
 		select {
 		case <-ace.Closing:
@@ -201,12 +203,17 @@ func (ace *ACMEChallengeEngine) _run(b *backend, state *acmeState) error {
 			// looping through the queue until we hit a repeat.
 			firstIdentifier = ""

+			// If we are no longer the active node, break out
+			if b.System().ReplicationState().HasState(consts.ReplicationDRSecondary | consts.ReplicationPerformanceStandby) {
+				break
+			}
+
 			// Here, we got a piece of work that is ready to check; create a
 			// channel and a new go routine and run it. Note that this still
 			// could have a RetryAfter date we're not aware of (e.g., if the
 			// cluster restarted as we do not read the entries there).
 			channel := make(chan bool, 1)
-			go ace.VerifyChallenge(runnerSC, task.Identifier, channel, config)
+			go ace.VerifyChallenge(runnerSC, task.Identifier, task.NumRetries, channel, config)
 			finishedWorkersChannels = append(finishedWorkersChannels, channel)
 			startedWork = true
 		}
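
For context, the dispatch loop above hands each verification its own buffered completion channel and keeps the channels around to track outstanding workers. A minimal, self-contained sketch of that fan-out pattern (task names are illustrative, not the engine's real types):

```go
package main

import "fmt"

func main() {
	tasks := []string{"id-1", "id-2", "id-3"}
	var finishedWorkers []chan bool

	for _, id := range tasks {
		// Buffered with capacity 1 so a worker can report completion
		// and exit even if the dispatcher is not receiving yet.
		done := make(chan bool, 1)
		go func(id string, done chan bool) {
			fmt.Println("verified", id) // stand-in for VerifyChallenge
			done <- true
		}(id, done)
		finishedWorkers = append(finishedWorkers, done)
	}

	// The real engine polls these channels to cap concurrent workers;
	// here we simply drain them.
	for _, done := range finishedWorkers {
		<-done
	}
}
```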
@@ -305,8 +312,9 @@ func (ace *ACMEChallengeEngine) AcceptChallenge(sc *storageContext, account stri
 	return nil
 }

-func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id string, finished chan bool, config *acmeConfigEntry) {
-	sc, _ /* cancel func */ := runnerSc.WithFreshTimeout(MaxChallengeTimeout)
+func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id string, validationQueueRetries int, finished chan bool, config *acmeConfigEntry) {
+	sc, cancel := runnerSc.WithFreshTimeout(MaxChallengeTimeout)
+	defer cancel()
 	runnerSc.Backend.Logger().Debug("Starting verification of challenge", "id", id)

 	if retry, retryAfter, err := ace._verifyChallenge(sc, id, config); err != nil {
@@ -316,11 +324,28 @@ func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id str
 		sc.Backend.Logger().Error(fmt.Sprintf("ACME validation failed for %v: %v", id, err))

 		if retry {
+			validationQueueRetries++
+
+			// The retry logic within _verifyChallenge is dependent on being able to read and decode
+			// the ACME challenge entries. If we encounter such failures we would retry forever, so
+			// we have a secondary check here to see if we are consistently looping within the
+			// validation queue more times than the normal retry attempts we would allow.
+			if validationQueueRetries > MaxRetryAttempts*2 {
+				sc.Backend.Logger().Warn("reached max error attempts within challenge queue: %v, giving up", id)
+				_, _, err = ace._verifyChallengeCleanup(sc, nil, id)
+				if err != nil {
+					sc.Backend.Logger().Warn("Failed cleaning up challenge entry: %v", err)
+				}
+				finished <- true
+				return
+			}
+
 			ace.ValidationLock.Lock()
 			defer ace.ValidationLock.Unlock()
 			ace.Validations.PushBack(&ChallengeQueueEntry{
 				Identifier: id,
 				RetryAfter: retryAfter,
+				NumRetries: validationQueueRetries,
 			})

 			// Let the validator know there's a pending challenge.
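
The NumRetries counter added here travels with the queue entry itself, so the cap survives requeueing even when the stored challenge can no longer be read or decoded. A rough sketch of that bounded-requeue idea, using a container/list queue like the engine's Validations list (names such as requeueOrDrop and maxRetryAttempts are illustrative, not Vault's API):

```go
package main

import (
	"container/list"
	"fmt"
)

// queueEntry mirrors the ChallengeQueueEntry idea: each work item carries
// its own retry count, so the budget is enforced across requeues.
type queueEntry struct {
	id         string
	numRetries int
}

const maxRetryAttempts = 5 // stand-in for the engine's MaxRetryAttempts

// requeueOrDrop pushes the entry back for another attempt unless the
// counter shows we have been spinning on it; it reports whether the
// entry was requeued.
func requeueOrDrop(queue *list.List, e queueEntry) bool {
	e.numRetries++
	// Like the patch, allow twice the normal retry budget before
	// concluding the entry itself is unreadable and giving up.
	if e.numRetries > maxRetryAttempts*2 {
		fmt.Printf("giving up on %s after %d attempts\n", e.id, e.numRetries)
		return false
	}
	queue.PushBack(e)
	return true
}

func main() {
	queue := list.New()
	e := queueEntry{id: "challenge-1"}
	for requeueOrDrop(queue, e) {
		e = queue.Remove(queue.Front()).(queueEntry)
	}
}
```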
@@ -343,23 +368,23 @@ func (ace *ACMEChallengeEngine) VerifyChallenge(runnerSc *storageContext, id str
 func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string, config *acmeConfigEntry) (bool, time.Time, error) {
 	now := time.Now()
+	backoffTime := now.Add(1 * time.Second)
 	path := acmeValidationPrefix + id
 	challengeEntry, err := sc.Storage.Get(sc.Context, path)
 	if err != nil {
-		return true, now, fmt.Errorf("error loading challenge %v: %w", id, err)
+		return true, backoffTime, fmt.Errorf("error loading challenge %v: %w", id, err)
 	}

 	if challengeEntry == nil {
 		// Something must've successfully cleaned up our storage entry from
 		// under us. Assume we don't need to rerun, else the client will
 		// trigger us to re-run.
-		err = nil
-		return ace._verifyChallengeCleanup(sc, err, id)
+		return ace._verifyChallengeCleanup(sc, nil, id)
 	}

 	var cv *ChallengeValidation
 	if err := challengeEntry.DecodeJSON(&cv); err != nil {
-		return true, now, fmt.Errorf("error decoding challenge %v: %w", id, err)
+		return true, backoffTime, fmt.Errorf("error decoding challenge %v: %w", id, err)
 	}

 	if now.Before(cv.RetryAfter) {
if now.Before(cv.RetryAfter) { if now.Before(cv.RetryAfter) {
@@ -369,7 +394,7 @@ func (ace *ACMEChallengeEngine) _verifyChallenge(sc *storageContext, id string,
 	authzPath := getAuthorizationPath(cv.Account, cv.Authorization)
 	authz, err := loadAuthorizationAtPath(sc, authzPath)
 	if err != nil {
-		return true, now, fmt.Errorf("error loading authorization %v/%v for challenge %v: %w", cv.Account, cv.Authorization, id, err)
+		return true, backoffTime, fmt.Errorf("error loading authorization %v/%v for challenge %v: %w", cv.Account, cv.Authorization, id, err)
 	}

 	if authz.Status != ACMEAuthorizationPending {
@@ -527,7 +552,7 @@ func (ace *ACMEChallengeEngine) _verifyChallengeCleanup(sc *storageContext, err
 	// Remove our ChallengeValidation entry only.
 	if deleteErr := sc.Storage.Delete(sc.Context, acmeValidationPrefix+id); deleteErr != nil {
-		return true, now.Add(-1 * time.Second), fmt.Errorf("error deleting challenge %v (error prior to cleanup, if any: %v): %w", id, err, deleteErr)
+		return true, now.Add(1 * time.Second), fmt.Errorf("error deleting challenge %v (error prior to cleanup, if any: %v): %w", id, err, deleteErr)
 	}

 	if err != nil {
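
Note the pattern behind these return-value changes: error paths now hand back now.Add(1 * time.Second) (the new backoffTime) rather than now or a timestamp in the past, because a RetryAfter that is not in the future leaves the entry immediately eligible and the queue spins hot on a bad entry. A tiny illustration of that eligibility check (eligible is a hypothetical helper, not from the engine):

```go
package main

import (
	"fmt"
	"time"
)

// eligible mirrors the queue's scheduling test: an entry may run again
// only once its RetryAfter time is no longer in the future.
func eligible(retryAfter, now time.Time) bool {
	return !now.Before(retryAfter)
}

func main() {
	now := time.Now()
	fmt.Println(eligible(now, now))                    // true: retried at once, hot loop
	fmt.Println(eligible(now.Add(1*time.Second), now)) // false: brief backoff first
}
```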


@@ -18,6 +18,7 @@ import (
 	"github.com/hashicorp/go-secure-stdlib/nonceutil"
 	"github.com/hashicorp/vault/sdk/framework"
+	"github.com/hashicorp/vault/sdk/helper/consts"
 	"github.com/hashicorp/vault/sdk/logical"
 )
@@ -76,6 +77,13 @@ func (a *acmeState) Initialize(b *backend, sc *storageContext) error {
 		return fmt.Errorf("error initializing ACME engine: %w", err)
 	}

+	if b.System().ReplicationState().HasState(consts.ReplicationDRSecondary | consts.ReplicationPerformanceStandby) {
+		// It is assumed that if the node does become the active node later,
+		// the plugin is re-initialized, so this is safe. It also spares the node
+		// from loading the existing queue into memory for no reason.
+		b.Logger().Debug("Not on an active node, skipping starting ACME challenge validation engine")
+		return nil
+	}
+
 	// Kick off our ACME challenge validation engine.
 	go a.validator.Run(b, a, sc)
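
The same replication-state check appears in both files: Initialize skips starting the validator goroutine on standby and DR-secondary nodes, while _run breaks out if the node loses active status mid-flight. A sketch of that shared gating logic against the plugin SDK's SystemView (startIfActive is an illustrative wrapper, not part of the patch):

```go
package pki

import (
	"github.com/hashicorp/vault/sdk/helper/consts"
	"github.com/hashicorp/vault/sdk/logical"
)

// startIfActive starts a background engine only when this node is the
// active one. DR secondaries and performance standbys return early; if
// the node is later promoted, plugin re-initialization starts the engine.
func startIfActive(sys logical.SystemView, run func()) {
	if sys.ReplicationState().HasState(
		consts.ReplicationDRSecondary | consts.ReplicationPerformanceStandby) {
		return
	}
	go run()
}
```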


@@ -64,9 +64,10 @@ func (ts *TestServer) setupRunner(domain string, network string) {
 		ContainerName: "bind9-dns-" + strings.ReplaceAll(domain, ".", "-"),
 		NetworkName:   network,
 		Ports:         []string{"53/udp"},
-		LogConsumer: func(s string) {
-			ts.log.Info(s)
-		},
+		// DNS container logging was disabled to reduce content within CI logs.
+		//LogConsumer: func(s string) {
+		//	ts.log.Info(s)
+		//},
 	})
 	require.NoError(ts.t, err)
 }

changelog/23278.txt (new file, +3)

@@ -0,0 +1,3 @@
+```release-note:bug
+secrets/pki: Stop processing in-flight ACME verifications when an active node steps down
+```