Add ability to perform automatic tidy operations (#16900)

* Add ability to perform automatic tidy operations

This enables the PKI secrets engine to allow tidy to be started
periodically by the engine itself, avoiding the need for interaction.
This operation is disabled by default (to avoid load on clusters which
don't need tidy to be run) but can be enabled.

In particular, a default tidy configuration is written (via
/config/auto-tidy) which mirrors the options passed to /tidy. Two
additional parameters, enabled and interval, are accepted, allowing
auto-tidy to be enabled or disabled and controlling the interval
(between successful tidy runs) to attempt auto-tidy.

Notably, a manual execution of tidy will delay additional auto-tidy
operations. Status is reported via the existing /tidy-status endpoint.

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add changelog entry

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add documentation on auto-tidy

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add tests for auto-tidy

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Prevent race during parallel testing

We modified the RollbackManager's execution window to allow more
faithful testing of the periodicFunc. However, the TestAutoRebuild and
the new TestAutoTidy would then race against each other for modifying
the period and creating their clusters (before resetting to the old
value).

This changeset adds a lock around this, preventing the races.

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Use tidyStatusLock to gate lastTidy time

This prevents a data race between the periodic func and the execution of
the running tidy.

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

* Add read lock around tidyStatus gauges

When reading from tidyStatus for computing gauges, since the underlying
values aren't atomics, we really should be gating these with a read lock
around the status access.

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>

Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>
This commit is contained in:
Alexander Scheel
2022-08-30 15:45:54 -04:00
committed by GitHub
parent e75d2dcb71
commit f0c318e4e7
10 changed files with 542 additions and 68 deletions

View File

@@ -132,6 +132,7 @@ func Backend(conf *logical.BackendConfig) *backend {
pathRevokeWithKey(&b),
pathTidy(&b),
pathTidyStatus(&b),
pathConfigAutoTidy(&b),
// Issuer APIs
pathListIssuers(&b),
@@ -191,6 +192,9 @@ func Backend(conf *logical.BackendConfig) *backend {
b.crlBuilder = newCRLBuilder()
// Delay the first tidy until after we've started up.
b.lastTidy = time.Now()
return &b
}
@@ -204,6 +208,7 @@ type backend struct {
tidyStatusLock sync.RWMutex
tidyStatus *tidyStatus
lastTidy time.Time
pkiStorageVersion atomic.Value
crlBuilder *crlBuilder
@@ -405,34 +410,105 @@ func (b *backend) invalidate(ctx context.Context, key string) {
}
func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) error {
// First attempt to reload the CRL configuration.
sc := b.makeStorageContext(ctx, request.Storage)
if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil {
return err
}
// As we're (below) modifying the backing storage, we need to ensure
// we're not on a standby/secondary node.
if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
doCRL := func() error {
// First attempt to reload the CRL configuration.
if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil {
return err
}
// As we're (below) modifying the backing storage, we need to ensure
// we're not on a standby/secondary node.
if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
return nil
}
// Check if we're set to auto rebuild and a CRL is set to expire.
if err := b.crlBuilder.checkForAutoRebuild(sc); err != nil {
return err
}
// Then attempt to rebuild the CRLs if required.
if err := b.crlBuilder.rebuildIfForced(ctx, b, request); err != nil {
return err
}
// If a delta CRL was rebuilt above as part of the complete CRL rebuild,
// this will be a no-op. However, if we do need to rebuild delta CRLs,
// this would cause us to do so.
if err := b.crlBuilder.rebuildDeltaCRLsIfForced(sc); err != nil {
return err
}
return nil
}
// Check if we're set to auto rebuild and a CRL is set to expire.
if err := b.crlBuilder.checkForAutoRebuild(sc); err != nil {
return err
doAutoTidy := func() error {
// As we're (below) modifying the backing storage, we need to ensure
// we're not on a standby/secondary node.
if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
return nil
}
config, err := sc.getAutoTidyConfig()
if err != nil {
return err
}
if !config.Enabled || config.Interval <= 0*time.Second {
return nil
}
// Check if we should run another tidy...
now := time.Now()
b.tidyStatusLock.RLock()
nextOp := b.lastTidy.Add(config.Interval)
b.tidyStatusLock.RUnlock()
if now.Before(nextOp) {
return nil
}
// Ensure a tidy isn't already running... If it is, we'll trigger
// again when the running one finishes.
if !atomic.CompareAndSwapUint32(b.tidyCASGuard, 0, 1) {
return nil
}
// Prevent ourselves from starting another tidy operation while
// this one is still running. This operation runs in the background
// and has a separate error reporting mechanism.
b.tidyStatusLock.Lock()
b.lastTidy = now
b.tidyStatusLock.Unlock()
// Because the request from the parent storage will be cleared at
// some point (and potentially reused) -- due to tidy executing in
// a background goroutine -- we need to copy the storage entry off
// of the backend instead.
backendReq := &logical.Request{
Storage: b.storage,
}
b.startTidyOperation(backendReq, config)
return nil
}
// Then attempt to rebuild the CRLs if required.
if err := b.crlBuilder.rebuildIfForced(ctx, b, request); err != nil {
return err
crlErr := doCRL()
tidyErr := doAutoTidy()
if crlErr != nil && tidyErr != nil {
return fmt.Errorf("Error building CRLs:\n - %v\n\nError running auto-tidy:\n - %v\n", crlErr, tidyErr)
}
// If a delta CRL was rebuilt above as part of the complete CRL rebuild,
// this will be a no-op. However, if we do need to rebuild delta CRLs,
// this would cause us to do so.
if err := b.crlBuilder.rebuildDeltaCRLsIfForced(sc); err != nil {
return err
if crlErr != nil {
return fmt.Errorf("Error building CRLs:\n - %v\n", crlErr)
}
if tidyErr != nil {
return fmt.Errorf("Error running auto-tidy:\n - %v\n", tidyErr)
}
// Check if the CRL was invalidated due to issuer swap and update