Add ability to perform automatic tidy operations (#16900)

* Add ability to perform automatic tidy operations This enables the PKI secrets engine to allow tidy to be started periodically by the engine itself, avoiding the need for interaction. This operation is disabled by default (to avoid load on clusters which don't need tidy to be run) but can be enabled. In particular, a default tidy configuration is written (via /config/auto-tidy) which mirrors the options passed to /tidy. Two additional parameters, enabled and interval, are accepted, allowing auto-tidy to be enabled or disabled and controlling the interval (between successful tidy runs) to attempt auto-tidy. Notably, a manual execution of tidy will delay additional auto-tidy operations. Status is reported via the existing /tidy-status endpoint. Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Add changelog entry Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Add documentation on auto-tidy Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Add tests for auto-tidy Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Prevent race during parallel testing We modified the RollbackManager's execution window to allow more faithful testing of the periodicFunc. However, the TestAutoRebuild and the new TestAutoTidy would then race against each other for modifying the period and creating their clusters (before resetting to the old value). This changeset adds a lock around this, preventing the races. Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Use tidyStatusLock to gate lastTidy time This prevents a data race between the periodic func and the execution of the running tidy. Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Add read lock around tidyStatus gauges When reading from tidyStatus for computing gauges, since the underlying values aren't atomics, we really should be gating these with a read lock around the status access. Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>
2025-11-02 11:38:02 +00:00 · 2022-08-30 15:45:54 -04:00
parent e75d2dcb71
commit f0c318e4e7
10 changed files with 542 additions and 68 deletions
--- a/builtin/logical/pki/backend.go
+++ b/builtin/logical/pki/backend.go
@@ -132,6 +132,7 @@ func Backend(conf *logical.BackendConfig) *backend {
 			pathRevokeWithKey(&b),
 			pathTidy(&b),
 			pathTidyStatus(&b),
+			pathConfigAutoTidy(&b),

 			// Issuer APIs
 			pathListIssuers(&b),
@@ -191,6 +192,9 @@ func Backend(conf *logical.BackendConfig) *backend {

 	b.crlBuilder = newCRLBuilder()

+	// Delay the first tidy until after we've started up.
+	b.lastTidy = time.Now()
+
 	return &b
 }

@@ -204,6 +208,7 @@ type backend struct {

 	tidyStatusLock sync.RWMutex
 	tidyStatus     *tidyStatus
+	lastTidy       time.Time

 	pkiStorageVersion atomic.Value
 	crlBuilder        *crlBuilder
@@ -405,34 +410,105 @@ func (b *backend) invalidate(ctx context.Context, key string) {
 }

 func (b *backend) periodicFunc(ctx context.Context, request *logical.Request) error {
-	// First attempt to reload the CRL configuration.
 	sc := b.makeStorageContext(ctx, request.Storage)
-	if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil {
-		return err
-	}

-	// As we're (below) modifying the backing storage, we need to ensure
-	// we're not on a standby/secondary node.
-	if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
-		b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
+	doCRL := func() error {
+		// First attempt to reload the CRL configuration.
+		if err := b.crlBuilder.reloadConfigIfRequired(sc); err != nil {
+			return err
+		}
+
+		// As we're (below) modifying the backing storage, we need to ensure
+		// we're not on a standby/secondary node.
+		if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
+			b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
+			return nil
+		}
+
+		// Check if we're set to auto rebuild and a CRL is set to expire.
+		if err := b.crlBuilder.checkForAutoRebuild(sc); err != nil {
+			return err
+		}
+
+		// Then attempt to rebuild the CRLs if required.
+		if err := b.crlBuilder.rebuildIfForced(ctx, b, request); err != nil {
+			return err
+		}
+
+		// If a delta CRL was rebuilt above as part of the complete CRL rebuild,
+		// this will be a no-op. However, if we do need to rebuild delta CRLs,
+		// this would cause us to do so.
+		if err := b.crlBuilder.rebuildDeltaCRLsIfForced(sc); err != nil {
+			return err
+		}
+
 		return nil
 	}

-	// Check if we're set to auto rebuild and a CRL is set to expire.
-	if err := b.crlBuilder.checkForAutoRebuild(sc); err != nil {
-		return err
+	doAutoTidy := func() error {
+		// As we're (below) modifying the backing storage, we need to ensure
+		// we're not on a standby/secondary node.
+		if b.System().ReplicationState().HasState(consts.ReplicationPerformanceStandby) ||
+			b.System().ReplicationState().HasState(consts.ReplicationDRSecondary) {
+			return nil
+		}
+
+		config, err := sc.getAutoTidyConfig()
+		if err != nil {
+			return err
+		}
+
+		if !config.Enabled || config.Interval <= 0*time.Second {
+			return nil
+		}
+
+		// Check if we should run another tidy...
+		now := time.Now()
+		b.tidyStatusLock.RLock()
+		nextOp := b.lastTidy.Add(config.Interval)
+		b.tidyStatusLock.RUnlock()
+		if now.Before(nextOp) {
+			return nil
+		}
+
+		// Ensure a tidy isn't already running... If it is, we'll trigger
+		// again when the running one finishes.
+		if !atomic.CompareAndSwapUint32(b.tidyCASGuard, 0, 1) {
+			return nil
+		}
+
+		// Prevent ourselves from starting another tidy operation while
+		// this one is still running. This operation runs in the background
+		// and has a separate error reporting mechanism.
+		b.tidyStatusLock.Lock()
+		b.lastTidy = now
+		b.tidyStatusLock.Unlock()
+
+		// Because the request from the parent storage will be cleared at
+		// some point (and potentially reused) -- due to tidy executing in
+		// a background goroutine -- we need to copy the storage entry off
+		// of the backend instead.
+		backendReq := &logical.Request{
+			Storage: b.storage,
+		}
+
+		b.startTidyOperation(backendReq, config)
+		return nil
 	}

-	// Then attempt to rebuild the CRLs if required.
-	if err := b.crlBuilder.rebuildIfForced(ctx, b, request); err != nil {
-		return err
+	crlErr := doCRL()
+	tidyErr := doAutoTidy()
+
+	if crlErr != nil && tidyErr != nil {
+		return fmt.Errorf("Error building CRLs:\n - %v\n\nError running auto-tidy:\n - %v\n", crlErr, tidyErr)
 	}

-	// If a delta CRL was rebuilt above as part of the complete CRL rebuild,
-	// this will be a no-op. However, if we do need to rebuild delta CRLs,
-	// this would cause us to do so.
-	if err := b.crlBuilder.rebuildDeltaCRLsIfForced(sc); err != nil {
-		return err
+	if crlErr != nil {
+		return fmt.Errorf("Error building CRLs:\n - %v\n", crlErr)
+	}
+
+	if tidyErr != nil {
+		return fmt.Errorf("Error running auto-tidy:\n - %v\n", tidyErr)
 	}

 	// Check if the CRL was invalidated due to issuer swap and update