diff --git a/changelog/25497.txt b/changelog/25497.txt new file mode 100644 index 0000000000..d4bfe8fb2d --- /dev/null +++ b/changelog/25497.txt @@ -0,0 +1,3 @@ +```release-note:bug +agent: Fixes a high Vault load issue, by restarting the Consul template server after backing off instead of immediately. +``` diff --git a/command/agent/exec/exec.go b/command/agent/exec/exec.go index d6da655453..1100878686 100644 --- a/command/agent/exec/exec.go +++ b/command/agent/exec/exec.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "io" + "math" "os" "sort" "sync" @@ -19,6 +20,8 @@ import ( "github.com/hashicorp/vault/command/agent/config" "github.com/hashicorp/vault/command/agent/internal/ctmanager" "github.com/hashicorp/vault/helper/useragent" + "github.com/hashicorp/vault/sdk/helper/backoff" + "github.com/hashicorp/vault/sdk/helper/consts" "github.com/hashicorp/vault/sdk/helper/pointerutil" "golang.org/x/exp/slices" ) @@ -167,6 +170,10 @@ func (s *Server) Run(ctx context.Context, incomingVaultToken chan string) error // capture the errors related to restarting the child process restartChildProcessErrCh := make(chan error) + // create exponential backoff object to calculate backoff time before restarting a failed + // consul template server + restartBackoff := backoff.NewBackoff(math.MaxInt, consts.DefaultMinBackoff, consts.DefaultMaxBackoff) + for { select { case <-ctx.Done(): @@ -216,6 +223,17 @@ func (s *Server) Run(ctx context.Context, incomingVaultToken chan string) error return fmt.Errorf("template server: %w", err) } + // Calculate the amount of time to backoff using exponential backoff + sleep, err := restartBackoff.Next() + if err != nil { + s.logger.Error("template server: reached maximum number restart attempts") + restartBackoff.Reset() + } + + // Sleep for the calculated backoff time then attempt to create a new runner + s.logger.Warn(fmt.Sprintf("template server restart: retry attempt after %s", sleep)) + time.Sleep(sleep) + s.runner, err = 
manager.NewRunner(runnerConfig, true) if err != nil { return fmt.Errorf("template server failed to create: %w", err) diff --git a/command/agent/template/template.go b/command/agent/template/template.go index cc03b7e96f..fe88c7332e 100644 --- a/command/agent/template/template.go +++ b/command/agent/template/template.go @@ -13,6 +13,8 @@ import ( "errors" "fmt" "io" + "math" + "time" ctconfig "github.com/hashicorp/consul-template/config" "github.com/hashicorp/consul-template/manager" @@ -20,6 +22,8 @@ import ( "github.com/hashicorp/vault/command/agent/config" "github.com/hashicorp/vault/command/agent/internal/ctmanager" "github.com/hashicorp/vault/helper/useragent" + "github.com/hashicorp/vault/sdk/helper/backoff" + "github.com/hashicorp/vault/sdk/helper/consts" "github.com/hashicorp/vault/sdk/helper/pointerutil" "go.uber.org/atomic" ) @@ -143,6 +147,10 @@ func (ts *Server) Run(ctx context.Context, incoming chan string, templates []*ct } ts.lookupMap = lookupMap + // Create backoff object to calculate backoff time before restarting a failed + // consul template server + restartBackoff := backoff.NewBackoff(math.MaxInt, consts.DefaultMinBackoff, consts.DefaultMaxBackoff) + for { select { case <-ctx.Done(): @@ -191,6 +199,17 @@ func (ts *Server) Run(ctx context.Context, incoming chan string, templates []*ct return fmt.Errorf("template server: %w", err) } + // Calculate the amount of time to backoff using exponential backoff + sleep, err := restartBackoff.Next() + if err != nil { + ts.logger.Error("template server: reached maximum number of restart attempts") + restartBackoff.Reset() + } + + // Sleep for the calculated backoff time then attempt to create a new runner + ts.logger.Warn(fmt.Sprintf("template server restart: retry attempt after %s", sleep)) + time.Sleep(sleep) + ts.runner, err = manager.NewRunner(runnerConfig, false) if err != nil { return fmt.Errorf("template server failed to create: %w", err) diff --git a/command/agentproxyshared/auth/auth.go 
b/command/agentproxyshared/auth/auth.go index 9b880c0e6d..7c22f36d58 100644 --- a/command/agentproxyshared/auth/auth.go +++ b/command/agentproxyshared/auth/auth.go @@ -16,14 +16,10 @@ import ( "github.com/hashicorp/go-hclog" "github.com/hashicorp/vault/api" "github.com/hashicorp/vault/sdk/helper/backoff" + "github.com/hashicorp/vault/sdk/helper/consts" "github.com/hashicorp/vault/sdk/helper/jsonutil" ) -const ( - defaultMinBackoff = 1 * time.Second - defaultMaxBackoff = 5 * time.Minute -) - // AuthMethod is the interface that auto-auth methods implement for the agent/proxy // to use. type AuthMethod interface { @@ -132,10 +128,10 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { } if ah.minBackoff <= 0 { - ah.minBackoff = defaultMinBackoff + ah.minBackoff = consts.DefaultMinBackoff } if ah.maxBackoff <= 0 { - ah.maxBackoff = defaultMaxBackoff + ah.maxBackoff = consts.DefaultMaxBackoff } if ah.minBackoff > ah.maxBackoff { return errors.New("auth handler: min_backoff cannot be greater than max_backoff") @@ -510,11 +506,11 @@ type autoAuthBackoff struct { func newAutoAuthBackoff(min, max time.Duration, exitErr bool) *autoAuthBackoff { if max <= 0 { - max = defaultMaxBackoff + max = consts.DefaultMaxBackoff } if min <= 0 { - min = defaultMinBackoff + min = consts.DefaultMinBackoff } retries := math.MaxInt diff --git a/command/agentproxyshared/auth/auth_test.go b/command/agentproxyshared/auth/auth_test.go index 4ecfff03f5..b866b317dc 100644 --- a/command/agentproxyshared/auth/auth_test.go +++ b/command/agentproxyshared/auth/auth_test.go @@ -13,6 +13,7 @@ import ( "github.com/hashicorp/vault/api" "github.com/hashicorp/vault/builtin/credential/userpass" vaulthttp "github.com/hashicorp/vault/http" + "github.com/hashicorp/vault/sdk/helper/consts" "github.com/hashicorp/vault/sdk/helper/logging" "github.com/hashicorp/vault/sdk/logical" "github.com/hashicorp/vault/vault" @@ -110,10 +111,10 @@ consumption: func TestAgentBackoff(t *testing.T) { max := 
1024 * time.Second - backoff := newAutoAuthBackoff(defaultMinBackoff, max, false) + backoff := newAutoAuthBackoff(consts.DefaultMinBackoff, max, false) // Test initial value - if backoff.backoff.Current() > defaultMinBackoff || backoff.backoff.Current() < defaultMinBackoff*3/4 { + if backoff.backoff.Current() > consts.DefaultMinBackoff || backoff.backoff.Current() < consts.DefaultMinBackoff*3/4 { t.Fatalf("expected 1s initial backoff, got: %v", backoff.backoff.Current()) } @@ -141,7 +142,7 @@ func TestAgentBackoff(t *testing.T) { // Test reset backoff.backoff.Reset() - if backoff.backoff.Current() > defaultMinBackoff || backoff.backoff.Current() < defaultMinBackoff*3/4 { + if backoff.backoff.Current() > consts.DefaultMinBackoff || backoff.backoff.Current() < consts.DefaultMinBackoff*3/4 { t.Fatalf("expected 1s backoff after reset, got: %v", backoff.backoff.Current()) } } diff --git a/sdk/helper/consts/agent.go b/sdk/helper/consts/agent.go index 53b8b8e2e7..ff2b043749 100644 --- a/sdk/helper/consts/agent.go +++ b/sdk/helper/consts/agent.go @@ -3,6 +3,8 @@ package consts +import "time" + // AgentPathCacheClear is the path that the agent will use as its cache-clear // endpoint. const AgentPathCacheClear = "/agent/v1/cache-clear" @@ -13,3 +15,9 @@ const AgentPathMetrics = "/agent/v1/metrics" // AgentPathQuit is the path that the agent will use to trigger stopping it. const AgentPathQuit = "/agent/v1/quit" + +// DefaultMinBackoff is the default minimum backoff time for agent and proxy +const DefaultMinBackoff = 1 * time.Second + +// DefaultMaxBackoff is the default max backoff time for agent and proxy +const DefaultMaxBackoff = 5 * time.Minute