diff --git a/api/lifetime_watcher.go b/api/lifetime_watcher.go index 7070445cc0..4bc1390b93 100644 --- a/api/lifetime_watcher.go +++ b/api/lifetime_watcher.go @@ -6,6 +6,7 @@ package api import ( "errors" "math/rand" + "strings" "sync" "time" @@ -289,12 +290,18 @@ func (r *LifetimeWatcher) doRenewWithOptions(tokenMode bool, nonRenewable bool, switch { case nonRenewable || r.renewBehavior == RenewBehaviorRenewDisabled: // Can't or won't renew, just keep the same expiration so we exit - // when it's reauthentication time + // when it's re-authentication time remainingLeaseDuration = fallbackLeaseDuration default: // Renew the token renewal, err = renew(credString, r.increment) + if err != nil && strings.Contains(err.Error(), "permission denied") { + // We can't renew since the token doesn't have permission to. Fall back + // to the code path for non-renewable tokens. + nonRenewable = true + continue + } if err != nil || renewal == nil || (tokenMode && renewal.Auth == nil) { if r.renewBehavior == RenewBehaviorErrorOnErrors { if err != nil { diff --git a/api/renewer_test.go b/api/renewer_test.go index 7ba16e66ec..1c9a5d03e2 100644 --- a/api/renewer_test.go +++ b/api/renewer_test.go @@ -177,6 +177,20 @@ func TestLifetimeWatcher(t *testing.T) { expectError: nil, expectRenewal: true, }, + { + maxTestTime: time.Second, + name: "permission_denied_error", + leaseDurationSeconds: 60, + incrementSeconds: 10, + // This should cause the lifetime watcher to behave just + // like a non-renewable secret, i.e. wait until its lifetime + // then be done. + renew: func(_ string, _ int) (*Secret, error) { + return nil, fmt.Errorf("permission denied") + }, + expectError: nil, + expectRenewal: false, + }, } for _, tc := range cases { @@ -204,7 +218,9 @@ func TestLifetimeWatcher(t *testing.T) { for { select { case <-time.After(tc.maxTestTime): - t.Fatalf("renewal didn't happen") + if tc.expectRenewal || tc.expectError != nil { + t.Fatalf("expected error or renewal, and neither happened") + } case r := <-v.RenewCh(): if !tc.expectRenewal { t.Fatal("expected no renewals") diff --git a/changelog/26844.txt b/changelog/26844.txt new file mode 100644 index 0000000000..49f7bf2f16 --- /dev/null +++ b/changelog/26844.txt @@ -0,0 +1,3 @@ +```release-note:bug +auto-auth: Addressed issue where having no permissions to renew a renewable token caused auto-auth to attempt to renew constantly with no backoff +``` diff --git a/command/agent_test.go b/command/agent_test.go index 239128728a..235a8ede1a 100644 --- a/command/agent_test.go +++ b/command/agent_test.go @@ -3179,6 +3179,143 @@ vault { } } +// TestAgent_TokenRenewal tests that LifeTimeWatcher does not make +// many renewal attempts if the token's policy does not allow for it to renew +// itself. Prior to a bug fix in the PR that added this test, this would have resulted +// in hundreds of token renewal requests with no backoff. +func TestAgent_TokenRenewal(t *testing.T) { + logger := logging.NewVaultLogger(hclog.Trace) + cluster := vault.NewTestCluster(t, nil, &vault.TestClusterOptions{ + HandlerFunc: vaulthttp.Handler, + }) + cluster.Start() + defer cluster.Cleanup() + + serverClient := cluster.Cores[0].Client + + auditLogFileName := makeTempFile(t, "audit-log", "") + err := serverClient.Sys().EnableAuditWithOptions("file-audit-for-TestAgent_TokenRenewal", &api.EnableAuditOptions{ + Type: "file", + Options: map[string]string{ + "file_path": auditLogFileName, + }, + }) + require.NoError(t, err) + + // Unset the environment variable so that agent picks up the right test + // cluster address + defer os.Setenv(api.EnvVaultAddress, os.Getenv(api.EnvVaultAddress)) + os.Unsetenv(api.EnvVaultAddress) + + policyName := "less-than-default" + // Has a subset of the default policy's permissions + // Specifically removing renew-self. + err = serverClient.Sys().PutPolicy(policyName, ` +path "auth/token/lookup-self" { + capabilities = ["read"] +} + +# Allow tokens to revoke themselves +path "auth/token/revoke-self" { + capabilities = ["update"] +} + +# Allow a token to look up its own capabilities on a path +path "sys/capabilities-self" { + capabilities = ["update"] +} +`) + require.NoError(t, err) + + renewable := true + // Make the token renewable but give it no permissions + // (e.g. the permission to renew itself) + tokenCreateRequest := &api.TokenCreateRequest{ + Policies: []string{policyName}, + TTL: "10s", + Renewable: &renewable, + NoDefaultPolicy: true, + } + + secret, err := serverClient.Auth().Token().CreateOrphan(tokenCreateRequest) + require.NoError(t, err) + lowPermissionToken := secret.Auth.ClientToken + + tokenFileName := makeTempFile(t, "token-file", lowPermissionToken) + + sinkFileName := makeTempFile(t, "sink-file", "") + + autoAuthConfig := fmt.Sprintf(` +auto_auth { + method { + type = "token_file" + config = { + token_file_path = "%s" + } + } + + sink "file" { + config = { + path = "%s" + } + } +}`, tokenFileName, sinkFileName) + + config := fmt.Sprintf(` +vault { + address = "%s" + tls_skip_verify = true +} + +log_level = "trace" + +%s +`, serverClient.Address(), autoAuthConfig) + configPath := makeTempFile(t, "config.hcl", config) + + // Start the agent + ui, cmd := testAgentCommand(t, logger) + + cmd.startedCh = make(chan struct{}) + + wg := &sync.WaitGroup{} + wg.Add(1) + go func() { + cmd.Run([]string{"-config", configPath}) + wg.Done() + }() + + select { + case <-cmd.startedCh: + case <-time.After(5 * time.Second): + t.Errorf("timeout") + t.Errorf("stdout: %s", ui.OutputWriter.String()) + t.Errorf("stderr: %s", ui.ErrorWriter.String()) + } + + // Sleep, to allow the renewal/auth process to work and ensure that it doesn't + // go crazy with renewals. + time.Sleep(30 * time.Second) + + fileBytes, err := os.ReadFile(auditLogFileName) + require.NoError(t, err) + stringAudit := string(fileBytes) + + // This is a bit of an imperfect way to test things, but we want to make sure + // that a token like this doesn't keep triggering retries. + // Due to the fact this is an auto-auth specific thing, unit tests for the + // LifetimeWatcher wouldn't be sufficient here. + // Prior to the fix made in the same PR this test was added, it would trigger many, many + // retries (hundreds to thousands in less than a minute). + // We really want to make sure that doesn't happen. + numberOfRenewSelves := strings.Count(stringAudit, "auth/token/renew-self") + // We actually expect ~6, but I added some buffer for CI weirdness. It can also vary + // due to the grace added/removed from the sleep in LifetimeWatcher too. + if numberOfRenewSelves > 10 { + t.Fatalf("did too many renews -- Vault received %d renew-self requests", numberOfRenewSelves) + } +} + // TestAgent_Logging_ConsulTemplate attempts to ensure two things about Vault Agent logs: // 1. When -log-format command line arg is set to JSON, it is honored as the output format // for messages generated from within the consul-template library. diff --git a/command/agentproxyshared/auth/auth.go b/command/agentproxyshared/auth/auth.go index 49ae395c48..12c998da5a 100644 --- a/command/agentproxyshared/auth/auth.go +++ b/command/agentproxyshared/auth/auth.go @@ -7,6 +7,7 @@ import ( "context" "encoding/json" "errors" + "fmt" "math" "math/rand" "net/http" @@ -478,10 +479,9 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { } metrics.IncrCounter([]string{ah.metricsSignifier, "auth", "success"}, 1) - // We don't want to trigger the renewal process for tokens with - // unlimited TTL, such as the root token. - if leaseDuration == 0 && isTokenFileMethod { - ah.logger.Info("not starting token renewal process, as token has unlimited TTL") + // We don't want to trigger the renewal process for the root token + if isRootToken(leaseDuration, isTokenFileMethod, secret) { + ah.logger.Info("not starting token renewal process, as token is root token") } else { ah.logger.Info("starting renewal process") go watcher.Renew() @@ -496,11 +496,31 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { break LifetimeWatcherLoop case err := <-watcher.DoneCh(): - ah.logger.Info("lifetime watcher done channel triggered") + ah.logger.Info("lifetime watcher done channel triggered, re-authenticating") if err != nil { + ah.logger.Error("error renewing token", "error", err, "backoff", backoffCfg) metrics.IncrCounter([]string{ah.metricsSignifier, "auth", "failure"}, 1) - ah.logger.Error("error renewing token", "error", err) + + // Add some exponential backoff so that if auth is successful + // but the watcher errors, we won't go into an immediate + // aggressive retry loop. + // This might be quite a small sleep, since if we have a successful + // auth, we reset the backoff. Still, some backoff is important, and + // ensuring we follow the normal flow is important: + // auth -> try to renew + if !backoffSleep(ctx, backoffCfg) { + // We're at max retries. Return an error. + return fmt.Errorf("exceeded max retries failing to renew auth token") + } } + + // If the lease duration is 0, wait a second before re-authenticating + // so that we don't go into a loop, as the LifetimeWatcher will immediately + // return for tokens like this. + if leaseDuration == 0 { + time.Sleep(1 * time.Second) + } + break LifetimeWatcherLoop case <-watcher.RenewCh(): @@ -523,6 +543,24 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error { } } +// isRootToken checks if the secret in the argument is the root token +// This is determinable without leaseDuration and isTokenFileMethod, +// but those make it easier to rule out other tokens cheaply. +func isRootToken(leaseDuration int, isTokenFileMethod bool, secret *api.Secret) bool { + // This check is cheaper than the others, so we do this first. + if leaseDuration == 0 && isTokenFileMethod && !secret.Renewable { + if secret != nil { + policies, err := secret.TokenPolicies() + if err == nil { + if len(policies) == 1 && policies[0] == "root" { + return true + } + } + } + } + return false +} + // autoAuthBackoff tracks exponential backoff state. type autoAuthBackoff struct { backoff *backoff.Backoff