VAULT-25341 Address issue where having no permissions to renew caused auto-auth to attempt to renew with no backoff (#26844)

* VAULT-25341 Address issue where having no permissions to renew caused Agent and Proxy auth to attempt to renew with no backoff

* Fiddle with go.mod changes that shouldn't have happened

* VAULT-25341 small cleanup and extra test

* VAULT-25341 backoff only in error case

* VAULT-25341 godocs

* VAULT-25342 changelog

* Update command/agent_test.go

Co-authored-by: divyaac <divya.chandrasekaran@hashicorp.com>

* VAULT-25341 rename file audit

---------

Co-authored-by: divyaac <divya.chandrasekaran@hashicorp.com>
This commit is contained in:
Violet Hynes
2024-05-09 11:12:42 -04:00
committed by GitHub
parent 84d734d673
commit b16b94a72a
5 changed files with 209 additions and 8 deletions

View File

@@ -6,6 +6,7 @@ package api
import (
"errors"
"math/rand"
"strings"
"sync"
"time"
@@ -289,12 +290,18 @@ func (r *LifetimeWatcher) doRenewWithOptions(tokenMode bool, nonRenewable bool,
switch {
case nonRenewable || r.renewBehavior == RenewBehaviorRenewDisabled:
// Can't or won't renew, just keep the same expiration so we exit
// when it's reauthentication time
// when it's re-authentication time
remainingLeaseDuration = fallbackLeaseDuration
default:
// Renew the token
renewal, err = renew(credString, r.increment)
if err != nil && strings.Contains(err.Error(), "permission denied") {
// We can't renew since the token doesn't have permission to. Fall back
// to the code path for non-renewable tokens.
nonRenewable = true
continue
}
if err != nil || renewal == nil || (tokenMode && renewal.Auth == nil) {
if r.renewBehavior == RenewBehaviorErrorOnErrors {
if err != nil {

View File

@@ -177,6 +177,20 @@ func TestLifetimeWatcher(t *testing.T) {
expectError: nil,
expectRenewal: true,
},
{
maxTestTime: time.Second,
name: "permission_denied_error",
leaseDurationSeconds: 60,
incrementSeconds: 10,
// This should cause the lifetime watcher to behave just
// like a non-renewable secret, i.e. wait until its lifetime
// then be done.
renew: func(_ string, _ int) (*Secret, error) {
return nil, fmt.Errorf("permission denied")
},
expectError: nil,
expectRenewal: false,
},
}
for _, tc := range cases {
@@ -204,7 +218,9 @@ func TestLifetimeWatcher(t *testing.T) {
for {
select {
case <-time.After(tc.maxTestTime):
t.Fatalf("renewal didn't happen")
if tc.expectRenewal || tc.expectError != nil {
t.Fatalf("expected error or renewal, and neither happened")
}
case r := <-v.RenewCh():
if !tc.expectRenewal {
t.Fatal("expected no renewals")

3
changelog/26844.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
auto-auth: Addressed issue where having no permissions to renew a renewable token caused auto-auth to attempt to renew constantly with no backoff
```

View File

@@ -3179,6 +3179,143 @@ vault {
}
}
// TestAgent_TokenRenewal tests that LifeTimeWatcher does not make
// many renewal attempts if the token's policy does not allow for it to renew
// itself. Prior to a bug fix in the PR that added this test, this would have resulted
// in hundreds of token renewal requests with no backoff.
func TestAgent_TokenRenewal(t *testing.T) {
logger := logging.NewVaultLogger(hclog.Trace)
cluster := vault.NewTestCluster(t, nil, &vault.TestClusterOptions{
HandlerFunc: vaulthttp.Handler,
})
cluster.Start()
defer cluster.Cleanup()
serverClient := cluster.Cores[0].Client
auditLogFileName := makeTempFile(t, "audit-log", "")
err := serverClient.Sys().EnableAuditWithOptions("file-audit-for-TestAgent_TokenRenewal", &api.EnableAuditOptions{
Type: "file",
Options: map[string]string{
"file_path": auditLogFileName,
},
})
require.NoError(t, err)
// Unset the environment variable so that agent picks up the right test
// cluster address
defer os.Setenv(api.EnvVaultAddress, os.Getenv(api.EnvVaultAddress))
os.Unsetenv(api.EnvVaultAddress)
policyName := "less-than-default"
// Has a subset of the default policy's permissions
// Specifically removing renew-self.
err = serverClient.Sys().PutPolicy(policyName, `
path "auth/token/lookup-self" {
capabilities = ["read"]
}
# Allow tokens to revoke themselves
path "auth/token/revoke-self" {
capabilities = ["update"]
}
# Allow a token to look up its own capabilities on a path
path "sys/capabilities-self" {
capabilities = ["update"]
}
`)
require.NoError(t, err)
renewable := true
// Make the token renewable but give it no permissions
// (e.g. the permission to renew itself)
tokenCreateRequest := &api.TokenCreateRequest{
Policies: []string{policyName},
TTL: "10s",
Renewable: &renewable,
NoDefaultPolicy: true,
}
secret, err := serverClient.Auth().Token().CreateOrphan(tokenCreateRequest)
require.NoError(t, err)
lowPermissionToken := secret.Auth.ClientToken
tokenFileName := makeTempFile(t, "token-file", lowPermissionToken)
sinkFileName := makeTempFile(t, "sink-file", "")
autoAuthConfig := fmt.Sprintf(`
auto_auth {
method {
type = "token_file"
config = {
token_file_path = "%s"
}
}
sink "file" {
config = {
path = "%s"
}
}
}`, tokenFileName, sinkFileName)
config := fmt.Sprintf(`
vault {
address = "%s"
tls_skip_verify = true
}
log_level = "trace"
%s
`, serverClient.Address(), autoAuthConfig)
configPath := makeTempFile(t, "config.hcl", config)
// Start the agent
ui, cmd := testAgentCommand(t, logger)
cmd.startedCh = make(chan struct{})
wg := &sync.WaitGroup{}
wg.Add(1)
go func() {
cmd.Run([]string{"-config", configPath})
wg.Done()
}()
select {
case <-cmd.startedCh:
case <-time.After(5 * time.Second):
t.Errorf("timeout")
t.Errorf("stdout: %s", ui.OutputWriter.String())
t.Errorf("stderr: %s", ui.ErrorWriter.String())
}
// Sleep, to allow the renewal/auth process to work and ensure that it doesn't
// go crazy with renewals.
time.Sleep(30 * time.Second)
fileBytes, err := os.ReadFile(auditLogFileName)
require.NoError(t, err)
stringAudit := string(fileBytes)
// This is a bit of an imperfect way to test things, but we want to make sure
// that a token like this doesn't keep triggering retries.
// Due to the fact this is an auto-auth specific thing, unit tests for the
// LifetimeWatcher wouldn't be sufficient here.
// Prior to the fix made in the same PR this test was added, it would trigger many, many
// retries (hundreds to thousands in less than a minute).
// We really want to make sure that doesn't happen.
numberOfRenewSelves := strings.Count(stringAudit, "auth/token/renew-self")
// We actually expect ~6, but I added some buffer for CI weirdness. It can also vary
// due to the grace added/removed from the sleep in LifetimeWatcher too.
if numberOfRenewSelves > 10 {
t.Fatalf("did too many renews -- Vault received %d renew-self requests", numberOfRenewSelves)
}
}
// TestAgent_Logging_ConsulTemplate attempts to ensure two things about Vault Agent logs:
// 1. When -log-format command line arg is set to JSON, it is honored as the output format
// for messages generated from within the consul-template library.

View File

@@ -7,6 +7,7 @@ import (
"context"
"encoding/json"
"errors"
"fmt"
"math"
"math/rand"
"net/http"
@@ -478,10 +479,9 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
}
metrics.IncrCounter([]string{ah.metricsSignifier, "auth", "success"}, 1)
// We don't want to trigger the renewal process for tokens with
// unlimited TTL, such as the root token.
if leaseDuration == 0 && isTokenFileMethod {
ah.logger.Info("not starting token renewal process, as token has unlimited TTL")
// We don't want to trigger the renewal process for the root token
if isRootToken(leaseDuration, isTokenFileMethod, secret) {
ah.logger.Info("not starting token renewal process, as token is root token")
} else {
ah.logger.Info("starting renewal process")
go watcher.Renew()
@@ -496,11 +496,31 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
break LifetimeWatcherLoop
case err := <-watcher.DoneCh():
ah.logger.Info("lifetime watcher done channel triggered")
ah.logger.Info("lifetime watcher done channel triggered, re-authenticating")
if err != nil {
ah.logger.Error("error renewing token", "error", err, "backoff", backoffCfg)
metrics.IncrCounter([]string{ah.metricsSignifier, "auth", "failure"}, 1)
ah.logger.Error("error renewing token", "error", err)
// Add some exponential backoff so that if auth is successful
// but the watcher errors, we won't go into an immediate
// aggressive retry loop.
// This might be quite a small sleep, since if we have a successful
// auth, we reset the backoff. Still, some backoff is important, and
// ensuring we follow the normal flow is important:
// auth -> try to renew
if !backoffSleep(ctx, backoffCfg) {
// We're at max retries. Return an error.
return fmt.Errorf("exceeded max retries failing to renew auth token")
}
}
// If the lease duration is 0, wait a second before re-authenticating
// so that we don't go into a loop, as the LifetimeWatcher will immediately
// return for tokens like this.
if leaseDuration == 0 {
time.Sleep(1 * time.Second)
}
break LifetimeWatcherLoop
case <-watcher.RenewCh():
@@ -523,6 +543,24 @@ func (ah *AuthHandler) Run(ctx context.Context, am AuthMethod) error {
}
}
// isRootToken checks if the secret in the argument is the root token
// This is determinable without leaseDuration and isTokenFileMethod,
// but those make it easier to rule out other tokens cheaply.
func isRootToken(leaseDuration int, isTokenFileMethod bool, secret *api.Secret) bool {
// This check is cheaper than the others, so we do this first.
if leaseDuration == 0 && isTokenFileMethod && !secret.Renewable {
if secret != nil {
policies, err := secret.TokenPolicies()
if err == nil {
if len(policies) == 1 && policies[0] == "root" {
return true
}
}
}
}
return false
}
// autoAuthBackoff tracks exponential backoff state.
type autoAuthBackoff struct {
backoff *backoff.Backoff