VAULT-31758: Store when a node is removed in the raft stable store (#29090)

* implementation and test

* changelog

* verify servers are healthy before removing
miagilepner committed on 2024-12-11 12:31:59 +01:00 (committed by GitHub)
parent 2b3f517076
commit 9bde015070
5 changed files with 94 additions and 2 deletions

changelog/29090.txt (new file)

@@ -0,0 +1,3 @@
+```release-note:change
+core/raft: Return an error on sys/storage/raft/join if a node that has been removed from the raft cluster attempts to re-join while it still has existing raft data on disk.
+```
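
The release note above, together with the code below, is the user-visible contract: a removed node that still has stale raft data on disk now gets an explicit error from sys/storage/raft/join instead of silently rejoining. A minimal sketch of how a client running against the removed node would observe this, using the public github.com/hashicorp/vault/api client; the addresses are placeholders, not part of this change:

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/vault/api"
)

func main() {
	// Client pointed at the node that was previously removed from the cluster.
	// Both addresses below are placeholders.
	cfg := api.DefaultConfig()
	cfg.Address = "https://removed-node.example.com:8200"
	client, err := api.NewClient(cfg)
	if err != nil {
		log.Fatal(err)
	}

	// With this change, the join request fails fast instead of re-adding a
	// node that still has old raft data on disk.
	resp, err := client.Sys().RaftJoin(&api.RaftJoinRequest{
		LeaderAPIAddr: "https://active-node.example.com:8200",
	})
	if err != nil {
		// Expected path for a removed node: its raft (or all vault) data must
		// be cleaned up before it can be added back.
		fmt.Println("join refused:", err)
		return
	}
	fmt.Println("joined:", resp.Joined)
}
```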


@@ -256,7 +256,7 @@ type RaftBackend struct {
     // limits.
     specialPathLimits map[string]uint64

-    removed         atomic.Bool
+    removed         *atomic.Bool
     removedCallback func()
 }
@@ -277,9 +277,11 @@ func (b *RaftBackend) IsRemoved() bool {
     return b.removed.Load()
 }

+var removedKey = []byte("removed")
+
 func (b *RaftBackend) RemoveSelf() error {
     b.removed.Store(true)
-    return nil
+    return b.stableStore.SetUint64(removedKey, 1)
 }

 // LeaderJoinInfo contains information required by a node to join itself as a
@@ -593,6 +595,14 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
         snapStore = newSnapshotStoreDelay(snapStore, backendConfig.SnapshotDelay, logger)
     }

+    isRemoved := new(atomic.Bool)
+    removedVal, err := stableStore.GetUint64(removedKey)
+    if err != nil {
+        logger.Error("error checking if this node is removed. continuing under the assumption that it's not", "error", err)
+    }
+    if removedVal == 1 {
+        isRemoved.Store(true)
+    }
     return &RaftBackend{
         logger: logger,
         fsm:    fsm,
@@ -619,6 +629,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
         raftLogVerifierEnabled:      backendConfig.RaftLogVerifierEnabled,
         raftLogVerificationInterval: backendConfig.RaftLogVerificationInterval,
         effectiveSDKVersion:         version.GetVersion().Version,
+        removed:                     isRemoved,
     }, nil
 }
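
The hunks above are the core of the change: RemoveSelf now writes a sentinel key into the raft stable store, and NewRaftBackend reads it back on startup so the removed flag survives a process restart. A minimal sketch of the same round trip against the in-memory StableStore that ships with github.com/hashicorp/raft; the helper names here are illustrative, not Vault's:

```go
package main

import (
	"fmt"
	"log"
	"sync/atomic"

	"github.com/hashicorp/raft"
)

var removedKey = []byte("removed")

// markRemoved persists the removal marker so it is still visible after a restart.
func markRemoved(stable raft.StableStore, removed *atomic.Bool) error {
	removed.Store(true)
	return stable.SetUint64(removedKey, 1)
}

// loadRemoved mirrors the startup check: read the marker back, and if it cannot
// be read, continue under the assumption that the node was not removed.
func loadRemoved(stable raft.StableStore) *atomic.Bool {
	isRemoved := new(atomic.Bool)
	v, err := stable.GetUint64(removedKey)
	if err != nil {
		log.Printf("error checking removed marker, assuming not removed: %v", err)
		return isRemoved
	}
	if v == 1 {
		isRemoved.Store(true)
	}
	return isRemoved
}

func main() {
	// InmemStore implements raft.StableStore; Vault uses its on-disk stable
	// store, but the key/value round trip is the same.
	stable := raft.NewInmemStore()

	if err := markRemoved(stable, new(atomic.Bool)); err != nil {
		log.Fatal(err)
	}
	fmt.Println("removed after restart:", loadRemoved(stable).Load()) // true
}
```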


@@ -1539,3 +1539,25 @@ func TestSysHealth_Raft(t *testing.T) {
         require.Nil(t, r.HAConnectionHealthy)
     })
 }
+
+// TestRaftCluster_Removed_ReAdd creates a three node raft cluster and then
+// removes one of the nodes. The removed follower tries to re-join, and the test
+// verifies that it errors and cannot join.
+func TestRaftCluster_Removed_ReAdd(t *testing.T) {
+    t.Parallel()
+    cluster, _ := raftCluster(t, nil)
+    defer cluster.Cleanup()
+
+    leader := cluster.Cores[0]
+    follower := cluster.Cores[2]
+    _, err := leader.Client.Logical().Write("/sys/storage/raft/remove-peer", map[string]interface{}{
+        "server_id": follower.NodeID,
+    })
+    require.NoError(t, err)
+    require.Eventually(t, follower.Sealed, 3*time.Second, 250*time.Millisecond)
+
+    joinReq := &api.RaftJoinRequest{LeaderAPIAddr: leader.Address.String()}
+    _, err = follower.Client.Sys().RaftJoin(joinReq)
+    require.Error(t, err)
+}


@@ -4,7 +4,10 @@
 package raftha

 import (
+    "errors"
+    "fmt"
     "testing"
+    "time"

     "github.com/hashicorp/go-hclog"
     "github.com/hashicorp/vault/api"
@@ -327,3 +330,49 @@ func TestRaft_HA_ExistingCluster(t *testing.T) {
     updateCluster(t)
 }
+
+// TestRaftHACluster_Removed_ReAdd creates a raft HA cluster with a file
+// backend. The test adds two standbys to the cluster and then removes one of
+// them. The removed follower tries to re-join, and the test verifies that it
+// errors and cannot join.
+func TestRaftHACluster_Removed_ReAdd(t *testing.T) {
+    t.Parallel()
+    var conf vault.CoreConfig
+    opts := vault.TestClusterOptions{HandlerFunc: vaulthttp.Handler}
+    teststorage.RaftHASetup(&conf, &opts, teststorage.MakeFileBackend)
+    cluster := vault.NewTestCluster(t, &conf, &opts)
+    defer cluster.Cleanup()
+    vault.TestWaitActive(t, cluster.Cores[0].Core)
+
+    leader := cluster.Cores[0]
+    follower := cluster.Cores[2]
+    joinReq := &api.RaftJoinRequest{LeaderCACert: string(cluster.CACertPEM)}
+    _, err := follower.Client.Sys().RaftJoin(joinReq)
+    require.NoError(t, err)
+    _, err = cluster.Cores[1].Client.Sys().RaftJoin(joinReq)
+    require.NoError(t, err)
+    testhelpers.RetryUntil(t, 3*time.Second, func() error {
+        resp, err := leader.Client.Sys().RaftAutopilotState()
+        if err != nil {
+            return err
+        }
+        if len(resp.Servers) != 3 {
+            return errors.New("need 3 servers")
+        }
+        for serverID, server := range resp.Servers {
+            if !server.Healthy {
+                return fmt.Errorf("server %s is unhealthy", serverID)
+            }
+        }
+        return nil
+    })
+
+    _, err = leader.Client.Logical().Write("/sys/storage/raft/remove-peer", map[string]interface{}{
+        "server_id": follower.NodeID,
+    })
+    require.NoError(t, err)
+    require.Eventually(t, follower.Sealed, 3*time.Second, 250*time.Millisecond)
+    _, err = follower.Client.Sys().RaftJoin(joinReq)
+    require.Error(t, err)
+}
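
The RetryUntil loop in the test above implements the "verify servers are healthy before removing" item from the commit message: the peer is removed only after autopilot reports every server healthy, so the removal does not race a standby that is still joining. A minimal sketch of that check pulled out into a standalone helper, assuming a configured api.Client; the helper name and package are illustrative, not part of Vault:

```go
package raftutil

import (
	"fmt"
	"time"

	"github.com/hashicorp/vault/api"
)

// waitForHealthyServers is an illustrative helper (not one of Vault's test
// helpers) that polls the autopilot state until `want` servers all report
// healthy, or the timeout elapses.
func waitForHealthyServers(client *api.Client, want int, timeout time.Duration) error {
	deadline := time.Now().Add(timeout)
	for {
		state, err := client.Sys().RaftAutopilotState()
		if err == nil && state != nil && len(state.Servers) == want {
			healthy := true
			for id, srv := range state.Servers {
				if !srv.Healthy {
					healthy = false
					err = fmt.Errorf("server %s is unhealthy", id)
					break
				}
			}
			if healthy {
				return nil
			}
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("servers not healthy after %s: %v", timeout, err)
		}
		time.Sleep(250 * time.Millisecond)
	}
}
```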


@@ -1046,6 +1046,13 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
     }

     isRaftHAOnly := c.isRaftHAOnly()
+    if raftBackend.IsRemoved() {
+        if isRaftHAOnly {
+            return false, errors.New("node has been removed from the HA cluster. Raft data for this node must be cleaned up before it can be added back")
+        } else {
+            return false, errors.New("node has been removed from the HA cluster. All vault data for this node must be cleaned up before it can be added back")
+        }
+    }
     // Prevent join from happening if we're using raft for storage and
     // it has already been initialized.
     if init && !isRaftHAOnly {