VAULT-31758: Store when a node is removed in the raft stable store (#29090)
* implementation and test
* changelog
* verify servers are healthy before removing
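In outline, the change persists a "removed" marker in raft's stable store so that a node still knows it was removed after a restart, and join is refused while the marker is set. A minimal sketch of the pattern, before reading the diff itself (the `backend` type and method names here are hypothetical stand-ins, not Vault's actual code):

```go
package raftremoval

import (
	"errors"
	"sync/atomic"

	"github.com/hashicorp/raft"
)

// removedKey matches the stable-store key used in the diff below.
var removedKey = []byte("removed")

// backend is a hypothetical stand-in for Vault's RaftBackend.
type backend struct {
	removed     *atomic.Bool
	stableStore raft.StableStore
}

// removeSelf records removal both in memory and durably in the stable
// store, so the state survives a process restart.
func (b *backend) removeSelf() error {
	b.removed.Store(true)
	return b.stableStore.SetUint64(removedKey, 1)
}

// join refuses to rejoin a cluster once this node has been removed.
func (b *backend) join() error {
	if b.removed.Load() {
		return errors.New("node has been removed; clean up raft data before re-joining")
	}
	// ... contact the leader and join as usual ...
	return nil
}
```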
changelog/29090.txt (new file, +3)
@@ -0,0 +1,3 @@
+```release-note:change
+core/raft: Return an error on sys/storage/raft/join if a node that has been removed from the raft cluster attempts to re-join when it still has existing raft data on disk.
+```
@@ -256,7 +256,7 @@ type RaftBackend struct {
 	// limits.
 	specialPathLimits map[string]uint64

-	removed         atomic.Bool
+	removed         *atomic.Bool
 	removedCallback func()
 }
@@ -277,9 +277,11 @@ func (b *RaftBackend) IsRemoved() bool {
 	return b.removed.Load()
 }

+var removedKey = []byte("removed")
+
 func (b *RaftBackend) RemoveSelf() error {
 	b.removed.Store(true)
-	return nil
+	return b.stableStore.SetUint64(removedKey, 1)
 }

 // LeaderJoinInfo contains information required by a node to join itself as a
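RemoveSelf now writes through to the stable store instead of only flipping the in-memory flag. For context, a standalone sketch of how a raft stable store round-trips such a marker durably, using the BoltDB implementation of raft.StableStore; the file path is illustrative:

```go
package main

import (
	"fmt"
	"log"

	raftboltdb "github.com/hashicorp/raft-boltdb"
)

func main() {
	// Open (or create) the BoltDB file backing the stable store.
	store, err := raftboltdb.NewBoltStore("/tmp/raft-demo.db") // path is illustrative
	if err != nil {
		log.Fatal(err)
	}
	defer store.Close()

	// Persist the removal marker; this survives process restarts.
	if err := store.SetUint64([]byte("removed"), 1); err != nil {
		log.Fatal(err)
	}

	// A later process start can read it back.
	val, err := store.GetUint64([]byte("removed"))
	if err != nil {
		// raft-boltdb returns an error if the key has never been set.
		log.Printf("no removal marker found: %v", err)
	}
	fmt.Println("removed:", val == 1)
}
```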
@@ -593,6 +595,14 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
 		snapStore = newSnapshotStoreDelay(snapStore, backendConfig.SnapshotDelay, logger)
 	}

+	isRemoved := new(atomic.Bool)
+	removedVal, err := stableStore.GetUint64(removedKey)
+	if err != nil {
+		logger.Error("error checking if this node is removed. continuing under the assumption that it's not", "error", err)
+	}
+	if removedVal == 1 {
+		isRemoved.Store(true)
+	}
 	return &RaftBackend{
 		logger: logger,
 		fsm:    fsm,
@@ -619,6 +629,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
 		raftLogVerifierEnabled:      backendConfig.RaftLogVerifierEnabled,
 		raftLogVerificationInterval: backendConfig.RaftLogVerificationInterval,
 		effectiveSDKVersion:         version.GetVersion().Version,
+		removed:                     isRemoved,
 	}, nil
 }
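One detail worth calling out in the hunk above: stable-store implementations typically return an error for a key that has never been written, so a fresh node hits the GetUint64 error path on first boot. The diff logs and continues, treating the node as not removed rather than failing backend construction. A hedged sketch of that startup check as a standalone helper (the helper name is hypothetical):

```go
package raftremoval

import (
	"sync/atomic"

	"github.com/hashicorp/go-hclog"
	"github.com/hashicorp/raft"
)

// loadRemoved mirrors the startup check above: a read error (e.g. the key
// was never set on a fresh node) is logged and treated as "not removed".
func loadRemoved(store raft.StableStore, logger hclog.Logger) *atomic.Bool {
	isRemoved := new(atomic.Bool)
	val, err := store.GetUint64([]byte("removed"))
	if err != nil {
		logger.Error("error checking if this node is removed; assuming it is not", "error", err)
	}
	isRemoved.Store(val == 1)
	return isRemoved
}
```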
@@ -1539,3 +1539,25 @@ func TestSysHealth_Raft(t *testing.T) {
 		require.Nil(t, r.HAConnectionHealthy)
 	})
 }
+
+// TestRaftCluster_Removed_ReAdd creates a three node raft cluster and then
+// removes one of the nodes. The removed follower tries to re-join, and the
+// test verifies that it errors and cannot join.
+func TestRaftCluster_Removed_ReAdd(t *testing.T) {
+	t.Parallel()
+	cluster, _ := raftCluster(t, nil)
+	defer cluster.Cleanup()
+
+	leader := cluster.Cores[0]
+	follower := cluster.Cores[2]
+
+	_, err := leader.Client.Logical().Write("/sys/storage/raft/remove-peer", map[string]interface{}{
+		"server_id": follower.NodeID,
+	})
+	require.NoError(t, err)
+	require.Eventually(t, follower.Sealed, 3*time.Second, 250*time.Millisecond)
+
+	joinReq := &api.RaftJoinRequest{LeaderAPIAddr: leader.Address.String()}
+	_, err = follower.Client.Sys().RaftJoin(joinReq)
+	require.Error(t, err)
+}
@@ -4,7 +4,10 @@
 package raftha

 import (
+	"errors"
+	"fmt"
 	"testing"
+	"time"

 	"github.com/hashicorp/go-hclog"
 	"github.com/hashicorp/vault/api"
@@ -327,3 +330,49 @@ func TestRaft_HA_ExistingCluster(t *testing.T) {

 	updateCluster(t)
 }
+
+// TestRaftHACluster_Removed_ReAdd creates a raft HA cluster with a file
+// backend. The test adds two standbys to the cluster and then removes one of
+// them. The removed follower tries to re-join, and the test verifies that it
+// errors and cannot join.
+func TestRaftHACluster_Removed_ReAdd(t *testing.T) {
+	t.Parallel()
+	var conf vault.CoreConfig
+	opts := vault.TestClusterOptions{HandlerFunc: vaulthttp.Handler}
+	teststorage.RaftHASetup(&conf, &opts, teststorage.MakeFileBackend)
+	cluster := vault.NewTestCluster(t, &conf, &opts)
+	defer cluster.Cleanup()
+	vault.TestWaitActive(t, cluster.Cores[0].Core)
+
+	leader := cluster.Cores[0]
+	follower := cluster.Cores[2]
+	joinReq := &api.RaftJoinRequest{LeaderCACert: string(cluster.CACertPEM)}
+	_, err := follower.Client.Sys().RaftJoin(joinReq)
+	require.NoError(t, err)
+	_, err = cluster.Cores[1].Client.Sys().RaftJoin(joinReq)
+	require.NoError(t, err)
+
+	testhelpers.RetryUntil(t, 3*time.Second, func() error {
+		resp, err := leader.Client.Sys().RaftAutopilotState()
+		if err != nil {
+			return err
+		}
+		if len(resp.Servers) != 3 {
+			return errors.New("need 3 servers")
+		}
+		for serverID, server := range resp.Servers {
+			if !server.Healthy {
+				return fmt.Errorf("server %s is unhealthy", serverID)
+			}
+		}
+		return nil
+	})
+	_, err = leader.Client.Logical().Write("/sys/storage/raft/remove-peer", map[string]interface{}{
+		"server_id": follower.NodeID,
+	})
+	require.NoError(t, err)
+	require.Eventually(t, follower.Sealed, 3*time.Second, 250*time.Millisecond)
+
+	_, err = follower.Client.Sys().RaftJoin(joinReq)
+	require.Error(t, err)
+}
@@ -1046,6 +1046,13 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 	}

 	isRaftHAOnly := c.isRaftHAOnly()
+	if raftBackend.IsRemoved() {
+		if isRaftHAOnly {
+			return false, errors.New("node has been removed from the HA cluster. Raft data for this node must be cleaned up before it can be added back")
+		} else {
+			return false, errors.New("node has been removed from the HA cluster. All vault data for this node must be cleaned up before it can be added back")
+		}
+	}
 	// Prevent join from happening if we're using raft for storage and
 	// it has already been initialized.
 	if init && !isRaftHAOnly {
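On the client side, a removed node's re-join attempt surfaces one of the errors above through the join API, as the tests exercise. A minimal sketch of how an operator tool might detect it; matching on the error string is an assumption, since the API does not return a typed error for this case:

```go
package raftremoval

import (
	"strings"

	"github.com/hashicorp/vault/api"
)

// tryRejoin attempts a raft join and reports whether it failed because the
// node was previously removed, based on the error strings introduced above.
func tryRejoin(client *api.Client, leaderAddr string) (removed bool, err error) {
	_, err = client.Sys().RaftJoin(&api.RaftJoinRequest{LeaderAPIAddr: leaderAddr})
	if err != nil && strings.Contains(err.Error(), "removed from the HA cluster") {
		return true, err
	}
	return false, err
}
```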