VAULT-31758: Store when a node is removed in the raft stable store (#29090)
* implementation and test
* changelog
* verify servers are healthy before removing
changelog/29090.txt (new file)
@@ -0,0 +1,3 @@
+```release-note:change
+core/raft: Return an error on sys/storage/raft/join if a node that has been removed from raft cluster attempts to re-join when it still has existing raft data on disk.
+```
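The mechanism behind this release note is a "removed" marker written to the raft stable store, as the hunks below show. Here is a minimal, self-contained sketch of that persistence pattern, not the Vault code itself: `markRemoved`, `wasRemoved`, and the use of `raft.NewInmemStore` as a stand-in for Vault's on-disk stable store are illustrative choices; the key name and the SetUint64/GetUint64 calls mirror the patch.

```go
package main

import (
	"fmt"

	"github.com/hashicorp/raft"
)

// removedKey mirrors the key the patch writes to the stable store.
var removedKey = []byte("removed")

// markRemoved persists the fact that this node was removed from the cluster,
// so the flag survives a process restart.
func markRemoved(s raft.StableStore) error {
	return s.SetUint64(removedKey, 1)
}

// wasRemoved reads the flag back at startup; on a read error it assumes the
// node was not removed, matching the patch's behavior in NewRaftBackend.
func wasRemoved(s raft.StableStore) bool {
	v, err := s.GetUint64(removedKey)
	if err != nil {
		return false
	}
	return v == 1
}

func main() {
	store := raft.NewInmemStore() // stand-in for Vault's bolt-backed stable store
	fmt.Println(wasRemoved(store)) // false: nothing recorded yet
	if err := markRemoved(store); err != nil {
		panic(err)
	}
	fmt.Println(wasRemoved(store)) // true: flag persists across restarts of a real store
}
```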
@@ -256,7 +256,7 @@ type RaftBackend struct {
 	// limits.
 	specialPathLimits map[string]uint64
 
-	removed         atomic.Bool
+	removed         *atomic.Bool
 	removedCallback func()
 }
 
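A likely reason for switching `removed` from a value to a pointer is that NewRaftBackend (further down) now creates and populates the flag before assembling the struct literal. A tiny illustrative sketch of that pattern, with hypothetical names, assuming that is indeed the motivation:

```go
package main

import (
	"fmt"
	"sync/atomic"
)

type backend struct {
	removed *atomic.Bool // pointer: can be built and set before the literal below
}

func newBackend(previouslyRemoved bool) *backend {
	isRemoved := new(atomic.Bool)
	if previouslyRemoved {
		isRemoved.Store(true)
	}
	return &backend{removed: isRemoved}
}

func main() {
	b := newBackend(true)
	fmt.Println(b.removed.Load()) // true
}
```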
@@ -277,9 +277,11 @@ func (b *RaftBackend) IsRemoved() bool {
 	return b.removed.Load()
 }
 
+var removedKey = []byte("removed")
+
 func (b *RaftBackend) RemoveSelf() error {
 	b.removed.Store(true)
-	return nil
+	return b.stableStore.SetUint64(removedKey, 1)
 }
 
 // LeaderJoinInfo contains information required by a node to join itself as a
@@ -593,6 +595,14 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
 		snapStore = newSnapshotStoreDelay(snapStore, backendConfig.SnapshotDelay, logger)
 	}
 
+	isRemoved := new(atomic.Bool)
+	removedVal, err := stableStore.GetUint64(removedKey)
+	if err != nil {
+		logger.Error("error checking if this node is removed. continuing under the assumption that it's not", "error", err)
+	}
+	if removedVal == 1 {
+		isRemoved.Store(true)
+	}
 	return &RaftBackend{
 		logger: logger,
 		fsm:    fsm,
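Worth noting about the error branch above: because Go returns zero values alongside an error, a failed GetUint64 leaves `removedVal` at 0, so the node is treated as not removed and can still attempt to join (fail open). A small standalone sketch of that behavior, with `fakeGet` standing in for a failing stable-store read:

```go
package main

import (
	"errors"
	"fmt"
	"sync/atomic"
)

// fakeGet stands in for stableStore.GetUint64 failing (e.g. an unreadable store).
func fakeGet([]byte) (uint64, error) { return 0, errors.New("stable store unreadable") }

func main() {
	isRemoved := new(atomic.Bool)
	removedVal, err := fakeGet([]byte("removed"))
	if err != nil {
		fmt.Println("error checking if this node is removed; assuming it's not:", err)
	}
	if removedVal == 1 {
		isRemoved.Store(true)
	}
	fmt.Println("removed:", isRemoved.Load()) // removed: false
}
```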
@@ -619,6 +629,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
 		raftLogVerifierEnabled:      backendConfig.RaftLogVerifierEnabled,
 		raftLogVerificationInterval: backendConfig.RaftLogVerificationInterval,
 		effectiveSDKVersion:         version.GetVersion().Version,
+		removed:                     isRemoved,
 	}, nil
 }
 
@@ -1539,3 +1539,25 @@ func TestSysHealth_Raft(t *testing.T) {
 		require.Nil(t, r.HAConnectionHealthy)
 	})
 }
+
+// TestRaftCluster_Removed_ReAdd creates a three node raft cluster and then
+// removes one of the nodes. The removed follower tries to re-join, and the test
+// verifies that it errors and cannot join.
+func TestRaftCluster_Removed_ReAdd(t *testing.T) {
+	t.Parallel()
+	cluster, _ := raftCluster(t, nil)
+	defer cluster.Cleanup()
+
+	leader := cluster.Cores[0]
+	follower := cluster.Cores[2]
+
+	_, err := leader.Client.Logical().Write("/sys/storage/raft/remove-peer", map[string]interface{}{
+		"server_id": follower.NodeID,
+	})
+	require.NoError(t, err)
+	require.Eventually(t, follower.Sealed, 3*time.Second, 250*time.Millisecond)
+
+	joinReq := &api.RaftJoinRequest{LeaderAPIAddr: leader.Address.String()}
+	_, err = follower.Client.Sys().RaftJoin(joinReq)
+	require.Error(t, err)
+}
@@ -4,7 +4,10 @@
 package raftha
 
 import (
+	"errors"
+	"fmt"
 	"testing"
+	"time"
 
 	"github.com/hashicorp/go-hclog"
 	"github.com/hashicorp/vault/api"
@@ -327,3 +330,49 @@ func TestRaft_HA_ExistingCluster(t *testing.T) {
 
 	updateCluster(t)
 }
+
+// TestRaftHACluster_Removed_ReAdd creates a raft HA cluster with a file
+// backend. The test adds two standbys to the cluster and then removes one of
+// them. The removed follower tries to re-join, and the test verifies that it
+// errors and cannot join.
+func TestRaftHACluster_Removed_ReAdd(t *testing.T) {
+	t.Parallel()
+	var conf vault.CoreConfig
+	opts := vault.TestClusterOptions{HandlerFunc: vaulthttp.Handler}
+	teststorage.RaftHASetup(&conf, &opts, teststorage.MakeFileBackend)
+	cluster := vault.NewTestCluster(t, &conf, &opts)
+	defer cluster.Cleanup()
+	vault.TestWaitActive(t, cluster.Cores[0].Core)
+
+	leader := cluster.Cores[0]
+	follower := cluster.Cores[2]
+	joinReq := &api.RaftJoinRequest{LeaderCACert: string(cluster.CACertPEM)}
+	_, err := follower.Client.Sys().RaftJoin(joinReq)
+	require.NoError(t, err)
+	_, err = cluster.Cores[1].Client.Sys().RaftJoin(joinReq)
+	require.NoError(t, err)
+
+	testhelpers.RetryUntil(t, 3*time.Second, func() error {
+		resp, err := leader.Client.Sys().RaftAutopilotState()
+		if err != nil {
+			return err
+		}
+		if len(resp.Servers) != 3 {
+			return errors.New("need 3 servers")
+		}
+		for serverID, server := range resp.Servers {
+			if !server.Healthy {
+				return fmt.Errorf("server %s is unhealthy", serverID)
+			}
+		}
+		return nil
+	})
+	_, err = leader.Client.Logical().Write("/sys/storage/raft/remove-peer", map[string]interface{}{
+		"server_id": follower.NodeID,
+	})
+	require.NoError(t, err)
+	require.Eventually(t, follower.Sealed, 3*time.Second, 250*time.Millisecond)
+
+	_, err = follower.Client.Sys().RaftJoin(joinReq)
+	require.Error(t, err)
+}
@@ -1046,6 +1046,13 @@ func (c *Core) JoinRaftCluster(ctx context.Context, leaderInfos []*raft.LeaderJo
 	}
 
 	isRaftHAOnly := c.isRaftHAOnly()
+	if raftBackend.IsRemoved() {
+		if isRaftHAOnly {
+			return false, errors.New("node has been removed from the HA cluster. Raft data for this node must be cleaned up before it can be added back")
+		} else {
+			return false, errors.New("node has been removed from the HA cluster. All vault data for this node must be cleaned up before it can be added back")
+		}
+	}
 	// Prevent join from happening if we're using raft for storage and
 	// it has already been initialized.
 	if init && !isRaftHAOnly {
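For context, here is a minimal operator-side sketch of what hitting this guard looks like through the Go API client the tests above use. The addresses are placeholders and the snippet is not part of the diff; it assumes a client pointed at the removed node that still has its old raft data on disk.

```go
package main

import (
	"fmt"
	"log"

	"github.com/hashicorp/vault/api"
)

func main() {
	// Client pointed at the removed node (hypothetical addresses).
	cfg := api.DefaultConfig()
	cfg.Address = "https://removed-node.example.com:8200"
	client, err := api.NewClient(cfg)
	if err != nil {
		log.Fatal(err)
	}

	// With this change, the join request fails fast with one of the errors above
	// instead of the node silently attempting to re-join the cluster.
	resp, err := client.Sys().RaftJoin(&api.RaftJoinRequest{
		LeaderAPIAddr: "https://active-node.example.com:8200",
	})
	if err != nil {
		fmt.Println("join refused:", err)
		return
	}
	fmt.Println("joined:", resp.Joined)
}
```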