Fix AP upgrade version issue (#27277)
* Fix AP upgrade version issue
* add heartbeat logging at trace level
* add log to show when heartbeats resume
* Test the plumbing
* Revert "Test the plumbing"
  This reverts commit e25fcd83516cd8b5b0ca6b543be64049c6a45f51.
* Add CHANGELOG
* Add plumbing test
* Update misleading comment
---------
Co-authored-by: Josh Black <raskchanky@gmail.com>
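In outline, the fix replaces a post-setup setter (removed from the active-node path further down) with a version default applied when the raft backend is constructed, plus an optional override passed through SetupOpts. Below is a hedged sketch of that shape, condensed from the hunks that follow rather than taken from the Vault source; building it requires the hashicorp/vault module for the version package.

```go
// Hedged sketch only: condensed from the diff hunks below, not the actual
// Vault source. It shows the new default-plus-override plumbing for the
// effective SDK version.
package raft

import "github.com/hashicorp/vault/version"

// SetupOpts mirrors the field added to the real SetupOpts in the diff.
type SetupOpts struct {
	// EffectiveSDKVersion is typically the version string baked into the
	// binary, but tests or core can override it.
	EffectiveSDKVersion string
}

// RaftBackend is reduced here to the single field this fix touches.
type RaftBackend struct {
	effectiveSDKVersion string
}

func NewRaftBackend() *RaftBackend {
	return &RaftBackend{
		// Default to the binary's version so every node, standby or active,
		// has a non-empty version without needing a later setter call.
		effectiveSDKVersion: version.GetVersion().Version,
	}
}

func (b *RaftBackend) SetupCluster(opts SetupOpts) {
	if opts.EffectiveSDKVersion != "" {
		// Optional override, now plumbed through SetupOpts on every node
		// instead of via SetEffectiveSDKVersion on the active node only.
		b.effectiveSDKVersion = opts.EffectiveSDKVersion
	}
}
```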
changelog/27277.txt (new file, 4 additions)
@@ -0,0 +1,4 @@
+```release-note:bug
+storage/raft (enterprise): Fix a regression introduced in 1.15.8 that causes
+autopilot to fail to discover new server versions and so not trigger an upgrade.
+```
@@ -39,6 +39,7 @@ import (
 	"github.com/hashicorp/vault/sdk/logical"
 	"github.com/hashicorp/vault/sdk/physical"
 	"github.com/hashicorp/vault/vault/cluster"
+	"github.com/hashicorp/vault/version"
 	etcdbolt "go.etcd.io/bbolt"
 )
@@ -582,6 +583,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend
 		failGetInTxn:                new(uint32),
 		raftLogVerifierEnabled:      backendConfig.RaftLogVerifierEnabled,
 		raftLogVerificationInterval: backendConfig.RaftLogVerificationInterval,
+		effectiveSDKVersion:         version.GetVersion().Version,
 	}, nil
 }
@@ -660,12 +662,6 @@ func (b *RaftBackend) FailGetInTxn(fail bool) {
 	atomic.StoreUint32(b.failGetInTxn, val)
 }
 
-func (b *RaftBackend) SetEffectiveSDKVersion(sdkVersion string) {
-	b.l.Lock()
-	b.effectiveSDKVersion = sdkVersion
-	b.l.Unlock()
-}
-
 func (b *RaftBackend) RedundancyZone() string {
 	b.l.RLock()
 	defer b.l.RUnlock()
@@ -1089,6 +1085,11 @@ type SetupOpts struct {
 	// RecoveryModeConfig is the configuration for the raft cluster in recovery
 	// mode.
 	RecoveryModeConfig *raft.Configuration
+
+	// EffectiveSDKVersion is typically the version string baked into the binary.
+	// We pass it in though because it can be overridden in tests or via ENV in
+	// core.
+	EffectiveSDKVersion string
 }
 
 func (b *RaftBackend) StartRecoveryCluster(ctx context.Context, peer Peer) error {
@@ -1132,6 +1133,11 @@ func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error {
 		return errors.New("no local node id configured")
 	}
 
+	if opts.EffectiveSDKVersion != "" {
+		// Override the SDK version
+		b.effectiveSDKVersion = opts.EffectiveSDKVersion
+	}
+
 	// Setup the raft config
 	raftConfig := raft.DefaultConfig()
 	if err := b.applyConfigSettings(raftConfig); err != nil {
@@ -421,7 +421,10 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server {
 			}
 			followerVersion = leaderVersion
 		} else {
-			delete(d.emptyVersionLogs, currentServerID)
+			if _, ok := d.emptyVersionLogs[currentServerID]; ok {
+				d.logger.Trace("received non-empty version in heartbeat state. no longer need to fake it", "id", id, "update_version", followerVersion)
+				delete(d.emptyVersionLogs, currentServerID)
+			}
 		}
 		d.dl.Unlock()
 
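With the change above, the delegate emits the "no longer need to fake it" trace line only for servers that were previously tracked in emptyVersionLogs, so the resume is reported once per server rather than on every heartbeat. A minimal, self-contained sketch of that log-once pattern (hypothetical names, not Vault's code):

```go
// Hedged sketch with hypothetical names (tracked, logOnce); it illustrates the
// log-once pattern behind the emptyVersionLogs handling above, not Vault's API.
package main

import "fmt"

// logOnce reports a server's first non-empty version after a gap, then stops
// tracking it so repeat heartbeats stay quiet.
func logOnce(tracked map[string]struct{}, id, version string) {
	if version == "" {
		tracked[id] = struct{}{} // still waiting for a real version
		return
	}
	if _, ok := tracked[id]; ok {
		fmt.Printf("server %s resumed reporting version %s\n", id, version)
		delete(tracked, id)
	}
}

func main() {
	tracked := map[string]struct{}{}
	logOnce(tracked, "node-a", "")       // starts tracking node-a
	logOnce(tracked, "node-a", "1.16.3") // logs the resume exactly once
	logOnce(tracked, "node-a", "1.16.3") // no further output
}
```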
@@ -41,6 +41,41 @@ func TestRaft_Autopilot_Disable(t *testing.T) {
 	require.Nil(t, nil, state)
 }
 
+// TestRaft_Autopilot_BinaryVersionPlumbing is an apparently trivial test that
+// ensures that the default plumbing in Vault core to configure the binary
+// version of the raft library is working. Hopefully this will trivially pass
+// from now on, however it would have caught a regression in the past!
+func TestRaft_Autopilot_BinaryVersionPlumbing(t *testing.T) {
+	t.Parallel()
+
+	coreCfg, clusterOpts := raftClusterBuilder(t, &RaftClusterOpts{
+		EnableAutopilot: true,
+		// We need 2 nodes because the code path that regressed was different on a
+		// standby vs active node so we'd not detect the problem if we only test on
+		// an active node.
+		NumCores: 2,
+	})
+
+	// Default options should not set EffectiveSDKVersion(Map) which would defeat
+	// the point of this test by plumbing versions via config.
+	require.Nil(t, clusterOpts.EffectiveSDKVersionMap)
+	require.Empty(t, coreCfg.EffectiveSDKVersion)
+
+	c := vault.NewTestCluster(t, coreCfg, &clusterOpts)
+	defer c.Cleanup()
+
+	// Wait for follower to be perf standby (in Ent, in CE it will only wait for
+	// active). In either Enterprise or CE case, this should pass if we've plumbed
+	// the version correctly so it's valuable for both.
+	testhelpers.WaitForActiveNodeAndStandbys(t, c)
+	for _, core := range c.Cores {
+		be := core.UnderlyingRawStorage.(*raft.RaftBackend)
+		require.Equal(t, version.GetVersion().Version, be.UpgradeVersion(),
+			"expected raft upgrade version to default to Vault version for core %q",
+			core.NodeID)
+	}
+}
+
 // TestRaft_Autopilot_Stabilization_And_State verifies that nodes get promoted
 // to be voters after the stabilization time has elapsed. Also checks that
 // the autopilot state is Healthy once all nodes are available.
@@ -149,9 +149,10 @@ func (c *Core) startRaftBackend(ctx context.Context) (retErr error) {
 	raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true))
 
 	if err := raftBackend.SetupCluster(ctx, raft.SetupOpts{
-		TLSKeyring:      raftTLS,
-		ClusterListener: c.getClusterListener(),
-		StartAsLeader:   creating,
+		TLSKeyring:          raftTLS,
+		ClusterListener:     c.getClusterListener(),
+		StartAsLeader:       creating,
+		EffectiveSDKVersion: c.effectiveSDKVersion,
 	}); err != nil {
 		return err
 	}
@@ -309,7 +310,6 @@ func (c *Core) setupRaftActiveNode(ctx context.Context) error {
 	}
 
 	c.logger.Info("starting raft active node")
-	raftBackend.SetEffectiveSDKVersion(c.effectiveSDKVersion)
 
 	c.pendingRaftPeers = &sync.Map{}
@@ -102,6 +102,15 @@ func (s *forwardedRequestRPCServer) Echo(ctx context.Context, in *EchoRequest) (
 	}
 
 	if in.RaftAppliedIndex > 0 && len(in.RaftNodeID) > 0 && s.raftFollowerStates != nil {
+		s.core.logger.Trace("forwarding RPC: echo received",
+			"node_id", in.RaftNodeID,
+			"applied_index", in.RaftAppliedIndex,
+			"term", in.RaftTerm,
+			"desired_suffrage", in.RaftDesiredSuffrage,
+			"sdk_version", in.SdkVersion,
+			"upgrade_version", in.RaftUpgradeVersion,
+			"redundancy_zone", in.RaftRedundancyZone)
+
 		s.raftFollowerStates.Update(&raft.EchoRequestUpdate{
 			NodeID:       in.RaftNodeID,
 			AppliedIndex: in.RaftAppliedIndex,