mirror of
				https://github.com/optim-enterprises-bv/vault.git
				synced 2025-10-31 18:48:08 +00:00 
			
		
		
		
	Fix AP upgrade version issue (#27277)
* Fix AP upgrade version issue * add heartbeat logging at trace level * add log to show when heartbeats resume * Test the plumbing * Revert "Test the plumbing" This reverts commit e25fcd83516cd8b5b0ca6b543be64049c6a45f51. * Add CHANGELOG * Add plumbing test * Update misleading comment --------- Co-authored-by: Josh Black <raskchanky@gmail.com>
This commit is contained in:
		
							
								
								
									
										4
									
								
								changelog/27277.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								changelog/27277.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,4 @@ | |||||||
|  | ```release-note:bug | ||||||
|  | storage/raft (enterprise): Fix a regression introduced in 1.15.8 that causes | ||||||
|  | autopilot to fail to discover new server versions and so not trigger an upgrade. | ||||||
|  | ``` | ||||||
| @@ -39,6 +39,7 @@ import ( | |||||||
| 	"github.com/hashicorp/vault/sdk/logical" | 	"github.com/hashicorp/vault/sdk/logical" | ||||||
| 	"github.com/hashicorp/vault/sdk/physical" | 	"github.com/hashicorp/vault/sdk/physical" | ||||||
| 	"github.com/hashicorp/vault/vault/cluster" | 	"github.com/hashicorp/vault/vault/cluster" | ||||||
|  | 	"github.com/hashicorp/vault/version" | ||||||
| 	etcdbolt "go.etcd.io/bbolt" | 	etcdbolt "go.etcd.io/bbolt" | ||||||
| ) | ) | ||||||
|  |  | ||||||
| @@ -582,6 +583,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend | |||||||
| 		failGetInTxn:                  new(uint32), | 		failGetInTxn:                  new(uint32), | ||||||
| 		raftLogVerifierEnabled:        backendConfig.RaftLogVerifierEnabled, | 		raftLogVerifierEnabled:        backendConfig.RaftLogVerifierEnabled, | ||||||
| 		raftLogVerificationInterval:   backendConfig.RaftLogVerificationInterval, | 		raftLogVerificationInterval:   backendConfig.RaftLogVerificationInterval, | ||||||
|  | 		effectiveSDKVersion:           version.GetVersion().Version, | ||||||
| 	}, nil | 	}, nil | ||||||
| } | } | ||||||
|  |  | ||||||
| @@ -660,12 +662,6 @@ func (b *RaftBackend) FailGetInTxn(fail bool) { | |||||||
| 	atomic.StoreUint32(b.failGetInTxn, val) | 	atomic.StoreUint32(b.failGetInTxn, val) | ||||||
| } | } | ||||||
|  |  | ||||||
| func (b *RaftBackend) SetEffectiveSDKVersion(sdkVersion string) { |  | ||||||
| 	b.l.Lock() |  | ||||||
| 	b.effectiveSDKVersion = sdkVersion |  | ||||||
| 	b.l.Unlock() |  | ||||||
| } |  | ||||||
|  |  | ||||||
| func (b *RaftBackend) RedundancyZone() string { | func (b *RaftBackend) RedundancyZone() string { | ||||||
| 	b.l.RLock() | 	b.l.RLock() | ||||||
| 	defer b.l.RUnlock() | 	defer b.l.RUnlock() | ||||||
| @@ -1089,6 +1085,11 @@ type SetupOpts struct { | |||||||
| 	// RecoveryModeConfig is the configuration for the raft cluster in recovery | 	// RecoveryModeConfig is the configuration for the raft cluster in recovery | ||||||
| 	// mode. | 	// mode. | ||||||
| 	RecoveryModeConfig *raft.Configuration | 	RecoveryModeConfig *raft.Configuration | ||||||
|  |  | ||||||
|  | 	// EffectiveSDKVersion is typically the version string baked into the binary. | ||||||
|  | 	// We pass it in though because it can be overridden in tests or via ENV in | ||||||
|  | 	// core. | ||||||
|  | 	EffectiveSDKVersion string | ||||||
| } | } | ||||||
|  |  | ||||||
| func (b *RaftBackend) StartRecoveryCluster(ctx context.Context, peer Peer) error { | func (b *RaftBackend) StartRecoveryCluster(ctx context.Context, peer Peer) error { | ||||||
| @@ -1132,6 +1133,11 @@ func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error { | |||||||
| 		return errors.New("no local node id configured") | 		return errors.New("no local node id configured") | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|  | 	if opts.EffectiveSDKVersion != "" { | ||||||
|  | 		// Override the SDK version | ||||||
|  | 		b.effectiveSDKVersion = opts.EffectiveSDKVersion | ||||||
|  | 	} | ||||||
|  |  | ||||||
| 	// Setup the raft config | 	// Setup the raft config | ||||||
| 	raftConfig := raft.DefaultConfig() | 	raftConfig := raft.DefaultConfig() | ||||||
| 	if err := b.applyConfigSettings(raftConfig); err != nil { | 	if err := b.applyConfigSettings(raftConfig); err != nil { | ||||||
|   | |||||||
| @@ -421,7 +421,10 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server { | |||||||
| 			} | 			} | ||||||
| 			followerVersion = leaderVersion | 			followerVersion = leaderVersion | ||||||
| 		} else { | 		} else { | ||||||
| 			delete(d.emptyVersionLogs, currentServerID) | 			if _, ok := d.emptyVersionLogs[currentServerID]; ok { | ||||||
|  | 				d.logger.Trace("received non-empty version in heartbeat state. no longer need to fake it", "id", id, "update_version", followerVersion) | ||||||
|  | 				delete(d.emptyVersionLogs, currentServerID) | ||||||
|  | 			} | ||||||
| 		} | 		} | ||||||
| 		d.dl.Unlock() | 		d.dl.Unlock() | ||||||
|  |  | ||||||
|   | |||||||
| @@ -41,6 +41,41 @@ func TestRaft_Autopilot_Disable(t *testing.T) { | |||||||
| 	require.Nil(t, nil, state) | 	require.Nil(t, nil, state) | ||||||
| } | } | ||||||
|  |  | ||||||
|  | // TestRaft_Autopilot_BinaryVersionPlumbing is an apparently trivial test that | ||||||
|  | // ensures that the default plumbing in Vault core to configure the binary | ||||||
|  | // version of the raft library is working. Hopefully this will trivially pass | ||||||
|  | // from now on, however it would have caught a regression in the past! | ||||||
|  | func TestRaft_Autopilot_BinaryVersionPlumbing(t *testing.T) { | ||||||
|  | 	t.Parallel() | ||||||
|  |  | ||||||
|  | 	coreCfg, clusterOpts := raftClusterBuilder(t, &RaftClusterOpts{ | ||||||
|  | 		EnableAutopilot: true, | ||||||
|  | 		// We need 2 nodes because the code path that regressed was different on a | ||||||
|  | 		// standby vs active node so we'd not detect the problem if we only test on | ||||||
|  | 		// an active node. | ||||||
|  | 		NumCores: 2, | ||||||
|  | 	}) | ||||||
|  |  | ||||||
|  | 	// Default options should not set EffectiveSDKVersion(Map) which would defeat | ||||||
|  | 	// the point of this test by plumbing versions via config. | ||||||
|  | 	require.Nil(t, clusterOpts.EffectiveSDKVersionMap) | ||||||
|  | 	require.Empty(t, coreCfg.EffectiveSDKVersion) | ||||||
|  |  | ||||||
|  | 	c := vault.NewTestCluster(t, coreCfg, &clusterOpts) | ||||||
|  | 	defer c.Cleanup() | ||||||
|  |  | ||||||
|  | 	// Wait for follower to be perf standby (in Ent, in CE it will only wait for | ||||||
|  | 	// active). In either Enterprise or CE case, this should pass if we've plumbed | ||||||
|  | 	// the version correctly so it's valuable for both. | ||||||
|  | 	testhelpers.WaitForActiveNodeAndStandbys(t, c) | ||||||
|  | 	for _, core := range c.Cores { | ||||||
|  | 		be := core.UnderlyingRawStorage.(*raft.RaftBackend) | ||||||
|  | 		require.Equal(t, version.GetVersion().Version, be.UpgradeVersion(), | ||||||
|  | 			"expected raft upgrade version to default to Vault version for core %q", | ||||||
|  | 			core.NodeID) | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  |  | ||||||
| // TestRaft_Autopilot_Stabilization_And_State verifies that nodes get promoted | // TestRaft_Autopilot_Stabilization_And_State verifies that nodes get promoted | ||||||
| // to be voters after the stabilization time has elapsed.  Also checks that | // to be voters after the stabilization time has elapsed.  Also checks that | ||||||
| // the autopilot state is Healthy once all nodes are available. | // the autopilot state is Healthy once all nodes are available. | ||||||
|   | |||||||
| @@ -149,9 +149,10 @@ func (c *Core) startRaftBackend(ctx context.Context) (retErr error) { | |||||||
| 		raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true)) | 		raftBackend.SetRestoreCallback(c.raftSnapshotRestoreCallback(true, true)) | ||||||
|  |  | ||||||
| 		if err := raftBackend.SetupCluster(ctx, raft.SetupOpts{ | 		if err := raftBackend.SetupCluster(ctx, raft.SetupOpts{ | ||||||
| 			TLSKeyring:      raftTLS, | 			TLSKeyring:          raftTLS, | ||||||
| 			ClusterListener: c.getClusterListener(), | 			ClusterListener:     c.getClusterListener(), | ||||||
| 			StartAsLeader:   creating, | 			StartAsLeader:       creating, | ||||||
|  | 			EffectiveSDKVersion: c.effectiveSDKVersion, | ||||||
| 		}); err != nil { | 		}); err != nil { | ||||||
| 			return err | 			return err | ||||||
| 		} | 		} | ||||||
| @@ -309,7 +310,6 @@ func (c *Core) setupRaftActiveNode(ctx context.Context) error { | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	c.logger.Info("starting raft active node") | 	c.logger.Info("starting raft active node") | ||||||
| 	raftBackend.SetEffectiveSDKVersion(c.effectiveSDKVersion) |  | ||||||
|  |  | ||||||
| 	c.pendingRaftPeers = &sync.Map{} | 	c.pendingRaftPeers = &sync.Map{} | ||||||
|  |  | ||||||
|   | |||||||
| @@ -102,6 +102,15 @@ func (s *forwardedRequestRPCServer) Echo(ctx context.Context, in *EchoRequest) ( | |||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if in.RaftAppliedIndex > 0 && len(in.RaftNodeID) > 0 && s.raftFollowerStates != nil { | 	if in.RaftAppliedIndex > 0 && len(in.RaftNodeID) > 0 && s.raftFollowerStates != nil { | ||||||
|  | 		s.core.logger.Trace("forwarding RPC: echo received", | ||||||
|  | 			"node_id", in.RaftNodeID, | ||||||
|  | 			"applied_index", in.RaftAppliedIndex, | ||||||
|  | 			"term", in.RaftTerm, | ||||||
|  | 			"desired_suffrage", in.RaftDesiredSuffrage, | ||||||
|  | 			"sdk_version", in.SdkVersion, | ||||||
|  | 			"upgrade_version", in.RaftUpgradeVersion, | ||||||
|  | 			"redundancy_zone", in.RaftRedundancyZone) | ||||||
|  |  | ||||||
| 		s.raftFollowerStates.Update(&raft.EchoRequestUpdate{ | 		s.raftFollowerStates.Update(&raft.EchoRequestUpdate{ | ||||||
| 			NodeID:          in.RaftNodeID, | 			NodeID:          in.RaftNodeID, | ||||||
| 			AppliedIndex:    in.RaftAppliedIndex, | 			AppliedIndex:    in.RaftAppliedIndex, | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Paul Banks
					Paul Banks