mirror of
				https://github.com/optim-enterprises-bv/vault.git
				synced 2025-10-31 18:48:08 +00:00 
			
		
		
		
	Fix AP upgrade version issue (#27277)
* Fix AP upgrade version issue * add heartbeat logging at trace level * add log to show when heartbeats resume * Test the plumbing * Revert "Test the plumbing" This reverts commit e25fcd83516cd8b5b0ca6b543be64049c6a45f51. * Add CHANGELOG * Add plumbing test * Update misleading comment --------- Co-authored-by: Josh Black <raskchanky@gmail.com>
This commit is contained in:
		
							
								
								
									
										4
									
								
								changelog/27277.txt
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										4
									
								
								changelog/27277.txt
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,4 @@ | ||||
| ```release-note:bug | ||||
| storage/raft (enterprise): Fix a regression introduced in 1.15.8 that causes | ||||
| autopilot to fail to discover new server versions and so not trigger an upgrade. | ||||
| ``` | ||||
| @@ -39,6 +39,7 @@ import ( | ||||
| 	"github.com/hashicorp/vault/sdk/logical" | ||||
| 	"github.com/hashicorp/vault/sdk/physical" | ||||
| 	"github.com/hashicorp/vault/vault/cluster" | ||||
| 	"github.com/hashicorp/vault/version" | ||||
| 	etcdbolt "go.etcd.io/bbolt" | ||||
| ) | ||||
|  | ||||
| @@ -582,6 +583,7 @@ func NewRaftBackend(conf map[string]string, logger log.Logger) (physical.Backend | ||||
| 		failGetInTxn:                  new(uint32), | ||||
| 		raftLogVerifierEnabled:        backendConfig.RaftLogVerifierEnabled, | ||||
| 		raftLogVerificationInterval:   backendConfig.RaftLogVerificationInterval, | ||||
| 		effectiveSDKVersion:           version.GetVersion().Version, | ||||
| 	}, nil | ||||
| } | ||||
|  | ||||
| @@ -660,12 +662,6 @@ func (b *RaftBackend) FailGetInTxn(fail bool) { | ||||
| 	atomic.StoreUint32(b.failGetInTxn, val) | ||||
| } | ||||
|  | ||||
| func (b *RaftBackend) SetEffectiveSDKVersion(sdkVersion string) { | ||||
| 	b.l.Lock() | ||||
| 	b.effectiveSDKVersion = sdkVersion | ||||
| 	b.l.Unlock() | ||||
| } | ||||
|  | ||||
| func (b *RaftBackend) RedundancyZone() string { | ||||
| 	b.l.RLock() | ||||
| 	defer b.l.RUnlock() | ||||
| @@ -1089,6 +1085,11 @@ type SetupOpts struct { | ||||
| 	// RecoveryModeConfig is the configuration for the raft cluster in recovery | ||||
| 	// mode. | ||||
| 	RecoveryModeConfig *raft.Configuration | ||||
|  | ||||
| 	// EffectiveSDKVersion is typically the version string baked into the binary. | ||||
| 	// We pass it in though because it can be overridden in tests or via ENV in | ||||
| 	// core. | ||||
| 	EffectiveSDKVersion string | ||||
| } | ||||
|  | ||||
| func (b *RaftBackend) StartRecoveryCluster(ctx context.Context, peer Peer) error { | ||||
| @@ -1132,6 +1133,11 @@ func (b *RaftBackend) SetupCluster(ctx context.Context, opts SetupOpts) error { | ||||
| 		return errors.New("no local node id configured") | ||||
| 	} | ||||
|  | ||||
| 	if opts.EffectiveSDKVersion != "" { | ||||
| 		// Override the SDK version | ||||
| 		b.effectiveSDKVersion = opts.EffectiveSDKVersion | ||||
| 	} | ||||
|  | ||||
| 	// Setup the raft config | ||||
| 	raftConfig := raft.DefaultConfig() | ||||
| 	if err := b.applyConfigSettings(raftConfig); err != nil { | ||||
|   | ||||
| @@ -421,8 +421,11 @@ func (d *Delegate) KnownServers() map[raft.ServerID]*autopilot.Server { | ||||
| 			} | ||||
| 			followerVersion = leaderVersion | ||||
| 		} else { | ||||
| 			if _, ok := d.emptyVersionLogs[currentServerID]; ok { | ||||
| 				d.logger.Trace("received non-empty version in heartbeat state. no longer need to fake it", "id", id, "update_version", followerVersion) | ||||
| 				delete(d.emptyVersionLogs, currentServerID) | ||||
| 			} | ||||
| 		} | ||||
| 		d.dl.Unlock() | ||||
|  | ||||
| 		server := &autopilot.Server{ | ||||
|   | ||||
| @@ -41,6 +41,41 @@ func TestRaft_Autopilot_Disable(t *testing.T) { | ||||
| 	require.Nil(t, nil, state) | ||||
| } | ||||
|  | ||||
| // TestRaft_Autopilot_BinaryVersionPlumbing is an apparently trivial test that | ||||
| // ensures that the default plumbing in Vault core to configure the binary | ||||
| // version of the raft library is working. Hopefully this will trivially pass | ||||
| // from now on, however it would have caught a regression in the past! | ||||
| func TestRaft_Autopilot_BinaryVersionPlumbing(t *testing.T) { | ||||
| 	t.Parallel() | ||||
|  | ||||
| 	coreCfg, clusterOpts := raftClusterBuilder(t, &RaftClusterOpts{ | ||||
| 		EnableAutopilot: true, | ||||
| 		// We need 2 nodes because the code path that regressed was different on a | ||||
| 		// standby vs active node so we'd not detect the problem if we only test on | ||||
| 		// an active node. | ||||
| 		NumCores: 2, | ||||
| 	}) | ||||
|  | ||||
| 	// Default options should not set EffectiveSDKVersion(Map) which would defeat | ||||
| 	// the point of this test by plumbing versions via config. | ||||
| 	require.Nil(t, clusterOpts.EffectiveSDKVersionMap) | ||||
| 	require.Empty(t, coreCfg.EffectiveSDKVersion) | ||||
|  | ||||
| 	c := vault.NewTestCluster(t, coreCfg, &clusterOpts) | ||||
| 	defer c.Cleanup() | ||||
|  | ||||
| 	// Wait for follower to be perf standby (in Ent, in CE it will only wait for | ||||
| 	// active). In either Enterprise or CE case, this should pass if we've plumbed | ||||
| 	// the version correctly so it's valuable for both. | ||||
| 	testhelpers.WaitForActiveNodeAndStandbys(t, c) | ||||
| 	for _, core := range c.Cores { | ||||
| 		be := core.UnderlyingRawStorage.(*raft.RaftBackend) | ||||
| 		require.Equal(t, version.GetVersion().Version, be.UpgradeVersion(), | ||||
| 			"expected raft upgrade version to default to Vault version for core %q", | ||||
| 			core.NodeID) | ||||
| 	} | ||||
| } | ||||
|  | ||||
| // TestRaft_Autopilot_Stabilization_And_State verifies that nodes get promoted | ||||
| // to be voters after the stabilization time has elapsed.  Also checks that | ||||
| // the autopilot state is Healthy once all nodes are available. | ||||
|   | ||||
| @@ -152,6 +152,7 @@ func (c *Core) startRaftBackend(ctx context.Context) (retErr error) { | ||||
| 			TLSKeyring:          raftTLS, | ||||
| 			ClusterListener:     c.getClusterListener(), | ||||
| 			StartAsLeader:       creating, | ||||
| 			EffectiveSDKVersion: c.effectiveSDKVersion, | ||||
| 		}); err != nil { | ||||
| 			return err | ||||
| 		} | ||||
| @@ -309,7 +310,6 @@ func (c *Core) setupRaftActiveNode(ctx context.Context) error { | ||||
| 	} | ||||
|  | ||||
| 	c.logger.Info("starting raft active node") | ||||
| 	raftBackend.SetEffectiveSDKVersion(c.effectiveSDKVersion) | ||||
|  | ||||
| 	c.pendingRaftPeers = &sync.Map{} | ||||
|  | ||||
|   | ||||
| @@ -102,6 +102,15 @@ func (s *forwardedRequestRPCServer) Echo(ctx context.Context, in *EchoRequest) ( | ||||
| 	} | ||||
|  | ||||
| 	if in.RaftAppliedIndex > 0 && len(in.RaftNodeID) > 0 && s.raftFollowerStates != nil { | ||||
| 		s.core.logger.Trace("forwarding RPC: echo received", | ||||
| 			"node_id", in.RaftNodeID, | ||||
| 			"applied_index", in.RaftAppliedIndex, | ||||
| 			"term", in.RaftTerm, | ||||
| 			"desired_suffrage", in.RaftDesiredSuffrage, | ||||
| 			"sdk_version", in.SdkVersion, | ||||
| 			"upgrade_version", in.RaftUpgradeVersion, | ||||
| 			"redundancy_zone", in.RaftRedundancyZone) | ||||
|  | ||||
| 		s.raftFollowerStates.Update(&raft.EchoRequestUpdate{ | ||||
| 			NodeID:          in.RaftNodeID, | ||||
| 			AppliedIndex:    in.RaftAppliedIndex, | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Paul Banks
					Paul Banks