Prevent split-brain active node writes when using Consul (#23013)

* Add test to demonstrate a split-brain active node when using Consul

* Add Consul session check to prevent split-brain updates

* It's not right

Co-authored-by: Josh Black <raskchanky@gmail.com>

---------

Co-authored-by: Josh Black <raskchanky@gmail.com>
Authored by Paul Banks on 2023-09-22 16:16:01 +01:00, committed by GitHub
parent 1d61aeb8ae · commit 0fa36a36ae
13 changed files with 1140 additions and 96 deletions
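The Consul session check named in the commit title lives outside the test-helper hunks shown below. As a minimal sketch of the idea only, assuming the Consul Go API client (`github.com/hashicorp/consul/api`): before performing an active-node write, verify that the HA lock key is still held by our own session. `verifyHeld`, the lock key, and the session ID here are hypothetical illustrations, not the actual patch.

```go
package main

import (
	"fmt"

	capi "github.com/hashicorp/consul/api"
)

// verifyHeld fails if lockKey is no longer held by sessionID, e.g. because a
// network partition caused Consul to invalidate the session and another node
// to acquire the lock. A split-brain "active" node can use this to notice
// that it must step down instead of writing.
func verifyHeld(client *capi.Client, lockKey, sessionID string) error {
	pair, _, err := client.KV().Get(lockKey, nil)
	if err != nil {
		return err
	}
	if pair == nil || pair.Session != sessionID {
		return fmt.Errorf("lock %q is no longer held by session %q", lockKey, sessionID)
	}
	return nil
}

func main() {
	client, err := capi.NewClient(capi.DefaultConfig())
	if err != nil {
		panic(err)
	}
	// The key is meant to resemble Vault's HA lock key; the session ID would
	// come from the original lock acquisition.
	if err := verifyHeld(client, "vault/core/lock", "example-session-id"); err != nil {
		fmt.Println("stepping down:", err)
	}
}
```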


@@ -158,6 +158,20 @@ func NodeHealthy(ctx context.Context, cluster VaultCluster, nodeIdx int) error {
 }
 
 func LeaderNode(ctx context.Context, cluster VaultCluster) (int, error) {
+	// Be robust to multiple nodes thinking they are active. This is possible in
+	// certain network partition situations where the old leader has not
+	// discovered that it has lost leadership yet. In tests this is only likely
+	// to come up when we are specifically provoking it, but it could happen at
+	// any point if leadership flaps or connectivity suffers transient errors,
+	// so be robust against it. The best solution would be to have some sort of
+	// epoch like the raft term that is guaranteed to be monotonically
+	// increasing through elections, but we don't have that abstraction for all
+	// HABackends in general. The best we have is the ActiveTime. In a
+	// distributed systems textbook, relying on this would be bad because of
+	// clock sync issues etc., but for our tests it's likely fine: even if we
+	// are running separate Vault containers, they all share the same hardware
+	// clock in the system.
+	leaderActiveTimes := make(map[int]time.Time)
 	for i, node := range cluster.Nodes() {
 		client := node.APIClient()
 		ctx, cancel := context.WithTimeout(ctx, 100*time.Millisecond)
@@ -166,9 +180,23 @@ func LeaderNode(ctx context.Context, cluster VaultCluster) (int, error) {
 		if err != nil || resp == nil || !resp.IsSelf {
 			continue
 		}
-		return i, nil
+		leaderActiveTimes[i] = resp.ActiveTime
 	}
-	return -1, fmt.Errorf("no leader found")
+	if len(leaderActiveTimes) == 0 {
+		return -1, fmt.Errorf("no leader found")
+	}
+	// At least one node thinks it is active. If multiple do, pick the one with
+	// the most recent ActiveTime. Note that if there is only one, this simply
+	// returns it.
+	var newestLeaderIdx int
+	var newestActiveTime time.Time
+	for i, at := range leaderActiveTimes {
+		if at.After(newestActiveTime) {
+			newestActiveTime = at
+			newestLeaderIdx = i
+		}
+	}
+	return newestLeaderIdx, nil
 }
 
 func WaitForActiveNode(ctx context.Context, cluster VaultCluster) (int, error) {
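The comment above concedes that ActiveTime is a weak substitute for a real election epoch, but good enough when all test nodes share one hardware clock. A standalone illustration of the tie-break, with made-up timestamps and a hypothetical `newestClaim` helper:

```go
package main

import (
	"fmt"
	"time"
)

// newestClaim returns the node index with the latest claimed active time,
// or -1 if no node claims to be active.
func newestClaim(activeTimes map[int]time.Time) int {
	newestIdx := -1
	var newest time.Time
	for i, at := range activeTimes {
		if at.After(newest) {
			newest = at
			newestIdx = i
		}
	}
	return newestIdx
}

func main() {
	base := time.Now()
	claims := map[int]time.Time{
		0: base.Add(-30 * time.Second), // stale leader that has not yet noticed the new election
		2: base,                        // newly elected leader
	}
	fmt.Println(newestClaim(claims)) // prints 2
}
```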
@@ -189,7 +217,8 @@ func WaitForActiveNodeAndPerfStandbys(ctx context.Context, cluster VaultCluster)
 	// A sleep before calling WaitForActiveNodeAndPerfStandbys seems to sort
 	// things out, but so apparently does this. We should be able to eliminate
 	// this call to WaitForActiveNode by reworking the logic in this method.
-	if _, err := WaitForActiveNode(ctx, cluster); err != nil {
+	leaderIdx, err := WaitForActiveNode(ctx, cluster)
+	if err != nil {
 		return err
 	}
@@ -203,7 +232,7 @@ func WaitForActiveNodeAndPerfStandbys(ctx context.Context, cluster VaultCluster)
 	if err != nil {
 		return err
 	}
-	leaderClient := cluster.Nodes()[0].APIClient()
+	leaderClient := cluster.Nodes()[leaderIdx].APIClient()
 
 	for ctx.Err() == nil {
 		err = leaderClient.Sys().MountWithContext(ctx, mountPoint, &api.MountInput{
@@ -244,6 +273,7 @@ func WaitForActiveNodeAndPerfStandbys(ctx context.Context, cluster VaultCluster)
 			var leader *api.LeaderResponse
 			leader, err = client.Sys().LeaderWithContext(ctx)
 			if err != nil {
+				logger.Trace("waiting for core", "core", coreNo, "err", err)
 				continue
 			}
 			switch {
@@ -261,6 +291,12 @@ func WaitForActiveNodeAndPerfStandbys(ctx context.Context, cluster VaultCluster)
 					atomic.AddInt64(&standbys, 1)
 					return
 				}
+			default:
+				logger.Trace("waiting for core", "core", coreNo,
+					"ha_enabled", leader.HAEnabled,
+					"is_self", leader.IsSelf,
+					"perf_standby", leader.PerfStandby,
+					"perf_standby_remote_wal", leader.PerfStandbyLastRemoteWAL)
 			}
 		}
 	}(i)