Prevent split-brain active node writes when using Consul (#23013)

* Add test to demonstrate a split-brain active node when using Consul

* Add Consul session check to prevent split-brain updates

* It's not right

Co-authored-by: Josh Black <raskchanky@gmail.com>

---------

Co-authored-by: Josh Black <raskchanky@gmail.com>
Authored by Paul Banks on 2023-09-22 16:16:01 +01:00, committed by GitHub
parent 1d61aeb8ae · commit 0fa36a36ae
13 changed files with 1140 additions and 96 deletions
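The Consul session check named in the commit title lives outside the test-helper hunks shown below. As a minimal sketch of the idea only, assuming the Consul Go API client (`github.com/hashicorp/consul/api`): before performing an active-node write, verify that the HA lock key is still held by our own session. `verifyHeld`, the lock key, and the session ID here are hypothetical illustrations, not the actual patch.

```go
package main

import (
	"fmt"

	capi "github.com/hashicorp/consul/api"
)

// verifyHeld fails if lockKey is no longer held by sessionID, e.g. because a
// network partition caused Consul to invalidate the session and another node
// to acquire the lock. A split-brain "active" node can use this to notice
// that it must step down instead of writing.
func verifyHeld(client *capi.Client, lockKey, sessionID string) error {
	pair, _, err := client.KV().Get(lockKey, nil)
	if err != nil {
		return err
	}
	if pair == nil || pair.Session != sessionID {
		return fmt.Errorf("lock %q is no longer held by session %q", lockKey, sessionID)
	}
	return nil
}

func main() {
	client, err := capi.NewClient(capi.DefaultConfig())
	if err != nil {
		panic(err)
	}
	// The key is meant to resemble Vault's HA lock key; the session ID would
	// come from the original lock acquisition.
	if err := verifyHeld(client, "vault/core/lock", "example-session-id"); err != nil {
		fmt.Println("stepping down:", err)
	}
}
```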


@@ -158,6 +158,20 @@ func NodeHealthy(ctx context.Context, cluster VaultCluster, nodeIdx int) error {
 }
 
 func LeaderNode(ctx context.Context, cluster VaultCluster) (int, error) {
+	// Be robust to multiple nodes thinking they are active. This is possible in
+	// certain network partition situations where the old leader has not
+	// discovered that it has lost leadership yet. In tests this is only likely
+	// to come up when we are specifically provoking it, but it could happen at
+	// any point if leadership flaps or connectivity suffers transient errors,
+	// so be robust against it. The best solution would be to have some sort of
+	// epoch like the raft term that is guaranteed to be monotonically
+	// increasing through elections, but we don't have that abstraction for all
+	// HABackends in general. The best we have is the ActiveTime. In a
+	// distributed systems textbook, relying on this would be bad because of
+	// clock sync issues etc., but for our tests it's likely fine: even if we
+	// are running separate Vault containers, they all share the same hardware
+	// clock in the system.
+	leaderActiveTimes := make(map[int]time.Time)
 	for i, node := range cluster.Nodes() {
 		client := node.APIClient()
 		ctx, cancel := context.WithTimeout(ctx, 100*time.Millisecond)
@@ -166,9 +180,23 @@ func LeaderNode(ctx context.Context, cluster VaultCluster) (int, error) {
 		if err != nil || resp == nil || !resp.IsSelf {
 			continue
 		}
-		return i, nil
+		leaderActiveTimes[i] = resp.ActiveTime
 	}
-	return -1, fmt.Errorf("no leader found")
+	if len(leaderActiveTimes) == 0 {
+		return -1, fmt.Errorf("no leader found")
+	}
+	// At least one node thinks it is active. If multiple do, pick the one with
+	// the most recent ActiveTime. Note that if there is only one, this simply
+	// returns it.
+	var newestLeaderIdx int
+	var newestActiveTime time.Time
+	for i, at := range leaderActiveTimes {
+		if at.After(newestActiveTime) {
+			newestActiveTime = at
+			newestLeaderIdx = i
+		}
+	}
+	return newestLeaderIdx, nil
 }
 
 func WaitForActiveNode(ctx context.Context, cluster VaultCluster) (int, error) {
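The comment above concedes that ActiveTime is a weak substitute for a real election epoch, but good enough when all test nodes share one hardware clock. A standalone illustration of the tie-break, with made-up timestamps and a hypothetical `newestClaim` helper:

```go
package main

import (
	"fmt"
	"time"
)

// newestClaim returns the node index with the latest claimed active time,
// or -1 if no node claims to be active.
func newestClaim(activeTimes map[int]time.Time) int {
	newestIdx := -1
	var newest time.Time
	for i, at := range activeTimes {
		if at.After(newest) {
			newest = at
			newestIdx = i
		}
	}
	return newestIdx
}

func main() {
	base := time.Now()
	claims := map[int]time.Time{
		0: base.Add(-30 * time.Second), // stale leader that has not yet noticed the new election
		2: base,                        // newly elected leader
	}
	fmt.Println(newestClaim(claims)) // prints 2
}
```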
@@ -189,7 +217,8 @@ func WaitForActiveNodeAndPerfStandbys(ctx context.Context, cluster VaultCluster)
 	// A sleep before calling WaitForActiveNodeAndPerfStandbys seems to sort
 	// things out, but so apparently does this. We should be able to eliminate
 	// this call to WaitForActiveNode by reworking the logic in this method.
-	if _, err := WaitForActiveNode(ctx, cluster); err != nil {
+	leaderIdx, err := WaitForActiveNode(ctx, cluster)
+	if err != nil {
 		return err
 	}
@@ -203,7 +232,7 @@ func WaitForActiveNodeAndPerfStandbys(ctx context.Context, cluster VaultCluster)
 	if err != nil {
 		return err
 	}
-	leaderClient := cluster.Nodes()[0].APIClient()
+	leaderClient := cluster.Nodes()[leaderIdx].APIClient()
 
 	for ctx.Err() == nil {
 		err = leaderClient.Sys().MountWithContext(ctx, mountPoint, &api.MountInput{
@@ -244,6 +273,7 @@ func WaitForActiveNodeAndPerfStandbys(ctx context.Context, cluster VaultCluster)
 			var leader *api.LeaderResponse
 			leader, err = client.Sys().LeaderWithContext(ctx)
 			if err != nil {
+				logger.Trace("waiting for core", "core", coreNo, "err", err)
 				continue
 			}
 			switch {
@@ -261,6 +291,12 @@ func WaitForActiveNodeAndPerfStandbys(ctx context.Context, cluster VaultCluster)
 					atomic.AddInt64(&standbys, 1)
 					return
 				}
+			default:
+				logger.Trace("waiting for core", "core", coreNo,
+					"ha_enabled", leader.HAEnabled,
+					"is_self", leader.IsSelf,
+					"perf_standby", leader.PerfStandby,
+					"perf_standby_remote_wal", leader.PerfStandbyLastRemoteWAL)
 			}
 		}
 	}(i)