From f625f506edcfc0bf072c2b4f3c6bab8380506d09 Mon Sep 17 00:00:00 2001 From: Josh Black Date: Thu, 9 Jan 2025 11:58:29 -0800 Subject: [PATCH] CE changes for vault-31750 (#29303) * ce changes for vault-31750 * add changelog * make proto * refactor naming * clarify error message * update changelog * one more time * make proto AGAIN --- changelog/29303.txt | 3 +++ physical/raft/raft.go | 22 ++++++++++++++++++++-- physical/raft/raft_autopilot.go | 4 ++++ sdk/plugin/pb/backend.pb.go | 8 ++++---- vault/logical_system_raft.go | 7 +++++++ 5 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 changelog/29303.txt diff --git a/changelog/29303.txt b/changelog/29303.txt new file mode 100644 index 0000000000..c71765c2f4 --- /dev/null +++ b/changelog/29303.txt @@ -0,0 +1,3 @@ +```release-note:change +core (enterprise): Add tracking of performance standbys by their HA node ID so that RPC connections can be more easily cleaned up when nodes are removed. +``` diff --git a/physical/raft/raft.go b/physical/raft/raft.go index 9010270c3f..b68cc852ef 100644 --- a/physical/raft/raft.go +++ b/physical/raft/raft.go @@ -256,8 +256,9 @@ type RaftBackend struct { // limits. specialPathLimits map[string]uint64 - removed *atomic.Bool - removedCallback func() + removed *atomic.Bool + removedCallback func() + removedServerCleanup func(context.Context, string) (bool, error) } func (b *RaftBackend) IsNodeRemoved(ctx context.Context, nodeID string) (bool, error) { @@ -284,6 +285,23 @@ func (b *RaftBackend) RemoveSelf() error { return b.stableStore.SetUint64(removedKey, 1) } +func (b *RaftBackend) SetRemovedServerCleanupFunc(f func(context.Context, string) (bool, error)) { + b.l.Lock() + b.removedServerCleanup = f + b.l.Unlock() +} + +func (b *RaftBackend) RemovedServerCleanup(ctx context.Context, nodeID string) (bool, error) { + b.l.RLock() + defer b.l.RUnlock() + + if b.removedServerCleanup != nil { + return b.removedServerCleanup(ctx, nodeID) + } + + return false, nil +} + // LeaderJoinInfo contains information required by a node to join itself as a // follower to an existing raft cluster type LeaderJoinInfo struct { diff --git a/physical/raft/raft_autopilot.go b/physical/raft/raft_autopilot.go index fb17283e66..ba2744bd80 100644 --- a/physical/raft/raft_autopilot.go +++ b/physical/raft/raft_autopilot.go @@ -666,6 +666,10 @@ func (d *Delegate) RemoveFailedServer(server *autopilot.Server) { } d.followerStates.Delete(string(server.ID)) + _, err := d.RemovedServerCleanup(context.Background(), string(server.ID)) + if err != nil { + d.logger.Error("failed to run cleanup", "error", err) + } }() } diff --git a/sdk/plugin/pb/backend.pb.go b/sdk/plugin/pb/backend.pb.go index 3461638800..1216138d3b 100644 --- a/sdk/plugin/pb/backend.pb.go +++ b/sdk/plugin/pb/backend.pb.go @@ -3761,10 +3761,10 @@ var File_sdk_plugin_pb_backend_proto protoreflect.FileDescriptor var file_sdk_plugin_pb_backend_proto_rawDesc = []byte{ 0x0a, 0x1b, 0x73, 0x64, 0x6b, 0x2f, 0x70, 0x6c, 0x75, 0x67, 0x69, 0x6e, 0x2f, 0x70, 0x62, 0x2f, 0x62, 0x61, 0x63, 0x6b, 0x65, 0x6e, 0x64, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x12, 0x02, 0x70, - 0x62, 0x1a, 0x1f, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, - 0x75, 0x66, 0x2f, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x2e, 0x70, 0x72, 0x6f, - 0x74, 0x6f, 0x1a, 0x1c, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, - 0x62, 0x75, 0x66, 0x2f, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, + 0x62, 0x1a, 0x1c, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, + 0x75, 0x66, 0x2f, 0x73, 0x74, 0x72, 0x75, 0x63, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x1a, + 0x1f, 0x67, 0x6f, 0x6f, 0x67, 0x6c, 0x65, 0x2f, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x62, 0x75, 0x66, + 0x2f, 0x74, 0x69, 0x6d, 0x65, 0x73, 0x74, 0x61, 0x6d, 0x70, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x1a, 0x17, 0x73, 0x64, 0x6b, 0x2f, 0x6c, 0x6f, 0x67, 0x69, 0x63, 0x61, 0x6c, 0x2f, 0x65, 0x76, 0x65, 0x6e, 0x74, 0x2e, 0x70, 0x72, 0x6f, 0x74, 0x6f, 0x1a, 0x1a, 0x73, 0x64, 0x6b, 0x2f, 0x6c, 0x6f, 0x67, 0x69, 0x63, 0x61, 0x6c, 0x2f, 0x69, 0x64, 0x65, 0x6e, 0x74, 0x69, 0x74, 0x79, 0x2e, diff --git a/vault/logical_system_raft.go b/vault/logical_system_raft.go index 32576cbb6a..3576c47f0d 100644 --- a/vault/logical_system_raft.go +++ b/vault/logical_system_raft.go @@ -268,6 +268,13 @@ func (b *SystemBackend) handleRaftRemovePeerUpdate() framework.OperationFunc { } b.Core.raftFollowerStates.Delete(serverID) + _, err := raftBackend.RemovedServerCleanup(ctx, serverID) + if err != nil { + // log the error but don't return it - we might get an error if we can't find the node in the cache, which + // is not an error condition in this instance. + b.logger.Info("attempted to remove node from perf standby cache but it failed, which might be fine", "server ID", serverID, "error", err) + return nil, nil + } return nil, nil }