VAULT-30819: verify DR secondary leader before unsealing followers (#28459)

* VAULT-30819: verify DR secondary leader before unsealing followers

After we've enabled DR replication on the secondary leader, the existing
cluster followers will be resealed with the primary cluster's encryption
keys. We have to unseal the followers to make them available. To ensure
that we take every precaution before attempting to unseal the followers,
we now verify that the secondary leader is the cluster leader, has a
valid Merkle tree, and is streaming WALs from the primary cluster before
we attempt to unseal the secondary followers.

Signed-off-by: Ryan Cragun <me@ryan.ec>
This commit is contained in:
Ryan Cragun
2024-09-24 09:13:40 -06:00
committed by GitHub
parent 12f03b073a
commit c8c51b1b9d
11 changed files with 125 additions and 10 deletions

View File

@@ -815,7 +815,7 @@ scenario "dev_pr_replication" {
Depending on how we're configured we'll pass the unseal keys according to this guide:
https://developer.hashicorp.com/vault/docs/enterprise/replication#seals
EOF
module = module.vault_unseal_nodes
module = module.vault_unseal_replication_followers
depends_on = [
step.create_primary_cluster,
step.create_secondary_cluster,

View File

@@ -256,8 +256,8 @@ module "vault_test_ui" {
ui_run_tests = var.ui_run_tests
}
module "vault_unseal_nodes" {
source = "./modules/vault_unseal_nodes"
module "vault_unseal_replication_followers" {
source = "./modules/vault_unseal_replication_followers"
vault_install_dir = var.vault_install_dir
}

View File

@@ -814,7 +814,11 @@ scenario "dr_replication" {
enos = local.enos_provider[matrix.distro]
}
verifies = quality.vault_api_sys_replication_dr_secondary_enable_write
verifies = [
quality.vault_api_sys_leader_read,
quality.vault_api_sys_replication_dr_secondary_enable_write,
quality.vault_api_sys_replication_dr_status_read,
]
variables {
ip_version = matrix.ip_version
@@ -834,7 +838,7 @@ scenario "dr_replication" {
type combinations. See the guide for more information:
https://developer.hashicorp.com/vault/docs/enterprise/replication#seals
EOF
module = module.vault_unseal_nodes
module = module.vault_unseal_replication_followers
depends_on = [
step.configure_dr_replication_secondary
]
@@ -883,7 +887,11 @@ scenario "dr_replication" {
and ensuring that all secondary nodes are unsealed.
EOF
module = module.vault_verify_dr_replication
depends_on = [step.configure_dr_replication_secondary]
depends_on = [
step.configure_dr_replication_secondary,
step.unseal_secondary_followers,
step.verify_secondary_cluster_is_unsealed_after_enabling_replication,
]
providers = {
enos = local.enos_provider[matrix.distro]

View File

@@ -820,7 +820,7 @@ scenario "pr_replication" {
type combinations. See the guide for more information:
https://developer.hashicorp.com/vault/docs/enterprise/replication#seals
EOF
module = module.vault_unseal_nodes
module = module.vault_unseal_replication_followers
depends_on = [
step.create_primary_cluster,
step.create_secondary_cluster,

View File

@@ -42,14 +42,16 @@ variable "vault_root_token" {
type = string
description = "The vault root token"
}
resource "enos_remote_exec" "configure_dr_primary" {
// Enable DR replication on the primary. This will immediately clear all data in the secondary.
resource "enos_remote_exec" "enable_dr_replication" {
environment = {
VAULT_ADDR = var.vault_addr
VAULT_TOKEN = var.vault_root_token
VAULT_INSTALL_DIR = var.vault_install_dir
}
scripts = [abspath("${path.module}/scripts/configure-vault-dr-primary.sh")]
scripts = [abspath("${path.module}/scripts/enable.sh")]
transport = {
ssh = {

View File

@@ -58,7 +58,7 @@ variable "wrapping_token" {
description = "The wrapping token created on primary cluster"
}
resource "enos_remote_exec" "configure_pr_secondary" {
resource "enos_remote_exec" "enable_replication" {
environment = {
VAULT_ADDR = var.vault_addr
VAULT_TOKEN = var.vault_root_token
@@ -72,3 +72,43 @@ resource "enos_remote_exec" "configure_pr_secondary" {
}
}
}
// Wait for our primary host to be the "leader", which means it's running and all "setup" tasks
// have been completed. We'll have to unseal our follower nodes after this has occurred.
module "wait_for_leader" {
source = "../vault_wait_for_leader"
// Do not start waiting until the enable_replication remote exec has finished.
depends_on = [
enos_remote_exec.enable_replication
]
// Single-entry map: only the secondary leader host is passed to the wait module.
hosts = { "0" : var.secondary_leader_host }
ip_version = var.ip_version
vault_addr = var.vault_addr
vault_install_dir = var.vault_install_dir
vault_root_token = var.vault_root_token
}
// Ensure that our leader is ready for us to unseal follower nodes. This runs
// scripts/wait-for-leader-ready.sh on the secondary leader host over SSH.
resource "enos_remote_exec" "wait_for_leader_ready" {
// Only run once the wait_for_leader module has completed.
depends_on = [
module.wait_for_leader,
]
// Environment consumed by wait-for-leader-ready.sh; the script fails fast if any
// of these is empty.
environment = {
REPLICATION_TYPE = var.replication_type
RETRY_INTERVAL = 3 // seconds
TIMEOUT_SECONDS = 60 // seconds
VAULT_ADDR = var.vault_addr
VAULT_TOKEN = var.vault_root_token
VAULT_INSTALL_DIR = var.vault_install_dir
}
scripts = [abspath("${path.module}/scripts/wait-for-leader-ready.sh")]
transport = {
ssh = {
// The script is executed on the secondary leader's public address.
host = var.secondary_leader_host.public_ip
}
}
}

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
# Abort the script as soon as any unguarded command fails.
set -o errexit

# fail MESSAGE
# Writes MESSAGE to stderr and returns a non-zero status. Because errexit is
# enabled, an unguarded call to fail terminates the script.
fail() {
  echo "$1" >&2
  return 1
}
# Every input below is required; bail out with a descriptive message if any is empty.
[[ -n "$REPLICATION_TYPE" ]] || fail "REPLICATION_TYPE env variable has not been set"
[[ -n "$RETRY_INTERVAL" ]] || fail "RETRY_INTERVAL env variable has not been set"
[[ -n "$TIMEOUT_SECONDS" ]] || fail "TIMEOUT_SECONDS env variable has not been set"
[[ -n "$VAULT_ADDR" ]] || fail "VAULT_ADDR env variable has not been set"
[[ -n "$VAULT_INSTALL_DIR" ]] || fail "VAULT_INSTALL_DIR env variable has not been set"
[[ -n "$VAULT_TOKEN" ]] || fail "VAULT_TOKEN env variable has not been set"

# The vault binary must exist and be executable before we try to call it.
binpath="${VAULT_INSTALL_DIR}/vault"
[[ -x "$binpath" ]] || fail "unable to locate vault binary at $binpath"

# Have the vault CLI emit JSON so that its output can be parsed with jq.
export VAULT_FORMAT=json
# replicationStatus
# Reads sys/replication/${REPLICATION_TYPE}/status with the vault CLI and prints
# the ".data" object of the JSON response on stdout.
#
# Returns non-zero if the vault read itself fails. The read is captured before it
# is handed to jq so that its exit status is not masked by the pipeline (without
# pipefail, a pipeline only reports the status of its last command).
replicationStatus() {
  local response
  if ! response=$("$binpath" read "sys/replication/${REPLICATION_TYPE}/status"); then
    return 1
  fi

  jq .data <<< "$response"
}
# isReady
# Checks the replication status reported by replicationStatus and decides whether
# it is safe to unseal the follower nodes. All of the following must hold:
#   - .state is "stream-wals"
#   - .mode is "secondary"
#   - .corrupted_merkle_tree is false
#
# Returns 0 when every check passes. Otherwise it writes the reason to stderr and
# returns 1 so that the caller can retry.
isReady() {
  local status
  if ! status=$(replicationStatus); then
    return 1
  fi

  # jq -e sets the exit status from the boolean result of the filter.
  if ! jq -eMc '.state == "stream-wals"' &> /dev/null <<< "$status"; then
    echo "${REPLICATION_TYPE} replication is not yet running, got state: $(jq '.state' <<< "$status")" 1>&2
    return 1
  fi

  if ! jq -eMc '.mode == "secondary"' &> /dev/null <<< "$status"; then
    echo "${REPLICATION_TYPE} replication mode is not yet secondary, got: $(jq '.mode' <<< "$status")" 1>&2
    return 1
  fi

  if ! jq -eMc '.corrupted_merkle_tree == false' &> /dev/null <<< "$status"; then
    echo "${REPLICATION_TYPE} replication merkle tree is corrupted" 1>&2
    return 1
  fi

  echo "${REPLICATION_TYPE} secondary leader is ready for followers to be unsealed!" 1>&2
  return 0
}
# Poll isReady every RETRY_INTERVAL seconds until it succeeds or until
# TIMEOUT_SECONDS have elapsed since we started.
deadline=$(($(date +%s) + TIMEOUT_SECONDS))
until (($(date +%s) >= deadline)); do
  isReady && exit 0
  sleep "$RETRY_INTERVAL"
done

# The deadline passed without a successful check; include the last status for debugging.
fail "Timed out waiting for ${REPLICATION_TYPE} primary to ready: $(replicationStatus)"