diff --git a/enos/enos-modules.hcl b/enos/enos-modules.hcl index a5eac890b9..817d97e073 100644 --- a/enos/enos-modules.hcl +++ b/enos/enos-modules.hcl @@ -61,6 +61,10 @@ module "shutdown_node" { source = "./modules/shutdown_node" } +module "shutdown_multiple_nodes" { + source = "./modules/shutdown_multiple_nodes" +} + module "vault_agent" { source = "./modules/vault_agent" @@ -188,3 +192,8 @@ module "vault_verify_write_data" { vault_install_dir = var.vault_install_dir vault_instance_count = var.vault_instance_count } + +module "vault_raft_remove_peer" { + source = "./modules/vault_raft_remove_peer" + vault_install_dir = var.vault_install_dir +} diff --git a/enos/enos-scenario-autopilot.hcl b/enos/enos-scenario-autopilot.hcl index c0fa9bf1a1..9b5283301a 100644 --- a/enos/enos-scenario-autopilot.hcl +++ b/enos/enos-scenario-autopilot.hcl @@ -1,12 +1,11 @@ scenario "autopilot" { matrix { - arch = ["amd64", "arm64"] - artifact_source = ["local", "crt", "artifactory"] - artifact_type = ["bundle", "package"] - distro = ["ubuntu", "rhel"] - edition = ["ent", "ent.fips1402", "ent.hsm", "ent.hsm.fips1402"] - seal = ["awskms", "shamir"] - undo_logs_status = ["0", "1"] + arch = ["amd64", "arm64"] + artifact_source = ["local", "crt", "artifactory"] + artifact_type = ["bundle", "package"] + distro = ["ubuntu", "rhel"] + edition = ["ent", "ent.fips1402", "ent.hsm", "ent.hsm.fips1402"] + seal = ["awskms", "shamir"] # Packages are not offered for the oss, ent.fips1402, and ent.hsm.fips1402 editions exclude { @@ -46,8 +45,6 @@ scenario "autopilot" { arm64 = "t4g.small" } - enable_undo_logs = matrix.undo_logs_status == "1" && semverconstraint(var.vault_product_version, ">=1.13.0-0") ? true : false - vault_instance_type = coalesce(var.vault_instance_type, local.vault_instance_types[matrix.arch]) vault_license_path = abspath(var.vault_license_path != null ? 
var.vault_license_path : joinpath(path.root, "./support/vault.hclic")) vault_install_dir_packages = { @@ -221,28 +218,9 @@ scenario "autopilot" { vault_unseal_when_no_init = matrix.seal == "shamir" vault_unseal_keys = matrix.seal == "shamir" ? step.create_vault_cluster.vault_unseal_keys_hex : null vpc_id = step.create_vpc.vpc_id - vault_environment = { "VAULT_REPLICATION_USE_UNDO_LOGS" : local.enable_undo_logs } - } - } - - step "get_updated_vault_cluster_ips" { - module = module.vault_get_cluster_ips - depends_on = [ - step.create_vault_cluster, - step.get_vault_cluster_ips, - step.upgrade_vault_cluster_with_autopilot - ] - - providers = { - enos = local.enos_provider[matrix.distro] - } - - variables { - vault_instances = step.create_vault_cluster.vault_instances - vault_install_dir = local.vault_install_dir - added_vault_instances = step.upgrade_vault_cluster_with_autopilot.vault_instances - vault_root_token = step.create_vault_cluster.vault_root_token - node_public_ip = step.get_vault_cluster_ips.leader_public_ip + vault_environment = { + "VAULT_LOG_LEVEL" : "debug" + } } } @@ -281,6 +259,47 @@ scenario "autopilot" { } } + step "verify_autopilot_await_server_removal_state" { + module = module.vault_verify_autopilot + depends_on = [ + step.upgrade_vault_cluster_with_autopilot, + step.verify_raft_auto_join_voter + ] + + providers = { + enos = local.enos_provider[matrix.distro] + } + + variables { + vault_autopilot_upgrade_version = matrix.artifact_source == "local" ? 
step.get_local_metadata.version : var.vault_product_version + vault_autopilot_upgrade_status = "await-server-removal" + vault_install_dir = local.vault_install_dir + vault_instances = step.upgrade_vault_cluster_with_autopilot.vault_instances + vault_root_token = step.create_vault_cluster.vault_root_token + } + } + + step "get_updated_vault_cluster_ips" { + module = module.vault_get_cluster_ips + depends_on = [ + step.create_vault_cluster, + step.get_vault_cluster_ips, + step.upgrade_vault_cluster_with_autopilot + ] + + providers = { + enos = local.enos_provider[matrix.distro] + } + + variables { + vault_instances = step.create_vault_cluster.vault_instances + vault_install_dir = local.vault_install_dir + added_vault_instances = step.upgrade_vault_cluster_with_autopilot.vault_instances + vault_root_token = step.create_vault_cluster.vault_root_token + node_public_ip = step.get_vault_cluster_ips.leader_public_ip + } + } + step "verify_read_test_data" { module = module.vault_verify_read_data depends_on = [ @@ -301,32 +320,12 @@ scenario "autopilot" { } } - step "verify_autopilot_upgraded_vault_cluster" { - module = module.vault_verify_autopilot + step "raft_remove_peers" { + module = module.vault_raft_remove_peer depends_on = [ + step.get_updated_vault_cluster_ips, step.upgrade_vault_cluster_with_autopilot, - step.verify_raft_auto_join_voter - ] - - providers = { - enos = local.enos_provider[matrix.distro] - } - - variables { - vault_autopilot_upgrade_version = matrix.artifact_source == "local" ? 
step.get_local_metadata.version : var.vault_product_version - vault_autopilot_upgrade_status = "await-server-removal" - vault_install_dir = local.vault_install_dir - vault_instances = step.create_vault_cluster.vault_instances - vault_root_token = step.create_vault_cluster.vault_root_token - } - } - - step "verify_undo_logs_status" { - skip_step = semverconstraint(var.vault_product_version, "<1.13.0-0") - module = module.vault_verify_undo_logs - depends_on = [ - step.upgrade_vault_cluster_with_autopilot, - step.verify_autopilot_upgraded_vault_cluster + step.verify_autopilot_await_server_removal_state ] providers = { @@ -335,12 +334,71 @@ scenario "autopilot" { variables { vault_install_dir = local.vault_install_dir - vault_undo_logs_status = matrix.undo_logs_status - vault_instances = step.upgrade_vault_cluster_with_autopilot.vault_instances + operator_instance = step.get_updated_vault_cluster_ips.leader_public_ip + remove_vault_instances = step.create_vault_cluster.vault_instances + vault_instance_count = 3 vault_root_token = step.create_vault_cluster.vault_root_token } } + step "remove_old_nodes" { + module = module.shutdown_multiple_nodes + depends_on = [ + step.create_vault_cluster, + step.raft_remove_peers + ] + + providers = { + enos = local.enos_provider[matrix.distro] + } + + variables { + old_vault_instances = step.create_vault_cluster.vault_instances + vault_instance_count = 3 + } + } + + step "verify_autopilot_idle_state" { + module = module.vault_verify_autopilot + depends_on = [ + step.upgrade_vault_cluster_with_autopilot, + step.verify_raft_auto_join_voter, + step.remove_old_nodes + ] + + providers = { + enos = local.enos_provider[matrix.distro] + } + + variables { + vault_autopilot_upgrade_version = matrix.artifact_source == "local" ? 
step.get_local_metadata.version : var.vault_product_version + vault_autopilot_upgrade_status = "idle" + vault_install_dir = local.vault_install_dir + vault_instances = step.upgrade_vault_cluster_with_autopilot.vault_instances + vault_root_token = step.create_vault_cluster.vault_root_token + } + } + + step "verify_undo_logs_status" { + skip_step = semverconstraint(var.vault_product_version, "<1.13.0-0") + module = module.vault_verify_undo_logs + depends_on = [ + step.remove_old_nodes, + step.upgrade_vault_cluster_with_autopilot, + step.verify_autopilot_idle_state + ] + + providers = { + enos = local.enos_provider[matrix.distro] + } + + variables { + vault_install_dir = local.vault_install_dir + vault_instances = step.upgrade_vault_cluster_with_autopilot.vault_instances + vault_root_token = step.create_vault_cluster.vault_root_token + } + } + output "vault_cluster_instance_ids" { description = "The Vault cluster instance IDs" value = step.create_vault_cluster.instance_ids diff --git a/enos/enos-variables.hcl b/enos/enos-variables.hcl index 64129e8606..cd246a459f 100644 --- a/enos/enos-variables.hcl +++ b/enos/enos-variables.hcl @@ -165,3 +165,17 @@ variable "vault_upgrade_initial_release" { version = "1.10.4" } } + +variable "operator_instance" { + type = string + description = "The ip address of the operator (Voter) node" +} + +variable "remove_vault_instances" { + type = map(object({ + private_ip = string + public_ip = string + })) + description = "The old vault nodes to be removed" +} + diff --git a/enos/modules/shutdown_multiple_nodes/main.tf b/enos/modules/shutdown_multiple_nodes/main.tf new file mode 100644 index 0000000000..df0d1fb876 --- /dev/null +++ b/enos/modules/shutdown_multiple_nodes/main.tf @@ -0,0 +1,40 @@ +terraform { + required_providers { + enos = { + source = "app.terraform.io/hashicorp-qti/enos" + } + } +} + +variable "vault_instance_count" { + type = number + description = "How many vault instances are in the cluster" +} + +variable 
"old_vault_instances" { + type = map(object({ + private_ip = string + public_ip = string + })) + description = "The vault cluster instances to be shutdown" +} + +locals { + public_ips = { + for idx in range(var.vault_instance_count) : idx => { + public_ip = values(var.old_vault_instances)[idx].public_ip + private_ip = values(var.old_vault_instances)[idx].private_ip + } + } +} + +resource "enos_remote_exec" "shutdown_multiple_nodes" { + for_each = local.public_ips + inline = ["sudo shutdown -H --no-wall; exit 0"] + + transport = { + ssh = { + host = each.value.public_ip + } + } +} diff --git a/enos/modules/vault_raft_remove_peer/main.tf b/enos/modules/vault_raft_remove_peer/main.tf new file mode 100644 index 0000000000..f1d33d9c03 --- /dev/null +++ b/enos/modules/vault_raft_remove_peer/main.tf @@ -0,0 +1,71 @@ +terraform { + required_providers { + enos = { + source = "app.terraform.io/hashicorp-qti/enos" + } + } +} + +variable "vault_cluster_addr_port" { + description = "The Raft cluster address port" + type = string + default = "8201" +} + +variable "vault_install_dir" { + type = string + description = "The directory where the Vault binary will be installed" +} + +variable "vault_instance_count" { + type = number + description = "How many vault instances are in the cluster" +} + +variable "operator_instance" { + type = string + description = "The ip address of the operator (Voter) node" +} + +variable "remove_vault_instances" { + type = map(object({ + private_ip = string + public_ip = string + })) + description = "The old vault nodes to be removed" +} + +variable "vault_root_token" { + type = string + description = "The vault root token" +} + +locals { + instances = { + for idx in range(var.vault_instance_count) : idx => { + public_ip = values(var.remove_vault_instances)[idx].public_ip + private_ip = values(var.remove_vault_instances)[idx].private_ip + } + } +} + +resource "enos_remote_exec" "vault_raft_remove_peer" { + for_each = local.instances + + environment = 
{ + VAULT_TOKEN = var.vault_root_token + VAULT_ADDR = "http://localhost:8200" + } + + content = templatefile("${path.module}/templates/raft-remove-peer.sh", { + remove_vault_cluster_addr = "${each.value.private_ip}:${var.vault_cluster_addr_port}" + vault_install_dir = var.vault_install_dir + vault_local_binary_path = "${var.vault_install_dir}/vault" + }) + + transport = { + ssh = { + host = var.operator_instance + } + } +} diff --git a/enos/modules/vault_raft_remove_peer/templates/raft-remove-peer.sh b/enos/modules/vault_raft_remove_peer/templates/raft-remove-peer.sh new file mode 100644 index 0000000000..6d967ee30d --- /dev/null +++ b/enos/modules/vault_raft_remove_peer/templates/raft-remove-peer.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +set -e + +binpath=${vault_install_dir}/vault + +node_addr=${remove_vault_cluster_addr} + +fail() { + echo "$1" 1>&2 + return 1 +} + +retry() { + local retries=$1 + shift + local count=0 + + until "$@"; do + exit=$? + wait=$((2 ** count)) + count=$((count + 1)) + if [ "$count" -lt "$retries" ]; then + sleep "$wait" + echo "retry $count" + else + return "$exit" + fi + done + + return 0 +} + +remove_peer() { + node_id=$($binpath operator raft list-peers -format json | jq -Mr --argjson expected "false" '.data.config.servers[] | select(.address=='\""$node_addr"\"') | select(.voter==$expected) | .node_id') + if [ "$?" 
!= "0" ];then + fail "failed to get node id of a non-voter node" + fi + + $binpath operator raft remove-peer "$node_id" +} + +test -x "$binpath" || fail "unable to locate vault binary at $binpath" + +# Retry a few times because it can take some time for things to settle after autopilot upgrade +retry 5 remove_peer diff --git a/enos/modules/vault_verify_undo_logs/main.tf b/enos/modules/vault_verify_undo_logs/main.tf index dcde07b967..c856ca4c92 100644 --- a/enos/modules/vault_verify_undo_logs/main.tf +++ b/enos/modules/vault_verify_undo_logs/main.tf @@ -29,12 +29,6 @@ variable "vault_root_token" { description = "The vault root token" } -variable "vault_undo_logs_status" { - type = string - description = "An integer either 0 or 1 which indicates whether undo_logs are disabled or enabled" - default = null -} - locals { public_ips = { for idx in range(var.vault_instance_count) : idx => { @@ -48,9 +42,8 @@ resource "enos_remote_exec" "smoke-verify-undo-logs" { for_each = local.public_ips environment = { - VAULT_TOKEN = var.vault_root_token - VAULT_ADDR = "http://localhost:8200" - VAULT_UNDO_LOGS_STATUS = var.vault_undo_logs_status + VAULT_TOKEN = var.vault_root_token + VAULT_ADDR = "http://localhost:8200" } scripts = [abspath("${path.module}/scripts/smoke-verify-undo-logs.sh")] diff --git a/enos/modules/vault_verify_undo_logs/scripts/smoke-verify-undo-logs.sh b/enos/modules/vault_verify_undo_logs/scripts/smoke-verify-undo-logs.sh index 8e72d1dec9..5761ea1d74 100644 --- a/enos/modules/vault_verify_undo_logs/scripts/smoke-verify-undo-logs.sh +++ b/enos/modules/vault_verify_undo_logs/scripts/smoke-verify-undo-logs.sh @@ -1,19 +1,18 @@ #!/bin/bash -undo_logs_status="${VAULT_UNDO_LOGS_STATUS}" - function fail() { echo "$1" 1>&2 exit 1 } count=0 -retries=7 +retries=20 while :; do - state=$(curl --header "X-Vault-Token: $VAULT_TOKEN" "$VAULT_ADDR/v1/sys/metrics" | jq -r '.Gauges[] | select(.Name == "vault.core.replication.write_undo_logs")') + leader_address=$(curl -H 
"X-Vault-Request: true" -H "X-Vault-Token: $VAULT_TOKEN" "$VAULT_ADDR/v1/sys/leader" | jq '.leader_address' | sed 's/\"//g') + state=$(curl --header "X-Vault-Token: $VAULT_TOKEN" "$leader_address/v1/sys/metrics" | jq -r '.Gauges[] | select(.Name == "vault.core.replication.write_undo_logs")') target_undo_logs_status="$(jq -r '.Value' <<< "$state")" - if [ "$undo_logs_status" = "$target_undo_logs_status" ]; then + if [ "$target_undo_logs_status" == "1" ]; then exit 0 fi