[VAULT-27917] fix(enos): handle SLES guestregister.service unreliability (#27380)

* [VAULT-27917] fix(enos): handle SLES guestregister.service unreliability

The SLES provided `guestregister.service` systemd unit is unreliable
enough that it will fail ~ 1/9 times when provisioning SLES instances.
When this happens the machine will never successfully exec SUSEConnect
to enroll and we'll get no access to the SLES repositories and
subsequently break our scenarios.

I resolved this by restructuring our `install_packages` module to
separate repository synchronization, repository addition, and package
installation into different scripts and resources and by adding special
case handling for SLES and the `guestregister.service`.

I also make a distinction between `dnf` and `yum` because while they are
sort of the same thing on RHEL, it is not the case with Amazon2. I also
shimmed out the rest of the support for Apt in case we ever need to add
repos there.

* Revert "Temporarily remove SLES from samples (#27378)"

This reverts commit 490cdd9066.

Signed-off-by: Ryan Cragun <me@ryan.ec>
This commit is contained in:
Ryan Cragun
2024-06-06 17:37:50 -06:00
committed by GitHub
parent 59320bb60b
commit 0513545dd8
7 changed files with 296 additions and 105 deletions

View File

@@ -97,7 +97,7 @@ sample "build_ce_linux_arm64_rpm" {
arch = ["arm64"]
artifact_source = ["crt"]
artifact_type = ["package"]
distro = ["amzn2", "rhel"] // temporarily remove SLES
distro = ["amzn2", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -107,7 +107,7 @@ sample "build_ce_linux_arm64_rpm" {
arch = ["arm64"]
artifact_source = ["crt"]
artifact_type = ["package"]
distro = ["amzn2", "rhel"] // temporarily remove SLES
distro = ["amzn2", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -117,7 +117,7 @@ sample "build_ce_linux_arm64_rpm" {
arch = ["arm64"]
artifact_source = ["crt"]
artifact_type = ["package"]
distro = ["amzn2", "rhel"] // temporarily remove SLES
distro = ["amzn2", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -127,7 +127,7 @@ sample "build_ce_linux_arm64_rpm" {
arch = ["arm64"]
artifact_source = ["crt"]
artifact_type = ["package"]
distro = ["amzn2", "rhel"] // temporarily remove SLES
distro = ["amzn2", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -141,7 +141,7 @@ sample "build_ce_linux_amd64_rpm" {
arch = ["amd64"]
artifact_source = ["crt"]
artifact_type = ["package"]
distro = ["amzn2", "leap", "rhel"] // temporarily remove SLES
distro = ["amzn2", "leap", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -151,7 +151,7 @@ sample "build_ce_linux_amd64_rpm" {
arch = ["amd64"]
artifact_source = ["crt"]
artifact_type = ["package"]
distro = ["amzn2", "leap", "rhel"] // temporarily remove SLES
distro = ["amzn2", "leap", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -161,7 +161,7 @@ sample "build_ce_linux_amd64_rpm" {
arch = ["amd64"]
artifact_source = ["crt"]
artifact_type = ["package"]
distro = ["amzn2", "leap", "rhel"] // temporarily remove SLES
distro = ["amzn2", "leap", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -171,7 +171,7 @@ sample "build_ce_linux_amd64_rpm" {
arch = ["amd64"]
artifact_source = ["crt"]
artifact_type = ["package"]
distro = ["amzn2", "leap", "rhel"] // temporarily remove SLES
distro = ["amzn2", "leap", "rhel", "sles"]
edition = ["ce"]
exclude {

View File

@@ -97,7 +97,7 @@ sample "release_ce_linux_arm64_rpm" {
arch = ["arm64"]
artifact_source = ["artifactory"]
artifact_type = ["package"]
distro = ["amzn2", "rhel"] // temporarily remove SLES
distro = ["amzn2", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -107,7 +107,7 @@ sample "release_ce_linux_arm64_rpm" {
arch = ["arm64"]
artifact_source = ["artifactory"]
artifact_type = ["package"]
distro = ["amzn2", "rhel"] // temporarily remove SLES
distro = ["amzn2", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -117,7 +117,7 @@ sample "release_ce_linux_arm64_rpm" {
arch = ["arm64"]
artifact_source = ["artifactory"]
artifact_type = ["package"]
distro = ["amzn2", "rhel"] // temporarily remove SLES
distro = ["amzn2", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -127,7 +127,7 @@ sample "release_ce_linux_arm64_rpm" {
arch = ["arm64"]
artifact_source = ["artifactory"]
artifact_type = ["package"]
distro = ["amzn2", "rhel"] // temporarily remove SLES
distro = ["amzn2", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -141,7 +141,7 @@ sample "release_ce_linux_amd64_rpm" {
arch = ["amd64"]
artifact_source = ["artifactory"]
artifact_type = ["package"]
distro = ["amzn2", "leap", "rhel"] // temporarily remove SLES
distro = ["amzn2", "leap", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -151,7 +151,7 @@ sample "release_ce_linux_amd64_rpm" {
arch = ["amd64"]
artifact_source = ["artifactory"]
artifact_type = ["package"]
distro = ["amzn2", "leap", "rhel"] // temporarily remove SLES
distro = ["amzn2", "leap", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -161,7 +161,7 @@ sample "release_ce_linux_amd64_rpm" {
arch = ["amd64"]
artifact_source = ["artifactory"]
artifact_type = ["package"]
distro = ["amzn2", "leap", "rhel"] // temporarily remove SLES
distro = ["amzn2", "leap", "rhel", "sles"]
edition = ["ce"]
}
}
@@ -171,7 +171,7 @@ sample "release_ce_linux_amd64_rpm" {
arch = ["amd64"]
artifact_source = ["artifactory"]
artifact_type = ["package"]
distro = ["amzn2", "leap", "rhel"] // temporarily remove SLES
distro = ["amzn2", "leap", "rhel", "sles"]
edition = ["ce"]
}
}

View File

@@ -19,12 +19,11 @@ locals {
# enos_host_info.hosts[each.key].distro returns "amzn", so that is what we reference here.
"amzn" = "yum"
"opensuse-leap" = "zypper"
"rhel" = "yum"
"rhel" = "dnf"
"sles" = "zypper"
"ubuntu" = "apt"
}
distro_repos = {
# Currently sles is the only distro that requires setting up repos before installing packages
"sles" = {
"15.5" = "https://download.opensuse.org/repositories/network:utilities/SLE_15_SP5/network:utilities.repo"
}
@@ -50,7 +49,7 @@ variable "hosts" {
variable "timeout" {
type = number
description = "The max number of seconds to wait before timing out"
description = "The max number of seconds to wait before timing out. This is applied to each step so total timeout will be longer."
default = 120
}
@@ -70,19 +69,20 @@ resource "enos_host_info" "hosts" {
}
}
# Set up repos for each distro (in order to install some packages, some distros
# require us to manually add the repo for that package first)
resource "enos_remote_exec" "distro_repo_setup" {
# Synchronize repositories on remote machines. This does not update packages but only ensures that
# the remote hosts are configured with default upstream repositories that have been refreshed to
# the latest metadata.
resource "enos_remote_exec" "synchronize_repos" {
for_each = var.hosts
environment = {
DISTRO = enos_host_info.hosts[each.key].distro
DISTRO_REPOS = try(local.distro_repos[enos_host_info.hosts[each.key].distro][enos_host_info.hosts[each.key].distro_version], "__none")
PACKAGE_MANAGER = local.package_manager[enos_host_info.hosts[each.key].distro]
RETRY_INTERVAL = var.retry_interval
TIMEOUT_SECONDS = var.timeout
}
scripts = [abspath("${path.module}/scripts/distro-repo-setup.sh")]
scripts = [abspath("${path.module}/scripts/synchronize-repos.sh")]
transport = {
ssh = {
@@ -91,9 +91,34 @@ resource "enos_remote_exec" "distro_repo_setup" {
}
}
# Add any additional repositories.
resource "enos_remote_exec" "add_repos" {
for_each = var.hosts
depends_on = [enos_remote_exec.synchronize_repos]
environment = {
DISTRO_REPOS = try(local.distro_repos[enos_host_info.hosts[each.key].distro][enos_host_info.hosts[each.key].distro_version], "__none")
PACKAGE_MANAGER = local.package_manager[enos_host_info.hosts[each.key].distro]
RETRY_INTERVAL = var.retry_interval
TIMEOUT_SECONDS = var.timeout
}
scripts = [abspath("${path.module}/scripts/add-repos.sh")]
transport = {
ssh = {
host = each.value.public_ip
}
}
}
# Install any required packages.
resource "enos_remote_exec" "install_packages" {
for_each = var.hosts
depends_on = [enos_remote_exec.distro_repo_setup]
depends_on = [
enos_remote_exec.synchronize_repos,
enos_remote_exec.add_repos,
]
environment = {
PACKAGE_MANAGER = local.package_manager[enos_host_info.hosts[each.key].distro]

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -e
# Write the given message to stderr and terminate the script with exit status 1.
fail() {
  >&2 echo "$1"
  exit 1
}
# Each of these variables must be provided (non-empty) by the caller; abort early otherwise.
[[ -n "${PACKAGE_MANAGER}" ]] || fail "PACKAGE_MANAGER env variable has not been set"
[[ -n "${RETRY_INTERVAL}" ]] || fail "RETRY_INTERVAL env variable has not been set"
[[ -n "${TIMEOUT_SECONDS}" ]] || fail "TIMEOUT_SECONDS env variable has not been set"
# Add any repositories that have been passed in.
#
# Globals:
#   DISTRO_REPOS    (read) whitespace-separated list of repositories to add. Entries equal to
#                   "__none" are skipped.
#   PACKAGE_MANAGER (read) one of: apt, dnf, yum, zypper.
# Returns:
#   0 when there is nothing to add or every command succeeded, 1 as soon as a package manager
#   command fails. An unsupported PACKAGE_MANAGER aborts the script through fail.
#
# NOTE: This function is used as the condition of an `if` by the retry loop, which means `set -e`
# is ignored while it runs. Every command whose failure matters therefore has to propagate its
# status explicitly, otherwise a later successful command would mask the failure.
add_repos() {
  local repo
  local -a repos=()

  # If we don't have any repos on the list for this distro, no action needed.
  if [[ -z "${DISTRO_REPOS:-}" ]]; then
    echo "DISTRO_REPOS is empty; No repos required for the packages for this Linux distro."
    return 0
  fi

  # DISTRO_REPOS is a plain whitespace-separated string, so word splitting is intentional here.
  # shellcheck disable=SC2086
  for repo in ${DISTRO_REPOS}; do
    if [[ "$repo" != "__none" ]]; then
      repos+=("$repo")
    fi
  done

  case "$PACKAGE_MANAGER" in
    apt)
      # NOTE: We do not currently add any apt repositories in our scenarios. I suspect if that time
      # comes we'll need to add support for apt-key here.
      for repo in "${repos[@]}"; do
        # -y keeps add-apt-repository from waiting on a confirmation prompt.
        sudo add-apt-repository -y "${repo}" || return 1
      done
      ;;
    dnf)
      for repo in "${repos[@]}"; do
        sudo dnf install -y "${repo}" || return 1
        sudo dnf makecache -y || return 1
      done
      ;;
    yum)
      for repo in "${repos[@]}"; do
        sudo yum install -y "${repo}" || return 1
        sudo yum makecache -y || return 1
      done
      ;;
    zypper)
      # Add each repo
      for repo in "${repos[@]}"; do
        if sudo zypper lr "${repo}"; then
          echo "A repo named ${repo} already exists, skipping..."
          continue
        fi
        sudo zypper --gpg-auto-import-keys --non-interactive addrepo "${repo}" || return 1
      done
      # Refresh repositories and services even when nothing was added.
      sudo zypper --gpg-auto-import-keys --non-interactive ref || return 1
      sudo zypper --gpg-auto-import-keys --non-interactive refs || return 1
      ;;
    *)
      fail "Unsupported package manager: ${PACKAGE_MANAGER}"
      ;;
  esac

  return 0
}
# Keep attempting add_repos until it succeeds or TIMEOUT_SECONDS have elapsed, sleeping
# RETRY_INTERVAL seconds between attempts.
deadline=$(($(date +%s) + TIMEOUT_SECONDS))
while (($(date +%s) < deadline)); do
  add_repos && exit 0
  sleep "$RETRY_INTERVAL"
done

fail "Timed out waiting for distro repos to be set up"

View File

@@ -1,57 +0,0 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -e
# Write the given message to stderr and terminate the script with exit status 1.
fail() {
  >&2 echo "$1"
  exit 1
}
# Each of these variables must be provided (non-empty) by the caller; abort early otherwise.
[[ -n "$DISTRO" ]] || fail "DISTRO env variable has not been set"
[[ -n "$RETRY_INTERVAL" ]] || fail "RETRY_INTERVAL env variable has not been set"
[[ -n "$TIMEOUT_SECONDS" ]] || fail "TIMEOUT_SECONDS env variable has not been set"
# Set up the additional repositories required by the current distro.
#
# Globals:
#   DISTRO       (read) distro identifier; only "sles" and "rhel" add repositories.
#   DISTRO_REPOS (read) whitespace-separated list of repositories. Entries equal to "__none" are
#                skipped rather than being handed to the package manager.
# Returns:
#   0 when there is nothing to do or every repository command succeeded, 1 as soon as a
#   repository command fails. For any other distro the status of the cloud-init wait is returned.
#
# NOTE: This function is used as the condition of an `if` by the retry loop, which means `set -e`
# is ignored while it runs. Commands whose failure matters propagate their status explicitly so
# that a later successful command cannot mask the failure.
setup_repos() {
  local repo

  # If we don't have any repos on the list for this distro, no action needed.
  if [[ -z "${DISTRO_REPOS:-}" ]]; then
    echo "DISTRO_REPOS is empty; No repos required for the packages for this Linux distro."
    return 0
  fi

  # Wait for cloud-init to finish so it doesn't race with any of our package installations.
  # Note: Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status` as
  # non-root user (known bug).
  sudo cloud-init status --wait

  # DISTRO_REPOS is a plain whitespace-separated string, so word splitting is intentional below.
  # shellcheck disable=SC2086
  case "$DISTRO" in
    "sles")
      for repo in ${DISTRO_REPOS}; do
        if [[ "$repo" == "__none" ]]; then
          continue
        fi
        sudo zypper addrepo "${repo}" || return 1
      done
      ;;
    "rhel")
      for repo in ${DISTRO_REPOS}; do
        if [[ "$repo" == "__none" ]]; then
          continue
        fi
        # -f so that an already-missing cache directory is not treated as an error.
        sudo rm -rf /var/cache/dnf || return 1
        sudo dnf install -y "${repo}" || return 1
        sudo dnf update -y --refresh || return 1
      done
      ;;
    *)
      return
      ;;
  esac
}
# Keep attempting setup_repos until it succeeds or TIMEOUT_SECONDS have elapsed, sleeping
# RETRY_INTERVAL seconds between attempts.
deadline=$(($(date +%s) + TIMEOUT_SECONDS))
while (($(date +%s) < deadline)); do
  setup_repos && exit 0
  sleep "$RETRY_INTERVAL"
done

fail "Timed out waiting for distro repos to install"

View File

@@ -14,6 +14,8 @@ fail() {
[[ -z "${PACKAGES}" ]] && fail "PACKAGES env variable has not been set"
[[ -z "${PACKAGE_MANAGER}" ]] && fail "PACKAGE_MANAGER env variable has not been set"
# Install packages based on the provided packages and package manager. We assume that the repositories
# have already been synchronized by the repo setup that is a prerequisite for this script.
install_packages() {
if [[ "${PACKAGES}" = "__skip" ]]; then
return 0
@@ -24,49 +26,69 @@ install_packages() {
# Use the default package manager of the current Linux distro to install packages
case $PACKAGE_MANAGER in
"apt")
sudo apt update
apt)
for package in ${PACKAGES}; do
if dpkg -s "${package}"; then
echo "Skipping installation of ${package} because it is already installed"
continue
else
echo "Installing ${package}"
sudo apt install -y "${package}"
local output
if ! output=$(sudo apt install -y "${package}" 2>&1); then
echo "Failed to install ${package}: ${output}" 1>&2
return 1
fi
fi
done
;;
"yum")
dnf)
for package in ${PACKAGES}; do
if rpm -q "${package}"; then
echo "Skipping installation of ${package} because it is already installed"
continue
else
echo "Installing ${package}"
sudo yum -y install "${package}"
local output
if ! output=$(sudo dnf -y install "${package}" 2>&1); then
echo "Failed to install ${package}: ${output}" 1>&2
return 1
fi
fi
done
;;
"zypper")
cd /tmp
sudo zypper --gpg-auto-import-keys ref
yum)
for package in ${PACKAGES}; do
if rpm -q "${package}"; then
echo "Skipping installation of ${package} because it is already installed"
continue
else
echo "Installing ${package}"
sudo zypper --non-interactive install "${package}"
date
local output
if ! output=$(sudo yum -y install "${package}" 2>&1); then
echo "Failed to install ${package}: ${output}" 1>&2
return 1
fi
fi
done
;;
zypper)
for package in ${PACKAGES}; do
if rpm -q "${package}"; then
echo "Skipping installation of ${package} because it is already installed"
continue
else
echo "Installing ${package}"
local output
if ! output=$(sudo zypper --non-interactive install -y -l --force-resolution "${package}" 2>&1); then
echo "Failed to install ${package}: ${output}" 1>&2
return 1
fi
fi
sudo zypper search -i
done
;;
*)
fail "No matching package manager provided."
;;
esac
}

View File

@@ -0,0 +1,118 @@
#!/usr/bin/env bash
# Copyright (c) HashiCorp, Inc.
# SPDX-License-Identifier: BUSL-1.1
set -e
# Write the given message to stderr and terminate the script with exit status 1.
fail() {
  >&2 echo "$1"
  exit 1
}
# Each of these variables must be provided (non-empty) by the caller; abort early otherwise.
[[ -n "${PACKAGE_MANAGER}" ]] || fail "PACKAGE_MANAGER env variable has not been set"
[[ -n "${RETRY_INTERVAL}" ]] || fail "RETRY_INTERVAL env variable has not been set"
[[ -n "${TIMEOUT_SECONDS}" ]] || fail "TIMEOUT_SECONDS env variable has not been set"
# The SLES AMIs are not configured with Zypper repositories out of the box. The repositories only
# show up once SUSEConnect has registered the instance with SUSE, which the AMI does through a
# oneshot systemd unit named guestregister.service. That unit has to finish before any other repo
# or package step runs, and at the time of writing it is very unreliable, so we verify it
# ourselves.
#
# Outcomes:
#   - The unit is active, activating or deactivating: it is still running. Report that on stderr
#     and return 1 so the caller retries later.
#   - Both `is-active` and `is-failed` report "inactive": the oneshot completed without failing.
#     Report that on stdout and return 0.
#   - Anything else: the unit is stopped and failed. Restart it to re-run the script that executes
#     SUSEConnect and return the status of the restart command.
sles_check_guestregister_service_and_restart_if_failed() {
  local unit_active
  local unit_failed

  # systemctl exits non-zero for several states and not every state has its own exit code, so
  # errexit is suspended and the decision is made on the printed state instead.
  set +e
  unit_active=$(sudo systemctl is-active guestregister.service)
  unit_failed=$(sudo systemctl is-failed guestregister.service)
  set -e

  # Still running: let it continue and have the caller check again.
  if [[ "$unit_active" == "active" || "$unit_active" == "activating" || "$unit_active" == "deactivating" ]]; then
    echo "the guestregister.service is still in the ${unit_active} state" 1>&2
    return 1
  fi

  # The oneshot has completed and hasn't "failed".
  if [[ "$unit_active" == "inactive" && "$unit_failed" == "inactive" ]]; then
    echo "the guestregister.service is 'inactive' for both active and failed states"
    return 0
  fi

  # Our service is stopped and failed, restart it and hope it works the next time.
  sudo systemctl restart --wait guestregister.service
}
# Check or restart the guestregister service if it has failed. If it passes do another check to make
# sure that the zypper repositories list isn't empty.
#
# Returns:
#   0 when the service check succeeded and `zypper lr` succeeded, 1 otherwise. Failure details are
#   written to stderr.
sles_ensure_suseconnect() {
  local health_output
  # Keep the repo listing local as well so it does not leak into the script's global scope.
  local lr_output

  if ! health_output=$(sles_check_guestregister_service_and_restart_if_failed); then
    echo "the guestregister.service failed to reach a healthy state: ${health_output}" 1>&2
    return 1
  fi

  # Make sure Zypper has repositories.
  if ! lr_output=$(zypper lr); then
    echo "The guestregister.service failed. Unable to SUSEConnect and thus have no Zypper repositories: ${lr_output}: ${health_output}." 1>&2
    return 1
  fi

  return 0
}
# Synchronize our repositories so that further installation steps are working with updated cache
# and repo metadata.
#
# Globals:
#   PACKAGE_MANAGER (read) apt, dnf, yum or zypper; any other value is treated as a no-op.
#   DISTRO          (read) when "sles", the SUSEConnect registration is verified before refreshing.
# Returns:
#   0 on success, non-zero when the registration check or a refresh command fails.
#
# NOTE: This function is used as the condition of an `if` by the retry loop, which means `set -e`
# is ignored while it runs. The zypper branch runs more than one command, so each one propagates
# its failure explicitly instead of letting the last command decide the return status.
synchronize_repos() {
  case "$PACKAGE_MANAGER" in
    apt)
      sudo apt update
      ;;
    dnf)
      sudo dnf makecache
      ;;
    yum)
      sudo yum makecache
      ;;
    zypper)
      if [[ "${DISTRO:-}" == "sles" ]]; then
        sles_ensure_suseconnect || return 1
      fi
      sudo zypper --gpg-auto-import-keys --non-interactive ref || return 1
      sudo zypper --gpg-auto-import-keys --non-interactive refs || return 1
      ;;
    *)
      return 0
      ;;
  esac
}
# Before we start to modify repositories and install packages we'll wait for cloud-init to finish
# so it doesn't race with any of our package installations.
# We run as sudo because Amazon Linux 2 throws Python 2.7 errors when running `cloud-init status`
# as non-root user (known bug).
sudo cloud-init status --wait

# Keep attempting synchronize_repos until it succeeds or TIMEOUT_SECONDS have elapsed, sleeping
# RETRY_INTERVAL seconds between attempts.
deadline=$(($(date +%s) + TIMEOUT_SECONDS))
while (($(date +%s) < deadline)); do
  synchronize_repos && exit 0
  sleep "$RETRY_INTERVAL"
done

fail "Timed out waiting for distro repos to be set up"