fix(platform): fix upgrade issues in migrations, etcd timeout, and migration script (#2096)

## What this PR does

Fixes multiple upgrade issues discovered during v0.41.1 → v1.0 upgrade
testing.

**Migration 26 (monitoring → monitoring-system):**
- Use `cozystack.io/ui=true` label with
`--field-selector=metadata.name=monitoring` instead of
`apps.cozystack.io/application.kind=Monitoring` to find monitoring
HelmReleases — the old label is guaranteed to exist on v0.41.1 clusters,
while the new one depends on migration 22 having run
- Add `delete_helm_secrets` function with fallback deletion by secret
name pattern and post-deletion verification

**Migrations 28 and 29 (mysql→mariadb, virtual-machine split):**
- Wrap `grep` in pipes with `{ ... || true; }` to prevent `pipefail`
exit when grep filters out all lines
- Fix reconcile annotation in migration 29 to use RFC3339 timestamp
format instead of Unix epoch
- Remove protection-webhook handling from migration 29 — it is an
external component and should not be managed by cozystack migrations

**Migration 27 (piraeus CRD ownership):**
- Skip CRDs that don't exist instead of failing the entire migration
- Add name-pattern fallback for helm secret deletion

**etcd HelmRelease:**
- Increase timeout from 10m to 30m to accommodate TLS cert rotation hook

**migrate-to-version-1.0.sh:**
- Add missing ConfigMap → Package field mappings: `bundle-disable`,
`bundle-enable`, `expose-ingress`, `expose-services`
- Remove redundant bundle enabled flags — the variant already determines
them via its values file

<!-- This is an auto-generated comment: release notes by coderabbit.ai
-->

## Summary by CodeRabbit

* **New Features**
  * Package generation now supports disabled/enabled package lists,
    ingress name, and exposed services for customized publishing.

* **Bug Fixes**
  * More robust secret cleanup with fallback deletions and post-deletion
    verification.
  * Guarded pipelines to avoid failures when no resources match.
  * Reconciliation timestamps now use RFC3339 UTC.
  * Suspension failures are no longer silently suppressed.

* **Chores**
  * Increased etcd upgrade timeout; improved namespace discovery,
    relabeling behavior, and user-facing messaging.

<!-- end of auto-generated comment: release notes by coderabbit.ai -->
This commit is contained in:
Andrei Kvapil
2026-02-25 10:29:45 +01:00
committed by GitHub
6 changed files with 99 additions and 110 deletions

View File

@@ -52,6 +52,10 @@ OIDC_ENABLED=$(echo "$COZYSTACK_CM" | jq -r '.data["oidc-enabled"] // "false"')
KEYCLOAK_REDIRECTS=$(echo "$COZYSTACK_CM" | jq -r '.data["extra-keycloak-redirect-uri-for-dashboard"] // ""' )
TELEMETRY_ENABLED=$(echo "$COZYSTACK_CM" | jq -r '.data["telemetry-enabled"] // "true"')
BUNDLE_NAME=$(echo "$COZYSTACK_CM" | jq -r '.data["bundle-name"] // "paas-full"')
# Optional bundle/publishing settings read from the legacy ConfigMap JSON.
# A missing key yields an empty string, except expose-ingress, which falls
# back to "tenant-root".
BUNDLE_DISABLE=$(echo "$COZYSTACK_CM" | jq -r '.data["bundle-disable"] // ""')
BUNDLE_ENABLE=$(echo "$COZYSTACK_CM" | jq -r '.data["bundle-enable"] // ""')
EXPOSE_INGRESS=$(echo "$COZYSTACK_CM" | jq -r '.data["expose-ingress"] // "tenant-root"')
EXPOSE_SERVICES=$(echo "$COZYSTACK_CM" | jq -r '.data["expose-services"] // ""')
# Certificate issuer configuration (old undocumented field: clusterissuer)
OLD_CLUSTER_ISSUER=$(echo "$COZYSTACK_CM" | jq -r '.data["clusterissuer"] // ""')
@@ -99,21 +103,24 @@ else
EXTERNAL_IPS=$(echo "$EXTERNAL_IPS" | sed 's/,/\n/g' | awk 'BEGIN{print}{print " - "$0}')
fi
# Determine bundle type
case "$BUNDLE_NAME" in
paas-full|distro-full)
SYSTEM_ENABLED="true"
SYSTEM_TYPE="full"
;;
paas-hosted|distro-hosted)
SYSTEM_ENABLED="false"
SYSTEM_TYPE="hosted"
;;
*)
SYSTEM_ENABLED="false"
SYSTEM_TYPE="hosted"
;;
esac
# Convert comma-separated lists to YAML arrays
# Render a comma-separated list as the lines of a YAML block sequence.
# Arguments: $1 - comma-separated items (callers pass a non-empty string)
# Outputs:   an empty first line followed by one " - <item>" line per item,
#            so the result can be substituted right after a "key:" token.
# tr is used instead of sed 's/,/\n/g' because "\n" in a sed replacement is a
# GNU extension; BSD/macOS sed would insert a literal "n".
csv_to_yaml_list() {
    printf '%s\n' "$1" | tr ',' '\n' | awk 'BEGIN{print}{print " - "$0}'
}

# An empty source value becomes the inline empty list "[]".
if [ -z "$BUNDLE_DISABLE" ]; then
    DISABLED_PACKAGES="[]"
else
    DISABLED_PACKAGES=$(csv_to_yaml_list "$BUNDLE_DISABLE")
fi
if [ -z "$BUNDLE_ENABLE" ]; then
    ENABLED_PACKAGES="[]"
else
    ENABLED_PACKAGES=$(csv_to_yaml_list "$BUNDLE_ENABLE")
fi
if [ -z "$EXPOSE_SERVICES" ]; then
    EXPOSED_SERVICES_YAML="[]"
else
    EXPOSED_SERVICES_YAML=$(csv_to_yaml_list "$EXPOSE_SERVICES")
fi
# Update bundle naming
BUNDLE_NAME=$(echo "$BUNDLE_NAME" | sed 's/paas/isp/')
@@ -141,8 +148,6 @@ echo " Root Host: $ROOT_HOST"
echo " API Server Endpoint: $API_SERVER_ENDPOINT"
echo " OIDC Enabled: $OIDC_ENABLED"
echo " Bundle Name: $BUNDLE_NAME"
echo " System Enabled: $SYSTEM_ENABLED"
echo " System Type: $SYSTEM_TYPE"
echo " Certificate Solver: ${SOLVER:-http01 (default)}"
echo " Issuer Name: ${ISSUER_NAME:-letsencrypt-prod (default)}"
echo ""
@@ -160,15 +165,8 @@ spec:
platform:
values:
bundles:
system:
enabled: $SYSTEM_ENABLED
type: "$SYSTEM_TYPE"
iaas:
enabled: true
paas:
enabled: true
naas:
enabled: true
disabledPackages: $DISABLED_PACKAGES
enabledPackages: $ENABLED_PACKAGES
networking:
clusterDomain: "$CLUSTER_DOMAIN"
podCIDR: "$POD_CIDR"
@@ -177,6 +175,8 @@ spec:
joinCIDR: "$JOIN_CIDR"
publishing:
host: "$ROOT_HOST"
ingressName: "$EXPOSE_INGRESS"
exposedServices: $EXPOSED_SERVICES_YAML
apiServerEndpoint: "$API_SERVER_ENDPOINT"
externalIPs: $EXTERNAL_IPS
${CERTIFICATES_SECTION}

View File

@@ -18,7 +18,7 @@ spec:
name: cozystack-etcd-application-default-etcd
namespace: cozy-system
interval: 5m
timeout: 10m
timeout: 30m
install:
remediation:
retries: -1

View File

@@ -2,6 +2,7 @@
# Migration 26 --> 27
# Migrate monitoring resources from extra/monitoring to system/monitoring
# This migration re-labels resources so they become owned by monitoring-system HelmRelease
# and deletes old helm release secrets so that helm does not diff old vs new chart manifests.
set -euo pipefail
@@ -35,10 +36,39 @@ relabel_resources() {
done
}
# Delete all helm release secrets for a given release name in a namespace.
# Uses both label selector and name-pattern matching to ensure complete cleanup.
# Delete every Helm release-record secret of one release in one namespace.
#
# Deletion is attempted twice: first through Helm's own labels, then by secret
# name for anything the label selector missed (e.g. labels were modified).
# Afterwards the namespace is listed again to confirm nothing is left.
#
# Arguments: $1 - namespace
#            $2 - Helm release name
# Outputs:   progress messages on stdout; failure details on stderr
# Returns:   0 when no release secrets remain, 1 if any survived deletion
delete_helm_secrets() {
    local ns="$1"
    local release="$2"

    # Release secrets are named sh.helm.release.v1.<release>.v<revision>.
    # Dots in the release name are escaped so they match literally, and the
    # pattern is anchored on the ".v<revision>" suffix so that a different
    # release whose name merely starts with "<release>." is left untouched.
    local release_re=${release//./\\.}
    local pattern='^secret/sh\.helm\.release\.v1\.'"${release_re}"'\.v[0-9]+$'

    # Primary: delete by label selector
    kubectl delete secrets -n "$ns" -l "name=${release},owner=helm" --ignore-not-found

    # Fallback: find and delete by name pattern (in case labels were modified).
    # grep exits 1 when nothing matches; "|| true" keeps pipefail from aborting.
    local remaining secret
    remaining=$(kubectl get secrets -n "$ns" -o name | { grep -E -- "$pattern" || true; })
    if [ -n "$remaining" ]; then
        echo " Found secrets not matched by label selector, deleting by name..."
        while IFS= read -r secret; do
            echo " Deleting $secret"
            kubectl delete -n "$ns" "$secret" --ignore-not-found
        done <<< "$remaining"
    fi

    # Verify all secrets are gone
    remaining=$(kubectl get secrets -n "$ns" -o name | { grep -E -- "$pattern" || true; })
    if [ -n "$remaining" ]; then
        echo " ERROR: Failed to delete helm release secrets:" >&2
        echo "$remaining" >&2
        return 1
    fi
}
# Find all tenant namespaces with monitoring HelmRelease
echo "Finding tenant namespaces with monitoring HelmRelease..."
NAMESPACES=$(kubectl get hr --all-namespaces -l apps.cozystack.io/application.kind=Monitoring \
-o jsonpath='{range .items[*]}{.metadata.namespace}{"\n"}{end}' 2>/dev/null | sort -u || true)
NAMESPACES=$(kubectl get hr --all-namespaces -l cozystack.io/ui=true --field-selector=metadata.name=monitoring \
-o jsonpath='{range .items[*]}{.metadata.namespace}{"\n"}{end}' | sort -u)
if [ -z "$NAMESPACES" ]; then
echo "No monitoring HelmReleases found in tenant namespaces, skipping migration"
@@ -66,7 +96,7 @@ for ns in $NAMESPACES; do
# Step 1: Suspend the HelmRelease
echo ""
echo "Step 1: Suspending HelmRelease monitoring..."
kubectl patch hr -n "$ns" monitoring --type=merge -p '{"spec":{"suspend":true}}' 2>/dev/null || true
kubectl patch hr -n "$ns" monitoring --type=merge -p '{"spec":{"suspend":true}}'
# Wait a moment for reconciliation to stop
sleep 2
@@ -74,7 +104,7 @@ for ns in $NAMESPACES; do
# Step 2: Delete helm secrets for the monitoring release
echo ""
echo "Step 2: Deleting helm secrets for monitoring release..."
kubectl delete secrets -n "$ns" -l name=monitoring,owner=helm --ignore-not-found
delete_helm_secrets "$ns" "monitoring"
# Step 3: Relabel resources to be owned by monitoring-system
echo ""
@@ -121,7 +151,9 @@ for ns in $NAMESPACES; do
echo "Processing Cozystack resources..."
relabel_resources "$ns" "workloadmonitors.cozystack.io"
# Step 4: Delete the suspended HelmRelease (Flux won't delete resources when HR is suspended)
# Step 4: Delete the suspended HelmRelease
# Helm secrets are already gone, so flux finalizer will find no release to uninstall
# and will simply remove the finalizer without deleting any resources.
echo ""
echo "Step 4: Deleting suspended HelmRelease monitoring..."
kubectl delete hr -n "$ns" monitoring --ignore-not-found

View File

@@ -5,10 +5,24 @@ set -euo pipefail
# Migrate Piraeus CRDs to piraeus-operator-crds Helm release
for crd in linstorclusters.piraeus.io linstornodeconnections.piraeus.io linstorsatelliteconfigurations.piraeus.io linstorsatellites.piraeus.io; do
kubectl annotate crd "$crd" meta.helm.sh/release-namespace=cozy-linstor meta.helm.sh/release-name=piraeus-operator-crds --overwrite
kubectl label crd "$crd" app.kubernetes.io/managed-by=Helm helm.toolkit.fluxcd.io/namespace=cozy-linstor helm.toolkit.fluxcd.io/name=piraeus-operator-crds --overwrite
if kubectl get crd "$crd" >/dev/null 2>&1; then
echo " Relabeling CRD $crd"
kubectl annotate crd "$crd" meta.helm.sh/release-namespace=cozy-linstor meta.helm.sh/release-name=piraeus-operator-crds --overwrite
kubectl label crd "$crd" app.kubernetes.io/managed-by=Helm helm.toolkit.fluxcd.io/namespace=cozy-linstor helm.toolkit.fluxcd.io/name=piraeus-operator-crds --overwrite
else
echo " CRD $crd not found, skipping"
fi
done
# Purge the Helm release secrets of the old piraeus-operator release in two
# passes: by Helm's labels first, then by secret name for anything the label
# selector did not catch.
kubectl delete secret -n cozy-linstor -l name=piraeus-operator,owner=helm --ignore-not-found
# grep exits 1 when no line matches; "|| true" keeps that from failing the pipeline.
remaining=$(
    kubectl get secrets -n cozy-linstor -o name 2>/dev/null \
        | { grep "^secret/sh\.helm\.release\.v1\.piraeus-operator\." || true; }
)
if [ -n "$remaining" ]; then
    echo " Deleting remaining piraeus-operator helm secrets by name..."
    while IFS= read -r secret; do
        kubectl delete -n cozy-linstor "$secret" --ignore-not-found
    done <<< "$remaining"
fi
# Stamp version
kubectl create configmap -n cozy-system cozystack-version \

View File

@@ -348,7 +348,7 @@ PVCEOF
# --- 3g: Clone Secrets ---
echo " --- Clone Secrets ---"
for secret in $(kubectl -n "$NAMESPACE" get secret -o name 2>/dev/null \
| grep "secret/${OLD_NAME}" | grep -v "sh.helm.release"); do
| { grep "secret/${OLD_NAME}" || true; } | { grep -v "sh.helm.release" || true; }); do
old_secret_name="${secret#secret/}"
new_secret_name="${NEW_NAME}${old_secret_name#${OLD_NAME}}"
clone_resource "$NAMESPACE" "secret" "$old_secret_name" "$new_secret_name" "$OLD_NAME" "$NEW_NAME"
@@ -357,7 +357,7 @@ PVCEOF
# --- 3h: Clone ConfigMaps ---
echo " --- Clone ConfigMaps ---"
for cm in $(kubectl -n "$NAMESPACE" get configmap -o name 2>/dev/null \
| grep "configmap/${OLD_NAME}"); do
| { grep "configmap/${OLD_NAME}" || true; }); do
old_cm_name="${cm#configmap/}"
new_cm_name="${NEW_NAME}${old_cm_name#${OLD_NAME}}"
clone_resource "$NAMESPACE" "configmap" "$old_cm_name" "$new_cm_name" "$OLD_NAME" "$NEW_NAME"
@@ -468,13 +468,13 @@ PVCEOF
fi
for secret in $(kubectl -n "$NAMESPACE" get secret -o name 2>/dev/null \
| grep "secret/${OLD_NAME}" | grep -v "sh.helm.release"); do
| { grep "secret/${OLD_NAME}" || true; } | { grep -v "sh.helm.release" || true; }); do
old_secret_name="${secret#secret/}"
delete_resource "$NAMESPACE" "secret" "$old_secret_name"
done
for cm in $(kubectl -n "$NAMESPACE" get configmap -o name 2>/dev/null \
| grep "configmap/${OLD_NAME}"); do
| { grep "configmap/${OLD_NAME}" || true; }); do
old_cm_name="${cm#configmap/}"
delete_resource "$NAMESPACE" "configmap" "$old_cm_name"
done

View File

@@ -9,8 +9,6 @@ set -euo pipefail
OLD_PREFIX="virtual-machine"
NEW_DISK_PREFIX="vm-disk"
NEW_INSTANCE_PREFIX="vm-instance"
PROTECTION_WEBHOOK_NAME="protection-webhook"
PROTECTION_WEBHOOK_NS="protection-webhook"
CDI_APISERVER_NS="cozy-kubevirt-cdi"
CDI_APISERVER_DEPLOY="cdi-apiserver"
CDI_VALIDATING_WEBHOOKS="cdi-api-datavolume-validate cdi-api-dataimportcron-validate cdi-api-populator-validate cdi-api-validate"
@@ -88,7 +86,6 @@ echo " Total: ${#INSTANCES[@]} instance(s)"
# STEP 2: Migrate each instance
# ============================================================
ALL_PV_NAMES=()
ALL_PROTECTED_RESOURCES=()
for entry in "${INSTANCES[@]}"; do
NAMESPACE="${entry%%/*}"
@@ -315,7 +312,7 @@ PVCEOF
# --- 2i: Clone Secrets ---
echo " --- Clone Secrets ---"
kubectl -n "$NAMESPACE" get secret -o name 2>/dev/null \
| grep "secret/${OLD_NAME}" | grep -v "sh.helm.release" | grep -v "values" \
| { grep "secret/${OLD_NAME}" || true; } | { grep -v "sh.helm.release" || true; } | { grep -v "values" || true; } \
| while IFS= read -r secret; do
old_secret_name="${secret#secret/}"
suffix="${old_secret_name#${OLD_NAME}}"
@@ -542,7 +539,7 @@ SVCEOF
# --- 2q: Delete old resources ---
echo " --- Delete old resources ---"
kubectl -n "$NAMESPACE" get secret -o name 2>/dev/null \
| grep "secret/${OLD_NAME}" | grep -v "sh.helm.release" | grep -v "values" \
| { grep "secret/${OLD_NAME}" || true; } | { grep -v "sh.helm.release" || true; } | { grep -v "values" || true; } \
| while IFS= read -r secret; do
old_secret_name="${secret#secret/}"
delete_resource "$NAMESPACE" "secret" "$old_secret_name"
@@ -564,71 +561,17 @@ SVCEOF
delete_resource "$NAMESPACE" "secret" "$VALUES_SECRET"
fi
# Collect protected resources for batch deletion
# Delete old service (if exists)
if resource_exists "$NAMESPACE" "svc" "$OLD_NAME"; then
ALL_PROTECTED_RESOURCES+=("${NAMESPACE}:svc/${OLD_NAME}")
delete_resource "$NAMESPACE" "svc" "$OLD_NAME"
fi
done
# ============================================================
# STEP 3: Delete protected resources (Services)
# STEP 3: Restore PV reclaim policies
# ============================================================
echo ""
echo "--- Step 3: Delete protected resources ---"
if [ ${#ALL_PROTECTED_RESOURCES[@]} -gt 0 ]; then
WEBHOOK_EXISTS=false
if kubectl -n "$PROTECTION_WEBHOOK_NS" get deploy "$PROTECTION_WEBHOOK_NAME" --no-headers 2>/dev/null | grep -q .; then
WEBHOOK_EXISTS=true
fi
if [ "$WEBHOOK_EXISTS" = "true" ]; then
echo " --- Temporarily disabling protection-webhook ---"
WEBHOOK_REPLICAS=$(kubectl -n "$PROTECTION_WEBHOOK_NS" get deploy "$PROTECTION_WEBHOOK_NAME" \
-o jsonpath='{.spec.replicas}' 2>/dev/null || echo "1")
echo " [SCALE] ${PROTECTION_WEBHOOK_NAME} -> 0 (was ${WEBHOOK_REPLICAS})"
kubectl -n "$PROTECTION_WEBHOOK_NS" scale deploy "$PROTECTION_WEBHOOK_NAME" --replicas=0
echo " [PATCH] Set failurePolicy=Ignore on ValidatingWebhookConfiguration/${PROTECTION_WEBHOOK_NAME}"
kubectl get validatingwebhookconfiguration "$PROTECTION_WEBHOOK_NAME" -o json | \
jq '.webhooks[].failurePolicy = "Ignore"' | \
kubectl apply -f - 2>/dev/null || true
echo " Waiting for webhook pods to terminate..."
kubectl -n "$PROTECTION_WEBHOOK_NS" wait --for=delete pod \
-l app.kubernetes.io/name=protection-webhook --timeout=60s 2>/dev/null || true
sleep 3
fi
for entry in "${ALL_PROTECTED_RESOURCES[@]}"; do
ns="${entry%%:*}"
res="${entry#*:}"
echo " [DELETE] ${ns}/${res}"
kubectl -n "$ns" delete "$res" --wait=false 2>/dev/null || true
done
if [ "$WEBHOOK_EXISTS" = "true" ]; then
echo " [PATCH] Set failurePolicy=Fail on ValidatingWebhookConfiguration/${PROTECTION_WEBHOOK_NAME}"
kubectl get validatingwebhookconfiguration "$PROTECTION_WEBHOOK_NAME" -o json | \
jq '.webhooks[].failurePolicy = "Fail"' | \
kubectl apply -f - 2>/dev/null || true
echo " [SCALE] ${PROTECTION_WEBHOOK_NAME} -> ${WEBHOOK_REPLICAS}"
kubectl -n "$PROTECTION_WEBHOOK_NS" scale deploy "$PROTECTION_WEBHOOK_NAME" \
--replicas="$WEBHOOK_REPLICAS"
echo " --- protection-webhook restored ---"
fi
else
echo " [SKIP] No protected resources to delete"
fi
# ============================================================
# STEP 4: Restore PV reclaim policies
# ============================================================
echo ""
echo "--- Step 4: Restore PV reclaim policies ---"
echo "--- Step 3: Restore PV reclaim policies ---"
for pv_name in "${ALL_PV_NAMES[@]}"; do
if [ -n "$pv_name" ]; then
current_policy=$(kubectl get pv "$pv_name" \
@@ -643,7 +586,7 @@ for pv_name in "${ALL_PV_NAMES[@]}"; do
done
# ============================================================
# STEP 5: Temporarily disable CDI datavolume webhooks
# STEP 4: Temporarily disable CDI datavolume webhooks
# ============================================================
# CDI's datavolume-validate webhook rejects DataVolume creation when a PVC
# with the same name already exists. We must disable it so that vm-disk
@@ -652,7 +595,7 @@ done
# cdi-apiserver (which serves the webhooks), then delete webhook configs.
# Both are restored after vm-disk HRs reconcile.
echo ""
echo "--- Step 5: Temporarily disable CDI webhooks ---"
echo "--- Step 4: Temporarily disable CDI webhooks ---"
CDI_OPERATOR_REPLICAS=$(kubectl -n "$CDI_APISERVER_NS" get deploy cdi-operator \
-o jsonpath='{.spec.replicas}' 2>/dev/null || echo "1")
@@ -685,10 +628,10 @@ done
sleep 2
# ============================================================
# STEP 6: Unsuspend vm-disk HelmReleases first
# STEP 5: Unsuspend vm-disk HelmReleases first
# ============================================================
echo ""
echo "--- Step 6: Unsuspend vm-disk HelmReleases ---"
echo "--- Step 5: Unsuspend vm-disk HelmReleases ---"
for entry in "${INSTANCES[@]}"; do
ns="${entry%%/*}"
instance="${entry#*/}"
@@ -705,7 +648,7 @@ for entry in "${INSTANCES[@]}"; do
# Force immediate reconciliation
echo " [TRIGGER] Reconcile ${ns}/hr/${disk_name}"
kubectl -n "$ns" annotate hr "$disk_name" --overwrite \
"reconcile.fluxcd.io/requestedAt=$(date +%s)" 2>/dev/null || true
"reconcile.fluxcd.io/requestedAt=$(date -u +'%Y-%m-%dT%H:%M:%SZ')" 2>/dev/null || true
fi
done
@@ -729,12 +672,12 @@ for entry in "${INSTANCES[@]}"; do
done
# ============================================================
# STEP 7: Restore CDI webhooks
# STEP 6: Restore CDI webhooks
# ============================================================
# Scale cdi-operator and cdi-apiserver back up.
# cdi-apiserver will recreate webhook configurations automatically on start.
echo ""
echo "--- Step 7: Restore CDI webhooks ---"
echo "--- Step 6: Restore CDI webhooks ---"
echo " [SCALE] cdi-operator -> ${CDI_OPERATOR_REPLICAS}"
kubectl -n "$CDI_APISERVER_NS" scale deploy cdi-operator \
@@ -749,10 +692,10 @@ kubectl -n "$CDI_APISERVER_NS" rollout status deploy "$CDI_APISERVER_DEPLOY" --t
echo " --- CDI webhooks restored ---"
# ============================================================
# STEP 8: Unsuspend vm-instance HelmReleases
# STEP 7: Unsuspend vm-instance HelmReleases
# ============================================================
echo ""
echo "--- Step 8: Unsuspend vm-instance HelmReleases ---"
echo "--- Step 7: Unsuspend vm-instance HelmReleases ---"
for entry in "${INSTANCES[@]}"; do
ns="${entry%%/*}"
instance="${entry#*/}"