From e37d1fc2ab9cfffb549cf46254d20896fc384f41 Mon Sep 17 00:00:00 2001 From: "Taylor, Stephen (st053q)" Date: Mon, 30 Nov 2020 10:17:40 -0700 Subject: [PATCH] [ceph-osd] Add a check for misplaced objects to the post-apply job OSD failures during an update can cause degraded and misplaced objects. The post-apply job restarts OSDs in failure domain batches in order to accomplish the restarts efficiently. There is already a wait for degraded objects to ensure that OSDs are not restarted on degraded PGs, but misplaced objects could mean that multiple object replicas exist in the same failure domain, so the job should wait for those to recover as well before restarting OSDs in order to avoid potential disruption under these failure conditions. Change-Id: I39606e388a9a1d3a4e9c547de56aac4fc5606ea2 --- ceph-osd/Chart.yaml | 2 +- ceph-osd/templates/bin/_post-apply.sh.tpl | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ceph-osd/Chart.yaml b/ceph-osd/Chart.yaml index 1f8436c3..7d0ad690 100644 --- a/ceph-osd/Chart.yaml +++ b/ceph-osd/Chart.yaml @@ -15,6 +15,6 @@ apiVersion: v1 appVersion: v1.0.0 description: OpenStack-Helm Ceph OSD name: ceph-osd -version: 0.1.11 +version: 0.1.12 home: https://github.com/ceph/ceph ... diff --git a/ceph-osd/templates/bin/_post-apply.sh.tpl b/ceph-osd/templates/bin/_post-apply.sh.tpl index be9114bd..aeb91c53 100644 --- a/ceph-osd/templates/bin/_post-apply.sh.tpl +++ b/ceph-osd/templates/bin/_post-apply.sh.tpl @@ -115,11 +115,11 @@ function wait_for_pgs () { done } -function wait_for_degraded_objects () { - echo "#### Start: Checking for degraded objects ####" +function wait_for_degraded_and_misplaced_objects () { + echo "#### Start: Checking for degraded and misplaced objects ####" # Loop until no degraded objects - while [[ ! -z "`ceph --cluster ${CLUSTER} -s | grep degraded`" ]] + while [[ ! -z "`ceph --cluster ${CLUSTER} -s | grep 'degraded\|misplaced'`" ]] do sleep 3 ceph -s @@ -150,7 +150,7 @@ function restart_by_rack() { sleep 60 # Degraded objects won't recover with noout set unless pods come back and # PGs become healthy, so simply wait for 0 degraded objects - wait_for_degraded_objects + wait_for_degraded_and_misplaced_objects ceph -s done } @@ -179,7 +179,7 @@ if [[ $max_release -gt 1 ]]; then if [[ $require_upgrade -gt 0 ]]; then echo "waiting for inactive pgs and degraded obejcts before upgrade" wait_for_pgs - wait_for_degraded_objects + wait_for_degraded_and_misplaced_objects ceph -s ceph osd "set" noout echo "lets restart the osds rack by rack"