From e37d1fc2ab9cfffb549cf46254d20896fc384f41 Mon Sep 17 00:00:00 2001
From: "Taylor, Stephen (st053q)" <st053q@att.com>
Date: Mon, 30 Nov 2020 10:17:40 -0700
Subject: [PATCH] [ceph-osd] Add a check for misplaced objects to the
 post-apply job

OSD failures during an update can cause degraded and misplaced
objects. The post-apply job restarts OSDs in batches, one failure
domain at a time, to accomplish the restarts efficiently. There is
already a wait for degraded objects to ensure that OSDs are not
restarted on degraded PGs, but misplaced objects could mean that
multiple object replicas exist in the same failure domain, so the
job should wait for those to recover as well before restarting
OSDs in order to avoid potential disruption under these failure
conditions.

Change-Id: I39606e388a9a1d3a4e9c547de56aac4fc5606ea2
---
 ceph-osd/Chart.yaml                       |  2 +-
 ceph-osd/templates/bin/_post-apply.sh.tpl | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/ceph-osd/Chart.yaml b/ceph-osd/Chart.yaml
index 1f8436c3..7d0ad690 100644
--- a/ceph-osd/Chart.yaml
+++ b/ceph-osd/Chart.yaml
@@ -15,6 +15,6 @@ apiVersion: v1
 appVersion: v1.0.0
 description: OpenStack-Helm Ceph OSD
 name: ceph-osd
-version: 0.1.11
+version: 0.1.12
 home: https://github.com/ceph/ceph
 ...
diff --git a/ceph-osd/templates/bin/_post-apply.sh.tpl b/ceph-osd/templates/bin/_post-apply.sh.tpl
index be9114bd..aeb91c53 100644
--- a/ceph-osd/templates/bin/_post-apply.sh.tpl
+++ b/ceph-osd/templates/bin/_post-apply.sh.tpl
@@ -115,11 +115,11 @@ function wait_for_pgs () {
   done
 }
 
-function wait_for_degraded_objects () {
-  echo "#### Start: Checking for degraded objects ####"
+function wait_for_degraded_and_misplaced_objects () {
+  echo "#### Start: Checking for degraded and misplaced objects ####"
 
   # Loop until no degraded objects
-    while [[ ! -z "`ceph --cluster ${CLUSTER} -s | grep degraded`" ]]
+    while [[ ! -z "`ceph --cluster ${CLUSTER} -s | grep 'degraded\|misplaced'`" ]]
     do
       sleep 3
       ceph -s
@@ -150,7 +150,7 @@ function restart_by_rack() {
      sleep 60
      # Degraded objects won't recover with noout set unless pods come back and
      # PGs become healthy, so simply wait for 0 degraded objects
-     wait_for_degraded_objects
+     wait_for_degraded_and_misplaced_objects
      ceph -s
   done
 }
@@ -179,7 +179,7 @@ if [[ $max_release -gt 1  ]]; then
   if [[  $require_upgrade -gt 0 ]]; then
     echo "waiting for inactive pgs and degraded obejcts before upgrade"
     wait_for_pgs
-    wait_for_degraded_objects
+    wait_for_degraded_and_misplaced_objects
     ceph -s
     ceph osd "set" noout
     echo "lets restart the osds rack by rack"