Add Ceph cluster health validation on upgrade
author: Giulio Fidente <gfidente@redhat.com>
Tue, 23 Aug 2016 20:24:57 +0000 (22:24 +0200)
committer: Giulio Fidente <gfidente@redhat.com>
Tue, 30 Aug 2016 16:27:38 +0000 (18:27 +0200)
This will prevent the Ceph Mon upgrade script from starting if the
Ceph cluster is in error state.

It also adds a parameter to ignore warning states, useful when
performing an upgrade of a cluster where the number of healthy
OSDs does not guarantee the desired replica size.

Closes-Bug: 1618533
Change-Id: I1beb8ad0812f19b1018ba19b5a9fc85fa132d7f7

extraconfig/tasks/major_upgrade_ceph_mon.sh
extraconfig/tasks/major_upgrade_pacemaker.yaml

index 38befbb..b76dd7c 100755 (executable)
@@ -17,6 +17,21 @@ if ! [[ "$INSTALLED_VERSION" =~ ^0\.94.* ]]; then
     exit 0
 fi
 
+CEPH_STATUS=$(ceph health | awk '{print $1}')
+if [ "${CEPH_STATUS}" = HEALTH_ERR ]; then
+    echo ERROR: Ceph cluster status is HEALTH_ERR, cannot be upgraded
+    exit 1
+fi
+
+# Skip the wait when warnings are ignored (e.g. OSDs num < replica size); Heat may render the boolean as True - TODO confirm
+if [[ "${ignore_ceph_upgrade_warnings:-false}" != [Tt]rue ]]; then
+    # \$ is escaped so the inner bash re-runs 'ceph health' on every iteration instead of reusing a value expanded once here
+    timeout 300 bash -c "while [ \"\$(ceph health | awk '{print \$1}')\" != HEALTH_OK ]; do
+      echo WARNING: Waiting for Ceph cluster status to go HEALTH_OK;
+      sleep 30;
+    done"
+fi
+
 MON_PID=$(pidof ceph-mon)
 MON_ID=$(hostname -s)
 
@@ -37,8 +52,6 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then
     echo WARNING: Ceph was not upgraded, restarting daemons
     service ceph start mon.${MON_ID}
 elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
-    echo INFO: Ceph was upgraded to Jewel
-
     # RPM could own some of these but we can't take risks on the pre-existing files
     for d in /var/lib/ceph/mon /var/log/ceph /var/run/ceph /etc/ceph; do
         chown -R ceph:ceph $d
@@ -54,9 +67,11 @@ elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
 
     # Wait for daemon to be back in the quorum
     timeout 300 bash -c "until (ceph quorum_status | jq .quorum_names | grep -sq ${MON_ID}); do
-      echo Waiting for mon.${MON_ID} to re-join quorum;
+      echo WARNING: Waiting for mon.${MON_ID} to re-join quorum;
       sleep 10;
     done"
+
+    echo INFO: Ceph was upgraded to Jewel
 else
     echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention
     exit 1
index c2e1488..598d22d 100644 (file)
@@ -26,6 +26,10 @@ parameters:
     constraints:
     - allowed_values: ['auto', 'yes', 'no']
     default: 'auto'
+  IgnoreCephUpgradeWarnings:
+    type: boolean
+    default: false
+    description: If enabled, the Ceph upgrade does not wait for HEALTH_OK when the cluster reports warnings; HEALTH_ERR still aborts it
 
 resources:
   # TODO(jistr): for Mitaka->Newton upgrades and further we can use
@@ -36,7 +40,16 @@ resources:
     type: OS::Heat::SoftwareConfig
     properties:
       group: script
-      config: {get_file: major_upgrade_ceph_mon.sh}
+      config:
+        list_join:
+        - ''
+        - - str_replace:
+              template: |
+                #!/bin/bash
+                ignore_ceph_upgrade_warnings='IGNORE_CEPH_UPGRADE_WARNINGS'
+              params:
+                IGNORE_CEPH_UPGRADE_WARNINGS: {get_param: IgnoreCephUpgradeWarnings}
+          - get_file: major_upgrade_ceph_mon.sh
 
   CephMonUpgradeDeployment:
     type: OS::Heat::SoftwareDeploymentGroup