Merge "Increasing neutron timeout for low memory usage"

[apex-tripleo-heat-templates.git] / extraconfig / tasks / major_upgrade_ceph_mon.sh
diff --git a/extraconfig/tasks/major_upgrade_ceph_mon.sh b/extraconfig/tasks/major_upgrade_ceph_mon.sh

index 38befbb..e0d160f 100755 (executable)
--- a/extraconfig/tasks/major_upgrade_ceph_mon.sh
+++ b/extraconfig/tasks/major_upgrade_ceph_mon.sh
@@ -5,7 +5,7 @@ set -o pipefail
  echo INFO: starting $(basename "$0")
  
  # Exit if not running
-if ! pidof ceph-mon; then
+if ! pidof ceph-mon &> /dev/null; then
      echo INFO: ceph-mon is not running, skipping
      exit 0
  fi
@@ -17,6 +17,21 @@ if ! [[ "$INSTALLED_VERSION" =~ ^0\.94.* ]]; then
      exit 0
  fi
  
+CEPH_STATUS=$(ceph health | awk '{print $1}')
+if [ "${CEPH_STATUS}" = HEALTH_ERR ]; then
+    echo ERROR: Ceph cluster status is HEALTH_ERR, cannot be upgraded
+    exit 1
+fi
+
+# Useful when upgrading with OSDs num < replica size
+if [[ ${ignore_ceph_upgrade_warnings:-False} != [Tt]rue ]]; then
+    # Single-quoted so the inner shell re-runs the health check on every pass
+    timeout 300 bash -c 'until [ "$(ceph health | awk "{print \$1}")" = HEALTH_OK ]; do
+      echo WARNING: Waiting for Ceph cluster status to go HEALTH_OK;
+      sleep 30;
+    done'
+fi
+
  MON_PID=$(pidof ceph-mon)
  MON_ID=$(hostname -s)
  
@@ -29,7 +44,7 @@ timeout 60 bash -c "while kill -0 ${MON_PID} 2> /dev/null; do
  done"
  
  # Update to Jewel
-yum -y -q update ceph-mon
+yum -y -q update ceph-mon ceph
  
  # Restart/Exit if not on Jewel, only in that case we need the changes
  UPDATED_VERSION=$(ceph --version | awk '{print $3}')
@@ -37,11 +52,9 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then
      echo WARNING: Ceph was not upgraded, restarting daemons
      service ceph start mon.${MON_ID}
  elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
-    echo INFO: Ceph was upgraded to Jewel
-
      # RPM could own some of these but we can't take risks on the pre-existing files
      for d in /var/lib/ceph/mon /var/log/ceph /var/run/ceph /etc/ceph; do
-        chown -R ceph:ceph $d
+        chown -L -R ceph:ceph $d || echo WARNING: chown of $d failed
      done
  
      # Replay udev events with newer rules
@@ -54,9 +67,15 @@ elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then
  
      # Wait for daemon to be back in the quorum
      timeout 300 bash -c "until (ceph quorum_status | jq .quorum_names | grep -sq ${MON_ID}); do
-      echo Waiting for mon.${MON_ID} to re-join quorum;
+      echo WARNING: Waiting for mon.${MON_ID} to re-join quorum;
        sleep 10;
      done"
+
+    # if tunables become legacy, cluster status will be HEALTH_WARN causing
+    # upgrade to fail on following node
+    ceph osd crush tunables default
+
+    echo INFO: Ceph was upgraded to Jewel
  else
      echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention
      exit 1