X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=extraconfig%2Ftasks%2Fmajor_upgrade_ceph_mon.sh;h=e0d160f1efc2f4ac2d2058847de9359bd8a64244;hb=64dc5fabbd53752b68e27279bd07e204ddcdc915;hp=38befbbf65692ee2d65c20ba3a80b6aff50c1b90;hpb=a48f946e917b8b4021ea89c318c70e4771e2e211;p=apex-tripleo-heat-templates.git diff --git a/extraconfig/tasks/major_upgrade_ceph_mon.sh b/extraconfig/tasks/major_upgrade_ceph_mon.sh index 38befbbf..e0d160f1 100755 --- a/extraconfig/tasks/major_upgrade_ceph_mon.sh +++ b/extraconfig/tasks/major_upgrade_ceph_mon.sh @@ -5,7 +5,7 @@ set -o pipefail echo INFO: starting $(basename "$0") # Exit if not running -if ! pidof ceph-mon; then +if ! pidof ceph-mon &> /dev/null; then echo INFO: ceph-mon is not running, skipping exit 0 fi @@ -17,6 +17,21 @@ if ! [[ "$INSTALLED_VERSION" =~ ^0\.94.* ]]; then exit 0 fi +CEPH_STATUS=$(ceph health | awk '{print $1}') +if [ ${CEPH_STATUS} = HEALTH_ERR ]; then + echo ERROR: Ceph cluster status is HEALTH_ERR, cannot be upgraded + exit 1 +fi + +# Useful when upgrading with OSDs num < replica size +if [[ ${ignore_ceph_upgrade_warnings:-False} != [Tt]rue ]]; then + timeout 300 bash -c "while [ ${CEPH_STATUS} != HEALTH_OK ]; do + echo WARNING: Waiting for Ceph cluster status to go HEALTH_OK; + sleep 30; + CEPH_STATUS=$(ceph health | awk '{print $1}') + done" +fi + MON_PID=$(pidof ceph-mon) MON_ID=$(hostname -s) @@ -29,7 +44,7 @@ timeout 60 bash -c "while kill -0 ${MON_PID} 2> /dev/null; do done" # Update to Jewel -yum -y -q update ceph-mon +yum -y -q update ceph-mon ceph # Restart/Exit if not on Jewel, only in that case we need the changes UPDATED_VERSION=$(ceph --version | awk '{print $3}') @@ -37,11 +52,9 @@ if [[ "$UPDATED_VERSION" =~ ^0\.94.* ]]; then echo WARNING: Ceph was not upgraded, restarting daemons service ceph start mon.${MON_ID} elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then - echo INFO: Ceph was upgraded to Jewel - # RPM could own some of these but we can't take risks on the pre-existing files for d in /var/lib/ceph/mon /var/log/ceph /var/run/ceph /etc/ceph; do - chown -R ceph:ceph $d + chown -L -R ceph:ceph $d || echo WARNING: chown of $d failed done # Replay udev events with newer rules @@ -54,9 +67,15 @@ elif [[ "$UPDATED_VERSION" =~ ^10\.2.* ]]; then # Wait for daemon to be back in the quorum timeout 300 bash -c "until (ceph quorum_status | jq .quorum_names | grep -sq ${MON_ID}); do - echo Waiting for mon.${MON_ID} to re-join quorum; + echo WARNING: Waiting for mon.${MON_ID} to re-join quorum; sleep 10; done" + + # if tunables become legacy, cluster status will be HEALTH_WARN causing + # upgrade to fail on following node + ceph osd crush tunables default + + echo INFO: Ceph was upgraded to Jewel else echo ERROR: Ceph was upgraded to an unknown release, daemon is stopped, need manual intervention exit 1