Add checks in ansible upgrade tasks for CephMon and CephOSD
author Giulio Fidente <gfidente@redhat.com>
Fri, 17 Feb 2017 15:49:58 +0000 (16:49 +0100)
committer Giulio Fidente <gfidente@redhat.com>
Fri, 24 Feb 2017 17:04:31 +0000 (12:04 -0500)
Adds two checks borrowed from ceph-ansible: one for the CephMon upgrade
tasks and one for the CephOSD upgrade tasks.

Change-Id: I0a0e60d277240130c6bd76a74ccc13354b87a30a
Co-Authored-By: Sebastien Han <seb@redhat.com>
(cherry picked from commit a3df16776dd5d7eb0a60ca4c58cef9913eb1c5cb)

extraconfig/tasks/major_upgrade_pacemaker.yaml
puppet/services/ceph-mon.yaml
puppet/services/ceph-osd.yaml
puppet/services/ceph-rgw.yaml

index 8c91027..74d3be7 100644 (file)
@@ -18,10 +18,6 @@ parameters:
     constraints:
     - allowed_values: ['auto', 'yes', 'no']
     default: 'auto'
-  IgnoreCephUpgradeWarnings:
-    type: boolean
-    default: false
-    description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean
   KeepSaharaServicesOnUpgrade:
     type: boolean
     default: true
index 1ce5833..d589ef8 100644 (file)
@@ -59,6 +59,14 @@ parameters:
       }
     default: {}
     type: json
+  CephValidationRetries:
+    type: number
+    default: 5
+    description: Number of retry attempts for Ceph validation
+  CephValidationDelay:
+    type: number
+    default: 10
+    description: Interval (in seconds) in between validation checks
   MonitoringSubscriptionCephMon:
     default: 'overcloud-ceph-mon'
     type: string
@@ -119,21 +127,32 @@ outputs:
         # rolling upgrade of all osd nodes in step1
         - name: Check status
           tags: step0,validation
-          shell: ceph health | grep -qv HEALTH_ERR
-        # FIXME(shardy) I suspect we can use heat or ansible facts here instead?
-        - name: Get hostname
+          shell: ceph health | egrep -sq "HEALTH_OK|HEALTH_WARN"
+        - name: Stop CephMon
           tags: step0
-          shell: hostname -s
-          register: mon_id
-        - name: Stop Ceph Mon
+          service:
+            name: ceph-mon@{{ ansible_hostname }}
+            state: stopped
+        - name: Update Ceph packages
           tags: step0
-          service: name=ceph-mon@{{mon_id.stdout}} pattern=ceph-mon state=stopped
-        - name: Update ceph packages
+          yum:
+            name: ceph-mon
+            state: latest
+        - name: Start CephMon
           tags: step0
-          yum: name=ceph-mon state=latest
-        - name: Start ceph-mon service
-          tags: step0
-          service: name=ceph-mon@{{mon_id.stdout}} state=started
+          service:
+            name: ceph-mon@{{ ansible_hostname }}
+            state: started
+        # ceph-ansible
+        # https://github.com/ceph/ceph-ansible/blob/master/infrastructure-playbooks/rolling_update.yml#L149-L157
+        - name: Wait for the monitor to join the quorum...
+          tags: step0,ceph_quorum_validation
+          shell: |
+            ceph -s | grep monmap | sed 's/.*quorum//' | egrep -sq {{ ansible_hostname }}
+          register: ceph_quorum_nodecheck
+          until: ceph_quorum_nodecheck.rc == 0
+          retries: {get_param: CephValidationRetries}
+          delay: {get_param: CephValidationDelay}
         - name: ceph osd crush tunables default
           tags: step0
           shell: ceph osd crush tunables default
index 9bd83aa..a97fa11 100644 (file)
@@ -21,6 +21,24 @@ parameters:
   MonitoringSubscriptionCephOsd:
     default: 'overcloud-ceph-osd'
     type: string
+  CephValidationRetries:
+    type: number
+    default: 40
+    description: Number of retry attempts for Ceph validation
+  CephValidationDelay:
+    type: number
+    default: 30
+    description: Interval (in seconds) in between validation checks
+  IgnoreCephUpgradeWarnings:
+    type: boolean
+    default: false
+    description: If enabled, Ceph upgrade will be forced even though cluster or PGs status is not clean
+
+parameter_groups:
+- label: deprecated
+  description: Do not use deprecated params, they will be removed.
+  parameters:
+  - IgnoreCephUpgradeWarnings
 
 resources:
   CephBase:
@@ -66,17 +84,37 @@ outputs:
         - name: ceph osd set noscrub
           tags: step1
           command: ceph osd set noscrub
-        - name: Stop Ceph OSD
+        - name: Stop CephOSD
           tags: step1
-          service: name=ceph-osd@{{ item }} state=stopped
+          service:
+            name: ceph-osd@{{ item }}
+            state: stopped
           with_items: "{{osd_ids.stdout.strip().split()}}"
-        - name: Update ceph OSD packages
+        - name: Update Ceph packages
           tags: step1
-          yum: name=ceph-osd state=latest
-        - name: Start ceph-osd service
+          yum:
+            name: ceph-osd
+            state: latest
+        - name: Start CephOSD
           tags: step1
-          service: name=ceph-osd@{{ item }} state=started
+          service:
+            name: ceph-osd@{{ item }}
+            state: started
           with_items: "{{osd_ids.stdout.strip().split()}}"
+        # with awk we are meant to check if $2 and $4 are *the same*, but that comparison
+        # evaluates to 1 when they are, so the check is inverted to produce a useful exit code
+        - name: Wait for clean pgs...
+          tags: step1,ceph_pgs_clean_validation
+          vars:
+            ignore_warnings: {get_param: IgnoreCephUpgradeWarnings}
+          shell: |
+            ceph pg stat | awk '{exit($2!=$4)}' && ceph health | egrep -sq "HEALTH_OK|HEALTH_WARN"
+          register: ceph_pgs_healthcheck
+          until: ceph_pgs_healthcheck.rc == 0
+          retries: {get_param: CephValidationRetries}
+          delay: {get_param: CephValidationDelay}
+          when:
+            - not ignore_warnings
         - name: ceph osd unset noout
           tags: step1
           command: ceph osd unset noout
index d7014e5..c5b29c7 100644 (file)
@@ -87,4 +87,6 @@ outputs:
           tags: step0,validation
         - name: Stop RGW instance
           tags: step1
-          service: name=ceph-radosgw@{{rgw_id.stdout}} state=stopped
+          service:
+            name: ceph-radosgw@{{rgw_id.stdout}}
+            state: stopped