Pacemaker maintenance mode for the duration of Puppet run on update
author Steven Hardy <shardy@redhat.com>
Fri, 13 Nov 2015 11:18:50 +0000 (11:18 +0000)
committer Jiri Stransky <jistr@redhat.com>
Mon, 14 Dec 2015 13:24:13 +0000 (14:24 +0100)
This enables pacemaker maintenance mode when running Puppet on stack
update. Puppet can try to restart some overcloud services, which
pacemaker tries to prevent, and this can result in a failed Puppet run.

At the end of the puppet run, certain pacemaker resources are restarted
in an additional SoftwareDeployment to make sure that any config changes
have been fully applied. This is only done on stack updates (when
UpdateIdentifier is set to something), because the assumption is that on
stack create services already come up with the correct config.

(Change I9556085424fa3008d7f596578b58e7c33a336f75 has been squashed into
this one.)

Change-Id: I4d40358c511fc1f95b78a859e943082aaea17899
Co-Authored-By: Jiri Stransky <jistr@redhat.com>
Co-Authored-By: James Slagle <jslagle@redhat.com>
environments/puppet-pacemaker.yaml
extraconfig/tasks/noop.yaml [new file with mode: 0644]
extraconfig/tasks/pacemaker_resource_restart.sh [new file with mode: 0755]
extraconfig/tasks/post_puppet_pacemaker.yaml [new file with mode: 0644]
extraconfig/tasks/pre_puppet_pacemaker.yaml [new file with mode: 0644]
overcloud-resource-registry-puppet.yaml
overcloud.yaml
puppet/all-nodes-config.yaml
puppet/controller-post.yaml

index f235cf8..8986e35 100644 (file)
@@ -2,3 +2,5 @@
 # Overcloud controller with Pacemaker.
 resource_registry:
   OS::TripleO::ControllerConfig: ../puppet/controller-config-pacemaker.yaml
+  OS::TripleO::Tasks::ControllerPrePuppet: ../extraconfig/tasks/pre_puppet_pacemaker.yaml
+  OS::TripleO::Tasks::ControllerPostPuppet: ../extraconfig/tasks/post_puppet_pacemaker.yaml
diff --git a/extraconfig/tasks/noop.yaml b/extraconfig/tasks/noop.yaml
new file mode 100644 (file)
index 0000000..0cff746
--- /dev/null
@@ -0,0 +1,10 @@
+heat_template_version: 2014-10-16
+description: No-op task
+# Accepts the task parameters but defines no resources, so nothing is deployed.
+parameters:
+  input_values:
+    description: input values for the software deployments
+    default: {}
+    type: json
+  servers:
+    type: json
diff --git a/extraconfig/tasks/pacemaker_resource_restart.sh b/extraconfig/tasks/pacemaker_resource_restart.sh
new file mode 100755 (executable)
index 0000000..ad3c370
--- /dev/null
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Restarts pacemaker-managed resources after Puppet has run on a stack update.
+set -eux
+# is-active exits non-zero when the unit is not active; "|| true" stops set -e aborting here.
+pacemaker_status=$(systemctl is-active pacemaker || true)
+check_interval=3  # seconds check_resource sleeps between polls
+
+function check_resource {
+
+  service=$1
+  state=$2
+  timeout=$3
+  tstart=$(date +%s)
+  tend=$(( $tstart + $timeout ))
+
+  if [ "$state" = "stopped" ]; then
+      match_for_incomplete='Started'
+  else # started
+      match_for_incomplete='Stopped'
+  fi
+
+  while (( $(date +%s) < $tend )); do
+      node_states=$(pcs status --full | grep "$service" | grep -v Clone)
+      if echo "$node_states" | grep -q "$match_for_incomplete"; then
+          echo "$service not yet $state, sleeping $check_interval seconds."
+          sleep $check_interval
+      else
+        echo "$service has $state"
+        return
+      fi
+  done
+
+  echo "$service never $state after $timeout seconds" | tee /dev/fd/2
+  exit 1
+
+}
+
+# Run if pacemaker is running, we're the bootstrap node,
+# and we're updating the deployment (not creating).
+if [ "$pacemaker_status" = "active" -a \
+     "$(hiera bootstrap_nodeid)" = "$(facter hostname)" -a \
+     "$(hiera update_identifier)" != "nil" ]; then
+    # Disable httpd, then keystone, waiting for each to report stopped before moving on.
+    pcs resource disable httpd
+    check_resource httpd stopped 300
+    pcs resource disable openstack-keystone
+    check_resource openstack-keystone stopped 1200
+    # Restart the remaining resources; haproxy-clone only when pcs status lists it.
+    if pcs status | grep haproxy-clone; then
+        pcs resource restart haproxy-clone
+    fi
+    pcs resource restart redis-master
+    pcs resource restart mongod-clone
+    pcs resource restart rabbitmq-clone
+    pcs resource restart memcached-clone
+    pcs resource restart galera-master
+    # Re-enable in the reverse order (keystone first, then httpd), waiting for each to start.
+    pcs resource enable openstack-keystone
+    check_resource openstack-keystone started 300
+    pcs resource enable httpd
+    check_resource httpd started 800
+
+fi
diff --git a/extraconfig/tasks/post_puppet_pacemaker.yaml b/extraconfig/tasks/post_puppet_pacemaker.yaml
new file mode 100644 (file)
index 0000000..7de41d9
--- /dev/null
@@ -0,0 +1,44 @@
+heat_template_version: 2014-10-16
+description: 'Post-Puppet Config for Pacemaker deployments'
+
+parameters:
+  servers:
+    type: json
+  input_values:
+    type: json
+    description: input values for the software deployments
+
+resources:
+  # Script: switch pacemaker maintenance-mode off when the pacemaker unit is active.
+  ControllerPostPuppetMaintenanceModeConfig:
+    type: OS::Heat::SoftwareConfig
+    properties:
+      group: script
+      config: |
+        #!/bin/bash
+        pacemaker_status=$(systemctl is-active pacemaker)
+
+        if [ "$pacemaker_status" = "active" ]; then
+            pcs property set maintenance-mode=false
+        fi
+
+  ControllerPostPuppetMaintenanceModeDeployment:
+    type: OS::Heat::SoftwareDeployments
+    properties:
+      servers: {get_param: servers}
+      config: {get_resource: ControllerPostPuppetMaintenanceModeConfig}
+      input_values: {get_param: input_values}
+  # Script taken from the pacemaker_resource_restart.sh file.
+  ControllerPostPuppetRestartConfig:
+    type: OS::Heat::SoftwareConfig
+    properties:
+      group: script
+      config: {get_file: pacemaker_resource_restart.sh}
+  # Ordered after the maintenance-mode deployment above via depends_on.
+  ControllerPostPuppetRestartDeployment:
+    type: OS::Heat::SoftwareDeployments
+    depends_on: ControllerPostPuppetMaintenanceModeDeployment
+    properties:
+      servers: {get_param: servers}
+      config: {get_resource: ControllerPostPuppetRestartConfig}
+      input_values: {get_param: input_values}
diff --git a/extraconfig/tasks/pre_puppet_pacemaker.yaml b/extraconfig/tasks/pre_puppet_pacemaker.yaml
new file mode 100644 (file)
index 0000000..2cfe92a
--- /dev/null
@@ -0,0 +1,30 @@
+heat_template_version: 2014-10-16
+description: 'Pre-Puppet Config for Pacemaker deployments'
+
+parameters:
+  servers:
+    type: json
+  input_values:
+    type: json
+    description: input values for the software deployments
+
+resources:
+  # Script: switch pacemaker maintenance-mode on when the pacemaker unit is active.
+  ControllerPrePuppetMaintenanceModeConfig:
+    type: OS::Heat::SoftwareConfig
+    properties:
+      group: script
+      config: |
+        #!/bin/bash
+        pacemaker_status=$(systemctl is-active pacemaker)
+
+        if [ "$pacemaker_status" = "active" ]; then
+            pcs property set maintenance-mode=true
+        fi
+
+  ControllerPrePuppetMaintenanceModeDeployment:
+    type: OS::Heat::SoftwareDeployments
+    properties:
+      servers: {get_param: servers}
+      config: {get_resource: ControllerPrePuppetMaintenanceModeConfig}
+      input_values: {get_param: input_values}
index c072c29..77368d0 100644 (file)
@@ -21,7 +21,11 @@ resource_registry:
   OS::TripleO::CephClusterConfig::SoftwareConfig: puppet/ceph-cluster-config.yaml
   OS::TripleO::AllNodes::SoftwareConfig: puppet/all-nodes-config.yaml
   OS::TripleO::BootstrapNode::SoftwareConfig: puppet/bootstrap-config.yaml
+
+  # Tasks (for internal TripleO usage)
   OS::TripleO::Tasks::PackageUpdate: extraconfig/tasks/yum_update.yaml
+  OS::TripleO::Tasks::ControllerPrePuppet: extraconfig/tasks/noop.yaml
+  OS::TripleO::Tasks::ControllerPostPuppet: extraconfig/tasks/noop.yaml
 
   # This creates the "heat-admin" user for all OS images by default
   # To disable, replace with firstboot/userdata_default.yaml
index 6763618..82b5f40 100644 (file)
@@ -1127,6 +1127,8 @@ resources:
       neutron_api_node_ips: {get_attr: [ControllerIpListMap, net_ip_map, {get_param: [ServiceNetMap, NeutronApiNetwork]}]}
       keystone_public_api_node_ips: {get_attr: [ControllerIpListMap, net_ip_map, {get_param: [ServiceNetMap, KeystonePublicApiNetwork]}]}
       keystone_admin_api_node_ips: {get_attr: [ControllerIpListMap, net_ip_map, {get_param: [ServiceNetMap, KeystoneAdminApiNetwork]}]}
+      DeployIdentifier: {get_param: DeployIdentifier}
+      UpdateIdentifier: {get_param: UpdateIdentifier}
 
   MysqlRootPassword:
     type: OS::Heat::RandomString
index 2bc519b..1147b85 100644 (file)
@@ -51,6 +51,17 @@ parameters:
   keystone_admin_api_node_ips:
     type: comma_delimited_list
 
+  DeployIdentifier:
+    type: string
+    description: >
+      Setting this to a unique value will re-run any deployment tasks which
+      perform configuration on a Heat stack-update.
+  UpdateIdentifier:
+    type: string
+    description: >
+      Setting to a previously unused value during stack-update will trigger
+      package update on all nodes
+
 resources:
 
   allNodesConfigImpl:
@@ -240,6 +251,9 @@ resources:
                 nova::rabbit_hosts: *rabbit_nodes_array
                 keystone::rabbit_hosts: *rabbit_nodes_array
 
+                deploy_identifier: {get_param: DeployIdentifier}
+                update_identifier: {get_param: UpdateIdentifier}
+
 outputs:
   config_id:
     description: The ID of the allNodesConfigImpl resource.
index 941e1ac..ed8129e 100644 (file)
@@ -17,6 +17,13 @@ parameters:
 
 resources:
 
+  ControllerPrePuppet:
+    type: OS::TripleO::Tasks::ControllerPrePuppet
+    properties:
+      servers:  {get_param: servers}
+      input_values:
+        update_identifier: {get_param: NodeConfigIdentifiers}
+
   ControllerPuppetConfig:
     type: OS::TripleO::ControllerConfig
 
@@ -26,6 +33,7 @@ resources:
   # e.g all Deployment resources should have a *Deployment_StepN suffix
   ControllerLoadBalancerDeployment_Step1:
     type: OS::Heat::StructuredDeployments
+    depends_on: ControllerPrePuppet
     properties:
       servers:  {get_param: servers}
       config: {get_resource: ControllerPuppetConfig}
@@ -98,10 +106,18 @@ resources:
         step: 5
         update_identifier: {get_param: NodeConfigIdentifiers}
 
+  ControllerPostPuppet:
+    type: OS::TripleO::Tasks::ControllerPostPuppet
+    depends_on: ControllerOvercloudServicesDeployment_Step6
+    properties:
+      servers:  {get_param: servers}
+      input_values:
+        update_identifier: {get_param: NodeConfigIdentifiers}
+
   # Note, this should come last, so use depends_on to ensure
   # this is created after any other resources.
   ExtraConfig:
-    depends_on: ControllerOvercloudServicesDeployment_Step5
+    depends_on: ControllerPostPuppet
     type: OS::TripleO::NodeExtraConfigPost
     properties:
         servers: {get_param: servers}