extraconfig/tasks/yum_update.sh

   1 #!/bin/bash
   2
   3 # A heat-config-script which runs yum update during a stack-update.
   4 # Inputs:
   5 #   deploy_action - yum will only be run if this is UPDATE
   6 #   update_identifier - yum will only run for previously unused values of update_identifier
   7 #   command - yum sub-command to run, defaults to "update"
   8 #   command_arguments - yum command arguments, defaults to ""
   9
  10 echo "Started yum_update.sh on server $deploy_server_id at `date`"
  11 echo -n "false" > $heat_outputs_path.update_managed_packages
  12
  13 if [[ -z "$update_identifier" ]]; then
  14     echo "Not running due to unset update_identifier"
  15     exit 0
  16 fi
  17
  18 timestamp_dir=/var/lib/overcloud-yum-update
  19 mkdir -p $timestamp_dir
  20
  21 # sanitise to remove unusual characters
  22 update_identifier=${update_identifier//[^a-zA-Z0-9-_]/}
  23
  24 # seconds to wait for this node to rejoin the cluster after update
  25 cluster_start_timeout=600
  26 galera_sync_timeout=360
  27 cluster_settle_timeout=1800
  28
  29 timestamp_file="$timestamp_dir/$update_identifier"
  30 if [[ -a "$timestamp_file" ]]; then
  31     echo "Not running for already-run timestamp \"$update_identifier\""
  32     exit 0
  33 fi
  34 touch "$timestamp_file"
  35
  36 command_arguments=${command_arguments:-}
  37
  38 list_updates=$(yum list updates)
  39
  40 if [[ "$list_updates" == "" ]]; then
  41     echo "No packages require updating"
  42     exit 0
  43 fi
  44
  45 pacemaker_status=$(systemctl is-active pacemaker)
  46 pacemaker_dumpfile=$(mktemp)
  47
  48 if [[ "$pacemaker_status" == "active" ]] ; then
  49 SERVICES="memcached
  50 httpd
  51 neutron-dhcp-agent
  52 neutron-l3-agent
  53 neutron-metadata-agent
  54 neutron-openvswitch-agent
  55 neutron-server
  56 openstack-ceilometer-alarm-evaluator
  57 openstack-ceilometer-alarm-notifier
  58 openstack-ceilometer-api
  59 openstack-ceilometer-central
  60 openstack-ceilometer-collector
  61 openstack-ceilometer-notification
  62 openstack-cinder-api
  63 openstack-cinder-scheduler
  64 openstack-cinder-volume
  65 openstack-glance-api
  66 openstack-glance-registry
  67 openstack-heat-api
  68 openstack-heat-api-cfn
  69 openstack-heat-api-cloudwatch
  70 openstack-heat-engine
  71 openstack-keystone
  72 openstack-nova-api
  73 openstack-nova-conductor
  74 openstack-nova-consoleauth
  75 openstack-nova-novncproxy
  76 openstack-nova-scheduler"
  77
  78     echo "Dumping Pacemaker config"
  79     pcs cluster cib $pacemaker_dumpfile
  80
  81     echo "Checking for missing constraints"
  82
  83     if ! pcs constraint order show | grep "start openstack-nova-novncproxy-clone then start openstack-nova-api-clone"; then
  84         pcs -f $pacemaker_dumpfile constraint order start openstack-nova-novncproxy-clone then openstack-nova-api-clone
  85     fi
  86
  87     if ! pcs constraint order show | grep "start rabbitmq-clone then start openstack-keystone-clone"; then
  88         pcs -f $pacemaker_dumpfile constraint order start rabbitmq-clone then openstack-keystone-clone
  89     fi
  90
  91     if ! pcs constraint order show | grep "promote galera-master then start openstack-keystone-clone"; then
  92         pcs -f $pacemaker_dumpfile constraint order promote galera-master then openstack-keystone-clone
  93     fi
  94
  95     if pcs resource | grep "haproxy-clone"; then
  96         SERVICES="$SERVICES haproxy"
  97         if ! pcs constraint order show | grep "start haproxy-clone then start openstack-keystone-clone"; then
  98             pcs -f $pacemaker_dumpfile constraint order start haproxy-clone then openstack-keystone-clone
  99         fi
 100     fi
 101
 102     if ! pcs constraint order show | grep "start memcached-clone then start openstack-keystone-clone"; then
 103         pcs -f $pacemaker_dumpfile constraint order start memcached-clone then openstack-keystone-clone
 104     fi
 105
 106     if ! pcs constraint order show | grep "promote redis-master then start openstack-ceilometer-central-clone"; then
 107         pcs -f $pacemaker_dumpfile constraint order promote redis-master then start openstack-ceilometer-central-clone require-all=false
 108     fi
 109
 110     # ensure neutron constraints https://review.openstack.org/#/c/229466
 111     # remove ovs-cleanup after server and add openvswitch-agent instead
 112     if  pcs constraint order show  | grep "start neutron-server-clone then start neutron-ovs-cleanup-clone"; then
 113         pcs -f $pacemaker_dumpfile constraint remove order-neutron-server-clone-neutron-ovs-cleanup-clone-mandatory
 114     fi
 115     if ! pcs constraint order show | grep "start neutron-server-clone then start neutron-openvswitch-agent-clone"; then
 116         pcs -f $pacemaker_dumpfile constraint order start neutron-server-clone then neutron-openvswitch-agent-clone
 117     fi
 118
 119
 120     if ! pcs resource defaults | grep "resource-stickiness: INFINITY"; then
 121         pcs -f $pacemaker_dumpfile resource defaults resource-stickiness=INFINITY
 122     fi
 123
 124     echo "Setting resource start/stop timeouts"
 125     for service in $SERVICES; do
 126         pcs -f $pacemaker_dumpfile resource update $service op start timeout=200s op stop timeout=200s
 127     done
 128     # mongod start timeout is higher, setting only stop timeout
 129     pcs -f $pacemaker_dumpfile resource update mongod op start timeout=370s op  stop timeout=200s
 130
 131     echo "Making sure rabbitmq has the notify=true meta parameter"
 132     pcs -f $pacemaker_dumpfile resource update rabbitmq meta notify=true
 133
 134     echo "Applying new Pacemaker config"
 135     if ! pcs cluster cib-push $pacemaker_dumpfile; then
 136         echo "ERROR failed to apply new pacemaker config"
 137         exit 1
 138     fi
 139
 140     echo "Pacemaker running, stopping cluster node and doing full package update"
 141     node_count=$(pcs status xml | grep -o "<nodes_configured.*/>" | grep -o 'number="[0-9]*"' | grep -o "[0-9]*")
 142     if [[ "$node_count" == "1" ]] ; then
 143         echo "Active node count is 1, stopping node with --force"
 144         pcs cluster stop --force
 145     else
 146         pcs cluster stop
 147     fi
 148
 149     # clean leftover keepalived and radvd instances from neutron
 150     # (can be removed when we remove neutron-netns-cleanup from cluster services)
 151     # see https://review.gerrithub.io/#/c/248931/1/neutron-netns-cleanup.init
 152     killall neutron-keepalived-state-change 2>/dev/null || :
 153     kill $(ps ax | grep -e "keepalived.*\.pid-vrrp" | awk '{print $1}') 2>/dev/null || :
 154     kill $(ps ax | grep -e "radvd.*\.pid\.radvd" | awk '{print $1}') 2>/dev/null || :
 155 else
 156     echo "Upgrading openstack-puppet-modules"
 157     yum -q -y update openstack-puppet-modules
 158     echo "Upgrading other packages is handled by config management tooling"
 159     echo -n "true" > $heat_outputs_path.update_managed_packages
 160     exit 0
 161 fi
 162
 163 command=${command:-update}
 164 full_command="yum -q -y $command $command_arguments"
 165 echo "Running: $full_command"
 166
 167 result=$($full_command)
 168 return_code=$?
 169 echo "$result"
 170 echo "yum return code: $return_code"
 171
 172 if [[ "$pacemaker_status" == "active" ]] ; then
 173     echo "Starting cluster node"
 174     pcs cluster start
 175
 176     hostname=$(hostname -s)
 177     tstart=$(date +%s)
 178     while [[ "$(pcs status | grep "^Online" | grep -F -o $hostname)" == "" ]]; do
 179         sleep 5
 180         tnow=$(date +%s)
 181         if (( tnow-tstart > cluster_start_timeout )) ; then
 182             echo "ERROR $hostname failed to join cluster in $cluster_start_timeout seconds"
 183             pcs status
 184             exit 1
 185         fi
 186     done
 187
 188     tstart=$(date +%s)
 189     while ! clustercheck; do
 190         sleep 5
 191         tnow=$(date +%s)
 192         if (( tnow-tstart > galera_sync_timeout )) ; then
 193             echo "ERROR galera sync timed out"
 194             exit 1
 195         fi
 196     done
 197
 198     echo "Waiting for pacemaker cluster to settle"
 199     if ! timeout -k 10 $cluster_settle_timeout crm_resource --wait; then
 200         echo "ERROR timed out while waiting for the cluster to settle"
 201         exit 1
 202     fi
 203
 204     pcs status
 205 fi
 206
 207 echo "Finished yum_update.sh on server $deploy_server_id at `date`"
 208
 209 exit $return_code