extraconfig/tasks/yum_update.sh

   1 #!/bin/bash
   2
   3 # A heat-config-script which runs yum update during a stack-update.
   4 # Inputs:
   5 #   deploy_action - yum will only be run if this is UPDATE
   6 #   update_identifier - yum will only run for previously unused values of update_identifier
   7 #   command - yum sub-command to run, defaults to "update"
   8 #   command_arguments - yum command arguments, defaults to ""
   9
  10 echo "Started yum_update.sh on server $deploy_server_id at `date`"
  11 echo -n "false" > $heat_outputs_path.update_managed_packages
  12
  13 if [[ -z "$update_identifier" ]]; then
  14     echo "Not running due to unset update_identifier"
  15     exit 0
  16 fi
  17
  18 timestamp_dir=/var/lib/overcloud-yum-update
  19 mkdir -p $timestamp_dir
  20
  21 # sanitise to remove unusual characters
  22 update_identifier=${update_identifier//[^a-zA-Z0-9-_]/}
  23
  24 # seconds to wait for this node to rejoin the cluster after update
  25 cluster_start_timeout=600
  26 galera_sync_timeout=360
  27 cluster_settle_timeout=1800
  28
  29 timestamp_file="$timestamp_dir/$update_identifier"
  30 if [[ -a "$timestamp_file" ]]; then
  31     echo "Not running for already-run timestamp \"$update_identifier\""
  32     exit 0
  33 fi
  34 touch "$timestamp_file"
  35
  36 command_arguments=${command_arguments:-}
  37
  38 list_updates=$(yum list updates)
  39
  40 if [[ "$list_updates" == "" ]]; then
  41     echo "No packages require updating"
  42     exit 0
  43 fi
  44
  45 pacemaker_status=$(systemctl is-active pacemaker)
  46 pacemaker_dumpfile=$(mktemp)
  47
  48 if [[ "$pacemaker_status" == "active" ]] ; then
  49 SERVICES="memcached
  50 httpd
  51 neutron-dhcp-agent
  52 neutron-l3-agent
  53 neutron-metadata-agent
  54 neutron-openvswitch-agent
  55 neutron-server
  56 openstack-ceilometer-api
  57 openstack-ceilometer-central
  58 openstack-ceilometer-collector
  59 openstack-ceilometer-notification
  60 openstack-aodh-api
  61 openstack-aodh-evaluator
  62 openstack-aodh-notifier
  63 openstack-aodh-listener
  64 openstack-cinder-api
  65 openstack-cinder-scheduler
  66 openstack-cinder-volume
  67 openstack-glance-api
  68 openstack-glance-registry
  69 openstack-heat-api
  70 openstack-heat-api-cfn
  71 openstack-heat-api-cloudwatch
  72 openstack-heat-engine
  73 openstack-keystone
  74 openstack-nova-api
  75 openstack-nova-conductor
  76 openstack-nova-consoleauth
  77 openstack-nova-novncproxy
  78 openstack-nova-scheduler"
  79
  80     echo "Dumping Pacemaker config"
  81     pcs cluster cib $pacemaker_dumpfile
  82
  83     echo "Checking for missing constraints"
  84
  85     if ! pcs constraint order show | grep "start openstack-nova-novncproxy-clone then start openstack-nova-api-clone"; then
  86         pcs -f $pacemaker_dumpfile constraint order start openstack-nova-novncproxy-clone then openstack-nova-api-clone
  87     fi
  88
  89     if ! pcs constraint order show | grep "start rabbitmq-clone then start openstack-keystone-clone"; then
  90         pcs -f $pacemaker_dumpfile constraint order start rabbitmq-clone then openstack-keystone-clone
  91     fi
  92
  93     if ! pcs constraint order show | grep "promote galera-master then start openstack-keystone-clone"; then
  94         pcs -f $pacemaker_dumpfile constraint order promote galera-master then openstack-keystone-clone
  95     fi
  96
  97     if pcs resource | grep "haproxy-clone"; then
  98         SERVICES="$SERVICES haproxy"
  99         if ! pcs constraint order show | grep "start haproxy-clone then start openstack-keystone-clone"; then
 100             pcs -f $pacemaker_dumpfile constraint order start haproxy-clone then openstack-keystone-clone
 101         fi
 102     fi
 103
 104     if ! pcs constraint order show | grep "start memcached-clone then start openstack-keystone-clone"; then
 105         pcs -f $pacemaker_dumpfile constraint order start memcached-clone then openstack-keystone-clone
 106     fi
 107
 108     if ! pcs constraint order show | grep "promote redis-master then start openstack-ceilometer-central-clone"; then
 109         pcs -f $pacemaker_dumpfile constraint order promote redis-master then start openstack-ceilometer-central-clone require-all=false
 110     fi
 111
 112     if ! pcs constraint order show | grep "promote redis-master then start openstack-aodh-evaluator-clone"; then
 113         pcs -f $pacemaker_dumpfile constraint order promote redis-master then start openstack-aodh-evaluator-clone require-all=false
 114     fi
 115     # ensure neutron constraints https://review.openstack.org/#/c/229466
 116     # remove ovs-cleanup after server and add openvswitch-agent instead
 117     if  pcs constraint order show  | grep "start neutron-server-clone then start neutron-ovs-cleanup-clone"; then
 118         pcs -f $pacemaker_dumpfile constraint remove order-neutron-server-clone-neutron-ovs-cleanup-clone-mandatory
 119     fi
 120     if ! pcs constraint order show | grep "start neutron-server-clone then start neutron-openvswitch-agent-clone"; then
 121         pcs -f $pacemaker_dumpfile constraint order start neutron-server-clone then neutron-openvswitch-agent-clone
 122     fi
 123
 124
 125     if ! pcs resource defaults | grep "resource-stickiness: INFINITY"; then
 126         pcs -f $pacemaker_dumpfile resource defaults resource-stickiness=INFINITY
 127     fi
 128
 129     echo "Setting resource start/stop timeouts"
 130     for service in $SERVICES; do
 131         pcs -f $pacemaker_dumpfile resource update $service op start timeout=200s op stop timeout=200s
 132     done
 133     # mongod start timeout is higher, setting only stop timeout
 134     pcs -f $pacemaker_dumpfile resource update mongod op start timeout=370s op  stop timeout=200s
 135
 136     echo "Making sure rabbitmq has the notify=true meta parameter"
 137     pcs -f $pacemaker_dumpfile resource update rabbitmq meta notify=true
 138
 139     echo "Applying new Pacemaker config"
 140     if ! pcs cluster cib-push $pacemaker_dumpfile; then
 141         echo "ERROR failed to apply new pacemaker config"
 142         exit 1
 143     fi
 144
 145     echo "Pacemaker running, stopping cluster node and doing full package update"
 146     node_count=$(pcs status xml | grep -o "<nodes_configured.*/>" | grep -o 'number="[0-9]*"' | grep -o "[0-9]*")
 147     if [[ "$node_count" == "1" ]] ; then
 148         echo "Active node count is 1, stopping node with --force"
 149         pcs cluster stop --force
 150     else
 151         pcs cluster stop
 152     fi
 153
 154     # clean leftover keepalived and radvd instances from neutron
 155     # (can be removed when we remove neutron-netns-cleanup from cluster services)
 156     # see https://review.gerrithub.io/#/c/248931/1/neutron-netns-cleanup.init
 157     killall neutron-keepalived-state-change 2>/dev/null || :
 158     kill $(ps ax | grep -e "keepalived.*\.pid-vrrp" | awk '{print $1}') 2>/dev/null || :
 159     kill $(ps ax | grep -e "radvd.*\.pid\.radvd" | awk '{print $1}') 2>/dev/null || :
 160 else
 161     echo "Upgrading openstack-puppet-modules"
 162     yum -q -y update openstack-puppet-modules
 163     echo "Upgrading other packages is handled by config management tooling"
 164     echo -n "true" > $heat_outputs_path.update_managed_packages
 165     exit 0
 166 fi
 167
 168 command=${command:-update}
 169 full_command="yum -q -y $command $command_arguments"
 170 echo "Running: $full_command"
 171
 172 result=$($full_command)
 173 return_code=$?
 174 echo "$result"
 175 echo "yum return code: $return_code"
 176
 177 if [[ "$pacemaker_status" == "active" ]] ; then
 178     echo "Starting cluster node"
 179     pcs cluster start
 180
 181     hostname=$(hostname -s)
 182     tstart=$(date +%s)
 183     while [[ "$(pcs status | grep "^Online" | grep -F -o $hostname)" == "" ]]; do
 184         sleep 5
 185         tnow=$(date +%s)
 186         if (( tnow-tstart > cluster_start_timeout )) ; then
 187             echo "ERROR $hostname failed to join cluster in $cluster_start_timeout seconds"
 188             pcs status
 189             exit 1
 190         fi
 191     done
 192
 193     tstart=$(date +%s)
 194     while ! clustercheck; do
 195         sleep 5
 196         tnow=$(date +%s)
 197         if (( tnow-tstart > galera_sync_timeout )) ; then
 198             echo "ERROR galera sync timed out"
 199             exit 1
 200         fi
 201     done
 202
 203     echo "Waiting for pacemaker cluster to settle"
 204     if ! timeout -k 10 $cluster_settle_timeout crm_resource --wait; then
 205         echo "ERROR timed out while waiting for the cluster to settle"
 206         exit 1
 207     fi
 208
 209     pcs status
 210 fi
 211
 212 echo "Finished yum_update.sh on server $deploy_server_id at `date`"
 213
 214 exit $return_code