[test] print log files
[doctor.git] / tests / run.sh
#!/bin/bash -e
##############################################################################
# Copyright (c) 2016 NEC Corporation and others.
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################

# Configuration

[[ "${CI_DEBUG:-true}" == [Tt]rue ]] && set -x

IMAGE_URL=https://launchpad.net/cirros/trunk/0.3.0/+download/cirros-0.3.0-x86_64-disk.img
# If an existing image name is provided in the environment, use that one
IMAGE_NAME=${IMAGE_NAME:-cirros}
IMAGE_FILE="${IMAGE_NAME}.img"
IMAGE_FORMAT=qcow2
VM_BASENAME=doctor_vm
VM_FLAVOR=m1.tiny
# If VM_COUNT is set in the environment, use that value instead
VM_COUNT=${VM_COUNT:-1}
ALARM_BASENAME=doctor_alarm
INSPECTOR_PORT=12345
CONSUMER_PORT=12346
DOCTOR_USER=doctor
DOCTOR_PW=doctor
DOCTOR_PROJECT=doctor
# TODO: change back to `_member_` when JIRA DOCTOR-55 is done
DOCTOR_ROLE=admin
PROFILER_TYPE=${PROFILER_TYPE:-none}
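
# NOTE: the ${VAR:-default} expansions above allow these settings to be
# overridden from the environment. Illustrative invocation (the values are
# placeholders, not CI defaults):
#   IMAGE_NAME=my-cirros VM_COUNT=2 PROFILER_TYPE=poc CI_DEBUG=false ./run.sh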

TOP_DIR=$(cd $(dirname "$0") && pwd)

as_doctor_user="--os-username $DOCTOR_USER --os-password $DOCTOR_PW
                --os-tenant-name $DOCTOR_PROJECT"


# Functions

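# get_compute_host_info: resolve which compute host the first test VM
# (doctor_vm1) is running on, map the hostname to an IP address with the
# get_compute_ip_from_hostname helper (provided by one of the sourced helper
# files), and verify ping/SSH reachability before any failure is injected.
# The hostname is parsed from the OS-EXT-SRV-ATTR:host field of
# "openstack server show"; an illustrative (not verbatim) row looks like:
#   | OS-EXT-SRV-ATTR:host | compute-0.example.tld |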
get_compute_host_info() {
    # get info of the compute host that the first VM booted on
    COMPUTE_HOST=$(openstack $as_doctor_user server show ${VM_BASENAME}1 |
                   grep "OS-EXT-SRV-ATTR:host" | awk '{ print $4 }')
    compute_host_in_undercloud=${COMPUTE_HOST%%.*}
    die_if_not_set $LINENO COMPUTE_HOST "Failed to get compute hostname"

    get_compute_ip_from_hostname $COMPUTE_HOST

    echo "COMPUTE_HOST=$COMPUTE_HOST"
    echo "COMPUTE_IP=$COMPUTE_IP"

    # verify connectivity to target compute host
    # ("if ! ..." so a failure is reported by die instead of aborting via "set -e")
    if ! ping -c 1 "$COMPUTE_IP"; then
        die $LINENO "Cannot ping compute host"
    fi

    # verify ssh to target compute host
    if ! ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'exit'; then
        die $LINENO "Cannot ssh to compute host"
    fi
}

# TODO(r-mibu): update this function to support consumer instance
#               and migrate this function into installer lib
get_consumer_ip___to_be_removed() {
    local get_consumer_command="ip route get $COMPUTE_IP | awk '/ src /{print \$NF}'"
    if is_installer apex; then
        CONSUMER_IP=$(sudo ssh $ssh_opts root@$INSTALLER_IP \
                      "$get_consumer_command")
    elif is_installer fuel; then
        CONSUMER_IP=$(sudo sshpass -p r00tme ssh $ssh_opts root@${INSTALLER_IP} \
                      "$get_consumer_command")
    elif is_installer local; then
        # eval so that the pipe inside the command string is interpreted locally
        CONSUMER_IP=$(eval "$get_consumer_command")
    fi
    echo "CONSUMER_IP=$CONSUMER_IP"

    die_if_not_set $LINENO CONSUMER_IP "Could not get CONSUMER_IP."
}

download_image() {
    # If a different image name was provided in the environment, there is no
    # need to download the default image.
    use_existing_image=false
    openstack image list | grep -q " $IMAGE_NAME " && use_existing_image=true

    if [[ "$use_existing_image" == false ]] ; then
        [ -e "$IMAGE_FILE" ] && return 0
        # note: wget -O (capital O) writes the downloaded image to the file
        wget "$IMAGE_URL" -O "$IMAGE_FILE"
    fi
}

register_image() {
    openstack image list | grep -q " $IMAGE_NAME " && return 0
    openstack image create "$IMAGE_NAME" \
                           --public \
                           --disk-format "$IMAGE_FORMAT" \
                           --container-format bare \
                           --file "$IMAGE_FILE"
}

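# create_test_user: create the non-admin doctor project, user and role
# assignment if they do not exist yet, and raise the instance and core quotas
# when they are lower than VM_COUNT so that all test VMs can be booted.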
create_test_user() {
    openstack project list | grep -q " $DOCTOR_PROJECT " || {
        openstack project create "$DOCTOR_PROJECT"
    }
    openstack user list | grep -q " $DOCTOR_USER " || {
        openstack user create "$DOCTOR_USER" --password "$DOCTOR_PW" \
                              --project "$DOCTOR_PROJECT"
    }
    openstack user role list "$DOCTOR_USER" --project "$DOCTOR_PROJECT" \
    | grep -q " $DOCTOR_ROLE " || {
        openstack role add "$DOCTOR_ROLE" --user "$DOCTOR_USER" \
                           --project "$DOCTOR_PROJECT"
    }
    # tojuvone: "openstack quota show" is broken, so we have to use nova
    # https://bugs.launchpad.net/manila/+bug/1652118
    # Note: while using the openstack client is encouraged, it has proven
    # quite buggy here.
    # QUOTA=$(openstack quota show $DOCTOR_PROJECT)
    DOCTOR_QUOTA=$(nova quota-show --tenant "$DOCTOR_PROJECT")
    # Make sure the quota allows the required number of instances and cores
    OLD_INSTANCE_QUOTA=$(echo "${DOCTOR_QUOTA}" | grep " instances " | \
                         awk '{print $4}')
    if [ "$OLD_INSTANCE_QUOTA" -lt "$VM_COUNT" ]; then
        openstack quota set --instances $VM_COUNT \
                  $DOCTOR_USER
    fi
    OLD_CORES_QUOTA=$(echo "${DOCTOR_QUOTA}" | grep " cores " | \
                      awk '{print $4}')
    if [ "$OLD_CORES_QUOTA" -lt "$VM_COUNT" ]; then
        openstack quota set --cores $VM_COUNT \
                  $DOCTOR_USER
    fi
}

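# boot_vm: boot VM_COUNT test VMs (doctor_vm1, doctor_vm2, ...) as the
# non-admin doctor user, skipping any that already exist.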
boot_vm() {
    # the test VMs are created by the test user, so the non-admin case is
    # exercised as well
    servers=$(openstack $as_doctor_user server list)
    for i in $(seq $VM_COUNT); do
        echo "${servers}" | grep -q " $VM_BASENAME$i " && continue
        openstack $as_doctor_user server create --flavor "$VM_FLAVOR" \
                            --image "$IMAGE_NAME" \
                            "$VM_BASENAME$i"
    done
    sleep 1
}

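# create_alarm: create one Ceilometer event alarm per test VM. Each alarm
# matches compute.instance.update events whose traits report the instance in
# "error" state and, when triggered, notifies the consumer endpoint
# http://localhost:$CONSUMER_PORT/failure (reached through the SSH tunnels
# opened in start_consumer).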
create_alarm() {
    # get vm_id as test user
    alarm_list=$(ceilometer $as_doctor_user alarm-list)
    vms=$(openstack $as_doctor_user server list)
    for i in $(seq $VM_COUNT); do
        echo "${alarm_list}" | grep -q " $ALARM_BASENAME$i " || {
            vm_id=$(echo "${vms}" | grep " $VM_BASENAME$i " | awk '{print $2}')
            # TODO(r-mibu): change the notification endpoint from localhost to
            # the consumer IP address (functest container)
            ceilometer $as_doctor_user alarm-event-create \
                       --name "$ALARM_BASENAME$i" \
                       --alarm-action "http://localhost:$CONSUMER_PORT/failure" \
                       --description "VM failure" \
                       --enabled True \
                       --repeat-actions False \
                       --severity "moderate" \
                       --event-type compute.instance.update \
                       -q "traits.state=string::error; \
                       traits.instance_id=string::$vm_id"
        }
    done
}

start_monitor() {
    pgrep -f "python monitor.py" && return 0
    sudo -E python monitor.py "$COMPUTE_HOST" "$COMPUTE_IP" "$INSPECTOR_TYPE" \
        "http://127.0.0.1:$INSPECTOR_PORT/events" > monitor.log 2>&1 &
}

stop_monitor() {
    pgrep -f "python monitor.py" || return 0
    sudo kill $(pgrep -f "python monitor.py")
}

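# start_consumer: start the sample consumer listening on CONSUMER_PORT and,
# unless the installer is "local", open a reverse SSH tunnel (-R) from each
# controller node so the alarm action URL http://localhost:$CONSUMER_PORT
# configured in create_alarm reaches this consumer. Each tunnel is roughly
# equivalent to (illustrative expansion of the variables below):
#   sudo ssh $ssh_opts_cpu $COMPUTE_USER@<controller-ip> \
#       -R $CONSUMER_PORT:localhost:$CONSUMER_PORT sleep 600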
start_consumer() {
    pgrep -f "python consumer.py" && return 0
    python consumer.py "$CONSUMER_PORT" > consumer.log 2>&1 &

    # NOTE(r-mibu): create tunnels to the controller nodes, so that we can
    # avoid network problems that depend on the infra and installers.
    # These tunnels are terminated by stop_consumer() or after 10 minutes.
    if ! is_installer local; then
        for ip in $CONTROLLER_IPS
        do
            forward_rule="-R $CONSUMER_PORT:localhost:$CONSUMER_PORT"
            tunnel_command="sudo ssh $ssh_opts_cpu $COMPUTE_USER@$ip $forward_rule sleep 600"
            $tunnel_command > "ssh_tunnel.${ip}.log" 2>&1 < /dev/null &
        done
    fi
}

stop_consumer() {
    pgrep -f "python consumer.py" || return 0
    kill $(pgrep -f "python consumer.py")

    # NOTE(r-mibu): terminate tunnels to the controller nodes
    if ! is_installer local; then
        for ip in $CONTROLLER_IPS
        do
            forward_rule="-R $CONSUMER_PORT:localhost:$CONSUMER_PORT"
            tunnel_command="sudo ssh $ssh_opts_cpu $COMPUTE_USER@$ip $forward_rule sleep 600"
            kill $(pgrep -f "$tunnel_command")
        done
    fi
}

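# wait_for_vm_launch: poll the server list until all VM_COUNT test VMs are
# ACTIVE, dying early if any VM goes to ERROR or the timeout is reached.
# Once all VMs are active, the compute host info is collected and
# VMS_ON_FAILED_HOST records how many test VMs run on that host (used later
# by wait_consumer to decide how many notifications to expect).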
wait_for_vm_launch() {
    echo "waiting for vm launch..."

    count=0
    while [[ ${count} -lt 60 ]]
    do
        active_count=0
        vms=$(openstack $as_doctor_user server list)
        for i in $(seq $VM_COUNT); do
            state=$(echo "${vms}" | grep " $VM_BASENAME$i " | awk '{print $6}')
            if [[ "$state" == "ACTIVE" ]]; then
                active_count=$(($active_count+1))
            elif [[ "$state" == "ERROR" ]]; then
                die $LINENO "vm state $VM_BASENAME$i is ERROR"
            else
                # this VM is not yet active
                count=$(($count+1))
                sleep 5
                continue
            fi
        done
        [[ $active_count -eq $VM_COUNT ]] && {
            echo "get compute host info..."
            get_compute_host_info
            VMS_ON_FAILED_HOST=$(openstack $as_doctor_user server list --host \
                         $COMPUTE_HOST | grep " ${VM_BASENAME}" | wc -l)
            return 0
        }
        # not all VMs are active yet
        count=$(($count+1))
        sleep 5
    done
    die $LINENO "Time out while waiting for VM launch"
}

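# inject_failure: emulate a compute host failure by generating a small
# script, copying it to the target compute host and running it there in the
# background. The script finds the interface that carries COMPUTE_IP, takes
# the link down for 180 seconds, then brings it back up; the link-down
# timestamp it logs is later consumed by run_profiler.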
inject_failure() {
    echo "disabling network of compute host [$COMPUTE_HOST] for 3 mins..."
    cat > disable_network.sh << 'END_TXT'
#!/bin/bash -x
dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $7}')
[[ -n "$dev" ]] || dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $5}')
sleep 1
sudo ip link set $dev down
echo "doctor set link down at" $(date "+%s.%N")
sleep 180
sudo ip link set $dev up
sleep 1
END_TXT
    sed -i -e "s/@COMPUTE_IP@/$COMPUTE_IP/" disable_network.sh
    chmod +x disable_network.sh
    scp $ssh_opts_cpu disable_network.sh "$COMPUTE_USER@$COMPUTE_IP:"
    ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'nohup ./disable_network.sh > disable_network.log 2>&1 &'
    # use host time to get rid of potential time sync deviation between nodes
    triggered=$(date "+%s.%N")
}

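# wait_consumer: wait up to $1 seconds until consumer.log contains one
# "notified at" line per VM on the failed host, i.e. until every expected
# failure notification has arrived.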
wait_consumer() {
    local interval=1
    local rounds=$(($1 / $interval))
    for i in $(seq $rounds); do
        notified_count=$(grep "doctor consumer notified at" consumer.log | wc -l)
        if [[ $notified_count -eq $VMS_ON_FAILED_HOST ]]; then
            return 0
        fi
        sleep $interval
    done
    die $LINENO "Consumer hasn't received fault notification."
}

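# calculate_notification_time: compare the "detected at" timestamp written by
# the monitor with the "notified at" timestamp written by the consumer, and
# report OK only if the notification arrived less than 1 second after
# detection. The awk expression prints the delta in seconds followed by
# OK/NG and sets the exit code accordingly.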
calculate_notification_time() {
    wait_consumer 60
    # keep 'at' as the last keyword just before the value, and use a regex
    # to extract the value instead of relying on a fixed column
    detected=$(grep "doctor monitor detected at" monitor.log |\
               sed -e "s/^.* at //")
    notified=$(grep "doctor consumer notified at" consumer.log |\
               sed -e "s/^.* at //" | tail -1)

    echo "$notified $detected" | \
        awk '{
            d = $1 - $2;
            if (d < 1 && d > 0) { print d " OK"; exit 0 }
            else { print d " NG"; exit 1 }
        }'
}

wait_ping() {
    local interval=5
    local rounds=$(($1 / $interval))
    for i in $(seq $rounds); do
        if ping -c 1 "$COMPUTE_IP"; then
            return 0
        fi
        sleep $interval
    done
}

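# check_host_status: poll "openstack server show" for the first test VM until
# its host_status field matches the expected pattern (e.g. "(DOWN|UNKNOWN)"),
# or die after the given number of seconds. The host_status field is only
# exposed with compute API microversion 2.16 or later, hence the explicit
# --os-compute-api-version below.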
check_host_status() {
    # Check that the host of the first Doctor VM is in the expected state
    # $1    Expected state (regex pattern)
    # $2    Seconds to wait for that state
    expected_state=$1
    local interval=5
    local rounds=$(($2 / $interval))
    for i in $(seq $rounds); do
        host_status_line=$(openstack $as_doctor_user --os-compute-api-version \
                           2.16 server show ${VM_BASENAME}1 | grep "host_status")
        host_status=$(echo $host_status_line | awk '{print $4}')
        die_if_not_set $LINENO host_status "host_status not reported by: nova show ${VM_BASENAME}1"
        if [[ "$host_status" =~ $expected_state ]] ; then
            echo "${VM_BASENAME}1 showing host_status: $host_status"
            return 0
        else
            sleep $interval
        fi
    done
    if [[ "$host_status" =~ $expected_state ]] ; then
        echo "${VM_BASENAME}1 showing host_status: $host_status"
    else
        die $LINENO "host_status: $host_status does not match expected_state: $expected_state"
    fi
}

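# unset_forced_down_hosts: clear the "forced down" flag on every nova-compute
# service still reported as down (typically set by the inspector when it
# marked the host down), then wait until no nova-compute service reports
# down any more.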
unset_forced_down_hosts() {
    for host in $(openstack compute service list --service nova-compute \
                  -f value -c Host -c State | sed -n -e '/down$/s/ *down$//p')
    do
        # TODO(r-mibu): make sample inspector use keystone v3 api
        OS_AUTH_URL=${OS_AUTH_URL/v3/v2.0} \
        python ./nova_force_down.py $host --unset
    done

    echo "waiting for the disabled compute host to be re-enabled..."
    wait_until 'openstack compute service list --service nova-compute
                -f value -c State | grep -q down' 240 5
}

collect_logs() {
    unset_forced_down_hosts
    # TODO: We need to make sure the target compute host is IP reachable
    #       again. wait_ping() will be added by tojuvone.
    sleep 110
    scp $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP:disable_network.log" .

    # TODO(yujunz) collect other logs, e.g. nova, aodh
}

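# run_profiler: when PROFILER_TYPE=poc, gather the timestamps logged by the
# failure script, monitor, inspector and consumer, convert them to
# milliseconds relative to the link-down event, export them as
# DOCTOR_PROFILER_T** and run profiler-poc.py to report the latency breakdown.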
run_profiler() {
    if [[ "$PROFILER_TYPE" == "poc" ]]; then
        linkdown=$(grep "doctor set link down at " disable_network.log |\
                  sed -e "s/^.* at //")
        vmdown=$(grep "doctor mark vm.* error at" inspector.log |tail -n 1 |\
                 sed -e "s/^.* at //")
        hostdown=$(grep "doctor mark host.* down at" inspector.log |\
                 sed -e "s/^.* at //")

        # TODO(yujunz) check the actual delay to verify time sync status
        # expected ~1s delay from $triggered to $linkdown
        relative_start=${linkdown}
        export DOCTOR_PROFILER_T00=$(python -c \
          "print(int(($linkdown-$relative_start)*1000))")
        export DOCTOR_PROFILER_T01=$(python -c \
          "print(int(($detected-$relative_start)*1000))")
        export DOCTOR_PROFILER_T03=$(python -c \
          "print(int(($vmdown-$relative_start)*1000))")
        export DOCTOR_PROFILER_T04=$(python -c \
          "print(int(($hostdown-$relative_start)*1000))")
        export DOCTOR_PROFILER_T09=$(python -c \
          "print(int(($notified-$relative_start)*1000))")

        python profiler-poc.py >doctor_profiler.log 2>&1
    fi
}

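# cleanup: best-effort teardown run from the EXIT trap. It stops the sample
# components, restores any forced-down compute services, waits for the
# compute host to answer ping again, deletes the test VMs, alarms, image
# (unless a pre-existing image was used), role assignment, project and user,
# and finally dumps every *.log file so the results are visible in the CI
# console output (which is what this "print log files" change adds).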
cleanup() {
    set +e
    echo "cleanup..."
    stop_monitor
    stop_inspector
    stop_consumer

    unset_forced_down_hosts

    wait_ping 120

    scp $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP:disable_network.log" .
    vms=$(openstack $as_doctor_user server list)
    vmstodel=""
    for i in $(seq $VM_COUNT); do
        echo "${vms}" | grep -q " $VM_BASENAME$i " &&
            vmstodel+=" $VM_BASENAME$i"
    done
    [[ $vmstodel ]] && openstack $as_doctor_user server delete $vmstodel
    alarm_list=$(ceilometer $as_doctor_user alarm-list)
    for i in $(seq $VM_COUNT); do
        alarm_id=$(echo "${alarm_list}" | grep " $ALARM_BASENAME$i " |
                   awk '{print $2}')
        [ -n "$alarm_id" ] && ceilometer $as_doctor_user alarm-delete "$alarm_id"
    done
    sleep 1

    image_id=$(openstack image list | grep " $IMAGE_NAME " | awk '{print $2}')
    sleep 1
    # if an existing image was used, there is no need to remove it here
    if [[ "$use_existing_image" == false ]] ; then
        [ -n "$image_id" ] && openstack image delete "$image_id"
    fi
    openstack role remove "$DOCTOR_ROLE" --user "$DOCTOR_USER" \
                              --project "$DOCTOR_PROJECT"
    openstack project delete "$DOCTOR_PROJECT"
    openstack user delete "$DOCTOR_USER"

    cleanup_installer
    cleanup_inspector

    # NOTE: temporary log printer
    for f in $(find . -name '*.log')
    do
        echo
        echo "[$f]"
        sed -e 's/^/ | /' $f
        echo
    done
}

# Main process

echo "Note: doctor/tests/run.sh has been executed."
git log --oneline -1 || true   # ignore failure in case git is not installed

trap cleanup EXIT

source $TOP_DIR/functions-common
source $TOP_DIR/lib/installer
source $TOP_DIR/lib/inspector

setup_installer

echo "preparing VM image..."
download_image
register_image

echo "creating test user..."
create_test_user

echo "creating VM..."
boot_vm
wait_for_vm_launch

echo "creating alarm..."
# TODO: switch back to get_consumer_ip once the network problems that depend
#       on the infra and installers are solved
#get_consumer_ip
create_alarm

echo "starting doctor sample components..."
start_inspector
start_monitor
start_consumer

sleep 60
echo "injecting host failure..."
inject_failure

check_host_status "(DOWN|UNKNOWN)" 60
calculate_notification_time
collect_logs
run_profiler

echo "done"