d97a5c9caba43da30e8dcc153eea41d1cbb0308b
[doctor.git] / tests / run.sh
1 #!/bin/bash -e
2 ##############################################################################
3 # Copyright (c) 2016 NEC Corporation and others.
4 #
5 # All rights reserved. This program and the accompanying materials
6 # are made available under the terms of the Apache License, Version 2.0
7 # which accompanies this distribution, and is available at
8 # http://www.apache.org/licenses/LICENSE-2.0
9 ##############################################################################
10
11 # Configuration
12
13 [[ "${CI_DEBUG:-true}" == [Tt]rue ]] && set -x
14
15 IMAGE_URL=https://launchpad.net/cirros/trunk/0.3.0/+download/cirros-0.3.0-x86_64-disk.img
16 #if an existing image name is provided in the enviroment, use that one
17 IMAGE_NAME=${IMAGE_NAME:-cirros}
18 IMAGE_FILE="${IMAGE_NAME}.img"
19 IMAGE_FORMAT=qcow2
20 VM_BASENAME=doctor_vm
21 VM_FLAVOR=m1.tiny
22 #if VM_COUNT set, use that instead
23 VM_COUNT=${VM_COUNT:-1}
24 NET_NAME=doctor_net
25 NET_CIDR=192.168.168.0/24
26 ALARM_BASENAME=doctor_alarm
27 CONSUMER_PORT=12346
28 DOCTOR_USER=doctor
29 DOCTOR_PW=doctor
30 DOCTOR_PROJECT=doctor
31 #TODO: change back to `_member_` when JIRA DOCTOR-55 is done
32 DOCTOR_ROLE=admin
33 PROFILER_TYPE=${PROFILER_TYPE:-none}
34
35 TOP_DIR=$(cd $(dirname "$0") && pwd)
36
37 as_doctor_user="--os-username $DOCTOR_USER --os-password $DOCTOR_PW
38                 --os-project-name $DOCTOR_PROJECT --os-tenant-name $DOCTOR_PROJECT"
39 # NOTE: ceilometer command still requires '--os-tenant-name'.
40 #ceilometer="ceilometer ${as_doctor_user/--os-project-name/--os-tenant-name}"
41 ceilometer="ceilometer $as_doctor_user"
42
43
44 # Functions
45
get_compute_host_info() {
    # Determine the compute host the first test VM was scheduled on,
    # resolve its IP address, and verify ping/ssh reachability.
    # Sets globals: COMPUTE_HOST, compute_host_in_undercloud, COMPUTE_IP.
    COMPUTE_HOST=$(openstack $as_doctor_user server show ${VM_BASENAME}1 |
                   grep "OS-EXT-SRV-ATTR:host" | awk '{ print $4 }')
    compute_host_in_undercloud=${COMPUTE_HOST%%.*}
    die_if_not_set $LINENO COMPUTE_HOST "Failed to get compute hostname"

    # provided by lib/installer; sets COMPUTE_IP
    get_compute_ip_from_hostname $COMPUTE_HOST

    echo "COMPUTE_HOST=$COMPUTE_HOST"
    echo "COMPUTE_IP=$COMPUTE_IP"

    # verify connectivity to target compute host
    # BUG FIX: this script runs under 'bash -e', so the previous
    # 'ping …; if [[ $? -ne 0 ]]' pattern aborted the whole script on ping
    # failure before die() could run. Test the command status directly.
    if ! ping -c 1 "$COMPUTE_IP"; then
        die $LINENO "Can not ping to computer host"
    fi

    # verify ssh to target compute host (same 'bash -e' consideration)
    if ! ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'exit'; then
        die $LINENO "Can not ssh to computer host"
    fi
}
70
71 # TODO(r-mibu): update this function to support consumer instance
72 #               and migrate this function into installer lib
# TODO(r-mibu): update this function to support consumer instance
#               and migrate this function into installer lib
get_consumer_ip___to_be_removed() {
    # Ask the installer node which source address it would use to reach the
    # compute host; that address is where the consumer can be reached.
    # Sets global CONSUMER_IP and dies when it cannot be determined.
    local route_probe="ip route get $COMPUTE_IP | awk '/ src /{print \$NF}'"
    if is_installer apex; then
        CONSUMER_IP=$(sudo ssh $ssh_opts root@$INSTALLER_IP \
                      "$route_probe")
    elif is_installer fuel; then
        CONSUMER_IP=$(sudo sshpass -p r00tme ssh $ssh_opts root@${INSTALLER_IP} \
                      "$route_probe")
    elif is_installer local; then
        # NOTE(review): the variable is word-split, not shell-parsed, so the
        # '|' is passed as a literal argument here — same as the original.
        CONSUMER_IP=$($route_probe)
    fi
    echo "CONSUMER_IP=$CONSUMER_IP"

    die_if_not_set $LINENO CONSUMER_IP "Could not get CONSUMER_IP."
}
88
download_image() {
    #if a different name was provided for the image in the environment
    #there's no need to download the image
    use_existing_image=false
    openstack image list | grep -q " $IMAGE_NAME " && use_existing_image=true

    if [[ "$use_existing_image" == false ]] ; then
        [ -e "$IMAGE_FILE" ] && return 0
        # BUG FIX: lowercase '-o' is wget's *log-file* option — it wrote the
        # transfer log to $IMAGE_FILE while saving the image under its remote
        # name, so register_image later uploaded the log. '-O' saves the
        # downloaded image itself as $IMAGE_FILE.
        wget "$IMAGE_URL" -O "$IMAGE_FILE"
    fi
}
99
register_image() {
    # Upload the local image file to Glance unless an image named
    # $IMAGE_NAME is already registered.
    if openstack image list | grep -q " $IMAGE_NAME "; then
        return 0
    fi
    openstack image create "$IMAGE_NAME" \
                           --file "$IMAGE_FILE" \
                           --disk-format "$IMAGE_FORMAT" \
                           --container-format bare \
                           --public
}
108
create_test_user() {
    # Ensure the doctor project, user and role exist, grant the role, and
    # raise the instance/core quotas when they are below $VM_COUNT.
    if ! openstack project list | grep -q " $DOCTOR_PROJECT "; then
        openstack project create "$DOCTOR_PROJECT"
    fi
    if ! openstack user list | grep -q " $DOCTOR_USER "; then
        openstack user create "$DOCTOR_USER" --password "$DOCTOR_PW" \
                              --project "$DOCTOR_PROJECT"
    fi
    if ! openstack role show "$DOCTOR_ROLE"; then
        openstack role create "$DOCTOR_ROLE"
    fi
    openstack role add "$DOCTOR_ROLE" --user "$DOCTOR_USER" \
                       --project "$DOCTOR_PROJECT"
    # tojuvone: openstack quota show is broken and have to use nova
    # https://bugs.launchpad.net/manila/+bug/1652118
    # Note! while it is encouraged to use openstack client it has proven
    # quite buggy.
    # QUOTA=$(openstack quota show $DOCTOR_PROJECT)
    DOCTOR_QUOTA=$(nova quota-show --tenant $DOCTOR_PROJECT)
    # Make sure the quota allows $VM_COUNT instances and cores.
    # NOTE(review): 'quota set' targets $DOCTOR_USER while 'quota-show' used
    # $DOCTOR_PROJECT; this works because both are named "doctor" — confirm
    # whether the project should be the target.
    OLD_INSTANCE_QUOTA=$(echo "${DOCTOR_QUOTA}" | awk '/ instances / {print $4}')
    if [ $OLD_INSTANCE_QUOTA -lt $VM_COUNT ]; then
        openstack quota set --instances $VM_COUNT $DOCTOR_USER
    fi
    OLD_CORES_QUOTA=$(echo "${DOCTOR_QUOTA}" | awk '/ cores / {print $4}')
    if [ $OLD_CORES_QUOTA -lt $VM_COUNT ]; then
        openstack quota set --cores $VM_COUNT $DOCTOR_USER
    fi
}
142
boot_vm() {
    # test VM done with test user, so can test non-admin
    # Create the dedicated network/subnet when missing, then boot any of the
    # $VM_COUNT servers that do not exist yet.
    openstack $as_doctor_user network show $NET_NAME || \
        openstack $as_doctor_user network create $NET_NAME
    openstack $as_doctor_user subnet show $NET_NAME || \
        openstack $as_doctor_user subnet create $NET_NAME \
            --network $NET_NAME --subnet-range $NET_CIDR --no-dhcp
    net_id=$(openstack $as_doctor_user network show $NET_NAME -f value -c id)

    servers=$(openstack $as_doctor_user server list)
    for i in $(seq $VM_COUNT); do
        if ! echo "${servers}" | grep -q " $VM_BASENAME$i "; then
            openstack $as_doctor_user server create --flavor "$VM_FLAVOR" \
                --image "$IMAGE_NAME" --nic net-id=$net_id "$VM_BASENAME$i"
        fi
    done
    sleep 1
}
163
create_alarm() {
    # As the doctor user, create one event alarm per test VM that POSTs to
    # the consumer's /failure endpoint when a compute.instance.update event
    # reports state=error for that instance. Existing alarms are kept.
    alarm_list=$($ceilometer alarm-list)
    vms=$(openstack $as_doctor_user server list)
    for i in $(seq $VM_COUNT); do
        if ! echo "${alarm_list}" | grep -q " $ALARM_BASENAME$i "; then
            vm_id=$(echo "${vms}" | grep " $VM_BASENAME$i " | awk '{print $2}')
            # TODO(r-mibu): change notification endpoint from localhost to the
            # consumer. IP address (functest container).
            $ceilometer alarm-event-create \
                       --name "$ALARM_BASENAME$i" \
                       --alarm-action "http://localhost:$CONSUMER_PORT/failure" \
                       --description "VM failure" \
                       --enabled True \
                       --repeat-actions False \
                       --severity "moderate" \
                       --event-type compute.instance.update \
                       -q "traits.state=string::error; \
                       traits.instance_id=string::$vm_id"
        fi
    done
}
186
start_monitor() {
    # Launch the sample monitor in the background (privileged via sudo -E),
    # unless one is already running; output goes to monitor.log.
    if pgrep -f "python monitor.py"; then
        return 0
    fi
    sudo -E python monitor.py "$COMPUTE_HOST" "$COMPUTE_IP" "$INSPECTOR_TYPE" \
        > monitor.log 2>&1 &
}
192
stop_monitor() {
    # Terminate a running monitor, if any (it was started under sudo).
    if ! pgrep -f "python monitor.py"; then
        return 0
    fi
    sudo kill $(pgrep -f "python monitor.py")
}
197
start_consumer() {
    # Start the sample consumer on $CONSUMER_PORT unless one is already
    # running; output goes to consumer.log.
    if pgrep -f "python consumer.py"; then
        return 0
    fi
    python consumer.py "$CONSUMER_PORT" > consumer.log 2>&1 &

    # NOTE(r-mibu): create tunnel to the controller nodes, so that we can
    # avoid some network problems depending on infra and installers.
    # This tunnel will be terminated by stop_consumer() or after 10 mins passed.
    if is_installer local; then
        return 0
    fi
    for ip in $CONTROLLER_IPS; do
        forward_rule="-R $CONSUMER_PORT:localhost:$CONSUMER_PORT"
        tunnel_command="sudo ssh $ssh_opts_cpu $COMPUTE_USER@$ip $forward_rule sleep 600"
        $tunnel_command > "ssh_tunnel.${ip}.log" 2>&1 < /dev/null &
    done
}
214
stop_consumer() {
    # Stop a running consumer and tear down the ssh tunnels created by
    # start_consumer() (matched via the exact tunnel command line).
    if ! pgrep -f "python consumer.py"; then
        return 0
    fi
    kill $(pgrep -f "python consumer.py")

    # NOTE(r-mibu): terminate tunnels to the controller nodes
    if is_installer local; then
        return 0
    fi
    for ip in $CONTROLLER_IPS; do
        forward_rule="-R $CONSUMER_PORT:localhost:$CONSUMER_PORT"
        tunnel_command="sudo ssh $ssh_opts_cpu $COMPUTE_USER@$ip $forward_rule sleep 600"
        kill $(pgrep -f "$tunnel_command")
    done
}
229
wait_for_vm_launch() {
    # Poll 'server list' until all $VM_COUNT test VMs are ACTIVE, dying on
    # an ERROR state or after ~60 poll rounds (5s apart). On success,
    # resolves the compute host of the first VM and records how many test
    # VMs run on it in the global VMS_ON_FAILED_HOST.
    echo "waiting for vm launch..."

    count=0
    while [[ ${count} -lt 60 ]]
    do
        active_count=0
        vms=$(openstack $as_doctor_user server list)
        for i in $(seq $VM_COUNT); do
            state=$(echo "${vms}" | grep " $VM_BASENAME$i " | awk '{print $6}')
            if [[ "$state" == "ACTIVE" ]]; then
                active_count=$(($active_count+1))
            elif [[ "$state" == "ERROR" ]]; then
                die $LINENO "vm state $VM_BASENAME$i is ERROR"
            else
                # BUG FIX: previously this branch incremented 'count' and
                # slept 5s per pending VM *inside* this loop (in addition to
                # the per-round sleep below), so one poll round could burn
                # several timeout ticks while re-checking a stale listing.
                # Stop scanning and re-poll instead.
                break
            fi
        done
        [[ $active_count -eq $VM_COUNT ]] && {
            echo "get computer host info..."
            get_compute_host_info
            VMS_ON_FAILED_HOST=$(openstack $as_doctor_user server list --host \
                         $COMPUTE_HOST | grep " ${VM_BASENAME}" |  wc -l)
            return 0
        }
        # Not all VMs active yet: exactly one tick and one sleep per round.
        count=$(($count+1))
        sleep 5
    done
    die $LINENO "Time out while waiting for VM launch"
}
264
inject_failure() {
    # Simulate a sudden compute-host failure: generate a script that takes
    # the host's data interface down for 180s, copy it to the compute node,
    # and launch it detached (nohup) so the ssh session can return while the
    # link is down. Sets global 'triggered' to the injection timestamp.
    echo "disabling network of compute host [$COMPUTE_HOST] for 3 mins..."
    # NOTE: the quoted 'END_TXT' delimiter prevents local expansion; the
    # @COMPUTE_IP@ placeholder is substituted by sed below. The awk picks the
    # device name from 'ip a' ($7, with $5 as fallback — presumably to cope
    # with differing 'ip' output formats; TODO confirm). The "doctor set link
    # down at" timestamp line is parsed later by run_profiler().
    cat > disable_network.sh << 'END_TXT'
#!/bin/bash -x
dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $7}')
[[ -n "$dev" ]] || dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $5}')
sleep 1
sudo ip link set $dev down
echo "doctor set link down at" $(date "+%s.%N")
sleep 180
sudo ip link set $dev up
sleep 1
END_TXT
    sed -i -e "s/@COMPUTE_IP@/$COMPUTE_IP/" disable_network.sh
    chmod +x disable_network.sh
    scp $ssh_opts_cpu disable_network.sh "$COMPUTE_USER@$COMPUTE_IP:"
    ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'nohup ./disable_network.sh > disable_network.log 2>&1 &'
    # use host time to get rid of potential time sync deviation between nodes
    triggered=$(date "+%s.%N")
}
285
wait_consumer() {
    # Wait for the consumer to log one notification per VM on the failed
    # host (VMS_ON_FAILED_HOST), polling consumer.log every second.
    # $1    Maximum seconds to wait; dies on timeout.
    local interval=1
    local rounds=$(($1 / $interval))
    for i in $(seq $rounds); do
        # grep -c counts matching lines directly (no 'wc -l' pipeline);
        # '|| true' keeps a zero-match exit status from tripping 'bash -e'.
        notified_count=$(grep -c "doctor consumer notified at" consumer.log || true)
        if [[ $notified_count -eq $VMS_ON_FAILED_HOST ]]; then
            return 0
        fi
        sleep $interval
    done
    die $LINENO "Consumer hasn't received fault notification."
}
298
calculate_notification_time() {
    # Measure detection → notification latency: prints "<delta> OK" and
    # returns 0 when the delta is within (0, 1) second, otherwise prints
    # "<delta> NG" and returns 1. Sets globals 'detected' and 'notified'
    # (timestamps), which run_profiler() reads afterwards.
    wait_consumer 60
    #keep 'at' as the last keyword just before the value, and
    #use regex to get value instead of the fixed column
    detected=$(grep "doctor monitor detected at" monitor.log |\
               sed -e "s/^.* at //")
    notified=$(grep "doctor consumer notified at" consumer.log |\
               sed -e "s/^.* at //" | tail -1)

    # awk computes notified - detected and sets the exit status accordingly
    echo "$notified $detected" | \
        awk '{
            d = $1 - $2;
            if (d < 1 && d > 0) { print d " OK"; exit 0 }
            else { print d " NG"; exit 1 }
        }'
}
315
check_host_status() {
    # Check host related to first Doctor VM is in wanted state
    # $1    Expected state pattern, e.g. "(DOWN|UNKNOWN)"
    # $2    Seconds to wait to have wanted state
    # NOTE(review): the =~ test below is in the reverse of the usual
    # direction — it checks that $host_status occurs *within* the expected
    # pattern string, which works for "(DOWN|UNKNOWN)" but would also match
    # any substring of it; confirm this is intended.
    expected_state=$1
    local interval=5
    local rounds=$(($2 / $interval))
    for i in `seq $rounds`; do
        # host_status requires compute API microversion 2.16 or later
        host_status_line=$(openstack $as_doctor_user --os-compute-api-version \
                           2.16 server show ${VM_BASENAME}1 | grep "host_status")
        host_status=$(echo $host_status_line | awk '{print $4}')
        die_if_not_set $LINENO host_status "host_status not reported by: nova show ${VM_BASENAME}1"
        if [[ "$expected_state" =~ "$host_status" ]] ; then
            echo "${VM_BASENAME}1 showing host_status: $host_status"
            return 0
        else
            sleep $interval
        fi
    done
    # final check after the loop; $host_status holds the last observed value
    if [[ "$expected_state" =~ "$host_status" ]] ; then
        echo "${VM_BASENAME}1 showing host_status: $host_status"
    else
        die $LINENO  "host_status:$host_status not equal to expected_state: $expected_state"
    fi
}
341
unset_forced_down_hosts() {
    # Clear the nova 'forced_down' flag on every compute service currently
    # reported down, then wait for the services and the compute nodes'
    # network connectivity to recover.
    # for debug
    openstack compute service list --service nova-compute

    downed_computes=$(openstack compute service list --service nova-compute \
                      -f value -c Host -c State | grep ' down$' \
                      | sed -e 's/ *down$//')
    echo "downed_computes: $downed_computes"
    for host in $downed_computes
    do
        # TODO(r-mibu): use openstack client
        #openstack compute service set --up $host nova-compute
        nova service-force-down --unset $host nova-compute
    done

    echo "waiting disabled compute host back to be enabled..."
    # NOTE(review): wait_until is defined in functions-common; presumably it
    # retries (240s timeout, 5s interval) until the quoted command's status
    # indicates no service reports 'down' — confirm its semantics there.
    wait_until 'openstack compute service list --service nova-compute
                -f value -c State | grep -q down' 240 5

    for host in $downed_computes
    do
        # TODO(r-mibu): improve 'get_compute_ip_from_hostname'
        get_compute_ip_from_hostname $host
        # note the negation: waits on the status of '! ping', i.e. for the
        # host to answer ping again (per the wait_until semantics above)
        wait_until "! ping -c 1 $COMPUTE_IP" 120 5
    done
}
368
collect_logs() {
    # Fetch the link-down log from the failed compute node, when one is
    # known; nothing to collect otherwise.
    if [[ -z "$COMPUTE_IP" ]]; then
        return 0
    fi
    scp $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP:disable_network.log" .

    # TODO(yujunz) collect other logs, e.g. nova, aodh
}
376
run_profiler() {
    # Feed measured timestamps to the proof-of-concept profiler when
    # PROFILER_TYPE=poc; otherwise do nothing. Reads the globals 'detected'
    # and 'notified' set by calculate_notification_time().
    if [[ "$PROFILER_TYPE" != "poc" ]]; then
        return 0
    fi

    linkdown=$(grep "doctor set link down at " disable_network.log |\
              sed -e "s/^.* at //")
    vmdown=$(grep "doctor mark vm.* error at" inspector.log |tail -n 1 |\
             sed -e "s/^.* at //")
    hostdown=$(grep "doctor mark host.* down at" inspector.log |\
             sed -e "s/^.* at //")

    # TODO(yujunz) check the actual delay to verify time sync status
    # expected ~1s delay from $trigger to $linkdown
    relative_start=${linkdown}
    # Export each checkpoint as milliseconds relative to link-down.
    export DOCTOR_PROFILER_T00=$(python -c \
      "print(int(($linkdown-$relative_start)*1000))")
    export DOCTOR_PROFILER_T01=$(python -c \
      "print(int(($detected-$relative_start)*1000))")
    export DOCTOR_PROFILER_T03=$(python -c \
      "print(int(($vmdown-$relative_start)*1000))")
    export DOCTOR_PROFILER_T04=$(python -c \
      "print(int(($hostdown-$relative_start)*1000))")
    export DOCTOR_PROFILER_T09=$(python -c \
      "print(int(($notified-$relative_start)*1000))")

    python profiler-poc.py >doctor_profiler.log 2>&1
}
403
cleanup() {
    # Best-effort teardown of everything the test created: sample
    # components, test VMs, alarms, network, image (unless pre-existing),
    # and the doctor user/project. Installed as the EXIT trap, so it runs
    # on both success and failure.
    set +e    # keep cleaning up even when individual steps fail
    echo "cleanup..."
    stop_monitor
    stop_inspector
    stop_consumer

    unset_forced_down_hosts
    collect_logs

    vms=$(openstack $as_doctor_user server list)
    vmstodel=""
    for i in `seq $VM_COUNT`; do
        $(echo "${vms}" | grep -q " $VM_BASENAME$i ") &&
        vmstodel+=" $VM_BASENAME$i"
    done
    [[ $vmstodel ]] && openstack $as_doctor_user server delete $vmstodel
    alarm_list=$($ceilometer alarm-list)
    for i in `seq $VM_COUNT`; do
        alarm_id=$(echo "${alarm_list}" | grep " $ALARM_BASENAME$i " |
                   awk '{print $2}')
        [ -n "$alarm_id" ] && $ceilometer alarm-delete "$alarm_id"
    done
    # subnet must go before its network; short sleeps let deletions settle
    openstack $as_doctor_user subnet delete $NET_NAME
    sleep 1
    openstack $as_doctor_user network delete $NET_NAME
    sleep 1

    image_id=$(openstack image list | grep " $IMAGE_NAME " | awk '{print $2}')
    sleep 1
    #if an existing image was used, there's no need to remove it here
    if [[ "$use_existing_image" == false ]] ; then
        [ -n "$image_id" ] && openstack image delete "$image_id"
    fi
    openstack role remove "$DOCTOR_ROLE" --user "$DOCTOR_USER" \
                              --project "$DOCTOR_PROJECT"
    openstack project delete "$DOCTOR_PROJECT"
    openstack user delete "$DOCTOR_USER"
    # NOTE: remove role only for doctor test.
    #openstack role delete "$DOCTOR_ROLE"

    cleanup_installer
    cleanup_inspector

    # NOTE: Temporal log printer.
    for f in $(find . -name '*.log')
    do
        echo
        echo "[$f]"
        sed -e 's/^/ | /' $f
        echo
    done
}
457
# Main process

echo "Note: doctor/tests/run.sh has been executed."
git log --oneline -1 || true   # best effort; ignored if git is not installed

# Always run cleanup() on exit so OpenStack resources are released.
trap cleanup EXIT

# Shared helpers plus installer- and inspector-specific libraries.
source $TOP_DIR/functions-common
source $TOP_DIR/lib/installer
source $TOP_DIR/lib/inspector

setup_installer

echo "preparing VM image..."
download_image
register_image

echo "creating test user..."
create_test_user

echo "creating VM..."
boot_vm
wait_for_vm_launch

echo "creating alarm..."
#TODO: change back to use, network problems depends on infra and installers
#get_consumer_ip
create_alarm

echo "starting doctor sample components..."
start_inspector
start_monitor
start_consumer

# let the components settle before injecting the failure
sleep 60
echo "injecting host failure..."
inject_failure

# host must be reported DOWN/UNKNOWN within 60s; notification latency must
# then be under 1s (checked by calculate_notification_time)
check_host_status "(DOWN|UNKNOWN)" 60
calculate_notification_time
unset_forced_down_hosts
collect_logs
run_profiler

echo "done"
502 echo "done"