#!/bin/bash -e
##############################################################################
# Copyright (c) 2016 NEC Corporation and others.
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################

# Configuration

[[ "${CI_DEBUG:-true}" == [Tt]rue ]] && set -x

IMAGE_URL=https://launchpad.net/cirros/trunk/0.3.0/+download/cirros-0.3.0-x86_64-disk.img
# If an existing image name is provided in the environment, use it
IMAGE_NAME=${IMAGE_NAME:-cirros}
IMAGE_FILE="${IMAGE_NAME}.img"
IMAGE_FORMAT=qcow2
VM_BASENAME=doctor_vm
VM_FLAVOR=m1.tiny
# If VM_COUNT is set, use it instead of the default
VM_COUNT=${VM_COUNT:-1}
NET_NAME=doctor_net
NET_CIDR=192.168.168.0/24
ALARM_BASENAME=doctor_alarm
CONSUMER_PORT=12346
DOCTOR_USER=doctor
DOCTOR_PW=doctor
DOCTOR_PROJECT=doctor
DOCTOR_ROLE=_member_
PROFILER_TYPE=${PROFILER_TYPE:-poc}
PYTHON_ENABLE=${PYTHON_ENABLE:-false}
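
# Example invocation (illustrative, not executed; the image name below is
# hypothetical): most settings above can be overridden from the environment,
# e.g. to reuse a pre-registered image and boot two VMs with tracing off:
#
#   IMAGE_NAME=my-cirros VM_COUNT=2 CI_DEBUG=false ./run.sh
#
# PROFILER_TYPE=poc enables the PoC profiler at the end of the run, and
# PYTHON_ENABLE=true skips this shell flow entirely and runs the python (tox)
# test suite instead (see "Main process" below).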

TOP_DIR=$(cd $(dirname "$0") && pwd)

as_doctor_user="--os-username $DOCTOR_USER --os-password $DOCTOR_PW
                --os-project-name $DOCTOR_PROJECT --os-tenant-name $DOCTOR_PROJECT"
# NOTE: the ceilometer command still requires '--os-tenant-name'.
#ceilometer="ceilometer ${as_doctor_user/--os-project-name/--os-tenant-name}"
ceilometer="ceilometer $as_doctor_user"
as_admin_user="--os-username admin --os-project-name $DOCTOR_PROJECT
               --os-tenant-name $DOCTOR_PROJECT"


# Functions

get_compute_host_info() {
    # get the compute host that the first VM booted on, as the admin user
    COMPUTE_HOST=$(openstack $as_admin_user server show ${VM_BASENAME}1 |
                   grep "OS-EXT-SRV-ATTR:host " | awk '{ print $4 }')
    compute_host_in_undercloud=${COMPUTE_HOST%%.*}
    die_if_not_set $LINENO COMPUTE_HOST "Failed to get compute hostname"

    get_compute_ip_from_hostname $COMPUTE_HOST

    echo "COMPUTE_HOST=$COMPUTE_HOST"
    echo "COMPUTE_IP=$COMPUTE_IP"

    # verify connectivity to the target compute host
    # (test with 'if !' rather than checking $? afterwards: under 'bash -e' a
    # failing ping would abort the script before the check is ever reached)
    if ! ping -c 1 "$COMPUTE_IP"; then
        die $LINENO "Cannot ping the compute host"
    fi

    # verify ssh to the target compute host
    if ! ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'exit'; then
        die $LINENO "Cannot ssh to the compute host"
    fi
}
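
# For reference, the 'server show' output parsed above contains a row like
# (illustrative value):
#
#   | OS-EXT-SRV-ATTR:host | overcloud-novacompute-0.localdomain |
#
# so awk's $4 is the hostname, and ${COMPUTE_HOST%%.*} strips everything from
# the first dot onwards to get the short name used inside the undercloud.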

# TODO(r-mibu): update this function to support a consumer instance
#               and migrate this function into the installer lib
get_consumer_ip___to_be_removed() {
    local get_consumer_command="ip route get $COMPUTE_IP | awk '/ src /{print \$NF}'"
    if is_installer apex; then
        CONSUMER_IP=$(sudo ssh $ssh_opts root@$INSTALLER_IP \
                      "$get_consumer_command")
    elif is_installer fuel; then
        CONSUMER_IP=$(sudo sshpass -p r00tme ssh $ssh_opts root@${INSTALLER_IP} \
                      "$get_consumer_command")
    elif is_installer local; then
        CONSUMER_IP=$($get_consumer_command)
    fi
    echo "CONSUMER_IP=$CONSUMER_IP"

    die_if_not_set $LINENO CONSUMER_IP "Could not get CONSUMER_IP."
}
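
# The awk filter above picks the source address from 'ip route get', whose
# output looks roughly like this (illustrative; assumes 'src' carries the
# last field, as on the iproute2 versions this was written against):
#
#   192.0.2.10 via 10.20.0.1 dev eth0  src 10.20.0.2
#
# i.e. $NF of the line containing ' src ' is the local IP this node would use
# to reach $COMPUTE_IP.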

download_image() {
    # if a different image name was provided in the environment, there is no
    # need to download the image
    use_existing_image=false
    openstack image list | grep -q " $IMAGE_NAME " && use_existing_image=true

    if [[ "$use_existing_image" == false ]] ; then
        [ -e "$IMAGE_FILE" ] && return 0
        # NOTE: wget needs '-O' (capital) to save the download to the file;
        # lowercase '-o' would only redirect the log output
        wget "$IMAGE_URL" -O "$IMAGE_FILE"
    fi
}

register_image() {
    openstack image list | grep -q " $IMAGE_NAME " && return 0
    openstack image create "$IMAGE_NAME" \
                           --public \
                           --disk-format "$IMAGE_FORMAT" \
                           --container-format bare \
                           --file "$IMAGE_FILE"
}

create_test_user() {
    openstack project list | grep -q " $DOCTOR_PROJECT " || {
        openstack project create --description "Doctor Project" \
                                 "$DOCTOR_PROJECT"
    }
    openstack user list | grep -q " $DOCTOR_USER " || {
        openstack user create "$DOCTOR_USER" --password "$DOCTOR_PW" \
                              --project "$DOCTOR_PROJECT"
    }
    openstack role show "$DOCTOR_ROLE" | grep -q " $DOCTOR_ROLE " || {
        openstack role create "$DOCTOR_ROLE"
    }
    openstack role assignment list --user "$DOCTOR_USER" \
    --project "$DOCTOR_PROJECT" --names | grep -q " $DOCTOR_ROLE " || {
        openstack role add "$DOCTOR_ROLE" --user "$DOCTOR_USER" \
                           --project "$DOCTOR_PROJECT"
    }
    openstack role assignment list --user admin --project "$DOCTOR_PROJECT" \
    --names | grep -q " admin " || {
        openstack role add admin --user admin --project "$DOCTOR_PROJECT"
    }
    # tojuvone: 'openstack quota show' is broken, so we have to use nova
    # https://bugs.launchpad.net/manila/+bug/1652118
    # Note: while use of the openstack client is encouraged, it has proven
    # quite buggy.
    # QUOTA=$(openstack quota show $DOCTOR_PROJECT)
    DOCTOR_QUOTA=$(nova quota-show --tenant $DOCTOR_PROJECT)
    # make sure the quota allows the requested number of instances and cores
    OLD_INSTANCE_QUOTA=$(echo "${DOCTOR_QUOTA}" | grep " instances " | \
                         awk '{print $4}')
    if [ $OLD_INSTANCE_QUOTA -lt $VM_COUNT ]; then
        # quotas are set per project, not per user
        openstack quota set --instances $VM_COUNT \
                  $DOCTOR_PROJECT
    fi
    OLD_CORES_QUOTA=$(echo "${DOCTOR_QUOTA}" | grep " cores " | \
                      awk '{print $4}')
    if [ $OLD_CORES_QUOTA -lt $VM_COUNT ]; then
        openstack quota set --cores $VM_COUNT \
                  $DOCTOR_PROJECT
    fi
}
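
# For reference, 'nova quota-show' prints a table like (illustrative values):
#
#   | instances | 10 |
#   | cores     | 20 |
#
# which is why the awk calls above take the 4th whitespace-separated field
# ('|', name, '|', value) as the current quota value.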

remove_test_user() {
    openstack project list | grep -q " $DOCTOR_PROJECT " && {
        openstack role assignment list --user admin \
        --project "$DOCTOR_PROJECT" --names | grep -q " admin " && {
            openstack role remove admin --user admin --project "$DOCTOR_PROJECT"
        }
        openstack user list | grep -q " $DOCTOR_USER " && {
            openstack role assignment list --user "$DOCTOR_USER" \
            --project "$DOCTOR_PROJECT" --names | grep -q " $DOCTOR_ROLE " && {
                openstack role remove "$DOCTOR_ROLE" --user "$DOCTOR_USER" \
                --project "$DOCTOR_PROJECT"
            }
            openstack user delete "$DOCTOR_USER"
        }
        openstack project delete "$DOCTOR_PROJECT"
    }
}

boot_vm() {
    # boot the test VMs as the test user, so non-admin operation is covered

    if ! openstack $as_doctor_user network show $NET_NAME; then
        openstack $as_doctor_user network create $NET_NAME
    fi
    if ! openstack $as_doctor_user subnet show $NET_NAME; then
        openstack $as_doctor_user subnet create $NET_NAME \
            --network $NET_NAME --subnet-range $NET_CIDR --no-dhcp
    fi
    net_id=$(openstack $as_doctor_user network show $NET_NAME -f value -c id)

    servers=$(openstack $as_doctor_user server list)
    for i in $(seq $VM_COUNT); do
        echo "${servers}" | grep -q " $VM_BASENAME$i " && continue
        openstack $as_doctor_user server create --flavor "$VM_FLAVOR" \
            --image "$IMAGE_NAME" --nic net-id=$net_id "$VM_BASENAME$i"
    done
    sleep 1
}
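
# 'network show -f value -c id' prints only the network UUID, which is what
# 'server create --nic net-id=...' expects, e.g. (illustrative output):
#
#   $ openstack network show doctor_net -f value -c id
#   6e3b7a4c-...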

create_alarm() {
    # get vm_id as the test user
    alarm_list=$($ceilometer alarm-list)
    vms=$(openstack $as_doctor_user server list)
    for i in $(seq $VM_COUNT); do
        echo "${alarm_list}" | grep -q " $ALARM_BASENAME$i " || {
            vm_id=$(echo "${vms}" | grep " $VM_BASENAME$i " | awk '{print $2}')
            # TODO(r-mibu): change the notification endpoint from localhost to
            # the consumer IP address (functest container)
            $ceilometer alarm-event-create \
                       --name "$ALARM_BASENAME$i" \
                       --alarm-action "http://localhost:$CONSUMER_PORT/failure" \
                       --description "VM failure" \
                       --enabled True \
                       --repeat-actions False \
                       --severity "moderate" \
                       --event-type compute.instance.update \
                       -q "traits.state=string::error; \
                       traits.instance_id=string::$vm_id"
        }
    done
}
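
# The '-q' filter above makes this an event alarm that only fires for
# compute.instance.update notifications matching, conceptually:
#
#   traits.state == "error" AND traits.instance_id == "$vm_id"
#
# When such an event arrives, the alarm POSTs to the consumer's /failure
# endpoint given in --alarm-action.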

start_consumer() {
    pgrep -f "python consumer.py" && return 0
    python consumer.py "$CONSUMER_PORT" > consumer.log 2>&1 &

    # NOTE(r-mibu): create a tunnel to the controller nodes, so that we can
    # avoid network problems that depend on the infra and installers.
    # The tunnel is terminated by stop_consumer() or after 10 minutes.
    if ! is_installer local; then
        for ip in $CONTROLLER_IPS
        do
            forward_rule="-R $CONSUMER_PORT:localhost:$CONSUMER_PORT"
            tunnel_command="sudo ssh $ssh_opts_cpu $COMPUTE_USER@$ip $forward_rule sleep 600"
            $tunnel_command > "ssh_tunnel.${ip}.log" 2>&1 < /dev/null &
        done
    fi
}
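
# The '-R' rule above sets up ssh reverse port forwarding: connections to
# $CONSUMER_PORT on each controller are forwarded back to the consumer
# listening on this host, so alarm actions sent to
# http://localhost:$CONSUMER_PORT/failure on a controller still reach the
# local consumer. The trailing 'sleep 600' just keeps the ssh session (and
# therefore the tunnel) alive for up to 10 minutes.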

stop_consumer() {
    pgrep -f "python consumer.py" || return 0
    kill $(pgrep -f "python consumer.py")

    # NOTE(r-mibu): terminate the tunnels to the controller nodes
    if ! is_installer local; then
        for ip in $CONTROLLER_IPS
        do
            forward_rule="-R $CONSUMER_PORT:localhost:$CONSUMER_PORT"
            tunnel_command="sudo ssh $ssh_opts_cpu $COMPUTE_USER@$ip $forward_rule sleep 600"
            kill $(pgrep -f "$tunnel_command")
        done
    fi
}

wait_for_vm_launch() {
    echo "waiting for vm launch..."

    count=0
    while [[ ${count} -lt 60 ]]
    do
        active_count=0
        vms=$(openstack $as_doctor_user server list)
        for i in $(seq $VM_COUNT); do
            state=$(echo "${vms}" | grep " $VM_BASENAME$i " | awk '{print $6}')
            if [[ "$state" == "ACTIVE" ]]; then
                active_count=$(($active_count+1))
            elif [[ "$state" == "ERROR" ]]; then
                die $LINENO "VM $VM_BASENAME$i is in ERROR state"
            else
                # this VM is not yet active
                count=$(($count+1))
                sleep 5
                continue
            fi
        done
        [[ $active_count -eq $VM_COUNT ]] && {
            echo "getting compute host info..."
            get_compute_host_info
            VMS_ON_FAILED_HOST=$(openstack $as_doctor_user server list --host \
                         $COMPUTE_HOST | grep " ${VM_BASENAME}" | wc -l)
            return 0
        }
        # not all VMs are active yet
        count=$(($count+1))
        sleep 5
    done
    die $LINENO "Timed out while waiting for VM launch"
}

inject_failure() {
    echo "disabling network of compute host [$COMPUTE_HOST] for 3 mins..."
    cat > disable_network.sh << 'END_TXT'
#!/bin/bash -x
sleep 1
if [ -n "@INTERFACE_NAME@" ]; then
    dev=@INTERFACE_NAME@
else
    dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $NF}')
fi
sudo ip link set $dev down
echo "doctor set link down at" $(date "+%s.%N")
sleep 180
sudo ip link set $dev up
sleep 1
END_TXT
    sed -i -e "s/@COMPUTE_IP@/$COMPUTE_IP/" disable_network.sh
    sed -i -e "s/@INTERFACE_NAME@/$INTERFACE_NAME/" disable_network.sh
    chmod +x disable_network.sh
    scp $ssh_opts_cpu disable_network.sh "$COMPUTE_USER@$COMPUTE_IP:"
    ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'nohup ./disable_network.sh > disable_network.log 2>&1 &'
    # use host time to get rid of potential time sync deviation between nodes
    triggered=$(date "+%s.%N")
}
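
# After the sed substitutions, the generated script run on the compute host
# boils down to (illustrative, with COMPUTE_IP=10.20.0.5 and INTERFACE_NAME
# unset):
#
#   dev=$(sudo ip a | awk '/ 10.20.0.5\//{print $NF}')
#   sudo ip link set $dev down    # simulate the host failure
#   sleep 180
#   sudo ip link set $dev up      # restore the network after 3 minutes
#
# 'nohup ... &' detaches it, so the link goes down after our ssh session ends.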

wait_consumer() {
    local interval=1
    local rounds=$(($1 / $interval))
    for i in $(seq $rounds); do
        notified_count=$(grep "doctor consumer notified at" consumer.log | wc -l)
        if [[ $notified_count -eq $VMS_ON_FAILED_HOST ]]; then
            return 0
        fi
        sleep $interval
    done
    die $LINENO "Consumer has not received the fault notification."
}

calculate_notification_time() {
    wait_consumer 60
    # keep 'at' as the last keyword just before the value, and use a regex to
    # extract the value instead of relying on a fixed column
    if [ ! -f monitor.log ]; then
        scp $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP:monitor.log" .
    fi
    detected=$(grep "doctor monitor detected at" monitor.log |\
               sed -e "s/^.* at //" | tail -1)
    notified=$(grep "doctor consumer notified at" consumer.log |\
               sed -e "s/^.* at //" | tail -1)

    echo "$notified $detected" | \
        awk '{
            d = $1 - $2;
            if (d < 1 && d > 0) { print d " OK"; exit 0 }
            else { print d " NG"; exit 1 }
        }'
}
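
# Worked example (illustrative epoch timestamps):
#
#   monitor.log:  doctor monitor detected at 1483228800.123456
#   consumer.log: doctor consumer notified at 1483228800.654321
#
# gives d = 1483228800.654321 - 1483228800.123456 = 0.530865, i.e. a
# detection-to-notification time of ~0.53s, which satisfies the "0 < d < 1"
# check and prints "0.530865 OK"; anything outside that window prints "NG"
# and fails the test.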

check_host_status() {
    # check that the host of the first Doctor VM is in the expected state
    # $1    expected state pattern, e.g. "(DOWN|UNKNOWN)"
    # $2    seconds to wait for the expected state
    expected_state=$1
    local interval=5
    local rounds=$(($2 / $interval))
    for i in $(seq $rounds); do
        host_status_line=$(openstack $as_doctor_user --os-compute-api-version \
                           2.16 server show ${VM_BASENAME}1 | grep "host_status")
        host_status=$(echo $host_status_line | awk '{print $4}')
        die_if_not_set $LINENO host_status "host_status not reported by: nova show ${VM_BASENAME}1"
        if [[ "$expected_state" =~ "$host_status" ]] ; then
            echo "${VM_BASENAME}1 showing host_status: $host_status"
            return 0
        else
            sleep $interval
        fi
    done
    if [[ "$expected_state" =~ "$host_status" ]] ; then
        echo "${VM_BASENAME}1 showing host_status: $host_status"
    else
        die $LINENO "host_status: $host_status not equal to expected_state: $expected_state"
    fi
}
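
# host_status is only exposed by the compute API from microversion 2.16
# onwards, hence the explicit --os-compute-api-version above. The parsed row
# looks like (illustrative):
#
#   | host_status | UP |
#
# so awk's $4 is the status value (e.g. UP, DOWN, UNKNOWN or MAINTENANCE),
# and the '=~' test checks whether it appears in the expected-state pattern.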

unset_forced_down_hosts() {
    # for debugging
    openstack compute service list --service nova-compute

    downed_computes=$(openstack compute service list --service nova-compute \
                      -f value -c Host -c State | grep ' down$' \
                      | sed -e 's/ *down$//')
    echo "downed_computes: $downed_computes"
    for host in $downed_computes
    do
        # TODO(r-mibu): use the openstack client
        #openstack compute service set --up $host nova-compute
        nova service-force-down --unset $host nova-compute
    done

    echo "waiting for the disabled compute hosts to come back up..."
    wait_until 'openstack compute service list --service nova-compute
                -f value -c State | grep -q down' 240 5

    for host in $downed_computes
    do
        # TODO(r-mibu): improve 'get_compute_ip_from_hostname'
        get_compute_ip_from_hostname $host
        wait_until "! ping -c 1 $COMPUTE_IP" 120 5
    done
}

collect_logs() {
    if [[ -n "$COMPUTE_IP" ]]; then
        scp $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP:disable_network.log" .
    fi

    # TODO(yujunz) collect other logs, e.g. nova, aodh
}

run_profiler() {
    if [[ "$PROFILER_TYPE" == "poc" ]]; then
        linkdown=$(grep "doctor set link down at " disable_network.log |\
                   sed -e "s/^.* at //")
        vmdown=$(grep "doctor mark vm.* error at" inspector.log | tail -n 1 |\
                 sed -e "s/^.* at //")
        hostdown=$(grep "doctor mark host.* down at" inspector.log |\
                   sed -e "s/^.* at //")

        # TODO(yujunz) check the actual delay to verify time sync status
        # expected ~1s delay from $triggered to $linkdown
        # note: $detected and $notified were set by calculate_notification_time
        relative_start=${linkdown}
        export DOCTOR_PROFILER_T00=$(python -c \
          "print(int(($linkdown-$relative_start)*1000))")
        export DOCTOR_PROFILER_T01=$(python -c \
          "print(int(($detected-$relative_start)*1000))")
        export DOCTOR_PROFILER_T03=$(python -c \
          "print(int(($vmdown-$relative_start)*1000))")
        export DOCTOR_PROFILER_T04=$(python -c \
          "print(int(($hostdown-$relative_start)*1000))")
        export DOCTOR_PROFILER_T09=$(python -c \
          "print(int(($notified-$relative_start)*1000))")

        python profiler_poc.py > doctor_profiler.log 2>&1
    fi
}
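
# The exported checkpoints form a timeline in milliseconds relative to the
# moment the link went down (so T00 is 0 by construction): T01 is when the
# monitor detected the failure, T03/T04 when the inspector marked the VM in
# error and the host down, and T09 when the consumer was notified. E.g. with
# linkdown=100.000 and notified=100.750 (seconds), T09 = int((100.750 -
# 100.000) * 1000) = 750 ms. profiler_poc.py, run right after the exports,
# presumably picks up these DOCTOR_PROFILER_* values from the environment.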

cleanup() {
    set +e
    echo "cleanup..."
    stop_inspector
    stop_consumer

    unset_forced_down_hosts
    stop_monitor
    collect_logs

    vms=$(openstack $as_doctor_user server list)
    vmstodel=""
    for i in $(seq $VM_COUNT); do
        echo "${vms}" | grep -q " $VM_BASENAME$i " &&
        vmstodel+=" $VM_BASENAME$i"
    done
    [[ $vmstodel ]] && openstack $as_doctor_user server delete $vmstodel
    alarm_list=$($ceilometer alarm-list)
    for i in $(seq $VM_COUNT); do
        alarm_id=$(echo "${alarm_list}" | grep " $ALARM_BASENAME$i " |
                   awk '{print $2}')
        [ -n "$alarm_id" ] && $ceilometer alarm-delete "$alarm_id"
    done
    openstack $as_doctor_user subnet delete $NET_NAME
    sleep 1
    openstack $as_doctor_user network delete $NET_NAME
    sleep 1

    image_id=$(openstack image list | grep " $IMAGE_NAME " | awk '{print $2}')
    sleep 1
    # if an existing image was used, there is no need to remove it here
    if [[ "$use_existing_image" == false ]] ; then
        [ -n "$image_id" ] && openstack image delete "$image_id"
    fi

    remove_test_user

    cleanup_installer
    cleanup_inspector
    cleanup_monitor

    # NOTE: temporary log printer
    for f in $(find . -name '*.log')
    do
        echo
        echo "[$f]"
        sed -e 's/^/ | /' $f
        echo
    done
}

setup_python_packages() {
    sudo pip install flask==0.10.1
    command -v openstack || sudo pip install python-openstackclient==2.3.0
    command -v ceilometer || sudo pip install python-ceilometerclient==2.6.2
    command -v congress || sudo pip install python-congressclient==1.5.0
}

# Main process

if [[ $PYTHON_ENABLE == [Tt]rue ]]; then
    which tox || sudo pip install tox
    if [ -f /usr/bin/apt-get ]; then
        sudo apt-get install -y python3-dev
    elif [ -f /usr/bin/yum ] ; then
        sudo yum install -y python3-devel
    fi

    cd $TOP_DIR
    echo "executing tox..."
    tox
    exit $?
fi

echo "Note: doctor/tests/run.sh has been executed."
git log --oneline -1 || true   # ignore failure if git is not installed

trap cleanup EXIT

setup_python_packages

source $TOP_DIR/functions-common
source $TOP_DIR/lib/installer
source $TOP_DIR/lib/inspector
source $TOP_DIR/lib/monitor

rm -f *.log

setup_installer

echo "preparing VM image..."
download_image
register_image

echo "creating test user..."
create_test_user

echo "creating VM..."
boot_vm
wait_for_vm_launch

echo "creating alarm..."
# TODO: change back to using the consumer IP once the network problems that
# depend on the infra and installers are solved
#get_consumer_ip
create_alarm

echo "starting doctor sample components..."
start_inspector
start_monitor
start_consumer

sleep 60
echo "injecting host failure..."
inject_failure

check_host_status "(DOWN|UNKNOWN)" 60
unset_forced_down_hosts
calculate_notification_time
collect_logs
run_profiler

echo "done"