Fix bugs in profiler
[doctor.git] / tests / run.sh
index df6db97..a26fafe 100755 (executable)
@@ -222,6 +222,7 @@ wait_for_vm_launch() {
             return 0
         fi
         if [[ "$state" == "ERROR" ]]; then
+            openstack $as_doctor_user server show $VM_NAME
             die $LINENO "vm state is ERROR"
         fi
         count=$(($count+1))
@@ -237,7 +238,7 @@ inject_failure() {
 dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $7}')
 sleep 1
 sudo ip link set $dev down
-echo "doctor set host down at" $(date "+%s.%N")
+echo "doctor set link down at" $(date "+%s.%N")
 sleep 180
 sudo ip link set $dev up
 sleep 1
@@ -246,24 +247,8 @@ END_TXT
     chmod +x disable_network.sh
     scp $ssh_opts_cpu disable_network.sh "$COMPUTE_USER@$COMPUTE_IP:"
     ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'nohup ./disable_network.sh > disable_network.log 2>&1 &'
-}
-
-profile_performance_poc() {
-    triggered=$(grep "^doctor set host down at" disable_network.log |\
-                sed -e "s/^.* at //")
-    vmdown=$(grep "doctor mark vm.* error at" inspector.log |tail -n 1 |\
-               sed -e "s/^.* at //")
-    hostdown=$(grep "doctor mark host.* down at" inspector.log |\
-               sed -e "s/^.* at //")
-
-    #calculate the relative interval to triggered(T00)
-    export DOCTOR_PROFILER_T00=0
-    export DOCTOR_PROFILER_T01=$(echo "($detected-$triggered)*1000/1" |bc)
-    export DOCTOR_PROFILER_T03=$(echo "($vmdown-$triggered)*1000/1" |bc)
-    export DOCTOR_PROFILER_T04=$(echo "($hostdown-$triggered)*1000/1" |bc)
-    export DOCTOR_PROFILER_T09=$(echo "($notified-$triggered)*1000/1" |bc)
-
-    python profiler-poc.py
+    # use host time to get rid of potential time sync deviation between nodes
+    triggered=$(date "+%s.%N")
 }
 
 calculate_notification_time() {
@@ -278,10 +263,6 @@ calculate_notification_time() {
     notified=$(grep "doctor consumer notified at" consumer.log |\
                sed -e "s/^.* at //")
 
-    if [[ "$PROFILER_TYPE" == "poc" ]]; then
-        profile_performance_poc
-    fi
-
     echo "$notified $detected" | \
         awk '{
             d = $1 - $2;
@@ -304,6 +285,55 @@ check_host_status() {
     fi
 }
 
+unset_forced_down_hosts() {
+    for host in $(openstack compute service list --service nova-compute \
+                  -f value -c Host -c State | sed -n -e '/down$/s/ *down$//p')
+    do
+        python ./nova_force_down.py $host --unset
+    done
+
+    echo "waiting disabled compute host back to be enabled..."
+    wait_until 'openstack compute service list --service nova-compute
+                -f value -c State | grep -q down' 240 5
+}
+
+collect_logs() {
+    unset_forced_down_hosts
+    # TODO: We need to make sure the target compute host is back to IP
+    #       reachable. wait_ping() will be added by tojuvone .
+    sleep 110
+    scp $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP:disable_network.log" .
+
+    # TODO(yujunz) collect other logs, e.g. nova, aodh
+}
+
+run_profiler() {
+    if [[ "$PROFILER_TYPE" == "poc" ]]; then
+        linkdown=$(grep "doctor set link down at " disable_network.log |\
+                  sed -e "s/^.* at //")
+        vmdown=$(grep "doctor mark vm.* error at" inspector.log |tail -n 1 |\
+                 sed -e "s/^.* at //")
+        hostdown=$(grep "doctor mark host.* down at" inspector.log |\
+                 sed -e "s/^.* at //")
+
+        # TODO(yujunz) check the actual delay to verify time sync status
+        # expected ~1s delay from $trigger to $linkdown
+        relative_start=${linkdown}
+        export DOCTOR_PROFILER_T00=$(python -c \
+          "print(int(($linkdown-$relative_start)*1000))")
+        export DOCTOR_PROFILER_T01=$(python -c \
+          "print(int(($detected-$relative_start)*1000))")
+        export DOCTOR_PROFILER_T03=$(python -c \
+          "print(int(($vmdown-$relative_start)*1000))")
+        export DOCTOR_PROFILER_T04=$(python -c \
+          "print(int(($hostdown-$relative_start)*1000))")
+        export DOCTOR_PROFILER_T09=$(python -c \
+          "print(int(($notified-$relative_start)*1000))")
+
+        python profiler-poc.py >doctor_profiler.log 2>&1
+    fi
+}
+
 cleanup() {
     set +e
     echo "cleanup..."
@@ -311,10 +341,10 @@ cleanup() {
     stop_inspector
     stop_consumer
 
-    echo "waiting disabled compute host back to be enabled..."
-    python ./nova_force_down.py "$COMPUTE_HOST" --unset
-    sleep 240
-    check_host_status "UP"
+    unset_forced_down_hosts
+    # TODO: We need to make sure the target compute host is back to IP
+    #       reachable. wait_ping() will be added by tojuvone .
+    sleep 110
     scp $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP:disable_network.log" .
 
     openstack $as_doctor_user server list | grep -q " $VM_NAME " && openstack $as_doctor_user server delete "$VM_NAME"
@@ -383,5 +413,7 @@ sleep 60
 
 check_host_status "(DOWN|UNKNOWN)"
 calculate_notification_time
+collect_logs
+run_profiler
 
 echo "done"