Add Collectd as a Monitor Type 57/37357/6
authorUmar Farooq <umar.farooq@neclab.eu>
Thu, 13 Jul 2017 10:20:56 +0000 (12:20 +0200)
committerRyota Mibu <r-mibu@cq.jp.nec.com>
Wed, 9 Aug 2017 04:39:54 +0000 (04:39 +0000)
A plugin for collectd is added to use collectd on compute as a
monitor type. Monitor files are updated accordingly.
The inspector now listens on all interfaces instead of only
localhost to enable it to communicate with compute node.

JIRA: DOCTOR-86
JIRA: DOCTOR-101

Change-Id: Idc834d428152e4687020eff7d8db36a652b1bf86
Signed-off-by: Umar Farooq <umar.farooq@neclab.eu>
12 files changed:
docs/development/manuals/monitors.rst [new file with mode: 0644]
etc/doctor.sample.conf
tests/inspector.py
tests/lib/monitor [new file with mode: 0644]
tests/lib/monitors/collectd/collectd [new file with mode: 0644]
tests/lib/monitors/collectd/collectd_plugin.py [new file with mode: 0644]
tests/lib/monitors/sample/monitor.py [moved from tests/monitor.py with 100% similarity]
tests/lib/monitors/sample/sample [new file with mode: 0644]
tests/main.py
tests/monitor/__init__.py
tests/monitor/collectd.py [new file with mode: 0644]
tests/run.sh

diff --git a/docs/development/manuals/monitors.rst b/docs/development/manuals/monitors.rst
new file mode 100644 (file)
index 0000000..0d22b1d
--- /dev/null
@@ -0,0 +1,36 @@
+.. This work is licensed under a Creative Commons Attribution 4.0 International License.
+.. http://creativecommons.org/licenses/by/4.0
+
+Monitor Types and Limitations
+=============================
+
+Currently there are two monitor types supported: sample and collectd.
+
+Sample Monitor
+--------------
+
+Sample monitor type pings the compute host from the control host and calculates the
+notification time after the ping timeout.
+Also if inspector type is sample, the compute node needs to communicate with the control
+node on port 12345. This port needs to be opened for incoming traffic on the control node.
+
+Collectd Monitor
+----------------
+
+Collectd monitor type uses collectd daemon running ovs_events plugin. Collectd runs on
+compute to send instant notification to the control node. The notification time is
+calculated by using the difference of time at which compute node sends notification to
+control node and the time at which consumer is notified. The time on control and compute
+node has to be synchronized for this reason. For further details on setting up collectd
+on the compute node, use the following link:
+http://docs.opnfv.org/en/stable-danube/submodules/barometer/docs/release/userguide/feature.userguide.html#id18
+
+Collectd monitors an interface managed by OVS. If the interface is not assigned
+an IP, the user has to provide the name of interface to be monitored. The command to
+launch the doctor test in that case is:
+MONITOR_TYPE=collectd INSPECTOR_TYPE=sample INTERFACE_NAME=example_iface ./run.sh
+
+If the interface name or IP is not provided, the collectd monitor type will monitor the
+default management interface. This may result in the failure of the doctor run.sh test case.
+The test case sets the monitored interface down and if the inspector (sample or congress)
+is running on the same subnet, collectd monitor will not be able to communicate with it.
index 8a1ddc3..2b24394 100644 (file)
@@ -22,5 +22,6 @@ doctor_role = _member_
 quota_instances = 1
 quota_cores = 1
 
-
+[monitor]
+type=collectd
 
index a61051f..82ffc33 100644 (file)
@@ -116,8 +116,7 @@ def get_args():
 
 def main():
     args = get_args()
-    app.run(port=args.port)
-
+    app.run(host='0.0.0.0', port=args.port)
 
 if __name__ == '__main__':
     main()
diff --git a/tests/lib/monitor b/tests/lib/monitor
new file mode 100644 (file)
index 0000000..6b804ec
--- /dev/null
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+MONITOR_TYPE=${MONITOR_TYPE:-sample}
+
+function is_monitor_supported {
+    local monitor="$1"
+    [[ -f $TOP_DIR/lib/monitors/$monitor/$monitor ]]
+}
+
+function is_monitor {
+    local monitor="$1"
+    [[ $monitor == $MONITOR_TYPE ]]
+}
+
+function start_monitor {
+    start_monitor_$MONITOR_TYPE
+}
+
+function stop_monitor {
+    stop_monitor_$MONITOR_TYPE
+}
+
+function cleanup_monitor {
+    cleanup_monitor_$MONITOR_TYPE
+}
+
+if ! is_monitor_supported $MONITOR_TYPE; then
+    die $LINENO "MONITOR_TYPE=$MONITOR_TYPE is not supported."
+fi
+
+source $TOP_DIR/lib/monitors/$MONITOR_TYPE/$MONITOR_TYPE
diff --git a/tests/lib/monitors/collectd/collectd b/tests/lib/monitors/collectd/collectd
new file mode 100644 (file)
index 0000000..f509665
--- /dev/null
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+function start_monitor_collectd {
+    ## CONTROL_IP is the IP of primary interface of control node i.e.
+    ## eth0, eno1. It is used by collectd monitor to communicate with
+    ## sample inspector.
+    ## @TODO (umar) see if mgmt IP of control is a better option. Also
+    ## primary interface may not be the right option
+    CONTROL_IP="$(ip a | sed -En 's/127.0.0.1//;s/.*inet (addr:)?(([0-9]*\.){3}[0-9]*).*/\2/p' | sed -n 1p)"
+    #CONTROL_IP=192.168.98.6
+
+    echo "
+Hostname \"$COMPUTE_HOST\"
+FQDNLookup false
+Interval 1
+MaxReadInterval 2
+
+<LoadPlugin python>
+    Globals true
+</LoadPlugin>
+LoadPlugin ovs_events
+LoadPlugin logfile
+
+<Plugin logfile>
+  File \"/var/log/collectd.log\"
+  Timestamp true
+  LogLevel \"info\"
+</Plugin>
+
+<Plugin python>
+    ModulePath \"/home/$COMPUTE_USER\"
+    LogTraces true
+    Interactive false
+    Import \"collectd_plugin\"
+    <Module \"collectd_plugin\">
+        control_ip \"$CONTROL_IP\"
+        compute_ip \"$COMPUTE_IP\"
+        compute_host \"$COMPUTE_HOST\"
+        compute_user \"$COMPUTE_USER\"
+        inspector_type \"$INSPECTOR_TYPE\"
+        os_auth_url \"$OS_AUTH_URL\"
+        os_username \"$OS_USERNAME\"
+        os_password \"$OS_PASSWORD\"
+        os_project_name \"$OS_PROJECT_NAME\"
+        os_user_domain_name \"$OS_USER_DOMAIN_NAME\"
+        os_user_domain_id \"$OS_USER_DOMAIN_ID\"
+        os_project_domain_name \"$OS_PROJECT_DOMAIN_NAME\"
+        os_project_domain_id \"$OS_PROJECT_DOMAIN_ID\"
+    </Module>
+</Plugin>
+
+<Plugin ovs_events>
+    Port 6640
+    Socket \"/var/run/openvswitch/db.sock\"
+    Interfaces \"@INTERFACE_NAME@\"
+    SendNotification true
+    DispatchValues false
+</Plugin>
+
+" > $TOP_DIR/lib/monitors/collectd.conf
+
+    scp $ssh_opts_cpu $TOP_DIR/lib/monitors/collectd.conf $COMPUTE_USER@$COMPUTE_IP:
+    ## @TODO (umar) Always assuming that the interface is assigned an IP if
+    ## interface name is not provided. See if there is a better approach
+    ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" "
+        if [ -n \"$INTERFACE_NAME\" ]; then
+            dev=$INTERFACE_NAME
+        else
+            dev=\$(sudo ip a | awk '/ $COMPUTE_IP\//{print \$NF}')
+        fi
+        sed -i -e \"s/@INTERFACE_NAME@/\$dev/\" collectd.conf
+        collectd_conf=/opt/collectd/etc/collectd.conf
+        if [ -e \$collectd_conf ]; then
+            sudo cp \$collectd_conf \${collectd_conf}-doctor-saved
+        else
+            sudo touch \${collectd_conf}-doctor-created
+        fi
+        sudo mv collectd.conf /opt/collectd/etc/collectd.conf"
+
+    scp $ssh_opts_cpu $TOP_DIR/lib/monitors/collectd/collectd_plugin.py $COMPUTE_USER@$COMPUTE_IP:collectd_plugin.py
+    ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" "sudo pkill collectd
+                                                   sudo /opt/collectd/sbin/collectd"
+}
+
+function stop_monitor_collectd {
+    ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'sudo pkill collectd'
+}
+
+function cleanup_monitor_collectd {
+    ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" "
+        collectd_conf=/opt/collectd/etc/collectd.conf
+        if [ -e \"\${collectd_conf}-doctor-created\" ]; then
+            sudo rm \"\${collectd_conf}-doctor-created\"
+            sudo rm \$collectd_conf
+        elif [ -e \"\${collectd_conf}-doctor-saved\" ]; then
+            sudo cp -f \"\${collectd_conf}-doctor-saved\" \$collectd_conf
+            sudo rm \"\${collectd_conf}-doctor-saved\"
+        fi"
+
+    rm $TOP_DIR/lib/monitors/collectd.conf
+}
diff --git a/tests/lib/monitors/collectd/collectd_plugin.py b/tests/lib/monitors/collectd/collectd_plugin.py
new file mode 100644 (file)
index 0000000..70fcf26
--- /dev/null
@@ -0,0 +1,167 @@
+##############################################################################
+# Copyright (c) 2017 NEC Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+
+import collectd
+import sys
+from netifaces import interfaces, ifaddresses, AF_INET
+from datetime import datetime
+import json
+import requests
+import time
+from requests.exceptions import ConnectionError
+
+from keystoneauth1 import loading
+from keystoneauth1 import session
+from congressclient.v1 import client
+
+
+def write_debug(str_write, write_type, compute_user):
+    file_name = ('/home/%s/monitor.log' % compute_user)
+    file_tmp = open(file_name, write_type)
+    file_tmp.write( "%s" % str_write)
+    file_tmp.close()
+
+
+class DoctorMonitorCollectd(object):
+    def __init__(self):
+        self.control_ip = ''
+        self.compute_user = ''
+        self.compute_ip = ''
+        self.host_name = ''
+        self.inspector_type = ''
+        self.inspector_url = ''
+        self.os_auth_url = ''
+        self.os_username = ''
+        self.os_password = ''
+        self.os_project_name = ''
+        self.os_user_domain_name = ''
+        self.os_user_domain_id = ''
+        self.os_project_domain_name = ''
+        self.os_project_domain_id = ''
+        self.sess = ''
+        self.auth = ''
+        self.inspector_notified = 0
+        self.start_notifications = 0
+        self.monitor_type = 'sample'
+
+    def config_func(self, config):
+        for node in config.children:
+            key = node.key.lower()
+            val = node.values[0]
+
+            if key == 'compute_host':
+                self.host_name = val
+            elif key == 'control_ip':
+                self.control_ip = val
+            elif key == 'compute_ip':
+                self.compute_ip = val
+            elif key == 'compute_user':
+                self.compute_user = val
+            elif key == 'inspector_type':
+                self.inspector_type = val
+            elif key == 'os_auth_url':
+                self.os_auth_url = val
+            elif key == 'os_username':
+                self.os_username = val
+            elif key == 'os_password':
+                self.os_password = val
+            elif key == 'os_project_name':
+                self.os_project_name = val
+            elif key == 'os_user_domain_name':
+                self.os_user_domain_name = val
+            elif key == 'os_user_domain_id':
+                self.os_user_domain_id = val
+            elif key == 'os_project_domain_name':
+                self.os_project_domain_name = val
+            elif key == 'os_project_domain_id':
+                self.os_project_domain_id = val
+            else:
+                collectd.info('Unknown config key "%s"' % key)
+
+    def init_collectd(self):
+        write_debug("Compute node collectd monitor start at %s\n\n" % datetime.now().isoformat(), "w", self.compute_user)
+
+        if self.inspector_type == 'sample':
+            self.inspector_url = ('http://%s:12345/events' % self.control_ip)
+        elif self.inspector_type == 'congress':
+            loader = loading.get_plugin_loader('password')
+            self.auth = loader.load_from_options(auth_url=self.os_auth_url,
+                        username=self.os_username,
+                        password=self.os_password,
+                        project_name=self.os_project_name,
+                        user_domain_name=self.os_user_domain_name,
+                        user_domain_id=self.os_user_domain_id,
+                        project_domain_name=self.os_project_domain_name,
+                        project_domain_id=self.os_project_domain_id)
+            self.sess=session.Session(auth=self.auth)
+            congress = client.Client(session=self.sess, service_type='policy')
+            ds = congress.list_datasources()['results']
+            doctor_ds = next((item for item in ds if item['driver'] == 'doctor'),
+                         None)
+
+            congress_endpoint = congress.httpclient.get_endpoint(auth=self.auth)
+            self.inspector_url = ('%s/v1/data-sources/%s/tables/events/rows' %
+                              (congress_endpoint, doctor_ds['id']))
+        else:
+            sys.exit()
+        self.start_notifications = 1
+
+
+    def notify_inspector(self):
+        event_type = "compute.host.down"
+        payload = [
+            {
+                 'id': ("monitor_%s_id1" % self.monitor_type),
+                 'time': datetime.now().isoformat(),
+                 'type': event_type,
+                 'details': {
+                     'hostname': self.host_name,
+                     'status': 'down',
+                     'monitor': ("monitor_%s" % self.monitor_type),
+                     'monitor_event_id': ("monitor_%s_event1" % self.monitor_type)
+                 },
+             },
+        ]
+        data = json.dumps(payload)
+        self.inspector_notified = 1
+
+        if self.inspector_type == 'sample':
+            headers = {'content-type': 'application/json'}
+            try:
+                requests.post(self.inspector_url, data=data, headers=headers)
+            except ConnectionError as err:
+                print err
+        elif self.inspector_type == 'congress':
+            # TODO(umar) enhance for token expiry case
+            headers = {
+                'Content-Type': 'application/json',
+                'Accept': 'application/json',
+                'X-Auth-Token': self.sess.get_token()
+            }
+            requests.put(self.inspector_url, data=data, headers=headers)
+
+
+    def handle_notif(self, notification, data=None):
+        if (notification.severity == collectd.NOTIF_FAILURE or
+            notification.severity == collectd.NOTIF_WARNING):
+            if (self.start_notifications == 1 and self.inspector_notified == 0):
+                write_debug("Received down notification: doctor monitor detected at %s\n" % time.time(), "a", self.compute_user)
+                self.notify_inspector()
+
+        elif notification.severity == collectd.NOTIF_OKAY:
+            collectd.info("Interface status: UP again %s\n" % time.time())
+        else:
+            collectd.info("Unknown notification severity %s\n" % notification.severity)
+
+
+monitor = DoctorMonitorCollectd()
+
+collectd.register_config(monitor.config_func)
+collectd.register_init(monitor.init_collectd)
+collectd.register_notification(monitor.handle_notif)
diff --git a/tests/lib/monitors/sample/sample b/tests/lib/monitors/sample/sample
new file mode 100644 (file)
index 0000000..1d31033
--- /dev/null
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+function start_monitor_sample {
+    cp $TOP_DIR/lib/monitors/sample/monitor.py $TOP_DIR/monitor.py
+    pgrep -f "python monitor.py" && return 0
+    sudo -E python monitor.py "$COMPUTE_HOST" "$COMPUTE_IP" "$INSPECTOR_TYPE" \
+        > monitor.log 2>&1 &
+}
+
+function stop_monitor_sample {
+    pgrep -f "python monitor.py" || return 0
+    sudo kill $(pgrep -f "python monitor.py")
+}
+
+function cleanup_monitor_sample {
+    rm monitor.py
+    return
+}
index b59cd7a..b360f12 100644 (file)
@@ -74,8 +74,10 @@ class DoctorTest(object):
             self.setup()
 
             # injecting host failure...
+            # NOTE (umar) add INTERFACE_NAME logic to host injection
 
             # verify the test results
+            # NOTE (umar) copy remote monitor.log file when monitor=collectd
 
         except Exception as e:
             LOG.error('doctor test failed, Exception=%s' % e)
index 51a6a65..e268907 100644 (file)
@@ -12,14 +12,15 @@ from oslo_utils import importutils
 OPTS = [
     cfg.StrOpt('type',
                default='sample',
-               choices=['sample'],
+               choices=['sample', 'collectd'],
                help='the type of doctor monitor component',
                required=True),
 ]
 
 
 _monitor_name_class_mapping = {
-    'sample': 'monitor.sample.SampleMonitor'
+    'sample': 'monitor.sample.SampleMonitor',
+    'collectd': 'monitor.collectd.CollectdMonitor'
 }
 
 def get_monitor(conf, inspector_url, log):
diff --git a/tests/monitor/collectd.py b/tests/monitor/collectd.py
new file mode 100644 (file)
index 0000000..f7a4f44
--- /dev/null
@@ -0,0 +1,145 @@
+##############################################################################
+# Copyright (c) 2017 NEC Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+
+import os
+import socket
+import getpass
+import sys
+
+from identity_auth import get_session
+from os_clients import nova_client
+from monitor.base import BaseMonitor
+
+
+class CollectdMonitor(BaseMonitor):
+    def __init__(self, conf, inspector_url, log):
+        super(CollectdMonitor, self).__init__(conf, inspector_url, log)
+        self.top_dir = os.path.dirname(sys.path[0])
+        self.session = get_session()
+        self.nova = nova_client(conf.nova_version, self.session)
+        self.compute_hosts = self.nova.hypervisors.list(detailed=True)
+        for host in self.compute_hosts:
+            host_dict = host.__dict__
+            self.compute_host = host_dict['hypervisor_hostname']
+            self.compute_ip = host_dict['host_ip']
+        tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
+        tmp_sock.connect(("8.8.8.8", 80))
+
+        ## control_ip is the IP of primary interface of control node i.e.
+        ## eth0, eno1. It is used by collectd monitor to communicate with
+        ## sample inspector.
+        ## TODO (umar) see if mgmt IP of control is a better option. Also
+        ## primary interface may not be the right option
+        self.control_ip = tmp_sock.getsockname()[0]
+        self.compute_user = getpass.getuser()
+        self.interface_name = os.environ.get('INTERFACE_NAME') or ''
+        self.inspector_type = os.environ.get('INSPECTOR_TYPE', 'sample')
+        self.auth_url = os.environ.get('OS_AUTH_URL')
+        self.username = os.environ.get('OS_USERNAME')
+        self.password = os.environ.get('OS_PASSWORD')
+        self.project_name = os.environ.get('OS_PROJECT_NAME')
+        self.user_domain_name = os.environ.get('OS_USER_DOMAIN_NAME') or 'default'
+        self.user_domain_id = os.environ.get('OS_USER_DOMAIN_ID')
+        self.project_domain_name = os.environ.get('OS_PROJECT_DOMAIN_NAME') or 'default'
+        self.project_domain_id = os.environ.get('OS_PROJECT_DOMAIN_ID')
+        self.ssh_opts_cpu = '-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no'
+
+    def start(self):
+        self.log.info("Collectd monitor start.........")
+        f = open("%s/tests/collectd.conf" % self.top_dir, 'w')
+        collectd_conf_file = """ 
+Hostname %s
+FQDNLookup false
+Interval 1
+MaxReadInterval 2
+
+<LoadPlugin python>
+Globals true
+</LoadPlugin>
+LoadPlugin ovs_events
+LoadPlugin logfile
+
+<Plugin logfile>
+    File \"/var/log/collectd.log\"
+    Timestamp true
+    LogLevel \"info\"
+</Plugin>
+
+<Plugin python>
+    ModulePath \"/home/%s\"
+    LogTraces true
+    Interactive false
+    Import \"collectd_plugin\"
+    <Module \"collectd_plugin\">
+        control_ip \"%s\"
+        compute_ip \"%s\"
+        compute_host \"%s\"
+        compute_user \"%s\"
+        inspector_type \"%s\"
+        os_auth_url \"%s\"
+        os_username \"%s\"
+        os_password \"%s\"
+        os_project_name \"%s\"
+        os_user_domain_name \"%s\"
+        os_user_domain_id \"%s\"
+        os_project_domain_name \"%s\"
+        os_project_domain_id \"%s\"
+    </Module>
+</Plugin>
+
+<Plugin ovs_events>
+    Port 6640
+    Socket \"/var/run/openvswitch/db.sock\"
+    Interfaces \"@INTERFACE_NAME@\"
+    SendNotification true
+    DispatchValues false
+</Plugin>
+            """ % (self.compute_host, self.compute_user, self.control_ip, self.compute_ip, self.compute_host, self.compute_user,
+                   self.inspector_type, self.auth_url, self.username, self.password, self.project_name, self.user_domain_name,
+                   self.user_domain_id, self.project_domain_name, self.project_domain_id)
+        f.write(collectd_conf_file)
+        f.close()
+
+        os.system(" scp %s %s/tests/collectd.conf %s@%s: " % (self.ssh_opts_cpu, self.top_dir, self.compute_user, self.compute_ip))
+        self.log.info("after first scp")
+        ## @TODO (umar) Always assuming that the interface is assigned an IP if
+        ## interface name is not provided. See if there is a better approach
+        os.system(""" ssh %s %s@%s \"if [ -n \"%s\" ]; then
+            dev=%s
+        else
+            dev=\$(sudo ip a | awk '/ %s\//{print \$NF}')
+        fi
+        sed -i -e \"s/@INTERFACE_NAME@/\$dev/\" collectd.conf
+        collectd_conf=/opt/collectd/etc/collectd.conf
+        if [ -e \$collectd_conf ]; then
+            sudo cp \$collectd_conf \${collectd_conf}-doctor-saved
+        else
+            sudo touch \${collectd_conf}-doctor-created
+        fi
+        sudo mv collectd.conf /opt/collectd/etc/collectd.conf\" """ % (self.ssh_opts_cpu, self.compute_user, self.compute_ip, self.interface_name, self.interface_name, self.compute_ip))
+        self.log.info("after first ssh")
+        os.system(" scp  %s %s/tests/lib/monitors/collectd/collectd_plugin.py %s@%s:collectd_plugin.py " % (self.ssh_opts_cpu, self.top_dir, self.compute_user, self.compute_ip))
+        self.log.info("after sec scp")
+        os.system(" ssh %s %s@%s \"sudo pkill collectd; sudo /opt/collectd/sbin/collectd\" " % (self.ssh_opts_cpu, self.compute_user, self.compute_ip))
+        self.log.info("after sec ssh")
+
+    def stop(self):
+        os.system(" ssh %s %s@%s \"sudo pkill collectd\" " % (self.ssh_opts_cpu, self.compute_user, self.compute_ip))
+
+    def cleanup(self):
+        os.system(""" ssh %s %s@%s \"
+            collectd_conf=/opt/collectd/etc/collectd.conf
+            if [ -e \"\${collectd_conf}-doctor-created\" ]; then
+                sudo rm \"\${collectd_conf}-doctor-created\"
+                sudo rm \$collectd_conf
+            elif [ -e \"\${collectd_conf}-doctor-saved\" ]; then
+                sudo cp -f \"\${collectd_conf}-doctor-saved\" \$collectd_conf
+                sudo rm \"\${collectd_conf}-doctor-saved\"
+            fi\" """ % (self.ssh_opts_cpu, self.compute_user, self.compute_ip))
+        os.remove("%s/tests/collectd.conf" % self.top_dir)
index abdad58..69f18b2 100755 (executable)
@@ -212,17 +212,6 @@ create_alarm() {
      done
 }
 
-start_monitor() {
-    pgrep -f "python monitor.py" && return 0
-    sudo -E python monitor.py "$COMPUTE_HOST" "$COMPUTE_IP" "$INSPECTOR_TYPE" \
-        > monitor.log 2>&1 &
-}
-
-stop_monitor() {
-    pgrep -f "python monitor.py" || return 0
-    sudo kill $(pgrep -f "python monitor.py")
-}
-
 start_consumer() {
     pgrep -f "python consumer.py" && return 0
     python consumer.py "$CONSUMER_PORT" > consumer.log 2>&1 &
@@ -294,8 +283,12 @@ inject_failure() {
     echo "disabling network of compute host [$COMPUTE_HOST] for 3 mins..."
     cat > disable_network.sh << 'END_TXT'
 #!/bin/bash -x
-dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $NF}')
 sleep 1
+if [ -n "@INTERFACE_NAME@" ]; then
+    dev=@INTERFACE_NAME@
+else
+    dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $NF}')
+fi
 sudo ip link set $dev down
 echo "doctor set link down at" $(date "+%s.%N")
 sleep 180
@@ -303,6 +296,7 @@ sudo ip link set $dev up
 sleep 1
 END_TXT
     sed -i -e "s/@COMPUTE_IP@/$COMPUTE_IP/" disable_network.sh
+    sed -i -e "s/@INTERFACE_NAME@/$INTERFACE_NAME/" disable_network.sh
     chmod +x disable_network.sh
     scp $ssh_opts_cpu disable_network.sh "$COMPUTE_USER@$COMPUTE_IP:"
     ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'nohup ./disable_network.sh > disable_network.log 2>&1 &'
@@ -327,8 +321,11 @@ calculate_notification_time() {
     wait_consumer 60
     #keep 'at' as the last keyword just before the value, and
     #use regex to get value instead of the fixed column
+    if [ ! -f monitor.log ]; then
+        scp $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP:monitor.log" .
+    fi
     detected=$(grep "doctor monitor detected at" monitor.log |\
-               sed -e "s/^.* at //")
+               sed -e "s/^.* at //" | tail -1)
     notified=$(grep "doctor consumer notified at" consumer.log |\
                sed -e "s/^.* at //" | tail -1)
 
@@ -431,11 +428,11 @@ run_profiler() {
 cleanup() {
     set +e
     echo "cleanup..."
-    stop_monitor
     stop_inspector
     stop_consumer
 
     unset_forced_down_hosts
+    stop_monitor
     collect_logs
 
     vms=$(openstack $as_doctor_user server list)
@@ -467,6 +464,7 @@ cleanup() {
 
     cleanup_installer
     cleanup_inspector
+    cleanup_monitor
 
     # NOTE: Temporal log printer.
     for f in $(find . -name '*.log')
@@ -511,6 +509,9 @@ setup_python_packages
 source $TOP_DIR/functions-common
 source $TOP_DIR/lib/installer
 source $TOP_DIR/lib/inspector
+source $TOP_DIR/lib/monitor
+
+rm -f *.log
 
 setup_installer
 
@@ -540,8 +541,8 @@ echo "injecting host failure..."
 inject_failure
 
 check_host_status "(DOWN|UNKNOWN)" 60
-calculate_notification_time
 unset_forced_down_hosts
+calculate_notification_time
 collect_logs
 run_profiler