Enable intel_pmu plugin in collectd
[yardstick.git] / yardstick / network_services / nfvi / resource.py
index d807f5e..fef44e2 100644 (file)
@@ -19,6 +19,7 @@ from __future__ import print_function
 import logging
 from itertools import chain
 
+import errno
 import jinja2
 import os
 import os.path
@@ -51,18 +52,27 @@ class ResourceProfile(object):
     COLLECTD_CONF = "collectd.conf"
     AMPQ_PORT = 5672
     DEFAULT_INTERVAL = 25
+    DEFAULT_TIMEOUT = 3600
+
+    def __init__(self, mgmt, port_names=None, cores=None, plugins=None,
+                 interval=None, timeout=None):
 
-    def __init__(self, mgmt, port_names=None, cores=None, plugins=None, interval=None):
         if plugins is None:
             self.plugins = {}
         else:
             self.plugins = plugins
+
         if interval is None:
             self.interval = self.DEFAULT_INTERVAL
         else:
             self.interval = interval
+
+        if timeout is None:
+            self.timeout = self.DEFAULT_TIMEOUT
+        else:
+            self.timeout = timeout
+
         self.enable = True
-        self.cores = validate_non_string_sequence(cores, default=[])
         self._queue = multiprocessing.Queue()
         self.amqp_client = None
         self.port_names = validate_non_string_sequence(port_names, default=[])
@@ -73,8 +83,16 @@ class ResourceProfile(object):
 
     def check_if_sa_running(self, process):
         """ verify if system agent is running """
-        err, pid, _ = self.connection.execute("pgrep -f %s" % process)
-        return [err == 0, pid]
+        try:
+            err, pid, _ = self.connection.execute("pgrep -f %s" % process)
+            # strip whitespace
+            return err, pid.strip()
+        except OSError as e:
+            if e.errno in {errno.ECONNRESET}:
+                # if we can't connect to check, then we won't be able to connect to stop it
+                LOG.exception("can't connect to host to check collectd status")
+                return 1, None
+            raise
 
     def run_collectd_amqp(self):
         """ run amqp consumer to collect the NFVi data """
@@ -127,7 +145,7 @@ class ResourceProfile(object):
     def parse_intel_pmu_stats(cls, key, value):
         return {''.join(str(v) for v in key): value.split(":")[1]}
 
-    def parse_collectd_result(self, metrics, core_list):
+    def parse_collectd_result(self, metrics):
         """ convert collectd data into json"""
         result = {
             "cpu": {},
@@ -148,11 +166,10 @@ class ResourceProfile(object):
             res_key0 = next(res_key_iter)
             res_key1 = next(res_key_iter)
 
-            if "cpu" in res_key0 or "intel_rdt" in res_key0:
+            if "cpu" in res_key0 or "intel_rdt" in res_key0 or "intel_pmu" in res_key0:
                 cpu_key, name, metric, testcase = \
                     self.get_cpu_data(res_key0, res_key1, value)
-                if cpu_key in core_list:
-                    result["cpu"].setdefault(cpu_key, {}).update({name: metric})
+                result["cpu"].setdefault(cpu_key, {}).update({name: metric})
 
             elif "memory" in res_key0:
                 result["memory"].update({res_key1: value.split(":")[0]})
@@ -169,9 +186,6 @@ class ResourceProfile(object):
             elif "ovs_stats" in res_key0:
                 result["ovs_stats"].update(self.parse_ovs_stats(key_split, value))
 
-            elif "intel_pmu-all" in res_key0:
-                result["intel_pmu"].update(self.parse_intel_pmu_stats(res_key1, value))
-
         result["timestamp"] = testcase
 
         return result
@@ -179,8 +193,9 @@ class ResourceProfile(object):
     def amqp_process_for_nfvi_kpi(self):
         """ amqp collect and return nfvi kpis """
         if self.amqp_client is None and self.enable:
-            self.amqp_client = \
-                multiprocessing.Process(target=self.run_collectd_amqp)
+            self.amqp_client = multiprocessing.Process(
+                name="AmqpClient-{}-{}".format(self.mgmt['ip'], os.getpid()),
+                target=self.run_collectd_amqp)
             self.amqp_client.start()
 
     def amqp_collect_nfvi_kpi(self):
@@ -191,7 +206,7 @@ class ResourceProfile(object):
         metric = {}
         while not self._queue.empty():
             metric.update(self._queue.get())
-        msg = self.parse_collectd_result(metric, self.cores)
+        msg = self.parse_collectd_result(metric)
         return msg
 
     def _provide_config_file(self, config_file_path, nfvi_cfg, template_kwargs):
@@ -235,6 +250,10 @@ class ResourceProfile(object):
             # connection.execute("sudo %s '%s' '%s'" % (
             #     collectd_installer, http_proxy, https_proxy))
             return
+        if "intel_pmu" in self.plugins:
+            LOG.debug("Downloading event list for pmu_stats plugin")
+            cmd = 'sudo bash -c \'cd /opt/tempT/pmu-tools/; python event_download_local.py\''
+            connection.execute(cmd)
         LOG.debug("Starting collectd to collect NFVi stats")
         # ensure collectd.conf.d exists to avoid error/warning
         connection.execute("sudo mkdir -p /etc/collectd/collectd.conf.d")
@@ -255,8 +274,11 @@ class ResourceProfile(object):
         connection.execute("sudo rabbitmqctl authenticate_user admin admin")
         connection.execute("sudo rabbitmqctl set_permissions -p / admin '.*' '.*' '.*'")
 
-        LOG.debug("Start collectd service.....")
-        connection.execute("sudo %s" % collectd_path)
+        LOG.debug("Start collectd service..... %s second timeout", self.timeout)
+        # intel_pmu plug requires large numbers of files open, so try to set
+        # ulimit -n to a large value
+        connection.execute("sudo bash -c 'ulimit -n 1000000 ; %s'" % collectd_path,
+                           timeout=self.timeout)
         LOG.debug("Done")
 
     def initiate_systemagent(self, bin_path):
@@ -282,13 +304,18 @@ class ResourceProfile(object):
         LOG.debug("Stop resource monitor...")
 
         if self.amqp_client is not None:
+            # we proper and try to join first
+            self.amqp_client.join(3)
             self.amqp_client.terminate()
 
+        LOG.debug("Check if %s is running", agent)
         status, pid = self.check_if_sa_running(agent)
-        if status == 0:
+        LOG.debug("status %s  pid %s", status, pid)
+        if status != 0:
             return
 
-        self.connection.execute('sudo kill -9 %s' % pid)
-        self.connection.execute('sudo pkill -9 %s' % agent)
+        if pid:
+            self.connection.execute('sudo kill -9 "%s"' % pid)
+        self.connection.execute('sudo pkill -9 "%s"' % agent)
         self.connection.execute('sudo service rabbitmq-server stop')
         self.connection.execute("sudo rabbitmqctl stop_app")