X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=yardstick%2Fnetwork_services%2Fnfvi%2Fresource.py;h=0c0bf223a91a85ff8c302c76c74ae72c1138fb76;hb=b7f867ba873806a10972108fa19d26b2ae19fa58;hp=055fdba7ed9feed81359e714eeeefbad5473ad3d;hpb=050a7452ef96c2a5d887207587436fc61e7d24cd;p=yardstick.git

diff --git a/yardstick/network_services/nfvi/resource.py b/yardstick/network_services/nfvi/resource.py
index 055fdba7e..0c0bf223a 100644
--- a/yardstick/network_services/nfvi/resource.py
+++ b/yardstick/network_services/nfvi/resource.py
@@ -13,58 +13,98 @@
 # limitations under the License.
 """ Resource collection definitions """
 
-from __future__ import absolute_import
-from __future__ import print_function
-import tempfile
+import errno
+from itertools import chain
 import logging
+import multiprocessing
 import os
 import os.path
 import re
-import multiprocessing
-from collections import Sequence
 
+import jinja2
+import pkg_resources
 from oslo_config import cfg
+from oslo_utils.encodeutils import safe_decode
 
 from yardstick import ssh
+from yardstick.common.exceptions import ResourceCommandError
+from yardstick.common.task_template import finalize_for_yaml
+from yardstick.common.utils import validate_non_string_sequence
 from yardstick.network_services.nfvi.collectd import AmqpConsumer
-from yardstick.network_services.utils import get_nsb_option
+
 
 LOG = logging.getLogger(__name__)
 
 CONF = cfg.CONF
 ZMQ_OVS_PORT = 5567
 ZMQ_POLLING_TIME = 12000
-LIST_PLUGINS_ENABLED = ["amqp", "cpu", "cpufreq", "intel_rdt", "memory",
-                        "hugepages", "dpdkstat", "virt", "ovs_stats", "intel_pmu"]
+LIST_PLUGINS_ENABLED = ["amqp", "cpu", "cpufreq", "memory",
+                        "hugepages"]
 
 
 class ResourceProfile(object):
     """
     This profile adds a resource at the beginning of the test session
     """
+    COLLECTD_CONF = "collectd.conf"
+    AMPQ_PORT = 5672
+    DEFAULT_INTERVAL = 25
+    DEFAULT_TIMEOUT = 3600
+    OVS_SOCKET_PATH = "/usr/local/var/run/openvswitch/db.sock"
+
+    def __init__(self, mgmt, port_names=None, plugins=None, interval=None, timeout=None):
+
+        if plugins is None:
+            self.plugins = {}
+        else:
+            self.plugins = plugins
+
+        if interval is None:
+            self.interval = self.DEFAULT_INTERVAL
+        else:
+            self.interval = interval
+
+        if timeout is None:
+            self.timeout = self.DEFAULT_TIMEOUT
+        else:
+            self.timeout = timeout
 
-    def __init__(self, mgmt, interfaces=None, cores=None):
         self.enable = True
-        self.connection = None
-        self.cores = cores if isinstance(cores, Sequence) else []
         self._queue = multiprocessing.Queue()
         self.amqp_client = None
-        self.interfaces = interfaces if isinstance(interfaces, Sequence) else []
+        self.port_names = validate_non_string_sequence(port_names, default=[])
 
-        # why the host or ip?
-        self.vnfip = mgmt.get("host", mgmt["ip"])
-        self.connection = ssh.SSH.from_node(mgmt, overrides={"ip": self.vnfip})
+        # we need to save mgmt so we can connect to port 5672
+        self.mgmt = mgmt
+        self.connection = ssh.AutoConnectSSH.from_node(mgmt)
 
-        self.connection.wait()
+    @classmethod
+    def make_from_node(cls, node, timeout):
+        # node dict works as mgmt dict
+        # don't need port names, there is no way we can
+        # tell what port is used on the compute node
+        collectd_options = node["collectd"]
+        plugins = collectd_options.get("plugins", {})
+        interval = collectd_options.get("interval")
+
+        return cls(node, plugins=plugins, interval=interval, timeout=timeout)
 
-    def check_if_sa_running(self, process):
+    def check_if_system_agent_running(self, process):
         """ verify if system agent is running """
-        err, pid, _ = self.connection.execute("pgrep -f %s" % process)
-        return [err == 0, pid]
+        try:
+            err, pid, _ = self.connection.execute("pgrep -f %s" % process)
+            # strip whitespace
+            return err, pid.strip()
+        except OSError as e:
+            if e.errno in {errno.ECONNRESET}:
+                # if we can't connect to check, then we won't be able to connect to stop it
+                LOG.exception("Can't connect to host to check %s status", process)
+                return 1, None
+            raise
 
     def run_collectd_amqp(self):
         """ run amqp consumer to collect the NFVi data """
-        amqp_url = 'amqp://admin:admin@{}:5672/%2F'.format(self.vnfip)
+        amqp_url = 'amqp://admin:admin@{}:{}/%2F'.format(self.mgmt['ip'], self.AMPQ_PORT)
         amqp = AmqpConsumer(amqp_url, self._queue)
         try:
             amqp.run()
@@ -111,9 +151,9 @@ class ResourceProfile(object):
 
     @classmethod
     def parse_intel_pmu_stats(cls, key, value):
-        return {''.join(key): value.split(":")[1]}
+        return {''.join(str(v) for v in key): value.split(":")[1]}
 
-    def parse_collectd_result(self, metrics, core_list):
+    def parse_collectd_result(self, metrics):
         """ convert collectd data into json"""
         result = {
             "cpu": {},
@@ -122,21 +162,21 @@ class ResourceProfile(object):
             "memory": {},
             "hugepages": {},
             "dpdkstat": {},
             "virt": {},
             "ovs_stats": {},
-            "intel_pmu": {},
         }
         testcase = ""
-        for key, value in metrics.items():
+        # unicode decode
+        decoded = ((safe_decode(k, 'utf-8'), safe_decode(v, 'utf-8')) for k, v in metrics.items())
+        for key, value in decoded:
             key_split = key.split("/")
             res_key_iter = (key for key in key_split if "nsb_stats" not in key)
             res_key0 = next(res_key_iter)
             res_key1 = next(res_key_iter)
-            if "cpu" in res_key0 or "intel_rdt" in res_key0:
+            if "cpu" in res_key0 or "intel_rdt" in res_key0 or "intel_pmu" in res_key0:
                 cpu_key, name, metric, testcase = \
                     self.get_cpu_data(res_key0, res_key1, value)
-                if cpu_key in core_list:
-                    result["cpu"].setdefault(cpu_key, {}).update({name: metric})
+                result["cpu"].setdefault(cpu_key, {}).update({name: metric})
 
             elif "memory" in res_key0:
                 result["memory"].update({res_key1: value.split(":")[0]})
@@ -153,9 +193,6 @@ class ResourceProfile(object):
             elif "ovs_stats" in res_key0:
                 result["ovs_stats"].update(self.parse_ovs_stats(key_split, value))
 
-            elif "intel_pmu-all" in res_key0:
-                result["intel_pmu"].update(self.parse_intel_pmu_stats(res_key1, value))
-
         result["timestamp"] = testcase
 
         return result
@@ -163,8 +200,9 @@ class ResourceProfile(object):
     def amqp_process_for_nfvi_kpi(self):
         """ amqp collect and return nfvi kpis """
         if self.amqp_client is None and self.enable:
-            self.amqp_client = \
-                multiprocessing.Process(target=self.run_collectd_amqp)
+            self.amqp_client = multiprocessing.Process(
+                name="AmqpClient-{}-{}".format(self.mgmt['ip'], os.getpid()),
+                target=self.run_collectd_amqp)
             self.amqp_client.start()
 
     def amqp_collect_nfvi_kpi(self):
@@ -175,75 +213,99 @@ class ResourceProfile(object):
         metric = {}
         while not self._queue.empty():
             metric.update(self._queue.get())
-        msg = self.parse_collectd_result(metric, self.cores)
+        msg = self.parse_collectd_result(metric)
         return msg
 
-    def _provide_config_file(self, bin_path, nfvi_cfg, kwargs):
-        with open(os.path.join(bin_path, nfvi_cfg), 'r') as cfg:
-            template = cfg.read()
-        cfg, cfg_content = tempfile.mkstemp()
-        with os.fdopen(cfg, "w+") as cfg:
-            cfg.write(template.format(**kwargs))
-        cfg_file = os.path.join(bin_path, nfvi_cfg)
-        self.connection.put(cfg_content, cfg_file)
-
-    def _prepare_collectd_conf(self, bin_path):
+    def _provide_config_file(self, config_file_path, nfvi_cfg, template_kwargs):
+        template = pkg_resources.resource_string("yardstick.network_services.nfvi",
+                                                 nfvi_cfg).decode('utf-8')
+        cfg_content = jinja2.Template(template, trim_blocks=True, lstrip_blocks=True,
+                                      finalize=finalize_for_yaml).render(
+            **template_kwargs)
+        # cfg_content = io.StringIO(template.format(**template_kwargs))
+        cfg_file = os.path.join(config_file_path, nfvi_cfg)
+        # must write as root, so use sudo
+        self.connection.execute("cat | sudo tee {}".format(cfg_file), stdin=cfg_content)
+
+    def _prepare_collectd_conf(self, config_file_path):
         """ Prepare collectd conf """
-        loadplugin = "\n".join("LoadPlugin {0}".format(plugin)
-                               for plugin in LIST_PLUGINS_ENABLED)
-
-        interfaces = "\n".join("PortName '{0[name]}'".format(interface)
-                               for interface in self.interfaces)
 
         kwargs = {
-            "interval": '25',
-            "loadplugin": loadplugin,
-            "dpdk_interface": interfaces,
+            "interval": self.interval,
+            "loadplugins": set(chain(LIST_PLUGINS_ENABLED, self.plugins.keys())),
+            # Optional fields PortName is descriptive only, use whatever is present
+            "port_names": self.port_names,
+            # "ovs_bridge_interfaces": ["br-int"],
+            "plugins": self.plugins,
         }
-        self._provide_config_file(bin_path, 'collectd.conf', kwargs)
+        self._provide_config_file(config_file_path, self.COLLECTD_CONF, kwargs)
+
+    def _setup_ovs_stats(self, connection):
+        try:
+            socket_path = self.plugins["ovs_stats"].get("ovs_socket_path", self.OVS_SOCKET_PATH)
+        except KeyError:
+            # ovs_stats is not a dict
+            socket_path = self.OVS_SOCKET_PATH
+        status = connection.execute("test -S {}".format(socket_path))[0]
+        if status != 0:
+            LOG.error("cannot find OVS socket %s", socket_path)
+
+    def _start_rabbitmq(self, connection):
+        # Reset amqp queue
+        LOG.debug("reset and setup amqp to collect data from collectd")
+        # ensure collectd.conf.d exists to avoid error/warning
+        cmd_list = ["sudo mkdir -p /etc/collectd/collectd.conf.d",
+                    "sudo service rabbitmq-server restart",
+                    "sudo rabbitmqctl stop_app",
+                    "sudo rabbitmqctl reset",
+                    "sudo rabbitmqctl start_app",
+                    "sudo rabbitmqctl add_user admin admin",
+                    "sudo rabbitmqctl authenticate_user admin admin",
+                    "sudo rabbitmqctl set_permissions -p / admin '.*' '.*' '.*'"
+                    ]
+        for cmd in cmd_list:
+            exit_status, stdout, stderr = connection.execute(cmd)
+            if exit_status != 0:
+                raise ResourceCommandError(command=cmd, stderr=stderr)
+
+        # check stdout for "sudo rabbitmqctl status" command
+        cmd = "sudo rabbitmqctl status"
+        _, stdout, stderr = connection.execute(cmd)
+        if not re.search("RabbitMQ", stdout):
+            LOG.error("rabbitmqctl status don't have RabbitMQ in running apps")
+            raise ResourceCommandError(command=cmd, stderr=stderr)
 
     def _start_collectd(self, connection, bin_path):
         LOG.debug("Starting collectd to collect NFVi stats")
-        connection.execute('sudo pkill -9 collectd')
-        bin_path = get_nsb_option("bin_path")
-        collectd_path = os.path.join(bin_path, "collectd", "collectd")
+        collectd_path = os.path.join(bin_path, "collectd", "sbin", "collectd")
+        config_file_path = os.path.join(bin_path, "collectd", "etc")
+        self._prepare_collectd_conf(config_file_path)
+
+        connection.execute('sudo pkill -x -9 collectd')
         exit_status = connection.execute("which %s > /dev/null 2>&1" % collectd_path)[0]
         if exit_status != 0:
             LOG.warning("%s is not present disabling", collectd_path)
-            # disable auto-provisioning because it requires Internet access
-            # collectd_installer = os.path.join(bin_path, "collectd.sh")
-            # provision_tool(connection, collectd)
-            # http_proxy = os.environ.get('http_proxy', '')
-            # https_proxy = os.environ.get('https_proxy', '')
-            # connection.execute("sudo %s '%s' '%s'" % (
-            #     collectd_installer, http_proxy, https_proxy))
             return
 
-        LOG.debug("Starting collectd to collect NFVi stats")
-        self._prepare_collectd_conf(bin_path)
+        if "ovs_stats" in self.plugins:
+            self._setup_ovs_stats(connection)
 
-        # Reset amqp queue
-        LOG.debug("reset and setup amqp to collect data from collectd")
-        connection.execute("sudo rm -rf /var/lib/rabbitmq/mnesia/rabbit*")
-        connection.execute("sudo service rabbitmq-server start")
-        connection.execute("sudo rabbitmqctl stop_app")
-        connection.execute("sudo rabbitmqctl reset")
-        connection.execute("sudo rabbitmqctl start_app")
-        connection.execute("sudo service rabbitmq-server restart")
-
-        LOG.debug("Creating amdin user for rabbitmq in order to collect data from collectd")
-        connection.execute("sudo rabbitmqctl delete_user guest")
-        connection.execute("sudo rabbitmqctl add_user admin admin")
-        connection.execute("sudo rabbitmqctl authenticate_user admin admin")
-        connection.execute("sudo rabbitmqctl set_permissions -p / admin \".*\" \".*\" \".*\"")
-
-        LOG.debug("Start collectd service.....")
-        connection.execute("sudo %s" % collectd_path)
+        LOG.debug("Starting collectd to collect NFVi stats")
+        LOG.debug("Start collectd service..... %s second timeout", self.timeout)
+        # intel_pmu plug requires large numbers of files open, so try to set
+        # ulimit -n to a large value
+        connection.execute("sudo bash -c 'ulimit -n 1000000 ; %s'" % collectd_path,
                           timeout=self.timeout)
         LOG.debug("Done")
 
     def initiate_systemagent(self, bin_path):
         """ Start system agent for NFVi collection on host """
         if self.enable:
-            self._start_collectd(self.connection, bin_path)
+            try:
+                self._start_rabbitmq(self.connection)
+                self._start_collectd(self.connection, bin_path)
+            except ResourceCommandError as e:
+                LOG.exception("Exception during collectd and rabbitmq start: %s", str(e))
+                raise
 
     def start(self):
         """ start nfvi collection """
@@ -259,13 +321,18 @@ class ResourceProfile(object):
         LOG.debug("Stop resource monitor...")
 
         if self.amqp_client is not None:
+            # we proper and try to join first
+            self.amqp_client.join(3)
             self.amqp_client.terminate()
 
-        status, pid = self.check_if_sa_running(agent)
-        if status == 0:
+        LOG.debug("Check if %s is running", agent)
+        status, pid = self.check_if_system_agent_running(agent)
+        LOG.debug("status %s pid %s", status, pid)
+        if status != 0:
             return
 
-        self.connection.execute('sudo kill -9 %s' % pid)
-        self.connection.execute('sudo pkill -9 %s' % agent)
+        if pid:
+            self.connection.execute('sudo kill -9 "%s"' % pid)
+        self.connection.execute('sudo pkill -9 "%s"' % agent)
         self.connection.execute('sudo service rabbitmq-server stop')
         self.connection.execute("sudo rabbitmqctl stop_app")
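
Usage note (not part of the patch): the sketch below illustrates how a caller might drive the reworked ResourceProfile after this change, based only on the methods visible in the diff (make_from_node, initiate_systemagent, amqp_process_for_nfvi_kpi, amqp_collect_nfvi_kpi, stop). The node dictionary fields outside the "collectd" section (ip, user, password), the bin_path value, and the sleep interval are illustrative assumptions, not values taken from this change.

    import time

    from yardstick.network_services.nfvi.resource import ResourceProfile

    # Hypothetical compute-node context entry. make_from_node() only reads the
    # "collectd" section; the rest is whatever ssh.AutoConnectSSH.from_node()
    # needs to reach the host (assumed credentials shown here).
    node = {
        "ip": "10.10.10.10",
        "user": "root",
        "password": "r00t",
        "collectd": {
            "interval": 5,
            "plugins": {"ovs_stats": {}, "intel_pmu": {}},
        },
    }

    resource = ResourceProfile.make_from_node(node, timeout=3600)

    # Render collectd.conf, restart rabbitmq and launch collectd on the node
    # (assumed NSB bin_path shown here).
    resource.initiate_systemagent("/opt/nsb_bin")

    # Start the AMQP consumer process that drains collectd's amqp plugin
    # into the internal queue.
    resource.amqp_process_for_nfvi_kpi()

    time.sleep(30)  # let collectd publish a few intervals
    kpi = resource.amqp_collect_nfvi_kpi()
    print(kpi.get("cpu", {}))

    resource.stop()

Because make_from_node() pulls plugins and interval straight from node["collectd"], the same profile can be attached to any context node that carries a collectd section, without the caller knowing which ports or cores are in use on the compute host.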