X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=yardstick%2Fnetwork_services%2Fnfvi%2Fresource.py;h=fef44e2079fb358203ea0139b776f40fb7b56ebf;hb=49677852f2bc690d235318d2208504241eef61a9;hp=18b0d895259b68aada03071a623ff6092380b350;hpb=50a5d0cb28d95d8edb47b1775569fcdf52dd1c96;p=yardstick.git diff --git a/yardstick/network_services/nfvi/resource.py b/yardstick/network_services/nfvi/resource.py index 18b0d8952..fef44e207 100644 --- a/yardstick/network_services/nfvi/resource.py +++ b/yardstick/network_services/nfvi/resource.py @@ -14,147 +14,308 @@ """ Resource collection definitions """ from __future__ import absolute_import +from __future__ import print_function + import logging +from itertools import chain + +import errno +import jinja2 +import os import os.path import re import multiprocessing +import pkg_resources + from oslo_config import cfg +from oslo_utils.encodeutils import safe_decode from yardstick import ssh +from yardstick.common.task_template import finalize_for_yaml +from yardstick.common.utils import validate_non_string_sequence from yardstick.network_services.nfvi.collectd import AmqpConsumer -from yardstick.network_services.utils import provision_tool +from yardstick.network_services.utils import get_nsb_option + +LOG = logging.getLogger(__name__) CONF = cfg.CONF ZMQ_OVS_PORT = 5567 ZMQ_POLLING_TIME = 12000 +LIST_PLUGINS_ENABLED = ["amqp", "cpu", "cpufreq", "memory", + "hugepages"] class ResourceProfile(object): """ This profile adds a resource at the beginning of the test session """ + COLLECTD_CONF = "collectd.conf" + AMPQ_PORT = 5672 + DEFAULT_INTERVAL = 25 + DEFAULT_TIMEOUT = 3600 - def __init__(self, vnfd, cores): - self.enable = True - self.connection = None - self.cores = cores + def __init__(self, mgmt, port_names=None, cores=None, plugins=None, + interval=None, timeout=None): + + if plugins is None: + self.plugins = {} + else: + self.plugins = plugins + + if interval is None: + self.interval = self.DEFAULT_INTERVAL + else: + self.interval = interval - mgmt_interface = vnfd.get("mgmt-interface") - # why the host or ip? - self.vnfip = mgmt_interface.get("host", mgmt_interface["ip"]) - self.connection = ssh.SSH.from_node(mgmt_interface, - overrides={"ip": self.vnfip}) + if timeout is None: + self.timeout = self.DEFAULT_TIMEOUT + else: + self.timeout = timeout + + self.enable = True + self._queue = multiprocessing.Queue() + self.amqp_client = None + self.port_names = validate_non_string_sequence(port_names, default=[]) - self.connection.wait() + # we need to save mgmt so we can connect to port 5672 + self.mgmt = mgmt + self.connection = ssh.AutoConnectSSH.from_node(mgmt) def check_if_sa_running(self, process): """ verify if system agent is running """ - err, pid, _ = self.connection.execute("pgrep -f %s" % process) - return [err == 0, pid] + try: + err, pid, _ = self.connection.execute("pgrep -f %s" % process) + # strip whitespace + return err, pid.strip() + except OSError as e: + if e.errno in {errno.ECONNRESET}: + # if we can't connect to check, then we won't be able to connect to stop it + LOG.exception("can't connect to host to check collectd status") + return 1, None + raise - def run_collectd_amqp(self, queue): + def run_collectd_amqp(self): """ run amqp consumer to collect the NFVi data """ - amqp = \ - AmqpConsumer('amqp://admin:admin@{}:5672/%2F'.format(self.vnfip), - queue) + amqp_url = 'amqp://admin:admin@{}:{}/%2F'.format(self.mgmt['ip'], self.AMPQ_PORT) + amqp = AmqpConsumer(amqp_url, self._queue) try: amqp.run() except (AttributeError, RuntimeError, KeyboardInterrupt): amqp.stop() @classmethod - def get_cpu_data(cls, reskey, value): + def parse_simple_resource(cls, key, value): + reskey = "/".join(rkey for rkey in key if "nsb_stats" not in rkey) + return {reskey: value.split(":")[1]} + + @classmethod + def get_cpu_data(cls, res_key0, res_key1, value): """ Get cpu topology of the host """ pattern = r"-(\d+)" - if "cpufreq" in reskey[1]: - match = re.search(pattern, reskey[2], re.MULTILINE) - metric = reskey[1] + + if 'cpufreq' in res_key0: + metric, source = res_key0, res_key1 else: - match = re.search(pattern, reskey[1], re.MULTILINE) - metric = reskey[2] + metric, source = res_key1, res_key0 + + match = re.search(pattern, source, re.MULTILINE) + if not match: + return "error", "Invalid", "", "" + + time, value = value.split(":") + return str(match.group(1)), metric, value, time - time, val = re.split(":", value) - if match: - return [str(match.group(1)), metric, val, time] + @classmethod + def parse_hugepages(cls, key, value): + return cls.parse_simple_resource(key, value) + + @classmethod + def parse_dpdkstat(cls, key, value): + return cls.parse_simple_resource(key, value) - return ["error", "Invalid", ""] + @classmethod + def parse_virt(cls, key, value): + return cls.parse_simple_resource(key, value) + + @classmethod + def parse_ovs_stats(cls, key, value): + return cls.parse_simple_resource(key, value) - def parse_collectd_result(self, metrics, listcores): + @classmethod + def parse_intel_pmu_stats(cls, key, value): + return {''.join(str(v) for v in key): value.split(":")[1]} + + def parse_collectd_result(self, metrics): """ convert collectd data into json""" - res = {"cpu": {}, "memory": {}} + result = { + "cpu": {}, + "memory": {}, + "hugepages": {}, + "dpdkstat": {}, + "virt": {}, + "ovs_stats": {}, + "intel_pmu": {}, + } testcase = "" - for key, value in metrics.items(): - reskey = key.rsplit("/") - if "cpu" in reskey[1] or "intel_rdt" in reskey[1]: + # unicode decode + decoded = ((safe_decode(k, 'utf-8'), safe_decode(v, 'utf-8')) for k, v in metrics.items()) + for key, value in decoded: + key_split = key.split("/") + res_key_iter = (key for key in key_split if "nsb_stats" not in key) + res_key0 = next(res_key_iter) + res_key1 = next(res_key_iter) + + if "cpu" in res_key0 or "intel_rdt" in res_key0 or "intel_pmu" in res_key0: cpu_key, name, metric, testcase = \ - self.get_cpu_data(reskey, value) - if cpu_key in listcores: - res["cpu"].setdefault(cpu_key, {}).update({name: metric}) - elif "memory" in reskey[1]: - val = re.split(":", value)[1] - res["memory"].update({reskey[2]: val}) - res["timestamp"] = testcase + self.get_cpu_data(res_key0, res_key1, value) + result["cpu"].setdefault(cpu_key, {}).update({name: metric}) + + elif "memory" in res_key0: + result["memory"].update({res_key1: value.split(":")[0]}) + + elif "hugepages" in res_key0: + result["hugepages"].update(self.parse_hugepages(key_split, value)) + + elif "dpdkstat" in res_key0: + result["dpdkstat"].update(self.parse_dpdkstat(key_split, value)) - return res + elif "virt" in res_key1: + result["virt"].update(self.parse_virt(key_split, value)) - def amqp_collect_nfvi_kpi(self, _queue=multiprocessing.Queue()): + elif "ovs_stats" in res_key0: + result["ovs_stats"].update(self.parse_ovs_stats(key_split, value)) + + result["timestamp"] = testcase + + return result + + def amqp_process_for_nfvi_kpi(self): """ amqp collect and return nfvi kpis """ - try: - metric = {} - amqp_client = \ - multiprocessing.Process(target=self.run_collectd_amqp, - args=(_queue,)) - amqp_client.start() - amqp_client.join(7) - amqp_client.terminate() - - while not _queue.empty(): - metric.update(_queue.get()) - except (AttributeError, RuntimeError, TypeError, ValueError): - logging.debug("Failed to get NFVi stats...") - msg = {} - else: - msg = self.parse_collectd_result(metric, self.cores) + if self.amqp_client is None and self.enable: + self.amqp_client = multiprocessing.Process( + name="AmqpClient-{}-{}".format(self.mgmt['ip'], os.getpid()), + target=self.run_collectd_amqp) + self.amqp_client.start() + def amqp_collect_nfvi_kpi(self): + """ amqp collect and return nfvi kpis """ + if not self.enable: + return {} + + metric = {} + while not self._queue.empty(): + metric.update(self._queue.get()) + msg = self.parse_collectd_result(metric) return msg - @classmethod - def _start_collectd(cls, connection, bin_path): - connection.execute('pkill -9 collectd') - collectd = os.path.join(bin_path, "collectd.sh") - provision_tool(connection, collectd) - provision_tool(connection, os.path.join(bin_path, "collectd.conf")) + def _provide_config_file(self, config_file_path, nfvi_cfg, template_kwargs): + template = pkg_resources.resource_string("yardstick.network_services.nfvi", + nfvi_cfg).decode('utf-8') + cfg_content = jinja2.Template(template, trim_blocks=True, lstrip_blocks=True, + finalize=finalize_for_yaml).render( + **template_kwargs) + # cfg_content = io.StringIO(template.format(**template_kwargs)) + cfg_file = os.path.join(config_file_path, nfvi_cfg) + # must write as root, so use sudo + self.connection.execute("cat | sudo tee {}".format(cfg_file), stdin=cfg_content) + + def _prepare_collectd_conf(self, config_file_path): + """ Prepare collectd conf """ + + kwargs = { + "interval": self.interval, + "loadplugins": set(chain(LIST_PLUGINS_ENABLED, self.plugins.keys())), + # Optional fields PortName is descriptive only, use whatever is present + "port_names": self.port_names, + # "ovs_bridge_interfaces": ["br-int"], + "plugins": self.plugins, + } + self._provide_config_file(config_file_path, self.COLLECTD_CONF, kwargs) + + def _start_collectd(self, connection, bin_path): + LOG.debug("Starting collectd to collect NFVi stats") + connection.execute('sudo pkill -x -9 collectd') + bin_path = get_nsb_option("bin_path") + collectd_path = os.path.join(bin_path, "collectd", "sbin", "collectd") + config_file_path = os.path.join(bin_path, "collectd", "etc") + exit_status = connection.execute("which %s > /dev/null 2>&1" % collectd_path)[0] + if exit_status != 0: + LOG.warning("%s is not present disabling", collectd_path) + # disable auto-provisioning because it requires Internet access + # collectd_installer = os.path.join(bin_path, "collectd.sh") + # provision_tool(connection, collectd) + # http_proxy = os.environ.get('http_proxy', '') + # https_proxy = os.environ.get('https_proxy', '') + # connection.execute("sudo %s '%s' '%s'" % ( + # collectd_installer, http_proxy, https_proxy)) + return + if "intel_pmu" in self.plugins: + LOG.debug("Downloading event list for pmu_stats plugin") + cmd = 'sudo bash -c \'cd /opt/tempT/pmu-tools/; python event_download_local.py\'' + connection.execute(cmd) + LOG.debug("Starting collectd to collect NFVi stats") + # ensure collectd.conf.d exists to avoid error/warning + connection.execute("sudo mkdir -p /etc/collectd/collectd.conf.d") + self._prepare_collectd_conf(config_file_path) # Reset amqp queue + LOG.debug("reset and setup amqp to collect data from collectd") + connection.execute("sudo rm -rf /var/lib/rabbitmq/mnesia/rabbit*") connection.execute("sudo service rabbitmq-server start") connection.execute("sudo rabbitmqctl stop_app") connection.execute("sudo rabbitmqctl reset") connection.execute("sudo rabbitmqctl start_app") connection.execute("sudo service rabbitmq-server restart") - # Run collectd - connection.execute(collectd) - connection.execute(os.path.join(bin_path, "collectd", "collectd")) + LOG.debug("Creating admin user for rabbitmq in order to collect data from collectd") + connection.execute("sudo rabbitmqctl delete_user guest") + connection.execute("sudo rabbitmqctl add_user admin admin") + connection.execute("sudo rabbitmqctl authenticate_user admin admin") + connection.execute("sudo rabbitmqctl set_permissions -p / admin '.*' '.*' '.*'") + + LOG.debug("Start collectd service..... %s second timeout", self.timeout) + # intel_pmu plug requires large numbers of files open, so try to set + # ulimit -n to a large value + connection.execute("sudo bash -c 'ulimit -n 1000000 ; %s'" % collectd_path, + timeout=self.timeout) + LOG.debug("Done") def initiate_systemagent(self, bin_path): """ Start system agent for NFVi collection on host """ if self.enable: - self._start_collectd(self.connection, bin_path) + try: + self._start_collectd(self.connection, bin_path) + except Exception: + LOG.exception("Exception during collectd start") + raise def start(self): """ start nfvi collection """ if self.enable: - logging.debug("Start NVFi metric collection...") + LOG.debug("Start NVFi metric collection...") def stop(self): """ stop nfvi collection """ - if self.enable: - agent = "collectd" - logging.debug("Stop resource monitor...") - status, pid = self.check_if_sa_running(agent) - if status: - self.connection.execute('kill -9 %s' % pid) - self.connection.execute('pkill -9 %s' % agent) - self.connection.execute('service rabbitmq-server stop') - self.connection.execute("sudo rabbitmqctl stop_app") + if not self.enable: + return + + agent = "collectd" + LOG.debug("Stop resource monitor...") + + if self.amqp_client is not None: + # we proper and try to join first + self.amqp_client.join(3) + self.amqp_client.terminate() + + LOG.debug("Check if %s is running", agent) + status, pid = self.check_if_sa_running(agent) + LOG.debug("status %s pid %s", status, pid) + if status != 0: + return + + if pid: + self.connection.execute('sudo kill -9 "%s"' % pid) + self.connection.execute('sudo pkill -9 "%s"' % agent) + self.connection.execute('sudo service rabbitmq-server stop') + self.connection.execute("sudo rabbitmqctl stop_app")