add collectd resource node capability
[yardstick.git] / yardstick / network_services / nfvi / resource.py
1 # Copyright (c) 2016-2017 Intel Corporation
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
6 #
7 #      http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 """ Resource collection definitions """
15
16 from __future__ import absolute_import
17 from __future__ import print_function
18
19 import logging
20 from itertools import chain
21
22 import jinja2
23 import os
24 import os.path
25 import re
26 import multiprocessing
27 import pkg_resources
28
29 from oslo_config import cfg
30 from oslo_utils.encodeutils import safe_decode
31
32 from yardstick import ssh
33 from yardstick.common.task_template import finalize_for_yaml
34 from yardstick.common.utils import validate_non_string_sequence
35 from yardstick.network_services.nfvi.collectd import AmqpConsumer
36 from yardstick.network_services.utils import get_nsb_option
37
# Module-level logger, named after this module
LOG = logging.getLogger(__name__)

# Global oslo.config configuration object
CONF = cfg.CONF
# NOTE(review): ZMQ_OVS_PORT and ZMQ_POLLING_TIME are not referenced by the
# code reviewed here; their consumers (if any) live elsewhere - TODO confirm
ZMQ_OVS_PORT = 5567
ZMQ_POLLING_TIME = 12000
# collectd plugins that are always placed in the "loadplugins" template value,
# in addition to whatever plugins a ResourceProfile is configured with
LIST_PLUGINS_ENABLED = ["amqp", "cpu", "cpufreq", "memory",
                        "hugepages"]
45
46
class ResourceProfile(object):
    """
    This profile adds a resource at the beginning of the test session

    It configures and starts collectd on a node over SSH and reads the
    published metrics back through an AMQP consumer process.
    """
    # Name of the collectd configuration template and of the generated file
    COLLECTD_CONF = "collectd.conf"
    # Broker port used when building the AMQP URL
    # NOTE(review): the name looks like a typo of "AMQP_PORT"; it is kept
    # unchanged because code outside this class may reference it
    AMPQ_PORT = 5672
    # Interval used when the caller passes none
    # (presumably seconds - TODO confirm against the collectd template)
    DEFAULT_INTERVAL = 25
    # Timeout used when the caller passes none
    # (presumably seconds - TODO confirm; only stored by the code shown here)
    DEFAULT_TIMEOUT = 3600
55
56     def __init__(self, mgmt, port_names=None, cores=None, plugins=None,
57                  interval=None, timeout=None):
58
59         if plugins is None:
60             self.plugins = {}
61         else:
62             self.plugins = plugins
63
64         if interval is None:
65             self.interval = self.DEFAULT_INTERVAL
66         else:
67             self.interval = interval
68
69         if timeout is None:
70             self.timeout = self.DEFAULT_TIMEOUT
71         else:
72             self.timeout = timeout
73
74         self.enable = True
75         self.cores = validate_non_string_sequence(cores, default=[])
76         self._queue = multiprocessing.Queue()
77         self.amqp_client = None
78         self.port_names = validate_non_string_sequence(port_names, default=[])
79
80         # we need to save mgmt so we can connect to port 5672
81         self.mgmt = mgmt
82         self.connection = ssh.AutoConnectSSH.from_node(mgmt)
83
84     def check_if_sa_running(self, process):
85         """ verify if system agent is running """
86         status, pid, _ = self.connection.execute("pgrep -f %s" % process)
87         return status == 0, pid
88
89     def run_collectd_amqp(self):
90         """ run amqp consumer to collect the NFVi data """
91         amqp_url = 'amqp://admin:admin@{}:{}/%2F'.format(self.mgmt['ip'], self.AMPQ_PORT)
92         amqp = AmqpConsumer(amqp_url, self._queue)
93         try:
94             amqp.run()
95         except (AttributeError, RuntimeError, KeyboardInterrupt):
96             amqp.stop()
97
98     @classmethod
99     def parse_simple_resource(cls, key, value):
100         reskey = "/".join(rkey for rkey in key if "nsb_stats" not in rkey)
101         return {reskey: value.split(":")[1]}
102
103     @classmethod
104     def get_cpu_data(cls, res_key0, res_key1, value):
105         """ Get cpu topology of the host """
106         pattern = r"-(\d+)"
107
108         if 'cpufreq' in res_key0:
109             metric, source = res_key0, res_key1
110         else:
111             metric, source = res_key1, res_key0
112
113         match = re.search(pattern, source, re.MULTILINE)
114         if not match:
115             return "error", "Invalid", "", ""
116
117         time, value = value.split(":")
118         return str(match.group(1)), metric, value, time
119
120     @classmethod
121     def parse_hugepages(cls, key, value):
122         return cls.parse_simple_resource(key, value)
123
124     @classmethod
125     def parse_dpdkstat(cls, key, value):
126         return cls.parse_simple_resource(key, value)
127
128     @classmethod
129     def parse_virt(cls, key, value):
130         return cls.parse_simple_resource(key, value)
131
132     @classmethod
133     def parse_ovs_stats(cls, key, value):
134         return cls.parse_simple_resource(key, value)
135
136     @classmethod
137     def parse_intel_pmu_stats(cls, key, value):
138         return {''.join(str(v) for v in key): value.split(":")[1]}
139
140     def parse_collectd_result(self, metrics, core_list):
141         """ convert collectd data into json"""
142         result = {
143             "cpu": {},
144             "memory": {},
145             "hugepages": {},
146             "dpdkstat": {},
147             "virt": {},
148             "ovs_stats": {},
149             "intel_pmu": {},
150         }
151         testcase = ""
152
153         # unicode decode
154         decoded = ((safe_decode(k, 'utf-8'), safe_decode(v, 'utf-8')) for k, v in metrics.items())
155         for key, value in decoded:
156             key_split = key.split("/")
157             res_key_iter = (key for key in key_split if "nsb_stats" not in key)
158             res_key0 = next(res_key_iter)
159             res_key1 = next(res_key_iter)
160
161             if "cpu" in res_key0 or "intel_rdt" in res_key0:
162                 cpu_key, name, metric, testcase = \
163                     self.get_cpu_data(res_key0, res_key1, value)
164                 if cpu_key in core_list:
165                     result["cpu"].setdefault(cpu_key, {}).update({name: metric})
166
167             elif "memory" in res_key0:
168                 result["memory"].update({res_key1: value.split(":")[0]})
169
170             elif "hugepages" in res_key0:
171                 result["hugepages"].update(self.parse_hugepages(key_split, value))
172
173             elif "dpdkstat" in res_key0:
174                 result["dpdkstat"].update(self.parse_dpdkstat(key_split, value))
175
176             elif "virt" in res_key1:
177                 result["virt"].update(self.parse_virt(key_split, value))
178
179             elif "ovs_stats" in res_key0:
180                 result["ovs_stats"].update(self.parse_ovs_stats(key_split, value))
181
182             elif "intel_pmu-all" in res_key0:
183                 result["intel_pmu"].update(self.parse_intel_pmu_stats(res_key1, value))
184
185         result["timestamp"] = testcase
186
187         return result
188
189     def amqp_process_for_nfvi_kpi(self):
190         """ amqp collect and return nfvi kpis """
191         if self.amqp_client is None and self.enable:
192             self.amqp_client = \
193                 multiprocessing.Process(target=self.run_collectd_amqp)
194             self.amqp_client.start()
195
196     def amqp_collect_nfvi_kpi(self):
197         """ amqp collect and return nfvi kpis """
198         if not self.enable:
199             return {}
200
201         metric = {}
202         while not self._queue.empty():
203             metric.update(self._queue.get())
204         msg = self.parse_collectd_result(metric, self.cores)
205         return msg
206
207     def _provide_config_file(self, config_file_path, nfvi_cfg, template_kwargs):
208         template = pkg_resources.resource_string("yardstick.network_services.nfvi",
209                                                  nfvi_cfg).decode('utf-8')
210         cfg_content = jinja2.Template(template, trim_blocks=True, lstrip_blocks=True,
211                                       finalize=finalize_for_yaml).render(
212             **template_kwargs)
213         # cfg_content = io.StringIO(template.format(**template_kwargs))
214         cfg_file = os.path.join(config_file_path, nfvi_cfg)
215         # must write as root, so use sudo
216         self.connection.execute("cat | sudo tee {}".format(cfg_file), stdin=cfg_content)
217
218     def _prepare_collectd_conf(self, config_file_path):
219         """ Prepare collectd conf """
220
221         kwargs = {
222             "interval": self.interval,
223             "loadplugins": set(chain(LIST_PLUGINS_ENABLED, self.plugins.keys())),
224             # Optional fields PortName is descriptive only, use whatever is present
225             "port_names": self.port_names,
226             # "ovs_bridge_interfaces": ["br-int"],
227             "plugins": self.plugins,
228         }
229         self._provide_config_file(config_file_path, self.COLLECTD_CONF, kwargs)
230
231     def _start_collectd(self, connection, bin_path):
232         LOG.debug("Starting collectd to collect NFVi stats")
233         connection.execute('sudo pkill -x -9 collectd')
234         bin_path = get_nsb_option("bin_path")
235         collectd_path = os.path.join(bin_path, "collectd", "sbin", "collectd")
236         config_file_path = os.path.join(bin_path, "collectd", "etc")
237         exit_status = connection.execute("which %s > /dev/null 2>&1" % collectd_path)[0]
238         if exit_status != 0:
239             LOG.warning("%s is not present disabling", collectd_path)
240             # disable auto-provisioning because it requires Internet access
241             # collectd_installer = os.path.join(bin_path, "collectd.sh")
242             # provision_tool(connection, collectd)
243             # http_proxy = os.environ.get('http_proxy', '')
244             # https_proxy = os.environ.get('https_proxy', '')
245             # connection.execute("sudo %s '%s' '%s'" % (
246             #     collectd_installer, http_proxy, https_proxy))
247             return
248         LOG.debug("Starting collectd to collect NFVi stats")
249         # ensure collectd.conf.d exists to avoid error/warning
250         connection.execute("sudo mkdir -p /etc/collectd/collectd.conf.d")
251         self._prepare_collectd_conf(config_file_path)
252
253         # Reset amqp queue
254         LOG.debug("reset and setup amqp to collect data from collectd")
255         connection.execute("sudo rm -rf /var/lib/rabbitmq/mnesia/rabbit*")
256         connection.execute("sudo service rabbitmq-server start")
257         connection.execute("sudo rabbitmqctl stop_app")
258         connection.execute("sudo rabbitmqctl reset")
259         connection.execute("sudo rabbitmqctl start_app")
260         connection.execute("sudo service rabbitmq-server restart")
261
262         LOG.debug("Creating admin user for rabbitmq in order to collect data from collectd")
263         connection.execute("sudo rabbitmqctl delete_user guest")
264         connection.execute("sudo rabbitmqctl add_user admin admin")
265         connection.execute("sudo rabbitmqctl authenticate_user admin admin")
266         connection.execute("sudo rabbitmqctl set_permissions -p / admin '.*' '.*' '.*'")
267
268         LOG.debug("Start collectd service.....")
269         connection.execute("sudo %s" % collectd_path)
270         LOG.debug("Done")
271
272     def initiate_systemagent(self, bin_path):
273         """ Start system agent for NFVi collection on host """
274         if self.enable:
275             try:
276                 self._start_collectd(self.connection, bin_path)
277             except Exception:
278                 LOG.exception("Exception during collectd start")
279                 raise
280
281     def start(self):
282         """ start nfvi collection """
283         if self.enable:
284             LOG.debug("Start NVFi metric collection...")
285
286     def stop(self):
287         """ stop nfvi collection """
288         if not self.enable:
289             return
290
291         agent = "collectd"
292         LOG.debug("Stop resource monitor...")
293
294         if self.amqp_client is not None:
295             self.amqp_client.terminate()
296
297         status, pid = self.check_if_sa_running(agent)
298         if status == 0:
299             return
300
301         self.connection.execute('sudo kill -9 %s' % pid)
302         self.connection.execute('sudo pkill -9 %s' % agent)
303         self.connection.execute('sudo service rabbitmq-server stop')
304         self.connection.execute("sudo rabbitmqctl stop_app")