1 # -*- coding: utf-8 -*-
3 # Licensed under the Apache License, Version 2.0 (the "License"); you may
4 # not use this file except in compliance with the License. You may obtain
5 # a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12 # License for the specific language governing permissions and limitations
15 """Classes used by collectd.py"""
23 from opnfv.deployment import factory
25 from functest.utils import constants
# Paths and defaults shared by the barometer functest helpers below.
27 ID_RSA_PATH = '/root/.ssh/id_rsa'
28 SSH_KEYS_SCRIPT = '/home/opnfv/barometer/baro_utils/get_ssh_keys.sh'
# Fallback collectd polling interval (seconds) when a plugin conf omits it.
29 DEF_PLUGIN_INTERVAL = 10
30 COLLECTD_CONF = '/etc/collectd.conf'
31 COLLECTD_CONF_DIR = '/etc/collectd/collectd.conf.d'
32 NOTIFICATION_FILE = '/var/log/python-notifications.dump'
33 COLLECTD_NOTIFICATION = '/etc/collectd_notification_dump.py'
# NOTE(review): os.getenv returns None when INSTALLER_IP is unset, so
# .rstrip() would raise AttributeError at import time — confirm the CI
# job always exports INSTALLER_IP, or supply a default.
34 APEX_IP = os.getenv("INSTALLER_IP").rstrip('\n')
36 APEX_USER_STACK = 'stack'
37 APEX_PKEY = '/root/.ssh/id_rsa'
41 """Node configuration class"""
42     def __init__(self, attrs):
# Parse one row of `nova list` table output (already split on '|').
# NOTE(review): inner numbering jumps (line 44 missing) — an attrs[1]
# assignment (presumably the node id) appears to be lost from this
# capture; confirm against version control.
43         self.__null = attrs[0]
45         self.__name = attrs[2]
46         self.__status = attrs[3] if attrs[3] else None
47         self.__taskState = attrs[4]
48         self.__pwrState = attrs[5]
# Drop the "network=" style prefix so only the bare IP remains.
49         self.__ip = re.sub('^[a-z]+=', '', attrs[6])
60 """Get node IP address"""
69 handler = factory.Factory.get_handler('apex',
73 nodes = handler.get_nodes()
77 class ConfigServer(object):
78 """Class to get env configuration"""
79     def __init__(self, host, user, logger, priv_key=None):
# Open an SSH/SFTP session to the Apex undercloud and build the cached
# overcloud node list by parsing `nova list` output.
# NOTE(review): many original lines are missing in this capture (inner
# numbering jumps): the self.__host/__user/__passwd assignments, the
# attempt counter init, several log calls and the final append target
# are not visible — review the full file before changing anything here.
83         self.__priv_key = priv_key
85         self.__logger = logger
87         self.__private_key_file = ID_RSA_PATH
# Fail fast when no key material is available for authentication.
88         if not os.path.isfile(self.__private_key_file):
90             "Private key file '{}'".format(self.__private_key_file)
92             raise IOError("Private key file '{}' not found.".format(
93                 self.__private_key_file))
95         # get list of available nodes
96         ssh, sftp = self.__open_sftp_session(
97             self.__host, self.__user, self.__passwd)
# Retry `nova list` up to 10 times; transient failures are expected
# while undercloud services settle.
99         fuel_node_passed = False
101         while (attempt <= 10) and not fuel_node_passed:
102             stdin, stdout, stderr = ssh.exec_command(
103                 "source stackrc; nova list")
104             stderr_lines = stderr.readlines()
106                 self.__logger.warning(
107                     "'Apex node' command failed (try {}):".format(attempt))
108                 for line in stderr_lines:
109                     self.__logger.debug(line.strip())
111                 fuel_node_passed = True
114                     "'Apex node' command passed (try {})".format(attempt))
116         if not fuel_node_passed:
118                 "'Apex node' command failed. This was the last try.")
120                 "'Apex node' command failed. This was the last try.")
121         node_table = stdout.readlines()\
123         # skip table title and parse table values
125         for entry in node_table[3:]:
# Skip ASCII-table separator rows and blank lines.
126             if entry[0] == '+' or entry[0] == '\n':
131                 Node([str(x.strip(' \n')) for x in entry.split('|')]))
def get_controllers(self):
    """Return the cached overcloud nodes whose name marks them as controllers.

    Fix: the original body contained a Python 2 ``print`` *statement*
    (``print self.__nodes[0]._Node__ip``) — a SyntaxError under Python 3 —
    which also leaked Node's name-mangled private ``__ip`` attribute as
    debug output.  The debug print is removed; the filtered list is
    returned unchanged.
    """
    return [node for node in self.__nodes
            if 'controller' in node.get_name()]
def get_computes(self):
    """Return every cached overcloud node whose name marks it as a compute."""
    computes = []
    for node in self.__nodes:
        if 'compute' in node.get_name():
            computes.append(node)
    return computes
148     def __open_sftp_session(self, host, user, passwd=None):
149         # Connect to given host.
150         """Keyword arguments:
151         host -- host to connect
153         passwd -- password to use
155         Return tuple of SSH and SFTP client instances.
# NOTE(review): AutoAddPolicy silently trusts unknown host keys — fine
# for a lab deployment, unacceptable elsewhere; confirm this is only
# ever pointed at the test undercloud.
158         ssh = paramiko.SSHClient()
159         ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy())
161         # try a direct access using password or private key
# Fall back to the key file only when neither a password nor a key
# object was supplied.
162         if not passwd and not self.__priv_key:
164             self.__priv_key = paramiko.RSAKey.from_private_key_file(
165                 self.__private_key_file)
167         # connect to the server
# NOTE(review): the ssh.connect( opening line and the final
# `return ssh, sftp` are among the lines missing from this capture.
169             host, username=user, password=passwd, pkey=self.__priv_key)
170         sftp = ssh.open_sftp()
172         # return SFTP client instance
175     def get_plugin_interval(self, compute, plugin):
176         """Find the plugin interval in collectd configuration.
179         compute -- compute node instance
180         plugin -- plug-in name
182         If found, return interval value, otherwise the default value"""
# NOTE(review): inner numbering jumps — the `for node in nodes:` line,
# the empty-stdout guard and the Interval-value return are missing from
# this capture.
183         default_interval = DEF_PLUGIN_INTERVAL
184         compute_name = compute.get_name()
185         nodes = get_apex_nodes()
187             if compute_name == node.get_dict()['name']:
# Read the plugin's own snippet from the collectd conf.d directory.
188                 stdout = node.run_cmd(
189                     'cat /etc/collectd/collectd.conf.d/{}.conf'.format(plugin))
191                     return default_interval
192                 for line in stdout.split('\n'):
193                     if 'Interval' in line:
195         return default_interval
197     def get_plugin_config_values(self, compute, plugin, parameter):
198         """Get parameter values from collectd config file.
201         compute -- compute node instance
202         plugin -- plug-in name
203         parameter -- plug-in parameter
205         Return list of found values."""
# NOTE(review): the `default_values = ...` initialisation (inner line
# 206) and the node loop header are missing from this capture; as shown
# the two `return default_values` lines reference an unbound name.
# NOTE(review): despite the `parameter` argument, the visible branches
# only match the hard-coded keys Interfaces/Bridges/Cores.
207         compute_name = compute.get_name()
208         nodes = get_apex_nodes()
210             if compute_name == node.get_dict()['name']:
211                 stdout = node.run_cmd(
212                     'cat /etc/collectd/collectd.conf.d/{}.conf' .format(plugin))
214                     return default_values
215                 for line in stdout.split('\n'):
216                     if 'Interfaces' in line:
217                         return line.split(' ', 1)[1]
218                     elif 'Bridges' in line:
219                         return line.split(' ', 1)[1]
220                     elif 'Cores' in line:
221                         return line.split(' ', 1)[1]
224         return default_values
226     def execute_command(self, command, host_ip=None, ssh=None):
227         """Execute command on node and return list of lines of standard output.
231         host_ip -- IP of the node
232         ssh -- existing open SSH session to use
234         One of host_ip or ssh must not be None. If both are not None,
235         existing ssh session is used.
237         if host_ip is None and ssh is None:
238             raise ValueError('One of host_ip or ssh must not be None.')
# NOTE(review): the `if ssh is None:` guard (inner line 239) is missing
# from this capture.  Hard-coded root password 'opnfvapex' — lab-only
# credential; flag if this file ever leaves the test harness.
240             ssh, sftp = self.__open_sftp_session(host_ip, 'root', 'opnfvapex')
241         stdin, stdout, stderr = ssh.exec_command(command)
242         return stdout.readlines()
244     def get_ovs_interfaces(self, compute):
245         """Get list of configured OVS interfaces
248         compute -- compute node instance
# NOTE(review): the node loop header and the post-processing/return of
# the `ovs-vsctl list-br` output are missing from this capture.
250         compute_name = compute.get_name()
251         nodes = get_apex_nodes()
253             if compute_name == node.get_dict()['name']:
254                 stdout = node.run_cmd('sudo ovs-vsctl list-br')
257     def is_gnocchi_running(self, controller):
258         """Check whether Gnocchi is running on controller.
261         controller -- controller node instance
263         Return boolean value whether Gnocchi is running.
265         gnocchi_present = False
266         controller_name = controller.get_name()
267         nodes = get_apex_nodes()
# NOTE(review): the loop header and the `if not stdout:` branch paired
# with the `elif` below are missing from this capture.
269             if controller_name == node.get_dict()['name']:
# Ship the overcloud credentials to the node, then query the catalog.
270                 node.put_file(constants.ENV_FILE, 'overcloudrc.v3')
271                 stdout = node.run_cmd(
272                     "source overcloudrc.v3;"
273                     + "openstack catalog list | grep gnocchi")
276                 elif 'gnocchi' in stdout:
277                     gnocchi_present = True
278                     return gnocchi_present
281         return gnocchi_present
283     def is_aodh_running(self, controller):
284         """Check whether aodh service is running on controller
# Mirrors is_gnocchi_running but greps the catalog for 'aodh'.
# NOTE(review): the aodh_present init, loop header, `if not stdout:`
# branch and both returns are missing from this capture.
287         controller_name = controller.get_name()
288         nodes = get_apex_nodes()
290             if controller_name == node.get_dict()['name']:
291                 node.put_file(constants.ENV_FILE, 'overcloudrc.v3')
292                 stdout = node.run_cmd(
293                     "source overcloudrc.v3;"
294                     + "openstack catalog list | grep aodh")
297                 elif 'aodh' in stdout:
304     def is_redis_running(self, compute):
305         """Check whether redis service is running on compute"""
# Looks for the barometer-redis container in `systemctl status docker`.
# NOTE(review): the loop header, the `return True`/else/`return False`
# lines and the .format() arguments are missing from this capture.
306         compute_name = compute.get_name()
307         nodes = get_apex_nodes()
309             if compute_name == node.get_dict()['name']:
310                 stdout = node.run_cmd('sudo systemctl status docker'
312                                       '| grep barometer-redis')
313                 if stdout and 'barometer-redis' in stdout:
315                         'Redis is running in node {}'.format(
319                         'Redis is *not* running in node {}'.format(
323     def is_localagent_server_running(self, compute):
324         """Check whether LocalAgent server is running on compute"""
# Same docker-grep pattern as is_redis_running, matching the
# barometer-localagent container in '/server' mode.
# NOTE(review): loop header, returns and .format() args missing here.
325         compute_name = compute.get_name()
326         nodes = get_apex_nodes()
328             if compute_name == node.get_dict()['name']:
329                 stdout = node.run_cmd('sudo systemctl status docker'
331                                       '| grep opnfv/barometer-localagent')
332                 if stdout and '/server' in stdout:
334                         'LocalAgent Server is running in node {}'.format(
338                         'LocalAgent Server is *not* running in node {}'.format(
342     def is_localagent_infofetch_running(self, compute):
343         """Check whether LocalAgent infofetch is running on compute"""
# Twin of is_localagent_server_running, but matches '/infofetch'.
# NOTE(review): loop header, returns and .format() args missing here.
344         compute_name = compute.get_name()
345         nodes = get_apex_nodes()
347             if compute_name == node.get_dict()['name']:
348                 stdout = node.run_cmd('sudo systemctl status docker'
350                                       '| grep opnfv/barometer-localagent')
351                 if stdout and '/infofetch' in stdout:
353                         'LocalAgent InfoFetch is running in node {}'.format(
357                         'LocalAgent InfoFetch is *not* running in node {}'.format(
361     def get_localagent_config(self, compute):
362         """Get config values of LocalAgent"""
# Scrapes listen_port/amqp_* keys out of the TOML config with egrep/sed
# and feeds the result to the YAML parser (a stopgap until python-toml
# is accepted into functest, per the commented block below).
# NOTE(review): loop header, the readcmd assignment opening, closing
# quotes of several commands and the final return are missing here.
363         compute_name = compute.get_name()
364         nodes = get_apex_nodes()
366             if compute_name == node.get_dict()['name']:
367                 # We use following after functest accept python-toml
368                 # stdout = node.run_cmd(
369                 #     'cat /etc/barometer-localagent/config.toml')
371                 #     agent_conf = toml.loads(stdout)
372                 # except (TypeError, TomlDecodeError) as e:
373                 #     self.__logger.error(
374                 #         'LocalAgent config error: {}'.format(e))
379                            'egrep "listen_port|amqp_"'
380                            ' /etc/barometer-localagent/config.toml'
381                            '| sed -e "s/#.*$//" | sed -e "s/=/:/"'
383                 stdout = node.run_cmd(readcmd)
# NOTE(review): yaml.load without an explicit Loader is unsafe on
# untrusted input and deprecated in PyYAML >= 5.1 — the data here comes
# from the node's own config file, but prefer yaml.safe_load.
384                 agent_conf = {"server": yaml.load(stdout)}
# Resolve the amqp_host name to an IP via ping output scraping.
387                            'ping -n -c1 ' + agent_conf["server"]["amqp_host"] +
388                            '| sed -ne "s/^.*bytes from //p" | sed -e "s/:.*//"'
390                 agent_conf["server"]["amqp_host"] = node.run_cmd(pingcmd)
395     def is_mcelog_installed(self, compute, package):
396         """Check whether package exists on compute node.
399         compute -- compute node instance
400         package -- Linux package to search for
402         Return boolean value whether package is installed.
# NOTE(review): despite the `package` argument, the visible command
# greps the literal 'mcelog'.  Loop header, `if not stdout:` branch and
# both returns are missing from this capture.
404         compute_name = compute.get_name()
405         nodes = get_apex_nodes()
407             if compute_name == node.get_dict()['name']:
408                 stdout = node.run_cmd(
409                     'rpm -qa | grep mcelog')
412                 elif 'mcelog' in stdout:
417     def is_rdt_available(self, compute):
418         """Check whether the compute node is a virtual machine."""
# A 'hypervisor' flag in /proc/cpuinfo means the node is virtual, so
# Intel RDT (a bare-metal feature) is unavailable.
# NOTE(review): loop header, log lines and both returns are missing.
419         compute_name = compute.get_name()
420         nodes = get_apex_nodes()
422             if compute_name == node.get_dict()['name']:
423                 stdout = node.run_cmd('cat /proc/cpuinfo | grep hypervisor')
424                 if 'hypervisor' in stdout:
428     def is_libpqos_on_node(self, compute):
429         """Check whether libpqos is present on compute node"""
# Presence of the library in /usr/local/lib implies RDT support tooling
# was installed.
# NOTE(review): loop header and both returns are missing here.
431         compute_name = compute.get_name()
432         nodes = get_apex_nodes()
434             if compute_name == node.get_dict()['name']:
435                 stdout = node.run_cmd('ls /usr/local/lib/ | grep libpqos')
436                 if 'libpqos' in stdout:
440     def check_aodh_plugin_included(self, compute):
441         """Check if aodh plugin is included in collectd.conf file.
442         If not, try to enable it.
445         compute -- compute node instance
447         Return boolean value whether AODH plugin is included
448         or it's enabling was successful.
# NOTE(review): the loop header, the error-return branch and the final
# returns are missing from this capture.
450         compute_name = compute.get_name()
451         nodes = get_apex_nodes()
453             if compute_name == node.get_dict()['name']:
454                 aodh_conf = node.run_cmd('ls /etc/collectd/collectd.conf.d')
455                 if 'aodh.conf' not in aodh_conf:
457                         "AODH Plugin not included in {}".format(compute_name))
461                         "AODH plugin present in compute node {}" .format(
466     def check_gnocchi_plugin_included(self, compute):
467         """Check if gnocchi plugin is included in collectd.conf file.
468         If not, try to enable it.
471         compute -- compute node instance
473         Return boolean value whether gnocchi plugin is included
474         or it's enabling was successful.
# Twin of check_aodh_plugin_included for collectd-ceilometer-plugin.
# NOTE(review): loop header, error branch and returns missing here.
476         compute_name = compute.get_name()
477         nodes = get_apex_nodes()
479             if compute_name == node.get_dict()['name']:
480                 gnocchi_conf = node.run_cmd('ls /etc/collectd/collectd.conf.d')
481                 if 'collectd-ceilometer-plugin.conf' not in gnocchi_conf:
483                         "Gnocchi Plugin not included in node {}".format(
488                         "Gnocchi plugin available in compute node {}" .format(
493     def check_snmp_plugin_included(self, compute):
494         """Check if SNMP plugin is active in compute node.
# Walks the Intel RDT MIB via the local SNMP agent; non-empty output
# implies the collectd SNMP plugin is serving the MIB.
# NOTE(review): loop header and the result-evaluation/returns are
# missing from this capture.
496         snmp_mib = '/usr/share/snmp/mibs/Intel-Rdt.txt'
497         snmp_string = 'INTEL-RDT-MIB::intelRdt'
498         compute_name = compute.get_name()
499         nodes = get_apex_nodes()
501             if compute_name == node.get_dict()['name']:
502                 stdout = node.run_cmd(
503                     'snmpwalk -v2c -m {0} -c public localhost {1}' .format(
504                         snmp_mib, snmp_string))
505                 self.__logger.info("snmp output = {}" .format(stdout))
512 self, compute, plugins, error_plugins, create_backup=True):
513 """Enable plugins on compute node
516 compute -- compute node instance
517 plugins -- list of plugins to be enabled
519 Return boolean value indicating whether function was successful.
521 csv_file = os.path.dirname(os.path.realpath(__file__)) + '/csv.conf'
522 plugins = sorted(plugins)
523 compute_name = compute.get_name()
524 nodes = get_apex_nodes()
526 if compute_name == node.get_dict()['name']:
527 node.put_file(csv_file, 'csv.conf')
530 + '/etc/collectd/collectd.conf.d/csv.conf')
533     def restart_collectd(self, compute):
534         """Restart collectd on compute node.
537         compute -- compute node instance
539         Retrun tuple with boolean indicating success and list of warnings
540         received during collectd start.
# NOTE(review): the sleep between stop/start, the warning-list builder
# opening and the success return are missing from this capture.
542         compute_name = compute.get_name()
543         nodes = get_apex_nodes()
545         def get_collectd_processes(compute_node):
546             """Get number of running collectd processes.
549             ssh_session -- instance of SSH session in which to check
# pgrep output line count serves as the process count (return missing
# from this capture).
552             stdout = compute_node.run_cmd("pgrep collectd")
556             if compute_name == node.get_dict()['name']:
557                 # node.run_cmd('su; "opnfvapex"')
558                 self.__logger.info('Stopping collectd service...')
559                 node.run_cmd('sudo systemctl stop collectd')
# Verify the stop actually took effect before restarting.
561                 if get_collectd_processes(node):
562                     self.__logger.error('Collectd is still running...')
564                 self.__logger.info('Starting collectd service...')
565                 stdout = node.run_cmd('sudo systemctl start collectd')
568                     output.strip() for output in stdout if 'WARN: ' in output]
569                 if get_collectd_processes(node) == 0:
570                     self.__logger.error('Collectd is still not running...')
571                     return False, warning
def trigger_alarm_update(self, alarm, compute_node):
    """Provoke the fault condition that the given alarm type watches for.

    alarm -- alarm/criteria name ('mcelog' or 'ovs_events')
    compute_node -- node on which to run the fault-injection commands
    """
    # TODO: move these actions to main, with criteria lists so that we can reference that
    # i.e. test_plugin_with_aodh(self, compute, plugin.., logger, criteria_list, alarm_action)
    # Dispatch table: commands that inject the matching fault, in order.
    injection_commands = {
        'mcelog': (
            'sudo modprobe mce-inject',
            'sudo ./mce-inject_ea < corrected',
        ),
        'ovs_events': (
            'sudo ifconfig -a | grep br0',
            'sudo ifconfig br0 down; sudo ifconfig br0 up',
        ),
    }
    for command in injection_commands.get(alarm, ()):
        compute_node.run_cmd(command)
584     def test_plugins_with_aodh(
585             self, compute, plugin_interval, logger,
# Trigger the alarm, read its state_timestamp via `aodh alarm show`,
# trigger again after the plugin interval, and require the timestamp to
# have changed.
# NOTE(review): the criteria_list parameter line, sleeps, several
# `if not stdout:` guards and the final return are missing from this
# capture; timestamps1/timestamps2 may also be unbound if the expected
# 'state_timestamp' row never appears — verify in the full file.
591         nodes = get_apex_nodes()
592         compute_node = [node for node in nodes if node.get_dict()['name'] == compute][0]
594             if node.is_controller():
595                 self.__logger.info('Getting AODH Alarm list on {}' .format(
596                     (node.get_dict()['name'])))
597                 node.put_file(constants.ENV_FILE, 'overcloudrc.v3')
598                 self.trigger_alarm_update(criteria_list, compute_node)
599                 stdout = node.run_cmd(
600                     "source overcloudrc.v3;"
601                     + "aodh alarm list | grep {0} | grep {1}"
602                     .format(criteria_list, compute))
604                     self.__logger.info("aodh alarm list was empty")
# First column of the grep hit is the alarm/metric id.
606                 for line in stdout.splitlines():
607                     line = line.replace('|', "")
608                     metric_id = line.split()[0]
609                     stdout = node.run_cmd(
610                         'source overcloudrc.v3; aodh alarm show {}' .format(
613                         self.__logger.info("aodh alarm list was empty")
615                     for line in stdout.splitlines()[3: -1]:
616                         line = line.replace('|', "")
617                         if line.split()[0] == 'state_timestamp':
618                             timestamps1 = line.split()[1]
619                     self.trigger_alarm_update(criteria_list, compute_node)
621                     stdout = node.run_cmd(
622                         "source overcloudrc.v3; aodh alarm show {}" .format(
625                         self.__logger.info("aodh alarm list was empty")
627                     for line in stdout.splitlines()[3:-1]:
628                         line = line.replace('|', "")
629                         if line.split()[0] == 'state_timestamp':
630                             timestamps2 = line.split()[1]
631                     if timestamps1 == timestamps2:
633                             "Data not updated after interval of 12 seconds")
636                         self.__logger.info("PASS")
639     def test_plugins_with_gnocchi(
640             self, compute, plugin_interval, logger,
# Same pattern as test_plugins_with_aodh, but compares the first-column
# timestamp of `gnocchi measures show` before and after sleeping one
# plugin interval (doubled for slow plugins).
# NOTE(review): the criteria_list parameter line, the default
# sleep_time branch, several guards and the final return are missing
# from this capture.
646         nodes = get_apex_nodes()
647         if plugin_interval > 15:
648             sleep_time = plugin_interval*2
653             if node.is_controller():
654                 self.__logger.info('Getting gnocchi metric list on {}' .format(
655                     (node.get_dict()['name'])))
656                 node.put_file(constants.ENV_FILE, 'overcloudrc.v3')
657                 stdout = node.run_cmd(
658                     "source overcloudrc.v3;"
659                     + "gnocchi metric list | grep {0} | grep {1}"
660                     .format(criteria_list, compute))
662                     self.__logger.info("gnocchi list was empty")
664                 for line in stdout.splitlines():
665                     line = line.replace('|', "")
666                     metric_id = line.split()[0]
667                     stdout = node.run_cmd(
668                         'source overcloudrc.v3;gnocchi measures show {}'.format(
671                         self.__logger.info("gnocchi list was empty")
673                     for line in stdout.splitlines()[3: -1]:
677                         timestamps1 = line.replace('|', "")
678                         timestamps1 = timestamps1.split()[0]
# Wait one interval so a fresh measure can land, then re-read.
679                     time.sleep(sleep_time)
680                     stdout = node.run_cmd(
681                         "source overcloudrc.v3;gnocchi measures show {}".format(
684                         self.__logger.info("gnocchi measures was empty")
686                     for line in stdout.splitlines()[3:-1]:
690                         timestamps2 = line.replace('|', "")
691                         timestamps2 = timestamps2.split()[0]
692                     if timestamps1 == timestamps2:
694                             "Plugin Interval is {}" .format(plugin_interval))
696                             "Data not updated after {} seconds".format(
700                         self.__logger.info("PASS")
704     def test_plugins_with_snmp(
# NOTE(review): mutable default arguments ([] three times) are shared
# across calls — a classic Python pitfall; left untouched here, but
# should become None-sentinels when this file is properly edited.
705             self, compute, plugin_interval, logger, plugin, snmp_mib_files=[],
706             snmp_mib_strings=[], snmp_in_commands=[]):
# Snmpwalk the plugin's MIB, capture a counter, walk again after the
# interval, and require the counter to have changed.
# NOTE(review): sleeps, several else-branches and the final comparison
# returns are missing from this capture.
708         if plugin in ('hugepages', 'intel_rdt', 'mcelog'):
709             nodes = get_apex_nodes()
711                 if compute == node.get_dict()['name']:
712                     stdout = node.run_cmd(
713                         'snmpwalk -v2c -m {0} -c public localhost {1}' .format(
714                             snmp_mib_files, snmp_mib_strings))
715                     self.__logger.info("{}" .format(stdout))
717                         self.__logger.info("No output from snmpwalk")
# 'OID' in the output marks an snmpwalk "No Such Object" style failure.
719                     elif 'OID' in stdout:
720                         self.__logger.info("SNMP query failed")
723                     counter1 = stdout.split()[3]
725                     stdout = node.run_cmd(
726                         'snmpwalk -v2c -m {0} -c public localhost {1}' .format(
727                             snmp_mib_files, snmp_mib_strings))
728                     self.__logger.info("{}" .format(stdout))
730                         self.__logger.info("No output from snmpwalk")
731                     elif 'OID' in stdout:
733                             "SNMP query failed during second check")
734                         self.__logger.info("waiting for 10 sec")
736                     stdout = node.run_cmd(
737                         'snmpwalk -v2c -m {0} -c public localhost {1}' .format(
738                             snmp_mib_files, snmp_mib_strings))
739                     self.__logger.info("{}" .format(stdout))
741                         self.__logger.info("No output from snmpwalk")
742                     elif 'OID' in stdout:
743                         self.__logger.info("SNMP query failed again")
744                         self.__logger.info("Failing this test case")
747                     counter2 = stdout.split()[3]
749                     if counter1 == counter2:
756     def check_localagent_dummy_included(self, compute, name):
757         """Check if dummy collectd config by LocalAgent
758         is included in collectd.conf file.
761         compute -- compute node instance
762         name -- config file name
# After verifying the file exists, it is deleted so the check leaves no
# residue for the next run.
# NOTE(review): loop header, returns and the .format() argument of
# `fullpath` are missing from this capture.
764         compute_name = compute.get_name()
765         nodes = get_apex_nodes()
767             if compute_name == node.get_dict()['name']:
768                 dummy_conf = node.run_cmd('ls /etc/collectd/collectd.conf.d')
769                 if name + '.conf' not in dummy_conf:
770                     self.__logger.error('check conf FAIL')
773                     self.__logger.info('check conf PASS')
774                     fullpath = '/etc/collectd/collectd.conf.d/{}'.format(
776                     self.__logger.info('Delete file {}'.format(fullpath))
777                     node.run_cmd('sudo rm -f ' + fullpath)
779         self.__logger.error('Some panic, compute not found')
782     def create_testvm(self, compute_node, test_name):
# Boot a CirrOS VM pinned (via availability zone 'nova:<host>') to the
# given compute node; returns the created resource ids for later
# cleanup by delete_testvm.
# NOTE(review): loop headers, the controller-not-found guard, the image
# download command opening, and the flavor-create .format() argument
# are missing from this capture.
783         nodes = get_apex_nodes()
784         compute_name = compute_node.get_name()
786         controller_node = None
788             if node.is_controller():
789                 controller_node = node
792         self.__logger.debug('Creating Test VM on {}' .format(compute_name))
793         self.__logger.debug('Create command is executed in {}' .format(
794             (controller_node.get_dict()['name'])))
796         image_filename = 'cirros-0.4.0-x86_64-disk.img'
797         controller_node.run_cmd(
799             'http://download.cirros-cloud.net/0.4.0/'
802         node.put_file(constants.ENV_FILE, 'overcloudrc.v3')
803         image = controller_node.run_cmd(
804             'source overcloudrc.v3;'
805             'openstack image create -f value -c id'
806             ' --disk-format qcow2 --file {0} {1}'
807             .format(image_filename, test_name))
808         flavor = controller_node.run_cmd(
809             'source overcloudrc.v3;'
810             'openstack flavor create -f value -c id {}'
# Map the compute's short name to the hypervisor hostname nova expects.
812         host = controller_node.run_cmd(
813             'source overcloudrc.v3;'
814             'openstack hypervisor list -f value -c "Hypervisor Hostname"'
816             .format(compute_name))
817         server = controller_node.run_cmd(
818             'source overcloudrc.v3;'
819             'openstack server create -f value -c id'
820             ' --image {0} --flavor {1} --availability-zone {2} {3}'
821             .format(image, flavor, 'nova:' + host, test_name))
823         resources = {"image": image, "flavor": flavor, "server": server}
826         self.__logger.debug('VM created')
827         self.__logger.debug('VM info: {}'.format(resources))
831     def delete_testvm(self, resources):
# Tear down the server/flavor/image created by create_testvm, in
# dependency order (server first, image last).
# NOTE(review): loop header, the controller-not-found guard and the
# `if server:`/`if flavor:`/`if image:` guards around each delete are
# missing from this capture.
832         nodes = get_apex_nodes()
834         controller_node = None
836             if node.is_controller():
837                 controller_node = node
840         self.__logger.debug('Deleteing Test VM')
841         self.__logger.debug('VM to be deleted info: {}'.format(resources))
842         self.__logger.debug('Delete command is executed in {}' .format(
843             (controller_node.get_dict()['name'])))
845         server = resources.get('server', None)
846         flavor = resources.get('flavor', None)
847         image = resources.get('image', None)
849         controller_node.run_cmd(
850             'source overcloudrc.v3;'
851             'openstack server delete {}'.format(server))
853         controller_node.run_cmd(
854             'source overcloudrc.v3;'
855             'openstack flavor delete {}'.format(flavor))
857         controller_node.run_cmd(
858             'source overcloudrc.v3;'
859             'openstack image delete {}'.format(image))
861         self.__logger.debug('VM and other OpenStack resources deleted')
863 def test_localagent_infofetch_get_data(self, compute, test_name):
864 compute_name = compute.get_name()
865 nodes = get_apex_nodes()
867 if compute_name == node.get_dict()['name']:
868 stdout = node.run_cmd(
869 'redis-cli keys "barometer-localagent/vm/*/vminfo"'
870 ' | while read k; do redis-cli get $k; done'
871 ' | grep {}'.format(test_name))
872 self.__logger.debug('InfoFetch data: {}'.format(stdout))
873 if stdout and test_name in stdout:
874 self.__logger.info('PASS')
877 self.__logger.info('No test vm info')
879 self.__logger.info('FAIL')