X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=doctor_tests%2Fmain.py;h=8051ad6e756a6b088a218feef8cd311708105861;hb=8893fda6dd5db6d8b1543228107e6dd423fbc8a6;hp=0ea9d36b9ed4c4e1cab0cab58346e5f1154e0b32;hpb=171f4b42c7d6e179a987b98b46d8c9ae6f2fc36d;p=doctor.git

diff --git a/doctor_tests/main.py b/doctor_tests/main.py
index 0ea9d36b..8051ad6e 100644
--- a/doctor_tests/main.py
+++ b/doctor_tests/main.py
@@ -9,6 +9,7 @@
 import os
 from os.path import isfile, join
 import random
+import signal
 import sys
 import time

@@ -33,10 +34,9 @@ from doctor_tests.scenario.network_failure import NetworkFault
 from doctor_tests.user import User


-LOG = doctor_log.Logger('doctor').getLogger()
-
-# TODO (r-mibu): fix doctor logger or consider logfile option
-LOG_FILE = LOG.handlers[0].baseFilename
+Logger = doctor_log.Logger('doctor')
+LOG = Logger.getLogger()
+LogFile = Logger.getLogFilename()


 class DoctorTest(object):
@@ -69,6 +69,9 @@ class DoctorTest(object):

         # creating test user...
         self.user.create()
+
+    def setup_fault_management(self):
+        # user settings...
         self.user.update_quota()

         # creating VM...
@@ -80,24 +83,27 @@ class DoctorTest(object):
         self.alarm.create()

         # starting doctor sample components...
-        self.inspector.start()
+        # tbd tojuvone: move inspector and consumer to common setup
+        # when they support updating VMs via instance.create and
+        # instance.delete alarm
+        self.inspector.start()
+        self.consumer.start()

         self.down_host = self.get_host_info_for_random_vm()
         self.monitor.start(self.down_host)
-        self.consumer.start()
-
-    def run(self):
-        """run doctor test"""
+    def test_fault_management(self):
         try:
-            LOG.info('doctor test starting.......')
+            LOG.info('doctor fault management test starting.......')

             # prepare test env
-            self.setup()
+            self.setup_fault_management()

             # wait for aodh alarms are updated in caches for event evaluator,
-            # sleep time should be larger than event_alarm_cache_ttl(default 60)
-            time.sleep(60)
+            # sleep time should be larger than event_alarm_cache_ttl
+            # (default 60)
+            # (tojuvone) Fraser currently needs 120
+            time.sleep(120)

             # injecting host failure...
             # NOTE (umar) add INTERFACE_NAME logic to host injection
@@ -109,17 +115,63 @@ class DoctorTest(object):
             # NOTE (umar) copy remote monitor.log file when monitor=collectd
             self.check_host_status(self.down_host.name, 'down')

-            notification_time = calculate_notification_time(LOG_FILE)
+            notification_time = calculate_notification_time(LogFile)
             if notification_time < 1 and notification_time > 0:
-                LOG.info('doctor test successfully, notification_time=%s' % notification_time)
+                LOG.info('doctor fault management test successfully, '
+                         'notification_time=%s' % notification_time)
             else:
-                LOG.error('doctor test failed, notification_time=%s' % notification_time)
+                LOG.error('doctor fault management test failed, '
+                          'notification_time=%s' % notification_time)
                 sys.exit(1)

             if self.conf.profiler_type:
-                LOG.info('doctor test begin to run profile.......')
+                LOG.info('doctor fault management test begin to run '
+                         'profile.......')
                 self.collect_logs()
                 self.run_profiler()
+        except Exception as e:
+            LOG.error('doctor fault management test failed, '
+                      'Exception=%s' % e)
+            sys.exit(1)
+        finally:
+            self.cleanup_fault_management()
+
+    def _amount_compute_nodes(self):
+        services = self.nova.services.list(binary='nova-compute')
+        return len(services)
+
+    def test_maintenance(self):
+        cnodes = self._amount_compute_nodes()
+        if cnodes < 3:
+            # need 2 compute for redundancy and one spare to migrate
+            LOG.info('not enough compute nodes, skipping doctor '
+                     'maintenance test')
+            return
+        try:
+            LOG.info('doctor maintenance test starting.......')
+            # TODO (tojuvone) test setup and actual test
+        except Exception as e:
+            LOG.error('doctor maintenance test failed, Exception=%s' % e)
+            sys.exit(1)
+        # TODO (tojuvone) finally: test case specific cleanup
+
+    def run(self):
+        """run doctor tests"""
+        try:
+            LOG.info('doctor test starting.......')
+            # prepare common test env
+            self.setup()
+            if self.conf.test_case == 'all':
+                self.test_fault_management()
+                self.test_maintenance()
+            else:
+                function = 'test_%s' % self.conf.test_case
+                if hasattr(self, function):
+                    getattr(self, function)()
+                else:
+                    raise Exception('Can not find function <%s> in'
+                                    'DoctorTest, see config manual'
+                                    % function)
         except Exception as e:
             LOG.error('doctor test failed, Exception=%s' % e)
             sys.exit(1)
@@ -130,13 +182,11 @@ class DoctorTest(object):
         num = random.randint(0, self.conf.instance_count - 1)
         vm_name = "%s%d" % (self.conf.instance_basename, num)

-        servers = \
-            {getattr(server, 'name'): server
-             for server in self.nova.servers.list()}
+        servers = {getattr(server, 'name'): server
+                   for server in self.nova.servers.list()}
         server = servers.get(vm_name)
         if not server:
-            raise \
-                Exception('Can not find instance: vm_name(%s)' % vm_name)
+            raise Exception('Can not find instance: vm_name(%s)' % vm_name)
         host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
         host_ip = self.installer.get_host_ip_from_hostname(host_name)

@@ -145,13 +195,15 @@ class DoctorTest(object):
         return Host(host_name, host_ip)

     def check_host_status(self, hostname, state):
-        service = self.nova.services.list(host=hostname, binary='nova-compute')
+        service = self.nova.services.list(host=hostname,
+                                          binary='nova-compute')
         host_state = service[0].__dict__.get('state')
         assert host_state == state

     def unset_forced_down_hosts(self):
         if self.down_host:
-            self.nova.services.force_down(self.down_host.name, 'nova-compute', False)
+            self.nova.services.force_down(self.down_host.name,
+                                          'nova-compute', False)
             time.sleep(2)
             self.check_host_status(self.down_host.name, 'up')

@@ -159,55 +211,69 @@ class DoctorTest(object):
         self.fault.get_disable_network_log()

     def run_profiler(self):
-        test_dir = os.path.split(os.path.realpath(__file__))[0]

+        net_down_log_file = self.fault.get_disable_network_log()
         reg = '(?<=doctor set link down at )\d+.\d+'
-        linkdown = float(match_rep_in_file(reg, LOG_FILE).group(0))
+        linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0))

         reg = '(.* doctor mark vm.* error at )(\d+.\d+)'
-        vmdown = float(match_rep_in_file(reg, LOG_FILE).group(2))
+        vmdown = float(match_rep_in_file(reg, LogFile).group(2))

         reg = '(.* doctor mark host.* down at )(\d+.\d+)'
-        hostdown = float(match_rep_in_file(reg, LOG_FILE).group(2))
+        hostdown = float(match_rep_in_file(reg, LogFile).group(2))

         reg = '(?<=doctor monitor detected at )\d+.\d+'
-        detected = float(match_rep_in_file(reg, LOG_FILE).group(0))
+        detected = float(match_rep_in_file(reg, LogFile).group(0))

         reg = '(?<=doctor consumer notified at )\d+.\d+'
-        notified = float(match_rep_in_file(reg, LOG_FILE).group(0))
+        notified = float(match_rep_in_file(reg, LogFile).group(0))

         # TODO(yujunz) check the actual delay to verify time sync status
         # expected ~1s delay from $trigger to $linkdown
         relative_start = linkdown
-        os.environ['DOCTOR_PROFILER_T00'] = str(int((linkdown - relative_start)*1000))
-        os.environ['DOCTOR_PROFILER_T01'] = str(int((detected - relative_start) * 1000))
-        os.environ['DOCTOR_PROFILER_T03'] = str(int((vmdown - relative_start) * 1000))
-        os.environ['DOCTOR_PROFILER_T04'] = str(int((hostdown - relative_start) * 1000))
-        os.environ['DOCTOR_PROFILER_T09'] = str(int((notified - relative_start) * 1000))
+        os.environ['DOCTOR_PROFILER_T00'] = (
+            str(int((linkdown - relative_start) * 1000)))
+        os.environ['DOCTOR_PROFILER_T01'] = (
+            str(int((detected - relative_start) * 1000)))
+        os.environ['DOCTOR_PROFILER_T03'] = (
+            str(int((vmdown - relative_start) * 1000)))
+        os.environ['DOCTOR_PROFILER_T04'] = (
+            str(int((hostdown - relative_start) * 1000)))
+        os.environ['DOCTOR_PROFILER_T09'] = (
+            str(int((notified - relative_start) * 1000)))

         profiler_main(log=LOG)

-    def cleanup(self):
+    def cleanup_fault_management(self):
         self.unset_forced_down_hosts()
         self.inspector.stop()
         self.monitor.stop()
         self.consumer.stop()
-        self.installer.cleanup()
         self.alarm.delete()
         self.instance.delete()
         self.network.delete()
-        self.image.delete()
         self.fault.cleanup()
+
+    def cleanup(self):
+        self.installer.cleanup()
+        self.image.delete()
         self.user.delete()
+        # Kill possible hanging subprocess
+        os.killpg(0, signal.SIGKILL)


 def main():
     """doctor main"""
+    # TODO (tojuvone): JIRA DOCTOR-123: Test cases have some issue to always
+    # kill all subprocesses. To ensure they are killed this group is done so
+    # all processes can be killed without knowing what they are.
+    os.setpgrp()
     test_dir = os.path.split(os.path.realpath(__file__))[0]
     doctor_root_dir = os.path.dirname(test_dir)
     config_file_dir = '{0}/{1}'.format(doctor_root_dir, 'etc/')
-    config_files = [join(config_file_dir, f) for f in os.listdir(config_file_dir)
+    config_files = [join(config_file_dir, f)
+                    for f in os.listdir(config_file_dir)
                     if isfile(join(config_file_dir, f))]

     conf = config.prepare_conf(args=sys.argv[1:],
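
A note on the run_profiler() change above: timestamps are grepped out of the log files with the project's match_rep_in_file() helper and handed to the profiler as DOCTOR_PROFILER_Txx environment variables, in milliseconds relative to the link-down event (so T00 is 0 by construction). The following is a minimal stdlib-only sketch of that pattern; first_match() and the log file names are illustrative stand-ins, not the doctor API:

    import os
    import re

    def first_match(pattern, path):
        # Stand-in for doctor's match_rep_in_file(): return the first
        # regex match found in the given log file.
        with open(path) as f:
            for line in f:
                m = re.search(pattern, line)
                if m:
                    return m
        raise ValueError('pattern %r not found in %s' % (pattern, path))

    # The logs embed epoch-second timestamps in free-form lines, e.g.
    # "doctor set link down at 1528191922.123".
    linkdown = float(first_match(r'(?<=doctor set link down at )\d+\.\d+',
                                 'disable_network.log').group(0))
    detected = float(first_match(r'(?<=doctor monitor detected at )\d+\.\d+',
                                 'doctor.log').group(0))

    # Publish milliseconds relative to the link-down event; T01 is then
    # the fault-detection latency.
    relative_start = linkdown
    os.environ['DOCTOR_PROFILER_T00'] = str(int((linkdown - relative_start) * 1000))
    os.environ['DOCTOR_PROFILER_T01'] = str(int((detected - relative_start) * 1000))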
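
The os.setpgrp()/os.killpg() pair added in main() and cleanup() addresses the DOCTOR-123 TODO: the sample inspector, monitor and consumer can leave subprocesses hanging, so main() makes itself a process-group leader and cleanup() SIGKILLs the whole group instead of tracking individual pids. A minimal sketch of the pattern, assuming the test runner owns every process in its group:

    import os
    import signal
    import subprocess

    # Become the leader of a fresh process group; children and
    # grandchildren started from here inherit the group id.
    os.setpgrp()

    child = subprocess.Popen(['sleep', '1000'])  # stand-in for a sample component

    try:
        pass  # ... run the tests ...
    finally:
        # Signal the caller's own process group (pgid 0). This reaps any
        # hanging subprocess without knowing its pid, but it also kills
        # the current process, so it must be the very last action.
        os.killpg(0, signal.SIGKILL)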