X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=doctor_tests%2Fmain.py;h=8051ad6e756a6b088a218feef8cd311708105861;hb=8893fda6dd5db6d8b1543228107e6dd423fbc8a6;hp=0ea9d36b9ed4c4e1cab0cab58346e5f1154e0b32;hpb=171f4b42c7d6e179a987b98b46d8c9ae6f2fc36d;p=doctor.git

diff --git a/doctor_tests/main.py b/doctor_tests/main.py
index 0ea9d36b..8051ad6e 100644
--- a/doctor_tests/main.py
+++ b/doctor_tests/main.py
@@ -9,6 +9,7 @@
 import os
 from os.path import isfile, join
 import random
+import signal
 import sys
 import time

@@ -33,10 +34,9 @@ from doctor_tests.scenario.network_failure import NetworkFault
 from doctor_tests.user import User


-LOG = doctor_log.Logger('doctor').getLogger()
-
-# TODO (r-mibu): fix doctor logger or consider logfile option
-LOG_FILE = LOG.handlers[0].baseFilename
+Logger = doctor_log.Logger('doctor')
+LOG = Logger.getLogger()
+LogFile = Logger.getLogFilename()


 class DoctorTest(object):
@@ -69,6 +69,9 @@ class DoctorTest(object):

         # creating test user...
         self.user.create()
+
+    def setup_fault_management(self):
+        # user settings...
         self.user.update_quota()

         # creating VM...
@@ -80,24 +83,27 @@ class DoctorTest(object):
         self.alarm.create()

         # starting doctor sample components...
-        self.inspector.start()
+        # tbd tojuvone: move inspector and consumer to common setup
+        # when they support updating VMs via instance.create and
+        # instance.delete alarm
+        self.inspector.start()
+        self.consumer.start()

         self.down_host = self.get_host_info_for_random_vm()
         self.monitor.start(self.down_host)
-        self.consumer.start()
-
-    def run(self):
-        """run doctor test"""
+    def test_fault_management(self):
         try:
-            LOG.info('doctor test starting.......')
+            LOG.info('doctor fault management test starting.......')

             # prepare test env
-            self.setup()
+            self.setup_fault_management()

             # wait for aodh alarms are updated in caches for event evaluator,
-            # sleep time should be larger than event_alarm_cache_ttl(default 60)
-            time.sleep(60)
+            # sleep time should be larger than event_alarm_cache_ttl
+            # (default 60)
+            # (tojuvone) Fraser currently needs 120
+            time.sleep(120)

             # injecting host failure...
             # NOTE (umar) add INTERFACE_NAME logic to host injection
@@ -109,17 +115,63 @@ class DoctorTest(object):
             # NOTE (umar) copy remote monitor.log file when monitor=collectd
             self.check_host_status(self.down_host.name, 'down')

-            notification_time = calculate_notification_time(LOG_FILE)
+            notification_time = calculate_notification_time(LogFile)
             if notification_time < 1 and notification_time > 0:
-                LOG.info('doctor test successfully, notification_time=%s' % notification_time)
+                LOG.info('doctor fault management test successfully, '
+                         'notification_time=%s' % notification_time)
             else:
-                LOG.error('doctor test failed, notification_time=%s' % notification_time)
+                LOG.error('doctor fault management test failed, '
+                          'notification_time=%s' % notification_time)
                 sys.exit(1)

             if self.conf.profiler_type:
-                LOG.info('doctor test begin to run profile.......')
+                LOG.info('doctor fault management test begin to run '
+                         'profile.......')
                 self.collect_logs()
                 self.run_profiler()
+        except Exception as e:
+            LOG.error('doctor fault management test failed, '
+                      'Exception=%s' % e)
+            sys.exit(1)
+        finally:
+            self.cleanup_fault_management()
+
+    def _amount_compute_nodes(self):
+        services = self.nova.services.list(binary='nova-compute')
+        return len(services)
+
+    def test_maintenance(self):
+        cnodes = self._amount_compute_nodes()
+        if cnodes < 3:
+            # need 2 compute for redundancy and one spare to migrate
+            LOG.info('not enough compute nodes, skipping doctor '
+                     'maintenance test')
+            return
+        try:
+            LOG.info('doctor maintenance test starting.......')
+            # TODO (tojuvone) test setup and actual test
+        except Exception as e:
+            LOG.error('doctor maintenance test failed, Exception=%s' % e)
+            sys.exit(1)
+        # TODO (tojuvone) finally: test case specific cleanup
+
+    def run(self):
+        """run doctor tests"""
+        try:
+            LOG.info('doctor test starting.......')
+            # prepare common test env
+            self.setup()
+            if self.conf.test_case == 'all':
+                self.test_fault_management()
+                self.test_maintenance()
+            else:
+                function = 'test_%s' % self.conf.test_case
+                if hasattr(self, function):
+                    getattr(self, function)()
+                else:
+                    raise Exception('Can not find function <%s> in'
+                                    'DoctorTest, see config manual'
+                                    % function)
         except Exception as e:
             LOG.error('doctor test failed, Exception=%s' % e)
             sys.exit(1)
@@ -130,13 +182,11 @@ class DoctorTest(object):
         num = random.randint(0, self.conf.instance_count - 1)
         vm_name = "%s%d" % (self.conf.instance_basename, num)

-        servers = \
-            {getattr(server, 'name'): server
-             for server in self.nova.servers.list()}
+        servers = {getattr(server, 'name'): server
+                   for server in self.nova.servers.list()}
         server = servers.get(vm_name)
         if not server:
-            raise \
-                Exception('Can not find instance: vm_name(%s)' % vm_name)
+            raise Exception('Can not find instance: vm_name(%s)' % vm_name)
         host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
         host_ip = self.installer.get_host_ip_from_hostname(host_name)

@@ -145,13 +195,15 @@ class DoctorTest(object):
         return Host(host_name, host_ip)

     def check_host_status(self, hostname, state):
-        service = self.nova.services.list(host=hostname, binary='nova-compute')
+        service = self.nova.services.list(host=hostname,
+                                          binary='nova-compute')
         host_state = service[0].__dict__.get('state')
         assert host_state == state

     def unset_forced_down_hosts(self):
         if self.down_host:
-            self.nova.services.force_down(self.down_host.name, 'nova-compute', False)
+            self.nova.services.force_down(self.down_host.name,
+                                          'nova-compute', False)
             time.sleep(2)
             self.check_host_status(self.down_host.name, 'up')

@@ -159,55 +211,69 @@ class DoctorTest(object):
         self.fault.get_disable_network_log()

     def run_profiler(self):
-        test_dir = os.path.split(os.path.realpath(__file__))[0]

+        net_down_log_file = self.fault.get_disable_network_log()
         reg = '(?<=doctor set link down at )\d+.\d+'
-        linkdown = float(match_rep_in_file(reg, LOG_FILE).group(0))
+        linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0))

         reg = '(.* doctor mark vm.* error at )(\d+.\d+)'
-        vmdown = float(match_rep_in_file(reg, LOG_FILE).group(2))
+        vmdown = float(match_rep_in_file(reg, LogFile).group(2))

         reg = '(.* doctor mark host.* down at )(\d+.\d+)'
-        hostdown = float(match_rep_in_file(reg, LOG_FILE).group(2))
+        hostdown = float(match_rep_in_file(reg, LogFile).group(2))

         reg = '(?<=doctor monitor detected at )\d+.\d+'
-        detected = float(match_rep_in_file(reg, LOG_FILE).group(0))
+        detected = float(match_rep_in_file(reg, LogFile).group(0))

         reg = '(?<=doctor consumer notified at )\d+.\d+'
-        notified = float(match_rep_in_file(reg, LOG_FILE).group(0))
+        notified = float(match_rep_in_file(reg, LogFile).group(0))

         # TODO(yujunz) check the actual delay to verify time sync status
         # expected ~1s delay from $trigger to $linkdown
         relative_start = linkdown
-        os.environ['DOCTOR_PROFILER_T00'] = str(int((linkdown - relative_start)*1000))
-        os.environ['DOCTOR_PROFILER_T01'] = str(int((detected - relative_start) * 1000))
-        os.environ['DOCTOR_PROFILER_T03'] = str(int((vmdown - relative_start) * 1000))
-        os.environ['DOCTOR_PROFILER_T04'] = str(int((hostdown - relative_start) * 1000))
-        os.environ['DOCTOR_PROFILER_T09'] = str(int((notified - relative_start) * 1000))
+        os.environ['DOCTOR_PROFILER_T00'] = (
+            str(int((linkdown - relative_start) * 1000)))
+        os.environ['DOCTOR_PROFILER_T01'] = (
+            str(int((detected - relative_start) * 1000)))
+        os.environ['DOCTOR_PROFILER_T03'] = (
+            str(int((vmdown - relative_start) * 1000)))
+        os.environ['DOCTOR_PROFILER_T04'] = (
+            str(int((hostdown - relative_start) * 1000)))
+        os.environ['DOCTOR_PROFILER_T09'] = (
+            str(int((notified - relative_start) * 1000)))

         profiler_main(log=LOG)

-    def cleanup(self):
+    def cleanup_fault_management(self):
         self.unset_forced_down_hosts()
         self.inspector.stop()
         self.monitor.stop()
         self.consumer.stop()
-        self.installer.cleanup()
         self.alarm.delete()
         self.instance.delete()
         self.network.delete()
-        self.image.delete()
         self.fault.cleanup()
+
+    def cleanup(self):
+        self.installer.cleanup()
+        self.image.delete()
         self.user.delete()
+        # Kill possible hanging subprocess
+        os.killpg(0, signal.SIGKILL)


 def main():
     """doctor main"""
+    # TODO (tojuvone): JIRA DOCTOR-123: Test cases have some issue to always
+    # kill all subprocesses. To ensure they are killed this group is done so
+    # all processes can be killed without knowing what they are.
+    os.setpgrp()
     test_dir = os.path.split(os.path.realpath(__file__))[0]
     doctor_root_dir = os.path.dirname(test_dir)
     config_file_dir = '{0}/{1}'.format(doctor_root_dir, 'etc/')
-    config_files = [join(config_file_dir, f) for f in os.listdir(config_file_dir)
+    config_files = [join(config_file_dir, f)
+                    for f in os.listdir(config_file_dir)
                     if isfile(join(config_file_dir, f))]

     conf = config.prepare_conf(args=sys.argv[1:],
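
A note on the run_profiler() change above: timestamps are grepped out of the log files with the project's match_rep_in_file() helper and handed to the profiler as DOCTOR_PROFILER_Txx environment variables, in milliseconds relative to the link-down event (so T00 is 0 by construction). The following is a minimal stdlib-only sketch of that pattern; first_match() and the log file names are illustrative stand-ins, not the doctor API:

    import os
    import re

    def first_match(pattern, path):
        # Stand-in for doctor's match_rep_in_file(): return the first
        # regex match found in the given log file.
        with open(path) as f:
            for line in f:
                m = re.search(pattern, line)
                if m:
                    return m
        raise ValueError('pattern %r not found in %s' % (pattern, path))

    # The logs embed epoch-second timestamps in free-form lines, e.g.
    # "doctor set link down at 1528191922.123".
    linkdown = float(first_match(r'(?<=doctor set link down at )\d+\.\d+',
                                 'disable_network.log').group(0))
    detected = float(first_match(r'(?<=doctor monitor detected at )\d+\.\d+',
                                 'doctor.log').group(0))

    # Publish milliseconds relative to the link-down event; T01 is then
    # the fault-detection latency.
    relative_start = linkdown
    os.environ['DOCTOR_PROFILER_T00'] = str(int((linkdown - relative_start) * 1000))
    os.environ['DOCTOR_PROFILER_T01'] = str(int((detected - relative_start) * 1000))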
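
The os.setpgrp()/os.killpg() pair added in main() and cleanup() addresses the DOCTOR-123 TODO: the sample inspector, monitor and consumer can leave subprocesses hanging, so main() makes itself a process-group leader and cleanup() SIGKILLs the whole group instead of tracking individual pids. A minimal sketch of the pattern, assuming the test runner owns every process in its group:

    import os
    import signal
    import subprocess

    # Become the leader of a fresh process group; children and
    # grandchildren started from here inherit the group id.
    os.setpgrp()

    child = subprocess.Popen(['sleep', '1000'])  # stand-in for a sample component

    try:
        pass  # ... run the tests ...
    finally:
        # Signal the caller's own process group (pgid 0). This reaps any
        # hanging subprocess without knowing its pid, but it also kills
        # the current process, so it must be the very last action.
        os.killpg(0, signal.SIGKILL)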