Maintenance support for latest Fenix, python3 and Fuel
[doctor.git] / doctor_tests / main.py
index 3f34635..cdb4af5 100644 (file)
@@ -1,5 +1,5 @@
 ##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
+# Copyright (c) 2019 ZTE Corporation and others.
 #
 # All rights reserved. This program and the accompanying materials
 # are made available under the terms of the Apache License, Version 2.0
@@ -8,28 +8,19 @@
 ##############################################################################
 import os
 from os.path import isfile, join
-import random
 import sys
 import time
+from traceback import format_exc
 
-from doctor_tests.alarm import Alarm
-from doctor_tests.common.constants import Host
-from doctor_tests.common.utils import match_rep_in_file
 from doctor_tests import config
-from doctor_tests.consumer import get_consumer
 from doctor_tests.identity_auth import get_identity_auth
 from doctor_tests.identity_auth import get_session
 from doctor_tests.image import Image
-from doctor_tests.instance import Instance
-from doctor_tests.inspector import get_inspector
 from doctor_tests.installer import get_installer
 import doctor_tests.logger as doctor_log
-from doctor_tests.network import Network
-from doctor_tests.monitor import get_monitor
+from doctor_tests.scenario.fault_management import FaultManagement
 from doctor_tests.os_clients import nova_client
-from doctor_tests.profiler_poc import main as profiler_main
-from doctor_tests.scenario.common import calculate_notification_time
-from doctor_tests.scenario.network_failure import NetworkFault
+from doctor_tests.scenario.maintenance import Maintenance
 from doctor_tests.user import User
 
 
@@ -44,20 +35,10 @@ class DoctorTest(object):
         self.conf = conf
         self.image = Image(self.conf, LOG)
         self.user = User(self.conf, LOG)
-        self.network = Network(self.conf, LOG)
-        self.instance = Instance(self.conf, LOG)
-        self.alarm = Alarm(self.conf, LOG)
         self.installer = get_installer(self.conf, LOG)
-        self.inspector = get_inspector(self.conf, LOG)
-        self.monitor = get_monitor(self.conf,
-                                   self.inspector.get_inspector_url(),
-                                   LOG)
-        self.consumer = get_consumer(self.conf, LOG)
-        self.fault = NetworkFault(self.conf, self.installer, LOG)
         auth = get_identity_auth(project=self.conf.doctor_project)
         self.nova = nova_client(self.conf.nova_version,
                                 get_session(auth=auth))
-        self.down_host = None
 
     def setup(self):
         # prepare the cloud env
@@ -68,145 +49,110 @@ class DoctorTest(object):
 
         # creating test user...
         self.user.create()
-        self.user.update_quota()
 
-        # creating VM...
-        self.network.create()
-        self.instance.create()
-        self.instance.wait_for_vm_launch()
-
-        # creating alarm...
-        self.alarm.create()
-
-        # starting doctor sample components...
-        self.inspector.start()
-
-        self.down_host = self.get_host_info_for_random_vm()
-        self.monitor.start(self.down_host)
-
-        self.consumer.start()
-
-    def run(self):
-        """run doctor test"""
+    def test_fault_management(self):
+        """Run the fault management scenario.
+
+        Creates a FaultManagement scenario using the installer's transport
+        URL, sets it up, starts the fault and then checks the host status
+        and the notification time. Any exception is logged together with
+        its traceback and the process exits with status 1. The scenario is
+        cleaned up in every case, provided it was actually created.
+        """
+        # Reset first: if creating the scenario fails, the finally clause
+        # must not raise AttributeError (or reuse a stale scenario) and
+        # thereby hide the real failure.
+        self.fault_management = None
         try:
-            LOG.info('doctor test starting.......')
+            LOG.info('doctor fault management test starting.......')
+            transport_url = self.installer.get_transport_url()
+            self.fault_management = \
+                FaultManagement(self.conf, self.installer, self.user, LOG,
+                                transport_url)
 
             # prepare test env
-            self.setup()
+            self.fault_management.setup()
 
             # wait for aodh alarms are updated in caches for event evaluator,
             # sleep time should be larger than event_alarm_cache_ttl
             # (default 60)
-            time.sleep(60)
+            # (tojuvone) Fraser currently needs 120
+            time.sleep(120)
 
             # injecting host failure...
             # NOTE (umar) add INTERFACE_NAME logic to host injection
-
-            self.fault.start(self.down_host)
-            time.sleep(10)
+            self.fault_management.start()
+            time.sleep(30)
 
             # verify the test results
             # NOTE (umar) copy remote monitor.log file when monitor=collectd
-            self.check_host_status(self.down_host.name, 'down')
+            self.fault_management.check_host_status('down')
+            self.fault_management.check_notification_time()
+
+        except Exception as e:
+            LOG.error('doctor fault management test failed, '
+                      'Exception=%s' % e)
+            LOG.error(format_exc())
+            sys.exit(1)
+        finally:
+            # Only clean up a scenario that was successfully constructed.
+            if self.fault_management is not None:
+                self.fault_management.cleanup()
+
+    def _amount_compute_nodes(self):
+        """Return how many 'nova-compute' services nova lists."""
+        return len(self.nova.services.list(binary='nova-compute'))
+
+    def test_maintenance(self):
+        """Run the maintenance scenario, or skip it when not applicable.
+
+        The test is skipped (an info message is logged and the method
+        returns) when fewer than three nova-compute services are listed or
+        when the configured installer type is neither 'apex' nor 'fuel'.
+        Otherwise a Maintenance scenario is set up for the test user,
+        started, and waited on until it completes. Any exception is logged
+        together with its traceback and the process exits with status 1.
+        The scenario is cleaned up in every case, provided it was created.
+        """
+        cnodes = self._amount_compute_nodes()
+        if cnodes < 3:
+            # need 2 compute for redundancy and one spare to migrate
+            LOG.info('not enough compute nodes, skipping doctor '
+                     'maintenance test')
+            return
+        if self.conf.installer.type not in ['apex', 'fuel']:
+            LOG.info('not supported installer, skipping doctor '
+                     'maintenance test')
+            return
+        # Stays None until the scenario object exists, so the finally
+        # clause cannot raise UnboundLocalError and hide the real failure.
+        maintenance = None
+        try:
+            LOG.info('doctor maintenance test starting.......')
+            transport_url = self.installer.get_transport_url()
+            maintenance = Maintenance(transport_url, self.conf, LOG)
+            maintenance.setup_maintenance(self.user)
 
-            notification_time = calculate_notification_time(LogFile)
-            if notification_time < 1 and notification_time > 0:
-                LOG.info('doctor test successfully, notification_time=%s'
-                         % notification_time)
+            # wait until the aodh alarms are updated in the event
+            # evaluator's caches; the sleep time should be larger than
+            # event_alarm_cache_ttl (default 60)
+            LOG.info('wait aodh for 120s.......')
+            time.sleep(120)
+
+            session_id = maintenance.start_maintenance()
+            maintenance.wait_maintenance_complete(session_id)
+
+            LOG.info('doctor maintenance complete.......')
+
+        except Exception as e:
+            LOG.error('doctor maintenance test failed, Exception=%s' % e)
+            LOG.error(format_exc())
+            sys.exit(1)
+        finally:
+            # Only clean up a scenario that was successfully constructed.
+            if maintenance is not None:
+                maintenance.cleanup_maintenance()
+
+    def run(self):
+        """Run the doctor tests selected by ``conf.test_case``.
+
+        After the common setup, 'all' runs the fault management test
+        followed by the maintenance test; any other value is mapped to the
+        method named ``test_<test_case>`` on this object. An unknown test
+        case, like any other exception, is logged together with its
+        traceback and the process exits with status 1. The common cleanup
+        runs in every case.
+        """
+        try:
+            LOG.info('doctor test starting.......')
+
+            # prepare common test env
+            self.setup()
+
+            if self.conf.test_case == 'all':
+                self.test_fault_management()
+                self.test_maintenance()
             else:
-                LOG.error('doctor test failed, notification_time=%s'
-                          % notification_time)
-                sys.exit(1)
-
-            if self.conf.profiler_type:
-                LOG.info('doctor test begin to run profile.......')
-                self.collect_logs()
-                self.run_profiler()
+                function = 'test_%s' % self.conf.test_case
+                if hasattr(self, function):
+                    getattr(self, function)()
+                else:
+                    # The trailing space after 'in' keeps the two adjacent
+                    # literals from fusing into "inDoctorTest".
+                    raise Exception('Can not find function <%s> in '
+                                    'DoctorTest, see config manual'
+                                    % function)
         except Exception as e:
             LOG.error('doctor test failed, Exception=%s' % e)
+            LOG.error(format_exc())
             sys.exit(1)
         finally:
             self.cleanup()
 
-    def get_host_info_for_random_vm(self):
-        num = random.randint(0, self.conf.instance_count - 1)
-        vm_name = "%s%d" % (self.conf.instance_basename, num)
-
-        servers = \
-            {getattr(server, 'name'): server
-             for server in self.nova.servers.list()}
-        server = servers.get(vm_name)
-        if not server:
-            raise \
-                Exception('Can not find instance: vm_name(%s)' % vm_name)
-        host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
-        host_ip = self.installer.get_host_ip_from_hostname(host_name)
-
-        LOG.info('Get host info(name:%s, ip:%s) which vm(%s) launched at'
-                 % (host_name, host_ip, vm_name))
-        return Host(host_name, host_ip)
-
-    def check_host_status(self, hostname, state):
-        service = self.nova.services.list(host=hostname,
-                                          binary='nova-compute')
-        host_state = service[0].__dict__.get('state')
-        assert host_state == state
-
-    def unset_forced_down_hosts(self):
-        if self.down_host:
-            self.nova.services.force_down(self.down_host.name,
-                                          'nova-compute', False)
-            time.sleep(2)
-            self.check_host_status(self.down_host.name, 'up')
-
-    def collect_logs(self):
-        self.fault.get_disable_network_log()
-
-    def run_profiler(self):
-
-        net_down_log_file = self.fault.get_disable_network_log()
-        reg = '(?<=doctor set link down at )\d+.\d+'
-        linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0))
-
-        reg = '(.* doctor mark vm.* error at )(\d+.\d+)'
-        vmdown = float(match_rep_in_file(reg, LogFile).group(2))
-
-        reg = '(.* doctor mark host.* down at )(\d+.\d+)'
-        hostdown = float(match_rep_in_file(reg, LogFile).group(2))
-
-        reg = '(?<=doctor monitor detected at )\d+.\d+'
-        detected = float(match_rep_in_file(reg, LogFile).group(0))
-
-        reg = '(?<=doctor consumer notified at )\d+.\d+'
-        notified = float(match_rep_in_file(reg, LogFile).group(0))
-
-        # TODO(yujunz) check the actual delay to verify time sync status
-        # expected ~1s delay from $trigger to $linkdown
-        relative_start = linkdown
-        os.environ['DOCTOR_PROFILER_T00'] = \
-            str(int((linkdown - relative_start) * 1000))
-        os.environ['DOCTOR_PROFILER_T01'] = \
-            str(int((detected - relative_start) * 1000))
-        os.environ['DOCTOR_PROFILER_T03'] = \
-            str(int((vmdown - relative_start) * 1000))
-        os.environ['DOCTOR_PROFILER_T04'] = \
-            str(int((hostdown - relative_start) * 1000))
-        os.environ['DOCTOR_PROFILER_T09'] = \
-            str(int((notified - relative_start) * 1000))
-
-        profiler_main(log=LOG)
-
     def cleanup(self):
-        self.unset_forced_down_hosts()
-        self.inspector.stop()
-        self.monitor.stop()
-        self.consumer.stop()
         self.installer.cleanup()
-        self.alarm.delete()
-        self.instance.delete()
-        self.network.delete()
         self.image.delete()
-        self.fault.cleanup()
         self.user.delete()