Bug - Testing in Apex with OpenStack master fails (change 49/65449/9)
author     Tomi Juvonen <tomi.juvonen@nokia.com>
           Wed, 28 Nov 2018 09:48:27 +0000 (11:48 +0200)
committer  Tomi Juvonen <tomi.juvonen@nokia.com>
           Tue, 18 Dec 2018 10:40:53 +0000 (12:40 +0200)
Support yet another path to find config files.
Tune config changes to take effect properly for maintenance.
transport_url parsing enhanced; the fallback logic is sketched below.
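
The enhanced parsing prefers an explicit transport_url in nova.conf and only
falls back to composing a rabbit:// URL from the legacy rabbit_* options. A
rough sketch of that fallback logic (parse_transport_url is an illustrative
helper name, not code from this change):

    import re

    def parse_transport_url(nova_conf_lines, controller_ip):
        # Prefer an explicit transport_url; rewrite the host part to the
        # controller IP, since hostnames may not resolve from the test host.
        for line in nova_conf_lines:
            if line.startswith('transport_url'):
                url = line.split('=', 1)[1].strip()
                return re.sub('@.*:', '@%s:' % controller_ip, url)
        # Otherwise compose the URL from the legacy rabbit_* options.
        opts = {}
        for line in nova_conf_lines:
            if line.startswith('rabbit_'):
                key, _, value = line.partition('=')
                opts[key.strip()] = value.strip()
        return 'rabbit://%s:%s@%s:%s/?ssl=0' % (opts['rabbit_userid'],
                                                opts['rabbit_password'],
                                                controller_ip,
                                                opts['rabbit_port'])
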
Nova reset-state to error takes well over 1 second these days and
only sends the notification we rely on at the very end. The only
reasonable thing is to send the notification straight from the
Inspector, as should have been done in the first place. The total
time now comes down to 200ms, with just a few milliseconds actually
spent on sending the notification. This could be improved further
by running a node-specific Inspector agent, reacting at an even
more Telco-grade speed.
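
Sending the notification straight from the Inspector boils down to building
an oslo.messaging Notifier from the controller's transport_url and emitting a
compute.instance.update event with state "error" for each VM on the failed
host, instead of waiting for nova reset-state to publish it. A minimal
sketch, assuming a reachable RabbitMQ transport_url (make_notifier and
notify_vm_error are illustrative names, not code from this change):

    import time

    from oslo_config import cfg
    import oslo_messaging

    def make_notifier(transport_url):
        # Notifier on the 'notifications' topic, prepared with a fixed
        # publisher_id, mirroring what the sample Inspector does below.
        transport = oslo_messaging.get_notification_transport(cfg.CONF,
                                                               transport_url)
        notifier = oslo_messaging.Notifier(transport,
                                           'compute.instance.update',
                                           driver='messaging',
                                           topics=['notifications'])
        return notifier.prepare(publisher_id='sample')

    def notify_vm_error(notifier, tenant_id, instance_id):
        # Emit the payload the consumer listens for, without the ~1s round
        # trip through nova reset-state.
        payload = dict(tenant_id=tenant_id,
                       instance_id=instance_id,
                       state='error')
        notifier.info({'some': 'context'}, 'compute.instance.update',
                      payload)
        return time.time()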

Change-Id: I787f8e9dd6484842c6c568b15767018d11b36862
Signed-off-by: Tomi Juvonen <tomi.juvonen@nokia.com>
doctor_tests/inspector/__init__.py
doctor_tests/inspector/sample.py
doctor_tests/installer/apex.py
doctor_tests/installer/base.py
doctor_tests/installer/common/restore_compute_config.py
doctor_tests/installer/common/set_compute_config.py
doctor_tests/main.py
doctor_tests/scenario/fault_management.py
doctor_tests/scenario/maintenance.py

doctor_tests/inspector/__init__.py
index 31291ba..50365a6 100644 (file)
@@ -42,6 +42,10 @@ _inspector_name_class_mapping = {
 }
 
 
-def get_inspector(conf, log):
+def get_inspector(conf, log, transport_url=None):
     inspector_class = _inspector_name_class_mapping[conf.inspector.type]
-    return importutils.import_object(inspector_class, conf, log)
+    if conf.inspector.type == 'sample':
+        return importutils.import_object(inspector_class, conf, log,
+                                         transport_url)
+    else:
+        return importutils.import_object(inspector_class, conf, log)
doctor_tests/inspector/sample.py
index a55a12b..baf0306 100644 (file)
@@ -10,6 +10,7 @@ import collections
 from flask import Flask
 from flask import request
 import json
+import oslo_messaging
 import time
 from threading import Thread
 import requests
@@ -26,7 +27,7 @@ from doctor_tests.inspector.base import BaseInspector
 class SampleInspector(BaseInspector):
     event_type = 'compute.host.down'
 
-    def __init__(self, conf, log):
+    def __init__(self, conf, log, transport_url):
         super(SampleInspector, self).__init__(conf, log)
         self.inspector_url = self.get_inspector_url()
         self.novaclients = list()
@@ -43,6 +44,17 @@ class SampleInspector(BaseInspector):
         self.hostnames = list()
         self.app = None
 
+        try:
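+            # Set up an oslo.messaging notifier so the host-down event can
+            # be sent straight from the Inspector; fall back silently to
+            # nova-side handling only if the transport cannot be set up.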
+            transport = oslo_messaging.get_notification_transport(
+                self.conf, transport_url)
+            self.notif = oslo_messaging.Notifier(transport,
+                                                 'compute.instance.update',
+                                                 driver='messaging',
+                                                 topics=['notifications'])
+            self.notif = self.notif.prepare(publisher_id='sample')
+        except Exception:
+            self.notif = None
+
     def _init_novaclients(self):
         self.NUMBER_OF_CLIENTS = self.conf.instance_count
         auth = get_identity_auth(project=self.conf.doctor_project)
@@ -54,7 +66,7 @@ class SampleInspector(BaseInspector):
     def _init_servers_list(self):
         self.servers.clear()
         opts = {'all_tenants': True}
-        servers = self.nova.servers.list(search_opts=opts)
+        servers = self.nova.servers.list(detailed=True, search_opts=opts)
         for server in servers:
             try:
                 host = server.__dict__.get('OS-EXT-SRV-ATTR:host')
@@ -97,10 +109,14 @@ class SampleInspector(BaseInspector):
             event_type = event['type']
             if event_type == self.event_type:
                 self.hostnames.append(hostname)
+                if self.notif is not None:
+                    thr0 = self._send_notif(hostname)
                 thr1 = self._disable_compute_host(hostname)
                 thr2 = self._vms_reset_state('error', hostname)
                 if self.conf.inspector.update_neutron_port_dp_status:
                     thr3 = self._set_ports_data_plane_status('DOWN', hostname)
+                if self.notif is not None:
+                    thr0.join()
                 thr1.join()
                 thr2.join()
                 if self.conf.inspector.update_neutron_port_dp_status:
@@ -156,8 +172,8 @@ class SampleInspector(BaseInspector):
             nova.servers.reset_state(server, state)
             vmdown_time = time.time()
             self.vm_down_time = vmdown_time
-            self.log.info('doctor mark vm(%s) error at %s'
-                          % (server, vmdown_time))
+            self.log.info('doctor mark vm(%s) %s at %s'
+                          % (server, state, vmdown_time))
 
         thrs = []
         for nova, server in zip(self.novaclients, self.servers[hostname]):
@@ -166,6 +182,26 @@ class SampleInspector(BaseInspector):
         for t in thrs:
             t.join()
 
+    @utils.run_async
+    def _send_notif(self, hostname):
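+        # Emit one compute.instance.update per VM on the failed host; each
+        # send runs asynchronously so the total overhead stays within a few
+        # milliseconds.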
+
+        @utils.run_async
+        def _send_notif(server):
+            payload = dict(tenant_id=server.tenant_id,
+                           instance_id=server.id,
+                           state="error")
+            self.notif.info({'some': 'context'}, 'compute.instance.update',
+                            payload)
+            self.log.info('doctor compute.instance.update vm(%s) error %s'
+                          % (server, time.time()))
+
+        thrs = []
+        for server in self.servers[hostname]:
+            t = _send_notif(server)
+            thrs.append(t)
+        for t in thrs:
+            t.join()
+
     @utils.run_async
     def _set_ports_data_plane_status(self, status, hostname):
         body = {'data_plane_status': status}
doctor_tests/installer/apex.py
index 3c97378..79c59e9 100644 (file)
@@ -6,7 +6,6 @@
 # which accompanies this distribution, and is available at
 # http://www.apache.org/licenses/LICENSE-2.0
 ##############################################################################
-import re
 import time
 
 from doctor_tests.common.constants import Inspector
@@ -36,8 +35,6 @@ class ApexInstaller(BaseInstaller):
         self.key_file = None
         self.controllers = list()
         self.computes = list()
-        self.controller_clients = list()
-        self.compute_clients = list()
 
     def setup(self):
         self.log.info('Setup Apex installer start......')
@@ -83,26 +80,6 @@ class ApexInstaller(BaseInstaller):
         host_ips = self._run_cmd_remote(self.client, command)
         return host_ips[0]
 
-    def get_transport_url(self):
-        client = SSHClient(self.controllers[0], self.node_user_name,
-                           key_filename=self.key_file)
-        if self.use_containers:
-            ncbase = "/var/lib/config-data/puppet-generated/nova"
-        else:
-            ncbase = ""
-        command = 'sudo grep "^transport_url" %s/etc/nova/nova.conf' % ncbase
-
-        ret, url = client.ssh(command)
-        if ret:
-            raise Exception('Exec command to get host ip from controller(%s)'
-                            'in Apex installer failed, ret=%s, output=%s'
-                            % (self.controllers[0], ret, url))
-        # need to use ip instead of hostname
-        ret = (re.sub("@.*:", "@%s:" % self.controllers[0],
-               url[0].split("=", 1)[1]))
-        self.log.debug('get_transport_url %s' % ret)
-        return ret
-
     def _set_docker_restart_cmd(self, service):
         # There can be multiple instances running so need to restart all
         cmd = "for container in `sudo docker ps | grep "
@@ -114,22 +91,6 @@ class ApexInstaller(BaseInstaller):
     def set_apply_patches(self):
         self.log.info('Set apply patches start......')
 
-        if self.conf.test_case != 'fault_management':
-            if self.use_containers:
-                restart_cmd = self._set_docker_restart_cmd("nova-compute")
-            else:
-                restart_cmd = 'sudo systemctl restart' \
-                              ' openstack-nova-compute.service'
-            for node_ip in self.computes:
-                client = SSHClient(node_ip, self.node_user_name,
-                                   key_filename=self.key_file)
-                self.compute_clients.append(client)
-                self._run_apply_patches(client,
-                                        restart_cmd,
-                                        [self.nc_set_compute_script],
-                                        python=self.python)
-            time.sleep(10)
-
         set_scripts = [self.cm_set_script]
 
         if self.use_containers:
@@ -157,11 +118,28 @@ class ApexInstaller(BaseInstaller):
         for node_ip in self.controllers:
             client = SSHClient(node_ip, self.node_user_name,
                                key_filename=self.key_file)
-            self.controller_clients.append(client)
             self._run_apply_patches(client,
                                     restart_cmd,
                                     set_scripts,
                                     python=self.python)
+        time.sleep(5)
+
+        self.log.info('Set apply patches for compute nodes......')
+
+        if self.conf.test_case != 'fault_management':
+            if self.use_containers:
+                restart_cmd = self._set_docker_restart_cmd("nova")
+            else:
+                restart_cmd = 'sudo systemctl restart' \
+                              ' openstack-nova-compute.service'
+            for node_ip in self.computes:
+                client = SSHClient(node_ip, self.node_user_name,
+                                   key_filename=self.key_file)
+                self._run_apply_patches(client,
+                                        restart_cmd,
+                                        [self.nc_set_compute_script],
+                                        python=self.python)
+            time.sleep(5)
 
     def restore_apply_patches(self):
         self.log.info('restore apply patches start......')
@@ -190,39 +168,22 @@ class ApexInstaller(BaseInstaller):
                 restart_cmd += ' openstack-congress-server.service'
             restore_scripts.append(self.cg_restore_script)
 
-        for client, node_ip in zip(self.controller_clients, self.controllers):
-            retry = 0
-            while retry < 2:
-                try:
-                    self._run_apply_patches(client,
-                                            restart_cmd,
-                                            restore_scripts,
-                                            python=self.python)
-                except Exception:
-                    if retry > 0:
-                        raise Exception("SSHClient to %s feiled" % node_ip)
-                    client = SSHClient(node_ip, self.node_user_name,
-                                       key_filename=self.key_file)
-                    retry += 1
-                break
+        for node_ip in self.controllers:
+            client = SSHClient(node_ip, self.node_user_name,
+                               key_filename=self.key_file)
+            self._run_apply_patches(client,
+                                    restart_cmd,
+                                    restore_scripts,
+                                    python=self.python)
+
         if self.conf.test_case != 'fault_management':
             if self.use_containers:
                 restart_cmd = self._set_docker_restart_cmd("nova-compute")
             else:
                 restart_cmd = 'sudo systemctl restart' \
                               ' openstack-nova-compute.service'
-            for client, node_ip in zip(self.compute_clients, self.computes):
-                retry = 0
-                while retry < 2:
-                    try:
-                        self._run_apply_patches(
-                            client, restart_cmd,
-                            [self.nc_restore_compute_script],
-                            python=self.python)
-                    except Exception:
-                        if retry > 0:
-                            raise Exception("SSHClient to %s feiled" % node_ip)
-                        client = SSHClient(node_ip, self.node_user_name,
-                                           key_filename=self.key_file)
-                        retry += 1
-                    break
+            for node_ip in self.computes:
+                client = SSHClient(node_ip, self.node_user_name,
+                                   key_filename=self.key_file)
+                self._run_apply_patches(
+                    client, restart_cmd,
+                    [self.nc_restore_compute_script],
+                    python=self.python)
doctor_tests/installer/base.py
index 124b191..df781ee 100644 (file)
@@ -11,6 +11,7 @@ import getpass
 import grp
 import os
 import pwd
+import re
 import six
 import stat
 import subprocess
@@ -126,6 +127,48 @@ class BaseInstaller(object):
         os.chmod(ssh_key, stat.S_IREAD)
         return ssh_key
 
+    def get_transport_url(self):
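+        # Read the messaging transport_url from a controller's nova.conf,
+        # falling back to the legacy rabbit_* options on older releases.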
+        client = utils.SSHClient(self.controllers[0], self.node_user_name,
+                                 key_filename=self.key_file)
+        if self.use_containers:
+            ncbase = "/var/lib/config-data/puppet-generated/nova"
+        else:
+            ncbase = ""
+        try:
+            cmd = 'sudo grep "^transport_url" %s/etc/nova/nova.conf' % ncbase
+            ret, url = client.ssh(cmd)
+            if ret:
+                raise Exception('Exec command to get transport from '
+                                'controller(%s) in Apex installer failed, '
+                                'ret=%s, output=%s'
+                                % (self.controllers[0], ret, url))
+            else:
+                # need to use ip instead of hostname
+                ret = (re.sub("@.*:", "@%s:" % self.controllers[0],
+                       url[0].split("=", 1)[1]))
+        except Exception:
+            cmd = 'grep -i "^rabbit" %s/etc/nova/nova.conf' % ncbase
+            ret, lines = client.ssh(cmd)
+            if ret:
+                raise Exception('Exec command to get transport from '
+                                'controller(%s) in Apex installer failed, '
+                                'ret=%s, output=%s'
+                                % (self.controllers[0], ret, lines))
+            else:
+                for line in lines:
+                    if line.startswith("rabbit_userid"):
+                        rabbit_userid = line.split("=", 1)[1].strip()
+                    if line.startswith("rabbit_port"):
+                        rabbit_port = line.split("=", 1)[1].strip()
+                    if line.startswith("rabbit_password"):
+                        rabbit_password = line.split("=", 1)[1].strip()
+                ret = "rabbit://%s:%s@%s:%s/?ssl=0" % (rabbit_userid,
+                                                       rabbit_password,
+                                                       self.controllers[0],
+                                                       rabbit_port)
+        self.log.debug('get_transport_url %s' % ret)
+        return ret
+
     def _run_cmd_remote(self, client, command):
         self.log.info('Run command=%s in %s installer......'
                       % (command, self.conf.installer.type))
@@ -161,14 +204,21 @@ class BaseInstaller(object):
             for script_name in script_names:
                 script_abs_path = '{0}/{1}/{2}'.format(installer_dir,
                                                        'common', script_name)
-                client.scp(script_abs_path, script_name)
-                cmd = 'sudo %s %s' % (python, script_name)
-                ret, output = client.ssh(cmd)
+                try:
+                    client.scp(script_abs_path, script_name)
+                except Exception:
+                    client.scp(script_abs_path, script_name)
+                try:
+                    cmd = 'sudo %s %s' % (python, script_name)
+                    ret, output = client.ssh(cmd)
+                except Exception:
+                    ret, output = client.ssh(cmd)
+
                 if ret:
                     raise Exception('Do the command in remote'
                                     ' node failed, ret=%s, cmd=%s, output=%s'
                                     % (ret, cmd, output))
-            if 'nova-scheduler' in restart_cmd:
+            if 'nova' in restart_cmd:
                 # Make sure scheduler has proper cpu_allocation_ratio
                 time.sleep(5)
             client.ssh(restart_cmd)
doctor_tests/installer/common/restore_compute_config.py
index 0e9939f..82e10a6 100644 (file)
@@ -11,18 +11,16 @@ import shutil
 
 
 def restore_cpu_allocation_ratio():
-    nova_base = "/var/lib/config-data/puppet-generated/nova"
-    if not os.path.isdir(nova_base):
-        nova_base = ""
-    nova_file = nova_base + '/etc/nova/nova.conf'
-    nova_file_bak = nova_base + '/etc/nova/nova.bak'
-
-    if not os.path.isfile(nova_file_bak):
-        print('Bak_file:%s does not exist.' % nova_file_bak)
-    else:
-        print('restore: %s' % nova_file)
-        shutil.copyfile(nova_file_bak, nova_file)
-        os.remove(nova_file_bak)
+    for nova_file_bak in ["/var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.bak",  # noqa
+                          "/var/lib/config-data/puppet-generated/nova/etc/nova/nova.bak",  # noqa
+                          "/etc/nova/nova.bak"]:
+        if os.path.isfile(nova_file_bak):
+            nova_file = nova_file_bak.replace(".bak", ".conf")
+            print('restoring nova.bak.')
+            shutil.copyfile(nova_file_bak, nova_file)
+            os.remove(nova_file_bak)
+            return
+    print('nova.bak does not exist.')
     return
 
 restore_cpu_allocation_ratio()
doctor_tests/installer/common/set_compute_config.py
index 8626608..76ac649 100644 (file)
@@ -10,29 +10,17 @@ import os
 import shutil
 
 
-def make_initial_config(service, dest):
-    for mk in ["", "/etc", "/%s" % service]:
-        dest += mk
-        os.mkdir(dest)
-    src = "/etc/%s/%s.conf" % (service, service)
-    dest += "/%s.conf" % service
-    shutil.copyfile(src, dest)
-
-
 def set_cpu_allocation_ratio():
-    docker_conf_base_dir = "/var/lib/config-data/puppet-generated"
-    if not os.path.isdir(docker_conf_base_dir):
-        nova_base = ""
-    else:
-        nova_base = "%s/nova" % docker_conf_base_dir
-        if not os.path.isdir(nova_base):
-            # nova.conf to be used might not exist
-            make_initial_config("nova", nova_base)
-    nova_file = nova_base + '/etc/nova/nova.conf'
-    nova_file_bak = nova_base + '/etc/nova/nova.bak'
+    nova_file_bak = None
+    for nova_file in ["/var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.conf",  # noqa
+                      "/var/lib/config-data/puppet-generated/nova/etc/nova/nova.conf",  # noqa
+                      "/etc/nova/nova.conf"]:
+        if os.path.isfile(nova_file):
+            nova_file_bak = nova_file.replace(".conf", ".bak")
+            break
 
-    if not os.path.isfile(nova_file):
-        raise Exception("File doesn't exist: %s." % nova_file)
+    if nova_file_bak is None:
+        raise Exception("Could not find nova.conf")
     # TODO (tojuvone): Unfortunately ConfigParser did not produce working conf
     fcheck = open(nova_file)
     found_list = ([ca for ca in fcheck.readlines() if "cpu_allocation_ratio"
doctor_tests/main.py
index 438d832..351d5f1 100644 (file)
@@ -53,9 +53,10 @@ class DoctorTest(object):
     def test_fault_management(self):
         try:
             LOG.info('doctor fault management test starting.......')
-
+            transport_url = self.installer.get_transport_url()
             self.fault_management = \
-                FaultManagement(self.conf, self.installer, self.user, LOG)
+                FaultManagement(self.conf, self.installer, self.user, LOG,
+                                transport_url)
 
             # prepare test env
             self.fault_management.setup()
@@ -79,6 +80,7 @@ class DoctorTest(object):
         except Exception as e:
             LOG.error('doctor fault management test failed, '
                       'Exception=%s' % e)
+            LOG.error(format_exc())
             sys.exit(1)
         finally:
             self.fault_management.cleanup()
@@ -143,6 +145,7 @@ class DoctorTest(object):
                                     % function)
         except Exception as e:
             LOG.error('doctor test failed, Exception=%s' % e)
+            LOG.error(format_exc())
             sys.exit(1)
         finally:
             self.cleanup()
doctor_tests/scenario/fault_management.py
index 869311b..a110b88 100644 (file)
@@ -40,7 +40,7 @@ sleep 1
 
 class FaultManagement(object):
 
-    def __init__(self, conf, installer, user, log):
+    def __init__(self, conf, installer, user, log, transport_url):
         self.conf = conf
         self.log = log
         self.user = user
@@ -55,7 +55,7 @@ class FaultManagement(object):
         self.network = Network(self.conf, log)
         self.instance = Instance(self.conf, log)
         self.alarm = Alarm(self.conf, log)
-        self.inspector = get_inspector(self.conf, log)
+        self.inspector = get_inspector(self.conf, log, transport_url)
         self.monitor = get_monitor(self.conf,
                                    self.inspector.get_inspector_url(),
                                    log)
doctor_tests/scenario/maintenance.py
index 09795c2..a2129f6 100644 (file)
@@ -40,7 +40,7 @@ class Maintenance(object):
         else:
             self.endpoint = 'v1/maintenance'
         self.app_manager = get_app_manager(self.stack, self.conf, self.log)
-        self.inspector = get_inspector(self.conf, self.log)
+        self.inspector = get_inspector(self.conf, self.log, trasport_url)
 
     def get_external_network(self):
         ext_net = None
@@ -68,8 +68,16 @@ class Maintenance(object):
                 raise Exception('not enough vcpus (%d) on %s' %
                                 (vcpus, hostname))
             if vcpus_used > 0:
-                raise Exception('%d vcpus used on %s'
-                                % (vcpus_used, hostname))
+                if self.conf.test_case == 'all':
+                    # VCPU might not yet be free after fault_management test
+                    self.log.info('%d vcpus used on %s, retry...'
+                                  % (vcpus_used, hostname))
+                    time.sleep(15)
+                    hvisor = self.nova.hypervisors.get(hvisor.id)
+                    vcpus_used = hvisor.__getattr__('vcpus_used')
+                if vcpus_used > 0:
+                    raise Exception('%d vcpus used on %s'
+                                    % (vcpus_used, hostname))
             if prev_vcpus != 0 and prev_vcpus != vcpus:
                 raise Exception('%d vcpus on %s does not match to'
                                 '%d on %s'