Merge "Update the maintenance design document"
author    Tomi Juvonen <tomi.juvonen@nokia.com>
Fri, 10 Aug 2018 08:29:04 +0000 (08:29 +0000)
committer Gerrit Code Review <gerrit@opnfv.org>
Fri, 10 Aug 2018 08:29:04 +0000 (08:29 +0000)
29 files changed:
docs/development/overview/testing.rst
docs/release/release-notes/releasenotes.rst
doctor_tests/admin_tool/__init__.py [new file with mode: 0644]
doctor_tests/admin_tool/base.py [new file with mode: 0644]
doctor_tests/admin_tool/sample.py [new file with mode: 0644]
doctor_tests/app_manager/__init__.py [new file with mode: 0644]
doctor_tests/app_manager/base.py [new file with mode: 0644]
doctor_tests/app_manager/sample.py [new file with mode: 0644]
doctor_tests/config.py
doctor_tests/consumer/__init__.py
doctor_tests/inspector/sample.py
doctor_tests/installer/__init__.py
doctor_tests/installer/apex.py
doctor_tests/installer/base.py
doctor_tests/installer/common/restore_compute_config.py [new file with mode: 0644]
doctor_tests/installer/common/restore_config.py [moved from doctor_tests/installer/common/restore_ceilometer.py with 51% similarity]
doctor_tests/installer/common/set_ceilometer.py [deleted file]
doctor_tests/installer/common/set_compute_config.py [new file with mode: 0644]
doctor_tests/installer/common/set_config.py [new file with mode: 0644]
doctor_tests/installer/mcp.py
doctor_tests/main.py
doctor_tests/maintenance_hot_tpl.yaml [new file with mode: 0644]
doctor_tests/os_clients.py
doctor_tests/scenario/fault_management.py
doctor_tests/scenario/maintenance.py [new file with mode: 0644]
doctor_tests/stack.py [new file with mode: 0644]
doctor_tests/user.py
requirements.txt
tox.ini

index 98be43e..ba0546e 100644 (file)
@@ -29,6 +29,18 @@ OpenStack services.
 
 .. _OpenStackClient Configuration: https://docs.openstack.org/python-openstackclient/latest/configuration/index.html
 
+Doctor now supports several test cases. Select the one to run by exporting
+TEST_CASE with one of the following values:
+
+.. code-block:: bash
+
+    # Fault management (default)
+    export TEST_CASE='fault_management'
+    # Maintenance (requires 3 compute nodes)
+    export TEST_CASE='maintenance'
+    # Run both test cases
+    export TEST_CASE='all'
+
 Run Python Test Script
 ~~~~~~~~~~~~~~~~~~~~~~
 
@@ -45,6 +57,18 @@ environment and then run the test.
 
 .. _doctor.sample.conf: https://git.opnfv.org/doctor/tree/etc/doctor.sample.conf
 
+On an OPNFV Apex jumphost you can run the Doctor tests with tox as follows:
+
+.. code-block:: bash
+
+    # Before the Gambia release the file is named overcloudrc.v3
+    source overcloudrc
+    export INSTALLER_IP=${INSTALLER_IP}
+    export INSTALLER_TYPE=${INSTALLER_TYPE}
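+    # optionally select the test case to run (defaults to fault_management)
+    # export TEST_CASE='maintenance'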
+    git clone https://gerrit.opnfv.org/gerrit/doctor
+    cd doctor
+    sudo -E tox
+
 Run Functest Suite
 ==================
 
index 6765341..f1cf9d7 100644 (file)
@@ -14,6 +14,7 @@ Version history
 +------------+----------+--------------+-------------+
 | **Date**   | **Ver.** | **Author**   | **Comment** |
 +============+==========+==============+=============+
+| 2018-06-25 | 6.2.0    | Tomi Juvonen |             |
 | 2018-05-25 | 6.1.0    | Tomi Juvonen |             |
 | 2018-04-23 | 6.0.0    | Tomi Juvonen |             |
 +------------+----------+--------------+-------------+
diff --git a/doctor_tests/admin_tool/__init__.py b/doctor_tests/admin_tool/__init__.py
new file mode 100644 (file)
index 0000000..e8b1281
--- /dev/null
@@ -0,0 +1,37 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from oslo_config import cfg
+from oslo_utils import importutils
+
+
+OPTS = [
+    cfg.StrOpt('type',
+               default='sample',
+               choices=['sample'],
+               help='the component of doctor admin_tool',
+               required=True),
+    cfg.StrOpt('ip',
+               default='127.0.0.1',
+               help='the ip of admin_tool',
+               required=True),
+    cfg.IntOpt('port',
+               default=12347,
+               help='the port of doctor admin_tool',
+               required=True),
+]
+
+
+_admin_tool_name_class_mapping = {
+    'sample': 'doctor_tests.admin_tool.sample.SampleAdminTool'
+}
+
+
+def get_admin_tool(transport_url, conf, log):
+    admin_tool_class = _admin_tool_name_class_mapping.get(conf.admin_tool.type)
+    return importutils.import_object(admin_tool_class, transport_url, conf, log)
diff --git a/doctor_tests/admin_tool/base.py b/doctor_tests/admin_tool/base.py
new file mode 100644 (file)
index 0000000..0f0b2dc
--- /dev/null
@@ -0,0 +1,26 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import abc
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class BaseAdminTool(object):
+
+    def __init__(self, conf, log):
+        self.conf = conf
+        self.log = log
+
+    @abc.abstractmethod
+    def start(self):
+        pass
+
+    @abc.abstractmethod
+    def stop(self):
+        pass
diff --git a/doctor_tests/admin_tool/sample.py b/doctor_tests/admin_tool/sample.py
new file mode 100644 (file)
index 0000000..892a4c8
--- /dev/null
@@ -0,0 +1,726 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import datetime
+from flask import Flask
+from flask import request
+import json
+from novaclient.exceptions import BadRequest
+import oslo_messaging as messaging
+import requests
+import time
+from threading import Thread
+from traceback import format_exc
+from uuid import uuid1 as generate_uuid
+
+from doctor_tests.admin_tool.base import BaseAdminTool
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.os_clients import aodh_client
+from doctor_tests.os_clients import nova_client
+
+
+class SampleAdminTool(BaseAdminTool):
+
+    def __init__(self, transport_url, conf, log):
+        super(SampleAdminTool, self).__init__(conf, log)
+        self.transport_url = transport_url
+        self.app = None
+
+    def start(self):
+        self.log.info('sample admin tool start......')
+        self.app = AdminTool(self.transport_url, self.conf, self, self.log)
+        self.app.start()
+
+    def stop(self):
+        self.log.info('sample admin tool stop......')
+        if not self.app:
+            return
+        headers = {
+            'Content-Type': 'application/json',
+            'Accept': 'application/json',
+        }
+        url = 'http://%s:%d/shutdown'\
+              % (self.conf.admin_tool.ip,
+                 self.conf.admin_tool.port)
+        requests.post(url, data='', headers=headers)
+
+
+class AdminMain(Thread):
+
+    def __init__(self, transport_url, session_id, data, parent, conf, log):
+        Thread.__init__(self)
+        self.session_id = session_id
+        self.parent = parent
+        self.log = log
+        self.conf = conf
+        self.url = 'http://0.0.0.0:%s' % conf.admin_tool.port
+        self.projects_state = dict()  # current state for each project
+        self.proj_server_actions = dict()  # actions for each project server
+        self.projects_servers = dict()  # servers processed in current state
+        self.maint_proj_servers = dict()  # servers under whole maintenance
+        self.hosts = data['hosts']
+        self.maintenance_at = data['maintenance_at']
+        self.computes_disabled = list()
+        self.metadata = data['metadata']
+        self.auth = get_identity_auth(project=self.conf.doctor_project)
+        self.state = data['state']
+        self.aodh = aodh_client(self.conf.aodh_version,
+                                get_session(auth=self.auth))
+        self.nova = nova_client(self.conf.nova_version,
+                                get_session(auth=self.auth))
+        self.log.info('transport_url %s' % transport_url)
+        transport = messaging.get_transport(self.conf, transport_url)
+        self.notif_proj = messaging.Notifier(transport,
+                                             'maintenance.planned',
+                                             driver='messaging',
+                                             topics=['notifications'])
+        self.notif_proj = self.notif_proj.prepare(publisher_id='admin_tool')
+        self.notif_admin = messaging.Notifier(transport,
+                                              'maintenance.host',
+                                              driver='messaging',
+                                              topics=['notifications'])
+        self.notif_admin = self.notif_admin.prepare(publisher_id='admin_tool')
+        self.log.info('Admin tool session %s initialized' % self.session_id)
+
+    def cleanup(self):
+        for host in self.computes_disabled:
+            self.log.info('enable nova-compute on %s' % host)
+            self.nova.services.enable(host, 'nova-compute')
+
+    def _projects_not_in_wanted_states(self, wanted_states):
+        return any(v not in wanted_states
+                   for v in self.projects_state.values())
+
+    def projects_not_in_state(self, state):
+        return any(v != state
+                   for v in self.projects_state.values())
+
+    def wait_projects_state(self, wanted_states, wait_seconds):
+        retries = wait_seconds
+        while (retries > 0 and
+               self._projects_not_in_wanted_states(wanted_states)):
+            time.sleep(1)
+            retries = retries - 1
+        if self._projects_not_in_wanted_states(wanted_states):
+            self.log.error('Admin tool session %s: projects in invalid states '
+                           '%s' % (self.session_id, self.projects_state))
+            raise Exception('Admin tool session %s: not all projects in states'
+                            ' %s' % (self.session_id, wanted_states))
+        else:
+            self.log.info('all projects replied')
+
+    def _project_notify(self, project_id, instance_ids, allowed_actions,
+                        actions_at, state, metadata):
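+        # Send a 'maintenance.scheduled' notification to the project with a
+        # reply_url it can use to answer this admin tool.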
+        reply_url = '%s/%s/maintenance' % (self.url, project_id)
+
+        payload = dict(project_id=project_id,
+                       instance_ids=instance_ids,
+                       allowed_actions=allowed_actions,
+                       state=state,
+                       actions_at=actions_at,
+                       session_id=self.session_id,
+                       metadata=metadata,
+                       reply_url=reply_url)
+
+        self.log.debug('Sending "maintenance.scheduled" to project: %s' %
+                       payload)
+
+        self.notif_proj.info({'some': 'context'}, 'maintenance.scheduled',
+                             payload)
+
+    def _admin_notify(self, project, host, state, session_id):
+        payload = dict(project_id=project, host=host, state=state,
+                       session_id=session_id)
+
+        self.log.debug('Sending "maintenance.host": %s' % payload)
+
+        self.notif_admin.info({'some': 'context'}, 'maintenance.host', payload)
+
+    def down_scale(self):
+        for project in self.projects_servers:
+            self.log.info('DOWN_SCALE to project %s' % project)
+            self.log.debug('instance_ids %s' % self.projects_servers[project])
+            instance_ids = '%s/%s/maintenance' % (self.url, project)
+            allowed_actions = []
+            wait_seconds = 120
+            actions_at = (datetime.datetime.utcnow() +
+                          datetime.timedelta(seconds=wait_seconds)
+                          ).strftime('%Y-%m-%d %H:%M:%S')
+            state = self.state
+            metadata = self.metadata
+            self._project_notify(project, instance_ids,
+                                 allowed_actions, actions_at, state,
+                                 metadata)
+        allowed_states = ['ACK_DOWN_SCALE', 'NACK_DOWN_SCALE']
+        self.wait_projects_state(allowed_states, wait_seconds)
+        if self.projects_not_in_state('ACK_DOWN_SCALE'):
+            raise Exception('Admin tool session %s: all states not '
+                            'ACK_DOWN_SCALE %s' %
+                            (self.session_id, self.projects_state))
+
+    def maintenance(self):
+        for project in self.projects_servers:
+            self.log.info('\nMAINTENANCE to project %s\n' % project)
+            self.log.debug('instance_ids %s' % self.projects_servers[project])
+            instance_ids = '%s/%s/maintenance' % (self.url, project)
+            allowed_actions = []
+            actions_at = self.maintenance_at
+            state = self.state
+            metadata = self.metadata
+            maint_at = self.str_to_datetime(self.maintenance_at)
+            td = maint_at - datetime.datetime.utcnow()
+            wait_seconds = int(td.total_seconds())
+            if wait_seconds < 10:
+                raise Exception('Admin tool session %s: No time for project to'
+                                ' answer: %s' %
+                                (self.session_id, wait_seconds))
+            self._project_notify(project, instance_ids,
+                                 allowed_actions, actions_at, state,
+                                 metadata)
+        allowed_states = ['ACK_MAINTENANCE', 'NACK_MAINTENANCE']
+        self.wait_projects_state(allowed_states, wait_seconds)
+        if self.projects_not_in_state('ACK_MAINTENANCE'):
+            raise Exception('Admin tool session %s: all states not '
+                            'ACK_MAINTENANCE %s' %
+                            (self.session_id, self.projects_state))
+
+    def maintenance_complete(self):
+        for project in self.projects_servers:
+            self.log.info('MAINTENANCE_COMPLETE to project %s' % project)
+            instance_ids = '%s/%s/maintenance' % (self.url, project)
+            allowed_actions = []
+            wait_seconds = 120
+            actions_at = (datetime.datetime.utcnow() +
+                          datetime.timedelta(seconds=wait_seconds)
+                          ).strftime('%Y-%m-%d %H:%M:%S')
+            state = 'MAINTENANCE_COMPLETE'
+            metadata = self.metadata
+            self._project_notify(project, instance_ids,
+                                 allowed_actions, actions_at, state,
+                                 metadata)
+        allowed_states = ['ACK_MAINTENANCE_COMPLETE',
+                          'NACK_MAINTENANCE_COMPLETE']
+        self.wait_projects_state(allowed_states, wait_seconds)
+        if self.projects_not_in_state('ACK_MAINTENANCE_COMPLETE'):
+            raise Exception('Admin tool session %s: all states not '
+                            'ACK_MAINTENANCE_COMPLETE %s' %
+                            (self.session_id, self.projects_state))
+
+    def need_down_scale(self, host_servers):
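+        # Each compute host in this test is assumed to fit 2 instances;
+        # down scaling is needed unless at least 2 instance slots are free.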
+        room_for_instances = 0
+        for host in host_servers:
+            instances = 0
+            for project in host_servers[host]:
+                for instance in host_servers[host][project]:
+                    instances += 1
+            room_for_instances += (2 - instances)
+        self.log.info('there is room for %d instances' % room_for_instances)
+        if room_for_instances > 1:
+            return False
+        else:
+            return True
+
+    def find_host_to_be_empty(self, host_servers):
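+        # Prefer a host with no HA instances and the fewest non-HA
+        # instances, so emptying it requires the fewest migrations.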
+        host_to_be_empty = None
+        host_nonha_instances = 0
+        for host in host_servers:
+            ha_instances = 0
+            nonha_instances = 0
+            for project in host_servers[host]:
+                for instance in host_servers[host][project]:
+                    if ('doctor_ha_app_' in
+                            host_servers[host][project][instance]):
+                        ha_instances += 1
+                    else:
+                        nonha_instances += 1
+            self.log.info('host %s has %d ha and %d non ha instances' %
+                          (host, ha_instances, nonha_instances))
+            if ha_instances == 0:
+                if host_to_be_empty:
+                    if nonha_instances < host_nonha_instances:
+                        host_to_be_empty = host
+                        host_nonha_instances = nonha_instances
+                else:
+                    host_to_be_empty = host
+                    host_nonha_instances = nonha_instances
+        self.log.info('host %s selected to be empty' % host_to_be_empty)
+        return host_to_be_empty
+
+    def make_compute_host_empty(self, host, projects_servers, statebase):
+        state = statebase
+        state_ack = 'ACK_%s' % statebase
+        state_nack = 'NACK_%s' % statebase
+        for project in projects_servers:
+            # self.projects_servers must have servers under action
+            self.projects_servers[project] = projects_servers[project].copy()
+            self.log.info('%s to project %s' % (state, project))
+            self.project_servers_log_info(project, projects_servers)
+            instance_ids = '%s/%s/maintenance' % (self.url, project)
+            allowed_actions = ['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION']
+            wait_seconds = 120
+            actions_at = (datetime.datetime.utcnow() +
+                          datetime.timedelta(seconds=wait_seconds)
+                          ).strftime('%Y-%m-%d %H:%M:%S')
+            metadata = self.metadata
+            self._project_notify(project, instance_ids,
+                                 allowed_actions, actions_at, state,
+                                 metadata)
+        allowed_states = [state_ack, state_nack]
+        self.wait_projects_state(allowed_states, wait_seconds)
+        if self.projects_not_in_state(state_ack):
+            raise Exception('Admin tool session %s: all states not %s %s' %
+                            (self.session_id, state_ack, self.projects_state))
+        self.actions_to_have_empty_host(host)
+
+    def notify_action_done(self, project, instance_id):
+        instance_ids = instance_id
+        allowed_actions = []
+        actions_at = None
+        state = "INSTANCE_ACTION_DONE"
+        metadata = None
+        self._project_notify(project, instance_ids, allowed_actions,
+                             actions_at, state, metadata)
+
+    def actions_to_have_empty_host(self, host):
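+        # Wait for projects to report per-server actions, execute them
+        # (MIGRATE is performed here, OWN_ACTION is left to the project)
+        # and then wait for the host to become empty.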
+        retry = 0
+        while len(self.proj_server_actions) == 0:
+            time.sleep(2)
+            if retry == 10:
+                raise Exception('Admin tool session %s: project server actions'
+                                ' not set' % self.session_id)
+            retry += 1
+        for project in self.proj_server_actions:
+            for server, action in self.proj_server_actions[project].items():
+                self.log.info('Action %s server %s: %s' % (action, server,
+                              self.projects_servers[project][server]))
+                if action == 'MIGRATE':
+                    self.migrate_server(server)
+                    self.notify_action_done(project, server)
+                elif action == 'OWN_ACTION':
+                    pass
+                else:
+                    raise Exception('Admin tool session %s: server %s action '
+                                    '%s not supported' %
+                                    (self.session_id, server, action))
+        self.proj_server_actions = dict()
+        self._wait_host_empty(host)
+
+    def migrate_server(self, server_id):
+        server = self.nova.servers.get(server_id)
+        vm_state = server.__dict__.get('OS-EXT-STS:vm_state')
+        self.log.info('server %s state %s' % (server_id, vm_state))
+        last_vm_state = vm_state
+        retry_migrate = 5
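+        # Retry cold migration up to 5 times on BadRequest; after each
+        # attempt poll up to 3 minutes for the server to reach 'resized',
+        # then confirm the resize.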
+        while True:
+            try:
+                server.migrate()
+                time.sleep(5)
+                retries = 36
+                while vm_state != 'resized' and retries > 0:
+                    # try to confirm within 3min
+                    server = self.nova.servers.get(server_id)
+                    vm_state = server.__dict__.get('OS-EXT-STS:vm_state')
+                    if vm_state == 'resized':
+                        server.confirm_resize()
+                        self.log.info('server %s migration confirmed' %
+                                      server_id)
+                        return
+                    if last_vm_state != vm_state:
+                        self.log.info('server %s state: %s' % (server_id,
+                                      vm_state))
+                    if vm_state == 'error':
+                        raise Exception('server %s migration failed, state: %s'
+                                        % (server_id, vm_state))
+                    time.sleep(5)
+                    retries = retries - 1
+                    last_vm_state = vm_state
+                # Timeout waiting for the state to change
+                break
+
+            except BadRequest:
+                if retry_migrate == 0:
+                    raise Exception('server %s migrate failed' % server_id)
+                # The scheduler might need time to sync the inconsistent
+                # instance list for the host
+                retry_time = 180 - (retry_migrate * 30)
+                self.log.info('server %s migrate failed, retry in %s sec'
+                              % (server_id, retry_time))
+                time.sleep(retry_time)
+            except Exception as e:
+                self.log.error('server %s migration failed, Exception=%s' %
+                               (server_id, e))
+                self.log.error(format_exc())
+                raise Exception('server %s migration failed, state: %s' %
+                                (server_id, vm_state))
+            finally:
+                retry_migrate = retry_migrate - 1
+        raise Exception('server %s migration timeout, state: %s' %
+                        (server_id, vm_state))
+
+    def _wait_host_empty(self, host):
+        hid = self.nova.hypervisors.search(host)[0].id
+        vcpus_used_last = 0
+        # wait up to 4 minutes for the host to become empty
+        for j in range(48):
+            hvisor = self.nova.hypervisors.get(hid)
+            vcpus_used = hvisor.__getattr__('vcpus_used')
+            if vcpus_used > 0:
+                if vcpus_used_last == 0:
+                    self.log.info('%s still has %d vcpus reserved. wait...'
+                                  % (host, vcpus_used))
+                elif vcpus_used != vcpus_used_last:
+                    self.log.info('%s still has %d vcpus reserved. wait...'
+                                  % (host, vcpus_used))
+                vcpus_used_last = vcpus_used
+                time.sleep(5)
+            else:
+                self.log.info('%s empty' % host)
+                return
+        raise Exception('%s host not empty' % host)
+
+    def projects_listen_alarm(self, match_event):
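+        # Check that every project in the session has an Aodh event alarm
+        # subscribed to the given event type.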
+        match_projects = ([str(alarm['project_id']) for alarm in
+                          self.aodh.alarm.list() if
+                          str(alarm['event_rule']['event_type']) ==
+                          match_event])
+        all_projects_match = True
+        for project in list(self.projects_state):
+            if project not in match_projects:
+                self.log.error('Admin tool session %s: project %s not '
+                               'listening to %s' %
+                               (self.session_id, project, match_event))
+                all_projects_match = False
+        return all_projects_match
+
+    def project_servers_log_info(self, project, host_servers):
+        info = 'Project servers:\n'
+        for server in host_servers[project]:
+            info += ('  %s: %s\n' %
+                     (server, host_servers[project][server]))
+        self.log.info('%s' % info)
+
+    def servers_log_info(self, host_servers):
+        info = '\n'
+        for host in self.hosts:
+            info += '%s:\n' % host
+            if host in host_servers:
+                for project in host_servers[host]:
+                    info += '  %s:\n' % project
+                    for server in host_servers[host][project]:
+                        info += ('    %s: %s\n' %
+                                 (server, host_servers[host][project][server]))
+        self.log.info('%s' % info)
+
+    def update_server_info(self):
+        opts = {'all_tenants': True}
+        servers = self.nova.servers.list(search_opts=opts)
+        self.projects_servers = dict()
+        host_servers = dict()
+        for server in servers:
+            try:
+                host = str(server.__dict__.get('OS-EXT-SRV-ATTR:host'))
+                project = str(server.tenant_id)
+                server_name = str(server.name)
+                server_id = str(server.id)
+            except Exception:
+                raise Exception('can not get params from server=%s' %
+                                server)
+            if host not in self.hosts:
+                continue
+            if host not in host_servers:
+                host_servers[host] = dict()
+            if project not in host_servers[host]:
+                host_servers[host][project] = dict()
+            if project not in self.projects_servers:
+                self.projects_servers[project] = dict()
+            if project not in self.projects_state:
+                self.projects_state[project] = None
+            host_servers[host][project][server_id] = server_name
+            self.projects_servers[project][server_id] = server_name
+        return host_servers
+
+    def str_to_datetime(self, dt_str):
+        mdate, mtime = dt_str.split()
+        year, month, day = map(int, mdate.split('-'))
+        hours, minutes, seconds = map(int, mtime.split(':'))
+        return datetime.datetime(year, month, day, hours, minutes, seconds)
+
+    def host_maintenance(self, host):
+        self.log.info('maintaining host %s' % host)
+        # no implementation to make real maintenance
+        time.sleep(5)
+
+    def run(self):
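+        # Maintenance session state machine:
+        #   MAINTENANCE -> [DOWN_SCALE] -> [PREPARE_MAINTENANCE] ->
+        #   PLANNED_MAINTENANCE -> PLANNED_MAINTENANCE_COMPLETE ->
+        #   MAINTENANCE_COMPLETE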
+        while self.state != 'MAINTENANCE_COMPLETE':
+            self.log.info('--==session %s: processing state %s==--' %
+                          (self.session_id, self.state))
+            if self.state == 'MAINTENANCE':
+                host_servers = self.update_server_info()
+                self.servers_log_info(host_servers)
+
+                if not self.projects_listen_alarm('maintenance.scheduled'):
+                    raise Exception('all projects do not listen maintenance '
+                                    'alarm')
+                self.maintenance()
+
+                maint_at = self.str_to_datetime(self.maintenance_at)
+                if maint_at > datetime.datetime.utcnow():
+                    time_now = (datetime.datetime.utcnow().strftime(
+                                '%Y-%m-%d %H:%M:%S'))
+                    self.log.info('Time now: %s maintenance starts: %s....' %
+                                  (time_now, self.maintenance_at))
+                    td = maint_at - datetime.datetime.utcnow()
+                    time.sleep(td.total_seconds())
+                time_now = (datetime.datetime.utcnow().strftime(
+                            '%Y-%m-%d %H:%M:%S'))
+                self.log.info('Time to start maintenance starts: %s' %
+                              time_now)
+
+                # check if we have empty compute host
+                # True -> PLANNED_MAINTENANCE
+                # False -> check if we can migrate VMs to get empty host
+                # True -> PREPARE_MAINTENANCE
+                # False -> DOWN_SCALE
+                maintenance_empty_hosts = ([h for h in self.hosts if h not in
+                                           host_servers])
+
+                if len(maintenance_empty_hosts) == 0:
+                    if self.need_down_scale(host_servers):
+                        self.log.info('Need to down scale')
+                        self.state = 'DOWN_SCALE'
+                    else:
+                        self.log.info('Free capacity, but need empty host')
+                        self.state = 'PREPARE_MAINTENANCE'
+                else:
+                    self.log.info('Empty host available for maintenance')
+                    self.state = 'PLANNED_MAINTENANCE'
+                self.log.info('--==State change from MAINTENANCE to %s==--'
+                              % self.state)
+            elif self.state == 'DOWN_SCALE':
+                # Test case is hard coded to have all compute capacity used
+                # We need to down scale to have one empty compute host
+                self.down_scale()
+                self.state = 'PREPARE_MAINTENANCE'
+                host_servers = self.update_server_info()
+                self.servers_log_info(host_servers)
+                self.log.info('--==State change from DOWN_SCALE to'
+                              ' %s==--' % self.state)
+
+            elif self.state == 'PREPARE_MAINTENANCE':
+                # Down scaling might not have freed capacity on a single
+                # compute host, so we may need to consolidate the free
+                # capacity onto a single compute host
+                self.maint_proj_servers = self.projects_servers.copy()
+                maintenance_empty_hosts = ([h for h in self.hosts if h not in
+                                           host_servers])
+                if len(maintenance_empty_hosts) == 0:
+                    self.log.info('no empty hosts for maintenance')
+                    if self.need_down_scale(host_servers):
+                        raise Exception('Admin tool session %s: Not enough '
+                                        'free capacity for maintenance' %
+                                        self.session_id)
+                    host = self.find_host_to_be_empty(host_servers)
+                    if host:
+                        self.make_compute_host_empty(host, host_servers[host],
+                                                     'PREPARE_MAINTENANCE')
+                    else:
+                        # We do not currently support another down scale if
+                        # the first one was not enough
+                        raise Exception('Admin tool session %s: No host '
+                                        'candidate to be emptied' %
+                                        self.session_id)
+                else:
+                    for host in maintenance_empty_hosts:
+                        self.log.info('%s already empty '
+                                      'for maintenance' % host)
+                self.state = 'PLANNED_MAINTENANCE'
+                host_servers = self.update_server_info()
+                self.servers_log_info(host_servers)
+                self.log.info('--==State change from PREPARE_MAINTENANCE to %s'
+                              '==--' % self.state)
+            elif self.state == 'PLANNED_MAINTENANCE':
+                maintenance_hosts = list()
+                maintenance_empty_hosts = list()
+                # TODO: this should be the admin project; hack for now to
+                # make it work
+                admin_project = list(self.projects_state)[0]
+                for host in self.hosts:
+                    self.log.info('disable nova-compute on host %s' % host)
+                    self.nova.services.disable_log_reason(host, 'nova-compute',
+                                                          'maintenance')
+                    self.computes_disabled.append(host)
+                    if host in host_servers and len(host_servers[host]):
+                        maintenance_hosts.append(host)
+                    else:
+                        maintenance_empty_hosts.append(host)
+                self.log.info('--==Start to maintain empty hosts==--\n%s' %
+                              maintenance_empty_hosts)
+                for host in maintenance_empty_hosts:
+                    # the scheduler can lag behind, so check that the just
+                    # down-scaled host is really empty
+                    self._wait_host_empty(host)
+                    self.log.info('IN_MAINTENANCE host %s' % host)
+                    self._admin_notify(admin_project, host, 'IN_MAINTENANCE',
+                                       self.session_id)
+                    self.host_maintenance(host)
+                    self._admin_notify(admin_project, host,
+                                       'MAINTENANCE_COMPLETE',
+                                       self.session_id)
+                    self.nova.services.enable(host, 'nova-compute')
+                    self.computes_disabled.remove(host)
+                    self.log.info('MAINTENANCE_COMPLETE host %s' % host)
+                self.log.info('--==Start to maintain occupied hosts==--\n%s' %
+                              maintenance_hosts)
+                for host in maintenance_hosts:
+                    self.log.info('PLANNED_MAINTENANCE host %s' % host)
+                    self.make_compute_host_empty(host, host_servers[host],
+                                                 'PLANNED_MAINTENANCE')
+                    self.log.info('IN_MAINTENANCE host %s' % host)
+                    self._admin_notify(admin_project, host, 'IN_MAINTENANCE',
+                                       self.session_id)
+                    self.host_maintenance(host)
+                    self._admin_notify(admin_project, host,
+                                       'MAINTENANCE_COMPLETE',
+                                       self.session_id)
+                    self.nova.services.enable(host, 'nova-compute')
+                    self.computes_disabled.remove(host)
+                    self.log.info('MAINTENANCE_COMPLETE host %s' % host)
+                self.state = 'PLANNED_MAINTENANCE_COMPLETE'
+                host_servers = self.update_server_info()
+                self.servers_log_info(host_servers)
+            elif self.state == 'PLANNED_MAINTENANCE_COMPLETE':
+                self.log.info('Projects still need to scale back up to full '
+                              'capacity')
+                self.maintenance_complete()
+                host_servers = self.update_server_info()
+                self.servers_log_info(host_servers)
+                self.state = 'MAINTENANCE_COMPLETE'
+            else:
+                raise Exception('Admin tool session %s: session in invalid '
+                                'state %s' % (self.session_id, self.state))
+        self.log.info('--==Maintenance session %s: '
+                      'MAINTENANCE SESSION COMPLETE==--' % self.session_id)
+
+    def project_input(self, project_id, data):
+        self.log.debug('Admin tool session %s: project %s input' %
+                       (self.session_id, project_id))
+        if 'instance_actions' in data:
+            self.proj_server_actions[project_id] = (
+                data['instance_actions'].copy())
+        self.projects_state[project_id] = data['state']
+
+    def project_get_instances(self, project_id):
+        ret = list(self.projects_servers[project_id])
+        self.log.debug('Admin tool session %s: project %s GET return: %s' %
+                       (self.session_id, project_id, ret))
+        return ret
+
+    def stop(self):
+        self.stopped = True
+
+
+class AdminTool(Thread):
+
+    def __init__(self, transport_url, conf, admin_tool, log):
+        Thread.__init__(self)
+        self.admin_tool = admin_tool
+        self.log = log
+        self.conf = conf
+        self.port = self.conf.admin_tool.port
+        self.maint_sessions = {}
+        self.projects = {}
+        self.maintenance_hosts = []
+        self.transport_url = transport_url
+
+    def run(self):
+        app = Flask('admin_tool')
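+        # REST API exposed by the admin tool:
+        #   POST /maintenance                create or remove a session
+        #   GET  /maintenance                query session state
+        #   PUT  /<project_id>/maintenance   project replies to notifications
+        #   GET  /<project_id>/maintenance   project queries its instances
+        #   POST /shutdown                   stop the admin tool server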
+
+        @app.route('/maintenance', methods=['POST'])
+        def admin_maintenance_api_post():
+            data = json.loads(request.data.decode('utf8'))
+            self.log.info('maintenance message: %s' % data)
+            if 'session_id' in data:
+                if data['state'] == 'REMOVE_MAINTENANCE_SESSION':
+                    session_id = data['session_id']
+                    self.log.info('remove session %s'
+                                  % session_id)
+                    self.maint_sessions[session_id].cleanup()
+                    self.maint_sessions[session_id].stop()
+                    del self.maint_sessions[session_id]
+            else:
+                session_id = str(generate_uuid())
+                self.log.info('creating session: %s' % session_id)
+                self.maint_sessions[session_id] = (
+                    AdminMain(self.transport_url,
+                              session_id,
+                              data,
+                              self,
+                              self.conf,
+                              self.log))
+                self.maint_sessions[session_id].start()
+            reply = json.dumps({'session_id': session_id,
+                                'state': 'ACK_%s' % data['state']})
+            self.log.debug('reply: %s' % reply)
+            return reply, 200, None
+
+        @app.route('/maintenance', methods=['GET'])
+        def admin_maintenance_api_get():
+            data = json.loads(request.data.decode('utf8'))
+            self.log.debug('Admin get maintenance: %s' % data)
+            session_id = data['session_id']
+            reply = json.dumps({'state':
+                               self.maint_sessions[session_id].state})
+            self.log.debug('reply: %s' % reply)
+            return reply, 200, None
+
+        @app.route('/<project_id>/maintenance', methods=['PUT'])
+        def project_maintenance_api_put(project_id=None):
+            data = json.loads(request.data.decode('utf8'))
+            self.log.debug('%s project put: %s' % (project_id, data))
+            self.project_input(project_id, data)
+            return 'OK'
+
+        @app.route('/<project_id>/maintenance', methods=['GET'])
+        def project_maintenance_api_get(project_id=None):
+            data = json.loads(request.data.decode('utf8'))
+            self.log.debug('%s project get %s' % (project_id, data))
+            instances = self.project_get_instances(project_id, data)
+            reply = json.dumps({'instance_ids': instances})
+            self.log.debug('%s reply: %s' % (project_id, reply))
+            return reply, 200, None
+
+        @app.route('/shutdown', methods=['POST'])
+        def shutdown():
+            for session in self.maint_sessions:
+                self.log.info('shutdown admin tool session %s thread' %
+                              session)
+                self.maint_sessions[session].cleanup()
+                self.maint_sessions[session].stop()
+            self.log.info('shutdown admin_tool server at %s' % time.time())
+            func = request.environ.get('werkzeug.server.shutdown')
+            if func is None:
+                raise RuntimeError('Not running with the Werkzeug Server')
+            func()
+            return 'admin_tool app shutting down...'
+
+        app.run(host='0.0.0.0', port=self.port)
+
+    def project_input(self, project_id, data):
+        session_id = data['session_id']
+        self.maint_sessions[session_id].project_input(project_id, data)
+
+    def project_get_instances(self, project_id, data):
+        session_id = data['session_id']
+        return self.maint_sessions[session_id].project_get_instances(
+            project_id)
diff --git a/doctor_tests/app_manager/__init__.py b/doctor_tests/app_manager/__init__.py
new file mode 100644 (file)
index 0000000..717d658
--- /dev/null
@@ -0,0 +1,38 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from oslo_config import cfg
+from oslo_utils import importutils
+
+
+OPTS = [
+    cfg.StrOpt('type',
+               default='sample',
+               choices=['sample'],
+               help='the component of doctor app manager',
+               required=True),
+    cfg.StrOpt('ip',
+               default='127.0.0.1',
+               help='the ip of app manager',
+               required=True),
+    cfg.IntOpt('port',
+               default=12348,
+               help='the port of doctor app manager',
+               required=True),
+]
+
+
+_app_manager_name_class_mapping = {
+    'sample': 'doctor_tests.app_manager.sample.SampleAppManager'
+}
+
+
+def get_app_manager(stack, conf, log):
+    app_manager_class = (
+        _app_manager_name_class_mapping.get(conf.app_manager.type))
+    return importutils.import_object(app_manager_class, stack, conf, log)
diff --git a/doctor_tests/app_manager/base.py b/doctor_tests/app_manager/base.py
new file mode 100644 (file)
index 0000000..0d42408
--- /dev/null
@@ -0,0 +1,26 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import abc
+import six
+
+
+@six.add_metaclass(abc.ABCMeta)
+class BaseAppManager(object):
+
+    def __init__(self, conf, log):
+        self.conf = conf
+        self.log = log
+
+    @abc.abstractmethod
+    def start(self):
+        pass
+
+    @abc.abstractmethod
+    def stop(self):
+        pass
diff --git a/doctor_tests/app_manager/sample.py b/doctor_tests/app_manager/sample.py
new file mode 100644 (file)
index 0000000..94926ee
--- /dev/null
@@ -0,0 +1,255 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from flask import Flask
+from flask import request
+import json
+import yaml
+import time
+from threading import Thread
+import requests
+
+from doctor_tests.app_manager.base import BaseAppManager
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.os_clients import nova_client
+
+
+class SampleAppManager(BaseAppManager):
+
+    def __init__(self, stack, conf, log):
+        super(SampleAppManager, self).__init__(conf, log)
+        self.stack = stack
+        self.app = None
+
+    def start(self):
+        self.log.info('sample app manager start......')
+        self.app = AppManager(self.stack, self.conf, self, self.log)
+        self.app.start()
+
+    def stop(self):
+        self.log.info('sample app manager stop......')
+        if not self.app:
+            return
+        headers = {
+            'Content-Type': 'application/json',
+            'Accept': 'application/json',
+        }
+        url = 'http://%s:%d/shutdown'\
+              % (self.conf.app_manager.ip,
+                 self.conf.app_manager.port)
+        requests.post(url, data='', headers=headers)
+
+
+class AppManager(Thread):
+
+    def __init__(self, stack, conf, app_manager, log):
+        Thread.__init__(self)
+        self.stack = stack
+        self.conf = conf
+        self.port = self.conf.app_manager.port
+        self.app_manager = app_manager
+        self.log = log
+        self.instance_ids = None
+        self.headers = {
+            'Content-Type': 'application/json',
+            'Accept': 'application/json'}
+        self.auth = get_identity_auth(project=self.conf.doctor_project)
+        self.nova = nova_client(self.conf.nova_version,
+                                get_session(auth=self.auth))
+        self.orig_number_of_instances = self.number_of_instances()
+        self.ha_instances = self.get_ha_instances()
+        self.floating_ip = None
+        self.active_instance_id = self.get_active_instance_id()
+
+    def get_active_instance_id(self):
+        for instance in self.ha_instances:
+            network_interfaces = next(iter(instance.addresses.values()))
+            for network_interface in network_interfaces:
+                _type = network_interface.get('OS-EXT-IPS:type')
+                if _type == "floating":
+                    if not self.floating_ip:
+                        self.floating_ip = network_interface.get('addr')
+                    self.log.debug('active_instance: %s %s' %
+                                   (instance.name, instance.id))
+                    return instance.id
+        raise Exception("No active instance found")
+
+    def switch_over_ha_instance(self):
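+        # Move the floating IP to the standby HA instance so the application
+        # stays reachable while the previously active instance is migrated.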
+        for instance in self.ha_instances:
+            if instance.id != self.active_instance_id:
+                self.log.info('Switch over to: %s %s' % (instance.name,
+                                                         instance.id))
+                instance.add_floating_ip(self.floating_ip)
+                self.active_instance_id = instance.id
+                break
+
+    def get_instance_ids(self):
+        ret = list()
+        for instance in self.nova.servers.list(detailed=False):
+            ret.append(instance.id)
+        return ret
+
+    def get_ha_instances(self):
+        ha_instances = list()
+        for instance in self.nova.servers.list(detailed=True):
+            if "doctor_ha_app_" in instance.name:
+                ha_instances.append(instance)
+                self.log.debug('ha_instances: %s' % instance.name)
+        return ha_instances
+
+    def _alarm_data_decoder(self, data):
+        if "[" in data or "{" in data:
+            # string to list or dict removing unicode
+            data = yaml.load(data.replace("u'", "'"))
+        return data
+
+    def _alarm_traits_decoder(self, data):
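+        # Aodh event alarm traits are (name, type, value) triples; build a
+        # dict of trait name to decoded value.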
+        return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
+                for t in data['reason_data']['event']['traits']})
+
+    def get_session_instance_ids(self, url, session_id):
+        data = {'session_id': session_id}
+        ret = requests.get(url, data=json.dumps(data), headers=self.headers)
+        if ret.status_code != 200:
+            raise Exception(ret.text)
+        self.log.info('get_instance_ids %s' % ret.json())
+        return ret.json()['instance_ids']
+
+    def scale_instances(self, number_of_instances):
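+        # Update the Heat stack parameters to add (or, with a negative
+        # number, remove) non-HA instances and verify the new instance count.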
+        number_of_instances_before = self.number_of_instances()
+
+        parameters = self.stack.parameters
+        parameters['nonha_intances'] += number_of_instances
+        self.stack.update(self.stack.stack_name,
+                          self.stack.stack_id,
+                          self.stack.template,
+                          parameters=parameters,
+                          files=self.stack.files)
+
+        number_of_instances_after = self.number_of_instances()
+        if (number_of_instances_before + number_of_instances !=
+           number_of_instances_after):
+            self.log.error('scale_instances with: %d from: %d ends up to: %d'
+                           % (number_of_instances, number_of_instances_before,
+                              number_of_instances_after))
+            raise Exception('scale_instances failed')
+
+        self.log.info('scaled instances from %d to %d' %
+                      (number_of_instances_before,
+                       number_of_instances_after))
+
+    def number_of_instances(self):
+        return len(self.nova.servers.list(detailed=False))
+
+    def run(self):
+        app = Flask('app_manager')
+
+        @app.route('/maintenance', methods=['POST'])
+        def maintenance_alarm():
+            data = json.loads(request.data.decode('utf8'))
+            try:
+                payload = self._alarm_traits_decoder(data)
+            except Exception:
+                payload = ({t[0]: t[2] for t in
+                           data['reason_data']['event']['traits']})
+                self.log.error('cannot parse alarm data: %s' % payload)
+                raise Exception('sample app manager cannot parse alarm. '
+                                'Possibly trait data over 256 chars')
+
+            self.log.info('sample app manager received data = %s' % payload)
+
+            state = payload['state']
+            reply_state = None
+            reply = dict()
+
+            self.log.info('sample app manager state: %s' % state)
+
+            if state == 'MAINTENANCE':
+                instance_ids = (self.get_session_instance_ids(
+                                payload['instance_ids'],
+                                payload['session_id']))
+                reply['instance_ids'] = instance_ids
+                reply_state = 'ACK_MAINTENANCE'
+
+            elif state == 'DOWN_SCALE':
+                # scale down by 2 instances, i.e. by the vCPUs of a single
+                # compute node
+                self.scale_instances(-2)
+                reply['instance_ids'] = self.get_instance_ids()
+                reply_state = 'ACK_DOWN_SCALE'
+
+            elif state == 'MAINTENANCE_COMPLETE':
+                # possibly need to upscale
+                number_of_instances = self.number_of_instances()
+                if self.orig_number_of_instances > number_of_instances:
+                    scale_instances = (self.orig_number_of_instances -
+                                       number_of_instances)
+                    self.scale_instances(scale_instances)
+                reply_state = 'ACK_MAINTENANCE_COMPLETE'
+
+            elif state == 'PREPARE_MAINTENANCE':
+                if "MIGRATE" not in payload['allowed_actions']:
+                    raise Exception('MIGRATE not supported')
+
+                instance_ids = (self.get_session_instance_ids(
+                                payload['instance_ids'],
+                                payload['session_id']))
+                self.log.info('sample app manager got instances: %s' %
+                              instance_ids)
+                instance_actions = dict()
+                for instance_id in instance_ids:
+                    instance_actions[instance_id] = "MIGRATE"
+                    if instance_id == self.active_instance_id:
+                        self.switch_over_ha_instance()
+                reply['instance_actions'] = instance_actions
+                reply_state = 'ACK_PREPARE_MAINTENANCE'
+
+            elif state == 'PLANNED_MAINTENANCE':
+                if "MIGRATE" not in payload['allowed_actions']:
+                    raise Exception('MIGRATE not supported')
+
+                instance_ids = (self.get_session_instance_ids(
+                                payload['instance_ids'],
+                                payload['session_id']))
+                self.log.info('sample app manager got instances: %s' %
+                              instance_ids)
+                instance_actions = dict()
+                for instance_id in instance_ids:
+                    instance_actions[instance_id] = "MIGRATE"
+                    if instance_id == self.active_instance_id:
+                        self.switch_over_ha_instance()
+                reply['instance_actions'] = instance_actions
+                reply_state = 'ACK_PLANNED_MAINTENANCE'
+
+            elif state == 'INSTANCE_ACTION_DONE':
+                self.log.info('%s' % payload['instance_ids'])
+
+            else:
+                raise Exception('sample app manager received event with'
+                                ' unknown state %s' % state)
+
+            if reply_state:
+                reply['session_id'] = payload['session_id']
+                reply['state'] = reply_state
+                url = payload['reply_url']
+                self.log.info('sample app manager reply: %s' % reply)
+                requests.put(url, data=json.dumps(reply), headers=self.headers)
+
+            return 'OK'
+
+        @app.route('/shutdown', methods=['POST'])
+        def shutdown():
+            self.log.info('shutdown app manager server at %s' % time.time())
+            func = request.environ.get('werkzeug.server.shutdown')
+            if func is None:
+                raise RuntimeError('Not running with the Werkzeug Server')
+            func()
+            return 'app manager shutting down...'
+
+        app.run(host="0.0.0.0", port=self.port)
index dc05c0d..cea1f0c 100644 (file)
@@ -11,6 +11,8 @@ import itertools
 from oslo_config import cfg\r
 \r
 from doctor_tests import alarm\r
+from doctor_tests import admin_tool\r
+from doctor_tests import app_manager\r
 from doctor_tests import consumer\r
 from doctor_tests import image\r
 from doctor_tests import instance\r
@@ -30,6 +32,8 @@ def list_opts():
         ('monitor', monitor.OPTS),\r
         ('inspector', inspector.OPTS),\r
         ('consumer', consumer.OPTS),\r
+        ('admin_tool', admin_tool.OPTS),\r
+        ('app_manager', app_manager.OPTS),\r
         ('DEFAULT', itertools.chain(\r
             os_clients.OPTS,\r
             image.OPTS,\r
index 2c66a54..e5a3650 100644 (file)
@@ -21,7 +21,7 @@ OPTS = [
                help='the ip of consumer',
                required=True),
     cfg.IntOpt('port',
-               default='12346',
+               default=12346,
                help='the port of doctor consumer',
                required=True),
 ]
index 7742373..a55a12b 100644 (file)
@@ -13,6 +13,7 @@ import json
 import time
 from threading import Thread
 import requests
+import yaml
 
 from doctor_tests.common import utils
 from doctor_tests.identity_auth import get_identity_auth
@@ -105,6 +106,39 @@ class SampleInspector(BaseInspector):
                 if self.conf.inspector.update_neutron_port_dp_status:
                     thr3.join()
 
+    def _alarm_data_decoder(self, data):
+        if "[" in data or "{" in data:
+            # string to list or dict removing unicode
+            data = yaml.load(data.replace("u'", "'"))
+        return data
+
+    def _alarm_traits_decoder(self, data):
+        return ({str(t[0]): self._alarm_data_decoder(str(t[2]))
+                for t in data['reason_data']['event']['traits']})
+
+    def maintenance(self, data):
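+        # Host maintenance state alarm: the sample inspector only logs that
+        # automatic fault management would be disabled for a host that is
+        # IN_MAINTENANCE and re-enabled on MAINTENANCE_COMPLETE.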
+        try:
+            payload = self._alarm_traits_decoder(data)
+        except Exception:
+            payload = ({t[0]: t[2] for t in
+                       data['reason_data']['event']['traits']})
+            self.log.error('cannot parse alarm data: %s' % payload)
+            raise Exception('sample inspector cannot parse alarm. '
+                            'Possibly trait data over 256 chars')
+        self.log.info('sample inspector received data = %s' % payload)
+
+        state = payload['state']
+        host = payload['host']
+
+        if state == 'IN_MAINTENANCE':
+            self.log.info("sample inspector: disable %s automatic fault "
+                          "management" % host)
+        elif state == 'MAINTENANCE_COMPLETE':
+            self.log.info("sample inspector: enable %s automatic fault "
+                          "management" % host)
+        else:
+            raise("sample inspector couldn't handle state: %s" % state)
+
     @utils.run_async
     def _disable_compute_host(self, hostname):
         self.nova.services.force_down(hostname, 'nova-compute', True)
@@ -173,6 +207,11 @@ class InspectorApp(Thread):
             self.inspector.handle_events(events)
             return "OK"
 
+        @app.route('/maintenance', methods=['POST'])
+        def maintenance():
+            self.inspector.maintenance(request.json)
+            return "OK"
+
         @app.route('/events/shutdown', methods=['POST'])
         def shutdown():
             self.log.info('shutdown inspector app server at %s' % time.time())
index 31fce75..ee44018 100644 (file)
@@ -24,6 +24,10 @@ OPTS = [
                default='root',
                help='the user name for login installer server',
                required=True),
+    cfg.StrOpt('key_file',
+               default=os.environ.get('SSH_KEY', None),
+               help='the key for user to login installer server',
+               required=False),
 ]
 
 
index 1ce3eb6..bfa72d3 100644 (file)
@@ -6,29 +6,36 @@
 # which accompanies this distribution, and is available at
 # http://www.apache.org/licenses/LICENSE-2.0
 ##############################################################################
+import re
+import time
+
 from doctor_tests.common.utils import SSHClient
 from doctor_tests.installer.base import BaseInstaller
 
 
 class ApexInstaller(BaseInstaller):
     node_user_name = 'heat-admin'
-    cm_set_script = 'set_ceilometer.py'
-    cm_restore_script = 'restore_ceilometer.py'
+    cm_set_script = 'set_config.py'
+    cm_set_compute_script = 'set_compute_config.py'
+    cm_restore_script = 'restore_config.py'
+    cm_restore_compute_script = 'restore_compute_config.py'
 
     def __init__(self, conf, log):
         super(ApexInstaller, self).__init__(conf, log)
         self.client = SSHClient(self.conf.installer.ip,
                                 self.conf.installer.username,
+                                key_filename=self.conf.installer.key_file,
                                 look_for_keys=True)
         self.key_file = None
         self.controllers = list()
+        self.computes = list()
         self.controller_clients = list()
+        self.compute_clients = list()
 
     def setup(self):
         self.log.info('Setup Apex installer start......')
-
         self.key_file = self.get_ssh_key_from_installer()
-        self.controllers = self.get_controller_ips()
+        self._get_and_set_ips()
         self.create_flavor()
         self.set_apply_patches()
         self.setup_stunnel()
@@ -42,16 +49,20 @@ class ApexInstaller(BaseInstaller):
         key_path = '/home/stack/.ssh/id_rsa'
         return self._get_ssh_key(self.client, key_path)
 
-    def get_controller_ips(self):
-        self.log.info('Get controller ips from Apex installer......')
-
-        command = "source stackrc; " \
-                  "nova list | grep ' overcloud-controller-[0-9] ' " \
-                  "| sed -e 's/^.*ctlplane=//' |awk '{print $1}'"
-        controllers = self._run_cmd_remote(self.client, command)
-        self.log.info('Get controller_ips:%s from Apex installer'
-                      % controllers)
-        return controllers
+    def _get_and_set_ips(self):
+        self.log.info('Get controller and compute ips from Apex installer'
+                      '......')
+
+        command = "source stackrc; nova list | grep ' overcloud-'"
+        raw_ips_list = self._run_cmd_remote(self.client, command)
+        for line in raw_ips_list:
+            ip = line.split('ctlplane=', 1)[1].split(" ", 1)[0]
+            if 'overcloud-controller-' in line:
+                self.controllers.append(ip)
+            elif 'overcloud-novacompute-' in line:
+                self.computes.append(ip)
+        self.log.info('controller_ips:%s' % self.controllers)
+        self.log.info('compute_ips:%s' % self.computes)
 
     def get_host_ip_from_hostname(self, hostname):
         self.log.info('Get host ip by hostname=%s from Apex installer......'
@@ -62,12 +73,31 @@ class ApexInstaller(BaseInstaller):
         host_ips = self._run_cmd_remote(self.client, command)
         return host_ips[0]
 
+    def get_transport_url(self):
+        client = SSHClient(self.controllers[0], self.node_user_name,
+                           key_filename=self.key_file)
+
+        command = 'sudo grep "^transport_url" /etc/nova/nova.conf'
+        ret, url = client.ssh(command)
+        if ret:
+            raise Exception('Failed to get transport_url from controller'
+                            '(%s) in Apex installer, ret=%s, output=%s'
+                            % (self.controllers[0], ret, url))
+        # need to use ip instead of hostname
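+        # e.g. rabbit://user:pass@controller-hostname:5672/ would become
+        # rabbit://user:pass@<controller ip>:5672/ (illustrative values only)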
+        ret = (re.sub("@.*:", "@%s:" % self.controllers[0],
+               url[0].split("=", 1)[1]))
+        self.log.debug('get_transport_url %s' % ret)
+        return ret
+
     def set_apply_patches(self):
         self.log.info('Set apply patches start......')
 
         restart_cm_cmd = 'sudo systemctl restart ' \
                          'openstack-ceilometer-notification.service'
 
+        if self.conf.test_case != 'fault_management':
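+            # the maintenance test also patches nova.conf on the controllers
+            # (set_config.py), so nova-scheduler needs a restart as well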
+            restart_cm_cmd += ' openstack-nova-scheduler.service'
+
         for node_ip in self.controllers:
             client = SSHClient(node_ip, self.node_user_name,
                                key_filename=self.key_file)
@@ -76,13 +106,38 @@ class ApexInstaller(BaseInstaller):
                                     restart_cm_cmd,
                                     self.cm_set_script)
 
+        if self.conf.test_case != 'fault_management':
+            restart_cm_cmd = 'sudo systemctl restart ' \
+                             'openstack-nova-compute.service'
+            for node_ip in self.computes:
+                client = SSHClient(node_ip, self.node_user_name,
+                                   key_filename=self.key_file)
+                self.compute_clients.append(client)
+                self._run_apply_patches(client,
+                                        restart_cm_cmd,
+                                        self.cm_set_compute_script)
+            time.sleep(10)
+
     def restore_apply_patches(self):
         self.log.info('restore apply patches start......')
 
         restart_cm_cmd = 'sudo systemctl restart ' \
                          'openstack-ceilometer-notification.service'
 
+        if self.conf.test_case != 'fault_management':
+            restart_cm_cmd += ' openstack-nova-scheduler.service'
+
         for client in self.controller_clients:
             self._run_apply_patches(client,
                                     restart_cm_cmd,
                                     self.cm_restore_script)
+
+        if self.conf.test_case != 'fault_management':
+            restart_cm_cmd = 'sudo systemctl restart ' \
+                             'openstack-nova-compute.service'
+            for client in self.compute_clients:
+                self._run_apply_patches(client,
+                                        restart_cm_cmd,
+                                        self.cm_restore_compute_script)
index 76bbeb1..4eed3f2 100644 (file)
@@ -58,22 +58,33 @@ class BaseInstaller(object):
     def setup_stunnel(self):
         self.log.info('Setup ssh stunnel in %s installer......'
                       % self.conf.installer.type)
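+        # the reverse ssh tunnels must stay up for the whole test run, so
+        # their lifetime depends on which test cases will be executed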
+        tunnels = [self.conf.consumer.port]
+        if self.conf.test_case == 'maintenance':
+            tunnel_uptime = 1200
+            tunnels += [self.conf.app_manager.port, self.conf.inspector.port]
+        elif self.conf.test_case == 'all':
+            tunnel_uptime = 1800
+            tunnels += [self.conf.app_manager.port, self.conf.inspector.port]
+        else:
+            tunnel_uptime = 600
 
         for node_ip in self.controllers:
-            cmd = ("ssh -o UserKnownHostsFile=/dev/null"
-                   " -o StrictHostKeyChecking=no"
-                   " -i %s %s@%s -R %s:localhost:%s"
-                   " sleep 600 > ssh_tunnel.%s.log"
-                   " 2>&1 < /dev/null &"
-                   % (self.key_file,
-                      self.node_user_name,
-                      node_ip,
-                      self.conf.consumer.port,
-                      self.conf.consumer.port,
-                      node_ip))
-            server = subprocess.Popen(cmd, shell=True)
-            self.servers.append(server)
-            server.communicate()
+            for port in tunnels:
+                self.log.info('tunnel for port %s' % port)
+                cmd = ("ssh -o UserKnownHostsFile=/dev/null"
+                       " -o StrictHostKeyChecking=no"
+                       " -i %s %s@%s -R %s:localhost:%s"
+                       " sleep %s > ssh_tunnel.%s"
+                       " 2>&1 < /dev/null "
+                       % (self.key_file,
+                          self.node_user_name,
+                          node_ip,
+                          port,
+                          port,
+                          tunnel_uptime,
+                          node_ip))
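+                # 'exec' makes the shell replace itself with ssh, so the
+                # stored Popen handle refers to the tunnel process itself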
+                server = subprocess.Popen('exec ' + cmd, shell=True)
+                self.servers.append(server)
 
     def _get_ssh_key(self, client, key_path):
         self.log.info('Get SSH keys from %s installer......'
diff --git a/doctor_tests/installer/common/restore_compute_config.py b/doctor_tests/installer/common/restore_compute_config.py
new file mode 100644 (file)
index 0000000..0971d12
--- /dev/null
@@ -0,0 +1,25 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+
+
+def restore_cpu_allocation_ratio():
+    nova_file = '/etc/nova/nova.conf'
+    nova_file_bak = '/etc/nova/nova.bak'
+
+    if not os.path.isfile(nova_file_bak):
+        print('Backup file %s does not exist.' % nova_file_bak)
+    else:
+        print('restore: %s' % nova_file)
+        shutil.copyfile(nova_file_bak, nova_file)
+        os.remove(nova_file_bak)
+    return
+
+restore_cpu_allocation_ratio()
@@ -24,4 +24,32 @@ def restore_ep_config():
     return
 
 
+def restore_ed_config():
+
+    ed_file = '/etc/ceilometer/event_definitions.yaml'
+    ed_file_bak = '/etc/ceilometer/event_definitions.bak'
+
+    if not os.path.isfile(ed_file_bak):
+        print("Bak_file doesn't exist: %s." % ed_file_bak)
+    else:
+        print('restore: %s' % ed_file)
+        shutil.copyfile(ed_file_bak, ed_file)
+        os.remove(ed_file_bak)
+    return
+
+
+def restore_cpu_allocation_ratio():
+    nova_file = '/etc/nova/nova.conf'
+    nova_file_bak = '/etc/nova/nova.bak'
+
+    if not os.path.isfile(nova_file_bak):
+        print('Backup file %s does not exist.' % nova_file_bak)
+    else:
+        print('restore: %s' % nova_file)
+        shutil.copyfile(nova_file_bak, nova_file)
+        os.remove(nova_file_bak)
+    return
+
 restore_ep_config()
+restore_ed_config()
+restore_cpu_allocation_ratio()
diff --git a/doctor_tests/installer/common/set_ceilometer.py b/doctor_tests/installer/common/set_ceilometer.py
deleted file mode 100644 (file)
index 4050aae..0000000
+++ /dev/null
@@ -1,45 +0,0 @@
-##############################################################################
-# Copyright (c) 2017 ZTE Corporation and others.
-#
-# All rights reserved. This program and the accompanying materials
-# are made available under the terms of the Apache License, Version 2.0
-# which accompanies this distribution, and is available at
-# http://www.apache.org/licenses/LICENSE-2.0
-##############################################################################
-import os
-import shutil
-import yaml
-
-ep_file = '/etc/ceilometer/event_pipeline.yaml'
-ep_file_bak = '/etc/ceilometer/event_pipeline.yaml.bak'
-event_notifier_topic = 'notifier://?topic=alarm.all'
-
-
-def set_notifier_topic():
-    config_modified = False
-
-    if not os.path.isfile(ep_file):
-        raise Exception("File doesn't exist: %s." % ep_file)
-
-    with open(ep_file, 'r') as file:
-        config = yaml.safe_load(file)
-
-    sinks = config['sinks']
-    for sink in sinks:
-        if sink['name'] == 'event_sink':
-            publishers = sink['publishers']
-            if event_notifier_topic not in publishers:
-                print('Add event notifier in ceilometer')
-                publishers.append(event_notifier_topic)
-                config_modified = True
-            else:
-                print('NOTE: event notifier is configured'
-                      'in ceilometer as we needed')
-
-    if config_modified:
-        shutil.copyfile(ep_file, ep_file_bak)
-        with open(ep_file, 'w+') as file:
-            file.write(yaml.safe_dump(config))
-
-
-set_notifier_topic()
diff --git a/doctor_tests/installer/common/set_compute_config.py b/doctor_tests/installer/common/set_compute_config.py
new file mode 100644 (file)
index 0000000..07db1e1
--- /dev/null
@@ -0,0 +1,48 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+
+
+def set_cpu_allocation_ratio():
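+    # pin cpu_allocation_ratio to 1.0 so vcpus are not overcommitted; the
+    # maintenance test flavor is sized to half a compute node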
+    nova_file = '/etc/nova/nova.conf'
+    nova_file_bak = '/etc/nova/nova.bak'
+
+    if not os.path.isfile(nova_file):
+        raise Exception("File doesn't exist: %s." % nova_file)
+    # TODO (tojuvone): Unfortunately ConfigParser did not produce working conf
+    change = False
+    found = False
+    with open(nova_file) as fcheck:
+        found_list = [ca for ca in fcheck.readlines()
+                      if "cpu_allocation_ratio" in ca]
+    if found_list:
+        for car in found_list:
+            if car.startswith('#'):
+                continue
+            if car.startswith('cpu_allocation_ratio'):
+                found = True
+                if "1.0" not in car.split('=')[1]:
+                    change = True
+    if not found or change:
+        # need to add or change
+        shutil.copyfile(nova_file, nova_file_bak)
+        fin = open(nova_file_bak)
+        fout = open(nova_file, "wt")
+        for line in fin:
+            if change and line.startswith("cpu_allocation_ratio"):
+                line = "cpu_allocation_ratio=1.0"
+            if not found and line.startswith("[DEFAULT]"):
+                line += "cpu_allocation_ratio=1.0\n"
+            fout.write(line)
+        fin.close()
+        fout.close()
+
+set_cpu_allocation_ratio()
diff --git a/doctor_tests/installer/common/set_config.py b/doctor_tests/installer/common/set_config.py
new file mode 100644 (file)
index 0000000..4246524
--- /dev/null
@@ -0,0 +1,139 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import shutil
+import yaml
+
+ep_file = '/etc/ceilometer/event_pipeline.yaml'
+ep_file_bak = '/etc/ceilometer/event_pipeline.yaml.bak'
+event_notifier_topic = 'notifier://?topic=alarm.all'
+
+
+def set_notifier_topic():
+    config_modified = False
+
+    if not os.path.isfile(ep_file):
+        raise Exception("File doesn't exist: %s." % ep_file)
+
+    with open(ep_file, 'r') as file:
+        config = yaml.safe_load(file)
+
+    sinks = config['sinks']
+    for sink in sinks:
+        if sink['name'] == 'event_sink':
+            publishers = sink['publishers']
+            if event_notifier_topic not in publishers:
+                print('Add event notifier in ceilometer')
+                publishers.append(event_notifier_topic)
+                config_modified = True
+            else:
+                print('NOTE: event notifier is already configured '
+                      'in ceilometer as needed')
+
+    if config_modified:
+        shutil.copyfile(ep_file, ep_file_bak)
+        with open(ep_file, 'w+') as file:
+            file.write(yaml.safe_dump(config))
+
+
+def set_maintenance_event_definitions():
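+    # expose the maintenance notification payload fields as Ceilometer event
+    # traits so the Aodh event alarms defined in the maintenance HOT template
+    # can forward them to the app manager and inspector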
+    ed_file = '/etc/ceilometer/event_definitions.yaml'
+    ed_file_bak = '/etc/ceilometer/event_definitions.bak'
+
+    if not os.path.isfile(ed_file):
+        raise Exception("File doesn't exist: %s." % ed_file)
+
+    with open(ed_file, 'r') as file:
+        config = yaml.safe_load(file)
+
+    et_list = [et['event_type'] for et in config]
+
+    if 'maintenance.scheduled' in et_list:
+        add_mscheduled = False
+        print('NOTE: maintenance.scheduled already configured')
+    else:
+        print('NOTE: add maintenance.scheduled to event_definitions.yaml')
+        add_mscheduled = True
+        mscheduled = {
+            'event_type': 'maintenance.scheduled',
+            'traits': {
+                'allowed_actions': {'fields': 'payload.allowed_actions'},
+                'instance_ids': {'fields': 'payload.instance_ids'},
+                'reply_url': {'fields': 'payload.reply_url'},
+                'actions_at': {'fields': 'payload.actions_at',
+                               'type': 'datetime'},
+                'state': {'fields': 'payload.state'},
+                'session_id': {'fields': 'payload.session_id'},
+                'project_id': {'fields': 'payload.project_id'},
+                'metadata': {'fields': 'payload.metadata'}
+            }
+        }
+        config.append(mscheduled)
+
+    if 'maintenance.host' in et_list:
+        add_mhost = False
+        print('NOTE: maintenance.host already configured')
+    else:
+        print('NOTE: add maintenance.host to event_definitions.yaml')
+        add_mhost = True
+        mhost = {
+            'event_type': 'maintenance.host',
+            'traits': {
+                'host': {'fields': 'payload.host'},
+                'project_id': {'fields': 'payload.project_id'},
+                'state': {'fields': 'payload.state'},
+                'session_id': {'fields': 'payload.session_id'}
+            }
+        }
+        config.append(mhost)
+
+    if add_mscheduled or add_mhost:
+        shutil.copyfile(ed_file, ed_file_bak)
+        with open(ed_file, 'w+') as file:
+            file.write(yaml.safe_dump(config))
+
+
+def set_cpu_allocation_ratio():
+    nova_file = '/etc/nova/nova.conf'
+    nova_file_bak = '/etc/nova/nova.bak'
+
+    if not os.path.isfile(nova_file):
+        raise Exception("File doesn't exist: %s." % nova_file)
+    # TODO (tojuvone): Unfortunately ConfigParser did not produce working conf
+    change = False
+    found = False
+    with open(nova_file) as fcheck:
+        found_list = [ca for ca in fcheck.readlines()
+                      if "cpu_allocation_ratio" in ca]
+    if found_list:
+        for car in found_list:
+            if car.startswith('#'):
+                continue
+            if car.startswith('cpu_allocation_ratio'):
+                found = True
+                if "1.0" not in car.split('=')[1]:
+                    change = True
+    if not found or change:
+        # need to add or change
+        shutil.copyfile(nova_file, nova_file_bak)
+        fin = open(nova_file_bak)
+        fout = open(nova_file, "wt")
+        for line in fin:
+            if change and line.startswith("cpu_allocation_ratio"):
+                line = "cpu_allocation_ratio=1.0"
+            if not found and line.startswith("[DEFAULT]"):
+                line += "cpu_allocation_ratio=1.0\n"
+            fout.write(line)
+        fin.close()
+        fout.close()
+
+set_notifier_topic()
+set_maintenance_event_definitions()
+set_cpu_allocation_ratio()
index 8ba9f00..e7e41db 100644 (file)
@@ -22,7 +22,8 @@ class McpInstaller(BaseInstaller):
         self.key_file = self.get_ssh_key_from_installer()
         self.client = SSHClient(self.conf.installer.ip,
                                 self.node_user_name,
-                                key_filename=self.key_file)
+                                key_filename=self.key_file,
+                                look_for_keys=True)
         self.controllers = list()
         self.controller_clients = list()
 
index 61facb6..438d832 100644 (file)
@@ -10,6 +10,7 @@ import os
 from os.path import isfile, join
 import sys
 import time
+from traceback import format_exc
 
 from doctor_tests import config
 from doctor_tests.identity_auth import get_identity_auth
@@ -17,8 +18,9 @@ from doctor_tests.identity_auth import get_session
 from doctor_tests.image import Image
 from doctor_tests.installer import get_installer
 import doctor_tests.logger as doctor_log
-from doctor_tests.os_clients import nova_client
 from doctor_tests.scenario.fault_management import FaultManagement
+from doctor_tests.os_clients import nova_client
+from doctor_tests.scenario.maintenance import Maintenance
 from doctor_tests.user import User
 
 
@@ -67,7 +69,7 @@ class DoctorTest(object):
             # injecting host failure...
             # NOTE (umar) add INTERFACE_NAME logic to host injection
             self.fault_management.start()
-            time.sleep(10)
+            time.sleep(30)
 
             # verify the test results
             # NOTE (umar) copy remote monitor.log file when monitor=collectd
@@ -92,20 +94,42 @@ class DoctorTest(object):
             LOG.info('not enough compute nodes, skipping doctor '
                      'maintenance test')
             return
+        elif self.conf.installer.type != 'apex':
+            LOG.info('not supported installer, skipping doctor '
+                     'maintenance test')
+            return
         try:
             LOG.info('doctor maintenance test starting.......')
-            # TODO (tojuvone) test setup and actual test
+            transport_url = self.installer.get_transport_url()
+            maintenance = Maintenance(transport_url, self.conf, LOG)
+            maintenance.setup_maintenance(self.user)
+
+            # wait for the aodh event alarms to be loaded into the event
+            # evaluator's alarm cache; the sleep must be longer than
+            # event_alarm_cache_ttl (default 60s)
+            LOG.info('wait aodh for 120s.......')
+            time.sleep(120)
+
+            session_id = maintenance.start_maintenance()
+            maintenance.wait_maintenance_complete(session_id)
+
+            LOG.info('doctor maintenance complete.......')
+
         except Exception as e:
             LOG.error('doctor maintenance test failed, Exception=%s' % e)
+            LOG.error(format_exc())
             sys.exit(1)
-        # TODO (tojuvone) finally: test case specific cleanup
+        finally:
+            maintenance.cleanup_maintenance()
 
     def run(self):
         """run doctor tests"""
         try:
             LOG.info('doctor test starting.......')
+
             # prepare common test env
             self.setup()
+
             if self.conf.test_case == 'all':
                 self.test_fault_management()
                 self.test_maintenance()
diff --git a/doctor_tests/maintenance_hot_tpl.yaml b/doctor_tests/maintenance_hot_tpl.yaml
new file mode 100644 (file)
index 0000000..e2e4702
--- /dev/null
@@ -0,0 +1,119 @@
+---
+heat_template_version: 2017-02-24
+description: Doctor Maintenance test case
+
+parameters:
+  ext_net:
+    type: string
+    default: external
+  flavor_vcpus:
+    type: number
+    default: 24
+  maint_image:
+    type: string
+    default: cirros
+  ha_instances:
+    type: number
+    default: 2
+  nonha_instances:
+    type: number
+    default: 4
+  app_manager_alarm_url:
+    type: string
+    default: http://0.0.0.0:12348/maintenance
+  inspector_alarm_url:
+    type: string
+    default: http://0.0.0.0:12345/maintenance
+
+
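+# The stack creates a tenant network, a dedicated flavor, anti-affinity "HA"
+# instances and plain instances, plus Aodh event alarms that notify the app
+# manager and inspector maintenance endpoints.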
+resources:
+  int_net:
+    type: OS::Neutron::Net
+
+  int_subnet:
+    type: OS::Neutron::Subnet
+    properties:
+      network_id: {get_resource: int_net}
+      cidr: "9.9.9.0/24"
+      dns_nameservers: ["8.8.8.8"]
+      ip_version: 4
+
+  int_router:
+    type: OS::Neutron::Router
+    properties:
+      external_gateway_info: {network: {get_param: ext_net}}
+
+  int_interface:
+    type: OS::Neutron::RouterInterface
+    properties:
+      router_id: {get_resource: int_router}
+      subnet: {get_resource: int_subnet}
+
+  maint_instance_flavor:
+    type: OS::Nova::Flavor
+    properties:
+      name: doctor_maint_flavor
+      ram: 512
+      vcpus: {get_param: flavor_vcpus}
+      disk: 1
+
+  ha_app_svrgrp:
+    type: OS::Nova::ServerGroup
+    properties:
+      name: doctor_ha_app_group
+      policies: ['anti-affinity']
+
+  floating_ip:
+    type: OS::Nova::FloatingIP
+    properties:
+      pool: {get_param: ext_net}
+
+  multi_ha_instances:
+    type: OS::Heat::ResourceGroup
+    properties:
+      count: {get_param: ha_instances}
+      resource_def:
+        type: OS::Nova::Server
+        properties:
+          name: doctor_ha_app_%index%
+          flavor: {get_resource: maint_instance_flavor}
+          image: {get_param: maint_image}
+          networks:
+            - network: {get_resource: int_net}
+          scheduler_hints:
+            group: {get_resource: ha_app_svrgrp}
+
+  multi_nonha_instances:
+    type: OS::Heat::ResourceGroup
+    properties:
+      count: {get_param: nonha_instances}
+      resource_def:
+        type: OS::Nova::Server
+        properties:
+          name: doctor_nonha_app_%index%
+          flavor: {get_resource: maint_instance_flavor}
+          image: {get_param: maint_image}
+          networks:
+            - network: {get_resource: int_net}
+
+  association:
+    type: OS::Nova::FloatingIPAssociation
+    properties:
+      floating_ip: {get_resource: floating_ip}
+      server_id: {get_attr: [multi_ha_instances, resource.0]}
+
+  app_manager_alarm:
+    type: OS::Aodh::EventAlarm
+    properties:
+      alarm_actions:
+        - {get_param: app_manager_alarm_url}
+      event_type: "maintenance.scheduled"
+      repeat_actions: true
+
+  inspector_alarm:
+    type: OS::Aodh::EventAlarm
+    properties:
+      alarm_actions:
+        - {get_param: inspector_alarm_url}
+      event_type: "maintenance.host"
+      repeat_actions: true
index 640281d..7ab4e9b 100644 (file)
@@ -11,6 +11,7 @@ from oslo_config import cfg
 import aodhclient.client as aodhclient\r
 from congressclient.v1 import client as congressclient\r
 import glanceclient.client as glanceclient\r
+import heatclient.client as heatclient\r
 from keystoneclient import client as ks_client\r
 from neutronclient.v2_0 import client as neutronclient\r
 import novaclient.client as novaclient\r
@@ -23,6 +24,7 @@ OPTS = [
     cfg.StrOpt('aodh_version', default='2', help='aodh version'),\r
     cfg.StrOpt('vitrage_version', default='1', help='vitrage version'),\r
     cfg.StrOpt('keystone_version', default='v3', help='keystone version'),\r
+    cfg.StrOpt('heat_version', default='1', help='heat version'),\r
 ]\r
 \r
 \r
@@ -31,6 +33,11 @@ def glance_client(version, session):
                                session=session)\r
 \r
 \r
+def heat_client(version, session):\r
+    return heatclient.Client(version=version,\r
+                             session=session)\r
+\r
+\r
 def keystone_client(version, session):\r
     return ks_client.Client(version=version,\r
                             session=session)\r
index b1fe809..f8f53e8 100644 (file)
@@ -32,7 +32,7 @@ dev=$(sudo ip a | awk '/ {compute_ip}\//{{print $NF}}')
 sleep 1
 sudo ip link set $dev down
 echo "doctor set link down at" $(date "+%s.%N")
-sleep 10
+sleep 30
 sudo ip link set $dev up
 sleep 1
 """
diff --git a/doctor_tests/scenario/maintenance.py b/doctor_tests/scenario/maintenance.py
new file mode 100644 (file)
index 0000000..54244d7
--- /dev/null
@@ -0,0 +1,192 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import datetime
+import json
+import requests
+import time
+
+from doctor_tests.admin_tool import get_admin_tool
+from doctor_tests.app_manager import get_app_manager
+from doctor_tests.common.utils import get_doctor_test_root_dir
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.inspector import get_inspector
+from doctor_tests.os_clients import keystone_client
+from doctor_tests.os_clients import neutron_client
+from doctor_tests.os_clients import nova_client
+from doctor_tests.stack import Stack
+
+
+class Maintenance(object):
+
+    def __init__(self, transport_url, conf, log):
+        self.conf = conf
+        self.log = log
+        self.keystone = keystone_client(
+            self.conf.keystone_version, get_session())
+        self.nova = nova_client(conf.nova_version, get_session())
+        auth = get_identity_auth(project=self.conf.doctor_project)
+        self.neutron = neutron_client(get_session(auth=auth))
+        self.stack = Stack(self.conf, self.log)
+        self.admin_tool = get_admin_tool(transport_url, self.conf, self.log)
+        self.app_manager = get_app_manager(self.stack, self.conf, self.log)
+        self.inspector = get_inspector(self.conf, self.log)
+
+    def get_external_network(self):
+        ext_net = None
+        networks = self.neutron.list_networks()['networks']
+        for network in networks:
+            if network['router:external']:
+                ext_net = network['name']
+                break
+        if ext_net is None:
+            raise Exception("external network not defined")
+        return ext_net
+
+    def setup_maintenance(self, user):
+        # each hypervisor needs to have same amount of vcpus and they
+        # need to be free before test
+        hvisors = self.nova.hypervisors.list(detailed=True)
+        prev_vcpus = 0
+        prev_hostname = ''
+        self.log.info('checking hypervisors.......')
+        for hvisor in hvisors:
+            vcpus = getattr(hvisor, 'vcpus')
+            vcpus_used = getattr(hvisor, 'vcpus_used')
+            hostname = getattr(hvisor, 'hypervisor_hostname')
+            if vcpus < 2:
+                raise Exception('not enough vcpus (%d) on %s' %
+                                (vcpus, hostname))
+            if vcpus_used > 0:
+                raise Exception('%d vcpus used on %s'
+                                % (vcpus_used, hostname))
+            if prev_vcpus != 0 and prev_vcpus != vcpus:
+                raise Exception('%d vcpus on %s does not match '
+                                '%d vcpus on %s'
+                                % (vcpus, hostname,
+                                   prev_vcpus, prev_hostname))
+            prev_vcpus = vcpus
+            prev_hostname = hostname
+
+        # maintenance flavor made so that 2 instances take whole node
+        flavor_vcpus = int(vcpus / 2)
+        compute_nodes = len(hvisors)
+        amount_actstdby_instances = 2
+        amount_noredundancy_instances = 2 * compute_nodes - 2
+        self.log.info('testing %d computes with %d vcpus each'
+                      % (compute_nodes, vcpus))
+        self.log.info('testing %d actstdby and %d noredundancy instances'
+                      % (amount_actstdby_instances,
+                         amount_noredundancy_instances))
+        max_instances = (amount_actstdby_instances +
+                         amount_noredundancy_instances)
+        max_cores = compute_nodes * vcpus
+
+        user.update_quota(max_instances, max_cores)
+
+        test_dir = get_doctor_test_root_dir()
+        template_file = '{0}/{1}'.format(test_dir, 'maintenance_hot_tpl.yaml')
+        files, template = self.stack.get_hot_tpl(template_file)
+
+        ext_net = self.get_external_network()
+
+        parameters = {'ext_net': ext_net,
+                      'flavor_vcpus': flavor_vcpus,
+                      'maint_image': self.conf.image_name,
+                      'nonha_instances': amount_noredundancy_instances,
+                      'ha_instances': amount_actstdby_instances}
+
+        self.log.info('creating maintenance stack.......')
+        self.log.info('parameters: %s' % parameters)
+
+        self.stack.create('doctor_test_maintenance',
+                          template,
+                          parameters=parameters,
+                          files=files)
+
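+        # start the sample admin tool, app manager and inspector endpoints
+        # that drive and react to the maintenance workflow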
+        self.admin_tool.start()
+        self.app_manager.start()
+        self.inspector.start()
+
+    def start_maintenance(self):
+        self.log.info('start maintenance.......')
+        hvisors = self.nova.hypervisors.list(detailed=True)
+        maintenance_hosts = list()
+        for hvisor in hvisors:
+            hostname = getattr(hvisor, 'hypervisor_hostname')
+            maintenance_hosts.append(hostname)
+
+        url = 'http://0.0.0.0:%s/maintenance' % self.conf.admin_tool.port
+        # start maintenance 20s from now, so projects have time to ACK it
+        # before the session begins
+        maintenance_at = (datetime.datetime.utcnow() +
+                          datetime.timedelta(seconds=20)
+                          ).strftime('%Y-%m-%d %H:%M:%S')
+        data = {'hosts': maintenance_hosts,
+                'state': 'MAINTENANCE',
+                'maintenance_at': maintenance_at,
+                'metadata': {'openstack_version': 'Pike'}}
+        headers = {
+            'Content-Type': 'application/json',
+            'Accept': 'application/json'}
+
+        ret = requests.post(url, data=json.dumps(data), headers=headers)
+        if ret.status_code != 200:
+            raise Exception(ret.text)
+        return ret.json()['session_id']
+
+    def remove_maintenance_session(self, session_id):
+        self.log.info('remove maintenance session %s.......' % session_id)
+
+        url = 'http://0.0.0.0:%s/maintenance' % self.conf.admin_tool.port
+
+        data = {'state': 'REMOVE_MAINTENANCE_SESSION',
+                'session_id': session_id}
+        headers = {
+            'Content-Type': 'application/json',
+            'Accept': 'application/json'}
+
+        ret = requests.post(url, data=json.dumps(data), headers=headers)
+        if ret.status_code != 200:
+            raise Exception(ret.text)
+
+    def get_maintenance_state(self, session_id):
+        url = 'http://0.0.0.0:%s/maintenance' % self.conf.admin_tool.port
+        data = {'session_id': session_id}
+        headers = {
+            'Content-Type': 'application/json',
+            'Accept': 'application/json'}
+        ret = requests.get(url, data=json.dumps(data), headers=headers)
+        if ret.status_code != 200:
+            raise Exception(ret.text)
+        return ret.json()['state']
+
+    def wait_maintenance_complete(self, session_id):
+        retries = 60
+        state = None
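+        # rolling maintenance takes a while: wait 10min before polling, then
+        # poll every 10s for up to another 10min (20min in total)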
+        time.sleep(600)
+        while state != 'MAINTENANCE_COMPLETE' and retries > 0:
+            time.sleep(10)
+            state = self.get_maintenance_state(session_id)
+            retries = retries - 1
+        if retries == 0 and state != 'MAINTENANCE_COMPLETE':
+            raise Exception('maintenance %s not completed within 20min, status'
+                            ' %s' % (session_id, state))
+        elif state == 'MAINTENANCE_COMPLETE':
+            self.log.info('maintenance %s %s' % (session_id, state))
+            self.remove_maintenance_session(session_id)
+        elif state == 'MAINTENANCE_FAILED':
+            raise Exception('maintenance %s failed' % session_id)
+
+    def cleanup_maintenance(self):
+        self.admin_tool.stop()
+        self.app_manager.stop()
+        self.inspector.stop()
+        self.log.info('stack delete start.......')
+        self.stack.delete()
diff --git a/doctor_tests/stack.py b/doctor_tests/stack.py
new file mode 100644 (file)
index 0000000..688c205
--- /dev/null
@@ -0,0 +1,106 @@
+##############################################################################
+# Copyright (c) 2018 Nokia Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import time
+
+from heatclient.common.template_utils import get_template_contents
+from heatclient import exc as heat_exception
+
+from doctor_tests.identity_auth import get_identity_auth
+from doctor_tests.identity_auth import get_session
+from doctor_tests.os_clients import heat_client
+
+
+class Stack(object):
+
+    def __init__(self, conf, log):
+        self.conf = conf
+        self.log = log
+        auth = get_identity_auth(project=self.conf.doctor_project)
+        self.heat = heat_client(self.conf.heat_version,
+                                get_session(auth=auth))
+        self.stack_name = None
+        self.stack_id = None
+        self.template = None
+        self.parameters = {}
+        self.files = {}
+
+    # plain yaml.load does not work for HOT templates because
+    # heat_template_version is parsed as a date, not a string
+    def get_hot_tpl(self, template_file):
+        if not os.path.isfile(template_file):
+            raise Exception('File(%s) does not exist' % template_file)
+        return get_template_contents(template_file=template_file)
+
+    def _wait_stack_action_complete(self, action):
+        action_in_progress = '%s_IN_PROGRESS' % action
+        action_complete = '%s_COMPLETE' % action
+        action_failed = '%s_FAILED' % action
+
+        status = action_in_progress
+        stack_retries = 150
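+        # 150 retries with a 2s sleep gives the 5min timeout reported below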
+        while status == action_in_progress and stack_retries > 0:
+            time.sleep(2)
+            try:
+                stack = self.heat.stacks.get(self.stack_name)
+            except heat_exception.HTTPNotFound:
+                if action == 'DELETE':
+                    # the stack may disappear before a DELETE_COMPLETE
+                    # status is ever returned, so treat not-found as done
+                    status = action_complete
+                    break
+                else:
+                    raise Exception('unable to get stack')
+            status = stack.stack_status
+            stack_retries = stack_retries - 1
+        if stack_retries == 0 and status != action_complete:
+            raise Exception("stack %s not completed within 5min, status:"
+                            " %s" % (action, status))
+        elif status == action_complete:
+            self.log.info('stack %s %s' % (self.stack_name, status))
+        elif status == action_failed:
+            raise Exception("stack %s failed" % action)
+        else:
+            self.log.error('stack %s %s' % (self.stack_name, status))
+            raise Exception("stack %s unknown result" % action)
+
+    def wait_stack_delete(self):
+        self._wait_stack_action_complete('DELETE')
+
+    def wait_stack_create(self):
+        self._wait_stack_action_complete('CREATE')
+
+    def wait_stack_update(self):
+        self._wait_stack_action_complete('UPDATE')
+
+    def create(self, stack_name, template, parameters={}, files={}):
+        self.stack_name = stack_name
+        self.template = template
+        self.parameters = parameters
+        self.files = files
+        stack = self.heat.stacks.create(stack_name=self.stack_name,
+                                        files=files,
+                                        template=template,
+                                        parameters=parameters)
+        self.stack_id = stack['stack']['id']
+        self.wait_stack_create()
+
+    def update(self, stack_name, stack_id, template, parameters={}, files={}):
+        self.heat.stacks.update(stack_name=stack_name,
+                                stack_id=stack_id,
+                                files=files,
+                                template=template,
+                                parameters=parameters)
+        self.wait_stack_update()
+
+    def delete(self):
+        if self.stack_id is not None:
+            self.heat.stacks.delete(self.stack_name)
+            self.wait_stack_delete()
+        else:
+            self.log.info('no stack to delete')
index fee3e1f..29aa004 100644 (file)
@@ -8,12 +8,12 @@
 ##############################################################################
 import os
 
+from keystoneclient import exceptions as ks_exceptions
 from oslo_config import cfg
 
 from doctor_tests.identity_auth import get_session
 from doctor_tests.os_clients import keystone_client
 from doctor_tests.os_clients import nova_client
-from keystoneclient import exceptions as ks_exceptions
 
 
 OPTS = [
@@ -53,10 +53,11 @@ class User(object):
     def __init__(self, conf, log):
         self.conf = conf
         self.log = log
+        self.def_quota = None
+        self.restore_def_quota = False
         self.keystone = keystone_client(
             self.conf.keystone_version, get_session())
-        self.nova = \
-            nova_client(conf.nova_version, get_session())
+        self.nova = nova_client(conf.nova_version, get_session())
         self.users = {}
         self.projects = {}
         self.roles = {}
@@ -83,10 +84,9 @@ class User(object):
                              domain=self.conf.doctor_domain_id)}
         if self.conf.doctor_project not in self.projects:
             self.log.info('create project......')
-            test_project = \
-                self.keystone.projects.create(
-                    self.conf.doctor_project,
-                    self.conf.doctor_domain_id)
+            test_project = self.keystone.projects.create(
+                self.conf.doctor_project,
+                self.conf.doctor_domain_id)
             self.projects[test_project.name] = test_project
         else:
             self.log.info('project %s already created......'
@@ -151,6 +151,13 @@ class User(object):
             self.keystone.roles.grant(role, user=user, project=project)
             roles_for_user[role_name] = role
 
+    def _restore_default_quota(self):
+        if self.def_quota is not None and self.restore_def_quota:
+            self.log.info('restore default quota......')
+            self.nova.quota_classes.update('default',
+                                           instances=self.def_quota.instances,
+                                           cores=self.def_quota.cores)
+
     def delete(self):
         """delete the test user, project and role"""
         self.log.info('user delete start......')
@@ -159,6 +166,8 @@ class User(object):
         user = self.users.get(self.conf.doctor_user)
         role = self.roles.get(self.conf.doctor_role)
 
+        self._restore_default_quota()
+
         if project:
             if 'admin' in self.roles_for_admin:
                 self.keystone.roles.revoke(
@@ -177,23 +186,45 @@ class User(object):
             self.keystone.projects.delete(project)
         self.log.info('user delete end......')
 
-    def update_quota(self):
-        self.log.info('user quota update start......')
+    def update_quota(self, instances=None, cores=None):
+        self.log.info('quota update start......')
         project = self.projects.get(self.conf.doctor_project)
+
         user = self.users.get(self.conf.doctor_user)
 
+        if instances is not None:
+            quota_instances = instances
+        else:
+            quota_instances = self.conf.quota_instances
+        if cores is not None:
+            quota_cores = cores
+        else:
+            quota_cores = self.conf.quota_cores
+
         if project and user:
+            # the default quota class must allow at least as much as the
+            # doctor_user quota
+            self.log.info('default quota update start......')
+
+            self.def_quota = self.nova.quota_classes.get('default')
+            if quota_instances > self.def_quota.instances:
+                self.restore_def_quota = True
+                self.nova.quota_classes.update('default',
+                                               instances=quota_instances)
+            if quota_cores > self.def_quota.cores:
+                self.restore_def_quota = True
+                self.nova.quota_classes.update('default',
+                                               cores=quota_cores)
+            self.log.info('user quota update start......')
             self.quota = self.nova.quotas.get(project.id,
                                               user_id=user.id)
-            if self.conf.quota_instances > self.quota.instances:
-                self.nova.quotas.update(
-                    project.id,
-                    instances=self.conf.quota_instances,
-                    user_id=user.id)
-            if self.conf.quota_cores > self.quota.cores:
+            if quota_instances > self.quota.instances:
+                self.nova.quotas.update(project.id,
+                                        instances=quota_instances,
+                                        user_id=user.id)
+            if quota_cores > self.quota.cores:
                 self.nova.quotas.update(project.id,
-                                        cores=self.conf.quota_cores,
+                                        cores=quota_cores,
                                         user_id=user.id)
-            self.log.info('user quota update end......')
         else:
             raise Exception('No project or role for update quota')
+        self.log.info('quota update end......')
index b60878f..50ce80f 100644 (file)
@@ -7,6 +7,8 @@ scp
 requests>=2.14.2 # Apache-2.0
 oslo.config>=5.2.0 # Apache-2.0
 python-openstackclient>=3.12.0 # Apache-2.0
+oslo.messaging>=5.30.2 # Apache-2.0
+oslo.versionedobjects>=1.26.1 # Apache-2.0
 python-ceilometerclient>=2.5.0 # Apache-2.0
 aodhclient>=0.9.0 # Apache-2.0
 python-keystoneclient>=3.8.0 # Apache-2.0
@@ -16,4 +18,5 @@ python-congressclient<2000,>=1.9.0 # Apache-2.0
 python-glanceclient>=2.8.0 # Apache-2.0
 python-vitrageclient>=2.0.0 # Apache-2.0
 virtualenv>=14.0.6 # MIT
+python-heatclient>=1.8.2 # Apache-2.0
 flake8<2.6.0,>=2.5.4 # MIT
diff --git a/tox.ini b/tox.ini
index 53efbfe..832a1da 100644 (file)
--- a/tox.ini
+++ b/tox.ini
@@ -1,6 +1,6 @@
 [tox]
 minversion = 2.3.1
-envlist = py35, pep8
+envlist = py34, pep8
 skipsdist = True
 
 [testenv]
@@ -28,7 +28,9 @@ passenv =
     CI_DEBUG
     INSTALLER_TYPE
     INSTALLER_IP
-    PROFILER_TYPE
+    INSPECTOR_TYPE
+    TEST_CASE
+    SSH_KEY
 changedir = {toxinidir}/doctor_tests
 commands = doctor-test