refactor failure inject 17/39317/9
authordongwenjuan <dong.wenjuan@zte.com.cn>
Mon, 14 Aug 2017 08:58:13 +0000 (16:58 +0800)
committerdongwenjuan <dong.wenjuan@zte.com.cn>
Mon, 4 Sep 2017 14:30:39 +0000 (22:30 +0800)
JIRA: DOCTOR-116

Change-Id: I14deda4ccb47414cff139a522a9196b68e92500e
Signed-off-by: dongwenjuan <dong.wenjuan@zte.com.cn>
17 files changed:
tests/alarm.py
tests/common/__init__.py [new file with mode: 0644]
tests/common/constants.py [new file with mode: 0644]
tests/common/utils.py [moved from tests/utils.py with 92% similarity]
tests/consumer/sample.py
tests/inspector/sample.py
tests/installer/apex.py
tests/installer/base.py
tests/installer/local.py
tests/logger.py
tests/main.py
tests/monitor/base.py
tests/monitor/collectd.py
tests/monitor/sample.py
tests/scenario/__init__.py [new file with mode: 0644]
tests/scenario/common.py [new file with mode: 0644]
tests/scenario/network_failure.py [new file with mode: 0644]

index 0582085..916f440 100644 (file)
@@ -26,9 +26,7 @@ class Alarm(object):
     def __init__(self, conf, log):
         self.conf = conf
         self.log = log
-        self.auth = get_identity_auth(username=self.conf.doctor_user,
-                                      password=self.conf.doctor_passwd,
-                                      project=self.conf.doctor_project)
+        self.auth = get_identity_auth(project=self.conf.doctor_project)
         self.aodh = \
             aodh_client(conf.aodh_version,
                         get_session(auth=self.auth))
diff --git a/tests/common/__init__.py b/tests/common/__init__.py
new file mode 100644 (file)
index 0000000..e68a307
--- /dev/null
@@ -0,0 +1,8 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
\ No newline at end of file
diff --git a/tests/common/constants.py b/tests/common/constants.py
new file mode 100644 (file)
index 0000000..72d037a
--- /dev/null
@@ -0,0 +1,12 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from collections import namedtuple
+
+
+Host = namedtuple('Host', ['name', 'ip'])
similarity index 92%
rename from tests/utils.py
rename to tests/common/utils.py
index fd8c4cd..38fd97d 100644 (file)
@@ -51,7 +51,7 @@ class SSHClient(object):
 
     def ssh(self, command):
         if self.log:
-            self.log.debug("Executing: %s" % command)
+            self.log.info("Executing: %s" % command)
         stdin, stdout, stderr = self.client.exec_command(command)
         ret = stdout.channel.recv_exit_status()
         output = list()
@@ -59,12 +59,12 @@ class SSHClient(object):
             output.append(line.decode('utf-8'))
         if ret:
             if self.log:
-                self.log.debug("*** FAILED to run command %s (%s)" % (command, ret))
+                self.log.info("*** FAILED to run command %s (%s)" % (command, ret))
             raise Exception(
                 "Unable to run \ncommand: %s\nret: %s"
                 % (command, ret))
         if self.log:
-            self.log.debug("*** SUCCESSFULLY run command %s" % command)
+            self.log.info("*** SUCCESSFULLY run command %s" % command)
         return ret, output
 
     def scp(self, source, dest, method='put'):
@@ -77,6 +77,7 @@ class SSHClient(object):
             ftp.get(source, dest)
         ftp.close()
 
+
 def run_async(func):
     from threading import Thread
     from functools import wraps
index a698623..20ad9d5 100644 (file)
@@ -55,9 +55,9 @@ class ConsumerApp(Thread):
         @app.route('/failure', methods=['POST'])
         def event_posted():
             self.log.info('doctor consumer notified at %s' % time.time())
-            self.log.info('received data = %s' % request.data)
-            data = json.loads(request.data)
-            return "OK"
+            self.log.info('sample consumer received data = %s' % request.data)
+            data = json.loads(request.data.decode('utf8'))
+            return 'OK'
 
         @app.route('/shutdown', methods=['POST'])
         def shutdown():
index b364e82..1c05ced 100644 (file)
@@ -14,12 +14,12 @@ import time
 from threading import Thread
 import requests
 
+from common import utils
 from identity_auth import get_identity_auth
 from identity_auth import get_session
 from os_clients import nova_client
 from os_clients import neutron_client
 from inspector.base import BaseInspector
-import utils
 
 
 class SampleInspector(BaseInspector):
index e0960a5..98eb6c9 100644 (file)
@@ -11,10 +11,11 @@ import grp
 import os
 import pwd
 import stat
+import subprocess
 import sys
 
+from common.utils import SSHClient
 from installer.base import BaseInstaller
-from utils import SSHClient
 
 
 class ApexInstaller(BaseInstaller):
@@ -30,20 +31,28 @@ class ApexInstaller(BaseInstaller):
         self.key_file = None
         self.controllers = list()
         self.controller_clients = list()
+        self.servers = list()
 
     def setup(self):
         self.log.info('Setup Apex installer start......')
 
-        self.key_file = self.get_ssh_key_from_installer()
+        self.get_ssh_key_from_installer()
         self.get_controller_ips()
         self.set_apply_patches()
+        self.setup_stunnel()
 
     def cleanup(self):
         self.restore_apply_patches()
+        for server in self.servers:
+            server.terminate()
 
     def get_ssh_key_from_installer(self):
         self.log.info('Get SSH keys from Apex installer......')
 
+        if self.key_file is not None:
+            self.log.info('Already have SSH keys from Apex installer......')
+            return self.key_file
+
         self.client.scp('/home/stack/.ssh/id_rsa', './instack_key', method='get')
         user = getpass.getuser()
         uid = pwd.getpwnam(user).pw_uid
@@ -51,7 +60,8 @@ class ApexInstaller(BaseInstaller):
         os.chown('./instack_key', uid, gid)
         os.chmod('./instack_key', stat.S_IREAD)
         current_dir = sys.path[0]
-        return '{0}/{1}'.format(current_dir, 'instack_key')
+        self.key_file = '{0}/{1}'.format(current_dir, 'instack_key')
+        return self.key_file
 
     def get_controller_ips(self):
         self.log.info('Get controller ips from Apex installer......')
@@ -63,8 +73,32 @@ class ApexInstaller(BaseInstaller):
         if ret:
             raise Exception('Exec command to get controller ips in Apex installer failed'
                             'ret=%s, output=%s' % (ret, controllers))
+        self.log.info('Get controller_ips:%s from Apex installer' % controllers)
         self.controllers = controllers
 
+    def get_host_ip_from_hostname(self, hostname):
+        self.log.info('Get host ip from host name in Apex installer......')
+
+        hostname_in_undercloud = hostname.split('.')[0]
+
+        command = "source stackrc; nova show %s  | awk '/ ctlplane network /{print $5}'" % (hostname_in_undercloud)
+        ret, host_ip = self.client.ssh(command)
+        if ret:
+            raise Exception('Exec command to get host ip from hostname(%s) in Apex installer failed'
+                            'ret=%s, output=%s' % (hostname, ret, host_ip))
+        self.log.info('Get host_ip:%s from host_name:%s in Apex installer' % (host_ip, hostname))
+        return host_ip[0]
+
+    def setup_stunnel(self):
+        self.log.info('Setup ssh stunnel in controller nodes in Apex installer......')
+        for node_ip in self.controllers:
+            cmd = "sudo ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i %s %s@%s -R %s:localhost:%s sleep 600 > ssh_tunnel.%s.log 2>&1 < /dev/null &" \
+                  % (self.key_file, self.node_user_name, node_ip,
+                     self.conf.consumer.port, self.conf.consumer.port, node_ip)
+            server = subprocess.Popen(cmd, shell=True)
+            self.servers.append(server)
+            server.communicate()
+
     def set_apply_patches(self):
         self.log.info('Set apply patches start......')
 
index f3837f1..fa39816 100644 (file)
@@ -23,6 +23,10 @@ class BaseInstaller(object):
     def get_ssh_key_from_installer(self):
         pass
 
+    @abc.abstractmethod
+    def get_host_ip_from_hostname(self, hostname):
+        pass
+
     @abc.abstractmethod
     def setup(self):
         pass
index abe0ba2..dcdf41e 100644 (file)
@@ -8,10 +8,11 @@
 ##############################################################################
 import os
 import shutil
+import subprocess
 
 from installer.base import BaseInstaller
-from utils import load_json_file
-from utils import write_json_file
+from common.utils import load_json_file
+from common.utils import write_json_file
 
 
 class LocalInstaller(BaseInstaller):
@@ -34,7 +35,18 @@ class LocalInstaller(BaseInstaller):
 
     def get_ssh_key_from_installer(self):
         self.log.info('Assuming SSH keys already exchanged with computer for local installer type')
-        return
+        return None
+
+    def get_host_ip_from_hostname(self, hostname):
+        self.log.info('Get host ip from host name in local installer......')
+
+        cmd = "getent hosts %s | awk '{ print $1 }'" % (hostname)
+        server = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
+        stdout, stderr = server.communicate()
+        host_ip = stdout.strip()
+
+        self.log.info('Get host_ip:%s from host_name:%s in local installer' % (host_ip, hostname))
+        return host_ip
 
     def set_apply_patches(self):
         self._set_nova_policy()
index 80d19bb..b7a49fd 100644 (file)
@@ -21,8 +21,6 @@ class Logger(object):
 
         CI_DEBUG = os.getenv('CI_DEBUG')
 
-        filename = '%s.log' % logger_name
-        logging.basicConfig(filemode='w', filename=filename)
         self.logger = logging.getLogger(logger_name)
         self.logger.propagate = 0
         self.logger.setLevel(logging.DEBUG)
@@ -38,7 +36,8 @@ class Logger(object):
             ch.setLevel(logging.INFO)
         self.logger.addHandler(ch)
 
-        file_handler = logging.FileHandler(filename)
+        filename = '%s.log' % logger_name
+        file_handler = logging.FileHandler(filename, mode='w')
         file_handler.setFormatter(formatter)
         file_handler.setLevel(logging.DEBUG)
         self.logger.addHandler(file_handler)
index db2fafd..7e7c3bc 100644 (file)
@@ -8,19 +8,27 @@
 ##############################################################################
 import os
 from os.path import isfile, join
+import random
 import sys
+import time
 
 from alarm import Alarm
+from common.constants import Host
 import config
 from consumer import get_consumer
+from identity_auth import get_identity_auth
+from identity_auth import get_session
 from image import Image
 from instance import Instance
 from inspector import get_inspector
 from installer import get_installer
 import logger as doctor_log
-from user import User
 from network import Network
 from monitor import get_monitor
+from os_clients import nova_client
+from scenario.common import calculate_notification_time
+from scenario.network_failure import NetworkFault
+from user import User
 
 
 LOG = doctor_log.Logger('doctor').getLogger()
@@ -35,12 +43,17 @@ class DoctorTest(object):
         self.network = Network(self.conf, LOG)
         self.instance = Instance(self.conf, LOG)
         self.alarm = Alarm(self.conf, LOG)
+        self.installer = get_installer(self.conf, LOG)
         self.inspector = get_inspector(self.conf, LOG)
         self.monitor = get_monitor(self.conf,
                                    self.inspector.get_inspector_url(),
                                    LOG)
         self.consumer = get_consumer(self.conf, LOG)
-        self.installer = get_installer(self.conf, LOG)
+        self.fault = NetworkFault(self.conf, self.installer, LOG)
+        auth = get_identity_auth(project=self.conf.doctor_project)
+        self.nova = nova_client(self.conf.nova_version,
+                                get_session(auth=auth))
+        self.down_host = None
 
     def setup(self):
         # prepare the cloud env
@@ -63,7 +76,10 @@ class DoctorTest(object):
 
         # starting doctor sample components...
         self.inspector.start()
-        self.monitor.start()
+
+        self.down_host = self.get_host_info_for_random_vm()
+        self.monitor.start(self.down_host)
+
         self.consumer.start()
 
     def run(self):
@@ -71,30 +87,76 @@ class DoctorTest(object):
         try:
             LOG.info('doctor test starting.......')
 
+            # prepare test env
             self.setup()
 
+            # wait for aodh alarms are updated in caches for event evaluator,
+            # sleep time should be larger than event_alarm_cache_ttl(default 60)
+            time.sleep(60)
+
             # injecting host failure...
             # NOTE (umar) add INTERFACE_NAME logic to host injection
 
+            self.fault.start(self.down_host)
+            time.sleep(10)
+
             # verify the test results
             # NOTE (umar) copy remote monitor.log file when monitor=collectd
-
+            self.check_host_status(self.down_host.name, 'down')
+
+            notification_time = calculate_notification_time()
+            if notification_time < 1 and notification_time > 0:
+                LOG.info('doctor test successfully, notification_time=%s' % notification_time)
+            else:
+                LOG.error('doctor test failed, notification_time=%s' % notification_time)
+                sys.exit(1)
         except Exception as e:
             LOG.error('doctor test failed, Exception=%s' % e)
             sys.exit(1)
         finally:
             self.cleanup()
 
+    def get_host_info_for_random_vm(self):
+        num = random.randint(0, self.conf.instance_count - 1)
+        vm_name = "%s%d" % (self.conf.instance_basename, num)
+
+        servers = \
+            {getattr(server, 'name'): server
+             for server in self.nova.servers.list()}
+        server = servers.get(vm_name)
+        if not server:
+            raise \
+                Exception('Can not find instance: vm_name(%s)' % vm_name)
+        host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
+        host_ip = self.installer.get_host_ip_from_hostname(host_name)
+
+        LOG.info('Get host info(name:%s, ip:%s) which vm(%s) launched at'
+                 % (host_name, host_ip, vm_name))
+        return Host(host_name, host_ip)
+
+    def check_host_status(self, hostname, state):
+        service = self.nova.services.list(host=hostname, binary='nova-compute')
+        host_state = service[0].__dict__.get('state')
+        assert host_state == state
+
+    def unset_forced_down_hosts(self):
+        if self.down_host:
+            self.nova.services.force_down(self.down_host.name, 'nova-compute', False)
+            time.sleep(2)
+            self.check_host_status(self.down_host.name, 'up')
+
     def cleanup(self):
+        self.unset_forced_down_hosts()
+        self.inspector.stop()
+        self.monitor.stop()
+        self.consumer.stop()
+        self.installer.cleanup()
         self.alarm.delete()
         self.instance.delete()
         self.network.delete()
         self.image.delete()
-        self.inspector.stop()
+        self.fault.cleanup()
         self.user.delete()
-        self.monitor.stop()
-        self.consumer.stop()
-        self.installer.cleanup()
 
 
 def main():
index ccb647c..119c8a1 100644 (file)
@@ -19,7 +19,7 @@ class BaseMonitor(object):
         self.inspector_url = inspector_url
 
     @abc.abstractmethod
-    def start(self):
+    def start(self, host):
         pass
 
     @abc.abstractmethod
index f7a4f44..e2a800e 100644 (file)
@@ -12,8 +12,6 @@ import socket
 import getpass
 import sys
 
-from identity_auth import get_session
-from os_clients import nova_client
 from monitor.base import BaseMonitor
 
 
@@ -21,13 +19,6 @@ class CollectdMonitor(BaseMonitor):
     def __init__(self, conf, inspector_url, log):
         super(CollectdMonitor, self).__init__(conf, inspector_url, log)
         self.top_dir = os.path.dirname(sys.path[0])
-        self.session = get_session()
-        self.nova = nova_client(conf.nova_version, self.session)
-        self.compute_hosts = self.nova.hypervisors.list(detailed=True)
-        for host in self.compute_hosts:
-            host_dict = host.__dict__
-            self.compute_host = host_dict['hypervisor_hostname']
-            self.compute_ip = host_dict['host_ip']
         tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
         tmp_sock.connect(("8.8.8.8", 80))
 
@@ -50,8 +41,10 @@ class CollectdMonitor(BaseMonitor):
         self.project_domain_id = os.environ.get('OS_PROJECT_DOMAIN_ID')
         self.ssh_opts_cpu = '-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no'
 
-    def start(self):
+    def start(self, host):
         self.log.info("Collectd monitor start.........")
+        self.compute_host = host.name
+        self.compute_ip = host.ip
         f = open("%s/tests/collectd.conf" % self.top_dir, 'w')
         collectd_conf_file = """ 
 Hostname %s
index 1333a2e..9ac1bcc 100644 (file)
@@ -14,7 +14,6 @@ from threading import Thread
 import time
 
 from identity_auth import get_session
-from os_clients import nova_client
 from monitor.base import BaseMonitor
 
 
@@ -24,26 +23,18 @@ class SampleMonitor(BaseMonitor):
     def __init__(self, conf, inspector_url, log):
         super(SampleMonitor, self).__init__(conf, inspector_url, log)
         self.session = get_session()
-        self.nova = nova_client(conf.nova_version, self.session)
-        self.hosts = self.nova.hypervisors.list(detailed=True)
-        self.pingers = []
+        self.pinger = None
 
-    def start(self):
+    def start(self, host):
         self.log.info('sample monitor start......')
-        for host in self.hosts:
-            host_dict = host.__dict__
-            host_name = host_dict['hypervisor_hostname']
-            host_ip = host_dict['host_ip']
-            pinger = Pinger(host_name, host_ip, self, self.log)
-            pinger.start()
-            self.pingers.append(pinger)
+        self.pinger = Pinger(host.name, host.ip, self, self.log)
+        self.pinger.start()
 
     def stop(self):
         self.log.info('sample monitor stop......')
-        for pinger in self.pingers:
-            pinger.stop()
-            pinger.join()
-        del self.pingers
+        if self.pinger is not None:
+            self.pinger.stop()
+            self.pinger.join()
 
     def report_error(self, hostname):
         self.log.info('sample monitor report error......')
diff --git a/tests/scenario/__init__.py b/tests/scenario/__init__.py
new file mode 100644 (file)
index 0000000..48893ae
--- /dev/null
@@ -0,0 +1,8 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
diff --git a/tests/scenario/common.py b/tests/scenario/common.py
new file mode 100644 (file)
index 0000000..e880e8b
--- /dev/null
@@ -0,0 +1,40 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+import os
+import re
+import sys
+
+
+def match_rep_in_file(regex, full_path):
+    if not os.path.isfile(full_path):
+        raise Exception('File(%s) does not exist' % full_path)
+
+    with open(full_path, 'r') as file:
+        for line in file:
+            result = re.search(regex, line)
+            if result:
+                return result.group(0)
+
+    return None
+
+
+def calculate_notification_time():
+    log_file = '{0}/{1}'.format(sys.path[0], 'doctor.log')
+
+    reg = '(?<=doctor monitor detected at )\d+.\d+'
+    detected = match_rep_in_file(reg, log_file)
+    if not detected:
+        raise Exception('Can not find detected time')
+
+    reg = '(?<=doctor consumer notified at )\d+.\d+'
+    notified = match_rep_in_file(reg, log_file)
+    if not notified:
+        raise Exception('Can not find notified time')
+
+    return float(notified) - float(detected)
\ No newline at end of file
diff --git a/tests/scenario/network_failure.py b/tests/scenario/network_failure.py
new file mode 100644 (file)
index 0000000..1d9027a
--- /dev/null
@@ -0,0 +1,71 @@
+##############################################################################
+# Copyright (c) 2017 ZTE Corporation and others.
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+##############################################################################
+from identity_auth import get_session
+from os_clients import nova_client
+from common.utils import SSHClient
+
+LINK_DOWN_SCRIPT = """
+#!/bin/bash -x
+dev=$(sudo ip a | awk '/ {compute_ip}\//{{print $NF}}')
+sleep 1
+sudo ip link set $dev down
+echo "doctor set link down at" $(date "+%s.%N")
+sleep 10
+sudo ip link set $dev up
+sleep 1
+"""
+
+
+class NetworkFault(object):
+
+    def __init__(self, conf, installer, log):
+        self.conf = conf
+        self.log = log
+        self.installer = installer
+        self.nova = nova_client(self.conf.nova_version, get_session())
+        self.host = None
+        self.GetLog = False
+
+    def start(self, host):
+        self.log.info('fault inject start......')
+        self._set_link_down(host.ip)
+        self.host = host
+        self.log.info('fault inject end......')
+
+    def cleanup(self):
+        self.log.info('fault inject cleanup......')
+        self.get_diable_network_log()
+
+    def get_diable_network_log(self):
+        if not self.GetLog:
+            self.log.info('Already get the disable_netork.log from down_host......')
+            return
+        if self.host is not None:
+            client = SSHClient(self.host.ip,
+                               self.installer.node_user_name,
+                               key_filename=self.installer.get_ssh_key_from_installer(),
+                               look_for_keys=True,
+                               log=self.log)
+            client.scp('disable_network.log', './disable_network.log', method='get')
+            self.log.info('Get the disable_netork.log from down_host(host_name:%s, host_ip:%s)'
+                          % (self.host.name, self.host.ip))
+        self.GetLog = True
+
+    def _set_link_down(self, compute_ip):
+        file_name = './disable_network.sh'
+        with open(file_name, 'w') as file:
+            file.write(LINK_DOWN_SCRIPT.format(compute_ip=compute_ip))
+        client = SSHClient(compute_ip,
+                           self.installer.node_user_name,
+                           key_filename=self.installer.get_ssh_key_from_installer(),
+                           look_for_keys=True,
+                           log=self.log)
+        client.scp('./disable_network.sh', 'disable_network.sh')
+        command = 'bash disable_network.sh > disable_network.log 2>&1 &'
+        client.ssh(command)