From 19f7ba75850c52e1ae163766b64b6d153e8d7e1b Mon Sep 17 00:00:00 2001 From: dongwenjuan Date: Mon, 14 Aug 2017 16:58:13 +0800 Subject: [PATCH] refactor failure inject JIRA: DOCTOR-116 Change-Id: I14deda4ccb47414cff139a522a9196b68e92500e Signed-off-by: dongwenjuan --- tests/alarm.py | 4 +- tests/common/__init__.py | 8 ++++ tests/common/constants.py | 12 ++++++ tests/{ => common}/utils.py | 7 ++-- tests/consumer/sample.py | 6 +-- tests/inspector/sample.py | 2 +- tests/installer/apex.py | 40 ++++++++++++++++++-- tests/installer/base.py | 4 ++ tests/installer/local.py | 18 +++++++-- tests/logger.py | 5 +-- tests/main.py | 78 +++++++++++++++++++++++++++++++++++---- tests/monitor/base.py | 2 +- tests/monitor/collectd.py | 13 ++----- tests/monitor/sample.py | 23 ++++-------- tests/scenario/__init__.py | 8 ++++ tests/scenario/common.py | 40 ++++++++++++++++++++ tests/scenario/network_failure.py | 71 +++++++++++++++++++++++++++++++++++ 17 files changed, 287 insertions(+), 54 deletions(-) create mode 100644 tests/common/__init__.py create mode 100644 tests/common/constants.py rename tests/{ => common}/utils.py (92%) create mode 100644 tests/scenario/__init__.py create mode 100644 tests/scenario/common.py create mode 100644 tests/scenario/network_failure.py diff --git a/tests/alarm.py b/tests/alarm.py index 0582085e..916f4405 100644 --- a/tests/alarm.py +++ b/tests/alarm.py @@ -26,9 +26,7 @@ class Alarm(object): def __init__(self, conf, log): self.conf = conf self.log = log - self.auth = get_identity_auth(username=self.conf.doctor_user, - password=self.conf.doctor_passwd, - project=self.conf.doctor_project) + self.auth = get_identity_auth(project=self.conf.doctor_project) self.aodh = \ aodh_client(conf.aodh_version, get_session(auth=self.auth)) diff --git a/tests/common/__init__.py b/tests/common/__init__.py new file mode 100644 index 00000000..e68a3070 --- /dev/null +++ b/tests/common/__init__.py @@ -0,0 +1,8 @@ +############################################################################## +# Copyright (c) 2017 ZTE Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## \ No newline at end of file diff --git a/tests/common/constants.py b/tests/common/constants.py new file mode 100644 index 00000000..72d037af --- /dev/null +++ b/tests/common/constants.py @@ -0,0 +1,12 @@ +############################################################################## +# Copyright (c) 2017 ZTE Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +from collections import namedtuple + + +Host = namedtuple('Host', ['name', 'ip']) diff --git a/tests/utils.py b/tests/common/utils.py similarity index 92% rename from tests/utils.py rename to tests/common/utils.py index fd8c4cd7..38fd97d8 100644 --- a/tests/utils.py +++ b/tests/common/utils.py @@ -51,7 +51,7 @@ class SSHClient(object): def ssh(self, command): if self.log: - self.log.debug("Executing: %s" % command) + self.log.info("Executing: %s" % command) stdin, stdout, stderr = self.client.exec_command(command) ret = stdout.channel.recv_exit_status() output = list() @@ -59,12 +59,12 @@ class SSHClient(object): output.append(line.decode('utf-8')) if ret: if self.log: - self.log.debug("*** FAILED to run command %s (%s)" % (command, ret)) + self.log.info("*** FAILED to run command %s (%s)" % (command, ret)) raise Exception( "Unable to run \ncommand: %s\nret: %s" % (command, ret)) if self.log: - self.log.debug("*** SUCCESSFULLY run command %s" % command) + self.log.info("*** SUCCESSFULLY run command %s" % command) return ret, output def scp(self, source, dest, method='put'): @@ -77,6 +77,7 @@ class SSHClient(object): ftp.get(source, dest) ftp.close() + def run_async(func): from threading import Thread from functools import wraps diff --git a/tests/consumer/sample.py b/tests/consumer/sample.py index a698623a..20ad9d57 100644 --- a/tests/consumer/sample.py +++ b/tests/consumer/sample.py @@ -55,9 +55,9 @@ class ConsumerApp(Thread): @app.route('/failure', methods=['POST']) def event_posted(): self.log.info('doctor consumer notified at %s' % time.time()) - self.log.info('received data = %s' % request.data) - data = json.loads(request.data) - return "OK" + self.log.info('sample consumer received data = %s' % request.data) + data = json.loads(request.data.decode('utf8')) + return 'OK' @app.route('/shutdown', methods=['POST']) def shutdown(): diff --git a/tests/inspector/sample.py b/tests/inspector/sample.py index b364e825..1c05cede 100644 --- a/tests/inspector/sample.py +++ b/tests/inspector/sample.py @@ -14,12 +14,12 @@ import time from threading import Thread import requests +from common import utils from identity_auth import get_identity_auth from identity_auth import get_session from os_clients import nova_client from os_clients import neutron_client from inspector.base import BaseInspector -import utils class SampleInspector(BaseInspector): diff --git a/tests/installer/apex.py b/tests/installer/apex.py index e0960a5f..98eb6c9c 100644 --- a/tests/installer/apex.py +++ b/tests/installer/apex.py @@ -11,10 +11,11 @@ import grp import os import pwd import stat +import subprocess import sys +from common.utils import SSHClient from installer.base import BaseInstaller -from utils import SSHClient class ApexInstaller(BaseInstaller): @@ -30,20 +31,28 @@ class ApexInstaller(BaseInstaller): self.key_file = None self.controllers = list() self.controller_clients = list() + self.servers = list() def setup(self): self.log.info('Setup Apex installer start......') - self.key_file = self.get_ssh_key_from_installer() + self.get_ssh_key_from_installer() self.get_controller_ips() self.set_apply_patches() + self.setup_stunnel() def cleanup(self): self.restore_apply_patches() + for server in self.servers: + server.terminate() def get_ssh_key_from_installer(self): self.log.info('Get SSH keys from Apex installer......') + if self.key_file is not None: + self.log.info('Already have SSH keys from Apex installer......') + return self.key_file + self.client.scp('/home/stack/.ssh/id_rsa', './instack_key', method='get') user = getpass.getuser() uid = pwd.getpwnam(user).pw_uid @@ -51,7 +60,8 @@ class ApexInstaller(BaseInstaller): os.chown('./instack_key', uid, gid) os.chmod('./instack_key', stat.S_IREAD) current_dir = sys.path[0] - return '{0}/{1}'.format(current_dir, 'instack_key') + self.key_file = '{0}/{1}'.format(current_dir, 'instack_key') + return self.key_file def get_controller_ips(self): self.log.info('Get controller ips from Apex installer......') @@ -63,8 +73,32 @@ class ApexInstaller(BaseInstaller): if ret: raise Exception('Exec command to get controller ips in Apex installer failed' 'ret=%s, output=%s' % (ret, controllers)) + self.log.info('Get controller_ips:%s from Apex installer' % controllers) self.controllers = controllers + def get_host_ip_from_hostname(self, hostname): + self.log.info('Get host ip from host name in Apex installer......') + + hostname_in_undercloud = hostname.split('.')[0] + + command = "source stackrc; nova show %s | awk '/ ctlplane network /{print $5}'" % (hostname_in_undercloud) + ret, host_ip = self.client.ssh(command) + if ret: + raise Exception('Exec command to get host ip from hostname(%s) in Apex installer failed' + 'ret=%s, output=%s' % (hostname, ret, host_ip)) + self.log.info('Get host_ip:%s from host_name:%s in Apex installer' % (host_ip, hostname)) + return host_ip[0] + + def setup_stunnel(self): + self.log.info('Setup ssh stunnel in controller nodes in Apex installer......') + for node_ip in self.controllers: + cmd = "sudo ssh -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no -i %s %s@%s -R %s:localhost:%s sleep 600 > ssh_tunnel.%s.log 2>&1 < /dev/null &" \ + % (self.key_file, self.node_user_name, node_ip, + self.conf.consumer.port, self.conf.consumer.port, node_ip) + server = subprocess.Popen(cmd, shell=True) + self.servers.append(server) + server.communicate() + def set_apply_patches(self): self.log.info('Set apply patches start......') diff --git a/tests/installer/base.py b/tests/installer/base.py index f3837f15..fa39816a 100644 --- a/tests/installer/base.py +++ b/tests/installer/base.py @@ -23,6 +23,10 @@ class BaseInstaller(object): def get_ssh_key_from_installer(self): pass + @abc.abstractmethod + def get_host_ip_from_hostname(self, hostname): + pass + @abc.abstractmethod def setup(self): pass diff --git a/tests/installer/local.py b/tests/installer/local.py index abe0ba25..dcdf41e3 100644 --- a/tests/installer/local.py +++ b/tests/installer/local.py @@ -8,10 +8,11 @@ ############################################################################## import os import shutil +import subprocess from installer.base import BaseInstaller -from utils import load_json_file -from utils import write_json_file +from common.utils import load_json_file +from common.utils import write_json_file class LocalInstaller(BaseInstaller): @@ -34,7 +35,18 @@ class LocalInstaller(BaseInstaller): def get_ssh_key_from_installer(self): self.log.info('Assuming SSH keys already exchanged with computer for local installer type') - return + return None + + def get_host_ip_from_hostname(self, hostname): + self.log.info('Get host ip from host name in local installer......') + + cmd = "getent hosts %s | awk '{ print $1 }'" % (hostname) + server = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) + stdout, stderr = server.communicate() + host_ip = stdout.strip() + + self.log.info('Get host_ip:%s from host_name:%s in local installer' % (host_ip, hostname)) + return host_ip def set_apply_patches(self): self._set_nova_policy() diff --git a/tests/logger.py b/tests/logger.py index 80d19bb9..b7a49fdb 100644 --- a/tests/logger.py +++ b/tests/logger.py @@ -21,8 +21,6 @@ class Logger(object): CI_DEBUG = os.getenv('CI_DEBUG') - filename = '%s.log' % logger_name - logging.basicConfig(filemode='w', filename=filename) self.logger = logging.getLogger(logger_name) self.logger.propagate = 0 self.logger.setLevel(logging.DEBUG) @@ -38,7 +36,8 @@ class Logger(object): ch.setLevel(logging.INFO) self.logger.addHandler(ch) - file_handler = logging.FileHandler(filename) + filename = '%s.log' % logger_name + file_handler = logging.FileHandler(filename, mode='w') file_handler.setFormatter(formatter) file_handler.setLevel(logging.DEBUG) self.logger.addHandler(file_handler) diff --git a/tests/main.py b/tests/main.py index db2fafd9..7e7c3bc2 100644 --- a/tests/main.py +++ b/tests/main.py @@ -8,19 +8,27 @@ ############################################################################## import os from os.path import isfile, join +import random import sys +import time from alarm import Alarm +from common.constants import Host import config from consumer import get_consumer +from identity_auth import get_identity_auth +from identity_auth import get_session from image import Image from instance import Instance from inspector import get_inspector from installer import get_installer import logger as doctor_log -from user import User from network import Network from monitor import get_monitor +from os_clients import nova_client +from scenario.common import calculate_notification_time +from scenario.network_failure import NetworkFault +from user import User LOG = doctor_log.Logger('doctor').getLogger() @@ -35,12 +43,17 @@ class DoctorTest(object): self.network = Network(self.conf, LOG) self.instance = Instance(self.conf, LOG) self.alarm = Alarm(self.conf, LOG) + self.installer = get_installer(self.conf, LOG) self.inspector = get_inspector(self.conf, LOG) self.monitor = get_monitor(self.conf, self.inspector.get_inspector_url(), LOG) self.consumer = get_consumer(self.conf, LOG) - self.installer = get_installer(self.conf, LOG) + self.fault = NetworkFault(self.conf, self.installer, LOG) + auth = get_identity_auth(project=self.conf.doctor_project) + self.nova = nova_client(self.conf.nova_version, + get_session(auth=auth)) + self.down_host = None def setup(self): # prepare the cloud env @@ -63,7 +76,10 @@ class DoctorTest(object): # starting doctor sample components... self.inspector.start() - self.monitor.start() + + self.down_host = self.get_host_info_for_random_vm() + self.monitor.start(self.down_host) + self.consumer.start() def run(self): @@ -71,30 +87,76 @@ class DoctorTest(object): try: LOG.info('doctor test starting.......') + # prepare test env self.setup() + # wait for aodh alarms are updated in caches for event evaluator, + # sleep time should be larger than event_alarm_cache_ttl(default 60) + time.sleep(60) + # injecting host failure... # NOTE (umar) add INTERFACE_NAME logic to host injection + self.fault.start(self.down_host) + time.sleep(10) + # verify the test results # NOTE (umar) copy remote monitor.log file when monitor=collectd - + self.check_host_status(self.down_host.name, 'down') + + notification_time = calculate_notification_time() + if notification_time < 1 and notification_time > 0: + LOG.info('doctor test successfully, notification_time=%s' % notification_time) + else: + LOG.error('doctor test failed, notification_time=%s' % notification_time) + sys.exit(1) except Exception as e: LOG.error('doctor test failed, Exception=%s' % e) sys.exit(1) finally: self.cleanup() + def get_host_info_for_random_vm(self): + num = random.randint(0, self.conf.instance_count - 1) + vm_name = "%s%d" % (self.conf.instance_basename, num) + + servers = \ + {getattr(server, 'name'): server + for server in self.nova.servers.list()} + server = servers.get(vm_name) + if not server: + raise \ + Exception('Can not find instance: vm_name(%s)' % vm_name) + host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname') + host_ip = self.installer.get_host_ip_from_hostname(host_name) + + LOG.info('Get host info(name:%s, ip:%s) which vm(%s) launched at' + % (host_name, host_ip, vm_name)) + return Host(host_name, host_ip) + + def check_host_status(self, hostname, state): + service = self.nova.services.list(host=hostname, binary='nova-compute') + host_state = service[0].__dict__.get('state') + assert host_state == state + + def unset_forced_down_hosts(self): + if self.down_host: + self.nova.services.force_down(self.down_host.name, 'nova-compute', False) + time.sleep(2) + self.check_host_status(self.down_host.name, 'up') + def cleanup(self): + self.unset_forced_down_hosts() + self.inspector.stop() + self.monitor.stop() + self.consumer.stop() + self.installer.cleanup() self.alarm.delete() self.instance.delete() self.network.delete() self.image.delete() - self.inspector.stop() + self.fault.cleanup() self.user.delete() - self.monitor.stop() - self.consumer.stop() - self.installer.cleanup() def main(): diff --git a/tests/monitor/base.py b/tests/monitor/base.py index ccb647cf..119c8a1c 100644 --- a/tests/monitor/base.py +++ b/tests/monitor/base.py @@ -19,7 +19,7 @@ class BaseMonitor(object): self.inspector_url = inspector_url @abc.abstractmethod - def start(self): + def start(self, host): pass @abc.abstractmethod diff --git a/tests/monitor/collectd.py b/tests/monitor/collectd.py index f7a4f442..e2a800ea 100644 --- a/tests/monitor/collectd.py +++ b/tests/monitor/collectd.py @@ -12,8 +12,6 @@ import socket import getpass import sys -from identity_auth import get_session -from os_clients import nova_client from monitor.base import BaseMonitor @@ -21,13 +19,6 @@ class CollectdMonitor(BaseMonitor): def __init__(self, conf, inspector_url, log): super(CollectdMonitor, self).__init__(conf, inspector_url, log) self.top_dir = os.path.dirname(sys.path[0]) - self.session = get_session() - self.nova = nova_client(conf.nova_version, self.session) - self.compute_hosts = self.nova.hypervisors.list(detailed=True) - for host in self.compute_hosts: - host_dict = host.__dict__ - self.compute_host = host_dict['hypervisor_hostname'] - self.compute_ip = host_dict['host_ip'] tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) tmp_sock.connect(("8.8.8.8", 80)) @@ -50,8 +41,10 @@ class CollectdMonitor(BaseMonitor): self.project_domain_id = os.environ.get('OS_PROJECT_DOMAIN_ID') self.ssh_opts_cpu = '-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no' - def start(self): + def start(self, host): self.log.info("Collectd monitor start.........") + self.compute_host = host.name + self.compute_ip = host.ip f = open("%s/tests/collectd.conf" % self.top_dir, 'w') collectd_conf_file = """ Hostname %s diff --git a/tests/monitor/sample.py b/tests/monitor/sample.py index 1333a2ec..9ac1bccf 100644 --- a/tests/monitor/sample.py +++ b/tests/monitor/sample.py @@ -14,7 +14,6 @@ from threading import Thread import time from identity_auth import get_session -from os_clients import nova_client from monitor.base import BaseMonitor @@ -24,26 +23,18 @@ class SampleMonitor(BaseMonitor): def __init__(self, conf, inspector_url, log): super(SampleMonitor, self).__init__(conf, inspector_url, log) self.session = get_session() - self.nova = nova_client(conf.nova_version, self.session) - self.hosts = self.nova.hypervisors.list(detailed=True) - self.pingers = [] + self.pinger = None - def start(self): + def start(self, host): self.log.info('sample monitor start......') - for host in self.hosts: - host_dict = host.__dict__ - host_name = host_dict['hypervisor_hostname'] - host_ip = host_dict['host_ip'] - pinger = Pinger(host_name, host_ip, self, self.log) - pinger.start() - self.pingers.append(pinger) + self.pinger = Pinger(host.name, host.ip, self, self.log) + self.pinger.start() def stop(self): self.log.info('sample monitor stop......') - for pinger in self.pingers: - pinger.stop() - pinger.join() - del self.pingers + if self.pinger is not None: + self.pinger.stop() + self.pinger.join() def report_error(self, hostname): self.log.info('sample monitor report error......') diff --git a/tests/scenario/__init__.py b/tests/scenario/__init__.py new file mode 100644 index 00000000..48893ae6 --- /dev/null +++ b/tests/scenario/__init__.py @@ -0,0 +1,8 @@ +############################################################################## +# Copyright (c) 2017 ZTE Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## diff --git a/tests/scenario/common.py b/tests/scenario/common.py new file mode 100644 index 00000000..e880e8b2 --- /dev/null +++ b/tests/scenario/common.py @@ -0,0 +1,40 @@ +############################################################################## +# Copyright (c) 2017 ZTE Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +import os +import re +import sys + + +def match_rep_in_file(regex, full_path): + if not os.path.isfile(full_path): + raise Exception('File(%s) does not exist' % full_path) + + with open(full_path, 'r') as file: + for line in file: + result = re.search(regex, line) + if result: + return result.group(0) + + return None + + +def calculate_notification_time(): + log_file = '{0}/{1}'.format(sys.path[0], 'doctor.log') + + reg = '(?<=doctor monitor detected at )\d+.\d+' + detected = match_rep_in_file(reg, log_file) + if not detected: + raise Exception('Can not find detected time') + + reg = '(?<=doctor consumer notified at )\d+.\d+' + notified = match_rep_in_file(reg, log_file) + if not notified: + raise Exception('Can not find notified time') + + return float(notified) - float(detected) \ No newline at end of file diff --git a/tests/scenario/network_failure.py b/tests/scenario/network_failure.py new file mode 100644 index 00000000..1d9027a2 --- /dev/null +++ b/tests/scenario/network_failure.py @@ -0,0 +1,71 @@ +############################################################################## +# Copyright (c) 2017 ZTE Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +from identity_auth import get_session +from os_clients import nova_client +from common.utils import SSHClient + +LINK_DOWN_SCRIPT = """ +#!/bin/bash -x +dev=$(sudo ip a | awk '/ {compute_ip}\//{{print $NF}}') +sleep 1 +sudo ip link set $dev down +echo "doctor set link down at" $(date "+%s.%N") +sleep 10 +sudo ip link set $dev up +sleep 1 +""" + + +class NetworkFault(object): + + def __init__(self, conf, installer, log): + self.conf = conf + self.log = log + self.installer = installer + self.nova = nova_client(self.conf.nova_version, get_session()) + self.host = None + self.GetLog = False + + def start(self, host): + self.log.info('fault inject start......') + self._set_link_down(host.ip) + self.host = host + self.log.info('fault inject end......') + + def cleanup(self): + self.log.info('fault inject cleanup......') + self.get_diable_network_log() + + def get_diable_network_log(self): + if not self.GetLog: + self.log.info('Already get the disable_netork.log from down_host......') + return + if self.host is not None: + client = SSHClient(self.host.ip, + self.installer.node_user_name, + key_filename=self.installer.get_ssh_key_from_installer(), + look_for_keys=True, + log=self.log) + client.scp('disable_network.log', './disable_network.log', method='get') + self.log.info('Get the disable_netork.log from down_host(host_name:%s, host_ip:%s)' + % (self.host.name, self.host.ip)) + self.GetLog = True + + def _set_link_down(self, compute_ip): + file_name = './disable_network.sh' + with open(file_name, 'w') as file: + file.write(LINK_DOWN_SCRIPT.format(compute_ip=compute_ip)) + client = SSHClient(compute_ip, + self.installer.node_user_name, + key_filename=self.installer.get_ssh_key_from_installer(), + look_for_keys=True, + log=self.log) + client.scp('./disable_network.sh', 'disable_network.sh') + command = 'bash disable_network.sh > disable_network.log 2>&1 &' + client.ssh(command) -- 2.16.6