1 ##############################################################################
2 # Copyright (c) 2017 ZTE Corporation and others.
4 # All rights reserved. This program and the accompanying materials
5 # are made available under the terms of the Apache License, Version 2.0
6 # which accompanies this distribution, and is available at
7 # http://www.apache.org/licenses/LICENSE-2.0
8 ##############################################################################
import os
from os.path import isfile, join
import random
import sys
import time

from doctor_tests.alarm import Alarm
from doctor_tests.common.constants import Host
from doctor_tests.common.utils import match_rep_in_file
from doctor_tests import config
from doctor_tests.consumer import get_consumer
from doctor_tests.identity_auth import get_identity_auth
from doctor_tests.identity_auth import get_session
from doctor_tests.image import Image
from doctor_tests.instance import Instance
from doctor_tests.inspector import get_inspector
from doctor_tests.installer import get_installer
import doctor_tests.logger as doctor_log
from doctor_tests.network import Network
from doctor_tests.monitor import get_monitor
from doctor_tests.os_clients import nova_client
from doctor_tests.profiler_poc import main as profiler_main
from doctor_tests.scenario.common import calculate_notification_time
from doctor_tests.scenario.network_failure import NetworkFault
from doctor_tests.user import User
36 Logger = doctor_log.Logger('doctor')
37 LOG = Logger.getLogger()
38 LogFile = Logger.getLogFilename()
class DoctorTest(object):
    """End-to-end driver for the OPNFV Doctor test scenarios.

    Wires together the cloud resources (image, user, network, instance,
    alarm) and the doctor sample components (inspector, monitor, consumer),
    then runs the test case(s) selected in the configuration.
    """

    def __init__(self, conf):
        # NOTE(review): the incoming chunk was missing this assignment,
        # which every following line depends on.
        self.conf = conf
        self.image = Image(self.conf, LOG)
        self.user = User(self.conf, LOG)
        self.network = Network(self.conf, LOG)
        self.instance = Instance(self.conf, LOG)
        self.alarm = Alarm(self.conf, LOG)
        self.installer = get_installer(self.conf, LOG)
        self.inspector = get_inspector(self.conf, LOG)
        self.monitor = get_monitor(self.conf,
                                   self.inspector.get_inspector_url(),
                                   LOG)
        self.consumer = get_consumer(self.conf, LOG)
        self.fault = NetworkFault(self.conf, self.installer, LOG)
        auth = get_identity_auth(project=self.conf.doctor_project)
        self.nova = nova_client(self.conf.nova_version,
                                get_session(auth=auth))
        # Set by setup_fault_management(); unset_forced_down_hosts()
        # guards on it so cleanup is safe even if setup never ran.
        self.down_host = None

    def setup(self):
        """Prepare the common test environment (all test cases)."""
        # prepare the cloud env
        self.installer.setup()

        # preparing VM image...
        self.image.create()

        # creating test user...
        self.user.create()

    def setup_fault_management(self):
        """Prepare resources specific to the fault management test."""
        # user settings...
        self.user.update_quota()

        # creating test network and VM(s)...
        self.network.create()
        self.instance.create()
        self.instance.wait_for_vm_launch()

        # creating alarm...
        self.alarm.create()

        # starting doctor sample components...
        # tbd tojuvone: move inspector and consumer to common setup
        # when they support updating VMs via instance.create and
        # instance.delete alarm
        self.inspector.start()
        self.consumer.start()
        self.down_host = self.get_host_info_for_random_vm()
        self.monitor.start(self.down_host)

    def test_fault_management(self):
        """Inject a host failure and verify the notification latency."""
        try:
            LOG.info('doctor fault management test starting.......')

            # prepare fault management test env
            self.setup_fault_management()

            # wait for aodh alarms are updated in caches for event evaluator,
            # sleep time should be larger than event_alarm_cache_ttl
            # (tojuvone) Fraser currently needs 120
            time.sleep(120)

            # injecting host failure...
            # NOTE (umar) add INTERFACE_NAME logic to host injection
            self.fault.start(self.down_host)
            time.sleep(10)

            # verify the test results
            # NOTE (umar) copy remote monitor.log file when monitor=collectd
            self.check_host_status(self.down_host.name, 'down')

            # pass criterion: consumer notified within (0, 1) second
            notification_time = calculate_notification_time(LogFile)
            if notification_time < 1 and notification_time > 0:
                LOG.info('doctor fault management test successfully, '
                         'notification_time=%s' % notification_time)
            else:
                LOG.error('doctor fault management test failed, '
                          'notification_time=%s' % notification_time)
                sys.exit(1)

            if self.conf.profiler_type:
                LOG.info('doctor fault management test begin to run '
                         'profile.......')
                self.collect_logs()
                self.run_profiler()
        except Exception as e:
            LOG.error('doctor fault management test failed, '
                      'Exception=%s' % e)
            sys.exit(1)
        finally:
            self.cleanup_fault_management()

    def _amount_compute_nodes(self):
        """Return the number of nova-compute services in the deployment."""
        services = self.nova.services.list(binary='nova-compute')
        return len(services)

    def test_maintenance(self):
        """Placeholder for the maintenance scenario (needs >= 3 computes)."""
        cnodes = self._amount_compute_nodes()
        if cnodes < 3:
            # need 2 compute for redundancy and one spare to migrate
            LOG.info('not enough compute nodes, skipping doctor '
                     'maintenance test')
            return
        try:
            LOG.info('doctor maintenance test starting.......')
            # TODO (tojuvone) test setup and actual test
        except Exception as e:
            LOG.error('doctor maintenance test failed, Exception=%s' % e)
            sys.exit(1)
        # TODO (tojuvone) finally: test case specific cleanup

    def run(self):
        """run doctor tests"""
        try:
            LOG.info('doctor test starting.......')
            # prepare common test env
            self.setup()
            if self.conf.test_case == 'all':
                self.test_fault_management()
                self.test_maintenance()
            else:
                function = 'test_%s' % self.conf.test_case
                if hasattr(self, function):
                    getattr(self, function)()
                else:
                    raise Exception('Can not find function <%s> in'
                                    'DoctorTest, see config manual'
                                    % function)
        except Exception as e:
            LOG.error('doctor test failed, Exception=%s' % e)
            sys.exit(1)
        finally:
            self.cleanup()

    def get_host_info_for_random_vm(self):
        """Pick one of the test VMs at random and return its Host.

        Raises Exception when the expected instance name is not found.
        """
        num = random.randint(0, self.conf.instance_count - 1)
        vm_name = "%s%d" % (self.conf.instance_basename, num)

        servers = {getattr(server, 'name'): server
                   for server in self.nova.servers.list()}
        server = servers.get(vm_name)
        if not server:
            raise Exception('Can not find instance: vm_name(%s)' % vm_name)
        host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
        host_ip = self.installer.get_host_ip_from_hostname(host_name)

        LOG.info('Get host info(name:%s, ip:%s) which vm(%s) launched at'
                 % (host_name, host_ip, vm_name))
        return Host(host_name, host_ip)

    def check_host_status(self, hostname, state):
        """Assert the nova-compute service on *hostname* is in *state*."""
        service = self.nova.services.list(host=hostname,
                                          binary='nova-compute')
        host_state = service[0].__dict__.get('state')
        assert host_state == state

    def unset_forced_down_hosts(self):
        """Re-enable the forced-down host (if any) and verify it is up."""
        if self.down_host:
            self.nova.services.force_down(self.down_host.name,
                                          'nova-compute', False)
            time.sleep(2)
            self.check_host_status(self.down_host.name, 'up')

    def collect_logs(self):
        # fetch the remote "link down" log so run_profiler can parse it
        self.fault.get_disable_network_log()

    def run_profiler(self):
        """Extract event timestamps from the logs and run the profiler PoC.

        Timestamps are exported as DOCTOR_PROFILER_Txx environment
        variables in milliseconds relative to the link-down event.
        """
        net_down_log_file = self.fault.get_disable_network_log()
        reg = r'(?<=doctor set link down at )\d+.\d+'
        linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0))

        reg = r'(.* doctor mark vm.* error at )(\d+.\d+)'
        vmdown = float(match_rep_in_file(reg, LogFile).group(2))

        reg = r'(.* doctor mark host.* down at )(\d+.\d+)'
        hostdown = float(match_rep_in_file(reg, LogFile).group(2))

        reg = r'(?<=doctor monitor detected at )\d+.\d+'
        detected = float(match_rep_in_file(reg, LogFile).group(0))

        reg = r'(?<=doctor consumer notified at )\d+.\d+'
        notified = float(match_rep_in_file(reg, LogFile).group(0))

        # TODO(yujunz) check the actual delay to verify time sync status
        # expected ~1s delay from $trigger to $linkdown
        relative_start = linkdown
        os.environ['DOCTOR_PROFILER_T00'] = (
            str(int((linkdown - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T01'] = (
            str(int((detected - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T03'] = (
            str(int((vmdown - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T04'] = (
            str(int((hostdown - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T09'] = (
            str(int((notified - relative_start) * 1000)))

        profiler_main(log=LOG)

    def cleanup_fault_management(self):
        """Tear down everything created by setup_fault_management()."""
        self.unset_forced_down_hosts()
        self.inspector.stop()
        self.monitor.stop()
        self.consumer.stop()
        self.alarm.delete()
        self.instance.delete()
        self.network.delete()

    def cleanup(self):
        """Tear down the common test environment."""
        self.installer.cleanup()
        self.image.delete()
        self.user.delete()
def main():
    """Doctor test entry point.

    Builds the configuration from every file in the project's etc/
    directory plus the command-line arguments, then runs the selected
    test case(s).
    """
    # locate the project's etc/ directory relative to this file
    test_dir = os.path.split(os.path.realpath(__file__))[0]
    doctor_root_dir = os.path.dirname(test_dir)

    config_file_dir = '{0}/{1}'.format(doctor_root_dir, 'etc/')
    config_files = [join(config_file_dir, f)
                    for f in os.listdir(config_file_dir)
                    if isfile(join(config_file_dir, f))]

    conf = config.prepare_conf(args=sys.argv[1:],
                               config_files=config_files)

    doctor = DoctorTest(conf)
    # NOTE(review): the chunk built DoctorTest but never ran it; without
    # this call (and the __main__ guard) the script did nothing useful
    # and executed its setup at import time.
    doctor.run()


if __name__ == '__main__':
    main()