ability to run maintenance test
[doctor.git] / doctor_tests / main.py
1 ##############################################################################
2 # Copyright (c) 2017 ZTE Corporation and others.
3 #
4 # All rights reserved. This program and the accompanying materials
5 # are made available under the terms of the Apache License, Version 2.0
6 # which accompanies this distribution, and is available at
7 # http://www.apache.org/licenses/LICENSE-2.0
8 ##############################################################################
9 import os
10 from os.path import isfile, join
11 import random
12 import sys
13 import time
14
15 from doctor_tests.alarm import Alarm
16 from doctor_tests.common.constants import Host
17 from doctor_tests.common.utils import match_rep_in_file
18 from doctor_tests import config
19 from doctor_tests.consumer import get_consumer
20 from doctor_tests.identity_auth import get_identity_auth
21 from doctor_tests.identity_auth import get_session
22 from doctor_tests.image import Image
23 from doctor_tests.instance import Instance
24 from doctor_tests.inspector import get_inspector
25 from doctor_tests.installer import get_installer
26 import doctor_tests.logger as doctor_log
27 from doctor_tests.network import Network
28 from doctor_tests.monitor import get_monitor
29 from doctor_tests.os_clients import nova_client
30 from doctor_tests.profiler_poc import main as profiler_main
31 from doctor_tests.scenario.common import calculate_notification_time
32 from doctor_tests.scenario.network_failure import NetworkFault
33 from doctor_tests.user import User
34
35
# Module-wide logging handles shared by every step of the doctor tests.
Logger = doctor_log.Logger('doctor')
# Logger instance used for all messages emitted from this module.
LOG = Logger.getLogger()
# Name of the log file; it is read back later to extract event timestamps.
LogFile = Logger.getLogFilename()
39
40
class DoctorTest(object):
    """Driver that sets up, runs and cleans up the doctor test cases.

    The instance owns the helper objects used by the tests (image, user,
    network, instance, alarm, installer, inspector, monitor, consumer,
    fault injector and a nova client) and exposes one method per test case
    plus :meth:`run` as the overall entry point.
    """

    def __init__(self, conf):
        """Build all helper objects from the prepared configuration.

        :param conf: configuration object. This class reads the attributes
            ``doctor_project``, ``nova_version``, ``test_case``,
            ``instance_count``, ``instance_basename`` and ``profiler_type``
            from it and passes it on to every helper.
        """
        self.conf = conf
        self.image = Image(self.conf, LOG)
        self.user = User(self.conf, LOG)
        self.network = Network(self.conf, LOG)
        self.instance = Instance(self.conf, LOG)
        self.alarm = Alarm(self.conf, LOG)
        self.installer = get_installer(self.conf, LOG)
        self.inspector = get_inspector(self.conf, LOG)
        self.monitor = get_monitor(self.conf,
                                   self.inspector.get_inspector_url(),
                                   LOG)
        self.consumer = get_consumer(self.conf, LOG)
        self.fault = NetworkFault(self.conf, self.installer, LOG)
        auth = get_identity_auth(project=self.conf.doctor_project)
        self.nova = nova_client(self.conf.nova_version,
                                get_session(auth=auth))
        # Host selected for fault injection; stays None until
        # setup_fault_management() has picked one.
        self.down_host = None

    def setup(self):
        """Prepare the environment shared by all test cases."""
        # prepare the cloud env
        self.installer.setup()

        # preparing VM image...
        self.image.create()

        # creating test user...
        self.user.create()

    def setup_fault_management(self):
        """Create the resources and start the components for the fault test.

        Also selects the host that will be failed and stores it in
        ``self.down_host``.
        """
        # user settings...
        self.user.update_quota()

        # creating VM...
        self.network.create()
        self.instance.create()
        self.instance.wait_for_vm_launch()

        # creating alarm...
        self.alarm.create()

        # starting doctor sample components...
        # tbd tojuvone: move inspector and consumer to common setup
        # when they support updating VMs via instance.create and
        # instance.delete alarm

        self.inspector.start()
        self.consumer.start()
        self.down_host = self.get_host_info_for_random_vm()
        self.monitor.start(self.down_host)

    def test_fault_management(self):
        """Run the fault management test case.

        Injects a fault on the selected host, verifies that nova reports
        the host as 'down' and checks that the calculated notification time
        lies strictly between 0 and 1. On any failure the process exits
        with status 1. The test specific cleanup always runs.
        """
        try:
            LOG.info('doctor fault management test starting.......')

            # prepare test env
            self.setup_fault_management()

            # wait for aodh alarms are updated in caches for event evaluator,
            # sleep time should be larger than event_alarm_cache_ttl
            # (default 60)
            time.sleep(60)

            # injecting host failure...
            # NOTE (umar) add INTERFACE_NAME logic to host injection

            self.fault.start(self.down_host)
            time.sleep(10)

            # verify the test results
            # NOTE (umar) copy remote monitor.log file when monitor=collectd
            self.check_host_status(self.down_host.name, 'down')

            notification_time = calculate_notification_time(LogFile)
            if 0 < notification_time < 1:
                LOG.info('doctor fault management test successfully, '
                         'notification_time=%s' % notification_time)
            else:
                LOG.error('doctor fault management test failed, '
                          'notification_time=%s' % notification_time)
                sys.exit(1)

            if self.conf.profiler_type:
                LOG.info('doctor fault management test begin to run '
                         'profile.......')
                self.collect_logs()
                self.run_profiler()
        except Exception as e:
            LOG.error('doctor fault management test failed, '
                      'Exception=%s' % e)
            sys.exit(1)
        finally:
            # Runs on success, on failure and on sys.exit() alike.
            self.cleanup_fault_management()

    def _amount_compute_nodes(self):
        """Return the number of nova-compute services listed by nova."""
        services = self.nova.services.list(binary='nova-compute')
        return len(services)

    def test_maintenance(self):
        """Run the maintenance test case (currently only a skeleton).

        The test is skipped when fewer than three compute nodes are
        available.
        """
        cnodes = self._amount_compute_nodes()
        if cnodes < 3:
            # need 2 compute for redundancy and one spare to migrate
            LOG.info('not enough compute nodes, skipping doctor '
                     'maintenance test')
            return
        try:
            LOG.info('doctor maintenance test starting.......')
            # TODO (tojuvone) test setup and actual test
        except Exception as e:
            LOG.error('doctor maintenance test failed, Exception=%s' % e)
            sys.exit(1)
        # TODO (tojuvone) finally: test case specific cleanup

    def run(self):
        """run doctor tests

        With ``conf.test_case == 'all'`` both test cases are executed;
        otherwise the method whose name equals ``conf.test_case`` is called.
        The common cleanup always runs.
        """
        try:
            LOG.info('doctor test starting.......')
            # prepare common test env
            self.setup()
            if self.conf.test_case == 'all':
                self.test_fault_management()
                self.test_maintenance()
            else:
                getattr(self, self.conf.test_case)()
        except Exception as e:
            LOG.error('doctor test failed, Exception=%s' % e)
            sys.exit(1)
        finally:
            self.cleanup()

    def get_host_info_for_random_vm(self):
        """Pick one test VM at random and return the host it runs on.

        :returns: ``Host(host_name, host_ip)`` for the chosen VM.
        :raises Exception: if nova lists no server with the chosen name.
        """
        num = random.randint(0, self.conf.instance_count - 1)
        vm_name = "%s%d" % (self.conf.instance_basename, num)

        servers = \
            {getattr(server, 'name'): server
             for server in self.nova.servers.list()}
        server = servers.get(vm_name)
        if not server:
            raise \
                Exception('Can not find instance: vm_name(%s)' % vm_name)
        host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
        host_ip = self.installer.get_host_ip_from_hostname(host_name)

        LOG.info('Get host info(name:%s, ip:%s) which vm(%s) launched at'
                 % (host_name, host_ip, vm_name))
        return Host(host_name, host_ip)

    def check_host_status(self, hostname, state):
        """Verify that nova-compute on ``hostname`` is in ``state``.

        :param hostname: name of the compute host to look up.
        :param state: expected service state, e.g. 'up' or 'down'.
        :raises AssertionError: if the reported state differs.
        """
        service = self.nova.services.list(host=hostname,
                                          binary='nova-compute')
        host_state = service[0].__dict__.get('state')
        # Raised explicitly instead of using ``assert`` so the check is not
        # stripped when Python runs with -O, and so a failure names the
        # host and both states.
        if host_state != state:
            raise AssertionError(
                'nova-compute on host %s is in state %s, expected %s'
                % (hostname, host_state, state))

    def unset_forced_down_hosts(self):
        """Clear the forced-down flag of the failed host, if one was set."""
        if self.down_host:
            self.nova.services.force_down(self.down_host.name,
                                          'nova-compute', False)
            time.sleep(2)
            self.check_host_status(self.down_host.name, 'up')

    def collect_logs(self):
        """Fetch the network-disable log through the fault injector."""
        self.fault.get_disable_network_log()

    def run_profiler(self):
        """Extract event timestamps from the logs and run the profiler.

        The timestamps are exported through the ``DOCTOR_PROFILER_T*``
        environment variables as integer offsets from the link-down
        timestamp (difference multiplied by 1000 and truncated).
        """
        # The patterns are raw strings so that ``\d`` reaches the regex
        # engine without being an invalid string escape sequence.
        # NOTE(review): the '.' between the digit groups is unescaped and
        # therefore matches any character - confirm whether that is wanted.
        net_down_log_file = self.fault.get_disable_network_log()
        reg = r'(?<=doctor set link down at )\d+.\d+'
        linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0))

        reg = r'(.* doctor mark vm.* error at )(\d+.\d+)'
        vmdown = float(match_rep_in_file(reg, LogFile).group(2))

        reg = r'(.* doctor mark host.* down at )(\d+.\d+)'
        hostdown = float(match_rep_in_file(reg, LogFile).group(2))

        reg = r'(?<=doctor monitor detected at )\d+.\d+'
        detected = float(match_rep_in_file(reg, LogFile).group(0))

        reg = r'(?<=doctor consumer notified at )\d+.\d+'
        notified = float(match_rep_in_file(reg, LogFile).group(0))

        # TODO(yujunz) check the actual delay to verify time sync status
        # expected ~1s delay from $trigger to $linkdown
        relative_start = linkdown
        os.environ['DOCTOR_PROFILER_T00'] = \
            str(int((linkdown - relative_start) * 1000))
        os.environ['DOCTOR_PROFILER_T01'] = \
            str(int((detected - relative_start) * 1000))
        os.environ['DOCTOR_PROFILER_T03'] = \
            str(int((vmdown - relative_start) * 1000))
        os.environ['DOCTOR_PROFILER_T04'] = \
            str(int((hostdown - relative_start) * 1000))
        os.environ['DOCTOR_PROFILER_T09'] = \
            str(int((notified - relative_start) * 1000))

        profiler_main(log=LOG)

    def cleanup_fault_management(self):
        """Undo everything set up for the fault management test case."""
        self.unset_forced_down_hosts()
        self.inspector.stop()
        self.monitor.stop()
        self.consumer.stop()
        self.alarm.delete()
        self.instance.delete()
        self.network.delete()
        self.fault.cleanup()

    def cleanup(self):
        """Undo the common setup done by :meth:`setup`."""
        self.installer.cleanup()
        self.image.delete()
        self.user.delete()
255
256
def main():
    """Load every config file found under etc/ and run the doctor tests."""
    here = os.path.dirname(os.path.realpath(__file__))
    repo_root = os.path.dirname(here)

    etc_dir = '{0}/{1}'.format(repo_root, 'etc/')
    # Only regular files are handed to the config loader; anything else
    # found in the directory is skipped.
    candidates = (join(etc_dir, entry) for entry in os.listdir(etc_dir))
    config_files = [path for path in candidates if isfile(path)]

    conf = config.prepare_conf(args=sys.argv[1:],
                               config_files=config_files)

    DoctorTest(conf).run()