Add a process group so that possible hanging subprocesses can be killed.
[doctor.git] / doctor_tests / main.py
##############################################################################
# Copyright (c) 2017 ZTE Corporation and others.
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
import os
from os.path import isfile, join
import random
import signal
import sys
import time

from doctor_tests.alarm import Alarm
from doctor_tests.common.constants import Host
from doctor_tests.common.utils import match_rep_in_file
from doctor_tests import config
from doctor_tests.consumer import get_consumer
from doctor_tests.identity_auth import get_identity_auth
from doctor_tests.identity_auth import get_session
from doctor_tests.image import Image
from doctor_tests.instance import Instance
from doctor_tests.inspector import get_inspector
from doctor_tests.installer import get_installer
import doctor_tests.logger as doctor_log
from doctor_tests.network import Network
from doctor_tests.monitor import get_monitor
from doctor_tests.os_clients import nova_client
from doctor_tests.profiler_poc import main as profiler_main
from doctor_tests.scenario.common import calculate_notification_time
from doctor_tests.scenario.network_failure import NetworkFault
from doctor_tests.user import User


Logger = doctor_log.Logger('doctor')
LOG = Logger.getLogger()
LogFile = Logger.getLogFilename()


class DoctorTest(object):

    def __init__(self, conf):
        self.conf = conf
        self.image = Image(self.conf, LOG)
        self.user = User(self.conf, LOG)
        self.network = Network(self.conf, LOG)
        self.instance = Instance(self.conf, LOG)
        self.alarm = Alarm(self.conf, LOG)
        self.installer = get_installer(self.conf, LOG)
        self.inspector = get_inspector(self.conf, LOG)
        self.monitor = get_monitor(self.conf,
                                   self.inspector.get_inspector_url(),
                                   LOG)
        self.consumer = get_consumer(self.conf, LOG)
        self.fault = NetworkFault(self.conf, self.installer, LOG)
        auth = get_identity_auth(project=self.conf.doctor_project)
        self.nova = nova_client(self.conf.nova_version,
                                get_session(auth=auth))
        self.down_host = None

    def setup(self):
        # prepare the cloud env
        self.installer.setup()

        # preparing VM image...
        self.image.create()

        # creating test user...
        self.user.create()

    def setup_fault_management(self):
        # user settings...
        self.user.update_quota()

        # creating VM...
        self.network.create()
        self.instance.create()
        self.instance.wait_for_vm_launch()

        # creating alarm...
        self.alarm.create()

        # starting doctor sample components...
        # TODO (tojuvone): move inspector and consumer to the common setup
        # once they support updating VMs via the instance.create and
        # instance.delete alarms

        self.inspector.start()
        self.consumer.start()
        self.down_host = self.get_host_info_for_random_vm()
        self.monitor.start(self.down_host)

    def test_fault_management(self):
        try:
            LOG.info('doctor fault management test starting.......')

            # prepare test env
            self.setup_fault_management()

            # wait until the aodh alarms are loaded into the event evaluator
            # caches; the sleep time should be longer than
            # event_alarm_cache_ttl (default 60s)
            time.sleep(60)

            # injecting host failure...
            # NOTE (umar) add INTERFACE_NAME logic to host injection

            self.fault.start(self.down_host)
            time.sleep(10)

            # verify the test results
            # NOTE (umar) copy remote monitor.log file when monitor=collectd
            self.check_host_status(self.down_host.name, 'down')

            notification_time = calculate_notification_time(LogFile)
            if 0 < notification_time < 1:
                LOG.info('doctor fault management test succeeded, '
                         'notification_time=%s' % notification_time)
            else:
                LOG.error('doctor fault management test failed, '
                          'notification_time=%s' % notification_time)
                sys.exit(1)

            if self.conf.profiler_type:
                LOG.info('doctor fault management test: running the '
                         'profiler.......')
                self.collect_logs()
                self.run_profiler()
        except Exception as e:
            LOG.error('doctor fault management test failed, '
                      'Exception=%s' % e)
            sys.exit(1)
        finally:
            self.cleanup_fault_management()

    def _amount_compute_nodes(self):
        services = self.nova.services.list(binary='nova-compute')
        return len(services)

    def test_maintenance(self):
        cnodes = self._amount_compute_nodes()
        if cnodes < 3:
            # need 2 compute nodes for redundancy and 1 spare to migrate to
            LOG.info('not enough compute nodes, skipping doctor '
                     'maintenance test')
            return
        try:
            LOG.info('doctor maintenance test starting.......')
            # TODO (tojuvone) test setup and actual test
        except Exception as e:
            LOG.error('doctor maintenance test failed, Exception=%s' % e)
            sys.exit(1)
        # TODO (tojuvone) finally: test case specific cleanup

    def run(self):
        """run doctor tests"""
        try:
            LOG.info('doctor test starting.......')
            # prepare common test env
            self.setup()
            if self.conf.test_case == 'all':
                self.test_fault_management()
                self.test_maintenance()
            else:
                function = 'test_%s' % self.conf.test_case
                if hasattr(self, function):
                    getattr(self, function)()
                else:
                    raise Exception('Cannot find function <%s> in '
                                    'DoctorTest, see config manual'
                                    % function)
        except Exception as e:
            LOG.error('doctor test failed, Exception=%s' % e)
            sys.exit(1)
        finally:
            self.cleanup()

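    # Pick one of the created VMs at random and return the Host (name, ip)
    # of the compute node it is running on.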
    def get_host_info_for_random_vm(self):
        num = random.randint(0, self.conf.instance_count - 1)
        vm_name = "%s%d" % (self.conf.instance_basename, num)

        servers = {getattr(server, 'name'): server
                   for server in self.nova.servers.list()}
        server = servers.get(vm_name)
        if not server:
            raise Exception('Cannot find instance: vm_name(%s)' % vm_name)
        host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
        host_ip = self.installer.get_host_ip_from_hostname(host_name)

        LOG.info('Got host info (name: %s, ip: %s) where vm %s is launched'
                 % (host_name, host_ip, vm_name))
        return Host(host_name, host_ip)

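    # Assert that the nova-compute service on the given host reports the
    # expected state ('up' or 'down').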
    def check_host_status(self, hostname, state):
        service = self.nova.services.list(host=hostname,
                                          binary='nova-compute')
        host_state = service[0].__dict__.get('state')
        assert host_state == state

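    # Clear the forced-down flag on the previously failed host and verify
    # that its nova-compute service is reported 'up' again.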
    def unset_forced_down_hosts(self):
        if self.down_host:
            self.nova.services.force_down(self.down_host.name,
                                          'nova-compute', False)
            time.sleep(2)
            self.check_host_status(self.down_host.name, 'up')

    def collect_logs(self):
        self.fault.get_disable_network_log()

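    # Extract event timestamps from the doctor and network-fault logs and
    # export them as DOCTOR_PROFILER_T* environment variables (milliseconds
    # relative to the link-down event) before invoking the profiler PoC.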
    def run_profiler(self):

        net_down_log_file = self.fault.get_disable_network_log()
        reg = r'(?<=doctor set link down at )\d+\.\d+'
        linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0))

        reg = r'(.* doctor mark vm.* error at )(\d+\.\d+)'
        vmdown = float(match_rep_in_file(reg, LogFile).group(2))

        reg = r'(.* doctor mark host.* down at )(\d+\.\d+)'
        hostdown = float(match_rep_in_file(reg, LogFile).group(2))

        reg = r'(?<=doctor monitor detected at )\d+\.\d+'
        detected = float(match_rep_in_file(reg, LogFile).group(0))

        reg = r'(?<=doctor consumer notified at )\d+\.\d+'
        notified = float(match_rep_in_file(reg, LogFile).group(0))

        # TODO(yujunz) check the actual delay to verify time sync status
        # expected ~1s delay from $trigger to $linkdown
        relative_start = linkdown
        os.environ['DOCTOR_PROFILER_T00'] = (
            str(int((linkdown - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T01'] = (
            str(int((detected - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T03'] = (
            str(int((vmdown - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T04'] = (
            str(int((hostdown - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T09'] = (
            str(int((notified - relative_start) * 1000)))

        profiler_main(log=LOG)

    def cleanup_fault_management(self):
        self.unset_forced_down_hosts()
        self.inspector.stop()
        self.monitor.stop()
        self.consumer.stop()
        self.alarm.delete()
        self.instance.delete()
        self.network.delete()
        self.fault.cleanup()

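    # Common cleanup; killing the whole process group (created in main())
    # takes down any subprocesses that may still be hanging around.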
    def cleanup(self):
        self.installer.cleanup()
        self.image.delete()
        self.user.delete()
        # Kill possible hanging subprocesses
        os.killpg(0, signal.SIGKILL)


def main():
    """doctor main"""
    # TODO (tojuvone): JIRA DOCTOR-123: Test cases do not always manage to
    # kill all of their subprocesses. Creating a process group here ensures
    # they can all be killed without knowing what they are.
    os.setpgrp()
    test_dir = os.path.split(os.path.realpath(__file__))[0]
    doctor_root_dir = os.path.dirname(test_dir)

    config_file_dir = '{0}/{1}'.format(doctor_root_dir, 'etc/')
    config_files = [join(config_file_dir, f)
                    for f in os.listdir(config_file_dir)
                    if isfile(join(config_file_dir, f))]

    conf = config.prepare_conf(args=sys.argv[1:],
                               config_files=config_files)

    doctor = DoctorTest(conf)
    doctor.run()