doctor_tests/scenario/fault_management.py
##############################################################################
# Copyright (c) 2017 ZTE Corporation and others.
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
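"""Fault management scenario of the Doctor test suite.

The scenario brings down the network link on the compute host of a
randomly selected VM and verifies that the resulting host failure is
detected and notified to the consumer in time.
"""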
import os
import random
import time

from doctor_tests.alarm import Alarm
from doctor_tests.common.constants import Host
from doctor_tests.common.utils import get_doctor_test_root_dir
from doctor_tests.common.utils import match_rep_in_file
from doctor_tests.common.utils import SSHClient
from doctor_tests.consumer import get_consumer
from doctor_tests.identity_auth import get_identity_auth
from doctor_tests.identity_auth import get_session
from doctor_tests.instance import Instance
from doctor_tests.inspector import get_inspector
from doctor_tests.monitor import get_monitor
from doctor_tests.network import Network
from doctor_tests.profiler_poc import main as profiler_main
from doctor_tests.os_clients import nova_client


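# Shell script executed on the target compute host: it looks up the network
# interface that carries {compute_ip}, takes the link down for 30 seconds and
# logs the link-down timestamp, which run_profiler() later reads back from
# disable_network.log.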
LINK_DOWN_SCRIPT = r"""
#!/bin/bash -x
dev=$(sudo ip a | awk '/ {compute_ip}\//{{print $NF}}')
sleep 1
sudo ip link set $dev down
echo "doctor set link down at" $(date "+%s.%N")
sleep 30
sudo ip link set $dev up
sleep 1
"""


class FaultManagement(object):
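    """Doctor fault management scenario.

    Shuts down the network link of the compute host that runs a randomly
    chosen test VM, then measures how long it takes from the monitor
    detecting the failure to the consumer receiving the notification.

    A driver is expected to call the public methods roughly in this order
    (illustrative sketch only; the actual sequencing is done by the
    Doctor test runner):

        fm = FaultManagement(conf, installer, user, log, transport_url)
        fm.setup()                    # VM, alarm, doctor components
        fm.start()                    # inject link-down on compute host
        fm.check_notification_time()  # require < 1s end to end
        fm.cleanup()
    """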

    def __init__(self, conf, installer, user, log, transport_url):
        self.conf = conf
        self.log = log
        self.user = user
        self.installer = installer
        auth = get_identity_auth(project=self.conf.doctor_project)
        self.nova = nova_client(self.conf.nova_version,
                                get_session(auth=auth))
        self.test_dir = get_doctor_test_root_dir()
        self.down_host = None
        self.GetLog = False
        self.disable_network_log = None
        self.network = Network(self.conf, log)
        self.instance = Instance(self.conf, log)
        self.alarm = Alarm(self.conf, log)
        self.inspector = get_inspector(self.conf, log, transport_url)
        self.monitor = get_monitor(self.conf,
                                   self.inspector.get_inspector_url(),
                                   log)
        self.consumer = get_consumer(self.conf, log)

    def setup(self):
        self.log.info('fault management setup......')

        # user settings...
        self.user.update_quota()

        # creating VM...
        self.network.create()
        self.instance.create()
        self.instance.wait_for_vm_launch()

        # creating alarm...
        self.alarm.create()

        # starting doctor sample components...
        # TODO (tojuvone): move inspector and consumer to common setup
        # when they support updating VMs via instance.create and
        # instance.delete alarm

        self.inspector.start()
        self.consumer.start()
        self.down_host = self.get_host_info_for_random_vm()
        self.monitor.start(self.down_host)

    def start(self):
        self.log.info('fault management start......')
        self._set_link_down(self.down_host.ip)
        self.log.info('fault management end......')

    def cleanup(self):
        self.log.info('fault management cleanup......')

        self.get_disable_network_log()
        self.unset_forced_down_hosts()
        self.inspector.stop()
        self.monitor.stop()
        self.consumer.stop()
        self.alarm.delete()
        self.instance.delete()
        self.network.delete()

    def get_host_info_for_random_vm(self):
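        """Pick one of the test VMs at random.

        Returns a Host tuple with the name and IP of the compute host
        the VM was launched on.
        """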
        num = random.randint(0, self.conf.instance_count - 1)
        vm_name = "%s%d" % (self.conf.instance_basename, num)

        servers = {getattr(server, 'name'): server
                   for server in self.nova.servers.list()}
        server = servers.get(vm_name)
        if not server:
            raise Exception('Cannot find instance: vm_name(%s)' % vm_name)
        host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
        host_ip = self.installer.get_host_ip_from_hostname(host_name)

        self.log.info('Got host info (name: %s, ip: %s) on which vm(%s) was '
                      'launched' % (host_name, host_ip, vm_name))
        return Host(host_name, host_ip)

    def unset_forced_down_hosts(self):
        if self.down_host:
            self.nova.services.force_down(self.down_host.name,
                                          'nova-compute', False)
            time.sleep(2)
            self.check_host_status('up')

    def check_host_status(self, state):
        service = self.nova.services.list(host=self.down_host.name,
                                          binary='nova-compute')
        host_state = service[0].__dict__.get('state')
        assert host_state == state

    def get_disable_network_log(self):
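        """Fetch disable_network.log from the down host over SCP.

        The log is retrieved only once; later calls return the cached
        local path.
        """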
        if self.GetLog:
            self.log.info('Already got the disable_network.log '
                          'from down_host......')
            return self.disable_network_log
        if self.down_host is not None:
            client = SSHClient(
                self.down_host.ip,
                self.installer.node_user_name,
                key_filename=self.installer.get_ssh_key_from_installer(),
                look_for_keys=True,
                log=self.log)

            self.disable_network_log = \
                '{0}/{1}'.format(self.test_dir,
                                 'disable_network.log')
            client.scp('disable_network.log',
                       self.disable_network_log,
                       method='get')
            self.log.info('Got the disable_network.log from '
                          'down_host(host_name:%s, host_ip:%s)'
                          % (self.down_host.name, self.down_host.ip))
        self.GetLog = True
        return self.disable_network_log

    def _set_link_down(self, compute_ip):
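        """Copy the link-down script to the compute host and run it.

        The script is started in the background on the remote host and
        writes its output to disable_network.log there.
        """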
        file_name = '{0}/{1}'.format(self.test_dir, 'disable_network.sh')
        with open(file_name, 'w') as script_file:
            script_file.write(LINK_DOWN_SCRIPT.format(compute_ip=compute_ip))
        client = SSHClient(
            compute_ip,
            self.installer.node_user_name,
            key_filename=self.installer.get_ssh_key_from_installer(),
            look_for_keys=True,
            log=self.log)
        client.scp(file_name, 'disable_network.sh')
        command = 'bash disable_network.sh > disable_network.log 2>&1 &'
        client.ssh(command)

    def check_notification_time(self):
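        """Check the time from fault detection to consumer notification.

        The test passes only if the notification arrived within one
        second of the monitor detecting the failure; otherwise the
        profiler is run (if configured) and an exception is raised.
        """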
        if self.consumer.notified_time is None \
                or self.monitor.detected_time is None:
            raise Exception('doctor fault management test failed, '
                            'detected_time=%s, notified_time=%s'
                            % (self.monitor.detected_time,
                               self.consumer.notified_time))
        notification_time = \
            self.consumer.notified_time - \
            self.monitor.detected_time

        self.log.info('doctor fault management notification_time=%s'
                      % notification_time)

        if 0 < notification_time < 1:
            self.log.info('doctor fault management test succeeded')
        else:
            if self.conf.profiler_type:
                self.log.info('run doctor fault management profiler.......')
                self.run_profiler()

            raise Exception('doctor fault management test failed, '
                            'notification_time=%s' % notification_time)

        if self.conf.profiler_type:
            self.log.info('run doctor fault management profiler.......')
            self.run_profiler()

    def run_profiler(self):
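        """Export the measured timestamps and run the profiler PoC.

        The DOCTOR_PROFILER_T* values are milliseconds relative to the
        link-down time read from disable_network.log.
        """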

        net_down_log_file = self.get_disable_network_log()
        reg = r'(?<=doctor set link down at )\d+\.\d+'
        linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0))

        vmdown = self.inspector.vm_down_time
        hostdown = self.inspector.host_down_time
        detected = self.monitor.detected_time
        notified = self.consumer.notified_time

        # TODO(yujunz) check the actual delay to verify time sync status
        # expected ~1s delay from $trigger to $linkdown
        relative_start = linkdown
        os.environ['DOCTOR_PROFILER_T00'] = (
            str(int((linkdown - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T01'] = (
            str(int((detected - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T03'] = (
            str(int((vmdown - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T04'] = (
            str(int((hostdown - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T09'] = (
            str(int((notified - relative_start) * 1000)))

        profiler_main(log=self.log)