use hostname without domain name & fix the `find` command in tox
[doctor.git] / doctor_tests / scenario / fault_management.py
##############################################################################
# Copyright (c) 2017 ZTE Corporation and others.
#
# All rights reserved. This program and the accompanying materials
# are made available under the terms of the Apache License, Version 2.0
# which accompanies this distribution, and is available at
# http://www.apache.org/licenses/LICENSE-2.0
##############################################################################
import os
import random
import time

from doctor_tests.alarm import Alarm
from doctor_tests.common.constants import Host
from doctor_tests.common.utils import get_doctor_test_root_dir
from doctor_tests.common.utils import match_rep_in_file
from doctor_tests.common.utils import SSHClient
from doctor_tests.consumer import get_consumer
from doctor_tests.identity_auth import get_identity_auth
from doctor_tests.identity_auth import get_session
from doctor_tests.inspector import get_inspector
from doctor_tests.instance import Instance
from doctor_tests.monitor import get_monitor
from doctor_tests.network import Network
from doctor_tests.os_clients import nova_client
from doctor_tests.profiler_poc import main as profiler_main


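# Shell script template used to simulate a network fault on the compute
# host: str.format() substitutes {compute_ip}, while the doubled braces
# keep the awk action block literal. The awk pattern picks the interface
# whose `ip a` address line carries the compute IP.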
LINK_DOWN_SCRIPT = """
#!/bin/bash -x
dev=$(sudo ip a | awk '/ {compute_ip}\//{{print $NF}}')
sleep 1
sudo ip link set $dev down
echo "doctor set link down at" $(date "+%s.%N")
sleep 30
sudo ip link set $dev up
sleep 1
"""


class FaultManagement(object):
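    """Fault management test scenario.

    Injects a fault by taking down the network link of the compute host
    that runs a doctor test VM, then checks that the consumer is
    notified within the expected time.

    A typical flow (a sketch; the conf, installer and user objects are
    provided by the doctor test runner):

        fm = FaultManagement(conf, installer, user, log, transport_url)
        fm.setup()
        fm.start()
        fm.check_notification_time()
        fm.cleanup()
    """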

    def __init__(self, conf, installer, user, log, transport_url):
        self.conf = conf
        self.log = log
        self.user = user
        self.installer = installer
        auth = get_identity_auth(project=self.conf.doctor_project)
        self.nova = nova_client(self.conf.nova_version,
                                get_session(auth=auth))
        self.test_dir = get_doctor_test_root_dir()
        self.down_host = None
        self.got_log = False
        self.disable_network_log = None
        self.network = Network(self.conf, log)
        self.instance = Instance(self.conf, log)
        self.alarm = Alarm(self.conf, log)
        self.inspector = get_inspector(self.conf, log, transport_url)
        self.monitor = get_monitor(self.conf,
                                   self.inspector.get_inspector_url(),
                                   log)
        self.consumer = get_consumer(self.conf, log)

    def setup(self):
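        """Prepare resources and start the doctor sample components.

        Creates the test network, VM(s) and alarm, starts the sample
        inspector and consumer, and points a monitor at the compute
        host picked to be failed.
        """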
        self.log.info('fault management setup......')

        # user settings...
        self.user.update_quota()

        # creating VM...
        self.network.create()
        self.instance.create()
        self.instance.wait_for_vm_launch()

        # creating alarm...
        self.alarm.create()

        # starting doctor sample components...
        # TODO(tojuvone): move inspector and consumer to common setup
        # when they support updating VMs via instance.create and
        # instance.delete alarm

        self.inspector.start()
        self.consumer.start()
        self.down_host = self.get_host_info_for_random_vm()
        self.monitor.start(self.down_host)

    def start(self):
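        """Inject the fault: take the down host's network link down."""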
        self.log.info('fault management start......')
        self._set_link_down(self.down_host.ip)
        self.log.info('fault management end......')

    def cleanup(self):
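        """Collect the fault log, recover the host and delete resources."""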
        self.log.info('fault management cleanup......')

        self.get_disable_network_log()
        self.unset_forced_down_hosts()
        self.inspector.stop()
        self.monitor.stop()
        self.consumer.stop()
        self.alarm.delete()
        self.instance.delete()
        self.network.delete()

    def get_host_info_for_random_vm(self):
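        """Pick a random test VM and return its compute host as a Host."""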
        num = random.randint(0, self.conf.instance_count - 1)
        vm_name = "%s%d" % (self.conf.instance_basename, num)

        servers = {server.name: server
                   for server in self.nova.servers.list()}
        server = servers.get(vm_name)
        if not server:
            raise Exception('Cannot find instance: vm_name(%s)' % vm_name)
        # use the hostname without the domain name, which is mapped to
        # the cell
        hostname = \
            server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
        host_name = hostname.split('.')[0]
        host_ip = self.installer.get_host_ip_from_hostname(host_name)

        self.log.info('Got host info (name: %s, ip: %s) where vm(%s) was '
                      'launched' % (host_name, host_ip, vm_name))
        return Host(host_name, host_ip)

    def unset_forced_down_hosts(self):
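        """Clear the forced-down flag on nova-compute and verify it is up."""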
        if self.down_host:
            self.nova.services.force_down(self.down_host.name,
                                          'nova-compute', False)
            time.sleep(2)
            self.check_host_status('up')

    def check_host_status(self, state):
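        """Assert that the down host's nova-compute service is in `state`."""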
        services = self.nova.services.list(host=self.down_host.name,
                                           binary='nova-compute')
        host_state = services[0].__dict__.get('state')
        assert host_state == state

    def get_disable_network_log(self):
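        """Fetch disable_network.log from the down host (only once)."""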
        if self.got_log:
            self.log.info('Already got the disable_network.log '
                          'from down_host......')
            return self.disable_network_log
        if self.down_host is not None:
            client = SSHClient(
                self.down_host.ip,
                self.installer.node_user_name,
                key_filename=self.installer.get_ssh_key_from_installer(),
                look_for_keys=True,
                log=self.log)

            self.disable_network_log = \
                '{0}/{1}'.format(self.test_dir,
                                 'disable_network.log')
            client.scp('disable_network.log',
                       self.disable_network_log,
                       method='get')
            self.log.info('Got the disable_network.log from '
                          'down_host(host_name:%s, host_ip:%s)'
                          % (self.down_host.name, self.down_host.ip))
        self.got_log = True
        return self.disable_network_log

    def _set_link_down(self, compute_ip):
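        """Render the link-down script, copy it to the host and run it."""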
        file_name = '{0}/{1}'.format(self.test_dir, 'disable_network.sh')
        with open(file_name, 'w') as script_file:
            script_file.write(LINK_DOWN_SCRIPT.format(compute_ip=compute_ip))
        client = SSHClient(
            compute_ip,
            self.installer.node_user_name,
            key_filename=self.installer.get_ssh_key_from_installer(),
            look_for_keys=True,
            log=self.log)
        client.scp(file_name, 'disable_network.sh')
        command = 'bash disable_network.sh > disable_network.log 2>&1 &'
        client.ssh(command)

    def check_notification_time(self):
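        """Check the time from fault detection to consumer notification.

        The test passes when the notification arrived within one second
        of detection; otherwise an exception is raised. The profiler is
        run afterwards when conf.profiler_type is set.
        """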
        if self.consumer.notified_time is None \
                or self.monitor.detected_time is None:
            raise Exception('doctor fault management test failed, '
                            'detected_time=%s, notified_time=%s'
                            % (self.monitor.detected_time,
                               self.consumer.notified_time))
        notification_time = \
            self.consumer.notified_time - \
            self.monitor.detected_time

        self.log.info('doctor fault management notification_time=%s'
                      % notification_time)

        if 0 < notification_time < 1:
            self.log.info('doctor fault management test succeeded')
        else:
            if self.conf.profiler_type:
                self.log.info('run doctor fault management profiler.......')
                self.run_profiler()

            raise Exception('doctor fault management test failed, '
                            'notification_time=%s' % notification_time)

        if self.conf.profiler_type:
            self.log.info('run doctor fault management profiler.......')
            self.run_profiler()

    def run_profiler(self):
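        """Export the measured timestamps and run the profiler PoC.

        Timestamps are passed as milliseconds relative to the link-down
        time via the DOCTOR_PROFILER_T* environment variables.
        """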
        net_down_log_file = self.get_disable_network_log()
        reg = r'(?<=doctor set link down at )\d+\.\d+'
        linkdown = float(match_rep_in_file(reg, net_down_log_file).group(0))

        vmdown = self.inspector.vm_down_time
        hostdown = self.inspector.host_down_time
        detected = self.monitor.detected_time
        notified = self.consumer.notified_time

        if None in [vmdown, hostdown, detected, notified]:
            self.log.info('one of the times for profiler is None, return')
            return

        # TODO(yujunz) check the actual delay to verify time sync status
        # expected ~1s delay from $trigger to $linkdown
        relative_start = linkdown
        os.environ['DOCTOR_PROFILER_T00'] = (
            str(int((linkdown - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T01'] = (
            str(int((detected - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T03'] = (
            str(int((vmdown - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T04'] = (
            str(int((hostdown - relative_start) * 1000)))
        os.environ['DOCTOR_PROFILER_T09'] = (
            str(int((notified - relative_start) * 1000)))

        profiler_main(log=self.log)