1 ##############################################################################
2 # Copyright (c) 2017 ZTE Corporation and others.
4 # All rights reserved. This program and the accompanying materials
5 # are made available under the terms of the Apache License, Version 2.0
6 # which accompanies this distribution, and is available at
7 # http://www.apache.org/licenses/LICENSE-2.0
8 ##############################################################################
import os
import random

from doctor_tests.alarm import Alarm
from doctor_tests.common.constants import Host
from doctor_tests.common.utils import get_doctor_test_root_dir
from doctor_tests.common.utils import match_rep_in_file
from doctor_tests.common.utils import SSHClient
from doctor_tests.consumer import get_consumer
from doctor_tests.identity_auth import get_identity_auth
from doctor_tests.identity_auth import get_session
from doctor_tests.instance import Instance
from doctor_tests.inspector import get_inspector
from doctor_tests.monitor import get_monitor
from doctor_tests.network import Network
from doctor_tests.profiler_poc import main as profiler_main
from doctor_tests.os_clients import nova_client
# Shell script template copied to the compute host by
# FaultManagement._set_link_down().  ``{compute_ip}`` is filled in with
# str.format(); the doubled braces around ``print $NF`` survive formatting
# as literal awk braces.  The script looks up the interface that carries the
# given address, takes it down, prints the timestamp that
# FaultManagement.run_profiler() later parses ("doctor set link down at
# <seconds>.<nanoseconds>"), and brings the interface back up.
#
# A raw string is used so that the awk pattern's ``\/`` reaches the shell
# unchanged without relying on Python tolerating an invalid escape sequence.
#
# NOTE(review): no delay is visible between "link set down" and "link set
# up", so the outage would last only as long as the echo takes -- confirm
# whether sleep lines were lost from this template.
LINK_DOWN_SCRIPT = r"""
dev=$(sudo ip a | awk '/ {compute_ip}\//{{print $NF}}')
sudo ip link set $dev down
echo "doctor set link down at" $(date "+%s.%N")
sudo ip link set $dev up
"""
class FaultManagement(object):
    """Run the doctor fault-management scenario against one compute host.

    The object launches the test instances, starts the sample inspector and
    monitor, takes down the network link of the host that runs a randomly
    chosen instance, and can then compare the monitor's detection time with
    the consumer's notification time.

    NOTE(review): the text under review had lost its indentation and a
    number of short lines (several ``def`` headers, guards and the opening
    or closing parts of some calls).  Everything tagged ``NOTE(review)``
    below was reconstructed from the surrounding statements and must be
    confirmed against the callers and the helper classes.
    """

    def __init__(self, conf, installer, user, log):
        """Build the helper objects; nothing is created on the cloud yet.

        :param conf: test configuration.  This class reads
            ``doctor_project``, ``nova_version``, ``instance_count``,
            ``instance_basename`` and ``profiler_type`` from it.
        :param installer: installer adapter; ``node_user_name``,
            ``get_ssh_key_from_installer()`` and
            ``get_host_ip_from_hostname()`` are used.
        :param user: object whose ``update_quota()`` is called in setup.
        :param log: logger handed to every helper and used for progress
            messages.
        """
        # Every other method reads these three attributes, so bind them
        # before anything dereferences self.conf below.
        self.conf = conf
        self.log = log
        self.user = user
        self.installer = installer
        auth = get_identity_auth(project=self.conf.doctor_project)
        self.nova = nova_client(self.conf.nova_version,
                                get_session(auth=auth))
        self.test_dir = get_doctor_test_root_dir()
        # Filled in by setup(); kept as None until then so that cleanup()
        # can run safely even when setup() failed early.
        self.down_host = None
        # Local path of the fetched log, cached by get_disable_network_log()
        self.disable_network_log = None
        self.network = Network(self.conf, log)
        self.instance = Instance(self.conf, log)
        self.alarm = Alarm(self.conf, log)
        self.inspector = get_inspector(self.conf, log)
        # NOTE(review): the last argument of this call was cut off; ``log``
        # is assumed because every sibling factory receives it -- confirm
        # against get_monitor().
        self.monitor = get_monitor(self.conf,
                                   self.inspector.get_inspector_url(),
                                   log)
        self.consumer = get_consumer(self.conf, log)

    def setup(self):
        """Prepare quota, instances, inspector and monitor for the test.

        NOTE(review): the ``def`` line of this phase was missing; the name
        is inferred from the 'fault management setup' log message --
        confirm against the callers.
        """
        self.log.info('fault management setup......')
        self.user.update_quota()

        # NOTE(review): cleanup() deletes self.network and __init__ builds
        # self.alarm, but no create() call for either is visible here --
        # confirm whether those lines were lost.
        self.instance.create()
        self.instance.wait_for_vm_launch()

        # starting doctor sample components...
        # tbd tojuvone: move inspector and consumer to common setup
        # when they support updating VMs via instance.create and
        # instance.delete alarm
        self.inspector.start()
        # NOTE(review): the consumer is never started in the visible code
        # although check_notification_time() reads its notified_time.
        self.down_host = self.get_host_info_for_random_vm()
        self.monitor.start(self.down_host)

    def start(self):
        """Inject the fault by taking down the selected host's link.

        NOTE(review): ``def`` line reconstructed; name inferred from the
        'fault management start' log message.
        """
        self.log.info('fault management start......')
        self._set_link_down(self.down_host.ip)
        self.log.info('fault management end......')

    def cleanup(self):
        """Collect the remote log, re-enable the host and delete resources.

        NOTE(review): ``def`` line reconstructed; name inferred from the
        'fault management cleanup' log message.  No stop() calls for the
        inspector or monitor are visible -- confirm whether they were lost.
        """
        self.log.info('fault management cleanup......')
        self.get_disable_network_log()
        self.unset_forced_down_hosts()
        self.instance.delete()
        self.network.delete()

    def get_host_info_for_random_vm(self):
        """Pick one test instance at random and return the host it runs on.

        :returns: ``Host(host_name, host_ip)`` of the hypervisor.
        :raises Exception: if nova lists no server with the chosen name.
        """
        num = random.randint(0, self.conf.instance_count - 1)
        vm_name = "%s%d" % (self.conf.instance_basename, num)

        servers = {server.name: server
                   for server in self.nova.servers.list()}
        server = servers.get(vm_name)
        # Only a missing server is an error; without this guard the raise
        # fired on every call and the lookup below was unreachable.
        if server is None:
            raise Exception('Can not find instance: vm_name(%s)' % vm_name)
        # The attribute name contains ':' and '-', so it cannot be read
        # with normal attribute syntax.
        host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname')
        host_ip = self.installer.get_host_ip_from_hostname(host_name)

        self.log.info('Get host info(name:%s, ip:%s) which vm(%s) launched at'
                      % (host_name, host_ip, vm_name))
        return Host(host_name, host_ip)

    def unset_forced_down_hosts(self):
        """Clear the forced-down flag of the faulted host and verify 'up'.

        Does nothing when no host was selected, so cleanup() does not fail
        on ``None.name`` after an aborted setup().
        """
        if self.down_host is None:
            return
        self.nova.services.force_down(self.down_host.name,
                                      'nova-compute', False)
        # NOTE(review): the state is checked immediately after the API
        # call; confirm whether a short wait was lost here.
        self.check_host_status('up')

    def check_host_status(self, state):
        """Verify that nova-compute on the faulted host reports ``state``.

        :param state: expected service state string, e.g. ``'up'``.
        :raises IndexError: if nova lists no nova-compute service for the
            host.
        :raises AssertionError: if the reported state differs.
        """
        services = self.nova.services.list(host=self.down_host.name,
                                           binary='nova-compute')
        if not services:
            raise IndexError('no nova-compute service listed for host %s'
                             % self.down_host.name)
        host_state = services[0].__dict__.get('state')
        # Raised explicitly instead of using ``assert`` so the check is not
        # stripped when Python runs with -O.
        if host_state != state:
            raise AssertionError('host %s state is %s, expected %s'
                                 % (self.down_host.name, host_state, state))

    def get_disable_network_log(self):
        """Fetch disable_network.log from the faulted host, once.

        :returns: local path of the log, or ``None`` when no host has been
            selected yet.
        """
        if self.disable_network_log:
            self.log.info('Already get the disable_netork.log '
                          'from down_host......')
            return self.disable_network_log
        if self.down_host is not None:
            client = self._open_ssh(self.down_host.ip)
            self.disable_network_log = \
                '{0}/{1}'.format(self.test_dir,
                                 'disable_network.log')
            # NOTE(review): the final argument of this call was cut off.
            # ``method='get'`` is assumed because the remote file name
            # comes first and a local path second -- confirm against
            # SSHClient.scp().
            client.scp('disable_network.log',
                       self.disable_network_log,
                       method='get')
            self.log.info('Get the disable_netork.log from'
                          'down_host(host_name:%s, host_ip:%s)'
                          % (self.down_host.name, self.down_host.ip))
        return self.disable_network_log

    def _open_ssh(self, host_ip):
        """Return an SSHClient for ``host_ip`` with installer credentials."""
        # NOTE(review): the opening of both SSHClient calls and anything
        # after ``key_filename`` were cut off in the reviewed text; the
        # host address as first argument is assumed -- confirm against
        # SSHClient.__init__.
        return SSHClient(
            host_ip,
            self.installer.node_user_name,
            key_filename=self.installer.get_ssh_key_from_installer())

    def _set_link_down(self, compute_ip):
        """Render the link-down script, upload it and launch it remotely.

        :param compute_ip: address of the compute host; also substituted
            into the script to find the interface to disable.
        """
        file_name = '{0}/{1}'.format(self.test_dir, 'disable_network.sh')
        with open(file_name, 'w') as script:
            script.write(LINK_DOWN_SCRIPT.format(compute_ip=compute_ip))
        client = self._open_ssh(compute_ip)
        client.scp(file_name, 'disable_network.sh')
        # Backgrounded with output redirected: the script cuts the link
        # this session may be using, and its output is the log that
        # get_disable_network_log() fetches later.
        command = 'bash disable_network.sh > disable_network.log 2>&1 &'
        # NOTE(review): the statement that runs ``command`` was missing;
        # ``client.ssh`` is assumed -- confirm against SSHClient.
        client.ssh(command)

    def check_notification_time(self):
        """Check that the consumer was notified within one second.

        The profiler, when configured, runs for both outcomes, before a
        failure is raised.

        :raises Exception: if either timestamp is missing, or if the delay
            between detection and notification is not within (0, 1) sec.
        """
        if self.consumer.notified_time is None \
                or self.monitor.detected_time is None:
            raise Exception('doctor fault management test failed, '
                            'detected_time=%s, notified_time=%s'
                            % (self.monitor.detected_time,
                               self.consumer.notified_time))
        notification_time = \
            self.consumer.notified_time - \
            self.monitor.detected_time
        in_time = 0 < notification_time < 1
        if in_time:
            self.log.info('doctor fault management test successfully,'
                          'notification_time=%s' % notification_time)

        if self.conf.profiler_type:
            self.log.info('run doctor fault management profile.......')
            # The message above announced the profile run, but nothing
            # invoked it.
            self.run_profiler()

        if not in_time:
            raise Exception('doctor fault management test failed, '
                            'notification_time=%s' % notification_time)

    def run_profiler(self):
        """Export the scenario timestamps and run the profiler.

        Each timestamp is published as milliseconds relative to the moment
        the link went down, in ``DOCTOR_PROFILER_Txx`` environment
        variables, before ``profiler_main`` is called.

        :raises Exception: if the link-down timestamp is not found in the
            fetched log.
        """
        net_down_log_file = self.get_disable_network_log()
        # Raw string so ``\d`` reaches the regex engine as written; the dot
        # is escaped because ``date "+%s.%N"`` prints a literal '.'.
        reg = r'(?<=doctor set link down at )\d+\.\d+'
        found = match_rep_in_file(reg, net_down_log_file)
        # presumably a re match object or None -- verify against
        # match_rep_in_file()
        if found is None:
            raise Exception('Can not find link down time in %s'
                            % net_down_log_file)
        linkdown = float(found.group(0))

        # TODO(yujunz) check the actual delay to verify time sync status
        # expected ~1s delay from $trigger to $linkdown
        relative_start = linkdown
        checkpoints = (
            ('DOCTOR_PROFILER_T00', linkdown),
            ('DOCTOR_PROFILER_T01', self.monitor.detected_time),
            ('DOCTOR_PROFILER_T03', self.inspector.vm_down_time),
            ('DOCTOR_PROFILER_T04', self.inspector.host_down_time),
            ('DOCTOR_PROFILER_T09', self.consumer.notified_time),
        )
        for variable, stamp in checkpoints:
            os.environ[variable] = str(int((stamp - relative_start) * 1000))

        profiler_main(log=self.log)