12 from teuthology import misc as teuthology
14 log = logging.getLogger(__name__)
18 Get monitor names from the context value.
20 mons = [f[len('mon.'):] for f in teuthology.get_mon_names(ctx)]
23 class MonitorThrasher:
29 - wait for quorum to be formed
30 - sleep for 'revive_delay' seconds
32 - wait for quorum to be formed
33 - sleep for 'thrash_delay' seconds
37 seed Seed to use on the RNG to reproduce a previous
38 behaviour (default: None; i.e., not set)
39 revive_delay Number of seconds to wait before reviving
40 the monitor (default: 10)
41 thrash_delay Number of seconds to wait in-between
42 test iterations (default: 0)
43 store_thrash Thrash monitor store before killing the monitor being thrashed (default: False)
44 store_thrash_probability Probability of thrashing a monitor's store
46 thrash_many Thrash multiple monitors instead of just one. If
47 'maintain_quorum' is set to False, then we will
48 thrash up to as many monitors as there are
49 available. (default: False)
50 maintain_quorum Always maintain quorum, taking care on how many
51 monitors we kill during the thrashing. If we
52 happen to only have one or two monitors configured,
53 if this option is set to True, then we won't run
54 this task as we cannot guarantee maintenance of
55 quorum. Setting it to false however would allow the
56 task to run with as many as just one single monitor.
58 freeze_mon_probability: how often to freeze the mon instead of killing it,
60 freeze_mon_duration: how many seconds to freeze the mon (default: 15)
61 scrub Scrub after each iteration (default: True)
63 Note: if 'store_thrash' is set to True, then 'maintain_quorum' must also
74 store_thrash_probability: 40
84 def __init__(self, ctx, manager, config, logger):
86 self.manager = manager
87 self.manager.wait_for_clean()
93 if self.config is None:
96 """ Test reproducibility """
97 self.random_seed = self.config.get('seed', None)
99 if self.random_seed is None:
100 self.random_seed = int(time.time())
102 self.rng = random.Random()
103 self.rng.seed(int(self.random_seed))
105 """ Monitor thrashing """
106 self.revive_delay = float(self.config.get('revive_delay', 10.0))
107 self.thrash_delay = float(self.config.get('thrash_delay', 0.0))
109 self.thrash_many = self.config.get('thrash_many', False)
110 self.maintain_quorum = self.config.get('maintain_quorum', True)
112 self.scrub = self.config.get('scrub', True)
114 self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
115 self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))
117 assert self.max_killable() > 0, \
118 'Unable to kill at least one monitor with the current config.'
120 """ Store thrashing """
121 self.store_thrash = self.config.get('store_thrash', False)
122 self.store_thrash_probability = int(
123 self.config.get('store_thrash_probability', 50))
124 if self.store_thrash:
125 assert self.store_thrash_probability > 0, \
126 'store_thrash is set, probability must be > 0'
127 assert self.maintain_quorum, \
128 'store_thrash = true must imply maintain_quorum = true'
130 self.thread = gevent.spawn(self.do_thrash)
134 locally log info messages
140 Break out of this process's thrashing loop.
145 def should_thrash_store(self):
147 If allowed, indicate that we should thrash a certain percentage of
148 the time as determined by the store_thrash_probability value.
150 if not self.store_thrash:
152 return self.rng.randrange(0, 101) < self.store_thrash_probability
154 def thrash_store(self, mon):
156 Thrash the monitor specified.
157 :param mon: monitor to thrash
159 addr = self.ctx.ceph['ceph'].conf['mon.%s' % mon]['mon addr']
160 self.log('thrashing mon.{id}@{addr} store'.format(id=mon, addr=addr))
161 out = self.manager.raw_cluster_cmd('-m', addr, 'sync', 'force')
163 assert j['ret'] == 0, \
164 'error forcing store sync on mon.{id}:\n{ret}'.format(
def should_freeze_mon(self):
    """
    Decide whether a monitor should be frozen on this pass.

    Draws an integer from self.rng.randrange(0, 101) and compares it
    against the freeze_mon_probability value, so a monitor is frozen a
    certain percentage of the time.

    :returns: True when the drawn value is strictly below
              self.freeze_mon_probability, False otherwise
    """
    roll = self.rng.randrange(0, 101)
    threshold = self.freeze_mon_probability
    return roll < threshold
def freeze_mon(self, mon):
    """
    Freeze a monitor by sending it the STOP signal.

    The action is reported through the module-level logger and the
    signal is delivered via the manager's signal_mon().

    :param mon: monitor to freeze
    """
    stop_signal = 19  # STOP
    log.info('Sending STOP to mon %s', mon)
    self.manager.signal_mon(mon, stop_signal)
def unfreeze_mon(self, mon):
    """
    Resume a previously frozen monitor by sending it the CONT signal.

    The action is reported through the module-level logger and the
    signal is delivered via the manager's signal_mon().

    :param mon: monitor to unfreeze
    """
    cont_signal = 18  # CONT
    log.info('Sending CONT to mon %s', mon)
    self.manager.signal_mon(mon, cont_signal)
def kill_mon(self, mon):
    """
    Kill the given monitor.

    Announces the kill through self.log and then hands the monitor id
    to the manager's kill_mon().

    :param mon: id of the monitor to kill (logged as 'mon.<id>')
    """
    message = 'killing mon.{id}'.format(id=mon)
    self.log(message)
    self.manager.kill_mon(mon)
def revive_mon(self, mon):
    """
    Revive the given monitor.

    Announces the revive through self.log and then hands the monitor id
    to the manager's revive_mon().

    :param mon: id of the monitor to revive (logged as 'mon.<id>')
    """
    # Only the revive is announced here. A 'killing mon.<id>' message was
    # previously logged first (copy-paste from kill_mon), which made the
    # thrasher log claim a kill that never happened at this point.
    self.log('reviving mon.{id}'.format(id=mon))
    self.manager.revive_mon(mon)
203 def max_killable(self):
205 Return the maximum number of monitors we can kill.
207 m = len(_get_mons(self.ctx))
208 if self.maintain_quorum:
209 return max(math.ceil(m/2.0)-1, 0)
215 Continuously loop and thrash the monitors.
217 self.log('start thrashing')
218 self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\
219 'thrash many: {tm}, maintain quorum: {mq} '\
220 'store thrash: {st}, probability: {stp} '\
221 'freeze mon: prob {fp} duration {fd}'.format(
222 s=self.random_seed,r=self.revive_delay,t=self.thrash_delay,
223 tm=self.thrash_many, mq=self.maintain_quorum,
224 st=self.store_thrash,stp=self.store_thrash_probability,
225 fp=self.freeze_mon_probability,fd=self.freeze_mon_duration,
228 while not self.stopping:
229 mons = _get_mons(self.ctx)
230 self.manager.wait_for_mon_quorum_size(len(mons))
231 self.log('making sure all monitors are in the quorum')
233 s = self.manager.get_mon_status(m)
234 assert s['state'] == 'leader' or s['state'] == 'peon'
235 assert len(s['quorum']) == len(mons)
237 kill_up_to = self.rng.randrange(1, self.max_killable()+1)
238 mons_to_kill = self.rng.sample(mons, kill_up_to)
239 self.log('monitors to thrash: {m}'.format(m=mons_to_kill))
243 if mon in mons_to_kill:
245 if self.should_freeze_mon():
246 mons_to_freeze.append(mon)
247 self.log('monitors to freeze: {m}'.format(m=mons_to_freeze))
249 for mon in mons_to_kill:
250 self.log('thrashing mon.{m}'.format(m=mon))
252 """ we only thrash stores if we are maintaining quorum """
253 if self.should_thrash_store() and self.maintain_quorum:
254 self.thrash_store(mon)
259 for mon in mons_to_freeze:
261 self.log('waiting for {delay} secs to unfreeze mons'.format(
262 delay=self.freeze_mon_duration))
263 time.sleep(self.freeze_mon_duration)
264 for mon in mons_to_freeze:
265 self.unfreeze_mon(mon)
267 if self.maintain_quorum:
268 self.manager.wait_for_mon_quorum_size(len(mons)-len(mons_to_kill))
270 if m in mons_to_kill:
272 s = self.manager.get_mon_status(m)
273 assert s['state'] == 'leader' or s['state'] == 'peon'
274 assert len(s['quorum']) == len(mons)-len(mons_to_kill)
276 self.log('waiting for {delay} secs before reviving monitors'.format(
277 delay=self.revive_delay))
278 time.sleep(self.revive_delay)
280 for mon in mons_to_kill:
284 for mon in mons_to_freeze:
286 self.log('waiting for {delay} secs to unfreeze mons'.format(
287 delay=self.freeze_mon_duration))
288 time.sleep(self.freeze_mon_duration)
289 for mon in mons_to_freeze:
290 self.unfreeze_mon(mon)
292 self.manager.wait_for_mon_quorum_size(len(mons))
294 s = self.manager.get_mon_status(m)
295 assert s['state'] == 'leader' or s['state'] == 'peon'
296 assert len(s['quorum']) == len(mons)
299 self.log('triggering scrub')
301 self.manager.raw_cluster_cmd('scrub')
303 log.exception("Saw exception while triggering scrub")
305 if self.thrash_delay > 0.0:
306 self.log('waiting for {delay} secs before continuing thrashing'.format(
307 delay=self.thrash_delay))
308 time.sleep(self.thrash_delay)
310 @contextlib.contextmanager
311 def task(ctx, config):
313 Stress test the monitor by thrashing them while another task/workunit
316 Please refer to MonitorThrasher class for further information on the
321 assert isinstance(config, dict), \
322 'mon_thrash task only accepts a dict for configuration'
323 assert len(_get_mons(ctx)) > 2, \
324 'mon_thrash task requires at least 3 monitors'
325 log.info('Beginning mon_thrash...')
326 first_mon = teuthology.get_first_mon(ctx, config)
327 (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
328 manager = ceph_manager.CephManager(
331 logger=log.getChild('ceph_manager'),
333 thrash_proc = MonitorThrasher(ctx,
335 logger=log.getChild('mon_thrasher'))
337 log.debug('Yielding')
340 log.info('joining mon_thrasher')
341 thrash_proc.do_join()
342 mons = _get_mons(ctx)
343 manager.wait_for_mon_quorum_size(len(mons))