src/ceph/qa/tasks/mon_thrash.py

   1 """
   2 Monitor thrash
   3 """
   4 import logging
   5 import contextlib
   6 import ceph_manager
   7 import random
   8 import time
   9 import gevent
  10 import json
  11 import math
  12 from teuthology import misc as teuthology
  13
  14 log = logging.getLogger(__name__)
  15
  16 def _get_mons(ctx):
  17     """
  18     Get monitor names from the context value.
  19     """
  20     mons = [f[len('mon.'):] for f in teuthology.get_mon_names(ctx)]
  21     return mons
  22
  23 class MonitorThrasher:
  24     """
  25     How it works::
  26
  27     - pick a monitor
  28     - kill it
  29     - wait for quorum to be formed
  30     - sleep for 'revive_delay' seconds
  31     - revive monitor
  32     - wait for quorum to be formed
  33     - sleep for 'thrash_delay' seconds
  34
  35     Options::
  36
  37     seed                Seed to use on the RNG to reproduce a previous
  38                         behaviour (default: None; i.e., not set)
  39     revive_delay        Number of seconds to wait before reviving
  40                         the monitor (default: 10)
  41     thrash_delay        Number of seconds to wait in-between
  42                         test iterations (default: 0)
  43     thrash_store        Thrash monitor store before killing the monitor being thrashed (default: False)
  44     thrash_store_probability  Probability of thrashing a monitor's store
  45                               (default: 50)
  46     thrash_many         Thrash multiple monitors instead of just one. If
  47                         'maintain-quorum' is set to False, then we will
  48                         thrash up to as many monitors as there are
  49                         available. (default: False)
  50     maintain_quorum     Always maintain quorum, taking care on how many
  51                         monitors we kill during the thrashing. If we
  52                         happen to only have one or two monitors configured,
  53                         if this option is set to True, then we won't run
  54                         this task as we cannot guarantee maintenance of
  55                         quorum. Setting it to false however would allow the
  56                         task to run with as many as just one single monitor.
  57                         (default: True)
  58     freeze_mon_probability: how often to freeze the mon instead of killing it,
  59                         in % (default: 0)
  60     freeze_mon_duration: how many seconds to freeze the mon (default: 15)
  61     scrub               Scrub after each iteration (default: True)
  62
  63     Note: if 'store-thrash' is set to True, then 'maintain-quorum' must also
  64           be set to True.
  65
  66     For example::
  67
  68     tasks:
  69     - ceph:
  70     - mon_thrash:
  71         revive_delay: 20
  72         thrash_delay: 1
  73         thrash_store: true
  74         thrash_store_probability: 40
  75         seed: 31337
  76         maintain_quorum: true
  77         thrash_many: true
  78     - ceph-fuse:
  79     - workunit:
  80         clients:
  81           all:
  82             - mon/workloadgen.sh
  83     """
  84     def __init__(self, ctx, manager, config, logger):
  85         self.ctx = ctx
  86         self.manager = manager
  87         self.manager.wait_for_clean()
  88
  89         self.stopping = False
  90         self.logger = logger
  91         self.config = config
  92
  93         if self.config is None:
  94             self.config = dict()
  95
  96         """ Test reproducibility """
  97         self.random_seed = self.config.get('seed', None)
  98
  99         if self.random_seed is None:
 100             self.random_seed = int(time.time())
 101
 102         self.rng = random.Random()
 103         self.rng.seed(int(self.random_seed))
 104
 105         """ Monitor thrashing """
 106         self.revive_delay = float(self.config.get('revive_delay', 10.0))
 107         self.thrash_delay = float(self.config.get('thrash_delay', 0.0))
 108
 109         self.thrash_many = self.config.get('thrash_many', False)
 110         self.maintain_quorum = self.config.get('maintain_quorum', True)
 111
 112         self.scrub = self.config.get('scrub', True)
 113
 114         self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
 115         self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))
 116
 117         assert self.max_killable() > 0, \
 118             'Unable to kill at least one monitor with the current config.'
 119
 120         """ Store thrashing """
 121         self.store_thrash = self.config.get('store_thrash', False)
 122         self.store_thrash_probability = int(
 123             self.config.get('store_thrash_probability', 50))
 124         if self.store_thrash:
 125             assert self.store_thrash_probability > 0, \
 126                 'store_thrash is set, probability must be > 0'
 127             assert self.maintain_quorum, \
 128                 'store_thrash = true must imply maintain_quorum = true'
 129
 130         self.thread = gevent.spawn(self.do_thrash)
 131
 132     def log(self, x):
 133         """
 134         locally log info messages
 135         """
 136         self.logger.info(x)
 137
 138     def do_join(self):
 139         """
 140         Break out of this processes thrashing loop.
 141         """
 142         self.stopping = True
 143         self.thread.get()
 144
 145     def should_thrash_store(self):
 146         """
 147         If allowed, indicate that we should thrash a certain percentage of
 148         the time as determined by the store_thrash_probability value.
 149         """
 150         if not self.store_thrash:
 151             return False
 152         return self.rng.randrange(0, 101) < self.store_thrash_probability
 153
 154     def thrash_store(self, mon):
 155         """
 156         Thrash the monitor specified.
 157         :param mon: monitor to thrash
 158         """
 159         addr = self.ctx.ceph['ceph'].conf['mon.%s' % mon]['mon addr']
 160         self.log('thrashing mon.{id}@{addr} store'.format(id=mon, addr=addr))
 161         out = self.manager.raw_cluster_cmd('-m', addr, 'sync', 'force')
 162         j = json.loads(out)
 163         assert j['ret'] == 0, \
 164             'error forcing store sync on mon.{id}:\n{ret}'.format(
 165                 id=mon,ret=out)
 166
 167     def should_freeze_mon(self):
 168         """
 169         Indicate that we should freeze a certain percentago of the time
 170         as determined by the freeze_mon_probability value.
 171         """
 172         return self.rng.randrange(0, 101) < self.freeze_mon_probability
 173
 174     def freeze_mon(self, mon):
 175         """
 176         Send STOP signal to freeze the monitor.
 177         """
 178         log.info('Sending STOP to mon %s', mon)
 179         self.manager.signal_mon(mon, 19)  # STOP
 180
 181     def unfreeze_mon(self, mon):
 182         """
 183         Send CONT signal to unfreeze the monitor.
 184         """
 185         log.info('Sending CONT to mon %s', mon)
 186         self.manager.signal_mon(mon, 18)  # CONT
 187
 188     def kill_mon(self, mon):
 189         """
 190         Kill the monitor specified
 191         """
 192         self.log('killing mon.{id}'.format(id=mon))
 193         self.manager.kill_mon(mon)
 194
 195     def revive_mon(self, mon):
 196         """
 197         Revive the monitor specified
 198         """
 199         self.log('killing mon.{id}'.format(id=mon))
 200         self.log('reviving mon.{id}'.format(id=mon))
 201         self.manager.revive_mon(mon)
 202
 203     def max_killable(self):
 204         """
 205         Return the maximum number of monitors we can kill.
 206         """
 207         m = len(_get_mons(self.ctx))
 208         if self.maintain_quorum:
 209             return max(math.ceil(m/2.0)-1, 0)
 210         else:
 211             return m
 212
 213     def do_thrash(self):
 214         """
 215         Cotinuously loop and thrash the monitors.
 216         """
 217         self.log('start thrashing')
 218         self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\
 219                    'thrash many: {tm}, maintain quorum: {mq} '\
 220                    'store thrash: {st}, probability: {stp} '\
 221                    'freeze mon: prob {fp} duration {fd}'.format(
 222                 s=self.random_seed,r=self.revive_delay,t=self.thrash_delay,
 223                 tm=self.thrash_many, mq=self.maintain_quorum,
 224                 st=self.store_thrash,stp=self.store_thrash_probability,
 225                 fp=self.freeze_mon_probability,fd=self.freeze_mon_duration,
 226                 ))
 227
 228         while not self.stopping:
 229             mons = _get_mons(self.ctx)
 230             self.manager.wait_for_mon_quorum_size(len(mons))
 231             self.log('making sure all monitors are in the quorum')
 232             for m in mons:
 233                 s = self.manager.get_mon_status(m)
 234                 assert s['state'] == 'leader' or s['state'] == 'peon'
 235                 assert len(s['quorum']) == len(mons)
 236
 237             kill_up_to = self.rng.randrange(1, self.max_killable()+1)
 238             mons_to_kill = self.rng.sample(mons, kill_up_to)
 239             self.log('monitors to thrash: {m}'.format(m=mons_to_kill))
 240
 241             mons_to_freeze = []
 242             for mon in mons:
 243                 if mon in mons_to_kill:
 244                     continue
 245                 if self.should_freeze_mon():
 246                     mons_to_freeze.append(mon)
 247             self.log('monitors to freeze: {m}'.format(m=mons_to_freeze))
 248
 249             for mon in mons_to_kill:
 250                 self.log('thrashing mon.{m}'.format(m=mon))
 251
 252                 """ we only thrash stores if we are maintaining quorum """
 253                 if self.should_thrash_store() and self.maintain_quorum:
 254                     self.thrash_store(mon)
 255
 256                 self.kill_mon(mon)
 257
 258             if mons_to_freeze:
 259                 for mon in mons_to_freeze:
 260                     self.freeze_mon(mon)
 261                 self.log('waiting for {delay} secs to unfreeze mons'.format(
 262                     delay=self.freeze_mon_duration))
 263                 time.sleep(self.freeze_mon_duration)
 264                 for mon in mons_to_freeze:
 265                     self.unfreeze_mon(mon)
 266
 267             if self.maintain_quorum:
 268                 self.manager.wait_for_mon_quorum_size(len(mons)-len(mons_to_kill))
 269                 for m in mons:
 270                     if m in mons_to_kill:
 271                         continue
 272                     s = self.manager.get_mon_status(m)
 273                     assert s['state'] == 'leader' or s['state'] == 'peon'
 274                     assert len(s['quorum']) == len(mons)-len(mons_to_kill)
 275
 276             self.log('waiting for {delay} secs before reviving monitors'.format(
 277                 delay=self.revive_delay))
 278             time.sleep(self.revive_delay)
 279
 280             for mon in mons_to_kill:
 281                 self.revive_mon(mon)
 282             # do more freezes
 283             if mons_to_freeze:
 284                 for mon in mons_to_freeze:
 285                     self.freeze_mon(mon)
 286                 self.log('waiting for {delay} secs to unfreeze mons'.format(
 287                     delay=self.freeze_mon_duration))
 288                 time.sleep(self.freeze_mon_duration)
 289                 for mon in mons_to_freeze:
 290                     self.unfreeze_mon(mon)
 291
 292             self.manager.wait_for_mon_quorum_size(len(mons))
 293             for m in mons:
 294                 s = self.manager.get_mon_status(m)
 295                 assert s['state'] == 'leader' or s['state'] == 'peon'
 296                 assert len(s['quorum']) == len(mons)
 297
 298             if self.scrub:
 299                 self.log('triggering scrub')
 300                 try:
 301                     self.manager.raw_cluster_cmd('scrub')
 302                 except Exception:
 303                     log.exception("Saw exception while triggering scrub")
 304
 305             if self.thrash_delay > 0.0:
 306                 self.log('waiting for {delay} secs before continuing thrashing'.format(
 307                     delay=self.thrash_delay))
 308                 time.sleep(self.thrash_delay)
 309
 310 @contextlib.contextmanager
 311 def task(ctx, config):
 312     """
 313     Stress test the monitor by thrashing them while another task/workunit
 314     is running.
 315
 316     Please refer to MonitorThrasher class for further information on the
 317     available options.
 318     """
 319     if config is None:
 320         config = {}
 321     assert isinstance(config, dict), \
 322         'mon_thrash task only accepts a dict for configuration'
 323     assert len(_get_mons(ctx)) > 2, \
 324         'mon_thrash task requires at least 3 monitors'
 325     log.info('Beginning mon_thrash...')
 326     first_mon = teuthology.get_first_mon(ctx, config)
 327     (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
 328     manager = ceph_manager.CephManager(
 329         mon,
 330         ctx=ctx,
 331         logger=log.getChild('ceph_manager'),
 332         )
 333     thrash_proc = MonitorThrasher(ctx,
 334         manager, config,
 335         logger=log.getChild('mon_thrasher'))
 336     try:
 337         log.debug('Yielding')
 338         yield
 339     finally:
 340         log.info('joining mon_thrasher')
 341         thrash_proc.do_join()
 342         mons = _get_mons(ctx)
 343         manager.wait_for_mon_quorum_size(len(mons))