src/ceph/qa/tasks/mon_seesaw.py

   1 from cStringIO import StringIO
   2
   3 import contextlib
   4 import logging
   5 import random
   6
   7 from teuthology import misc as teuthology
   8 from teuthology.orchestra import run
   9
  10 from ceph_manager import CephManager, write_conf
  11
  12
  13 log = logging.getLogger(__name__)
  14
  15
  16 def _get_mons(ctx):
  17     return [name[len('mon.'):] for name in teuthology.get_mon_names(ctx)]
  18
  19
  20 # teuthology prepares the monitor IPs (and ports) in get_mons(), we can
  21 # enumerate all monitor ports ([6789..]), and find the next available one.
  22 def _get_next_port(ctx, ip, cluster):
  23     # assuming we have only one cluster here.
  24     used = []
  25     for name in teuthology.get_mon_names(ctx, cluster):
  26         addr = ctx.ceph[cluster].conf[name]['mon addr']
  27         mon_ip, mon_port = addr.split(':')
  28         if mon_ip != ip:
  29             continue
  30         used.append(int(mon_port))
  31     port = 6789
  32     used.sort()
  33     for p in used:
  34         if p != port:
  35             break
  36         port += 1
  37     return port
  38
  39
  40 def _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path):
  41     # co-locate a new monitor on remote where an existing monitor is hosted
  42     cluster = manager.cluster
  43     remote.run(args=['sudo', 'mkdir', '-p', data_path])
  44     keyring_path = '/etc/ceph/{cluster}.keyring'.format(
  45         cluster=manager.cluster)
  46     testdir = teuthology.get_testdir(ctx)
  47     monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
  48                                                    cluster=cluster)
  49     manager.raw_cluster_cmd('mon', 'getmap', '-o', monmap_path)
  50     if manager.controller != remote:
  51         monmap = teuthology.get_file(manager.controller, monmap_path)
  52         teuthology.write_file(remote, monmap_path, StringIO(monmap))
  53     remote.run(
  54         args=[
  55             'sudo',
  56             'ceph-mon',
  57             '--cluster', cluster,
  58             '--mkfs',
  59             '-i', mon,
  60             '--monmap', monmap_path,
  61             '--keyring', keyring_path])
  62     if manager.controller != remote:
  63         teuthology.delete_file(remote, monmap_path)
  64     # raw_cluster_cmd() is performed using sudo, so sudo here also.
  65     teuthology.delete_file(manager.controller, monmap_path, sudo=True)
  66     # update ceph.conf so that the ceph CLI is able to connect to the cluster
  67     if conf_path:
  68         ip = remote.ip_address
  69         port = _get_next_port(ctx, ip, cluster)
  70         mon_addr = '{ip}:{port}'.format(ip=ip, port=port)
  71         ctx.ceph[cluster].conf[name] = {'mon addr': mon_addr}
  72         write_conf(ctx, conf_path, cluster)
  73
  74
  75 def _teardown_mon(ctx, manager, remote, name, data_path, conf_path):
  76     cluster = manager.cluster
  77     del ctx.ceph[cluster].conf[name]
  78     write_conf(ctx, conf_path, cluster)
  79     remote.run(args=['sudo', 'rm', '-rf', data_path])
  80
  81
  82 @contextlib.contextmanager
  83 def _prepare_mon(ctx, manager, remote, mon):
  84     cluster = manager.cluster
  85     data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
  86         cluster=cluster, id=mon)
  87     conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster)
  88     name = 'mon.{0}'.format(mon)
  89     _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path)
  90     yield
  91     _teardown_mon(ctx, manager, remote, name,
  92                   data_path, conf_path)
  93
  94
  95 # run_daemon() in ceph.py starts a herd of daemons of the same type, but
  96 # _run_daemon() starts only one instance.
  97 @contextlib.contextmanager
  98 def _run_daemon(ctx, remote, cluster, type_, id_):
  99     testdir = teuthology.get_testdir(ctx)
 100     coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
 101     daemon_signal = 'kill'
 102     run_cmd = [
 103         'sudo',
 104         'adjust-ulimits',
 105         'ceph-coverage',
 106         coverage_dir,
 107         'daemon-helper',
 108         daemon_signal,
 109     ]
 110     run_cmd_tail = [
 111         'ceph-%s' % (type_),
 112         '-f',
 113         '--cluster', cluster,
 114         '-i', id_]
 115     run_cmd.extend(run_cmd_tail)
 116     ctx.daemons.add_daemon(remote, type_, id_,
 117                            cluster=cluster,
 118                            args=run_cmd,
 119                            logger=log.getChild(type_),
 120                            stdin=run.PIPE,
 121                            wait=False)
 122     daemon = ctx.daemons.get_daemon(type_, id_, cluster)
 123     yield daemon
 124     daemon.stop()
 125
 126
 127 @contextlib.contextmanager
 128 def task(ctx, config):
 129     """
 130     replace a monitor with a newly added one, and then revert this change
 131
 132     How it works::
 133     1. add a mon with specified id (mon.victim_prime)
 134     2. wait for quorum
 135     3. remove a monitor with specified id (mon.victim), mon.victim will commit
 136        suicide
 137     4. wait for quorum
 138     5. <yield>
 139     5. add mon.a back, and start it
 140     6. wait for quorum
 141     7. remove mon.a_prime
 142
 143     Options::
 144     victim       the id of the mon to be removed (pick a random mon by default)
 145     replacer     the id of the new mon (use "${victim}_prime" if not specified)
 146     """
 147     first_mon = teuthology.get_first_mon(ctx, config)
 148     (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
 149     manager = CephManager(mon, ctx=ctx, logger=log.getChild('ceph_manager'))
 150
 151     if config is None:
 152         config = {}
 153     assert isinstance(config, dict), \
 154         "task ceph only supports a dictionary for configuration"
 155     overrides = ctx.config.get('overrides', {})
 156     teuthology.deep_merge(config, overrides.get('mon_seesaw', {}))
 157     victim = config.get('victim', random.choice(_get_mons(ctx)))
 158     replacer = config.get('replacer', '{0}_prime'.format(victim))
 159     remote = manager.find_remote('mon', victim)
 160     quorum = manager.get_mon_quorum()
 161     cluster = manager.cluster
 162     log.info('replacing {victim} with {replacer}'.format(victim=victim,
 163                                                          replacer=replacer))
 164     with _prepare_mon(ctx, manager, remote, replacer):
 165         with _run_daemon(ctx, remote, cluster, 'mon', replacer):
 166             # replacer will join the quorum automatically
 167             manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
 168             # if we don't remove the victim from monmap, there is chance that
 169             # we are leaving the new joiner with a monmap of 2 mon, and it will
 170             # not able to reach the other one, it will be keeping probing for
 171             # ever.
 172             log.info('removing {mon}'.format(mon=victim))
 173             manager.raw_cluster_cmd('mon', 'remove', victim)
 174             manager.wait_for_mon_quorum_size(len(quorum), 10)
 175             # the victim will commit suicide after being removed from
 176             # monmap, let's wait until it stops.
 177             ctx.daemons.get_daemon('mon', victim, cluster).wait(10)
 178             try:
 179                 # perform other tasks
 180                 yield
 181             finally:
 182                 # bring the victim back online
 183                 # nuke the monstore of victim, otherwise it will refuse to boot
 184                 # with following message:
 185                 #
 186                 # not in monmap and have been in a quorum before; must have
 187                 # been removed
 188                 log.info('re-adding {mon}'.format(mon=victim))
 189                 data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
 190                     cluster=cluster, id=victim)
 191                 remote.run(args=['sudo', 'rm', '-rf', data_path])
 192                 name = 'mon.{0}'.format(victim)
 193                 _setup_mon(ctx, manager, remote, victim, name, data_path, None)
 194                 log.info('reviving {mon}'.format(mon=victim))
 195                 manager.revive_mon(victim)
 196                 manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
 197                 manager.raw_cluster_cmd('mon', 'remove', replacer)
 198                 manager.wait_for_mon_quorum_size(len(quorum), 10)