X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=src%2Fceph%2Fqa%2Ftasks%2Fmon_seesaw.py;fp=src%2Fceph%2Fqa%2Ftasks%2Fmon_seesaw.py;h=b101c0e416f96490827c3f5b64f07ac47c8e3b25;hb=812ff6ca9fcd3e629e49d4328905f33eee8ca3f5;hp=0000000000000000000000000000000000000000;hpb=15280273faafb77777eab341909a3f495cf248d9;p=stor4nfv.git

diff --git a/src/ceph/qa/tasks/mon_seesaw.py b/src/ceph/qa/tasks/mon_seesaw.py
new file mode 100644
index 0000000..b101c0e
--- /dev/null
+++ b/src/ceph/qa/tasks/mon_seesaw.py
@@ -0,0 +1,198 @@
+from cStringIO import StringIO
+
+import contextlib
+import logging
+import random
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+from ceph_manager import CephManager, write_conf
+
+
+log = logging.getLogger(__name__)
+
+
+def _get_mons(ctx):
+    return [name[len('mon.'):] for name in teuthology.get_mon_names(ctx)]
+
+
+# teuthology prepares the monitor IPs (and ports) in get_mons(), we can
+# enumerate all monitor ports ([6789..]), and find the next available one.
+def _get_next_port(ctx, ip, cluster):
+    # assuming we have only one cluster here.
+    used = []
+    for name in teuthology.get_mon_names(ctx, cluster):
+        addr = ctx.ceph[cluster].conf[name]['mon addr']
+        mon_ip, mon_port = addr.split(':')
+        if mon_ip != ip:
+            continue
+        used.append(int(mon_port))
+    port = 6789
+    used.sort()
+    for p in used:
+        if p != port:
+            break
+        port += 1
+    return port
+
+
+def _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path):
+    # co-locate a new monitor on remote where an existing monitor is hosted
+    cluster = manager.cluster
+    remote.run(args=['sudo', 'mkdir', '-p', data_path])
+    keyring_path = '/etc/ceph/{cluster}.keyring'.format(
+        cluster=manager.cluster)
+    testdir = teuthology.get_testdir(ctx)
+    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
+                                                   cluster=cluster)
+    manager.raw_cluster_cmd('mon', 'getmap', '-o', monmap_path)
+    if manager.controller != remote:
+        monmap = teuthology.get_file(manager.controller, monmap_path)
+        teuthology.write_file(remote, monmap_path, StringIO(monmap))
+    remote.run(
+        args=[
+            'sudo',
+            'ceph-mon',
+            '--cluster', cluster,
+            '--mkfs',
+            '-i', mon,
+            '--monmap', monmap_path,
+            '--keyring', keyring_path])
+    if manager.controller != remote:
+        teuthology.delete_file(remote, monmap_path)
+    # raw_cluster_cmd() is performed using sudo, so sudo here also.
+    teuthology.delete_file(manager.controller, monmap_path, sudo=True)
+    # update ceph.conf so that the ceph CLI is able to connect to the cluster
+    if conf_path:
+        ip = remote.ip_address
+        port = _get_next_port(ctx, ip, cluster)
+        mon_addr = '{ip}:{port}'.format(ip=ip, port=port)
+        ctx.ceph[cluster].conf[name] = {'mon addr': mon_addr}
+        write_conf(ctx, conf_path, cluster)
+
+
+def _teardown_mon(ctx, manager, remote, name, data_path, conf_path):
+    cluster = manager.cluster
+    del ctx.ceph[cluster].conf[name]
+    write_conf(ctx, conf_path, cluster)
+    remote.run(args=['sudo', 'rm', '-rf', data_path])
+
+
+@contextlib.contextmanager
+def _prepare_mon(ctx, manager, remote, mon):
+    cluster = manager.cluster
+    data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
+        cluster=cluster, id=mon)
+    conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster)
+    name = 'mon.{0}'.format(mon)
+    _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path)
+    yield
+    _teardown_mon(ctx, manager, remote, name,
+                  data_path, conf_path)
+
+
+# run_daemon() in ceph.py starts a herd of daemons of the same type, but
+# _run_daemon() starts only one instance.
+@contextlib.contextmanager
+def _run_daemon(ctx, remote, cluster, type_, id_):
+    testdir = teuthology.get_testdir(ctx)
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+    daemon_signal = 'kill'
+    run_cmd = [
+        'sudo',
+        'adjust-ulimits',
+        'ceph-coverage',
+        coverage_dir,
+        'daemon-helper',
+        daemon_signal,
+    ]
+    run_cmd_tail = [
+        'ceph-%s' % (type_),
+        '-f',
+        '--cluster', cluster,
+        '-i', id_]
+    run_cmd.extend(run_cmd_tail)
+    ctx.daemons.add_daemon(remote, type_, id_,
+                           cluster=cluster,
+                           args=run_cmd,
+                           logger=log.getChild(type_),
+                           stdin=run.PIPE,
+                           wait=False)
+    daemon = ctx.daemons.get_daemon(type_, id_, cluster)
+    yield daemon
+    daemon.stop()
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    replace a monitor with a newly added one, and then revert this change
+
+    How it works::
+    1. add a mon with specified id (mon.victim_prime)
+    2. wait for quorum
+    3. remove a monitor with specified id (mon.victim); mon.victim will commit
+       suicide
+    4. wait for quorum
+    5. <yield>
+    6. add mon.victim back, and start it
+    7. wait for quorum
+    8. remove mon.victim_prime
+
+    Options::
+    victim       the id of the mon to be removed (pick a random mon by default)
+    replacer     the id of the new mon (use "${victim}_prime" if not specified)
+    """
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+    manager = CephManager(mon, ctx=ctx, logger=log.getChild('ceph_manager'))
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        "task mon_seesaw only supports a dictionary for configuration"
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('mon_seesaw', {}))
+    victim = config.get('victim', random.choice(_get_mons(ctx)))
+    replacer = config.get('replacer', '{0}_prime'.format(victim))
+    remote = manager.find_remote('mon', victim)
+    quorum = manager.get_mon_quorum()
+    cluster = manager.cluster
+    log.info('replacing {victim} with {replacer}'.format(victim=victim,
+                                                         replacer=replacer))
+    with _prepare_mon(ctx, manager, remote, replacer):
+        with _run_daemon(ctx, remote, cluster, 'mon', replacer):
+            # the replacer will join the quorum automatically
+            manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
+            # if we don't remove the victim from the monmap, there is a chance
+            # that we leave the new joiner with a monmap of 2 mons; it would
+            # not be able to reach the other one and would keep probing
+            # forever.
+            log.info('removing {mon}'.format(mon=victim))
+            manager.raw_cluster_cmd('mon', 'remove', victim)
+            manager.wait_for_mon_quorum_size(len(quorum), 10)
+            # the victim will commit suicide after being removed from the
+            # monmap, so wait until it stops.
+            ctx.daemons.get_daemon('mon', victim, cluster).wait(10)
+            try:
+                # perform other tasks
+                yield
+            finally:
+                # bring the victim back online
+                # nuke the victim's monstore first, otherwise it will refuse
+                # to boot with the following message:
+                #
+                # not in monmap and have been in a quorum before; must have
+                # been removed
+                log.info('re-adding {mon}'.format(mon=victim))
+                data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
+                    cluster=cluster, id=victim)
+                remote.run(args=['sudo', 'rm', '-rf', data_path])
+                name = 'mon.{0}'.format(victim)
+                _setup_mon(ctx, manager, remote, victim, name, data_path, None)
+                log.info('reviving {mon}'.format(mon=victim))
+                manager.revive_mon(victim)
+                manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
+        manager.raw_cluster_cmd('mon', 'remove', replacer)
+        manager.wait_for_mon_quorum_size(len(quorum), 10)
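
For reference, the port-selection loop in _get_next_port() above simply picks the lowest port at or above 6789 that is not already claimed by another monitor on the same IP. Below is a standalone sketch of that idea; the helper name next_free_mon_port and the sample port lists are illustrative only and are not part of the patch.

# Illustration only (not part of the patch): the same lowest-free-port scan
# that _get_next_port() performs, reduced to a self-contained function.
def next_free_mon_port(used_ports, base=6789):
    port = base
    for p in sorted(used_ports):
        if p != port:
            break  # found a gap below the highest used port
        port += 1
    return port

# Expected results under this assumption:
assert next_free_mon_port([]) == 6789            # nothing in use yet
assert next_free_mon_port([6789, 6790]) == 6791  # used ports are contiguous
assert next_free_mon_port([6789, 6791]) == 6790  # reuse the gap at 6790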