src/ceph/qa/tasks/mds_creation_failure.py

   1
   2 import logging
   3 import contextlib
   4 import time
   5 import ceph_manager
   6 from teuthology import misc
   7 from teuthology.orchestra.run import CommandFailedError, Raw
   8
   9 log = logging.getLogger(__name__)
  10
  11
  12 @contextlib.contextmanager
  13 def task(ctx, config):
  14     """
  15     Go through filesystem creation with a synthetic failure in an MDS
  16     in its 'up:creating' state, to exercise the retry behaviour.
  17     """
  18     # Grab handles to the teuthology objects of interest
  19     mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
  20     if len(mdslist) != 1:
  21         # Require exactly one MDS, the code path for creation failure when
  22         # a standby is available is different
  23         raise RuntimeError("This task requires exactly one MDS")
  24
  25     mds_id = mdslist[0]
  26     (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.iterkeys()
  27     manager = ceph_manager.CephManager(
  28         mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
  29     )
  30
  31     # Stop MDS
  32     manager.raw_cluster_cmd('mds', 'set', "max_mds", "0")
  33     mds = ctx.daemons.get_daemon('mds', mds_id)
  34     mds.stop()
  35     manager.raw_cluster_cmd('mds', 'fail', mds_id)
  36
  37     # Reset the filesystem so that next start will go into CREATING
  38     manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
  39     manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")
  40
  41     # Start the MDS with mds_kill_create_at set, it will crash during creation
  42     mds.restart_with_args(["--mds_kill_create_at=1"])
  43     try:
  44         mds.wait_for_exit()
  45     except CommandFailedError as e:
  46         if e.exitstatus == 1:
  47             log.info("MDS creation killed as expected")
  48         else:
  49             log.error("Unexpected status code %s" % e.exitstatus)
  50             raise
  51
  52     # Since I have intentionally caused a crash, I will clean up the resulting core
  53     # file to avoid task.internal.coredump seeing it as a failure.
  54     log.info("Removing core file from synthetic MDS failure")
  55     mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))])
  56
  57     # It should have left the MDS map state still in CREATING
  58     status = manager.get_mds_status(mds_id)
  59     assert status['state'] == 'up:creating'
  60
  61     # Start the MDS again without the kill flag set, it should proceed with creation successfully
  62     mds.restart()
  63
  64     # Wait for state ACTIVE
  65     t = 0
  66     create_timeout = 120
  67     while True:
  68         status = manager.get_mds_status(mds_id)
  69         if status['state'] == 'up:active':
  70             log.info("MDS creation completed successfully")
  71             break
  72         elif status['state'] == 'up:creating':
  73             log.info("MDS still in creating state")
  74             if t > create_timeout:
  75                 log.error("Creating did not complete within %ss" % create_timeout)
  76                 raise RuntimeError("Creating did not complete within %ss" % create_timeout)
  77             t += 1
  78             time.sleep(1)
  79         else:
  80             log.error("Unexpected MDS state: %s" % status['state'])
  81             assert(status['state'] in ['up:active', 'up:creating'])
  82
  83     # The system should be back up in a happy healthy state, go ahead and run any further tasks
  84     # inside this context.
  85     yield