src/ceph/qa/tasks/dump_stuck.py

   1 """
   2 Dump_stuck command
   3 """
   4 import logging
   5 import re
   6 import time
   7
   8 import ceph_manager
   9 from teuthology import misc as teuthology
  10
  11
  12 log = logging.getLogger(__name__)
  13
  14 def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
  15     """
  16     Do checks.  Make sure get_stuck_pgs return the right amout of information, then
  17     extract health information from the raw_cluster_cmd and compare the results with
  18     values passed in.  This passes if all asserts pass.
  19
  20     :param num_manager: Ceph manager
  21     :param num_inactive: number of inaactive pages that are stuck
  22     :param num_unclean: number of unclean pages that are stuck
  23     :paran num_stale: number of stale pages that are stuck
  24     :param timeout: timeout value for get_stuck_pgs calls
  25     """
  26     inactive = manager.get_stuck_pgs('inactive', timeout)
  27     unclean = manager.get_stuck_pgs('unclean', timeout)
  28     stale = manager.get_stuck_pgs('stale', timeout)
  29     log.info('inactive %s / %d,  unclean %s / %d,  stale %s / %d',
  30              len(inactive), num_inactive,
  31              len(unclean), num_unclean,
  32              len(stale), num_stale)
  33     assert len(inactive) == num_inactive
  34     assert len(unclean) == num_unclean
  35     assert len(stale) == num_stale
  36
  37 def task(ctx, config):
  38     """
  39     Test the dump_stuck command.
  40
  41     :param ctx: Context
  42     :param config: Configuration
  43     """
  44     assert config is None, \
  45         'dump_stuck requires no configuration'
  46     assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
  47         'dump_stuck requires exactly 2 osds'
  48
  49     timeout = 60
  50     first_mon = teuthology.get_first_mon(ctx, config)
  51     (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
  52
  53     manager = ceph_manager.CephManager(
  54         mon,
  55         ctx=ctx,
  56         logger=log.getChild('ceph_manager'),
  57         )
  58
  59     manager.flush_pg_stats([0, 1])
  60     manager.wait_for_clean(timeout)
  61
  62     manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
  63 #                            '--mon-osd-report-timeout 90',
  64                             '--mon-pg-stuck-threshold 10')
  65
  66     # all active+clean
  67     check_stuck(
  68         manager,
  69         num_inactive=0,
  70         num_unclean=0,
  71         num_stale=0,
  72         )
  73     num_pgs = manager.get_num_pgs()
  74
  75     manager.mark_out_osd(0)
  76     time.sleep(timeout)
  77     manager.flush_pg_stats([1])
  78     manager.wait_for_recovery(timeout)
  79
  80     # all active+clean+remapped
  81     check_stuck(
  82         manager,
  83         num_inactive=0,
  84         num_unclean=0,
  85         num_stale=0,
  86         )
  87
  88     manager.mark_in_osd(0)
  89     manager.flush_pg_stats([0, 1])
  90     manager.wait_for_clean(timeout)
  91
  92     # all active+clean
  93     check_stuck(
  94         manager,
  95         num_inactive=0,
  96         num_unclean=0,
  97         num_stale=0,
  98         )
  99
 100     log.info('stopping first osd')
 101     manager.kill_osd(0)
 102     manager.mark_down_osd(0)
 103     manager.wait_for_active(timeout)
 104
 105     log.info('waiting for all to be unclean')
 106     starttime = time.time()
 107     done = False
 108     while not done:
 109         try:
 110             check_stuck(
 111                 manager,
 112                 num_inactive=0,
 113                 num_unclean=num_pgs,
 114                 num_stale=0,
 115                 )
 116             done = True
 117         except AssertionError:
 118             # wait up to 15 minutes to become stale
 119             if time.time() - starttime > 900:
 120                 raise
 121
 122
 123     log.info('stopping second osd')
 124     manager.kill_osd(1)
 125     manager.mark_down_osd(1)
 126
 127     log.info('waiting for all to be stale')
 128     starttime = time.time()
 129     done = False
 130     while not done:
 131         try:
 132             check_stuck(
 133                 manager,
 134                 num_inactive=0,
 135                 num_unclean=num_pgs,
 136                 num_stale=num_pgs,
 137                 )
 138             done = True
 139         except AssertionError:
 140             # wait up to 15 minutes to become stale
 141             if time.time() - starttime > 900:
 142                 raise
 143
 144     log.info('reviving')
 145     for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
 146         manager.revive_osd(id_)
 147         manager.mark_in_osd(id_)
 148     while True:
 149         try:
 150             manager.flush_pg_stats([0, 1])
 151             break
 152         except Exception:
 153             log.exception('osds must not be started yet, waiting...')
 154             time.sleep(1)
 155     manager.wait_for_clean(timeout)
 156
 157     check_stuck(
 158         manager,
 159         num_inactive=0,
 160         num_unclean=0,
 161         num_stale=0,
 162         )