9 from teuthology import misc as teuthology
12 log = logging.getLogger(__name__)
def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Check that the cluster reports the expected numbers of stuck PGs.

    Query the manager for PGs stuck 'inactive', 'unclean', and 'stale',
    then assert that each observed count matches the expected value.
    This passes if all asserts pass.

    :param manager: Ceph manager
    :param num_inactive: expected number of stuck inactive PGs
    :param num_unclean: expected number of stuck unclean PGs
    :param num_stale: expected number of stuck stale PGs
    :param timeout: timeout value for get_stuck_pgs calls
    :raises AssertionError: if any observed count differs from the expected one
    """
    inactive = manager.get_stuck_pgs('inactive', timeout)
    unclean = manager.get_stuck_pgs('unclean', timeout)
    stale = manager.get_stuck_pgs('stale', timeout)
    # All six values are ints, so use %d consistently for observed/expected.
    log.info('inactive %d / %d, unclean %d / %d, stale %d / %d',
             len(inactive), num_inactive,
             len(unclean), num_unclean,
             len(stale), num_stale)
    assert len(inactive) == num_inactive
    assert len(unclean) == num_unclean
    assert len(stale) == num_stale
def task(ctx, config):
    """
    Test the dump_stuck command.

    Requires exactly 2 OSDs. Lowers the mon's stuck-PG threshold, then
    marks OSDs out/down and waits for the cluster to report the expected
    stuck-PG states before reviving everything and waiting for clean.

    :param ctx: Context
    :param config: Configuration (must be None)
    """
    # Fail fast on misuse: this task takes no config and exactly 2 OSDs.
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    first_mon = teuthology.get_first_mon(ctx, config)
    # NOTE(review): iterkeys() is Python-2-only; this file predates py3.
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    # NOTE(review): remaining CephManager(...) arguments and the closing
    # paren are on elided lines — this call is incomplete in this view.
    manager = ceph_manager.CephManager(
        logger=log.getChild('ceph_manager'),

    manager.flush_pg_stats([0, 1])
    # NOTE(review): `timeout` is not defined anywhere in this view —
    # presumably assigned on an elided line; confirm before editing.
    manager.wait_for_clean(timeout)

    # Lower the stuck threshold so PGs are reported stuck quickly.
    manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
#                            '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

    num_pgs = manager.get_num_pgs()

    # Take one OSD out: data re-maps onto the surviving OSD.
    manager.mark_out_osd(0)

    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)

    # all active+clean+remapped

    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    log.info('stopping first osd')

    manager.mark_down_osd(0)
    manager.wait_for_active(timeout)

    log.info('waiting for all to be unclean')
    starttime = time.time()
    # NOTE(review): the matching `while`/`try` retry loop calling
    # check_stuck() is on elided lines; this except belongs to it.
    except AssertionError:
        # wait up to 15 minutes to become unclean
        if time.time() - starttime > 900:

    log.info('stopping second osd')

    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    starttime = time.time()
    # NOTE(review): as above, the try/while retry loop is elided.
    except AssertionError:
        # wait up to 15 minutes to become stale
        if time.time() - starttime > 900:

    # Bring every OSD back and wait for the cluster to settle.
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)

    # NOTE(review): this flush is inside an elided try/except retry loop;
    # the log.exception below is its handler.
    manager.flush_pg_stats([0, 1])

    log.exception('osds must not be started yet, waiting...')

    manager.wait_for_clean(timeout)