"""
osd recovery
"""
import logging
import time

import ceph_manager
from teuthology import misc as teuthology


log = logging.getLogger(__name__)
def rados_start(testdir, remote, cmd):
    """
    Run a remote rados command (currently used to only write data).

    :param testdir: test run directory; used to locate the coverage wrapper
    :param remote: teuthology remote on which to run the command
    :param cmd: list of rados CLI arguments (appended after 'rados')
    :returns: the running process handle (remote.run with wait=False);
              caller is responsible for wait()ing on it
    """
    log.info("rados %s" % ' '.join(cmd))
    # Wrap the rados invocation so ulimits are adjusted and coverage
    # data is collected under the archive directory.
    pre = [
        'adjust-ulimits',
        'ceph-coverage',
        '{tdir}/archive/coverage'.format(tdir=testdir),
        'rados',
        ]
    pre.extend(cmd)
    proc = remote.run(
        args=pre,
        wait=False,
        )
    return proc
def task(ctx, config):
    """
    Test (non-backfill) recovery.

    Requires exactly 3 OSDs. Exercises osdmap flag set/unset, then creates
    a divergent OSD (osd.2) by killing it while writes continue, and
    verifies the cluster recovers to clean after it is revived.

    :param ctx: teuthology run context
    :param config: dict configuration (may be None)
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    testdir = teuthology.get_testdir(ctx)
    first_mon = teuthology.get_first_mon(ctx, config)
    # next(iter(...)) works on both Python 2 and 3 (iterkeys() is py2-only)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys() \
        if False else (next(iter(ctx.cluster.only(first_mon).remotes)),)

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 3

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # wait for all 3 osds to come up before starting
    while len(manager.get_osd_status()['up']) < 3:
        time.sleep(10)
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()

    # test some osdmap flags
    manager.raw_cluster_cmd('osd', 'set', 'noin')
    manager.raw_cluster_cmd('osd', 'set', 'noout')
    manager.raw_cluster_cmd('osd', 'set', 'noup')
    manager.raw_cluster_cmd('osd', 'set', 'nodown')
    manager.raw_cluster_cmd('osd', 'unset', 'noin')
    manager.raw_cluster_cmd('osd', 'unset', 'noout')
    manager.raw_cluster_cmd('osd', 'unset', 'noup')
    manager.raw_cluster_cmd('osd', 'unset', 'nodown')

    # write some new data
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096',
                                   '--no-cleanup'])

    time.sleep(15)

    # trigger a divergent target:
    #  blackhole + restart osd.1 (shorter log)
    manager.blackhole_kill_osd(1)
    #  kill osd.2 (longer log... we'll make it divergent below)
    manager.kill_osd(2)
    time.sleep(2)
    manager.revive_osd(1)

    # wait for our writes to complete + succeed
    err = p.wait()
    log.info('err is %d' % err)

    # cluster must repeer
    manager.flush_pg_stats([0, 1])
    manager.wait_for_active_or_down()

    # write some more (make sure osd.2 really is divergent)
    p = rados_start(testdir, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096'])
    p.wait()

    # revive divergent osd
    manager.revive_osd(2)

    while len(manager.get_osd_status()['up']) < 3:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('3 are up!')

    # cluster must recover
    manager.flush_pg_stats([0, 1, 2])
    manager.wait_for_clean()
def test_incomplete_pgs(ctx, config):
    """
    Test handling of incomplete pgs. Requires 4 osds.

    Moves data between osd pairs (0,1) and (2,3) with recovery delayed,
    then kills the pair holding the only complete copies and verifies PGs
    go down; revives them and verifies the cluster recovers to clean.

    :param ctx: teuthology run context
    :param config: dict configuration (may be None)
    """
    testdir = teuthology.get_testdir(ctx)
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    # next(iter(...)) works on both Python 2 and 3 (iterkeys() is py2-only)
    (mon,) = (next(iter(ctx.cluster.only(first_mon).remotes)),)

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)
    assert num_osds == 4

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # wait for all 4 osds to come up before starting
    while len(manager.get_osd_status()['up']) < 4:
        time.sleep(10)

    manager.flush_pg_stats([0, 1, 2, 3])
    manager.wait_for_clean()

    log.info('Testing incomplete pgs...')

    # delay recovery so the moved data stays un-recovered while we test
    for i in range(4):
        manager.set_config(
            i,
            osd_recovery_delay_start=1000)

    # move data off of osd.0, osd.1
    manager.raw_cluster_cmd('osd', 'out', '0', '1')
    manager.flush_pg_stats([0, 1, 2, 3], [0, 1])
    manager.wait_for_clean()

    # lots of objects in rbd (no pg log, will backfill)
    p = rados_start(testdir, mon,
                    ['-p', 'rbd', 'bench', '20', 'write', '-b', '1',
                     '--no-cleanup'])
    p.wait()

    # few objects in rbd pool (with pg log, normal recovery)
    for f in range(1, 20):
        p = rados_start(testdir, mon, ['-p', 'rbd', 'put',
                                       'foo.%d' % f, '/etc/passwd'])
        p.wait()

    # move it back
    manager.raw_cluster_cmd('osd', 'in', '0', '1')
    manager.raw_cluster_cmd('osd', 'out', '2', '3')
    time.sleep(10)
    manager.flush_pg_stats([0, 1, 2, 3], [2, 3])
    time.sleep(10)
    manager.wait_for_active()

    # recovery is delayed, so the cluster must not be clean yet
    assert not manager.is_clean()
    assert not manager.is_recovered()

    # kill 2 + 3 -- the only copies of the recent writes are gone
    log.info('stopping 2,3')
    manager.kill_osd(2)
    manager.kill_osd(3)
    manager.raw_cluster_cmd('osd', 'down', '2', '3')
    manager.flush_pg_stats([0, 1])
    manager.wait_for_active_or_down()

    assert manager.get_num_down() > 0

    # revive them and wait until everything is back up
    manager.revive_osd(2)
    manager.revive_osd(3)
    while len(manager.get_osd_status()['up']) < 4:
        log.info('waiting a bit...')
        time.sleep(2)
    log.info('all are up!')

    # kick the recovery work queues so the delayed recovery proceeds
    for i in range(4):
        manager.kick_recovery_wq(i)

    # cluster must recover
    manager.wait_for_clean()