2 Special regression test for tracker #11184
4 Synopsis: osd/SnapMapper.cc: 282: FAILED assert(check(oid))
6 This is accomplished by moving a pg that wasn't part of the split and still includes
11 from cStringIO import StringIO
13 from teuthology.orchestra import run
14 from teuthology import misc as teuthology
15 from util.rados import rados
19 log = logging.getLogger(__name__)
22 def task(ctx, config):
24 Test handling of divergent entries during export / import
25 to regression test tracker #11184
33 Requires 3 osds on a single test node.
37 assert isinstance(config, dict), \
38 'divergent_priors task only accepts a dict for configuration'
40 manager = ctx.managers['ceph']
42 while len(manager.get_osd_status()['up']) < 3:
45 manager.flush_pg_stats(osds)
46 manager.raw_cluster_cmd('osd', 'set', 'noout')
47 manager.raw_cluster_cmd('osd', 'set', 'noin')
48 manager.raw_cluster_cmd('osd', 'set', 'nodown')
49 manager.wait_for_clean()
51 # something that is always there
52 dummyfile = '/etc/fstab'
53 dummyfile2 = '/etc/resolv.conf'
54 testdir = teuthology.get_testdir(ctx)
57 log.info('creating foo')
58 manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
59 manager.raw_cluster_cmd(
60 'osd', 'pool', 'application', 'enable',
61 'foo', 'rados', run.Raw('||'), 'true')
63 # Remove extra pool to simplify log output
64 manager.raw_cluster_cmd('osd', 'pool', 'delete', 'rbd', 'rbd', '--yes-i-really-really-mean-it')
67 manager.set_config(i, osd_min_pg_log_entries=10)
68 manager.set_config(i, osd_max_pg_log_entries=10)
69 manager.set_config(i, osd_pg_log_trim_min=5)
72 divergent = manager.get_pg_primary('foo', 0)
73 log.info("primary and soon to be divergent is %d", divergent)
74 non_divergent = list(osds)
75 non_divergent.remove(divergent)
77 log.info('writing initial objects')
78 first_mon = teuthology.get_first_mon(ctx, config)
79 (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
82 rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
84 manager.wait_for_clean()
86 # blackhole non_divergent
87 log.info("blackholing osds %s", str(non_divergent))
88 for i in non_divergent:
89 manager.set_config(i, objectstore_blackhole=1)
93 # Write some soon-to-be-divergent objects
94 log.info('writing divergent objects')
95 for i in range(DIVERGENT_WRITE):
96 rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
97 dummyfile2], wait=False)
98 # Remove some soon-to-be-divergent objects
99 log.info('remove divergent objects')
100 for i in range(DIVERGENT_REMOVE):
101 rados(ctx, mon, ['-p', 'foo', 'rm',
102 'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
105 args=['killall', '-9', 'rados'],
109 # kill all the osds but leave divergent in
110 log.info('killing all the osds')
114 manager.mark_down_osd(i)
115 for i in non_divergent:
116 manager.mark_out_osd(i)
118 # bring up non-divergent
119 log.info("bringing up non_divergent %s", str(non_divergent))
120 for i in non_divergent:
121 manager.revive_osd(i)
122 for i in non_divergent:
123 manager.mark_in_osd(i)
125 # write 1 non-divergent object (ensure that old divergent one is divergent)
126 objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
127 log.info('writing non-divergent object ' + objname)
128 rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
130 manager.wait_for_recovery()
132 # ensure no recovery of up osds first
133 log.info('delay recovery')
134 for i in non_divergent:
135 manager.wait_run_admin_socket(
136 'osd', i, ['set_recovery_delay', '100000'])
138 # bring in our divergent friend
139 log.info("revive divergent %d", divergent)
140 manager.raw_cluster_cmd('osd', 'set', 'noup')
141 manager.revive_osd(divergent)
143 log.info('delay recovery divergent')
144 manager.wait_run_admin_socket(
145 'osd', divergent, ['set_recovery_delay', '100000'])
147 manager.raw_cluster_cmd('osd', 'unset', 'noup')
148 while len(manager.get_osd_status()['up']) < 3:
151 log.info('wait for peering')
152 rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
154 # At this point the divergent_priors should have been detected
156 log.info("killing divergent %d", divergent)
157 manager.kill_osd(divergent)
159 # Split pgs for pool foo
160 manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'pg_num', '2')
163 manager.raw_cluster_cmd('pg','dump')
166 (exp_remote,) = ctx.\
167 cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
168 FSPATH = manager.get_filepath()
169 JPATH = os.path.join(FSPATH, "journal")
170 prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
171 "--data-path {fpath} --journal-path {jpath} "
173 "/var/log/ceph/objectstore_tool.$$.log ".
174 format(fpath=FSPATH, jpath=JPATH))
176 expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
177 cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}").
178 format(id=divergent, file=expfile))
179 proc = exp_remote.run(args=cmd, wait=True,
180 check_status=False, stdout=StringIO())
181 assert proc.exitstatus == 0
183 # Kill one of the non-divergent OSDs
184 log.info('killing osd.%d' % non_divergent[0])
185 manager.kill_osd(non_divergent[0])
186 manager.mark_down_osd(non_divergent[0])
187 # manager.mark_out_osd(non_divergent[0])
189 # An empty collection for pg 2.0 might need to be cleaned up
190 cmd = ((prefix + "--force --op remove --pgid 2.0").
191 format(id=non_divergent[0]))
192 proc = exp_remote.run(args=cmd, wait=True,
193 check_status=False, stdout=StringIO())
195 cmd = ((prefix + "--op import --file {file}").
196 format(id=non_divergent[0], file=expfile))
197 proc = exp_remote.run(args=cmd, wait=True,
198 check_status=False, stdout=StringIO())
199 assert proc.exitstatus == 0
201 # bring in our divergent friend and other node
202 log.info("revive divergent %d", divergent)
203 manager.revive_osd(divergent)
204 manager.mark_in_osd(divergent)
205 log.info("revive %d", non_divergent[0])
206 manager.revive_osd(non_divergent[0])
208 while len(manager.get_osd_status()['up']) < 3:
211 log.info('delay recovery divergent')
212 manager.set_config(divergent, osd_recovery_delay_start=100000)
213 log.info('mark divergent in')
214 manager.mark_in_osd(divergent)
216 log.info('wait for peering')
217 rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
219 log.info("killing divergent %d", divergent)
220 manager.kill_osd(divergent)
221 log.info("reviving divergent %d", divergent)
222 manager.revive_osd(divergent)
225 log.info('allowing recovery')
226 # Set osd_recovery_delay_start back to 0 and kick the queue
228 manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
229 'kick_recovery_wq', ' 0')
231 log.info('reading divergent objects')
232 for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
233 exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
235 assert exit_status is 0
238 cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
239 cmd = 'rm {file}'.format(file=expfile)
240 remote.run(args=cmd, wait=True)