2 from cStringIO import StringIO
12 from teuthology import misc as teuthology
14 log = logging.getLogger(__name__)
17 def wait_for_victim_pg(manager):
18 """Return a PG with some data and its acting set"""
19 # wait for some PG to have data that we can mess with
22 stats = manager.get_pg_stats()
24 size = pg['stat_sum']['num_bytes']
32 def find_victim_object(ctx, pg, osd):
33 """Return a file to be fuzzed"""
34 (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys()
35 data_path = os.path.join(
37 'ceph-{id}'.format(id=osd),
39 '{pg}_head'.format(pg=pg),
44 with contextlib.closing(StringIO()) as ls_fp:
46 args=['sudo', 'ls', data_path],
49 ls_out = ls_fp.getvalue()
51 # find an object file we can mess with (and not the pg info object)
52 osdfilename = next(line for line in ls_out.split('\n')
53 if not line.endswith('::::head#'))
54 assert osdfilename is not None
56 # Get actual object name from osd stored filename
57 objname = osdfilename.split(':')[4]
58 return osd_remote, os.path.join(data_path, osdfilename), objname
61 def corrupt_file(osd_remote, path):
62 # put a single \0 at the beginning of the file
67 'bs=1', 'count=1', 'conv=notrunc']
def deep_scrub(manager, victim, pool):
    """Deep-scrub the victim PG and verify that it is flagged inconsistent.

    manager -- cluster manager; must provide ``do_pg_scrub`` and
               ``get_single_pg_stats``
    victim  -- id of the PG to scrub
    pool    -- name of the pool the PG lives in

    Raises AssertionError when the PG state does not contain
    ``+inconsistent`` after the deep-scrub.
    """
    pgnum = get_pgnum(victim)
    manager.do_pg_scrub(pool, pgnum, 'deep-scrub')

    state = manager.get_single_pg_stats(victim)['state']
    # Fail loudly when the scrub did not flag the PG: callers rely on this
    # to know that the damage they injected was actually detected.
    assert '+inconsistent' in state
def repair(manager, victim, pool):
    """Run a repair on the victim PG and verify it is consistent again.

    manager -- cluster manager; must provide ``do_pg_scrub`` and
               ``get_single_pg_stats``
    victim  -- id of the PG to repair
    pool    -- name of the pool the PG lives in

    Raises AssertionError when the PG state still contains
    ``+inconsistent`` after the repair.
    """
    manager.do_pg_scrub(pool, get_pgnum(victim), 'repair')

    pg_state = manager.get_single_pg_stats(victim)['state']
    still_inconsistent = '+inconsistent' in pg_state
    assert not still_inconsistent
def test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, pool):
    """Corrupt an object file, then check that scrub detects and repair fixes it.

    ctx is accepted for signature parity with the other tests and is not
    used here.
    """
    # 1. damage the on-disk object on the chosen OSD
    corrupt_file(osd_remote, obj_path)
    # 2. the deep-scrub step must notice the damage
    deep_scrub(manager, pg, pool)
    # 3. the repair step must bring the PG back to a consistent state
    repair(manager, pg, pool)
103 def test_repair_bad_omap(ctx, manager, pg, osd, objname):
104 # Test deep-scrub with various omap modifications
105 # Modify omap on specific osd
106 log.info('fuzzing omap of %s' % objname)
107 manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key'])
108 manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname,
110 manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr'])
112 deep_scrub(manager, pg, 'rbd')
113 # please note, the repair here is erroneous: it rewrites the correct omap
114 # digest and data digest on the replicas with the corresponding digests
115 # from the primary osd which is hosting the victim object, see
116 # find_victim_object().
117 # so we need to either put this test at the end of this task or
118 # undo the mess-up manually before the "repair()" that just ensures
119 # the cleanup is sane; otherwise the succeeding tests will fail if they
120 # try to set "badkey" hoping to get an "inconsistent" pg with a deep-scrub.
121 manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'hdr'])
122 manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'badkey'])
123 manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname,
125 repair(manager, pg, 'rbd')
129 def __init__(self, manager, osd_remote, pool, osd_id,
130 obj_name, obj_path, omap_key, omap_val):
131 self.manager = manager
132 self.osd = osd_remote
137 self.omap_key = omap_key
138 self.omap_val = omap_val
140 @contextlib.contextmanager
141 def _test_with_file(self, messup_cmd, *checks):
142 temp = tempfile.mktemp()
143 backup_cmd = ['sudo', 'cp', os.path.join(self.path, 'data'), temp]
144 self.osd.run(args=backup_cmd)
145 self.osd.run(args=messup_cmd.split())
147 create_cmd = ['sudo', 'mkdir', self.path]
148 self.osd.run(args=create_cmd, check_status=False)
149 restore_cmd = ['sudo', 'cp', temp, os.path.join(self.path, 'data')]
150 self.osd.run(args=restore_cmd)
153 cmd = 'sudo rmdir {path}'.format(path=self.path)
154 return self._test_with_file(cmd, 'missing')
157 cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
158 'conv=notrunc oflag=append'.format(path=self.path)
159 return self._test_with_file(cmd,
160 'data_digest_mismatch',
164 cmd = 'sudo dd if=/dev/null of={path}/data'.format(path=self.path)
165 return self._test_with_file(cmd,
166 'data_digest_mismatch',
def change_obj(self):
    """Overwrite the first byte of the object's data file with a zero byte.

    Hands the dd command to ``_test_with_file`` together with the single
    expected check, ``data_digest_mismatch``, and returns its result.
    """
    # bs=1 count=1 writes exactly one byte; conv=notrunc leaves the rest
    # of the file in place.
    overwrite_first_byte = (
        'sudo dd if=/dev/zero of={path}/data bs=1 count=1 conv=notrunc'
    ).format(path=self.path)
    return self._test_with_file(overwrite_first_byte, 'data_digest_mismatch')
175 @contextlib.contextmanager
177 cmd = ['rmomapkey', self.pool, self.obj, self.omap_key]
178 self.manager.osd_admin_socket(self.osd_id, cmd)
179 yield ('omap_digest_mismatch',)
180 cmd = ['setomapval', self.pool, self.obj,
181 self.omap_key, self.omap_val]
182 self.manager.osd_admin_socket(self.osd_id, cmd)
184 @contextlib.contextmanager
186 cmd = ['setomapval', self.pool, self.obj, 'badkey', 'badval']
187 self.manager.osd_admin_socket(self.osd_id, cmd)
188 yield ('omap_digest_mismatch',)
189 cmd = ['rmomapkey', self.pool, self.obj, 'badkey']
190 self.manager.osd_admin_socket(self.osd_id, cmd)
@contextlib.contextmanager
def change_omap(self):
    """Temporarily replace the value of the object's omap key with 'badval'.

    Context manager: on entry the omap value is overwritten through the
    OSD admin socket and the tuple of expected checks,
    ``('omap_digest_mismatch',)``, is yielded.  On exit the original
    value (``self.omap_val``) is written back.
    """
    cmd = ['setomapval', self.pool, self.obj, self.omap_key, 'badval']
    self.manager.osd_admin_socket(self.osd_id, cmd)
    try:
        yield ('omap_digest_mismatch',)
    finally:
        # Restore even when the body of the ``with`` raises (e.g. a failed
        # check), so the object is not left with a corrupted omap value.
        cmd = ['setomapval', self.pool, self.obj,
               self.omap_key, self.omap_val]
        self.manager.osd_admin_socket(self.osd_id, cmd)
201 class InconsistentObjChecker:
202 """Check the returned inconsistents/inconsistent info"""
204 def __init__(self, osd, acting, obj_name):
208 assert self.osd in self.acting
def basic_checks(self, inc):
    """Sanity-check one inconsistent-object entry.

    Asserts that the entry names this checker's object, refers to the
    "head" snap, and carries as many shards as the acting set has members.
    """
    described = inc['object']
    assert described['name'] == self.obj
    assert described['snap'] == "head"
    shard_count = len(inc['shards'])
    assert shard_count == len(self.acting), \
        "the number of returned shard does not match with the acting set"
216 def run(self, check, inc):
217 func = getattr(self, check)
220 def _check_errors(self, inc, err_name):
223 for shard in inc['shards']:
224 log.info('shard = %r' % shard)
225 log.info('err = %s' % err_name)
226 assert 'osd' in shard
228 err = err_name in shard['errors']
230 assert bad_found is False, \
231 "multiple entries found for the given OSD"
232 assert err is True, \
233 "Didn't find '{err}' in errors".format(err=err_name)
236 assert osd in self.acting, "shard not in acting set"
237 assert err is False, \
238 "Expected '{err}' in errors".format(err=err_name)
240 assert bad_found is True, \
241 "Shard for osd.{osd} not found".format(osd=self.osd)
242 assert good_found is True, \
243 "No other acting shards found"
245 def _check_attrs(self, inc, attr_name):
248 for shard in inc['shards']:
249 log.info('shard = %r' % shard)
250 log.info('attr = %s' % attr_name)
251 assert 'osd' in shard
253 attr = shard.get(attr_name, False)
255 assert bad_attr is None, \
256 "multiple entries found for the given OSD"
259 assert osd in self.acting, "shard not in acting set"
260 assert good_attr is None or good_attr == attr, \
261 "multiple good attrs found"
263 assert bad_attr is not None, \
264 "bad {attr} not found".format(attr=attr_name)
265 assert good_attr is not None, \
266 "good {attr} not found".format(attr=attr_name)
267 assert good_attr != bad_attr, \
268 "bad attr is identical to the good ones: " \
269 "{0} == {1}".format(good_attr, bad_attr)
def data_digest_mismatch(self, inc):
    """Check a ``data_digest_mismatch`` report.

    The error must appear in the entry's ``errors``; the ``data_digest``
    attribute is then checked via ``_check_attrs``.
    """
    reported = inc['errors']
    assert 'data_digest_mismatch' in reported
    self._check_attrs(inc, 'data_digest')
def missing(self, inc):
    """Check a ``missing`` report.

    The error must appear in the entry's ``union_shard_errors``; the
    per-shard error lists are then checked via ``_check_errors``.
    """
    shard_errors = inc['union_shard_errors']
    assert 'missing' in shard_errors
    self._check_errors(inc, 'missing')
def size_mismatch(self, inc):
    """Check a ``size_mismatch`` report.

    The error must appear in the entry's ``errors``; the ``size``
    attribute is then checked via ``_check_attrs``.
    """
    reported = inc['errors']
    assert 'size_mismatch' in reported
    self._check_attrs(inc, 'size')
def omap_digest_mismatch(self, inc):
    """Check an ``omap_digest_mismatch`` report.

    The error must appear in the entry's ``errors``; the ``omap_digest``
    attribute is then checked via ``_check_attrs``.
    """
    reported = inc['errors']
    assert 'omap_digest_mismatch' in reported
    self._check_attrs(inc, 'omap_digest')
288 def test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd_id,
290 mon = manager.controller
294 manager.do_rados(mon, ['-p', pool, 'setomapval', obj_name,
296 # Update missing digests, requires "osd deep scrub update digest min age: 0"
297 pgnum = get_pgnum(pg)
298 manager.do_pg_scrub(pool, pgnum, 'deep-scrub')
300 messup = MessUp(manager, osd_remote, pool, osd_id, obj_name, obj_path,
302 for test in [messup.rm_omap, messup.add_omap, messup.change_omap,
303 messup.append, messup.truncate, messup.change_obj,
305 with test() as checks:
306 deep_scrub(manager, pg, pool)
307 cmd = 'rados list-inconsistent-pg {pool} ' \
308 '--format=json'.format(pool=pool)
309 with contextlib.closing(StringIO()) as out:
310 mon.run(args=cmd.split(), stdout=out)
311 pgs = json.loads(out.getvalue())
314 cmd = 'rados list-inconsistent-obj {pg} ' \
315 '--format=json'.format(pg=pg)
316 with contextlib.closing(StringIO()) as out:
317 mon.run(args=cmd.split(), stdout=out)
318 objs = json.loads(out.getvalue())
319 assert len(objs['inconsistents']) == 1
321 checker = InconsistentObjChecker(osd_id, acting, obj_name)
322 inc_obj = objs['inconsistents'][0]
323 log.info('inc = %r', inc_obj)
324 checker.basic_checks(inc_obj)
326 checker.run(check, inc_obj)
329 def task(ctx, config):
341 - deep-scrub 0 missing, 1 inconsistent objects
342 - deep-scrub [0-9]+ errors
343 - repair 0 missing, 1 inconsistent objects
344 - repair [0-9]+ errors, [0-9]+ fixed
345 - shard [0-9]+ missing
346 - deep-scrub 1 missing, 1 inconsistent objects
347 - does not match object info size
348 - attr name mistmatch
349 - deep-scrub 1 missing, 0 inconsistent objects
350 - failed to pick suitable auth object
353 osd deep scrub update digest min age: 0
358 assert isinstance(config, dict), \
359 'scrub_test task only accepts a dict for configuration'
360 first_mon = teuthology.get_first_mon(ctx, config)
361 (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
363 num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
364 log.info('num_osds is %s' % num_osds)
366 manager = ceph_manager.CephManager(
369 logger=log.getChild('ceph_manager'),
372 while len(manager.get_osd_status()['up']) < num_osds:
375 for i in range(num_osds):
376 manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
377 '--', '--osd-objectstore-fuse')
378 manager.flush_pg_stats(range(num_osds))
379 manager.wait_for_clean()
382 p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
383 'write', '-b', '4096'])
384 log.info('err is %d' % p.exitstatus)
386 # wait for some PG to have data that we can mess with
387 pg, acting = wait_for_victim_pg(manager)
390 osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
391 manager.do_rados(mon, ['-p', 'rbd', 'setomapval', obj_name, 'key', 'val'])
392 log.info('err is %d' % p.exitstatus)
393 manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', obj_name, 'hdr'])
394 log.info('err is %d' % p.exitstatus)
396 # Update missing digests, requires "osd deep scrub update digest min age: 0"
397 pgnum = get_pgnum(pg)
398 manager.do_pg_scrub('rbd', pgnum, 'deep-scrub')
400 log.info('messing with PG %s on osd %d' % (pg, osd))
401 test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, 'rbd')
402 test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
403 test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
405 log.info('test successful!')
407 # shut down fuse mount
408 for i in range(num_osds):
409 manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
410 '--', '--no-osd-objectstore-fuse')