X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=src%2Fceph%2Fqa%2Ftasks%2Fcephfs%2Ftest_damage.py;fp=src%2Fceph%2Fqa%2Ftasks%2Fcephfs%2Ftest_damage.py;h=0000000000000000000000000000000000000000;hb=7da45d65be36d36b880cc55c5036e96c24b53f00;hp=380b49c4b6584918a7bc70353f69ac8959ecde0f;hpb=691462d09d0987b47e112d6ee8740375df3c51b2;p=stor4nfv.git diff --git a/src/ceph/qa/tasks/cephfs/test_damage.py b/src/ceph/qa/tasks/cephfs/test_damage.py deleted file mode 100644 index 380b49c..0000000 --- a/src/ceph/qa/tasks/cephfs/test_damage.py +++ /dev/null @@ -1,548 +0,0 @@ -import json -import logging -import errno -import re -from teuthology.contextutil import MaxWhileTries -from teuthology.exceptions import CommandFailedError -from teuthology.orchestra.run import wait -from tasks.cephfs.fuse_mount import FuseMount -from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology - -DAMAGED_ON_START = "damaged_on_start" -DAMAGED_ON_LS = "damaged_on_ls" -CRASHED = "server crashed" -NO_DAMAGE = "no damage" -FAILED_CLIENT = "client failed" -FAILED_SERVER = "server failed" - -# An EIO in response to a stat from the client -EIO_ON_LS = "eio" - -# An EIO, but nothing in damage table (not ever what we expect) -EIO_NO_DAMAGE = "eio without damage entry" - - -log = logging.getLogger(__name__) - - -class TestDamage(CephFSTestCase): - def _simple_workload_write(self): - self.mount_a.run_shell(["mkdir", "subdir"]) - self.mount_a.write_n_mb("subdir/sixmegs", 6) - return self.mount_a.stat("subdir/sixmegs") - - def is_marked_damaged(self, rank): - mds_map = self.fs.get_mds_map() - return rank in mds_map['damaged'] - - @for_teuthology #459s - def test_object_deletion(self): - """ - That the MDS has a clean 'damaged' response to loss of any single metadata object - """ - - self._simple_workload_write() - - # Hmm, actually it would be nice to permute whether the metadata pool - # state contains sessions or not, but for the moment close this session - # to avoid waiting through reconnect on every MDS start. - self.mount_a.umount_wait() - for mds_name in self.fs.get_active_names(): - self.fs.mds_asok(["flush", "journal"], mds_name) - - self.fs.mds_stop() - self.fs.mds_fail() - - self.fs.rados(['export', '/tmp/metadata.bin']) - - def is_ignored(obj_id, dentry=None): - """ - A filter to avoid redundantly mutating many similar objects (e.g. - stray dirfrags) or similar dentries (e.g. stray dir dentries) - """ - if re.match("60.\.00000000", obj_id) and obj_id != "600.00000000": - return True - - if dentry and obj_id == "100.00000000": - if re.match("stray.+_head", dentry) and dentry != "stray0_head": - return True - - return False - - def get_path(obj_id, dentry=None): - """ - What filesystem path does this object or dentry correspond to? i.e. - what should I poke to see EIO after damaging it? 
- """ - - if obj_id == "1.00000000" and dentry == "subdir_head": - return "./subdir" - elif obj_id == "10000000000.00000000" and dentry == "sixmegs_head": - return "./subdir/sixmegs" - - # None means ls will do an "ls -R" in hope of seeing some errors - return None - - objects = self.fs.rados(["ls"]).split("\n") - objects = [o for o in objects if not is_ignored(o)] - - # Find all objects with an OMAP header - omap_header_objs = [] - for o in objects: - header = self.fs.rados(["getomapheader", o]) - # The rados CLI wraps the header output in a hex-printed style - header_bytes = int(re.match("header \((.+) bytes\)", header).group(1)) - if header_bytes > 0: - omap_header_objs.append(o) - - # Find all OMAP key/vals - omap_keys = [] - for o in objects: - keys_str = self.fs.rados(["listomapkeys", o]) - if keys_str: - for key in keys_str.split("\n"): - if not is_ignored(o, key): - omap_keys.append((o, key)) - - # Find objects that have data in their bodies - data_objects = [] - for obj_id in objects: - stat_out = self.fs.rados(["stat", obj_id]) - size = int(re.match(".+, size (.+)$", stat_out).group(1)) - if size > 0: - data_objects.append(obj_id) - - # Define the various forms of damage we will inflict - class MetadataMutation(object): - def __init__(self, obj_id_, desc_, mutate_fn_, expectation_, ls_path=None): - self.obj_id = obj_id_ - self.desc = desc_ - self.mutate_fn = mutate_fn_ - self.expectation = expectation_ - if ls_path is None: - self.ls_path = "." - else: - self.ls_path = ls_path - - def __eq__(self, other): - return self.desc == other.desc - - def __hash__(self): - return hash(self.desc) - - junk = "deadbeef" * 10 - mutations = [] - - # Removals - for obj_id in objects: - if obj_id in [ - # JournalPointers are auto-replaced if missing (same path as upgrade) - "400.00000000", - # Missing dirfrags for non-system dirs result in empty directory - "10000000000.00000000", - # PurgeQueue is auto-created if not found on startup - "500.00000000" - ]: - expectation = NO_DAMAGE - else: - expectation = DAMAGED_ON_START - - log.info("Expectation on rm '{0}' will be '{1}'".format( - obj_id, expectation - )) - - mutations.append(MetadataMutation( - obj_id, - "Delete {0}".format(obj_id), - lambda o=obj_id: self.fs.rados(["rm", o]), - expectation - )) - - # Blatant corruptions - mutations.extend([ - MetadataMutation( - o, - "Corrupt {0}".format(o), - lambda o=o: self.fs.rados(["put", o, "-"], stdin_data=junk), - DAMAGED_ON_START - ) for o in data_objects - ]) - - # Truncations - for obj_id in data_objects: - if obj_id == "500.00000000": - # The PurgeQueue is allowed to be empty: Journaler interprets - # an empty header object as an empty journal. 
- expectation = NO_DAMAGE - else: - expectation = DAMAGED_ON_START - - mutations.append( - MetadataMutation( - o, - "Truncate {0}".format(o), - lambda o=o: self.fs.rados(["truncate", o, "0"]), - DAMAGED_ON_START - )) - - # OMAP value corruptions - for o, k in omap_keys: - if o.startswith("100."): - # Anything in rank 0's 'mydir' - expectation = DAMAGED_ON_START - else: - expectation = EIO_ON_LS - - mutations.append( - MetadataMutation( - o, - "Corrupt omap key {0}:{1}".format(o, k), - lambda o=o,k=k: self.fs.rados(["setomapval", o, k, junk]), - expectation, - get_path(o, k) - ) - ) - - # OMAP header corruptions - for obj_id in omap_header_objs: - if re.match("60.\.00000000", obj_id) \ - or obj_id in ["1.00000000", "100.00000000", "mds0_sessionmap"]: - expectation = DAMAGED_ON_START - else: - expectation = NO_DAMAGE - - log.info("Expectation on corrupt header '{0}' will be '{1}'".format( - obj_id, expectation - )) - - mutations.append( - MetadataMutation( - obj_id, - "Corrupt omap header on {0}".format(obj_id), - lambda o=obj_id: self.fs.rados(["setomapheader", o, junk]), - expectation - ) - ) - - results = {} - - for mutation in mutations: - log.info("Applying mutation '{0}'".format(mutation.desc)) - - # Reset MDS state - self.mount_a.umount_wait(force=True) - self.fs.mds_stop() - self.fs.mds_fail() - self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0') - - # Reset RADOS pool state - self.fs.rados(['import', '/tmp/metadata.bin']) - - # Inject the mutation - mutation.mutate_fn() - - # Try starting the MDS - self.fs.mds_restart() - - # How long we'll wait between starting a daemon and expecting - # it to make it through startup, and potentially declare itself - # damaged to the mon cluster. - startup_timeout = 60 - - if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS, NO_DAMAGE): - if mutation.expectation == DAMAGED_ON_START: - # The MDS may pass through active before making it to damaged - try: - self.wait_until_true(lambda: self.is_marked_damaged(0), startup_timeout) - except RuntimeError: - pass - - # Wait for MDS to either come up or go into damaged state - try: - self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), startup_timeout) - except RuntimeError: - crashed = False - # Didn't make it to healthy or damaged, did it crash? - for daemon_id, daemon in self.fs.mds_daemons.items(): - if daemon.proc and daemon.proc.finished: - crashed = True - log.error("Daemon {0} crashed!".format(daemon_id)) - daemon.proc = None # So that subsequent stop() doesn't raise error - if not crashed: - # Didn't go health, didn't go damaged, didn't crash, so what? 
- raise - else: - log.info("Result: Mutation '{0}' led to crash".format(mutation.desc)) - results[mutation] = CRASHED - continue - if self.is_marked_damaged(0): - log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc)) - results[mutation] = DAMAGED_ON_START - continue - else: - log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc)) - else: - try: - self.wait_until_true(self.fs.are_daemons_healthy, 60) - except RuntimeError: - log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc)) - if self.is_marked_damaged(0): - results[mutation] = DAMAGED_ON_START - else: - results[mutation] = FAILED_SERVER - continue - log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc)) - - # MDS is up, should go damaged on ls or client mount - self.mount_a.mount() - self.mount_a.wait_until_mounted() - if mutation.ls_path == ".": - proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path], wait=False) - else: - proc = self.mount_a.stat(mutation.ls_path, wait=False) - - if mutation.expectation == DAMAGED_ON_LS: - try: - self.wait_until_true(lambda: self.is_marked_damaged(0), 60) - log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc)) - results[mutation] = DAMAGED_ON_LS - except RuntimeError: - if self.fs.are_daemons_healthy(): - log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format( - mutation.desc)) - results[mutation] = NO_DAMAGE - else: - log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc)) - results[mutation] = FAILED_SERVER - - else: - try: - wait([proc], 20) - log.info("Result: Mutation '{0}' did not caused DAMAGED state".format(mutation.desc)) - results[mutation] = NO_DAMAGE - except MaxWhileTries: - log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc)) - results[mutation] = FAILED_CLIENT - except CommandFailedError as e: - if e.exitstatus == errno.EIO: - log.info("Result: EIO on client") - results[mutation] = EIO_ON_LS - else: - log.info("Result: unexpected error {0} on client".format(e)) - results[mutation] = FAILED_CLIENT - - if mutation.expectation == EIO_ON_LS: - # EIOs mean something handled by DamageTable: assert that it has - # been populated - damage = json.loads( - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), "damage", "ls", '--format=json-pretty')) - if len(damage) == 0: - results[mutation] = EIO_NO_DAMAGE - - failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result] - if failures: - log.error("{0} mutations had unexpected outcomes:".format(len(failures))) - for mutation, result in failures: - log.error(" Expected '{0}' actually '{1}' from '{2}'".format( - mutation.expectation, result, mutation.desc - )) - raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures))) - else: - log.info("All {0} mutations had expected outcomes".format(len(mutations))) - - def test_damaged_dentry(self): - # Damage to dentrys is interesting because it leaves the - # directory's `complete` flag in a subtle state where - # we have marked the dir complete in order that folks - # can access it, but in actual fact there is a dentry - # missing - self.mount_a.run_shell(["mkdir", "subdir/"]) - - self.mount_a.run_shell(["touch", "subdir/file_undamaged"]) - self.mount_a.run_shell(["touch", "subdir/file_to_be_damaged"]) - - subdir_ino = 
self.mount_a.path_to_ino("subdir") - - self.mount_a.umount_wait() - for mds_name in self.fs.get_active_names(): - self.fs.mds_asok(["flush", "journal"], mds_name) - - self.fs.mds_stop() - self.fs.mds_fail() - - # Corrupt a dentry - junk = "deadbeef" * 10 - dirfrag_obj = "{0:x}.00000000".format(subdir_ino) - self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk]) - - # Start up and try to list it - self.fs.mds_restart() - self.fs.wait_for_daemons() - - self.mount_a.mount() - self.mount_a.wait_until_mounted() - dentries = self.mount_a.ls("subdir/") - - # The damaged guy should have disappeared - self.assertEqual(dentries, ["file_undamaged"]) - - # I should get ENOENT if I try and read it normally, because - # the dir is considered complete - try: - self.mount_a.stat("subdir/file_to_be_damaged", wait=True) - except CommandFailedError as e: - self.assertEqual(e.exitstatus, errno.ENOENT) - else: - raise AssertionError("Expected ENOENT") - - # The fact that there is damaged should have bee recorded - damage = json.loads( - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), - "damage", "ls", '--format=json-pretty')) - self.assertEqual(len(damage), 1) - damage_id = damage[0]['id'] - - # If I try to create a dentry with the same name as the damaged guy - # then that should be forbidden - try: - self.mount_a.touch("subdir/file_to_be_damaged") - except CommandFailedError as e: - self.assertEqual(e.exitstatus, errno.EIO) - else: - raise AssertionError("Expected EIO") - - # Attempting that touch will clear the client's complete flag, now - # when I stat it I'll get EIO instead of ENOENT - try: - self.mount_a.stat("subdir/file_to_be_damaged", wait=True) - except CommandFailedError as e: - if isinstance(self.mount_a, FuseMount): - self.assertEqual(e.exitstatus, errno.EIO) - else: - # Kernel client handles this case differently - self.assertEqual(e.exitstatus, errno.ENOENT) - else: - raise AssertionError("Expected EIO") - - nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files") - self.assertEqual(nfiles, "2") - - self.mount_a.umount_wait() - - # Now repair the stats - scrub_json = self.fs.mds_asok(["scrub_path", "/subdir", "repair"]) - log.info(json.dumps(scrub_json, indent=2)) - - self.assertEqual(scrub_json["passed_validation"], False) - self.assertEqual(scrub_json["raw_stats"]["checked"], True) - self.assertEqual(scrub_json["raw_stats"]["passed"], False) - - # Check that the file count is now correct - self.mount_a.mount() - self.mount_a.wait_until_mounted() - nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files") - self.assertEqual(nfiles, "1") - - # Clean up the omap object - self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk]) - - # Clean up the damagetable entry - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), - "damage", "rm", "{did}".format(did=damage_id)) - - # Now I should be able to create a file with the same name as the - # damaged guy if I want. 
- self.mount_a.touch("subdir/file_to_be_damaged") - - def test_open_ino_errors(self): - """ - That errors encountered during opening inos are properly propagated - """ - - self.mount_a.run_shell(["mkdir", "dir1"]) - self.mount_a.run_shell(["touch", "dir1/file1"]) - self.mount_a.run_shell(["mkdir", "dir2"]) - self.mount_a.run_shell(["touch", "dir2/file2"]) - self.mount_a.run_shell(["mkdir", "testdir"]) - self.mount_a.run_shell(["ln", "dir1/file1", "testdir/hardlink1"]) - self.mount_a.run_shell(["ln", "dir2/file2", "testdir/hardlink2"]) - - file1_ino = self.mount_a.path_to_ino("dir1/file1") - file2_ino = self.mount_a.path_to_ino("dir2/file2") - dir2_ino = self.mount_a.path_to_ino("dir2") - - # Ensure everything is written to backing store - self.mount_a.umount_wait() - self.fs.mds_asok(["flush", "journal"]) - - # Drop everything from the MDS cache - self.mds_cluster.mds_stop() - self.fs.journal_tool(['journal', 'reset']) - self.mds_cluster.mds_fail_restart() - self.fs.wait_for_daemons() - - self.mount_a.mount() - - # Case 1: un-decodeable backtrace - - # Validate that the backtrace is present and decodable - self.fs.read_backtrace(file1_ino) - # Go corrupt the backtrace of alpha/target (used for resolving - # bravo/hardlink). - self.fs._write_data_xattr(file1_ino, "parent", "rhubarb") - - # Check that touching the hardlink gives EIO - ran = self.mount_a.run_shell(["stat", "testdir/hardlink1"], wait=False) - try: - ran.wait() - except CommandFailedError: - self.assertTrue("Input/output error" in ran.stderr.getvalue()) - - # Check that an entry is created in the damage table - damage = json.loads( - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), - "damage", "ls", '--format=json-pretty')) - self.assertEqual(len(damage), 1) - self.assertEqual(damage[0]['damage_type'], "backtrace") - self.assertEqual(damage[0]['ino'], file1_ino) - - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), - "damage", "rm", str(damage[0]['id'])) - - - # Case 2: missing dirfrag for the target inode - - self.fs.rados(["rm", "{0:x}.00000000".format(dir2_ino)]) - - # Check that touching the hardlink gives EIO - ran = self.mount_a.run_shell(["stat", "testdir/hardlink2"], wait=False) - try: - ran.wait() - except CommandFailedError: - self.assertTrue("Input/output error" in ran.stderr.getvalue()) - - # Check that an entry is created in the damage table - damage = json.loads( - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), - "damage", "ls", '--format=json-pretty')) - self.assertEqual(len(damage), 2) - if damage[0]['damage_type'] == "backtrace" : - self.assertEqual(damage[0]['ino'], file2_ino) - self.assertEqual(damage[1]['damage_type'], "dir_frag") - self.assertEqual(damage[1]['ino'], dir2_ino) - else: - self.assertEqual(damage[0]['damage_type'], "dir_frag") - self.assertEqual(damage[0]['ino'], dir2_ino) - self.assertEqual(damage[1]['damage_type'], "backtrace") - self.assertEqual(damage[1]['ino'], file2_ino) - - for entry in damage: - self.fs.mon_manager.raw_cluster_cmd( - 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), - "damage", "rm", str(entry['id']))