from StringIO import StringIO
from tasks.cephfs.cephfs_test_case import CephFSTestCase
from tasks.workunit import task as workunit

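# Journal format versions understood by the MDS: 0 is the legacy layout,
# 1 is the resilient layout that the MDS rewrites journals into on upgrade.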
JOURNAL_FORMAT_LEGACY = 0
JOURNAL_FORMAT_RESILIENT = 1


class TestJournalMigration(CephFSTestCase):
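    """
    Check that an MDS journal written in the legacy (v0) format is
    rewritten into the resilient (v1) format when the MDS is restarted
    with 'mds journal format' set to the new version, and that no
    client-visible data is lost in the process.
    """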
    CLIENTS_REQUIRED = 1
    MDSS_REQUIRED = 2

    def test_journal_migration(self):
        old_journal_version = JOURNAL_FORMAT_LEGACY
        new_journal_version = JOURNAL_FORMAT_RESILIENT

        # Pick out two daemons to use
        mds_a, mds_b = sorted(self.mds_cluster.mds_ids[0:2])

        self.mount_a.umount_wait()
        self.fs.mds_stop()

        # Enable standby replay, to cover the bug case #8811 where
        # a standby replay might mistakenly end up trying to rewrite
        # the journal at the same time as an active daemon.
        self.fs.set_ceph_conf('mds', 'mds standby replay', "true")
        self.fs.set_ceph_conf('mds', 'mds standby for rank', "0")

        # Create a filesystem using the older journal format.
        self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version)
        self.fs.recreate()
        self.fs.mds_restart(mds_id=mds_a)
        self.fs.wait_for_daemons()
        self.assertEqual(self.fs.get_active_names(), [mds_a])

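        # Helper: names of any MDS daemons currently in standby replay
        # for this filesystem.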
        def replay_names():
            return [s['name']
                    for s in self.fs.status().get_replays(fscid=self.fs.id)]

        # Start the standby and wait for it to come up
        self.fs.mds_restart(mds_id=mds_b)
        self.wait_until_equal(
                replay_names,
                [mds_b],
                timeout=30)

        # Do some client work so that the log is populated with something.
        with self.mount_a.mounted():
            self.mount_a.create_files()
            self.mount_a.check_files()  # sanity, this should always pass

            # Run a more substantial workunit so that the length of the log to be
            # converted will span at least a few segments
            workunit(self.ctx, {
                'clients': {
                    "client.{0}".format(self.mount_a.client_id): ["suites/fsstress.sh"],
                },
                "timeout": "3h"
            })

        # Modify the ceph.conf to ask the MDS to use the new journal format.
        self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version)

        # Restart the MDS daemons.
        self.fs.mds_fail_restart(mds_id=mds_a)
        self.fs.mds_fail_restart(mds_id=mds_b)

        # This ensures that all daemons come up into a valid state
        self.fs.wait_for_daemons()

        # Check that files created in the initial client workload are still visible
        # in a client mount.
        with self.mount_a.mounted():
            self.mount_a.check_files()

        # Verify that the journal really has been rewritten.
        journal_version = self.fs.get_journal_version()
        if journal_version != new_journal_version:
            raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format(
                new_journal_version, journal_version
            ))

        # Verify that cephfs-journal-tool can now read the rewritten journal
        inspect_out = self.fs.journal_tool(["journal", "inspect"])
        if not inspect_out.endswith(": OK"):
            raise RuntimeError("Unexpected journal-tool result: '{0}'".format(
                inspect_out
            ))

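        # Dump all journal events to a JSON file so they can be counted
        # on the remote node.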
        self.fs.journal_tool(["event", "get", "json", "--path", "/tmp/journal.json"])
        p = self.fs.tool_remote.run(
            args=[
                "python",
                "-c",
                "import json; print len(json.load(open('/tmp/journal.json')))"
            ],
            stdout=StringIO())
        event_count = int(p.stdout.getvalue().strip())
        if event_count < 1000:
            # Approximate value of "lots", expected from having run fsstress
            raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count))

        # Do some client work to check that writing the log is still working
        with self.mount_a.mounted():
            workunit(self.ctx, {
                'clients': {
                    "client.{0}".format(self.mount_a.client_id): ["fs/misc/trivial_sync.sh"],
                },
                "timeout": "3h"
            })

        # Check that both an active daemon and a standby replay are still up
        self.assertEqual(len(replay_names()), 1)
        self.assertEqual(len(self.fs.get_active_names()), 1)
        self.assertTrue(self.mds_cluster.mds_daemons[mds_a].running())
        self.assertTrue(self.mds_cluster.mds_daemons[mds_b].running())