5 from tasks.mgr.mgr_test_case import MgrTestCase
8 log = logging.getLogger(__name__)
11 class TestFailover(MgrTestCase):
# Tests of ceph-mgr active/standby failover behaviour, driven through
# self.mgr_cluster (provided by the MgrTestCase base class).
#
# NOTE(review): this chunk appears to have lost lines in extraction — each
# surviving line still carries its original file line number, and those
# numbers are non-contiguous.  Missing pieces include the docstring
# delimiters, several `self.wait_until_true(` call openers with their
# `timeout=`/closing-paren lines, and some call arguments.  Code lines are
# left byte-identical below; only review comments are added.  Recover the
# missing lines from version control before running this file.
14 def test_timeout(self):
# Purpose: stop the active mgr and verify a standby is promoted; then
# restart the stopped daemon and verify it rejoins as a standby.
16 That when an active mgr stops responding, a standby is promoted
17 after mon_mgr_beacon_grace.
# NOTE(review): the two lines above read as docstring text; the enclosing
# `"""` delimiters are among the dropped lines.
20 # Query which mgr is active
21 original_active = self.mgr_cluster.get_active_id()
22 original_standbys = self.mgr_cluster.get_standby_ids()
# Stop the active daemon so the mon stops receiving its beacons.
25 self.mgr_cluster.mgr_stop(original_active)
27 # Assert that the other mgr becomes active
# NOTE(review): the `self.wait_until_true(` opener and its timeout/closing
# lines around this lambda are missing from this chunk — confirm upstream.
29 lambda: self.mgr_cluster.get_active_id() in original_standbys,
# Bring the old active back; it should re-register as a standby.
33 self.mgr_cluster.mgr_restart(original_active)
# NOTE(review): same here — the wait call wrapping this lambda is missing.
35 lambda: original_active in self.mgr_cluster.get_standby_ids(),
39 def test_timeout_nostandby(self):
# Purpose: with every standby stopped and failed, verify the mon still
# removes a silent active mgr from the map (leaving no active at all).
41 That when an active mgr stop responding, and no standby is
42 available, the active mgr is removed from the map anyway.
# NOTE(review): docstring text above; `"""` delimiters missing.
44 # Query which mgr is active
45 original_active = self.mgr_cluster.get_active_id()
46 original_standbys = self.mgr_cluster.get_standby_ids()
# Take every standby out of the picture: stop the daemon and mark it
# failed so the mon drops it from the map immediately.
48 for s in original_standbys:
49 self.mgr_cluster.mgr_stop(s)
50 self.mgr_cluster.mgr_fail(s)
# Precondition check: no standbys remain and the active is unchanged.
52 self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
53 self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
# mon_mgr_beacon_grace governs how long the mon waits for beacons before
# declaring the active mgr gone; logged so test output explains the wait.
55 grace = int(self.mgr_cluster.get_config("mon_mgr_beacon_grace"))
56 log.info("Should time out in about {0} seconds".format(grace))
58 self.mgr_cluster.mgr_stop(original_active)
60 # Now wait for the mon to notice the mgr is gone and remove it
62 self.wait_until_equal(
63 lambda: self.mgr_cluster.get_active_id(),
# NOTE(review): the expected-value argument (presumably "" given the
# assertion below) and the timeout/closing lines of this wait_until_equal
# call are missing from this chunk — confirm upstream.
# With no standby available, the map should end up with no active mgr.
68 self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
69 self.assertEqual(self.mgr_cluster.get_active_id(), "")
71 def test_explicit_fail(self):
# Purpose: `mgr fail` on the active should promote a standby immediately
# (no beacon-grace wait); the failed daemon should return as a standby,
# metadata should be fully populated, and a second failover back to the
# original active should also work.
73 That when a user explicitly fails a daemon, a standby immediately
# NOTE(review): docstring text above; its continuation line and the `"""`
# delimiters are missing from this chunk.
77 # Query which mgr is active
78 original_active = self.mgr_cluster.get_active_id()
79 original_standbys = self.mgr_cluster.get_standby_ids()
# Explicit administrative fail, as opposed to the beacon timeouts above.
81 self.mgr_cluster.mgr_fail(original_active)
83 # A standby should take over
# NOTE(review): wait_until_true wrapper around this lambda is missing.
85 lambda: self.mgr_cluster.get_active_id() in original_standbys,
89 # The one we failed should come back as a standby (he isn't
# NOTE(review): wait_until_true wrapper around this lambda is missing.
92 lambda: original_active in self.mgr_cluster.get_standby_ids(),
96 # Both daemons should have fully populated metadata
97 # (regression test for http://tracker.ceph.com/issues/21260)
# NOTE(review): the continuation line carrying the rest of this
# raw_cluster_cmd call (its command arguments and closing parens) is
# missing from this chunk.
98 meta = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
100 id_to_meta = dict([(i['id'], i) for i in meta])
101 for i in [original_active] + original_standbys:
102 self.assertIn(i, id_to_meta)
103 self.assertIn('ceph_version', id_to_meta[i])
105 # We should be able to fail back over again: the exercises
106 # our re-initialization of the python runtime within
107 # a single process lifetime.
109 # Get rid of any bystander standbys so that the original_active
110 # will be selected as next active.
111 new_active = self.mgr_cluster.get_active_id()
112 for daemon in original_standbys:
113 if daemon != new_active:
114 self.mgr_cluster.mgr_stop(daemon)
115 self.mgr_cluster.mgr_fail(daemon)
# NOTE(review): the second argument of this assertListEqual (presumably
# [original_active], the only daemon left standing by) is on a missing
# continuation line — confirm upstream.
117 self.assertListEqual(self.mgr_cluster.get_standby_ids(),
# Fail the current active; original_active is the only candidate left.
120 self.mgr_cluster.mgr_stop(new_active)
121 self.mgr_cluster.mgr_fail(new_active)
# NOTE(review): a wait for the failover to complete likely preceded these
# assertions on a missing line — confirm upstream.
123 self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
124 self.assertEqual(self.mgr_cluster.get_standby_ids(), [])
126 def test_standby_timeout(self):
# Purpose: stop a standby (without failing it) and verify the mon drops
# it from the standby list while the active mgr is unaffected.
128 That when a standby daemon stops sending beacons, it is
129 removed from the list of standbys
# NOTE(review): docstring text above; `"""` delimiters missing.
132 original_active = self.mgr_cluster.get_active_id()
133 original_standbys = self.mgr_cluster.get_standby_ids()
# Silence one standby's beacons by stopping its daemon.
135 victim = original_standbys[0]
136 self.mgr_cluster.mgr_stop(victim)
138 expect_standbys = set(original_standbys) - {victim}
# NOTE(review): the timeout= argument and closing paren of this call are
# on missing lines — confirm upstream.
140 self.wait_until_true(
141 lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
# The active mgr must be untouched by a standby disappearing.
144 self.assertEqual(self.mgr_cluster.get_active_id(), original_active)