# src/ceph/qa/tasks/mgr/test_failover.py

import json
import logging

from tasks.mgr.mgr_test_case import MgrTestCase


log = logging.getLogger(__name__)


class TestFailover(MgrTestCase):
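    # The test harness brings up this many mgr daemons; we need at least
    # two so that a standby exists to take over from the failed active.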
    MGRS_REQUIRED = 2

    def test_timeout(self):
        """
        That when an active mgr stops responding, a standby is promoted
        after mon_mgr_beacon_grace.
        """
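        # The mon only promotes a standby once the active mgr has missed
        # beacons for mon_mgr_beacon_grace (30 seconds by default).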

        # Query which mgr is active
        original_active = self.mgr_cluster.get_active_id()
        original_standbys = self.mgr_cluster.get_standby_ids()

        # Stop that daemon
        self.mgr_cluster.mgr_stop(original_active)

        # Assert that the other mgr becomes active
        self.wait_until_true(
            lambda: self.mgr_cluster.get_active_id() in original_standbys,
            timeout=60
        )

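        # Restart the daemon we stopped; another mgr is active now, so it
        # should rejoin the cluster as a standby.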
        self.mgr_cluster.mgr_restart(original_active)
        self.wait_until_true(
            lambda: original_active in self.mgr_cluster.get_standby_ids(),
            timeout=10
        )

    def test_timeout_nostandby(self):
        """
        That when an active mgr stops responding, and no standby is
        available, the active mgr is removed from the map anyway.
        """
        # Query which mgr is active
        original_active = self.mgr_cluster.get_active_id()
        original_standbys = self.mgr_cluster.get_standby_ids()

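        # Take every standby out of the cluster: stopping a daemon halts
        # its beacons, and marking it failed removes it from the mgrmap
        # immediately rather than waiting out the beacon grace period.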
        for s in original_standbys:
            self.mgr_cluster.mgr_stop(s)
            self.mgr_cluster.mgr_fail(s)

        self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)

        grace = int(self.mgr_cluster.get_config("mon_mgr_beacon_grace"))
        log.info("Should time out in about {0} seconds".format(grace))

        self.mgr_cluster.mgr_stop(original_active)

        # Now wait for the mon to notice the mgr is gone and remove it
        # from the map.
        self.wait_until_equal(
            lambda: self.mgr_cluster.get_active_id(),
            "",
            timeout=grace * 2
        )

        self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
        self.assertEqual(self.mgr_cluster.get_active_id(), "")


    def test_explicit_fail(self):
        """
        That when a user explicitly fails a daemon, a standby immediately
        replaces it.
        """
        # Query which mgr is active
        original_active = self.mgr_cluster.get_active_id()
        original_standbys = self.mgr_cluster.get_standby_ids()

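        # "ceph mgr fail" drops the active mgr from the map without
        # touching the process, so a standby can take over immediately.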
        self.mgr_cluster.mgr_fail(original_active)

        # A standby should take over
        self.wait_until_true(
            lambda: self.mgr_cluster.get_active_id() in original_standbys,
            timeout=60
        )

        # The one we failed should come back as a standby (it isn't
        # really dead)
        self.wait_until_true(
            lambda: original_active in self.mgr_cluster.get_standby_ids(),
            timeout=10
        )

        # Both daemons should have fully populated metadata
        # (regression test for http://tracker.ceph.com/issues/21260)
        meta = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
            "mgr", "metadata"))
        id_to_meta = {i['id']: i for i in meta}
        for i in [original_active] + original_standbys:
            self.assertIn(i, id_to_meta)
            self.assertIn('ceph_version', id_to_meta[i])

        # We should be able to fail back over again: this exercises
        # our re-initialization of the python runtime within
        # a single process lifetime.

        # Get rid of any bystander standbys so that the original_active
        # will be selected as next active.
        new_active = self.mgr_cluster.get_active_id()
        for daemon in original_standbys:
            if daemon != new_active:
                self.mgr_cluster.mgr_stop(daemon)
                self.mgr_cluster.mgr_fail(daemon)

        self.assertListEqual(self.mgr_cluster.get_standby_ids(),
                             [original_active])

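        # Fail the current active as well; original_active is the only
        # remaining candidate, so it should be promoted straight away.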
        self.mgr_cluster.mgr_stop(new_active)
        self.mgr_cluster.mgr_fail(new_active)

        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
        self.assertEqual(self.mgr_cluster.get_standby_ids(), [])

    def test_standby_timeout(self):
        """
        That when a standby daemon stops sending beacons, it is
        removed from the list of standbys.
        """
        original_active = self.mgr_cluster.get_active_id()
        original_standbys = self.mgr_cluster.get_standby_ids()

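        # Stop a standby without marking it failed: the only way it can
        # then leave the map is via the beacon timeout under test.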
        victim = original_standbys[0]
        self.mgr_cluster.mgr_stop(victim)

        expect_standbys = set(original_standbys) - {victim}

        self.wait_until_true(
            lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
            timeout=60
        )
        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)