src/ceph/qa/tasks/cephfs/test_journal_repair.py
"""
Test our tools for recovering the content of damaged journals
"""

import json
import logging
from textwrap import dedent
import time

from teuthology.exceptions import CommandFailedError, ConnectionLostError
from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
from tasks.workunit import task as workunit

log = logging.getLogger(__name__)


class TestJournalRepair(CephFSTestCase):
    MDSS_REQUIRED = 2

    def test_inject_to_empty(self):
        """
        That when some dentries are in the journal but nothing is in
        the backing store, we correctly populate the backing store
        from the journalled dentries.
        """

        # Inject metadata operations
        self.mount_a.run_shell(["touch", "rootfile"])
        self.mount_a.run_shell(["mkdir", "subdir"])
        self.mount_a.run_shell(["touch", "subdir/subdirfile"])
        # There are several different paths for handling hardlinks, depending
        # on whether an existing dentry (being overwritten) is also a hardlink
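        # (A hardlink is stored as a "remote" dentry referring to the target's
        # inode number, while a file's primary dentry embeds the inode itself,
        # so the overwrite paths differ by dentry type.)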
        self.mount_a.run_shell(["mkdir", "linkdir"])

        # Test inode -> remote transition for a dentry
        self.mount_a.run_shell(["touch", "linkdir/link0"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])

        # Test nothing -> remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])

        # Test remote -> inode transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
        self.mount_a.run_shell(["touch", "linkdir/link2"])

        # Test remote -> diff remote transition
        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
        self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
        self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])

        # Test an empty directory
        self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
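        # sync so that the client flushes everything to the MDS and all of the
        # operations above are safely recorded in the MDS journal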
        self.mount_a.run_shell(["sync"])

        # Before we unmount, make a note of the inode numbers; later we will
        # check that they match what we recover from the journal
        rootfile_ino = self.mount_a.path_to_ino("rootfile")
        subdir_ino = self.mount_a.path_to_ino("subdir")
        linkdir_ino = self.mount_a.path_to_ino("linkdir")
        subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
        subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")

        self.mount_a.umount_wait()

        # Stop the MDS
        self.fs.mds_stop()
        self.fs.mds_fail()

        # Now, the journal should contain the operations, but the backing
        # store shouldn't
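        # (list_dirfrag reads a directory's dirfrag object from the metadata
        # pool, so a directory that was never flushed raises ObjectNotFound)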
        with self.assertRaises(ObjectNotFound):
            self.fs.list_dirfrag(subdir_ino)
        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])

        # Execute the dentry recovery; this should populate the backing store
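        # (roughly "cephfs-journal-tool event recover_dentries list": scan the
        # journalled events and write the dentries they contain back into the
        # backing dirfrag objects)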
        self.fs.journal_tool(['event', 'recover_dentries', 'list'])

        # Dentries in ROOT_INO are present
        self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
        self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
        self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
                         sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))

        # Now check the MDS can read what we wrote: truncate the journal
        # and start the mds.
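        # ("cephfs-journal-tool journal reset" discards the journal contents;
        # that is safe here because recover_dentries already wrote them out to
        # the backing store)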
        self.fs.journal_tool(['journal', 'reset'])
        self.fs.mds_fail_restart()
        self.fs.wait_for_daemons()

        # List files
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        # First ls -R to populate MDCache, such that hardlinks will
        # resolve properly (recover_dentries does not create backtraces,
        # so ordinarily hardlinks to inodes that happen not to have backtraces
        # will be invisible in readdir).
        # FIXME: hook in forward scrub here to regenerate backtraces
        self.mount_a.run_shell(['ls', '-R'])
        self.mount_a.umount_wait()  # remount to clear client cache before our second ls
        self.mount_a.mount()
        self.mount_a.wait_until_mounted()

        proc = self.mount_a.run_shell(['ls', '-R'])
        self.assertEqual(proc.stdout.getvalue().strip(),
                         dedent("""
                         .:
                         linkdir
                         rootfile
                         subdir

                         ./linkdir:
                         link0
                         link1
                         link2
                         link3

                         ./subdir:
                         subdirfile
                         subsubdir

                         ./subdir/subsubdir:
                         """).strip())

        # Check the correct inos were preserved by path
        self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
        self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
        self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
        self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))

        # Check that the hard link handling came out correctly
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
        self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
        self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)

        # Create a new file and ensure it is not issued the same ino as one of
        # the recovered ones
        self.mount_a.run_shell(["touch", "afterwards"])
        new_ino = self.mount_a.path_to_ino("afterwards")
        self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino])

        # Check that we can do metadata ops in the recovered directory
        self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])

    @for_teuthology # 308s
    def test_reset(self):
        """
        That after forcibly modifying the backing store, we can get back into
        a good state by resetting the MDSMap.

        The scenario is that we have two active MDSs, and we lose the journals.  Once
        we have completely lost confidence in the integrity of the metadata, we want to
        return the system to a single-MDS state and then scrub to recover what we
        can.
        """

        # Set max_mds to 2
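        # (set_max_mds raises the permitted number of active MDS ranks for
        # this filesystem, e.g. "ceph fs set <fs_name> max_mds 2")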
        self.fs.set_max_mds(2)

        # See that we have two active MDSs
        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
                              reject_fn=lambda v: v > 2 or v < 1)
        active_mds_names = self.fs.get_active_names()

        # Switch off any unneeded MDS daemons
        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
            self.mds_cluster.mds_stop(unneeded_mds)
            self.mds_cluster.mds_fail(unneeded_mds)

        # Create a dir on each rank
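        # (the ceph.dir.pin xattr is an export pin: setting it to N asks the
        # MDS cluster to keep that directory's subtree on rank N)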
        self.mount_a.run_shell(["mkdir", "alpha"])
        self.mount_a.run_shell(["mkdir", "bravo"])
        self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0")
        self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1")

        def subtrees_assigned():
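            # Ask rank 0 for its subtree map via the admin socket (the same
            # data as "ceph daemon mds.<id> get subtrees")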
            got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])

            for s in got_subtrees:
                if s['dir']['path'] == '/bravo':
                    if s['auth_first'] == 1:
                        return True
                    else:
                        # Should not happen
                        raise RuntimeError("/bravo is subtree but not rank 1!")

            return False

        # Ensure the pinning has taken effect and the /bravo dir is now
        # migrated to rank 1.
        self.wait_until_true(subtrees_assigned, 30)

        # Do some IO (this should be split across ranks according to
        # the rank-pinned dirs)
        self.mount_a.create_n_files("alpha/file", 1000)
        self.mount_a.create_n_files("bravo/file", 1000)

        # Flush the journals so that we have some backing store data
        # belonging to one MDS, and some to the other MDS.
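        # ("flush journal" via the admin socket writes all journalled metadata
        # back to the backing store and trims the journal)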
        for mds_name in active_mds_names:
            self.fs.mds_asok(["flush", "journal"], mds_name)

        # Stop (hard) the second MDS daemon
        self.fs.mds_stop(active_mds_names[1])

        # Wipe out the tables for MDS rank 1 so that it is broken and can't start
        # (this is the simulated failure that we will demonstrate the disaster
        #  recovery tools can get us back from)
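        # (this removes rank 1's per-rank table objects from the metadata
        # pool, e.g. mds1_inotable and mds1_sessionmap)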
        self.fs.erase_metadata_objects(prefix="mds1_")

        # Try to access files from the client
        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)

        # Check that this "ls -R" blocked rather than completing: that indicates
        # it got stuck trying to access subtrees which were on the now-dead MDS.
        log.info("Sleeping to check ls is blocked...")
        time.sleep(60)
        self.assertFalse(blocked_ls.finished)

        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
        # is not coming back.  Kill it.
        log.info("Killing mount, it's blocked on the MDS we killed")
        self.mount_a.kill()
        self.mount_a.kill_cleanup()
        try:
            # Now that the mount is dead, the ls -R should error out.
            blocked_ls.wait()
        except (CommandFailedError, ConnectionLostError):
            # The ConnectionLostError case is for the kernel client, where
            # killing the mount also means killing the node.
            pass

        # See that the second MDS will crash when it starts and tries to
        # acquire rank 1
        damaged_id = active_mds_names[1]
        self.fs.mds_restart(damaged_id)

        # The daemon taking the damaged rank should begin starting up, then
        # restart back into standby after asking the mon to mark the rank
        # damaged.
        def is_marked_damaged():
            mds_map = self.fs.get_mds_map()
            return 1 in mds_map['damaged']

        self.wait_until_true(is_marked_damaged, 60)

        def get_state():
            info = self.mds_cluster.get_mds_info(damaged_id)
            return info['state'] if info is not None else None

        self.wait_until_equal(
                get_state,
                "up:standby",
                timeout=60)

        self.fs.mds_stop(damaged_id)
        self.fs.mds_fail(damaged_id)

        # Now give up and go through a disaster recovery procedure
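        # On the command line this corresponds roughly to:
        #   cephfs-journal-tool event recover_dentries summary   (for each rank)
        #   cephfs-table-tool 0 reset session
        #   cephfs-journal-tool journal reset
        #   ceph fs reset <fs_name> --yes-i-really-mean-it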
        self.fs.mds_stop(active_mds_names[0])
        self.fs.mds_fail(active_mds_names[0])
        # Invoke recover_dentries quietly, because otherwise the log spews millions of lines
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
        self.fs.table_tool(["0", "reset", "session"])
        self.fs.journal_tool(["journal", "reset"], rank=0)
        self.fs.erase_mds_objects(1)
        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                '--yes-i-really-mean-it')

        # Bring an MDS back online, mount a client, and see that we can walk the full
        # filesystem tree again
        self.fs.mds_fail_restart(active_mds_names[0])
        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
                              reject_fn=lambda v: len(v) > 1)
        self.mount_a.mount()
        self.mount_a.run_shell(["ls", "-R"], wait=True)

    def test_table_tool(self):
        active_mdss = self.fs.get_active_names()
        self.assertEqual(len(active_mdss), 1)
        mds_name = active_mdss[0]

        self.mount_a.run_shell(["touch", "foo"])
        self.fs.mds_asok(["flush", "journal"], mds_name)

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Inode table should always be the same because the initial state
        # and choice of inode are deterministic.
        # Should see one inode consumed
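        # (the free range starts at 2^40 + 1001: one ino was consumed by "foo"
        # and, assuming the default mds_client_prealloc_inos of 1000, another
        # 1000 were preallocated for the client session)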
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {
                "data": {
                    "version": 2,
                    "inotable": {
                        "projected_free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}],
                        "free": [
                            {"start": 1099511628777,
                             "len": 1099511626775}]}},
                "result": 0}}
        )

        # Should see one session
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 0,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

        # Reset everything
        for table in ["session", "inode", "snap"]:
            self.fs.table_tool(["all", "reset", table])

        log.info(self.fs.table_tool(["all", "show", "inode"]))
        log.info(self.fs.table_tool(["all", "show", "snap"]))
        log.info(self.fs.table_tool(["all", "show", "session"]))

        # Should see 0 sessions
        session_data = json.loads(self.fs.table_tool(
            ["all", "show", "session"]))
        self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0)
        self.assertEqual(session_data["0"]["result"], 0)

        # Should see entire inode range now marked free
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": 1099511627776,
                                 "len": 1099511627776}],
                                "free": [
                                    {"start": 1099511627776,
                                     "len": 1099511627776}]}},
                   "result": 0}}
        )

        # Should see no snaps
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "snap"])),
            {"version": 1,
             "snapserver": {"last_snap": 1,
                            "pending_noop": [],
                            "snaps": [],
                            "need_to_purge": {},
                            "pending_update": [],
                            "pending_destroy": []},
             "result": 0}
        )

    def test_table_tool_take_inos(self):
        initial_range_start = 1099511627776
        initial_range_len = 1099511627776
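        # 1099511627776 is 2^40, the start of the inotable's allocatable
        # range; inos below this are reserved for MDS-internal use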
        # Initially a completely clear range
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "show", "inode"])),
            {"0": {"data": {"version": 0,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start,
                                 "len": initial_range_len}],
                                "free": [
                                    {"start": initial_range_start,
                                     "len": initial_range_len}]}},
                   "result": 0}}
        )

        # Remove some
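        # ("take_inos N" marks every ino up to and including N as used, so the
        # free range should now start at start + 101 and be 101 entries shorter)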
        self.assertEqual(
            json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
            {"0": {"data": {"version": 1,
                            "inotable": {"projected_free": [
                                {"start": initial_range_start + 101,
                                 "len": initial_range_len - 101}],
                                "free": [
                                    {"start": initial_range_start + 101,
                                     "len": initial_range_len - 101}]}},
                   "result": 0}}
        )

    @for_teuthology  # Hack: "for_teuthology" because .sh doesn't work outside teuth
    def test_journal_smoke(self):
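        # The workunit task runs the named scripts from the ceph repository's
        # qa/workunits/ directory on the given clients.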
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })

        for mount in self.mounts:
            mount.umount_wait()

        self.fs.mds_stop()
        self.fs.mds_fail()

        # journal tool smoke
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "suites/cephfs_journal_tool_smoke.sh"],
            },
            "timeout": "1h"
        })

        self.fs.mds_restart()
        self.fs.wait_for_daemons()
        self.mount_a.mount()

        # trivial sync on mount a
        workunit(self.ctx, {
            'clients': {
                "client.{0}".format(self.mount_a.client_id): [
                    "fs/misc/trivial_sync.sh"],
            },
            "timeout": "1h"
        })