1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "include/int_types.h"
16 #include "include/types.h"
22 #include <sys/types.h>
24 #include <sys/ioctl.h>
25 #include "include/compat.h"
26 #include "include/linux_fiemap.h"
27 #include "include/color.h"
28 #include "include/buffer.h"
29 #include "include/assert.h"
32 #include "os/fs/btrfs_ioctl.h"
39 #include "BtrfsFileStoreBackend.h"
41 #include "common/errno.h"
42 #include "common/config.h"
// Compile what follows only when __linux__ is defined.
44 #if defined(__linux__)
// Logging configuration (presumably picked up by the dout/derr macros used
// throughout this file): context is cct(), subsystem is ceph_subsys_filestore.
46 #define dout_context cct()
47 #define dout_subsys ceph_subsys_filestore
// Each log line is prefixed with "btrfsfilestorebackend(<basedir path>) ".
49 #define dout_prefix *_dout << "btrfsfilestorebackend(" << get_basedir_path() << ") "
// Alignment helpers. They are macros, so each argument is evaluated more than
// once: pass side-effect-free expressions only.
// ALIGN_DOWN: x minus its remainder modulo `by` (rounds a non-negative x down
// to a multiple of `by`).
51 #define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
// ALIGNED: true when x is an exact multiple of `by`.
52 #define ALIGNED(x, by) (!((x) % (by)))
// ALIGN_UP: x itself when already aligned, otherwise the next multiple of `by`
// above x.
53 #define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
// Construct the backend on top of GenericFileStoreBackend. Every detected
// feature flag (has_* and stable_commits) starts out false; the two
// m_filestore_btrfs_* switches are copied from the configuration values
// filestore_btrfs_clone_range and filestore_btrfs_snap.
55 BtrfsFileStoreBackend::BtrfsFileStoreBackend(FileStore *fs):
56 GenericFileStoreBackend(fs), has_clone_range(false),
57 has_snap_create(false), has_snap_destroy(false),
58 has_snap_create_v2(false), has_wait_sync(false), stable_commits(false),
59 m_filestore_btrfs_clone_range(cct()->_conf->filestore_btrfs_clone_range),
60 m_filestore_btrfs_snap (cct()->_conf->filestore_btrfs_snap) { }
// Probe which btrfs ioctls work on the base directory and record the results
// in has_clone_range, has_snap_create, has_snap_destroy, has_wait_sync,
// has_snap_create_v2 and stable_commits. Each probe outcome is logged at
// level 0.
62 int BtrfsFileStoreBackend::detect_features()
// Run the generic backend's feature detection first.
66 r = GenericFileStoreBackend::detect_features();
// --- CLONE_RANGE probe; skipped when the config option turns it off. ---
71 if (m_filestore_btrfs_clone_range) {
// Scratch file created in the base directory and unlinked again right away;
// the open fd keeps it usable for the probe.
72 int fd = ::openat(get_basedir_fd(), "clone_range_test", O_CREAT|O_WRONLY, 0600);
74 if (::unlinkat(get_basedir_fd(), "clone_range_test", 0) < 0) {
76 dout(0) << "detect_feature: failed to unlink test file for CLONE_RANGE ioctl: "
77 << cpp_strerror(r) << dendl;
// Call the ioctl with a deliberately invalid source fd (-1). Failing with
// EBADF is taken as evidence that the kernel understood the request.
79 btrfs_ioctl_clone_range_args clone_args;
80 memset(&clone_args, 0, sizeof(clone_args));
81 clone_args.src_fd = -1;
82 r = ::ioctl(fd, BTRFS_IOC_CLONE_RANGE, &clone_args);
83 if (r < 0 && errno == EBADF) {
84 dout(0) << "detect_feature: CLONE_RANGE ioctl is supported" << dendl;
85 has_clone_range = true;
88 dout(0) << "detect_feature: CLONE_RANGE ioctl is NOT supported: " << cpp_strerror(r) << dendl;
90 TEMP_FAILURE_RETRY(::close(fd));
93 dout(0) << "detect_feature: failed to create test file for CLONE_RANGE ioctl: "
94 << cpp_strerror(r) << dendl;
97 dout(0) << "detect_feature: CLONE_RANGE ioctl is DISABLED via 'filestore btrfs clone range' option" << dendl;
// --- Snapshot probes. vol_args is reused for several ioctls below. ---
100 struct btrfs_ioctl_vol_args vol_args;
101 memset(&vol_args, 0, sizeof(vol_args));
103 // create test source volume
105 strcpy(vol_args.name, "test_subvol");
106 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, &vol_args);
109 dout(0) << "detect_feature: failed to create simple subvolume " << vol_args.name << ": " << cpp_strerror(r) << dendl;
// srcfd refers to the test subvolume; it is later used as the source of the
// SNAP_CREATE_V2 probe and closed during cleanup.
111 int srcfd = ::openat(get_basedir_fd(), vol_args.name, O_RDONLY);
114 dout(0) << "detect_feature: failed to open " << vol_args.name << ": " << cpp_strerror(r) << dendl;
117 // snap_create and snap_destroy?
119 strcpy(vol_args.name, "sync_snap_test");
120 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
// Success, or EEXIST from a leftover snapshot, both count as "supported".
122 if (r == 0 || errno == EEXIST) {
123 dout(0) << "detect_feature: SNAP_CREATE is supported" << dendl;
124 has_snap_create = true;
// Remove the probe snapshot again; this doubles as the SNAP_DESTROY probe.
126 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
128 dout(0) << "detect_feature: SNAP_DESTROY is supported" << dendl;
129 has_snap_destroy = true;
132 dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
// Two failure causes get an actionable hint: EPERM while not running as root
// (mount option missing) and EOPNOTSUPP (kernel too old).
134 if (err == -EPERM && getuid() != 0) {
135 dout(0) << "detect_feature: failed with EPERM as non-root; remount with -o user_subvol_rm_allowed" << dendl;
137 << "btrfs SNAP_DESTROY failed as non-root; remount with -o user_subvol_rm_allowed"
138 << TEXT_NORMAL << std::endl;
139 } else if (err == -EOPNOTSUPP) {
140 derr << "btrfs SNAP_DESTROY ioctl not supported; you need a kernel newer than 2.6.32" << dendl;
144 dout(0) << "detect_feature: SNAP_CREATE failed: " << cpp_strerror(err) << dendl;
// Snapshot-based stable commits are only turned on when snapshots are
// enabled in the config AND SNAP_DESTROY works.
147 if (m_filestore_btrfs_snap) {
148 if (has_snap_destroy)
149 stable_commits = true;
151 dout(0) << "detect_feature: snaps enabled, but no SNAP_DESTROY ioctl; DISABLING" << dendl;
// --- START_SYNC / WAIT_SYNC probe. ---
156 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_START_SYNC, &transid);
159 dout(0) << "detect_feature: START_SYNC got " << cpp_strerror(err) << dendl;
161 if (r == 0 && transid > 0) {
162 dout(0) << "detect_feature: START_SYNC is supported (transid " << transid << ")" << dendl;
164 // do we have wait_sync too?
165 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
// ERANGE is accepted as well as success when deciding WAIT_SYNC exists.
166 if (r == 0 || errno == ERANGE) {
167 dout(0) << "detect_feature: WAIT_SYNC is supported" << dendl;
168 has_wait_sync = true;
171 dout(0) << "detect_feature: WAIT_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
175 dout(0) << "detect_feature: START_SYNC is NOT supported: " << cpp_strerror(err) << dendl;
179 // async snap creation?
180 struct btrfs_ioctl_vol_args_v2 async_args;
181 memset(&async_args, 0, sizeof(async_args));
182 async_args.fd = srcfd;
183 async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
184 strcpy(async_args.name, "async_snap_test");
186 // remove old one, first
188 strcpy(vol_args.name, async_args.name);
189 if (::fstatat(get_basedir_fd(), vol_args.name, &st, 0) == 0) {
190 dout(0) << "detect_feature: removing old async_snap_test" << dendl;
191 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
194 dout(0) << "detect_feature: failed to remove old async_snap_test: " << cpp_strerror(err) << dendl;
198 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
// As with SNAP_CREATE above, EEXIST still proves the ioctl is implemented.
199 if (r == 0 || errno == EEXIST) {
200 dout(0) << "detect_feature: SNAP_CREATE_V2 is supported" << dendl;
201 has_snap_create_v2 = true;
// Remove the async probe snapshot.
204 strcpy(vol_args.name, "async_snap_test");
205 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
208 dout(0) << "detect_feature: SNAP_DESTROY failed: " << cpp_strerror(err) << dendl;
212 dout(0) << "detect_feature: SNAP_CREATE_V2 is NOT supported: " << cpp_strerror(err) << dendl;
216 // clean up test subvol
218 TEMP_FAILURE_RETRY(::close(srcfd));
220 strcpy(vol_args.name, "test_subvol");
221 r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
224 dout(0) << "detect_feature: failed to remove " << vol_args.name << ": " << cpp_strerror(r) << dendl;
// Snapshots requested but no SNAP_CREATE_V2: emit a prominent warning that
// sync/commit will be slow.
227 if (m_filestore_btrfs_snap && !has_snap_create_v2) {
228 dout(0) << "mount WARNING: btrfs snaps enabled, but no SNAP_CREATE_V2 ioctl (from kernel 2.6.37+)" << dendl;
230 << " ** WARNING: 'filestore btrfs snap' is enabled (for safe transactions,\n"
231 << " rollback), but btrfs does not support the SNAP_CREATE_V2 ioctl\n"
232 << " (added in Linux 2.6.37). Expect slow btrfs sync/commit\n"
// Report whether checkpoints are available: returns the stable_commits flag
// that detect_features() / create_current() set.
240 bool BtrfsFileStoreBackend::can_checkpoint()
242 return stable_commits;
// Make sure the "current" directory exists, as a btrfs subvolume where
// possible.
// An existing current/ must be a directory. If it sits on btrfs and has a
// different st_dev than the base directory it is treated as a subvolume and
// stable_commits is enabled. The creation path visible below issues
// BTRFS_IOC_SUBVOL_CREATE for "current", chmods it to 0755 and enables
// stable_commits.
245 int BtrfsFileStoreBackend::create_current()
248 int ret = ::stat(get_current_path().c_str(), &st);
// Something that exists at the current path but is not a directory is logged
// as an error.
251 if (!S_ISDIR(st.st_mode)) {
252 dout(0) << "create_current: current/ exists but is not a directory" << dendl;
// Collect the base directory's device id and current/'s filesystem type.
257 struct statfs currentfs;
258 ret = ::fstat(get_basedir_fd(), &basest);
261 dout(0) << "create_current: cannot fstat basedir " << cpp_strerror(ret) << dendl;
// statfs() fills in currentfs through the pointer passed here.
264 ret = ::statfs(get_current_path().c_str(), &currentfs);
267 dout(0) << "create_current: cannot statsf basedir " << cpp_strerror(ret) << dendl;
// btrfs magic number plus a device id that differs from the base directory's
// means current/ is its own subvolume.
270 if (currentfs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev) {
271 dout(2) << "create_current: current appears to be a btrfs subvolume" << dendl;
272 stable_commits = true;
// Creation path: ask btrfs for a new subvolume named "current" under the
// base directory.
277 struct btrfs_ioctl_vol_args volargs;
278 memset(&volargs, 0, sizeof(volargs));
281 strcpy(volargs.name, "current");
282 if (::ioctl(get_basedir_fd(), BTRFS_IOC_SUBVOL_CREATE, (unsigned long int)&volargs) < 0) {
284 dout(0) << "create_current: BTRFS_IOC_SUBVOL_CREATE failed with error "
285 << cpp_strerror(ret) << dendl;
289 dout(2) << "create_current: created btrfs subvol " << get_current_path() << dendl;
// Give the new subvolume mode 0755.
290 if (::chmod(get_current_path().c_str(), 0755) < 0) {
292 dout(0) << "create_current: failed to chmod " << get_current_path() << " to 0755: "
293 << cpp_strerror(ret) << dendl;
297 stable_commits = true;
// Enumerate btrfs subvolumes/snapshots that live directly in the base
// directory. An entry qualifies when it is a directory on btrfs whose st_dev
// differs from the base directory's; its name is appended to `snaps`.
// NOTE(review): how `snaps` is handed back through `ls` is not visible in
// this block - confirm against the full function.
301 int BtrfsFileStoreBackend::list_checkpoints(list<string>& ls)
// Device id of the base directory, used as the comparison baseline below.
306 ret = ::fstat(get_basedir_fd(), &basest);
309 dout(0) << "list_checkpoints: cannot fstat basedir " << cpp_strerror(ret) << dendl;
314 DIR *dir = ::opendir(get_basedir_path().c_str());
317 dout(0) << "list_checkpoints: opendir '" << get_basedir_path() << "' failed: "
318 << cpp_strerror(ret) << dendl;
// Walk every directory entry, building its path under the base directory.
325 while ((de = ::readdir(dir))) {
326 snprintf(path, sizeof(path), "%s/%s", get_basedir_path().c_str(), de->d_name);
329 ret = ::stat(path, &st);
332 dout(0) << "list_checkpoints: stat '" << path << "' failed: "
333 << cpp_strerror(err) << dendl;
// Only directories can be subvolumes.
337 if (!S_ISDIR(st.st_mode))
341 ret = ::statfs(path, &fs);
344 dout(0) << "list_checkpoints: statfs '" << path << "' failed: "
345 << cpp_strerror(err) << dendl;
// Same subvolume test as in create_current(): btrfs magic and a device id
// different from the base directory.
349 if (fs.f_type == BTRFS_SUPER_MAGIC && basest.st_dev != st.st_dev)
350 snaps.push_back(string(de->d_name));
353 if (::closedir(dir) < 0) {
355 dout(0) << "list_checkpoints: closedir failed: " << cpp_strerror(ret) << dendl;
// Snapshot the current subvolume under `name` inside the base directory.
// When SNAP_CREATE_V2 is available and the caller supplied `transid`, the
// snapshot is created asynchronously and the resulting transaction id is
// stored in *transid; otherwise the plain SNAP_CREATE ioctl is used.
367 int BtrfsFileStoreBackend::create_checkpoint(const string& name, uint64_t *transid)
369 dout(10) << "create_checkpoint: '" << name << "'" << dendl;
370 if (has_snap_create_v2 && transid) {
// Async path: source is the current subvolume's fd, flagged CREATE_ASYNC.
371 struct btrfs_ioctl_vol_args_v2 async_args;
372 memset(&async_args, 0, sizeof(async_args));
373 async_args.fd = get_current_fd();
374 async_args.flags = BTRFS_SUBVOL_CREATE_ASYNC;
// Copy the name with truncation and force NUL termination.
376 size_t name_size = sizeof(async_args.name);
377 strncpy(async_args.name, name.c_str(), name_size);
378 async_args.name[name_size-1] = '\0';
380 int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE_V2, &async_args);
383 dout(0) << "create_checkpoint: async snap create '" << name << "' got " << cpp_strerror(r) << dendl;
386 dout(20) << "create_checkpoint: async snap create '" << name << "' transid " << async_args.transid << dendl;
// Hand the transaction id filled in by the ioctl back to the caller.
387 *transid = async_args.transid;
// Fallback path: same source fd and name handling, plain SNAP_CREATE ioctl.
389 struct btrfs_ioctl_vol_args vol_args;
390 memset(&vol_args, 0, sizeof(vol_args));
391 vol_args.fd = get_current_fd();
393 size_t name_size = sizeof(vol_args.name);
394 strncpy(vol_args.name, name.c_str(), name_size);
395 vol_args.name[name_size-1] = '\0';
397 int r = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
400 dout(0) << "create_checkpoint: snap create '" << name << "' got " << cpp_strerror(r) << dendl;
// Wait for the btrfs transaction `transid` by issuing BTRFS_IOC_WAIT_SYNC on
// the op fd; progress and failures are logged.
409 int BtrfsFileStoreBackend::sync_checkpoint(uint64_t transid)
412 dout(10) << "sync_checkpoint: transid " << transid << " to complete" << dendl;
413 int ret = ::ioctl(get_op_fd(), BTRFS_IOC_WAIT_SYNC, &transid);
416 dout(0) << "sync_checkpoint: ioctl WAIT_SYNC got " << cpp_strerror(ret) << dendl;
419 dout(20) << "sync_checkpoint: done waiting for transid " << transid << dendl;
// Roll the "current" subvolume back to the checkpoint `name`.
// The old "current" is destroyed (or, if that fails for a reason other than
// ENOENT, renamed to current.remove.me.<random>), then a new "current"
// snapshot is created from the checkpoint found at <basedir>/<name>.
423 int BtrfsFileStoreBackend::rollback_to(const string& name)
425 dout(10) << "rollback_to: to '" << name << "'" << dendl;
427 btrfs_ioctl_vol_args vol_args;
429 memset(&vol_args, 0, sizeof(vol_args));
431 strcpy(vol_args.name, "current");
433 int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
434 if (ret && errno != ENOENT) {
// ioctl() reports failure as -1, not as an error code, so take the real
// cause from errno (negated, matching this file's convention) before any
// other call can overwrite it, and log that instead of the raw return value.
const int destroy_err = -errno;
435 dout(0) << "rollback_to: error removing old current subvol: " << cpp_strerror(destroy_err) << dendl;
// Could not destroy it: move the old subvolume out of the way instead.
436 snprintf(s, sizeof(s), "%s/current.remove.me.%d", get_basedir_path().c_str(), rand());
437 if (::rename(get_current_path().c_str(), s)) {
439 dout(0) << "rollback_to: error renaming old current subvol: "
440 << cpp_strerror(ret) << dendl;
// Open the checkpoint to use as snapshot source; vol_args.name is still
// "current", so the ioctl below recreates current from it.
445 snprintf(s, sizeof(s), "%s/%s", get_basedir_path().c_str(), name.c_str());
448 vol_args.fd = ::open(s, O_RDONLY);
449 if (vol_args.fd < 0) {
451 dout(0) << "rollback_to: error opening '" << s << "': " << cpp_strerror(ret) << dendl;
454 ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_CREATE, &vol_args);
457 dout(0) << "rollback_to: ioctl SNAP_CREATE got " << cpp_strerror(ret) << dendl;
// The source fd is no longer needed once the snapshot ioctl has returned.
459 TEMP_FAILURE_RETRY(::close(vol_args.fd));
// Delete the checkpoint (snapshot) called `name` from the base directory via
// BTRFS_IOC_SNAP_DESTROY; a failure is logged at level 0.
463 int BtrfsFileStoreBackend::destroy_checkpoint(const string& name)
465 dout(10) << "destroy_checkpoint: '" << name << "'" << dendl;
466 btrfs_ioctl_vol_args vol_args;
467 memset(&vol_args, 0, sizeof(vol_args));
// Copy at most size-1 bytes: the buffer was zeroed above, so the last byte
// stays NUL and the name is always terminated, even when `name` is too long.
469 strncpy(vol_args.name, name.c_str(), sizeof(vol_args.name) - 1);
471 int ret = ::ioctl(get_basedir_fd(), BTRFS_IOC_SNAP_DESTROY, &vol_args);
474 dout(0) << "destroy_checkpoint: ioctl SNAP_DESTROY got " << cpp_strerror(ret) << dendl;
// Flush the filesystem by issuing BTRFS_IOC_SYNC on the op fd; a failure is
// logged at level 0.
480 int BtrfsFileStoreBackend::syncfs()
482 dout(15) << "syncfs" << dendl;
483 // do a full btrfs commit
484 int ret = ::ioctl(get_op_fd(), BTRFS_IOC_SYNC);
487 dout(0) << "syncfs: btrfs IOC_SYNC got " << cpp_strerror(ret) << dendl;
// Transfer `len` bytes from offset `srcoff` of fd `from` to offset `dstoff`
// of fd `to`. The block-aligned middle of the range is cloned with
// BTRFS_IOC_CLONE_RANGE; anything that cannot be cloned (unaligned head and
// tail, or the whole range in the fallback cases) goes through _copy_range().
492 int BtrfsFileStoreBackend::clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff)
494 dout(20) << "clone_range: " << srcoff << "~" << len << " to " << dstoff << dendl;
495 size_t blk_size = get_blksize();
// Cloning is impossible without the ioctl, or when source and destination
// offsets sit at different positions within a block.
496 if (!has_clone_range ||
497 srcoff % blk_size != dstoff % blk_size) {
498 dout(20) << "clone_range: using copy" << dendl;
499 return _copy_range(from, to, srcoff, len, dstoff);
// Round both start offsets up to the next block boundary.
505 uint64_t srcoffclone = ALIGN_UP(srcoff, blk_size);
506 uint64_t dstoffclone = ALIGN_UP(dstoff, blk_size);
// The aligned start lies at or beyond the end of the requested range, so
// there is nothing left to clone.
507 if (srcoffclone >= srcoff + len) {
508 dout(20) << "clone_range: using copy, extent too short to align srcoff" << dendl;
509 return _copy_range(from, to, srcoff, len, dstoff);
// Length remaining after skipping the unaligned head.
512 uint64_t lenclone = len - (srcoffclone - srcoff);
513 if (!ALIGNED(lenclone, blk_size)) {
514 struct stat from_stat, to_stat;
515 err = ::fstat(from, &from_stat);
516 if (err) return -errno;
517 err = ::fstat(to , &to_stat);
518 if (err) return -errno;
// An unaligned length is kept only when the range ends exactly at the
// source's EOF and reaches at least the destination's current EOF;
// otherwise it is rounded down to whole blocks.
520 if (srcoff + len != (uint64_t)from_stat.st_size ||
521 dstoff + len < (uint64_t)to_stat.st_size) {
522 // Not to the end of the file, need to align length as well
523 lenclone = ALIGN_DOWN(lenclone, blk_size);
// NOTE(review): the condition guarding this fallback copy is not visible
// here - confirm against the full function.
528 return _copy_range(from, to, srcoff, len, dstoff);
531 dout(20) << "clone_range: cloning " << srcoffclone << "~" << lenclone
532 << " to " << dstoffclone << " = " << r << dendl;
533 btrfs_ioctl_clone_range_args a;
535 a.src_offset = srcoffclone;
536 a.src_length = lenclone;
537 a.dest_offset = dstoffclone;
538 err = ::ioctl(to, BTRFS_IOC_CLONE_RANGE, &a);
541 } else if (errno == EINVAL) {
542 // Still failed, might be compressed
543 dout(20) << "clone_range: failed CLONE_RANGE call with -EINVAL, using copy" << dendl;
544 return _copy_range(from, to, srcoff, len, dstoff);
549 // Take care any trimmed from front
550 if (srcoffclone != srcoff) {
551 err = _copy_range(from, to, srcoff, srcoffclone - srcoff, dstoff);
// Copy whatever was trimmed from the tail of the range.
560 if (srcoffclone + lenclone != srcoff + len) {
561 err = _copy_range(from, to,
562 srcoffclone + lenclone,
563 (srcoff + len) - (srcoffclone + lenclone),
564 dstoffclone + lenclone);
571 dout(20) << "clone_range: finished " << srcoff << "~" << len
572 << " to " << dstoff << " = " << r << dendl;