1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #include "include/int_types.h"
16 #include "include/types.h"
22 #include <sys/types.h>
24 #include <sys/ioctl.h>
26 #if defined(__linux__)
30 #include "include/compat.h"
31 #include "include/linux_fiemap.h"
37 #include "GenericFileStoreBackend.h"
39 #include "common/errno.h"
40 #include "common/config.h"
41 #include "common/sync_filesystem.h"
42 #include "common/blkdev.h"
44 #include "common/SloppyCRCMap.h"
45 #include "os/filestore/chain_xattr.h"
// Name of the extended attribute that _crc_load_or_init() reads and
// _crc_save() writes via chain_fgetxattr()/chain_fsetxattr().
47 #define SLOPPY_CRC_XATTR "user.cephos.scrc"
// Logging setup for dout/derr in this file: context, subsystem, and a
// per-line prefix that includes the backend's base directory path.
50 #define dout_context cct()
51 #define dout_subsys ceph_subsys_filestore
53 #define dout_prefix *_dout << "genericfilestorebackend(" << get_basedir_path() << ") "
// Alignment helpers built on the modulo operator. Each macro expands its
// arguments more than once, so arguments with side effects are unsafe.
// ALIGN_DOWN(x, by): x minus its remainder modulo `by`.
55 #define ALIGN_DOWN(x, by) ((x) - ((x) % (by)))
// ALIGNED(x, by): true when x is an exact multiple of `by`.
56 #define ALIGNED(x, by) (!((x) % (by)))
// ALIGN_UP(x, by): x itself when already aligned, otherwise the next
// multiple of `by` above x.
57 #define ALIGN_UP(x, by) (ALIGNED((x), (by)) ? (x) : (ALIGN_DOWN((x), (by)) + (by)))
// Constructor. The visible initializers start with seek_data_hole = false and
// cache four config values (filestore_fiemap, filestore_seek_data_hole,
// filestore_fsync_flushes_journal_data, filestore_splice) in members. The
// body then opens the base directory and the journal path, resolves each to
// a device name with get_device_by_fd(), and records whether that device is
// rotational in m_rotational / m_journal_rotational.
// NOTE(review): this listing omits some original lines (the embedded line
// numbers are not contiguous); checks on the open() result and the closing
// of fd are not visible here -- confirm against the full file.
59 GenericFileStoreBackend::GenericFileStoreBackend(FileStore *fs):
62 seek_data_hole(false),
64 m_filestore_fiemap(cct()->_conf->filestore_fiemap),
65 m_filestore_seek_data_hole(cct()->_conf->filestore_seek_data_hole),
66 m_filestore_fsync_flushes_journal_data(cct()->_conf->filestore_fsync_flushes_journal_data),
67 m_filestore_splice(cct()->_conf->filestore_splice)
71 // NOTE: the below won't work on btrfs; we'll assume rotational.
// Data device: open the base directory read-only just to obtain an fd that
// get_device_by_fd() can map to a partition/device name.
72 string fn = get_basedir_path();
73 int fd = ::open(fn.c_str(), O_RDONLY);
77 char partition[PATH_MAX], devname[PATH_MAX];
78 int r = get_device_by_fd(fd, partition, devname, sizeof(devname));
// Lookup failure is only logged at level 1; r is passed to cpp_strerror().
80 dout(1) << "unable to get device name for " << get_basedir_path() << ": "
81 << cpp_strerror(r) << dendl;
// Device name resolved: ask the block layer whether it is rotational.
84 m_rotational = block_device_is_rotational(devname);
85 dout(20) << __func__ << " devname " << devname
86 << " rotational " << (int)m_rotational << dendl;
90 // journal rotational?
92 // NOTE: the below won't work on btrfs; we'll assume rotational.
// Journal device: same procedure as above, applied to the journal path.
93 string fn = get_journal_path();
94 int fd = ::open(fn.c_str(), O_RDONLY);
98 char partition[PATH_MAX], devname[PATH_MAX];
99 int r = get_device_by_fd(fd, partition, devname, sizeof(devname));
101 dout(1) << "unable to get journal device name for "
102 << get_journal_path() << ": " << cpp_strerror(r) << dendl;
// Presumably the failure branch (the condition is not visible here): fall
// back to treating the journal as rotational.
103 m_journal_rotational = true;
105 m_journal_rotational = block_device_is_rotational(devname);
106 dout(20) << __func__ << " journal devname " << devname
107 << " journal rotational " << (int)m_journal_rotational << dendl;
// Probes the filesystem under the base directory for optional features and
// logs the outcome of each probe: the FIEMAP ioctl, lseek SEEK_DATA/SEEK_HOLE,
// splice(), and syncfs(2). A scratch file "<basedir>/fiemap_test" is created
// for the file-based probes and closed before the syncfs probe.
// NOTE(review): this listing omits some original lines (the embedded line
// numbers are not contiguous), so several branch conditions, declarations
// and return statements are not visible here.
113 int GenericFileStoreBackend::detect_features()
116 snprintf(fn, sizeof(fn), "%s/fiemap_test", get_basedir_path().c_str());
// O_CREAT|O_TRUNC: create the scratch file, or empty it if one is left over.
118 int fd = ::open(fn, O_CREAT|O_RDWR|O_TRUNC, 0644);
121 derr << "detect_features: unable to create " << fn << ": " << cpp_strerror(fd) << dendl;
125 // ext4 has a bug in older kernels where fiemap will return an empty
126 // result in some cases. this is a file layout that triggers the bug
129 0x0000000000016000, 0x0000000000007000,
130 0x000000000004a000, 0x0000000000007000,
131 0x0000000000060000, 0x0000000000001000,
132 0x0000000000061000, 0x0000000000008000,
133 0x0000000000069000, 0x0000000000007000,
134 0x00000000000a3000, 0x000000000000c000,
135 0x000000000024e000, 0x000000000000c000,
136 0x000000000028b000, 0x0000000000009000,
137 0x00000000002b1000, 0x0000000000003000,
// Walks v until a zero entry is reached; presumably v is the table above and
// is zero-terminated (its declaration and the loop body are not visible).
140 for (int i=0; v[i]; i++) {
144 // write a large extent
// Fill buf with 1s, seek to `off`, and write the whole buffer there. On an
// lseek or write failure the error is logged and the scratch fd is closed.
146 memset(buf, 1, sizeof(buf));
147 int r = ::lseek(fd, off, SEEK_SET);
150 derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(r) << dendl;
151 VOID_TEMP_FAILURE_RETRY(::close(fd));
154 r = write(fd, buf, sizeof(buf));
156 derr << "detect_features: failed to write to " << fn << ": " << cpp_strerror(r) << dendl;
157 VOID_TEMP_FAILURE_RETRY(::close(fd));
162 // fiemap an extent inside that
// The config option takes precedence: when it is off, FIEMAP is not probed.
163 if (!m_filestore_fiemap) {
164 dout(0) << "detect_features: FIEMAP ioctl is disabled via 'filestore fiemap' config option" << dendl;
165 ioctl_fiemap = false;
167 struct fiemap *fiemap;
// Query a fixed byte range of the scratch file (offset 2430421, length 59284).
168 int r = do_fiemap(fd, 2430421, 59284, &fiemap);
170 dout(0) << "detect_features: FIEMAP ioctl is NOT supported" << dendl;
171 ioctl_fiemap = false;
// A successful call that reports zero mapped extents is treated as the buggy
// kernel behaviour described above, and FIEMAP is disabled.
173 if (fiemap->fm_mapped_extents == 0) {
174 dout(0) << "detect_features: FIEMAP ioctl is supported, but buggy -- upgrade your kernel" << dendl;
175 ioctl_fiemap = false;
177 dout(0) << "detect_features: FIEMAP ioctl is supported and appears to work" << dendl;
184 // SEEK_DATA/SEEK_HOLE detection
185 if (!m_filestore_seek_data_hole) {
186 dout(0) << "detect_features: SEEK_DATA/SEEK_HOLE is disabled via 'filestore seek data hole' config option" << dendl;
187 seek_data_hole = false;
189 #if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA)
190 // If compiled on an OS with SEEK_HOLE/SEEK_DATA support, but running
191 // on an OS that doesn't support SEEK_HOLE/SEEK_DATA, EINVAL is returned.
192 // Fall back to use fiemap.
195 hole_pos = lseek(fd, 0, SEEK_HOLE);
// EINVAL means "not supported" and only disables the feature; the other
// visible error path logs the failure and closes the scratch fd.
197 if (errno == EINVAL) {
198 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is NOT supported" << dendl;
199 seek_data_hole = false;
201 derr << "detect_features: failed to lseek " << fn << ": " << cpp_strerror(-errno) << dendl;
202 VOID_TEMP_FAILURE_RETRY(::close(fd));
206 dout(0) << "detect_features: lseek SEEK_DATA/SEEK_HOLE is supported" << dendl;
207 seek_data_hole = true;
// splice() probe, compiled only when CEPH_HAVE_SPLICE is defined.
213 #ifdef CEPH_HAVE_SPLICE
214 if (!m_filestore_splice) {
215 dout(0) << __func__ << ": splice() is disabled via 'filestore splice' config option" << dendl;
221 if ((r = pipe(pipefd)) < 0)
222 dout(0) << "detect_features: splice pipe met error " << cpp_strerror(errno) << dendl;
// Rewind the scratch file and try to splice 10 bytes from it into the pipe.
224 lseek(fd, 0, SEEK_SET);
225 r = splice(fd, &off_in, pipefd[1], NULL, 10, 0);
// Any outcome other than a failure with errno == EINVAL counts as supported.
226 if (!(r < 0 && errno == EINVAL)) {
228 dout(0) << "detect_features: splice is supported" << dendl;
230 dout(0) << "detect_features: splice is NOT supported" << dendl;
// The file-based probes are finished; close the scratch file.
237 VOID_TEMP_FAILURE_RETRY(::close(fd));
// syncfs probe: call whichever entry point was available at compile time
// (the syncfs() wrapper, SYS_syncfs, or __NR_syncfs) on the base directory fd.
// A return value of 0 is logged as fully supported.
240 bool have_syncfs = false;
241 #ifdef HAVE_SYS_SYNCFS
242 if (::syncfs(get_basedir_fd()) == 0) {
243 dout(0) << "detect_features: syncfs(2) syscall fully supported (by glibc and kernel)" << dendl;
246 dout(0) << "detect_features: syncfs(2) syscall supported by glibc BUT NOT the kernel" << dendl;
248 #elif defined(SYS_syncfs)
249 if (syscall(SYS_syncfs, get_basedir_fd()) == 0) {
250 dout(0) << "detect_features: syscall(SYS_syncfs, fd) fully supported" << dendl;
253 dout(0) << "detect_features: syscall(SYS_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
255 #elif defined(__NR_syncfs)
256 if (syscall(__NR_syncfs, get_basedir_fd()) == 0) {
257 dout(0) << "detect_features: syscall(__NR_syncfs, fd) fully supported" << dendl;
260 dout(0) << "detect_features: syscall(__NR_syncfs, fd) supported by libc BUT NOT the kernel" << dendl;
264 dout(0) << "detect_features: syncfs(2) syscall not supported" << dendl;
// Without syncfs the log states which fallback applies: fsync when
// 'filestore fsync flushes journal data' is set, otherwise sync(2).
265 if (m_filestore_fsync_flushes_journal_data) {
266 dout(0) << "detect_features: no syncfs(2), but 'filestore fsync flushes journal data = true', so fsync will suffice." << dendl;
268 dout(0) << "detect_features: no syncfs(2), must use sync(2)." << dendl;
269 dout(0) << "detect_features: WARNING: multiple ceph-osd daemons on the same host will be slow" << dendl;
// Ensures the "current" path exists as a directory: stat() it, log when it
// exists but is not a directory, and mkdir() it with mode 0755 (presumably
// only when the stat fails -- the branch conditions are not visible here).
// NOTE(review): return values are on lines omitted from this listing.
276 int GenericFileStoreBackend::create_current()
279 int ret = ::stat(get_current_path().c_str(), &st);
// The path exists but is some other file type: logged at level 0.
282 if (!S_ISDIR(st.st_mode)) {
283 dout(0) << "_create_current: current/ exists but is not a directory" << dendl;
287 ret = ::mkdir(get_current_path().c_str(), 0755);
290 dout(0) << "_create_current: mkdir " << get_current_path() << " failed: "<< cpp_strerror(ret) << dendl;
// Flushes filesystem state using one of two strategies: an fsync() on the op
// fd when 'filestore fsync flushes journal data' is set, or a full
// sync_filesystem() on the "current" directory fd (presumably the else
// branch; the branch structure is not fully visible in this listing).
296 int GenericFileStoreBackend::syncfs()
299 if (m_filestore_fsync_flushes_journal_data) {
300 dout(15) << "syncfs: doing fsync on " << get_op_fd() << dendl;
301 // make the file system's journal commit.
302 // this works with ext3, but NOT ext4
303 ret = ::fsync(get_op_fd());
307 dout(15) << "syncfs: doing a full sync (syncfs(2) if possible)" << dendl;
308 ret = sync_filesystem(get_current_fd());
// Queries the extent map of `fd` for the byte range [start, start+len) using
// the FS_IOC_FIEMAP ioctl, in two passes: the first ioctl runs with only the
// fiemap header allocated, then the buffer is grown to hold
// fm_mapped_extents extent records and the ioctl is issued again.
//   fd      - file descriptor to query
//   start   - byte offset of the range; rounded down to CEPH_PAGE_SIZE below
//   len     - length of the range in bytes
//   pfiemap - out parameter for the result; the assignment is not visible in
//             this listing, but detect_features() reads fm_mapped_extents
//             through it
// NOTE(review): error returns and free() calls are on lines omitted from
// this listing -- confirm the cleanup paths against the full file.
313 int GenericFileStoreBackend::do_fiemap(int fd, off_t start, size_t len, struct fiemap **pfiemap)
315 struct fiemap *fiemap = NULL;
316 struct fiemap *_realloc_fiemap = NULL;
// Zero-initialised header only (no extent array yet).
320 fiemap = (struct fiemap*)calloc(sizeof(struct fiemap), 1);
324 * There is a bug on xfs about fiemap. Suppose(offset=3990, len=4096),
325 * the result is (logical=4096, len=4096). It leak the [3990, 4096).
326 * Commit:"xfs: fix rounding error of fiemap length parameter
327 * (eedf32bfcace7d8e20cc66757d74fc68f3439ff7)" fix this bug.
328 * Here, we make offset aligned with CEPH_PAGE_SIZE to avoid this bug.
// Round the start down to a page boundary and extend the length by the same
// remainder, so the end of the queried range stays at start + len.
330 fiemap->fm_start = start - start % CEPH_PAGE_SIZE;
331 fiemap->fm_length = len + start % CEPH_PAGE_SIZE;
332 fiemap->fm_flags = FIEMAP_FLAG_SYNC; /* flush extents to disk if needed */
334 #if defined(DARWIN) || defined(__FreeBSD__)
// First pass: the extent count read below comes from this call.
338 if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
// Bytes needed for the extent array reported by the first pass.
343 size = sizeof(struct fiemap_extent) * (fiemap->fm_mapped_extents);
// The realloc result goes into a separate pointer, so `fiemap` still refers
// to the original allocation if realloc fails.
345 _realloc_fiemap = (struct fiemap *)realloc(fiemap, sizeof(struct fiemap) + size);
346 if (!_realloc_fiemap) {
350 fiemap = _realloc_fiemap;
353 memset(fiemap->fm_extents, 0, size);
// Second pass: request as many extents as the first pass reported, and reset
// the mapped count before re-issuing the ioctl.
355 fiemap->fm_extent_count = fiemap->fm_mapped_extents;
356 fiemap->fm_mapped_extents = 0;
358 #if defined(DARWIN) || defined(__FreeBSD__)
362 if (ioctl(fd, FS_IOC_FIEMAP, fiemap) < 0) {
// Reads the SLOPPY_CRC_XATTR extended attribute of `fd` into a buffer and
// wraps it in a bufferlist iterator, presumably to decode it into *cm (the
// decode call is on a line omitted from this listing). A buffer::error is
// caught, and errors are logged via derr.
//   fd - file descriptor whose xattr is read
//   cm - CRC map to populate
// NOTE(review): branch conditions and return statements are not fully
// visible here.
377 int GenericFileStoreBackend::_crc_load_or_init(int fd, SloppyCRCMap *cm)
// First attempt: read into the fixed-size local buffer.
382 int l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, buf, sizeof(buf));
// The value fit: copy the l bytes that were read into a buffer of that size.
387 bp = buffer::create(l);
388 memcpy(bp.c_str(), buf, l);
// -ERANGE: the local buffer was too small. Ask again with a null buffer and
// size 0 (presumably returns the required size), allocate that many bytes,
// and re-read the attribute into the new buffer.
389 } else if (l == -ERANGE) {
390 l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, 0, 0);
392 bp = buffer::create(l);
393 l = chain_fgetxattr(fd, SLOPPY_CRC_XATTR, bp.c_str(), l);
397 bl.append(std::move(bp));
398 bufferlist::iterator p = bl.begin();
402 catch (buffer::error &e) {
406 derr << __func__ << " got " << cpp_strerror(r) << dendl;
// Writes the contents of `bl` to the SLOPPY_CRC_XATTR extended attribute of
// `fd` and logs the error code via derr. Presumably `bl` holds the encoded
// form of *cm and the log is on failure only; the lines that fill `bl` and
// test `r` are omitted from this listing.
410 int GenericFileStoreBackend::_crc_save(int fd, SloppyCRCMap *cm)
414 int r = chain_fsetxattr(fd, SLOPPY_CRC_XATTR, bl.c_str(), bl.length());
416 derr << __func__ << " got " << cpp_strerror(r) << dendl;
// Updates the CRC map stored on `fd` for a write of `len` bytes at `off`
// whose data is in `bl`: load the map, apply scm.write(), log the resulting
// text at debug level 30, and save the map back.
// NOTE(review): checks on `r` and the return statement are on lines omitted
// from this listing.
420 int GenericFileStoreBackend::_crc_update_write(int fd, loff_t off, size_t len, const bufferlist& bl)
// A fresh map with the configured block size, filled from the xattr.
422 SloppyCRCMap scm(get_crc_block_size());
423 int r = _crc_load_or_init(fd, &scm);
427 scm.write(off, len, bl, &ss);
428 dout(30) << __func__ << "\n" << ss.str() << dendl;
429 r = _crc_save(fd, &scm);
// Loads the CRC map stored on `fd` and saves it back; presumably the map is
// truncated to `off` in between (that call is on a line omitted from this
// listing -- confirm against the full file).
433 int GenericFileStoreBackend::_crc_update_truncate(int fd, loff_t off)
435 SloppyCRCMap scm(get_crc_block_size());
436 int r = _crc_load_or_init(fd, &scm);
440 r = _crc_save(fd, &scm);
// Loads the CRC map stored on `fd` and saves it back; presumably the range
// [off, off+len) is marked as zeroed in between (that call is on a line
// omitted from this listing -- confirm against the full file).
444 int GenericFileStoreBackend::_crc_update_zero(int fd, loff_t off, size_t len)
446 SloppyCRCMap scm(get_crc_block_size());
447 int r = _crc_load_or_init(fd, &scm);
451 r = _crc_save(fd, &scm);
// Updates the destination file's CRC map after a range clone. Loads the maps
// of both files, calls clone_range() on the destination map with the source
// map and the (srcoff, len, dstoff) range, logs the resulting text at debug
// level 30, and saves only the destination map.
// NOTE(review): checks on `r` and the return statement are on lines omitted
// from this listing.
455 int GenericFileStoreBackend::_crc_update_clone_range(int srcfd, int destfd,
456 loff_t srcoff, size_t len, loff_t dstoff)
458 SloppyCRCMap scm_src(get_crc_block_size());
459 SloppyCRCMap scm_dst(get_crc_block_size());
460 int r = _crc_load_or_init(srcfd, &scm_src);
463 r = _crc_load_or_init(destfd, &scm_dst);
467 scm_dst.clone_range(srcoff, len, dstoff, scm_src, &ss);
468 dout(30) << __func__ << "\n" << ss.str() << dendl;
// The source map is only read, so only the destination xattr is rewritten.
469 r = _crc_save(destfd, &scm_dst);
// Loads the CRC map stored on `fd` and returns the result of
// scm.read(off, len, bl, out) for the data in `bl`.
// NOTE(review): the rest of the parameter list (which declares `out`) and
// the check on `r` are on lines omitted from this listing.
473 int GenericFileStoreBackend::_crc_verify_read(int fd, loff_t off, size_t len, const bufferlist& bl,
476 SloppyCRCMap scm(get_crc_block_size());
477 int r = _crc_load_or_init(fd, &scm);
480 return scm.read(off, len, bl, out);