X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=src%2Fceph%2Fsrc%2Fos%2Fbluestore%2FPMEMDevice.cc;fp=src%2Fceph%2Fsrc%2Fos%2Fbluestore%2FPMEMDevice.cc;h=262eeb1c3bb0e49c958032b881fc515fd306ef58;hb=812ff6ca9fcd3e629e49d4328905f33eee8ca3f5;hp=0000000000000000000000000000000000000000;hpb=15280273faafb77777eab341909a3f495cf248d9;p=stor4nfv.git diff --git a/src/ceph/src/os/bluestore/PMEMDevice.cc b/src/ceph/src/os/bluestore/PMEMDevice.cc new file mode 100644 index 0000000..262eeb1 --- /dev/null +++ b/src/ceph/src/os/bluestore/PMEMDevice.cc @@ -0,0 +1,305 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2015 Intel + * + * Author: Jianpeng Ma + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include +#include +#include + +#include "PMEMDevice.h" +#include "include/types.h" +#include "include/compat.h" +#include "include/stringify.h" +#include "common/errno.h" +#include "common/debug.h" +#include "common/blkdev.h" + +#define dout_context cct +#define dout_subsys ceph_subsys_bdev +#undef dout_prefix +#define dout_prefix *_dout << "bdev-PMEM(" << path << ") " + +PMEMDevice::PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv) + : BlockDevice(cct), + fd(-1), addr(0), + size(0), block_size(0), + debug_lock("PMEMDevice::debug_lock"), + injecting_crash(0) +{ +} + +int PMEMDevice::_lock() +{ + struct flock l; + memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + int r = ::fcntl(fd, F_SETLK, &l); + if (r < 0) + return -errno; + return 0; +} + +int PMEMDevice::open(const string& p) +{ + path = p; + int r = 0; + dout(1) << __func__ << " path " << path << dendl; + + fd = ::open(path.c_str(), O_RDWR); + if (fd < 0) { + r = -errno; + derr << __func__ << " open got: " << cpp_strerror(r) << dendl; + return r; + } + + r = _lock(); + if (r < 0) { + derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r) + << dendl; + goto out_fail; + } + + struct stat st; + r = ::fstat(fd, &st); + if (r < 0) { + r = -errno; + derr << __func__ << " fstat got " << cpp_strerror(r) << dendl; + goto out_fail; + } + if (S_ISBLK(st.st_mode)) { + int64_t s; + r = get_block_device_size(fd, &s); + if (r < 0) { + goto out_fail; + } + size = s; + } else { + size = st.st_size; + } + + size_t map_len; + addr = (char *)pmem_map_file(path.c_str(), size, PMEM_FILE_EXCL, O_RDWR, &map_len, NULL); + if (addr == NULL) { + derr << __func__ << " pmem_map_file error" << dendl; + goto out_fail; + } + size = map_len; + + // Operate as though the block size is 4 KB. The backing file + // blksize doesn't strictly matter except that some file systems may + // require a read/modify/write if we write something smaller than + // it. + block_size = g_conf->bdev_block_size; + if (block_size != (unsigned)st.st_blksize) { + dout(1) << __func__ << " backing device/file reports st_blksize " + << st.st_blksize << ", using bdev_block_size " + << block_size << " anyway" << dendl; + } + + dout(1) << __func__ + << " size " << size + << " (" << pretty_si_t(size) << "B)" + << " block_size " << block_size + << " (" << pretty_si_t(block_size) << "B)" + << dendl; + return 0; + + out_fail: + VOID_TEMP_FAILURE_RETRY(::close(fd)); + fd = -1; + return r; +} + +void PMEMDevice::close() +{ + dout(1) << __func__ << dendl; + + assert(addr != NULL); + pmem_unmap(addr, size); + assert(fd >= 0); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + fd = -1; + + path.clear(); +} + +static string get_dev_property(const char *dev, const char *property) +{ + char val[1024] = {0}; + get_block_device_string_property(dev, property, val, sizeof(val)); + return val; +} + +int PMEMDevice::collect_metadata(string prefix, map *pm) const +{ + (*pm)[prefix + "rotational"] = stringify((int)(bool)rotational); + (*pm)[prefix + "size"] = stringify(get_size()); + (*pm)[prefix + "block_size"] = stringify(get_block_size()); + (*pm)[prefix + "driver"] = "PMEMDevice"; + (*pm)[prefix + "type"] = "ssd"; + + struct stat st; + int r = ::fstat(fd, &st); + if (r < 0) + return -errno; + if (S_ISBLK(st.st_mode)) { + (*pm)[prefix + "access_mode"] = "blk"; + char partition_path[PATH_MAX]; + char dev_node[PATH_MAX]; + int rc = get_device_by_fd(fd, partition_path, dev_node, PATH_MAX); + switch (rc) { + case -EOPNOTSUPP: + case -EINVAL: + (*pm)[prefix + "partition_path"] = "unknown"; + (*pm)[prefix + "dev_node"] = "unknown"; + break; + case -ENODEV: + (*pm)[prefix + "partition_path"] = string(partition_path); + (*pm)[prefix + "dev_node"] = "unknown"; + break; + default: + { + (*pm)[prefix + "partition_path"] = string(partition_path); + (*pm)[prefix + "dev_node"] = string(dev_node); + (*pm)[prefix + "model"] = get_dev_property(dev_node, "device/model"); + (*pm)[prefix + "dev"] = get_dev_property(dev_node, "dev"); + + // nvme exposes a serial number + string serial = get_dev_property(dev_node, "device/serial"); + if (serial.length()) { + (*pm)[prefix + "serial"] = serial; + } + + // nvme has a device/device/* structure; infer from that. there + // is probably a better way? + string nvme_vendor = get_dev_property(dev_node, "device/device/vendor"); + if (nvme_vendor.length()) { + (*pm)[prefix + "type"] = "nvme"; + } + } + } + } else { + (*pm)[prefix + "access_mode"] = "file"; + (*pm)[prefix + "path"] = path; + } + return 0; +} + +int PMEMDevice::flush() +{ + //Because all write is persist. So no need + return 0; +} + + +void PMEMDevice::aio_submit(IOContext *ioc) +{ + return; +} + +int PMEMDevice::write(uint64_t off, bufferlist& bl, bool buffered) +{ + uint64_t len = bl.length(); + dout(20) << __func__ << " " << off << "~" << len << dendl; + assert(len > 0); + assert(off < size); + assert(off + len <= size); + + dout(40) << "data: "; + bl.hexdump(*_dout); + *_dout << dendl; + + if (g_conf->bdev_inject_crash && + rand() % g_conf->bdev_inject_crash == 0) { + derr << __func__ << " bdev_inject_crash: dropping io " << off << "~" << len + << dendl; + ++injecting_crash; + return 0; + } + + bufferlist::iterator p = bl.begin(); + uint32_t off1 = off; + while (len) { + const char *data; + uint32_t l = p.get_ptr_and_advance(len, &data); + pmem_memcpy_persist(addr + off1, data, l); + len -= l; + off1 += l; + } + + return 0; +} + +int PMEMDevice::aio_write( + uint64_t off, + bufferlist &bl, + IOContext *ioc, + bool buffered) +{ + return write(off, bl, buffered); +} + + +int PMEMDevice::read(uint64_t off, uint64_t len, bufferlist *pbl, + IOContext *ioc, + bool buffered) +{ + dout(5) << __func__ << " " << off << "~" << len << dendl; + assert(len > 0); + assert(off < size); + assert(off + len <= size); + + bufferptr p = buffer::create_page_aligned(len); + memcpy(p.c_str(), addr + off, len); + + pbl->clear(); + pbl->push_back(std::move(p)); + + dout(40) << "data: "; + pbl->hexdump(*_dout); + *_dout << dendl; + + return 0; +} + +int PMEMDevice::aio_read(uint64_t off, uint64_t len, bufferlist *pbl, + IOContext *ioc) +{ + return read(off, len, pbl, ioc, false); +} + +int PMEMDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered) +{ + assert(len > 0); + assert(off < size); + assert(off + len <= size); + + memcpy(buf, addr + off, len); + return 0; +} + + +int PMEMDevice::invalidate_cache(uint64_t off, uint64_t len) +{ + dout(5) << __func__ << " " << off << "~" << len << dendl; + return 0; +} + +