--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 Intel <jianpeng.ma@intel.com>
+ *
+ * Author: Jianpeng Ma <jianpeng.ma@intel.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <libpmem.h>
+
+#include "PMEMDevice.h"
+#include "include/types.h"
+#include "include/compat.h"
+#include "include/stringify.h"
+#include "common/errno.h"
+#include "common/debug.h"
+#include "common/blkdev.h"
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bdev
+#undef dout_prefix
+#define dout_prefix *_dout << "bdev-PMEM(" << path << ") "
+
+// Construct a device in the closed state: no file descriptor, no mapping,
+// and zero size / block size.  The cb and cbpriv arguments are not stored
+// or used by this constructor.
+PMEMDevice::PMEMDevice(CephContext *cct, aio_callback_t cb, void *cbpriv)
+  : BlockDevice(cct),
+    fd(-1),
+    addr(NULL),
+    size(0),
+    block_size(0),
+    debug_lock("PMEMDevice::debug_lock"),
+    injecting_crash(0)
+{
+}
+
+// Try to take a write lock on fd with a non-blocking fcntl(F_SETLK).
+// Returns 0 on success or -errno if the lock could not be acquired.
+int PMEMDevice::_lock()
+{
+  struct flock fl;
+  memset(&fl, 0, sizeof(fl));
+  fl.l_type = F_WRLCK;
+  fl.l_whence = SEEK_SET;
+  // start 0 / length 0: per fcntl(2) a zero length extends to end of file
+  fl.l_start = 0;
+  fl.l_len = 0;
+
+  if (::fcntl(fd, F_SETLK, &fl) < 0) {
+    return -errno;
+  }
+  return 0;
+}
+
+// Open the file or block device at path p, lock it, and map it with
+// libpmem.
+//
+// On success fd, addr, size and block_size are populated and 0 is
+// returned.  On any failure after the descriptor has been opened, the
+// descriptor is closed, fd is reset to -1, and a negative errno value is
+// returned.
+int PMEMDevice::open(const string& p)
+{
+  path = p;
+  int r = 0;
+  dout(1) << __func__ << " path " << path << dendl;
+
+  fd = ::open(path.c_str(), O_RDWR);
+  if (fd < 0) {
+    r = -errno;
+    derr << __func__ << " open got: " << cpp_strerror(r) << dendl;
+    return r;
+  }
+
+  r = _lock();
+  if (r < 0) {
+    derr << __func__ << " failed to lock " << path << ": " << cpp_strerror(r)
+	 << dendl;
+    goto out_fail;
+  }
+
+  // NOTE: locals declared between here and out_fail must stay
+  // uninitialized at their declaration; the gotos above jump past them.
+  struct stat st;
+  r = ::fstat(fd, &st);
+  if (r < 0) {
+    r = -errno;
+    derr << __func__ << " fstat got " << cpp_strerror(r) << dendl;
+    goto out_fail;
+  }
+  if (S_ISBLK(st.st_mode)) {
+    int64_t s;
+    r = get_block_device_size(fd, &s);
+    if (r < 0) {
+      derr << __func__ << " failed to get size of " << path << ": "
+	   << cpp_strerror(r) << dendl;
+      goto out_fail;
+    }
+    size = s;
+  } else {
+    size = st.st_size;
+  }
+
+  size_t map_len;
+  // Map the whole existing file.  The libpmem man page states that len
+  // must be 0 when PMEM_FILE_CREATE is not set; the mapped length is
+  // reported back through map_len.
+  addr = (char *)pmem_map_file(path.c_str(), 0, PMEM_FILE_EXCL, O_RDWR, &map_len, NULL);
+  if (addr == NULL) {
+    // Capture errno before logging can clobber it.  Without this, r would
+    // still hold 0 from the preceding successful call and open() would
+    // report success with no mapping.
+    r = errno ? -errno : -EIO;
+    derr << __func__ << " pmem_map_file error: " << cpp_strerror(r) << dendl;
+    goto out_fail;
+  }
+  size = map_len;
+
+  // Use the configured bdev_block_size regardless of what the backing
+  // file reports.  The backing st_blksize doesn't strictly matter except
+  // that some file systems may require a read/modify/write if we write
+  // something smaller than it.
+  block_size = g_conf->bdev_block_size;
+  if (block_size != (unsigned)st.st_blksize) {
+    dout(1) << __func__ << " backing device/file reports st_blksize "
+	    << st.st_blksize << ", using bdev_block_size "
+	    << block_size << " anyway" << dendl;
+  }
+
+  dout(1) << __func__
+	  << " size " << size
+	  << " (" << pretty_si_t(size) << "B)"
+	  << " block_size " << block_size
+	  << " (" << pretty_si_t(block_size) << "B)"
+	  << dendl;
+  return 0;
+
+ out_fail:
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  fd = -1;
+  return r;
+}
+
+// Tear down an open device: unmap the pmem region, close the descriptor
+// and forget the path.  Asserts that the device is currently mapped and
+// open.
+void PMEMDevice::close()
+{
+  dout(1) << __func__ << dendl;
+
+  assert(addr != NULL);
+  pmem_unmap(addr, size);
+  // Do not leave a dangling pointer to the unmapped region behind.
+  addr = NULL;
+
+  assert(fd >= 0);
+  VOID_TEMP_FAILURE_RETRY(::close(fd));
+  fd = -1;
+
+  path.clear();
+}
+
+// Fetch a string property of a block device into a std::string.
+// The result of the lookup call is not checked; the buffer starts out
+// zero-filled, so whatever the helper left in it (possibly nothing) is
+// returned.
+static string get_dev_property(const char *dev, const char *property)
+{
+  char buf[1024];
+  memset(buf, 0, sizeof(buf));
+  get_block_device_string_property(dev, property, buf, sizeof(buf));
+  return string(buf);
+}
+
+// Fill *pm with descriptive key/value pairs for this device, each key
+// prefixed with `prefix`.
+//
+// Always reports rotational, size, block_size, driver and type.  For a
+// regular file it adds access_mode "file" and the path.  For a block
+// device it adds access_mode "blk" plus whatever partition / device node
+// details get_device_by_fd() could resolve.
+//
+// Returns 0 on success, or -errno if fstat() on the descriptor fails.
+int PMEMDevice::collect_metadata(string prefix, map<string,string> *pm) const
+{
+  map<string,string>& meta = *pm;
+
+  meta[prefix + "rotational"] = stringify((int)(bool)rotational);
+  meta[prefix + "size"] = stringify(get_size());
+  meta[prefix + "block_size"] = stringify(get_block_size());
+  meta[prefix + "driver"] = "PMEMDevice";
+  meta[prefix + "type"] = "ssd";
+
+  struct stat st;
+  if (::fstat(fd, &st) < 0)
+    return -errno;
+
+  if (!S_ISBLK(st.st_mode)) {
+    meta[prefix + "access_mode"] = "file";
+    meta[prefix + "path"] = path;
+    return 0;
+  }
+
+  meta[prefix + "access_mode"] = "blk";
+
+  char partition_path[PATH_MAX];
+  char dev_node[PATH_MAX];
+  const int rc = get_device_by_fd(fd, partition_path, dev_node, PATH_MAX);
+
+  if (rc == -EOPNOTSUPP || rc == -EINVAL) {
+    // nothing could be resolved
+    meta[prefix + "partition_path"] = "unknown";
+    meta[prefix + "dev_node"] = "unknown";
+  } else if (rc == -ENODEV) {
+    // partition is known, but not the device node
+    meta[prefix + "partition_path"] = string(partition_path);
+    meta[prefix + "dev_node"] = "unknown";
+  } else {
+    // every other return value is treated as fully resolved
+    meta[prefix + "partition_path"] = string(partition_path);
+    meta[prefix + "dev_node"] = string(dev_node);
+    meta[prefix + "model"] = get_dev_property(dev_node, "device/model");
+    meta[prefix + "dev"] = get_dev_property(dev_node, "dev");
+
+    // nvme exposes a serial number
+    const string serial = get_dev_property(dev_node, "device/serial");
+    if (!serial.empty()) {
+      meta[prefix + "serial"] = serial;
+    }
+
+    // nvme has a device/device/* structure; infer from that.  there
+    // is probably a better way?
+    const string nvme_vendor = get_dev_property(dev_node, "device/device/vendor");
+    if (!nvme_vendor.empty()) {
+      meta[prefix + "type"] = "nvme";
+    }
+  }
+  return 0;
+}
+
+// No-op: write() stores data with pmem_memcpy_persist(), so there is
+// nothing left to flush.  Always returns 0.
+int PMEMDevice::flush()
+{
+  return 0;
+}
+
+
+// No-op: aio_write() and aio_read() in this file complete their work
+// before returning, so there is nothing queued to submit.
+void PMEMDevice::aio_submit(IOContext *ioc)
+{
+}
+
+// Copy the contents of bl into the mapped region starting at byte offset
+// off, persisting each chunk with pmem_memcpy_persist().
+//
+// Asserts that the write is non-empty and lies entirely within the
+// device.  The buffered flag is ignored.  When bdev_inject_crash is set,
+// a write may be randomly dropped: it is logged, injecting_crash is
+// incremented, and 0 is returned without touching the device.
+//
+// Always returns 0.
+int PMEMDevice::write(uint64_t off, bufferlist& bl, bool buffered)
+{
+  uint64_t len = bl.length();
+  dout(20) << __func__ << " " << off << "~" << len << dendl;
+  assert(len > 0);
+  assert(off < size);
+  assert(off + len <= size);
+
+  dout(40) << "data: ";
+  bl.hexdump(*_dout);
+  *_dout << dendl;
+
+  if (g_conf->bdev_inject_crash &&
+      rand() % g_conf->bdev_inject_crash == 0) {
+    derr << __func__ << " bdev_inject_crash: dropping io " << off << "~" << len
+	 << dendl;
+    ++injecting_crash;
+    return 0;
+  }
+
+  bufferlist::iterator p = bl.begin();
+  // Keep the destination cursor 64-bit: a 32-bit cursor would wrap for
+  // offsets at or beyond 4 GiB and write to the wrong address.
+  uint64_t off1 = off;
+  while (len) {
+    const char *data;
+    // Fetch the next contiguous chunk of the bufferlist (at most len bytes).
+    size_t l = p.get_ptr_and_advance(len, &data);
+    pmem_memcpy_persist(addr + off1, data, l);
+    len -= l;
+    off1 += l;
+  }
+
+  return 0;
+}
+
+// "Asynchronous" write that simply performs the synchronous write()
+// immediately and returns its result; ioc is not used.
+int PMEMDevice::aio_write(
+  uint64_t off,
+  bufferlist &bl,
+  IOContext *ioc,
+  bool buffered)
+{
+  const int r = write(off, bl, buffered);
+  return r;
+}
+
+
+// Read len bytes starting at byte offset off from the mapped region into
+// *pbl.  Any previous contents of *pbl are discarded and replaced by a
+// single page-aligned buffer holding the data.
+//
+// Asserts that the read is non-empty and lies entirely within the
+// device.  ioc and buffered are ignored.  Always returns 0.
+int PMEMDevice::read(uint64_t off, uint64_t len, bufferlist *pbl,
+		      IOContext *ioc,
+		      bool buffered)
+{
+  dout(5) << __func__ << " " << off << "~" << len << dendl;
+  assert(len > 0);
+  assert(off < size);
+  assert(off + len <= size);
+
+  // copy out of the mapping into a fresh page-aligned buffer
+  bufferptr dst = buffer::create_page_aligned(len);
+  const char *src = addr + off;
+  memcpy(dst.c_str(), src, len);
+
+  pbl->clear();
+  pbl->push_back(std::move(dst));
+
+  dout(40) << "data: ";
+  pbl->hexdump(*_dout);
+  *_dout << dendl;
+
+  return 0;
+}
+
+// "Asynchronous" read that simply performs the synchronous read()
+// immediately (with buffered = false) and returns its result.
+int PMEMDevice::aio_read(uint64_t off, uint64_t len, bufferlist *pbl,
+			  IOContext *ioc)
+{
+  const bool buffered = false;
+  return read(off, len, pbl, ioc, buffered);
+}
+
+// Copy len bytes starting at byte offset off from the mapped region
+// directly into the caller-supplied buffer buf.
+//
+// Asserts that the read is non-empty and lies entirely within the
+// device.  The buffered flag is ignored.  Always returns 0.
+int PMEMDevice::read_random(uint64_t off, uint64_t len, char *buf, bool buffered)
+{
+  assert(len > 0);
+  assert(off < size);
+  assert(off + len <= size);
+
+  const char *src = addr + off;
+  memcpy(buf, src, len);
+  return 0;
+}
+
+
+// Log the requested range and report success; nothing else is done with
+// off and len here.  Always returns 0.
+int PMEMDevice::invalidate_cache(uint64_t off, uint64_t len)
+{
+  dout(5) << __func__ << " " << off << "~" << len << dendl;
+
+  return 0;
+}
+
+