--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2015 XSky <haomai@xsky.com>
+ *
+ * Author: Haomai Wang <haomaiwang@gmail.com>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_OS_BLUESTORE_NVMEDEVICE
+#define CEPH_OS_BLUESTORE_NVMEDEVICE
+
+#include <queue>
+#include <map>
+#include <limits>
+
+// since _Static_assert introduced in c11
+#define _Static_assert static_assert
+
+
+#include "include/interval_set.h"
+#include "common/ceph_time.h"
+#include "common/Mutex.h"
+#include "common/Cond.h"
+#include "BlockDevice.h"
+
+enum class IOCommand {
+ READ_COMMAND,
+ WRITE_COMMAND,
+ FLUSH_COMMAND
+};
+
+class Task;
+class PerfCounters;
+class SharedDriverData;
+
+class NVMEDevice : public BlockDevice {
+ /**
+ * points to pinned, physically contiguous memory region;
+ * contains 4KB IDENTIFY structure for controller which is
+ * target for CONTROLLER IDENTIFY command during initialization
+ */
+ SharedDriverData *driver;
+ string name;
+
+ uint64_t size;
+ uint64_t block_size;
+
+ bool aio_stop;
+
+ struct BufferedExtents {
+ struct Extent {
+ uint64_t x_len;
+ uint64_t x_off;
+ const char *data;
+ uint64_t data_len;
+ };
+ using Offset = uint64_t;
+ map<Offset, Extent> buffered_extents;
+ uint64_t left_edge = std::numeric_limits<uint64_t>::max();
+ uint64_t right_edge = 0;
+
+ void verify() {
+ interval_set<uint64_t> m;
+ for (auto && it : buffered_extents) {
+ assert(!m.intersects(it.first, it.second.x_len));
+ m.insert(it.first, it.second.x_len);
+ }
+ }
+
+ void insert(uint64_t off, uint64_t len, const char *data) {
+ auto it = buffered_extents.lower_bound(off);
+ if (it != buffered_extents.begin()) {
+ --it;
+ if (it->first + it->second.x_len <= off)
+ ++it;
+ }
+ uint64_t end = off + len;
+ if (off < left_edge)
+ left_edge = off;
+ if (end > right_edge)
+ right_edge = end;
+ while (it != buffered_extents.end()) {
+ if (it->first >= end)
+ break;
+ uint64_t extent_it_end = it->first + it->second.x_len;
+ assert(extent_it_end >= off);
+ if (it->first <= off) {
+ if (extent_it_end > end) {
+ // <- data ->
+ // <- it ->
+ it->second.x_len -= (extent_it_end - off);
+ buffered_extents[end] = Extent{
+ extent_it_end - end, it->second.x_off + it->second.x_len + len, it->second.data, it->second.data_len};
+ } else {
+ // <- data ->
+ // <- it ->
+ assert(extent_it_end <= end);
+ it->second.x_len -= (extent_it_end - off);
+ }
+ ++it;
+ } else {
+ assert(it->first > off);
+ if (extent_it_end > end) {
+ // <- data ->
+ // <- it ->
+ uint64_t overlap = end - it->first;
+ buffered_extents[end] = Extent{
+ it->second.x_len - overlap, it->second.x_off + overlap, it->second.data, it->second.data_len};
+ } else {
+ // <- data ->
+ // <- it ->
+ }
+ buffered_extents.erase(it++);
+ }
+ }
+ buffered_extents[off] = Extent{
+ len, 0, data, len};
+
+ if (0)
+ verify();
+ }
+
+ void memcpy_check(char *dst, uint64_t dst_raw_len, uint64_t dst_off,
+ map<Offset, Extent>::iterator &it, uint64_t src_off, uint64_t copylen) {
+ if (0) {
+ assert(dst_off + copylen <= dst_raw_len);
+ assert(it->second.x_off + src_off + copylen <= it->second.data_len);
+ }
+ memcpy(dst + dst_off, it->second.data + it->second.x_off + src_off, copylen);
+ }
+
+ uint64_t read_overlap(uint64_t off, uint64_t len, char *buf) {
+ uint64_t end = off + len;
+ if (end <= left_edge || off >= right_edge)
+ return 0;
+
+ uint64_t copied = 0;
+ auto it = buffered_extents.lower_bound(off);
+ if (it != buffered_extents.begin()) {
+ --it;
+ if (it->first + it->second.x_len <= off)
+ ++it;
+ }
+ uint64_t copy_len;
+ while (it != buffered_extents.end()) {
+ if (it->first >= end)
+ break;
+ uint64_t extent_it_end = it->first + it->second.x_len;
+ assert(extent_it_end >= off);
+ if (it->first >= off) {
+ if (extent_it_end > end) {
+ // <- data ->
+ // <- it ->
+ copy_len = len - (it->first - off);
+ memcpy_check(buf, len, it->first - off, it, 0, copy_len);
+ } else {
+ // <- data ->
+ // <- it ->
+ copy_len = it->second.x_len;
+ memcpy_check(buf, len, it->first - off, it, 0, copy_len);
+ }
+ } else {
+ if (extent_it_end > end) {
+ // <- data ->
+ // <- it ->
+ copy_len = len;
+ memcpy_check(buf, len, 0, it, off - it->first, copy_len);
+ } else {
+ // <- data ->
+ // <- it ->
+ assert(extent_it_end <= end);
+ copy_len = it->first + it->second.x_len - off;
+ memcpy_check(buf, len, 0, it, off - it->first, copy_len);
+ }
+ }
+ copied += copy_len;
+ ++it;
+ }
+ return copied;
+ }
+
+ void clear() {
+ buffered_extents.clear();
+ left_edge = std::numeric_limits<uint64_t>::max();
+ right_edge = 0;
+ }
+ };
+ Mutex buffer_lock;
+ BufferedExtents buffered_extents;
+ Task *buffered_task_head = nullptr;
+
+ static void init();
+ public:
+ SharedDriverData *get_driver() { return driver; }
+
+ public:
+ aio_callback_t aio_callback;
+ void *aio_callback_priv;
+
+ NVMEDevice(CephContext* cct, aio_callback_t cb, void *cbpriv);
+
+ bool supported_bdev_label() override { return false; }
+
+ void aio_submit(IOContext *ioc) override;
+
+ uint64_t get_size() const override {
+ return size;
+ }
+ uint64_t get_block_size() const override {
+ return block_size;
+ }
+
+ int read(uint64_t off, uint64_t len, bufferlist *pbl,
+ IOContext *ioc,
+ bool buffered) override;
+ int aio_read(
+ uint64_t off,
+ uint64_t len,
+ bufferlist *pbl,
+ IOContext *ioc) override;
+ int aio_write(uint64_t off, bufferlist& bl,
+ IOContext *ioc,
+ bool buffered) override;
+ int write(uint64_t off, bufferlist& bl, bool buffered) override;
+ int flush() override;
+ int read_random(uint64_t off, uint64_t len, char *buf, bool buffered) override;
+
+ // for managing buffered readers/writers
+ int invalidate_cache(uint64_t off, uint64_t len) override;
+ int open(const string& path) override;
+ void close() override;
+ int collect_metadata(string prefix, map<string,string> *pm) const override;
+};
+
+#endif