1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
5 * rbd-nbd - RBD in userspace
7 * Copyright (C) 2015 - 2016 Kylin Corporation
9 * Author: Yunchuan Wen <yunchuan.wen@kylin-cloud.com>
10 * Li Wang <li.wang@kylin-cloud.com>
12 * This is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU Lesser General Public
14 * License version 2.1, as published by the Free Software
15 * Foundation. See file COPYING.
19 #include "include/int_types.h"
27 #include <sys/types.h>
30 #include <linux/nbd.h>
32 #include <sys/ioctl.h>
33 #include <sys/socket.h>
37 #include <boost/regex.hpp>
39 #include "mon/MonClient.h"
40 #include "common/config.h"
41 #include "common/dout.h"
43 #include "common/errno.h"
44 #include "common/module.h"
45 #include "common/safe_io.h"
46 #include "common/TextTable.h"
47 #include "common/ceph_argparse.h"
48 #include "common/Preforker.h"
49 #include "common/version.h"
50 #include "global/global_init.h"
51 #include "global/signal_handler.h"
53 #include "include/rados/librados.hpp"
54 #include "include/rbd/librbd.hpp"
55 #include "include/stringify.h"
56 #include "include/xlist.h"
58 #define dout_context g_ceph_context
59 #define dout_subsys ceph_subsys_rbd
61 #define dout_prefix *_dout << "rbd-nbd: "
// Set by --exclusive; do_map() then acquires the RBD exclusive lock on the image.
67 bool exclusive = false;
// Set by --read-only; do_map() then adds NBD_FLAG_READ_ONLY to the device flags.
68 bool readonly = false;
// Set once a --max_part value has been parsed and range-checked.
69 bool set_max_part = false;
// Print the command synopsis and the rbd-nbd specific options to stdout,
// then let generic_server_usage() append the options shared by Ceph servers.
79 std::cout << "Usage: rbd-nbd [options] map <image-or-snap-spec> Map an image to nbd device\n"
80 << " unmap <device path> Unmap nbd device\n"
81 << " list-mapped List mapped nbd devices\n"
83 << " --device <device path> Specify nbd device path\n"
84 << " --read-only Map read-only\n"
85 << " --nbds_max <limit> Override for module param nbds_max\n"
86 << " --max_part <limit> Override for module param max_part\n"
87 << " --exclusive Forbid writes by other clients\n"
89 generic_server_usage();
// Block size handed to NBD_SET_BLKSIZE; also the factor check_device_size()
// applies to the value it reads from sysfs.
101 #define RBD_NBD_BLKSIZE 512UL
// parse_args() result for which rbd_nbd() prints the version string.
104 #define VERSION_INFO 2
// 64-bit network/host byte-order conversion: identity on big-endian builds,
// swab() on little-endian builds; any other configuration fails to compile.
106 #ifdef CEPH_BIG_ENDIAN
107 #define ntohll(a) (a)
108 #elif defined(CEPH_LITTLE_ENDIAN)
109 #define ntohll(a) swab(a)
111 #error "Could not determine endianess"
// The same operation converts in both directions.
113 #define htonll(a) ntohll(a)
// Declared ahead of its definition because get_mapped_info() calls it first.
115 static int parse_args(vector<const char*>& args, std::ostream *err_msg, Config *cfg);
// Handler that do_map() registers for SIGINT and SIGTERM: logs the signal
// and asks the kernel to disconnect the nbd device with NBD_DISCONNECT.
// It operates on `nbd`, a descriptor that is not a parameter of this function.
117 static void handle_signal(int signum)
// Only the two signals this handler is registered for are expected here.
119 assert(signum == SIGINT || signum == SIGTERM);
120 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
121 dout(20) << __func__ << ": " << "sending NBD_DISCONNECT" << dendl;
// A failed disconnect is only logged; nothing is retried.
122 if (ioctl(nbd, NBD_DISCONNECT) < 0) {
123 derr << "rbd-nbd: disconnect failed: " << cpp_strerror(errno) << dendl;
125 dout(20) << __func__ << ": " << "disconnected" << dendl;
133 librbd::Image &image;
// Build a server for descriptor _fd that serves the RBD image _image.
136 NBDServer(int _fd, librbd::Image& _image)
139 , lock("NBDServer::Locker")
// Each helper thread runs one member function of this server instance.
140 , reader_thread(*this, &NBDServer::reader_entry)
141 , writer_thread(*this, &NBDServer::writer_entry)
// Termination flag polled by the reader and writer loops.
146 std::atomic<bool> terminated = { false };
// The compare-exchange flips the flag from false to true exactly once, so
// only the first caller shuts the socket down in both directions.
150 bool expected = false;
151 if (terminated.compare_exchange_strong(expected, true)) {
152 ::shutdown(fd, SHUT_RDWR);
// The server lock is taken after the socket shutdown.
154 Mutex::Locker l(lock);
// List hook used to queue this context on io_pending / io_finished.
161 xlist<IOContext*>::item item;
// NBD request header as read from the socket, and the reply header that is
// written back for it.
163 struct nbd_request request;
164 struct nbd_reply reply;
// Stream formatter used by the log statements that print *ctx.
173 friend std::ostream &operator<<(std::ostream &os, const IOContext &ctx);
// Contexts submitted via io_start() that have not completed yet.
177 xlist<IOContext*> io_pending;
// Contexts moved here by io_finish(), waiting to be picked up by wait_io_finish().
178 xlist<IOContext*> io_finished;
// Record ctx as in flight by appending it to io_pending under the server lock.
180 void io_start(IOContext *ctx)
182 Mutex::Locker l(lock);
183 io_pending.push_back(&ctx->item);
// Move ctx from the list it is currently on to the tail of io_finished,
// under the server lock.
186 void io_finish(IOContext *ctx)
188 Mutex::Locker l(lock);
// ctx must already be queued (io_start() puts it on io_pending).
189 assert(ctx->item.is_on_list());
190 ctx->item.remove_myself();
191 io_finished.push_back(&ctx->item);
// Take the context at the front of io_finished. Loops while the list is
// empty and the server is not terminated, so an empty list after the loop
// means termination was requested.
195 IOContext *wait_io_finish()
197 Mutex::Locker l(lock);
198 while(io_finished.empty() && !terminated)
201 if (io_finished.empty())
// io_finish() appends at the tail, so the front is the earliest finished IO.
204 IOContext *ret = io_finished.front();
205 io_finished.pop_front();
// Drain step: requires that the reader thread is not running, loops until
// io_pending is empty, then destroys every context left on io_finished.
212 assert(!reader_thread.is_started());
213 Mutex::Locker l(lock);
214 while(!io_pending.empty())
217 while(!io_finished.empty()) {
// The unique_ptr takes ownership so the context is freed when it goes out of scope.
218 ceph::unique_ptr<IOContext> free_ctx(io_finished.front());
219 io_finished.pop_front();
// librbd completion callback: converts the completion's return value into
// the NBD reply error field, hands the context to the server as finished,
// and releases the completion.
//   cb  - the completion, passed as an opaque handle
//   arg - the IOContext given when the completion was created
223 static void aio_callback(librbd::completion_t cb, void *arg)
225 librbd::RBD::AioCompletion *aio_completion =
226 reinterpret_cast<librbd::RBD::AioCompletion*>(cb);
228 IOContext *ctx = reinterpret_cast<IOContext *>(arg);
229 int ret = aio_completion->get_return_value();
231 dout(20) << __func__ << ": " << *ctx << dendl;
233 if (ret == -EINVAL) {
234 // if shrinking an image, a pagecache writeback might reference
235 // extents outside of the range of the new image extents
236 dout(0) << __func__ << ": masking IO out-of-bounds error" << dendl;
// The error is stored as a positive value in network byte order.
242 ctx->reply.error = htonl(-ret);
// Short read: zero-fill the buffer up to the requested length and report success.
243 } else if ((ctx->command == NBD_CMD_READ) &&
244 ret < static_cast<int>(ctx->request.len)) {
245 int pad_byte_count = static_cast<int> (ctx->request.len) - ret;
246 ctx->data.append_zero(pad_byte_count);
247 dout(20) << __func__ << ": " << *ctx << ": Pad byte count: "
248 << pad_byte_count << dendl;
249 ctx->reply.error = 0;
251 ctx->reply.error = htonl(0);
// Queue the reply for the writer side before dropping the completion.
253 ctx->server->io_finish(ctx);
255 aio_completion->release();
// Request loop: until terminated, read one NBD request header from fd,
// validate and byte-swap it, pre-fill the matching reply header, read any
// payload, and submit the operation to the image as an async IO.
260 while (!terminated) {
// A fresh context per request; ownership is released further down once the
// request is handed to librbd.
261 ceph::unique_ptr<IOContext> ctx(new IOContext());
264 dout(20) << __func__ << ": waiting for nbd request" << dendl;
266 int r = safe_read_exact(fd, &ctx->request, sizeof(struct nbd_request));
268 derr << "failed to read nbd request header: " << cpp_strerror(r)
// The magic is compared in network byte order, before any field is converted.
273 if (ctx->request.magic != htonl(NBD_REQUEST_MAGIC)) {
274 derr << "invalid nbd request header" << dendl;
// Convert offset, type and length to host byte order.
278 ctx->request.from = ntohll(ctx->request.from);
279 ctx->request.type = ntohl(ctx->request.type);
280 ctx->request.len = ntohl(ctx->request.len);
// The reply echoes the request handle.
282 ctx->reply.magic = htonl(NBD_REPLY_MAGIC);
283 memcpy(ctx->reply.handle, ctx->request.handle, sizeof(ctx->reply.handle));
// Only the low 16 bits of the type field are used as the command.
285 ctx->command = ctx->request.type & 0x0000ffff;
287 dout(20) << *ctx << ": start" << dendl;
289 switch (ctx->command)
292 // NBD_DO_IT will return when pipe is closed
293 dout(0) << "disconnect request received" << dendl;
// Read request.len payload bytes from the socket into the context's buffer.
296 bufferptr ptr(ctx->request.len);
297 r = safe_read_exact(fd, ptr.c_str(), ctx->request.len);
299 derr << *ctx << ": failed to read nbd request data: "
300 << cpp_strerror(r) << dendl;
303 ctx->data.push_back(ptr);
// From here the raw pointer is passed to the completion and reaches
// aio_callback() as its arg.
307 IOContext *pctx = ctx.release();
309 librbd::RBD::AioCompletion *c = new librbd::RBD::AioCompletion(pctx, aio_callback);
310 switch (pctx->command)
313 image.aio_write(pctx->request.from, pctx->request.len, pctx->data, c);
316 image.aio_read(pctx->request.from, pctx->request.len, pctx->data, c);
322 image.aio_discard(pctx->request.from, pctx->request.len, c);
325 derr << *pctx << ": invalid request command" << dendl;
330 dout(20) << __func__ << ": terminated" << dendl;
// Reply loop: until terminated, take each finished context, write its reply
// header to fd and, for a successful read, the data that follows it.
335 while (!terminated) {
336 dout(20) << __func__ << ": waiting for io request" << dendl;
// The unique_ptr owns the context, so it is freed at the end of the iteration.
337 ceph::unique_ptr<IOContext> ctx(wait_io_finish());
339 dout(20) << __func__ << ": no io requests, terminating" << dendl;
343 dout(20) << __func__ << ": got: " << *ctx << dendl;
345 int r = safe_write(fd, &ctx->reply, sizeof(struct nbd_reply));
347 derr << *ctx << ": failed to write reply header: " << cpp_strerror(r)
// Payload is sent only for reads that completed without error.
351 if (ctx->command == NBD_CMD_READ && ctx->reply.error == htonl(0)) {
352 r = ctx->data.write_fd(fd);
// NOTE(review): "replay" in the message below looks like a typo for "reply".
354 derr << *ctx << ": failed to write replay data: " << cpp_strerror(r)
359 dout(20) << *ctx << ": finish" << dendl;
361 dout(20) << __func__ << ": terminated" << dendl;
// Thread subclass bound to an NBDServer and one of its member functions;
// two instances exist, reader_thread and writer_thread.
364 class ThreadHelper : public Thread
// Pointer to a no-argument NBDServer member function.
367 typedef void (NBDServer::*entry_func)();
372 ThreadHelper(NBDServer &_server, entry_func _func)
// Thread entry point override.
377 void* entry() override
383 } reader_thread, writer_thread;
// Start-up: log, then create the reader thread followed by the writer thread.
390 dout(10) << __func__ << ": starting" << dendl;
394 reader_thread.create("rbd_reader");
395 writer_thread.create("rbd_writer");
// Tear-down: log, then join the reader thread followed by the writer thread.
402 dout(10) << __func__ << ": terminating" << dendl;
406 reader_thread.join();
407 writer_thread.join();
// Log formatter for an IOContext: "[<handle> <command> <from>~<len> <error>]".
421 std::ostream &operator<<(std::ostream &os, const NBDServer::IOContext &ctx) {
// The request handle is reinterpreted as one 64-bit value and printed in hex.
// NOTE(review): the cast type-puns the handle bytes through uint64_t*;
// copying into a uint64_t with memcpy would avoid alignment/aliasing concerns.
423 os << "[" << std::hex << ntohll(*((uint64_t *)ctx.request.handle));
// NOTE(review): "UNKNOW" looks like a typo for "UNKNOWN".
440 os << " UNKNOW(" << ctx.command << ") ";
// NOTE(review): std::hex set above is not reset in the lines visible here,
// so offset, length and error would also print in hex and the stream would
// stay in hex mode for the caller — confirm whether that is intended.
444 os << ctx.request.from << "~" << ctx.request.len << " "
445 << ntohl(ctx.reply.error) << "]";
// Update-watch context that do_map() registers on the image via
// update_watch(); its handle_notify() resizes the nbd device when the image
// size changes.
450 class NBDWatchCtx : public librbd::UpdateWatchCtx
// Pool I/O context supplied by the caller; held by reference.
454 librados::IoCtx &io_ctx;
455 librbd::Image &image;
// Constructor parameters: the I/O context and the image are taken by reference.
459 librados::IoCtx &_io_ctx,
460 librbd::Image &_image,
// Nothing to release on destruction.
468 ~NBDWatchCtx() override {}
// Image update notification: re-stat the image and, if its size differs from
// the remembered one, flush the block device's buffers, tell the kernel the
// new size, and invalidate the RBD cache. Failures are logged only.
470 void handle_notify() override
472 librbd::image_info_t info;
473 if (image.stat(info, sizeof(info)) == 0) {
// NOTE(review): info.size is narrowed to unsigned long here; do_map()
// rejects images above ULONG_MAX at map time, but a later grow is not
// range-checked in the lines visible here — confirm.
474 unsigned long new_size = info.size;
476 if (new_size != size) {
477 if (ioctl(fd, BLKFLSBUF, NULL) < 0)
478 derr << "invalidate page cache failed: " << cpp_strerror(errno) << dendl;
479 if (ioctl(fd, NBD_SET_SIZE, new_size) < 0) {
480 derr << "resize failed: " << cpp_strerror(errno) << dendl;
484 if (image.invalidate_cache() < 0)
485 derr << "invalidate rbd cache failed" << dendl;
// Open the nbd device node at `path` read-write. If that fails, module
// loading was requested and /sys/module/nbd is absent, load the nbd kernel
// module with the configured parameters and try the open again.
//   path            - device node, e.g. /dev/nbdN
//   cfg             - source of nbds_max / max_part; dereferenced whenever
//                     try_load_module is true, so it must be non-null then
//   try_load_module - allow loading the kernel module on failure
// NOTE(review): the defaults (cfg = nullptr, try_load_module = false) are
// safe together; passing try_load_module = true with a null cfg is not.
491 static int open_device(const char* path, Config *cfg = nullptr, bool try_load_module = false)
493 int nbd = open(path, O_RDWR);
494 bool loaded_module = false;
496 if (nbd < 0 && try_load_module && access("/sys/module/nbd", F_OK) != 0) {
// Module parameters are passed as a space-separated "name=value" string.
500 param << "nbds_max=" << cfg->nbds_max;
503 param << " max_part=" << cfg->max_part;
505 r = module_load("nbd", param.str().c_str());
507 cerr << "rbd-nbd: failed to load nbd kernel module: " << cpp_strerror(-r) << std::endl;
510 loaded_module = true;
512 nbd = open(path, O_RDWR);
// Module options cannot take effect when this call did not load the module;
// tell the user they are being ignored.
515 if (try_load_module && !loaded_module &&
516 (cfg->nbds_max || cfg->set_max_part)) {
517 cerr << "rbd-nbd: ignoring kernel module parameter options: nbd module already loaded"
// Cross-check the size the kernel reports for /dev/nbd<nbd_index> against
// expected_size.
//   nbd_index     - numeric suffix of the nbd device
//   expected_size - size that was set on the device
524 static int check_device_size(int nbd_index, unsigned long expected_size)
526 // There are bugs with some older kernel versions that result in an
527 // overflow for large image sizes. This check is to ensure we are
530 unsigned long size = 0;
531 std::string path = "/sys/block/nbd" + stringify(nbd_index) + "/size";
533 ifs.open(path.c_str(), std::ifstream::in);
534 if (!ifs.is_open()) {
535 cerr << "rbd-nbd: failed to open " << path << std::endl;
// The value read from sysfs is scaled by the 512-byte block size before comparing.
539 size *= RBD_NBD_BLKSIZE;
542 // Newer kernel versions will report real size only after nbd
543 // connect. Assume this is the case and return success.
547 if (size != expected_size) {
548 cerr << "rbd-nbd: kernel reported invalid device size (" << size
549 << ", expected " << expected_size << ")" << std::endl;
// Map an RBD image (or snapshot) to an nbd device and serve it until the
// device is disconnected.
//   argc/argv - original command line, re-parsed by global_init()
//   cfg       - parsed options (pool/image/snap names, device path, flags)
// Visible stages: Ceph global init and optional daemonizing fork; socketpair
// creation; device selection and NBD_SET_SOCK; cluster/pool/image open;
// device geometry and flags; update watch; NBDServer start; NBD_DO_IT; cleanup.
556 static int do_map(int argc, const char *argv[], Config *cfg)
560 librados::Rados rados;
562 librados::IoCtx io_ctx;
572 librbd::image_info_t info;
576 vector<const char*> args;
577 argv_to_vec(argc, argv, args);
580 auto cct = global_init(NULL, args, CEPH_ENTITY_TYPE_CLIENT,
581 CODE_ENVIRONMENT_DAEMON,
582 CINIT_FLAG_UNPRIVILEGED_DAEMON_DEFAULTS);
// The pid_file option is forced to empty for this process.
583 g_ceph_context->_conf->set_val_or_die("pid_file", "");
585 if (global_init_prefork(g_ceph_context) >= 0) {
587 r = forker.prefork(err);
589 cerr << err << std::endl;
// Parent side of the fork: waits for the child's result.
593 if (forker.is_parent()) {
594 global_init_postfork_start(g_ceph_context);
595 if (forker.parent_wait(err) != 0) {
602 common_init_finish(g_ceph_context);
603 global_init_chdir(g_ceph_context);
// fd[0] is handed to the kernel (NBD_SET_SOCK); fd[1] is served by NBDServer.
605 if (socketpair(AF_UNIX, SOCK_STREAM, 0, fd) == -1) {
// No device given: probe /dev/nbd<index> nodes. Module loading is only
// allowed on the first open attempt.
610 if (cfg->devpath.empty()) {
612 bool try_load_module = true;
614 snprintf(dev, sizeof(dev), "/dev/nbd%d", index);
616 nbd = open_device(dev, cfg, try_load_module);
617 try_load_module = false;
620 cerr << "rbd-nbd: failed to find unused device" << std::endl;
624 r = ioctl(nbd, NBD_SET_SOCK, fd[0]);
// Explicit device: the path must have the form /dev/nbd<num>; the number is
// later used for the sysfs size check.
635 r = sscanf(cfg->devpath.c_str(), "/dev/nbd%d", &index);
637 cerr << "rbd-nbd: invalid device path: " << cfg->devpath
638 << " (expected /dev/nbd{num})" << std::endl;
641 nbd = open_device(cfg->devpath.c_str(), cfg, true);
644 cerr << "rbd-nbd: failed to open device: " << cfg->devpath << std::endl;
648 r = ioctl(nbd, NBD_SET_SOCK, fd[0]);
651 cerr << "rbd-nbd: the device " << cfg->devpath << " is busy" << std::endl;
// Advertise flush and trim support; snapshots and --read-only map read-only.
657 flags = NBD_FLAG_SEND_FLUSH | NBD_FLAG_SEND_TRIM | NBD_FLAG_HAS_FLAGS;
658 if (!cfg->snapname.empty() || cfg->readonly) {
659 flags |= NBD_FLAG_READ_ONLY;
663 r = rados.init_with_context(g_ceph_context);
671 r = rados.ioctx_create(cfg->poolname.c_str(), io_ctx);
675 r = rbd.open(io_ctx, image, cfg->imgname.c_str());
679 if (cfg->exclusive) {
680 r = image.lock_acquire(RBD_LOCK_MODE_EXCLUSIVE);
682 cerr << "rbd-nbd: failed to acquire exclusive lock: " << cpp_strerror(r)
688 if (!cfg->snapname.empty()) {
689 r = image.snap_set(cfg->snapname.c_str());
694 r = image.stat(info, sizeof(info));
698 r = ioctl(nbd, NBD_SET_BLKSIZE, RBD_NBD_BLKSIZE);
// NBD_SET_SIZE takes an unsigned long, so larger images cannot be mapped.
704 if (info.size > ULONG_MAX) {
706 cerr << "rbd-nbd: image is too large (" << prettybyte_t(info.size)
707 << ", max is " << prettybyte_t(ULONG_MAX) << ")" << std::endl;
713 r = ioctl(nbd, NBD_SET_SIZE, size);
719 r = check_device_size(index, size);
// NOTE(review): the result of NBD_SET_FLAGS is not checked.
724 ioctl(nbd, NBD_SET_FLAGS, flags);
726 r = ioctl(nbd, BLKROSET, (unsigned long) &read_only);
// Watch the image so that size changes are propagated to the device.
735 NBDWatchCtx watch_ctx(nbd, io_ctx, image, info.size);
736 r = image.update_watch(&watch_ctx, &handle);
// The device path is printed on stdout for the caller.
740 cout << cfg->devpath << std::endl;
742 if (g_conf->daemonize) {
744 global_init_postfork_start(g_ceph_context);
745 global_init_postfork_finish(g_ceph_context);
749 NBDServer server(fd[1], image);
// SIGINT/SIGTERM trigger a one-shot NBD_DISCONNECT through handle_signal().
753 init_async_signal_handler();
754 register_async_signal_handler(SIGHUP, sighup_handler);
755 register_async_signal_handler_oneshot(SIGINT, handle_signal);
756 register_async_signal_handler_oneshot(SIGTERM, handle_signal);
// NOTE(review): the result of NBD_DO_IT is not checked.
758 ioctl(nbd, NBD_DO_IT);
760 unregister_async_signal_handler(SIGHUP, sighup_handler);
761 unregister_async_signal_handler(SIGINT, handle_signal);
762 unregister_async_signal_handler(SIGTERM, handle_signal);
763 shutdown_async_signal_handler();
768 r = image.update_unwatch(handle);
// Failure path: detach the socket from the device and report the error.
774 ioctl(nbd, NBD_CLEAR_SOCK);
775 cerr << "rbd-nbd: failed to map, status: " << cpp_strerror(-r) << std::endl;
// The process exit status reflects whether mapping succeeded.
786 forker.exit(r < 0 ? EXIT_FAILURE : 0);
// Unmap the nbd device at devpath by opening it and issuing NBD_DISCONNECT.
791 static int do_unmap(const std::string &devpath)
// Opened with open_device()'s defaults, i.e. without kernel module loading.
795 int nbd = open_device(devpath.c_str());
797 cerr << "rbd-nbd: failed to open device: " << devpath << std::endl;
801 r = ioctl(nbd, NBD_DISCONNECT);
803 cerr << "rbd-nbd: the device is not used" << std::endl;
// Split an image spec of the form [pool/]image[@snap] into cfg->poolname,
// cfg->imgname and cfg->snapname. Pool and snapshot are only overwritten
// when present in the spec.
811 static int parse_imgpath(const std::string &imgpath, Config *cfg)
// Groups: 1 = optional pool, 2 = image, 3 = optional snapshot; none of the
// components may contain '/' or '@'.
813 boost::regex pattern("^(?:([^/@]+)/)?([^/@]+)(?:@([^/@]+))?$");
815 if (!boost::regex_match(imgpath, match, pattern)) {
816 std::cerr << "rbd-nbd: invalid spec '" << imgpath << "'" << std::endl;
820 if (match[1].matched) {
821 cfg->poolname = match[1];
824 cfg->imgname = match[2];
826 if (match[3].matched)
827 cfg->snapname = match[3];
// Recover the mapping options of a running rbd-nbd process by reading
// /proc/<pid>/cmdline, splitting it at NUL bytes and feeding the arguments
// to parse_args().
//   pid - process id taken from the nbd device's sysfs entry
//   cfg - receives the parsed options
832 static int get_mapped_info(int pid, Config *cfg)
835 std::string path = "/proc/" + stringify(pid) + "/cmdline";
838 std::vector<const char*> args;
840 ifs.open(path.c_str(), std::ifstream::in);
// NOTE(review): this aborts if the file cannot be opened, e.g. when the
// process exits between the pid lookup and this call — an error return
// would be more forgiving; confirm.
841 assert (ifs.is_open());
844 for (unsigned i = 0; i < cmdline.size(); i++) {
845 const char *arg = &cmdline[i];
// The executable name is compared against "rbd-nbd".
847 if (strcmp(basename(arg) , "rbd-nbd") != 0) {
// Advance to the NUL that terminates the current argument.
854 while (cmdline[i] != '\0') {
// Parse errors are collected in a local stream.
859 std::ostringstream err_msg;
860 r = parse_args(args, &err_msg, cfg);
// Read the pid stored in the file at pid_path (do_list_mapped_devices()
// passes /sys/block/nbd<N>/pid).
864 static int get_map_pid(const std::string& pid_path)
868 ifs.open(pid_path.c_str(), std::ifstream::in);
869 if (!ifs.is_open()) {
// Build a table (pid, pool, image, snap, device) of nbd devices that are
// served by rbd-nbd processes, by walking /sys/block/nbd<index> entries.
876 static int do_list_mapped_devices()
879 bool should_print = false;
883 std::string default_pool_name;
887 tbl.define_column("pid", TextTable::LEFT, TextTable::LEFT);
888 tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
889 tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
890 tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
891 tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
// Existence of the sysfs directory is tested for each device index.
894 std::string nbd_path = "/sys/block/nbd" + stringify(index);
895 if(access(nbd_path.c_str(), F_OK) != 0) {
// The pid of the serving process comes from sysfs; its command line yields
// the pool/image/snap names.
898 std::string pid_path = nbd_path + "/pid";
899 pid = get_map_pid(pid_path);
903 r = get_mapped_info(pid, &cfg);
909 if (cfg.snapname.empty()) {
912 tbl << pid << cfg.poolname << cfg.imgname << cfg.snapname
913 << "/dev/nbd" + stringify(index) << TextTable::endrow;
// Parse the rbd-nbd command line.
//   args    - argument vector; recognized options and the command word are
//             consumed from it
//   err_msg - receives a human-readable message on error
//   cfg     - receives the parsed options
// rbd_nbd() compares the result against HELP_INFO and VERSION_INFO.
925 static int parse_args(vector<const char*>& args, std::ostream *err_msg, Config *cfg)
927 std::string conf_file_list;
929 CephInitParameters iparams = ceph_argparse_early_args(
930 args, CEPH_ENTITY_TYPE_CLIENT, &cluster, &conf_file_list);
// A local configuration object is populated from the config files and argv
// to look up the default pool name.
933 config.name = iparams.name;
934 config.cluster = cluster;
936 if (!conf_file_list.empty()) {
937 config.parse_config_files(conf_file_list.c_str(), nullptr, 0);
939 config.parse_config_files(nullptr, nullptr, 0);
942 config.parse_argv(args);
// The default pool applies unless the image spec names one (see parse_imgpath()).
943 cfg->poolname = config.get_val<std::string>("rbd_default_pool");
945 std::vector<const char*>::iterator i;
946 std::ostringstream err;
// Option scan: the loop header has no increment expression.
948 for (i = args.begin(); i != args.end(); ) {
949 if (ceph_argparse_flag(args, i, "-h", "--help", (char*)NULL)) {
951 } else if (ceph_argparse_flag(args, i, "-v", "--version", (char*)NULL)) {
953 } else if (ceph_argparse_witharg(args, i, &cfg->devpath, "--device", (char *)NULL)) {
954 } else if (ceph_argparse_witharg(args, i, &cfg->nbds_max, err, "--nbds_max", (char *)NULL)) {
955 if (!err.str().empty()) {
956 *err_msg << "rbd-nbd: " << err.str();
959 if (cfg->nbds_max < 0) {
960 *err_msg << "rbd-nbd: Invalid argument for nbds_max!";
963 } else if (ceph_argparse_witharg(args, i, &cfg->max_part, err, "--max_part", (char *)NULL)) {
964 if (!err.str().empty()) {
965 *err_msg << "rbd-nbd: " << err.str();
// max_part must lie in [0, 255].
968 if ((cfg->max_part < 0) || (cfg->max_part > 255)) {
969 *err_msg << "rbd-nbd: Invalid argument for max_part(0~255)!";
972 cfg->set_max_part = true;
973 } else if (ceph_argparse_flag(args, i, "--read-only", (char *)NULL)) {
974 cfg->readonly = true;
975 } else if (ceph_argparse_flag(args, i, "--exclusive", (char *)NULL)) {
976 cfg->exclusive = true;
// The first remaining argument is the command word: map, unmap or list-mapped.
982 if (args.begin() != args.end()) {
983 if (strcmp(*args.begin(), "map") == 0) {
985 } else if (strcmp(*args.begin(), "unmap") == 0) {
987 } else if (strcmp(*args.begin(), "list-mapped") == 0) {
990 *err_msg << "rbd-nbd: unknown command: " << *args.begin();
993 args.erase(args.begin());
997 *err_msg << "rbd-nbd: must specify command";
// An image-or-snap spec is required here and parsed by parse_imgpath().
1003 if (args.begin() == args.end()) {
1004 *err_msg << "rbd-nbd: must specify image-or-snap-spec";
1007 if (parse_imgpath(string(*args.begin()), cfg) < 0)
1009 args.erase(args.begin());
// A device path is required here and stored in cfg->devpath.
1012 if (args.begin() == args.end()) {
1013 *err_msg << "rbd-nbd: must specify nbd device path";
1016 cfg->devpath = *args.begin();
1017 args.erase(args.begin());
// Anything still left over is reported as an unknown argument.
1024 if (args.begin() != args.end()) {
1025 *err_msg << "rbd-nbd: unknown args: " << *args.begin();
// Top-level driver: parse the command line, handle help/version/errors, and
// dispatch to do_map(), do_unmap() or do_list_mapped_devices().
1032 static int rbd_nbd(int argc, const char *argv[])
1036 vector<const char*> args;
1037 argv_to_vec(argc, argv, args);
1039 std::ostringstream err_msg;
1040 r = parse_args(args, &err_msg, &cfg);
1041 if (r == HELP_INFO) {
1044 } else if (r == VERSION_INFO) {
1045 std::cout << pretty_version_to_str() << std::endl;
// The message collected by parse_args() is printed to stderr.
1049 cerr << err_msg.str() << std::endl;
// An image name is required before do_map() is called.
1055 if (cfg.imgname.empty()) {
1056 cerr << "rbd-nbd: image name was not specified" << std::endl;
// do_map() receives the original argc/argv, not the consumed `args` vector.
1060 r = do_map(argc, argv, &cfg);
1065 r = do_unmap(cfg.devpath);
1070 r = do_list_mapped_devices();
// Program entry point: runs rbd_nbd(); EXIT_FAILURE is among the values
// returned from here.
1083 int main(int argc, const char *argv[])
1085 int r = rbd_nbd(argc, argv);
1087 return EXIT_FAILURE;