2 * Ceph - scalable distributed file system
4 * Copyright (C) 2014 Inktank Storage, Inc.
6 * This is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License version 2.1, as published by the Free Software
9 * Foundation. See file COPYING.
24 #include <sys/types.h>
27 #include "auth/KeyRing.h"
28 #include "common/errno.h"
29 #include "common/Formatter.h"
30 #include "common/module.h"
31 #include "common/run_cmd.h"
32 #include "common/safe_io.h"
33 #include "common/secret.h"
34 #include "common/TextTable.h"
35 #include "include/assert.h"
36 #include "include/stringify.h"
37 #include "include/krbd.h"
38 #include "mon/MonMap.h"
40 #include <blkid/blkid.h>
// Timeout handed to poll() while waiting for udev events.  poll() takes
// its timeout in milliseconds, so this is two minutes.
45 const static int POLL_TIMEOUT=120000;
/*
 * Build the device node path of an rbd device: the fixed prefix
 * "/dev/rbd" immediately followed by the given id string.
 */
static string get_kernel_rbd_name(const char *id)
{
  string devnode("/dev/rbd");
  devnode += id;
  return devnode;
}
// Write buf to the rbd bus control file /sys/bus/rbd/<which>.  The
// "<which>_single_major" variant is tried first; the plain file is opened
// when errno is ENOENT after that first attempt.
57 static int sysfs_write_rbd(const char *which, const string& buf)
// s: plain control file path; t: single-major variant of the same file.
59 const string s = string("/sys/bus/rbd/") + which;
60 const string t = s + "_single_major";
65 * 'add' and 'add_single_major' interfaces are identical, but if rbd
66 * kernel module is new enough and is configured to use single-major
67 * scheme, 'add' is disabled in order to prevent old userspace from
68 * doing weird things at unmap time.
70 * Same goes for 'remove' vs 'remove_single_major'.
// Try the single-major interface first.
72 fd = open(t.c_str(), O_WRONLY);
// ENOENT: the single-major file is not there, so use the plain one.
74 if (errno == ENOENT) {
75 fd = open(s.c_str(), O_WRONLY);
// Write exactly buf.size() bytes (no terminating NUL) to the control file.
83 r = safe_write(fd, buf.c_str(), buf.size());
89 static int sysfs_write_rbd_add(const string& buf)
91 return sysfs_write_rbd("add", buf);
94 static int sysfs_write_rbd_remove(const string& buf)
96 return sysfs_write_rbd("remove", buf);
/*
 * Report (1 or 0) whether the rbd module exposes the 'single_major'
 * parameter, which is used as the indicator that rbd devices carry a
 * 'minor' sysfs attribute.
 *
 * The 'minor' attribute arrived with the single_major merge, the same
 * change that exposed the 'single_major' parameter, and it is always
 * present whether or not the single-major scheme is actually enabled.
 *
 * A kernel version test (ver >= KERNEL_VERSION(3, 14, 0) or similar)
 * is deliberately not used: this must also work with rbd.ko
 * backported to various kernels.
 */
static int have_minor_attr(void)
{
  const char *single_major_param = "/sys/module/rbd/parameters/single_major";

  if (access(single_major_param, F_OK) != 0)
    return 0;

  return 1;
}
// Assemble the text of a map request in the stream oss: the monitor
// addresses, " name=<id>", an optional ",key=" or ",secret=" entry, the
// user-supplied options, and finally pool, image and snap separated by
// spaces.  presumably the finished string is handed back through pbuf --
// TODO confirm.
112 static int build_map_buf(CephContext *cct, const char *pool, const char *image,
113 const char *snap, const char *options, string *pbuf)
// Build the initial monitor map from the context; errors go to cerr.
119 r = monmap.build_initial(cct, cerr);
123 list<entity_addr_t> mon_addr;
124 monmap.list_addrs(mon_addr);
// Emit every monitor address; tellp() > 0 means something has already
// been written to oss, i.e. this is not the first address.
126 for (const auto &p : mon_addr) {
127 if (oss.tellp() > 0) {
130 oss << p.get_sockaddr();
133 oss << " name=" << cct->_conf->name.get_id();
// Secrets are only dealt with when client authentication is not "none".
136 if (cct->_conf->auth_client_required != "none") {
137 r = keyring.from_ceph_context(cct);
// Special case: no keyring found (-ENOENT) and neither a keyfile nor a
// key is configured.
138 if (r == -ENOENT && !(cct->_conf->keyfile.length() ||
139 cct->_conf->key.length()))
142 cerr << "rbd: failed to get secret" << std::endl;
// The kernel-side key is named "client.<id>".
148 string key_name = string("client.") + cct->_conf->name.get_id();
149 if (keyring.get_secret(cct->_conf->name, secret)) {
// The secret is passed to the kernel in base64 form.
151 secret.encode_base64(secret_str);
153 r = set_kernel_secret(secret_str.c_str(), key_name.c_str());
156 cerr << "rbd: warning: secret has length 0" << std::endl;
// Refer to the secret by key name in the request.
157 oss << ",key=" << key_name;
158 } else if (r == -ENODEV || r == -ENOSYS) {
159 // running against older kernel; fall back to secret= in options
160 oss << ",secret=" << secret_str;
162 cerr << "rbd: failed to add secret '" << key_name << "' to kernel"
// No secret in the keyring, but a key of that name is already known to
// the kernel: reference it by name.
166 } else if (is_kernel_secret(key_name.c_str())) {
167 oss << ",key=" << key_name;
// Append the caller's options only when the string is non-empty.
170 if (strcmp(options, "") != 0)
171 oss << "," << options;
173 oss << " " << pool << " " << image << " " << snap;
// Read events from the udev monitor mon until the block device that
// belongs to the rbd device matching pool/image/snap has shown up.
179 static int wait_for_udev_add(struct udev_monitor *mon, const char *pool,
180 const char *image, const char *snap,
183 struct udev_device *bus_dev = NULL;
186 * Catch /sys/devices/rbd/<id>/ and wait for the corresponding
187 * block device to show up. This is necessary because rbd devices
188 * and block devices aren't linked together in our sysfs layout.
191 struct pollfd fds[1];
192 struct udev_device *dev;
// Wait for the monitor's file descriptor to become readable.
// NOTE(review): only a negative poll() result is tested on the poll line
// below; a timeout (return value 0) is not distinguished there -- confirm
// how that case is handled.
194 fds[0].fd = udev_monitor_get_fd(mon);
195 fds[0].events = POLLIN;
196 if (poll(fds, 1, POLL_TIMEOUT) < 0)
199 dev = udev_monitor_receive_device(mon);
// Only "add" events are of interest.
203 if (strcmp(udev_device_get_action(dev), "add") != 0)
// Event from the "rbd" subsystem: its pool, name and snapshot sysfs
// attributes must all be present and equal to the requested spec.
207 if (strcmp(udev_device_get_subsystem(dev), "rbd") == 0) {
208 const char *this_pool = udev_device_get_sysattr_value(dev, "pool");
209 const char *this_image = udev_device_get_sysattr_value(dev, "name");
210 const char *this_snap = udev_device_get_sysattr_value(dev,
213 if (this_pool && strcmp(this_pool, pool) == 0 &&
214 this_image && strcmp(this_image, image) == 0 &&
215 this_snap && strcmp(this_snap, snap) == 0) {
// Event from the "block" subsystem: compare its MAJOR/MINOR properties
// with the major/minor sysfs attributes of bus_dev.
221 if (strcmp(udev_device_get_subsystem(dev), "block") == 0) {
222 const char *major = udev_device_get_sysattr_value(bus_dev, "major");
223 const char *minor = udev_device_get_sysattr_value(bus_dev, "minor");
224 const char *this_major = udev_device_get_property_value(dev, "MAJOR");
225 const char *this_minor = udev_device_get_property_value(dev, "MINOR");
// The 'minor' attribute must be present exactly when have_minor_attr()
// returns non-zero.
227 assert(!minor ^ have_minor_attr());
// Match on major alone when there is no 'minor' attribute, otherwise on
// both numbers.
229 if (strcmp(this_major, major) == 0 &&
230 (!minor || strcmp(this_minor, minor) == 0)) {
// Expected device node, derived from the sysname of bus_dev.
231 string name = get_kernel_rbd_name(udev_device_get_sysname(bus_dev));
// The node udev reports must be the one derived above.
233 assert(strcmp(udev_device_get_devnode(dev), name.c_str()) == 0);
// Drop the references held on both devices.
236 udev_device_unref(dev);
237 udev_device_unref(bus_dev);
// Drop the reference on the event's device.
244 udev_device_unref(dev);
// Perform a map: set up a udev monitor for "rbd" and "block"/"disk"
// events, write buf to the rbd 'add' file, then wait for the resulting
// block device via wait_for_udev_add().
250 static int do_map(struct udev *udev, const char *pool, const char *image,
251 const char *snap, const string& buf, string *pname)
253 struct udev_monitor *mon;
256 mon = udev_monitor_new_from_netlink(udev, "udev");
// Restrict the monitor to the two kinds of events wait_for_udev_add()
// looks at.
260 r = udev_monitor_filter_add_match_subsystem_devtype(mon, "rbd", NULL);
264 r = udev_monitor_filter_add_match_subsystem_devtype(mon, "block", "disk");
// The monitor is switched on before the sysfs write below is issued.
268 r = udev_monitor_enable_receiving(mon);
272 r = sysfs_write_rbd_add(buf);
274 cerr << "rbd: sysfs write failed" << std::endl;
278 r = wait_for_udev_add(mon, pool, image, snap, pname);
280 cerr << "rbd: wait failed" << std::endl;
// Release the monitor.
285 udev_monitor_unref(mon);
// Build the map request for pool/image/snap with build_map_buf(), load
// the rbd kernel module if /sys/bus/rbd is not accessible, and hand the
// request to do_map().
289 static int map_image(struct krbd_ctx *ctx, const char *pool, const char *image,
290 const char *snap, const char *options, string *pname)
// An empty snapshot name is special-cased.
295 if (strcmp(snap, "") == 0)
298 r = build_map_buf(ctx->cct, pool, image, snap, options, &buf);
303 * Modprobe rbd kernel module. If it supports single-major device
304 * number allocation scheme, make sure it's turned on.
// /sys/bus/rbd missing: the module has to be loaded first.
306 if (access("/sys/bus/rbd", F_OK) != 0) {
307 const char *module_options = NULL;
// Pass single_major=Y only when the module has that parameter.
308 if (module_has_param("rbd", "single_major"))
309 module_options = "single_major=Y";
311 r = module_load("rbd", module_options);
313 cerr << "rbd: failed to load rbd kernel module (" << r << ")"
316 * Ignore the error: modprobe failing doesn't necessarily prevent
322 return do_map(ctx->udev, pool, image, snap, buf, pname);
// Find the rbd device whose 'major' (and, when have_minor_attr() says it
// exists, 'minor') sysfs attribute matches devno, and store that device's
// sysname in *pid.
325 static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid)
327 struct udev_enumerate *enm;
328 struct udev_list_entry *l;
329 struct udev_device *dev;
332 enm = udev_enumerate_new(udev);
// Enumerate only devices of the "rbd" subsystem.
336 r = udev_enumerate_add_match_subsystem(enm, "rbd");
// Attribute values are matched as strings, hence stringify().
340 r = udev_enumerate_add_match_sysattr(enm, "major",
341 stringify(major(devno)).c_str());
345 if (have_minor_attr()) {
346 r = udev_enumerate_add_match_sysattr(enm, "minor",
347 stringify(minor(devno)).c_str());
352 r = udev_enumerate_scan_devices(enm);
356 l = udev_enumerate_get_list_entry(enm);
362 /* make sure there is only one match */
363 assert(!udev_list_entry_get_next(l));
// The list entry name is used as the syspath to open the device.
365 dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l));
371 *pid = udev_device_get_sysname(dev);
// Release the device and the enumeration context.
373 udev_device_unref(dev);
375 udev_enumerate_unref(enm);
// Find an rbd device by its pool/name/current_snap sysfs attributes.  On
// a match, *pdevno receives makedev(major, minor) built from the device's
// attributes and *pid receives its sysname.
379 static int spec_to_devno_and_krbd_id(struct udev *udev, const char *pool,
380 const char *image, const char *snap,
381 dev_t *pdevno, string *pid)
383 struct udev_enumerate *enm;
384 struct udev_list_entry *l;
385 struct udev_device *dev;
// min stays 0 when the 'minor' attribute is not read below.
386 unsigned int maj, min = 0;
390 enm = udev_enumerate_new(udev);
// Enumerate "rbd" devices whose three attributes equal the spec.
394 r = udev_enumerate_add_match_subsystem(enm, "rbd");
398 r = udev_enumerate_add_match_sysattr(enm, "pool", pool);
402 r = udev_enumerate_add_match_sysattr(enm, "name", image);
406 r = udev_enumerate_add_match_sysattr(enm, "current_snap", snap);
410 r = udev_enumerate_scan_devices(enm);
414 l = udev_enumerate_get_list_entry(enm);
420 dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l));
// Parse the 'major' attribute as a base-10 number; err receives the
// parse error text.
// NOTE(review): the attribute value is passed straight to
// strict_strtoll(); confirm a NULL (missing attribute) is handled.
426 maj = strict_strtoll(udev_device_get_sysattr_value(dev, "major"), 10, &err);
428 cerr << "rbd: couldn't parse major: " << err << std::endl;
// 'minor' is only read when have_minor_attr() reports it exists.
432 if (have_minor_attr()) {
433 min = strict_strtoll(udev_device_get_sysattr_value(dev, "minor"), 10, &err);
435 cerr << "rbd: couldn't parse minor: " << err << std::endl;
442 * If an image is mapped more than once don't bother trying to unmap
443 * all devices - let users run unmap the same number of times they
// A further list entry means more than one device matched: tell the user
// which single device is going to be unmapped.
446 if (udev_list_entry_get_next(l))
447 cerr << "rbd: " << pool << "/" << image << "@" << snap
448 << ": mapped more than once, unmapping "
449 << get_kernel_rbd_name(udev_device_get_sysname(dev))
450 << " only" << std::endl;
452 *pdevno = makedev(maj, min);
453 *pid = udev_device_get_sysname(dev);
// Release the device and the enumeration context.
456 udev_device_unref(dev);
458 udev_enumerate_unref(enm);
// Build the string for an unmap request from the device id and options;
// callers pass the result to do_unmap().
462 static string build_unmap_buf(const string& id, const char *options)
// Options only contribute when the string is non-empty.
465 if (strcmp(options, "") != 0) {
// Read events from the udev monitor mon until a "remove" event for the
// device numbered devno is seen.
472 static int wait_for_udev_remove(struct udev_monitor *mon, dev_t devno)
475 struct pollfd fds[1];
476 struct udev_device *dev;
// Wait for the monitor's file descriptor to become readable.
// NOTE(review): only a negative poll() result is tested on the poll line
// below; a timeout (return value 0) is not distinguished there -- confirm
// how that case is handled.
478 fds[0].fd = udev_monitor_get_fd(mon);
479 fds[0].events = POLLIN;
480 if (poll(fds, 1, POLL_TIMEOUT) < 0)
483 dev = udev_monitor_receive_device(mon);
// The event we are waiting for: action "remove" on our device number.
487 if (strcmp(udev_device_get_action(dev), "remove") == 0 &&
488 udev_device_get_devnum(dev) == devno) {
489 udev_device_unref(dev);
// Drop the reference on the event's device.
493 udev_device_unref(dev);
// Perform an unmap: set up a udev monitor for "block"/"disk" events,
// write buf to the rbd 'remove' file (with special handling of -EBUSY),
// then wait for the removal of devno via wait_for_udev_remove().
499 static int do_unmap(struct udev *udev, dev_t devno, const string& buf)
501 struct udev_monitor *mon;
504 mon = udev_monitor_new_from_netlink(udev, "udev");
508 r = udev_monitor_filter_add_match_subsystem_devtype(mon, "block", "disk");
// The monitor is switched on before the sysfs write below is issued.
512 r = udev_monitor_enable_receiving(mon);
517 * On final device close(), kernel sends a block change event, in
518 * response to which udev apparently runs blkid on the device. This
519 * makes unmap fail with EBUSY, if issued right after final close().
520 * Try to circumvent this with a retry before turning to udev.
// tries counts the attempts at the sysfs write.
522 for (int tries = 0; ; tries++) {
523 r = sysfs_write_rbd_remove(buf);
// -EBUSY on one of the first attempts (tries 0 and 1) is treated
// specially rather than as a plain failure.
526 } else if (r == -EBUSY && tries < 2) {
531 * libudev does not provide the "wait until the queue is empty"
532 * API or the sufficient amount of primitives to build it from.
// Run "udevadm settle --timeout 10"; err holds run_cmd()'s result string.
534 string err = run_cmd("udevadm", "settle", "--timeout", "10", NULL);
536 cerr << "rbd: " << err << std::endl;
539 cerr << "rbd: sysfs write failed" << std::endl;
544 r = wait_for_udev_remove(mon, devno);
546 cerr << "rbd: wait failed" << std::endl;
// Release the monitor.
551 udev_monitor_unref(mon);
// Unmap by device node: check that devnode is a block device, resolve it
// to its whole-disk device number, translate that to an rbd id and call
// do_unmap().
555 static int unmap_image(struct krbd_ctx *ctx, const char *devnode,
559 dev_t wholedevno = 0;
// stat() failing or a non-block file both end up in the same error path.
563 if (stat(devnode, &sb) < 0 || !S_ISBLK(sb.st_mode)) {
564 cerr << "rbd: '" << devnode << "' is not a block device" << std::endl;
// Ask libblkid for the whole-disk device number of sb.st_rdev.
568 r = blkid_devno_to_wholedisk(sb.st_rdev, NULL, 0, &wholedevno);
570 cerr << "rbd: couldn't compute wholedevno: " << cpp_strerror(r)
573 * Ignore the error: we are given whole disks most of the time, and
574 * if it turns out this is a partition we will fail later anyway.
// Fall back to the device number of devnode itself.
576 wholedevno = sb.st_rdev;
579 r = devno_to_krbd_id(ctx->udev, wholedevno, &id);
582 cerr << "rbd: '" << devnode << "' is not an rbd device" << std::endl;
588 return do_unmap(ctx->udev, wholedevno, build_unmap_buf(id, options));
// Unmap by image spec: resolve pool/image/snap to a device number and
// rbd id with spec_to_devno_and_krbd_id(), then call do_unmap().
591 static int unmap_image(struct krbd_ctx *ctx, const char *pool,
592 const char *image, const char *snap,
602 r = spec_to_devno_and_krbd_id(ctx->udev, pool, image, snap, &devno, &id);
605 cerr << "rbd: " << pool << "/" << image << "@" << snap
606 << ": not a mapped image or snapshot" << std::endl;
612 return do_unmap(ctx->udev, devno, build_unmap_buf(id, options));
// Emit one rbd device: either as an object section with pool, name, snap
// and device fields on the Formatter f, or as one row of the TextTable
// tbl.  presumably the bool result says whether anything was emitted --
// TODO confirm.
615 static bool dump_one_image(Formatter *f, TextTable *tbl,
616 struct udev_device *dev)
// The sysname serves as the id and as the suffix of the device path.
618 const char *id = udev_device_get_sysname(dev);
619 const char *pool = udev_device_get_sysattr_value(dev, "pool");
620 const char *image = udev_device_get_sysattr_value(dev, "name");
621 const char *snap = udev_device_get_sysattr_value(dev, "current_snap");
622 string kname = get_kernel_rbd_name(id);
// A device lacking any of the three attributes is special-cased.
624 if (!pool || !image || !snap)
// Formatter output: one object section named after the id.
628 f->open_object_section(id);
629 f->dump_string("pool", pool);
630 f->dump_string("name", image);
631 f->dump_string("snap", snap);
632 f->dump_string("device", kname);
// Table output: one row with the same five values.
635 *tbl << id << pool << image << snap << kname << TextTable::endrow;
// Enumerate all devices of the "rbd" subsystem and pass each one to
// dump_one_image().
641 static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl)
643 struct udev_enumerate *enm;
644 struct udev_list_entry *l;
// Becomes true once dump_one_image() has returned true for any device.
645 bool have_output = false;
648 enm = udev_enumerate_new(udev);
652 r = udev_enumerate_add_match_subsystem(enm, "rbd");
656 r = udev_enumerate_scan_devices(enm);
660 udev_list_entry_foreach(l, udev_enumerate_get_list_entry(enm)) {
661 struct udev_device *dev;
// The list entry name is used as the syspath to open the device.
663 dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l));
665 have_output |= dump_one_image(f, tbl, dev);
666 udev_device_unref(dev);
// Release the enumeration context.
672 udev_enumerate_unref(enm);
// List the rbd devices found by do_dump(), using the Formatter f (inside
// a "devices" section) and a five-column TextTable.
676 int dump_images(struct krbd_ctx *ctx, Formatter *f)
682 f->open_object_section("devices");
// Table columns: id, pool, image, snap, device -- all defined with LEFT
// for both TextTable alignment arguments.
684 tbl.define_column("id", TextTable::LEFT, TextTable::LEFT);
685 tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT);
686 tbl.define_column("image", TextTable::LEFT, TextTable::LEFT);
687 tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT);
688 tbl.define_column("device", TextTable::LEFT, TextTable::LEFT);
691 r = do_dump(ctx->udev, f, &tbl);
// C entry point: allocate a krbd_ctx, store the given rados_config_t in
// it as a CephContext pointer and create a udev context for it.
// presumably the new context is handed back through pctx -- TODO confirm.
704 extern "C" int krbd_create_from_context(rados_config_t cct,
705 struct krbd_ctx **pctx)
707 struct krbd_ctx *ctx = new struct krbd_ctx();
// The opaque rados_config_t handle is reinterpreted as a CephContext.
709 ctx->cct = reinterpret_cast<CephContext *>(cct);
710 ctx->udev = udev_new();
// C entry point: tear down a krbd_ctx; drops the reference on the udev
// context held in ctx->udev.
720 extern "C" void krbd_destroy(struct krbd_ctx *ctx)
725 udev_unref(ctx->udev);
// C entry point: map pool/image/snap with the given options through
// map_image().  presumably the resulting device node is returned through
// pdevnode as a malloc'ed string the caller must free() -- TODO confirm.
730 extern "C" int krbd_map(struct krbd_ctx *ctx, const char *pool,
731 const char *image, const char *snap,
732 const char *options, char **pdevnode)
738 r = map_image(ctx, pool, image, snap, options, &name);
// Copy the name into C-allocated storage.
742 devnode = strdup(name.c_str());
// C entry point: unmap the device given by its device node; forwards to
// the devnode overload of unmap_image().
750 extern "C" int krbd_unmap(struct krbd_ctx *ctx, const char *devnode,
753 return unmap_image(ctx, devnode, options);
// C entry point: unmap the device given by pool/image/snap; forwards to
// the spec overload of unmap_image().
756 extern "C" int krbd_unmap_by_spec(struct krbd_ctx *ctx, const char *pool,
757 const char *image, const char *snap,
760 return unmap_image(ctx, pool, image, snap, options);
763 int krbd_showmapped(struct krbd_ctx *ctx, Formatter *f)
765 return dump_images(ctx, f);