/* * Ceph - scalable distributed file system * * Copyright (C) 2014 Inktank Storage, Inc. * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "auth/KeyRing.h" #include "common/errno.h" #include "common/Formatter.h" #include "common/module.h" #include "common/run_cmd.h" #include "common/safe_io.h" #include "common/secret.h" #include "common/TextTable.h" #include "include/assert.h" #include "include/stringify.h" #include "include/krbd.h" #include "mon/MonMap.h" #include #include using namespace std; const static int POLL_TIMEOUT=120000; struct krbd_ctx { CephContext *cct; struct udev *udev; }; static string get_kernel_rbd_name(const char *id) { return string("/dev/rbd") + id; } static int sysfs_write_rbd(const char *which, const string& buf) { const string s = string("/sys/bus/rbd/") + which; const string t = s + "_single_major"; int fd; int r; /* * 'add' and 'add_single_major' interfaces are identical, but if rbd * kernel module is new enough and is configured to use single-major * scheme, 'add' is disabled in order to prevent old userspace from * doing weird things at unmap time. * * Same goes for 'remove' vs 'remove_single_major'. */ fd = open(t.c_str(), O_WRONLY); if (fd < 0) { if (errno == ENOENT) { fd = open(s.c_str(), O_WRONLY); if (fd < 0) return -errno; } else { return -errno; } } r = safe_write(fd, buf.c_str(), buf.size()); close(fd); return r; } static int sysfs_write_rbd_add(const string& buf) { return sysfs_write_rbd("add", buf); } static int sysfs_write_rbd_remove(const string& buf) { return sysfs_write_rbd("remove", buf); } static int have_minor_attr(void) { /* * 'minor' attribute was added as part of single_major merge, which * exposed the 'single_major' parameter. 'minor' is always present, * regardless of whether single-major scheme is turned on or not. * * (Something like ver >= KERNEL_VERSION(3, 14, 0) is a no-go because * this has to work with rbd.ko backported to various kernels.) */ return access("/sys/module/rbd/parameters/single_major", F_OK) == 0; } static int build_map_buf(CephContext *cct, const char *pool, const char *image, const char *snap, const char *options, string *pbuf) { ostringstream oss; int r; MonMap monmap; r = monmap.build_initial(cct, cerr); if (r < 0) return r; list mon_addr; monmap.list_addrs(mon_addr); for (const auto &p : mon_addr) { if (oss.tellp() > 0) { oss << ","; } oss << p.get_sockaddr(); } oss << " name=" << cct->_conf->name.get_id(); KeyRing keyring; if (cct->_conf->auth_client_required != "none") { r = keyring.from_ceph_context(cct); if (r == -ENOENT && !(cct->_conf->keyfile.length() || cct->_conf->key.length())) r = 0; if (r < 0) { cerr << "rbd: failed to get secret" << std::endl; return r; } } CryptoKey secret; string key_name = string("client.") + cct->_conf->name.get_id(); if (keyring.get_secret(cct->_conf->name, secret)) { string secret_str; secret.encode_base64(secret_str); r = set_kernel_secret(secret_str.c_str(), key_name.c_str()); if (r >= 0) { if (r == 0) cerr << "rbd: warning: secret has length 0" << std::endl; oss << ",key=" << key_name; } else if (r == -ENODEV || r == -ENOSYS) { // running against older kernel; fall back to secret= in options oss << ",secret=" << secret_str; } else { cerr << "rbd: failed to add secret '" << key_name << "' to kernel" << std::endl; return r; } } else if (is_kernel_secret(key_name.c_str())) { oss << ",key=" << key_name; } if (strcmp(options, "") != 0) oss << "," << options; oss << " " << pool << " " << image << " " << snap; *pbuf = oss.str(); return 0; } static int wait_for_udev_add(struct udev_monitor *mon, const char *pool, const char *image, const char *snap, string *pname) { struct udev_device *bus_dev = NULL; /* * Catch /sys/devices/rbd// and wait for the corresponding * block device to show up. This is necessary because rbd devices * and block devices aren't linked together in our sysfs layout. */ for (;;) { struct pollfd fds[1]; struct udev_device *dev; fds[0].fd = udev_monitor_get_fd(mon); fds[0].events = POLLIN; if (poll(fds, 1, POLL_TIMEOUT) < 0) return -errno; dev = udev_monitor_receive_device(mon); if (!dev) continue; if (strcmp(udev_device_get_action(dev), "add") != 0) goto next; if (!bus_dev) { if (strcmp(udev_device_get_subsystem(dev), "rbd") == 0) { const char *this_pool = udev_device_get_sysattr_value(dev, "pool"); const char *this_image = udev_device_get_sysattr_value(dev, "name"); const char *this_snap = udev_device_get_sysattr_value(dev, "current_snap"); if (this_pool && strcmp(this_pool, pool) == 0 && this_image && strcmp(this_image, image) == 0 && this_snap && strcmp(this_snap, snap) == 0) { bus_dev = dev; continue; } } } else { if (strcmp(udev_device_get_subsystem(dev), "block") == 0) { const char *major = udev_device_get_sysattr_value(bus_dev, "major"); const char *minor = udev_device_get_sysattr_value(bus_dev, "minor"); const char *this_major = udev_device_get_property_value(dev, "MAJOR"); const char *this_minor = udev_device_get_property_value(dev, "MINOR"); assert(!minor ^ have_minor_attr()); if (strcmp(this_major, major) == 0 && (!minor || strcmp(this_minor, minor) == 0)) { string name = get_kernel_rbd_name(udev_device_get_sysname(bus_dev)); assert(strcmp(udev_device_get_devnode(dev), name.c_str()) == 0); *pname = name; udev_device_unref(dev); udev_device_unref(bus_dev); break; } } } next: udev_device_unref(dev); } return 0; } static int do_map(struct udev *udev, const char *pool, const char *image, const char *snap, const string& buf, string *pname) { struct udev_monitor *mon; int r; mon = udev_monitor_new_from_netlink(udev, "udev"); if (!mon) return -ENOMEM; r = udev_monitor_filter_add_match_subsystem_devtype(mon, "rbd", NULL); if (r < 0) goto out_mon; r = udev_monitor_filter_add_match_subsystem_devtype(mon, "block", "disk"); if (r < 0) goto out_mon; r = udev_monitor_enable_receiving(mon); if (r < 0) goto out_mon; r = sysfs_write_rbd_add(buf); if (r < 0) { cerr << "rbd: sysfs write failed" << std::endl; goto out_mon; } r = wait_for_udev_add(mon, pool, image, snap, pname); if (r < 0) { cerr << "rbd: wait failed" << std::endl; goto out_mon; } out_mon: udev_monitor_unref(mon); return r; } static int map_image(struct krbd_ctx *ctx, const char *pool, const char *image, const char *snap, const char *options, string *pname) { string buf; int r; if (strcmp(snap, "") == 0) snap = "-"; r = build_map_buf(ctx->cct, pool, image, snap, options, &buf); if (r < 0) return r; /* * Modprobe rbd kernel module. If it supports single-major device * number allocation scheme, make sure it's turned on. */ if (access("/sys/bus/rbd", F_OK) != 0) { const char *module_options = NULL; if (module_has_param("rbd", "single_major")) module_options = "single_major=Y"; r = module_load("rbd", module_options); if (r) { cerr << "rbd: failed to load rbd kernel module (" << r << ")" << std::endl; /* * Ignore the error: modprobe failing doesn't necessarily prevent * from working. */ } } return do_map(ctx->udev, pool, image, snap, buf, pname); } static int devno_to_krbd_id(struct udev *udev, dev_t devno, string *pid) { struct udev_enumerate *enm; struct udev_list_entry *l; struct udev_device *dev; int r; enm = udev_enumerate_new(udev); if (!enm) return -ENOMEM; r = udev_enumerate_add_match_subsystem(enm, "rbd"); if (r < 0) goto out_enm; r = udev_enumerate_add_match_sysattr(enm, "major", stringify(major(devno)).c_str()); if (r < 0) goto out_enm; if (have_minor_attr()) { r = udev_enumerate_add_match_sysattr(enm, "minor", stringify(minor(devno)).c_str()); if (r < 0) goto out_enm; } r = udev_enumerate_scan_devices(enm); if (r < 0) goto out_enm; l = udev_enumerate_get_list_entry(enm); if (!l) { r = -ENOENT; goto out_enm; } /* make sure there is only one match */ assert(!udev_list_entry_get_next(l)); dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l)); if (!dev) { r = -ENOMEM; goto out_enm; } *pid = udev_device_get_sysname(dev); udev_device_unref(dev); out_enm: udev_enumerate_unref(enm); return r; } static int spec_to_devno_and_krbd_id(struct udev *udev, const char *pool, const char *image, const char *snap, dev_t *pdevno, string *pid) { struct udev_enumerate *enm; struct udev_list_entry *l; struct udev_device *dev; unsigned int maj, min = 0; string err; int r; enm = udev_enumerate_new(udev); if (!enm) return -ENOMEM; r = udev_enumerate_add_match_subsystem(enm, "rbd"); if (r < 0) goto out_enm; r = udev_enumerate_add_match_sysattr(enm, "pool", pool); if (r < 0) goto out_enm; r = udev_enumerate_add_match_sysattr(enm, "name", image); if (r < 0) goto out_enm; r = udev_enumerate_add_match_sysattr(enm, "current_snap", snap); if (r < 0) goto out_enm; r = udev_enumerate_scan_devices(enm); if (r < 0) goto out_enm; l = udev_enumerate_get_list_entry(enm); if (!l) { r = -ENOENT; goto out_enm; } dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l)); if (!dev) { r = -ENOMEM; goto out_enm; } maj = strict_strtoll(udev_device_get_sysattr_value(dev, "major"), 10, &err); if (!err.empty()) { cerr << "rbd: couldn't parse major: " << err << std::endl; r = -EINVAL; goto out_dev; } if (have_minor_attr()) { min = strict_strtoll(udev_device_get_sysattr_value(dev, "minor"), 10, &err); if (!err.empty()) { cerr << "rbd: couldn't parse minor: " << err << std::endl; r = -EINVAL; goto out_dev; } } /* * If an image is mapped more than once don't bother trying to unmap * all devices - let users run unmap the same number of times they * ran map. */ if (udev_list_entry_get_next(l)) cerr << "rbd: " << pool << "/" << image << "@" << snap << ": mapped more than once, unmapping " << get_kernel_rbd_name(udev_device_get_sysname(dev)) << " only" << std::endl; *pdevno = makedev(maj, min); *pid = udev_device_get_sysname(dev); out_dev: udev_device_unref(dev); out_enm: udev_enumerate_unref(enm); return r; } static string build_unmap_buf(const string& id, const char *options) { string buf(id); if (strcmp(options, "") != 0) { buf += " "; buf += options; } return buf; } static int wait_for_udev_remove(struct udev_monitor *mon, dev_t devno) { for (;;) { struct pollfd fds[1]; struct udev_device *dev; fds[0].fd = udev_monitor_get_fd(mon); fds[0].events = POLLIN; if (poll(fds, 1, POLL_TIMEOUT) < 0) return -errno; dev = udev_monitor_receive_device(mon); if (!dev) continue; if (strcmp(udev_device_get_action(dev), "remove") == 0 && udev_device_get_devnum(dev) == devno) { udev_device_unref(dev); break; } udev_device_unref(dev); } return 0; } static int do_unmap(struct udev *udev, dev_t devno, const string& buf) { struct udev_monitor *mon; int r; mon = udev_monitor_new_from_netlink(udev, "udev"); if (!mon) return -ENOMEM; r = udev_monitor_filter_add_match_subsystem_devtype(mon, "block", "disk"); if (r < 0) goto out_mon; r = udev_monitor_enable_receiving(mon); if (r < 0) goto out_mon; /* * On final device close(), kernel sends a block change event, in * response to which udev apparently runs blkid on the device. This * makes unmap fail with EBUSY, if issued right after final close(). * Try to circumvent this with a retry before turning to udev. */ for (int tries = 0; ; tries++) { r = sysfs_write_rbd_remove(buf); if (r >= 0) { break; } else if (r == -EBUSY && tries < 2) { if (!tries) { usleep(250 * 1000); } else { /* * libudev does not provide the "wait until the queue is empty" * API or the sufficient amount of primitives to build it from. */ string err = run_cmd("udevadm", "settle", "--timeout", "10", NULL); if (!err.empty()) cerr << "rbd: " << err << std::endl; } } else { cerr << "rbd: sysfs write failed" << std::endl; goto out_mon; } } r = wait_for_udev_remove(mon, devno); if (r < 0) { cerr << "rbd: wait failed" << std::endl; goto out_mon; } out_mon: udev_monitor_unref(mon); return r; } static int unmap_image(struct krbd_ctx *ctx, const char *devnode, const char *options) { struct stat sb; dev_t wholedevno = 0; string id; int r; if (stat(devnode, &sb) < 0 || !S_ISBLK(sb.st_mode)) { cerr << "rbd: '" << devnode << "' is not a block device" << std::endl; return -EINVAL; } r = blkid_devno_to_wholedisk(sb.st_rdev, NULL, 0, &wholedevno); if (r < 0) { cerr << "rbd: couldn't compute wholedevno: " << cpp_strerror(r) << std::endl; /* * Ignore the error: we are given whole disks most of the time, and * if it turns out this is a partition we will fail later anyway. */ wholedevno = sb.st_rdev; } r = devno_to_krbd_id(ctx->udev, wholedevno, &id); if (r < 0) { if (r == -ENOENT) { cerr << "rbd: '" << devnode << "' is not an rbd device" << std::endl; r = -EINVAL; } return r; } return do_unmap(ctx->udev, wholedevno, build_unmap_buf(id, options)); } static int unmap_image(struct krbd_ctx *ctx, const char *pool, const char *image, const char *snap, const char *options) { dev_t devno = 0; string id; int r; if (!snap) snap = "-"; r = spec_to_devno_and_krbd_id(ctx->udev, pool, image, snap, &devno, &id); if (r < 0) { if (r == -ENOENT) { cerr << "rbd: " << pool << "/" << image << "@" << snap << ": not a mapped image or snapshot" << std::endl; r = -EINVAL; } return r; } return do_unmap(ctx->udev, devno, build_unmap_buf(id, options)); } static bool dump_one_image(Formatter *f, TextTable *tbl, struct udev_device *dev) { const char *id = udev_device_get_sysname(dev); const char *pool = udev_device_get_sysattr_value(dev, "pool"); const char *image = udev_device_get_sysattr_value(dev, "name"); const char *snap = udev_device_get_sysattr_value(dev, "current_snap"); string kname = get_kernel_rbd_name(id); if (!pool || !image || !snap) return false; if (f) { f->open_object_section(id); f->dump_string("pool", pool); f->dump_string("name", image); f->dump_string("snap", snap); f->dump_string("device", kname); f->close_section(); } else { *tbl << id << pool << image << snap << kname << TextTable::endrow; } return true; } static int do_dump(struct udev *udev, Formatter *f, TextTable *tbl) { struct udev_enumerate *enm; struct udev_list_entry *l; bool have_output = false; int r; enm = udev_enumerate_new(udev); if (!enm) return -ENOMEM; r = udev_enumerate_add_match_subsystem(enm, "rbd"); if (r < 0) goto out_enm; r = udev_enumerate_scan_devices(enm); if (r < 0) goto out_enm; udev_list_entry_foreach(l, udev_enumerate_get_list_entry(enm)) { struct udev_device *dev; dev = udev_device_new_from_syspath(udev, udev_list_entry_get_name(l)); if (dev) { have_output |= dump_one_image(f, tbl, dev); udev_device_unref(dev); } } r = have_output; out_enm: udev_enumerate_unref(enm); return r; } int dump_images(struct krbd_ctx *ctx, Formatter *f) { TextTable tbl; int r; if (f) { f->open_object_section("devices"); } else { tbl.define_column("id", TextTable::LEFT, TextTable::LEFT); tbl.define_column("pool", TextTable::LEFT, TextTable::LEFT); tbl.define_column("image", TextTable::LEFT, TextTable::LEFT); tbl.define_column("snap", TextTable::LEFT, TextTable::LEFT); tbl.define_column("device", TextTable::LEFT, TextTable::LEFT); } r = do_dump(ctx->udev, f, &tbl); if (f) { f->close_section(); f->flush(cout); } else { if (r > 0) cout << tbl; } return r; } extern "C" int krbd_create_from_context(rados_config_t cct, struct krbd_ctx **pctx) { struct krbd_ctx *ctx = new struct krbd_ctx(); ctx->cct = reinterpret_cast(cct); ctx->udev = udev_new(); if (!ctx->udev) { delete ctx; return -ENOMEM; } *pctx = ctx; return 0; } extern "C" void krbd_destroy(struct krbd_ctx *ctx) { if (!ctx) return; udev_unref(ctx->udev); delete ctx; } extern "C" int krbd_map(struct krbd_ctx *ctx, const char *pool, const char *image, const char *snap, const char *options, char **pdevnode) { string name; char *devnode; int r; r = map_image(ctx, pool, image, snap, options, &name); if (r < 0) return r; devnode = strdup(name.c_str()); if (!devnode) return -ENOMEM; *pdevnode = devnode; return r; } extern "C" int krbd_unmap(struct krbd_ctx *ctx, const char *devnode, const char *options) { return unmap_image(ctx, devnode, options); } extern "C" int krbd_unmap_by_spec(struct krbd_ctx *ctx, const char *pool, const char *image, const char *snap, const char *options) { return unmap_image(ctx, pool, image, snap, options); } int krbd_showmapped(struct krbd_ctx *ctx, Formatter *f) { return dump_images(ctx, f); }