X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=src%2Fceph%2Fsrc%2Fclient%2FClient.cc;fp=src%2Fceph%2Fsrc%2Fclient%2FClient.cc;h=1d9277a61b6ecfb1d5c01f83d2aedc4418600e3f;hb=812ff6ca9fcd3e629e49d4328905f33eee8ca3f5;hp=0000000000000000000000000000000000000000;hpb=15280273faafb77777eab341909a3f495cf248d9;p=stor4nfv.git

diff --git a/src/ceph/src/client/Client.cc b/src/ceph/src/client/Client.cc
new file mode 100644
index 0000000..1d9277a
--- /dev/null
+++ b/src/ceph/src/client/Client.cc
@@ -0,0 +1,13842 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+
+// unix-ey fs stuff
+#include <unistd.h>
+#include <sys/types.h>
+#include <time.h>
+#include <utime.h>
+#include <sys/stat.h>
+#include <sys/param.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <sys/utsname.h>
+#include <sys/uio.h>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/fusion/include/std_pair.hpp>
+
+#if defined(__FreeBSD__)
+#define XATTR_CREATE  0x1
+#define XATTR_REPLACE 0x2
+#else
+#include <sys/xattr.h>
+#endif
+
+#if defined(__linux__)
+#include <linux/falloc.h>
+#endif
+
+#include <sys/statvfs.h>
+
+#include "common/config.h"
+#include "common/version.h"
+
+// ceph stuff
+#include "messages/MClientSession.h"
+#include "messages/MClientReconnect.h"
+#include "messages/MClientRequest.h"
+#include "messages/MClientRequestForward.h"
+#include "messages/MClientReply.h"
+#include "messages/MClientCaps.h"
+#include "messages/MClientLease.h"
+#include "messages/MClientSnap.h"
+#include "messages/MCommandReply.h"
+#include "messages/MOSDMap.h"
+#include "messages/MClientQuota.h"
+#include "messages/MClientCapRelease.h"
+#include "messages/MMDSMap.h"
+#include "messages/MFSMap.h"
+#include "messages/MFSMapUser.h"
+
+#include "mon/MonClient.h"
+
+#include "mds/flock.h"
+#include "osd/OSDMap.h"
+#include "osdc/Filer.h"
+
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/perf_counters.h"
+#include "common/admin_socket.h"
+#include "common/errno.h"
+#include "include/str_list.h"
+
+#define dout_subsys ceph_subsys_client
+
+#include "include/lru.h"
+#include "include/compat.h"
+#include "include/stringify.h"
+
+#include "Client.h"
+#include "Inode.h"
+#include "Dentry.h"
+#include "Dir.h"
+#include "ClientSnapRealm.h"
+#include "Fh.h"
+#include "MetaSession.h"
+#include "MetaRequest.h"
+#include "ObjecterWriteback.h"
+#include "posix_acl.h"
+
+#include "include/assert.h"
+#include "include/stat.h"
+
+#include "include/cephfs/ceph_statx.h"
+
+#if HAVE_GETGROUPLIST
+#include <grp.h>
+#include <pwd.h>
+#include <unistd.h>
+#endif
+
+#undef dout_prefix
+#define dout_prefix *_dout << "client." << whoami << " "
+
+#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout
+
+// FreeBSD fails to define this
+#ifndef O_DSYNC
+#define O_DSYNC 0x0
+#endif
+// Darwin fails to define this
+#ifndef O_RSYNC
+#define O_RSYNC 0x0
+#endif
+
+#ifndef O_DIRECT
+#define O_DIRECT 0x0
+#endif
+
+#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
+
+void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
+{
+  Client *client = static_cast<Client*>(p);
+  client->flush_set_callback(oset);
+}
+
+
+// -------------
+
+Client::CommandHook::CommandHook(Client *client) :
+  m_client(client)
+{
+}
+
+bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
+                               std::string format, bufferlist& out)
+{
+  Formatter *f = Formatter::create(format);
+  f->open_object_section("result");
+  m_client->client_lock.Lock();
+  if (command == "mds_requests")
+    m_client->dump_mds_requests(f);
+  else if (command == "mds_sessions")
+    m_client->dump_mds_sessions(f);
+  else if (command == "dump_cache")
+    m_client->dump_cache(f);
+  else if (command == "kick_stale_sessions")
+    m_client->_kick_stale_sessions();
+  else if (command == "status")
+    m_client->dump_status(f);
+  else
+    assert(0 == "bad command registered");
+  m_client->client_lock.Unlock();
+  f->close_section();
+  f->flush(out);
+  delete f;
+  return true;
+}
+
+
+// -------------
+
+dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
+  : inode(in), offset(0), next_offset(2),
+    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
+    perms(perms)
+  { }
+
+void Client::_reset_faked_inos()
+{
+  ino_t start = 1024;
+  free_faked_inos.clear();
+  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
+  last_used_faked_ino = 0;
+  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
+}
+
+void Client::_assign_faked_ino(Inode *in)
+{
+  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
+  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
+    last_used_faked_ino = 0;
+    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
+  }
+  assert(it != free_faked_inos.end());
+  if (last_used_faked_ino < it.get_start()) {
+    assert(it.get_len() > 0);
+    last_used_faked_ino = it.get_start();
+  } else {
+    ++last_used_faked_ino;
+    assert(it.get_start() + it.get_len() > last_used_faked_ino);
+  }
+  in->faked_ino = last_used_faked_ino;
+  free_faked_inos.erase(in->faked_ino);
+  faked_ino_map[in->faked_ino] = in->vino();
+}
+
+void Client::_release_faked_ino(Inode *in)
+{
+  free_faked_inos.insert(in->faked_ino);
+  faked_ino_map.erase(in->faked_ino);
+}
+
+vinodeno_t Client::_map_faked_ino(ino_t ino)
+{
+  vinodeno_t vino;
+  if (ino == 1)
+    vino = root->vino();
+  else if (faked_ino_map.count(ino))
+    vino = faked_ino_map[ino];
+  else
+    vino = vinodeno_t(0, CEPH_NOSNAP);
+  ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
+  return vino;
+}
+
+vinodeno_t Client::map_faked_ino(ino_t ino)
+{
+  Mutex::Locker lock(client_lock);
+  return _map_faked_ino(ino);
+}
+
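+// Illustrative sketch (added for exposition; not part of the upstream
+// file): the faked-ino machinery above is a simple wrap-around allocator
+// over an interval_set.  Roughly:
+//
+//   interval_set<ino_t> free;
+//   free.insert(1024, (uint32_t)-1 - 1024 + 1);  // free = [1024, 2^32)
+//   // _assign_faked_ino(): take the first free ino at or after
+//   // last_used_faked_ino + 1, wrapping to the start when the high end
+//   // is exhausted; erase it from 'free' and record faked_ino -> vino.
+//   // _release_faked_ino(): return the ino to 'free' for reuse.
+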
+// cons/des
+
+Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
+  : Dispatcher(m->cct),
+    m_command_hook(this),
+    timer(m->cct, client_lock),
+    callback_handle(NULL),
+    switch_interrupt_cb(NULL),
+    remount_cb(NULL),
+    ino_invalidate_cb(NULL),
+    dentry_invalidate_cb(NULL),
+    getgroups_cb(NULL),
+    umask_cb(NULL),
+    can_invalidate_dentries(false),
+    require_remount(false),
+    async_ino_invalidator(m->cct),
+    async_dentry_invalidator(m->cct),
+    interrupt_finisher(m->cct),
+    remount_finisher(m->cct),
+    objecter_finisher(m->cct),
+    tick_event(NULL),
+    messenger(m), monclient(mc),
+    objecter(objecter_),
+    whoami(mc->get_global_id()), cap_epoch_barrier(0),
+    last_tid(0), oldest_tid(0), last_flush_tid(1),
+    initialized(false),
+    mounted(false), unmounting(false), blacklisted(false),
+    local_osd(-1), local_osd_epoch(0),
+    unsafe_sync_write(0),
+    client_lock("Client::client_lock")
+{
+  _reset_faked_inos();
+  //
+  root = 0;
+
+  num_flushing_caps = 0;
+
+  _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
+  _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);
+
+  user_id = cct->_conf->client_mount_uid;
+  group_id = cct->_conf->client_mount_gid;
+
+  acl_type = NO_ACL;
+  if (cct->_conf->client_acl_type == "posix_acl")
+    acl_type = POSIX_ACL;
+
+  lru.lru_set_midpoint(cct->_conf->client_cache_mid);
+
+  // file handles
+  free_fd_set.insert(10, 1<<30);
+
+  mdsmap.reset(new MDSMap);
+
+  // osd interfaces
+  writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
+                                                &client_lock));
+  objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
+                                      client_flush_set_callback,    // all commit callback
+                                      (void*)this,
+                                      cct->_conf->client_oc_size,
+                                      cct->_conf->client_oc_max_objects,
+                                      cct->_conf->client_oc_max_dirty,
+                                      cct->_conf->client_oc_target_dirty,
+                                      cct->_conf->client_oc_max_dirty_age,
+                                      true));
+  objecter_finisher.start();
+  filer.reset(new Filer(objecter, &objecter_finisher));
+  objecter->enable_blacklist_events();
+}
+
+
+Client::~Client()
+{
+  assert(!client_lock.is_locked());
+
+  // It is necessary to hold client_lock, because any inode destruction
+  // may call into ObjectCacher, which asserts that its lock (which is
+  // client_lock) is held.
+  client_lock.Lock();
+  tear_down_cache();
+  client_lock.Unlock();
+}
+
+void Client::tear_down_cache()
+{
+  // fd's
+  for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
+       it != fd_map.end();
+       ++it) {
+    Fh *fh = it->second;
+    ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
+    _release_fh(fh);
+  }
+  fd_map.clear();
+
+  while (!opened_dirs.empty()) {
+    dir_result_t *dirp = *opened_dirs.begin();
+    ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
+    _closedir(dirp);
+  }
+
+  // caps!
+  // *** FIXME ***
+
+  // empty lru
+  trim_cache();
+  assert(lru.lru_get_size() == 0);
+
+  // close root ino
+  assert(inode_map.size() <= 1 + root_parents.size());
+  if (root && inode_map.size() == 1 + root_parents.size()) {
+    delete root;
+    root = 0;
+    root_ancestor = 0;
+    while (!root_parents.empty())
+      root_parents.erase(root_parents.begin());
+    inode_map.clear();
+    _reset_faked_inos();
+  }
+
+  assert(inode_map.empty());
+}
+
+inodeno_t Client::get_root_ino()
+{
+  Mutex::Locker l(client_lock);
+  if (use_faked_inos())
+    return root->faked_ino;
+  else
+    return root->ino;
+}
+
+Inode *Client::get_root()
+{
+  Mutex::Locker l(client_lock);
+  root->ll_get();
+  return root;
+}
+
+
+// debug crapola
+
+void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
+{
+  filepath path;
+  in->make_long_path(path);
+  ldout(cct, 1) << "dump_inode: "
+                << (disconnected ? "DISCONNECTED " : "")
+                << "inode " << in->ino
+                << " " << path
+                << " ref " << in->get_num_ref()
+                << *in << dendl;
+
+  if (f) {
+    f->open_object_section("inode");
+    f->dump_stream("path") << path;
+    if (disconnected)
+      f->dump_int("disconnected", 1);
+    in->dump(f);
+    f->close_section();
+  }
+
+  did.insert(in);
+  if (in->dir) {
+    ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
+    for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
+         it != in->dir->dentries.end();
+         ++it) {
+      ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
+      if (f) {
+        f->open_object_section("dentry");
+        it->second->dump(f);
+        f->close_section();
+      }
+      if (it->second->inode)
+        dump_inode(f, it->second->inode.get(), did, false);
+    }
+  }
+}
+
+void Client::dump_cache(Formatter *f)
+{
+  set<Inode*> did;
+
+  ldout(cct, 1) << "dump_cache" << dendl;
+
+  if (f)
+    f->open_array_section("cache");
+
+  if (root)
+    dump_inode(f, root, did, true);
+
+  // make a second pass to catch anything disconnected
+  for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
+       it != inode_map.end();
+       ++it) {
+    if (did.count(it->second))
+      continue;
+    dump_inode(f, it->second, did, true);
+  }
+
+  if (f)
+    f->close_section();
+}
+
+void Client::dump_status(Formatter *f)
+{
+  assert(client_lock.is_locked_by_me());
+
+  ldout(cct, 1) << __func__ << dendl;
+
+  const epoch_t osd_epoch
+    = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
+
+  if (f) {
+    f->open_object_section("metadata");
+    for (const auto& kv : metadata)
+      f->dump_string(kv.first.c_str(), kv.second);
+    f->close_section();
+
+    f->dump_int("dentry_count", lru.lru_get_size());
+    f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
+    f->dump_int("id", get_nodeid().v);
+    f->dump_int("inode_count", inode_map.size());
+    f->dump_int("mds_epoch", mdsmap->get_epoch());
+    f->dump_int("osd_epoch", osd_epoch);
+    f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
+  }
+}
+
+int Client::init()
+{
+  timer.init();
+  objectcacher->start();
+
+  client_lock.Lock();
+  assert(!initialized);
+
+  messenger->add_dispatcher_tail(this);
+  client_lock.Unlock();
+
+  _finish_init();
+  return 0;
+}
+
+void Client::_finish_init()
+{
+  client_lock.Lock();
+  // logger
+  PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
+  plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
+  plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
+  plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
+  logger.reset(plb.create_perf_counters());
+  cct->get_perfcounters_collection()->add(logger.get());
+
+  client_lock.Unlock();
+
+  cct->_conf->add_observer(this);
+
+  AdminSocket* admin_socket = cct->get_admin_socket();
+  int ret = admin_socket->register_command("mds_requests",
+                                           "mds_requests",
+                                           &m_command_hook,
+                                           "show in-progress mds requests");
+  if (ret < 0) {
+    lderr(cct) << "error registering admin socket command: "
+               << cpp_strerror(-ret) << dendl;
+  }
+  ret = admin_socket->register_command("mds_sessions",
+                                       "mds_sessions",
+                                       &m_command_hook,
+                                       "show mds session state");
+  if (ret < 0) {
+    lderr(cct) << "error registering admin socket command: "
+               << cpp_strerror(-ret) << dendl;
+  }
+  ret = admin_socket->register_command("dump_cache",
+                                       "dump_cache",
+                                       &m_command_hook,
+                                       "show in-memory metadata cache contents");
+  if (ret < 0) {
+    lderr(cct) << "error registering admin socket command: "
+               << cpp_strerror(-ret) << dendl;
+  }
+  ret = admin_socket->register_command("kick_stale_sessions",
+                                       "kick_stale_sessions",
+                                       &m_command_hook,
+                                       "kick sessions that were remote reset");
+  if (ret < 0) {
+    lderr(cct) << "error registering admin socket command: "
+               << cpp_strerror(-ret) << dendl;
+  }
+  ret = admin_socket->register_command("status",
+                                       "status",
+                                       &m_command_hook,
+                                       "show overall client status");
+  if (ret < 0) {
+    lderr(cct) << "error registering admin socket command: "
+               << cpp_strerror(-ret) << dendl;
+  }
+
+  client_lock.Lock();
+  initialized = true;
+  client_lock.Unlock();
+}
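+
+// Usage note (added for exposition): the commands registered in
+// _finish_init() are served through the client's admin socket, and with a
+// running client can typically be invoked as, e.g.:
+//
+//   ceph daemon /var/run/ceph/ceph-client.<id>.asok mds_requests
+//   ceph daemon /var/run/ceph/ceph-client.<id>.asok dump_cache
+//
+// (the socket path depends on the admin_socket configuration).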
+
+void Client::shutdown()
+{
+  ldout(cct, 1) << "shutdown" << dendl;
+
+  // If we were not mounted, but were being used for sending
+  // MDS commands, we may have sessions that need closing.
+  client_lock.Lock();
+  _close_sessions();
+  client_lock.Unlock();
+
+  cct->_conf->remove_observer(this);
+
+  AdminSocket* admin_socket = cct->get_admin_socket();
+  admin_socket->unregister_command("mds_requests");
+  admin_socket->unregister_command("mds_sessions");
+  admin_socket->unregister_command("dump_cache");
+  admin_socket->unregister_command("kick_stale_sessions");
+  admin_socket->unregister_command("status");
+
+  if (ino_invalidate_cb) {
+    ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
+    async_ino_invalidator.wait_for_empty();
+    async_ino_invalidator.stop();
+  }
+
+  if (dentry_invalidate_cb) {
+    ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
+    async_dentry_invalidator.wait_for_empty();
+    async_dentry_invalidator.stop();
+  }
+
+  if (switch_interrupt_cb) {
+    ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
+    interrupt_finisher.wait_for_empty();
+    interrupt_finisher.stop();
+  }
+
+  if (remount_cb) {
+    ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
+    remount_finisher.wait_for_empty();
+    remount_finisher.stop();
+  }
+
+  objectcacher->stop();  // outside of client_lock! this does a join.
+
+  client_lock.Lock();
+  assert(initialized);
+  initialized = false;
+  timer.shutdown();
+  client_lock.Unlock();
+
+  objecter_finisher.wait_for_empty();
+  objecter_finisher.stop();
+
+  if (logger) {
+    cct->get_perfcounters_collection()->remove(logger.get());
+    logger.reset();
+  }
+}
+
+
+// ===================
+// metadata cache stuff
+
+void Client::trim_cache(bool trim_kernel_dcache)
+{
+  uint64_t max = cct->_conf->client_cache_size;
+  ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
+  unsigned last = 0;
+  while (lru.lru_get_size() != last) {
+    last = lru.lru_get_size();
+
+    if (!unmounting && lru.lru_get_size() <= max)  break;
+
+    // trim!
+    Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
+    if (!dn)
+      break;  // done
+
+    trim_dentry(dn);
+  }
+
+  if (trim_kernel_dcache && lru.lru_get_size() > max)
+    _invalidate_kernel_dcache();
+
+  // hose root?
+  if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
+    ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
+    delete root;
+    root = 0;
+    root_ancestor = 0;
+    while (!root_parents.empty())
+      root_parents.erase(root_parents.begin());
+    inode_map.clear();
+    _reset_faked_inos();
+  }
+}
+
+void Client::trim_cache_for_reconnect(MetaSession *s)
+{
+  mds_rank_t mds = s->mds_num;
+  ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
+
+  int trimmed = 0;
+  list<Dentry*> skipped;
+  while (lru.lru_get_size() > 0) {
+    Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
+    if (!dn)
+      break;
+
+    if ((dn->inode && dn->inode->caps.count(mds)) ||
+        dn->dir->parent_inode->caps.count(mds)) {
+      trim_dentry(dn);
+      trimmed++;
+    } else
+      skipped.push_back(dn);
+  }
+
+  for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
+    lru.lru_insert_mid(*p);
+
+  ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
+                 << " trimmed " << trimmed << " dentries" << dendl;
+
+  if (s->caps.size() > 0)
+    _invalidate_kernel_dcache();
+}
+
+void Client::trim_dentry(Dentry *dn)
+{
+  ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
+                 << " in dir " << hex << dn->dir->parent_inode->ino
+                 << dendl;
+  if (dn->inode) {
+    Inode *diri = dn->dir->parent_inode;
+    diri->dir_release_count++;
+    clear_dir_complete_and_ordered(diri, true);
+  }
+  unlink(dn, false, false);  // drop dir, drop dentry
+}
+
+
+void Client::update_inode_file_bits(Inode *in,
+                                    uint64_t truncate_seq, uint64_t truncate_size,
+                                    uint64_t size, uint64_t change_attr,
+                                    uint64_t time_warp_seq, utime_t ctime,
+                                    utime_t mtime,
+                                    utime_t atime,
+                                    version_t inline_version,
+                                    bufferlist& inline_data,
+                                    int issued)
+{
+  bool warn = false;
+  ldout(cct, 10) << "update_inode_file_bits " << *in << " " << ccap_string(issued)
+                 << " mtime " << mtime << dendl;
+  ldout(cct, 25) << "truncate_seq: mds " << truncate_seq << " local "
+                 << in->truncate_seq << " time_warp_seq: mds " << time_warp_seq
+                 << " local " << in->time_warp_seq << dendl;
+  uint64_t prior_size = in->size;
+
+  if (inline_version > in->inline_version) {
+    in->inline_data = inline_data;
+    in->inline_version = inline_version;
+  }
+
+  /* always take a newer change attr */
+  if (change_attr > in->change_attr)
+    in->change_attr = change_attr;
+
+  if (truncate_seq > in->truncate_seq ||
+      (truncate_seq == in->truncate_seq && size > in->size)) {
+    ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
+    in->size = size;
+    in->reported_size = size;
+    if (truncate_seq != in->truncate_seq) {
+      ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
+                     << truncate_seq << dendl;
+      in->truncate_seq = truncate_seq;
+      in->oset.truncate_seq = truncate_seq;
+
+      // truncate cached file data
+      if (prior_size > size) {
+        _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
+      }
+    }
+
+    // truncate inline data
+    if (in->inline_version < CEPH_INLINE_NONE) {
+      uint32_t len = in->inline_data.length();
+      if (size < len)
+        in->inline_data.splice(size, len - size);
+    }
+  }
+  if (truncate_seq >= in->truncate_seq &&
+      in->truncate_size != truncate_size) {
+    if (in->is_file()) {
+      ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
+                     << truncate_size << dendl;
+      in->truncate_size = truncate_size;
+      in->oset.truncate_size = truncate_size;
+    } else {
+      ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
+    }
+  }
+
+  // be careful with size, mtime, atime
+  if (issued & (CEPH_CAP_FILE_EXCL|
+                CEPH_CAP_FILE_WR|
+                CEPH_CAP_FILE_BUFFER|
+                CEPH_CAP_AUTH_EXCL|
+                CEPH_CAP_XATTR_EXCL)) {
+    ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
+    if (ctime > in->ctime)
+      in->ctime = ctime;
+    if (time_warp_seq > in->time_warp_seq) {
+      ldout(cct, 10) << "mds time_warp_seq " << time_warp_seq << " on inode " << *in
+                     << " is higher than local time_warp_seq "
+                     << in->time_warp_seq << dendl;
+      //the mds updated times, so take those!
+      in->mtime = mtime;
+      in->atime = atime;
+      in->time_warp_seq = time_warp_seq;
+    } else if (time_warp_seq == in->time_warp_seq) {
+      //take max times
+      if (mtime > in->mtime)
+        in->mtime = mtime;
+      if (atime > in->atime)
+        in->atime = atime;
+    } else if (issued & CEPH_CAP_FILE_EXCL) {
+      //ignore mds values as we have a higher seq
+    } else warn = true;
+  } else {
+    ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
+    if (time_warp_seq >= in->time_warp_seq) {
+      in->ctime = ctime;
+      in->mtime = mtime;
+      in->atime = atime;
+      in->time_warp_seq = time_warp_seq;
+    } else warn = true;
+  }
+  if (warn) {
+    ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
+                  << time_warp_seq << " is lower than local time_warp_seq "
+                  << in->time_warp_seq
+                  << dendl;
+  }
+}
+
+void Client::_fragmap_remove_non_leaves(Inode *in)
+{
+  for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
+    if (!in->dirfragtree.is_leaf(p->first))
+      in->fragmap.erase(p++);
+    else
+      ++p;
+}
+
+void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
+{
+  for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
+    if (p->second == mds)
+      in->fragmap.erase(p++);
+    else
+      ++p;
+}
+
+Inode * Client::add_update_inode(InodeStat *st, utime_t from,
+                                 MetaSession *session,
+                                 const UserPerm& request_perms)
+{
+  Inode *in;
+  bool was_new = false;
+  if (inode_map.count(st->vino)) {
+    in = inode_map[st->vino];
+    ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
+  } else {
+    in = new Inode(this, st->vino, &st->layout);
+    inode_map[st->vino] = in;
+
+    if (use_faked_inos())
+      _assign_faked_ino(in);
+
+    if (!root) {
+      root = in;
+      root_ancestor = in;
+      cwd = root;
+    } else if (!mounted) {
+      root_parents[root_ancestor] = in;
+      root_ancestor = in;
+    }
+
+    // immutable bits
+    in->ino = st->vino.ino;
+    in->snapid = st->vino.snapid;
+    in->mode = st->mode & S_IFMT;
+    was_new = true;
+  }
+
+  in->rdev = st->rdev;
+  if (in->is_symlink())
+    in->symlink = st->symlink;
+
+  if (was_new)
+    ldout(cct, 12) << "add_update_inode adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
+
+  if (!st->cap.caps)
+    return in;   // as with readdir returning inodes in different snaprealms (no caps!)
+
+  // only update inode if mds info is strictly newer, or it is the same and projected (odd).
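+  // Explanatory note (added for exposition): the low bit of the cached
+  // version marks a projected (in-flight) value, so masking it off in the
+  // test below makes an equal-but-projected local version compare as
+  // older, and the incoming MDS stat is applied in that case as well.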
+  bool updating_inode = false;
+  int issued = 0;
+  if (st->version == 0 ||
+      (in->version & ~1) < st->version) {
+    updating_inode = true;
+
+    int implemented = 0;
+    issued = in->caps_issued(&implemented) | in->caps_dirty();
+    issued |= implemented;
+
+    in->version = st->version;
+
+    if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+      in->mode = st->mode;
+      in->uid = st->uid;
+      in->gid = st->gid;
+      in->btime = st->btime;
+    }
+
+    if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
+      in->nlink = st->nlink;
+    }
+
+    in->dirstat = st->dirstat;
+    in->rstat = st->rstat;
+    in->quota = st->quota;
+    in->layout = st->layout;
+
+    if (in->is_dir()) {
+      in->dir_layout = st->dir_layout;
+      ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
+    }
+
+    update_inode_file_bits(in, st->truncate_seq, st->truncate_size, st->size,
+                           st->change_attr, st->time_warp_seq, st->ctime,
+                           st->mtime, st->atime, st->inline_version,
+                           st->inline_data, issued);
+  } else if (st->inline_version > in->inline_version) {
+    in->inline_data = st->inline_data;
+    in->inline_version = st->inline_version;
+  }
+
+  if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
+      st->xattrbl.length() &&
+      st->xattr_version > in->xattr_version) {
+    bufferlist::iterator p = st->xattrbl.begin();
+    ::decode(in->xattrs, p);
+    in->xattr_version = st->xattr_version;
+  }
+
+  // move me if/when version reflects fragtree changes.
+  if (in->dirfragtree != st->dirfragtree) {
+    in->dirfragtree = st->dirfragtree;
+    _fragmap_remove_non_leaves(in);
+  }
+
+  if (in->snapid == CEPH_NOSNAP) {
+    add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
+                   st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
+                   request_perms);
+    if (in->auth_cap && in->auth_cap->session == session)
+      in->max_size = st->max_size;
+  } else
+    in->snap_caps |= st->cap.caps;
+
+  // setting I_COMPLETE needs to happen after adding the cap
+  if (updating_inode &&
+      in->is_dir() &&
+      (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
+      (issued & CEPH_CAP_FILE_EXCL) == 0 &&
+      in->dirstat.nfiles == 0 &&
+      in->dirstat.nsubdirs == 0) {
+    ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
+    in->flags |= I_COMPLETE | I_DIR_ORDERED;
+    if (in->dir) {
+      ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
+                     << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
+      in->dir->readdir_cache.clear();
+      for (auto p = in->dir->dentries.begin();
+           p != in->dir->dentries.end();
+           ++p) {
+        unlink(p->second, true, true);  // keep dir, keep dentry
+      }
+      if (in->dir->dentries.empty())
+        close_dir(in->dir);
+    }
+  }
+
+  return in;
+}
+
+
+/*
+ * insert_dentry_inode - insert + link a single dentry + inode into the metadata cache.
+ */
+Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
+                                    Inode *in, utime_t from, MetaSession *session,
+                                    Dentry *old_dentry)
+{
+  Dentry *dn = NULL;
+  if (dir->dentries.count(dname))
+    dn = dir->dentries[dname];
+
+  ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
+                 << " in dir " << dir->parent_inode->vino() << " dn " << dn
+                 << dendl;
+
+  if (dn && dn->inode) {
+    if (dn->inode->vino() == in->vino()) {
+      touch_dn(dn);
+      ldout(cct, 12) << " had dentry " << dname
+                     << " with correct vino " << dn->inode->vino()
+                     << dendl;
+    } else {
+      ldout(cct, 12) << " had dentry " << dname
+                     << " with WRONG vino " << dn->inode->vino()
+                     << dendl;
+      unlink(dn, true, true);  // keep dir, keep dentry
+    }
+  }
+
+  if (!dn || !dn->inode) {
+    InodeRef tmp_ref(in);
+    if (old_dentry) {
+      if (old_dentry->dir != dir) {
+        Inode *old_diri = old_dentry->dir->parent_inode;
+        old_diri->dir_ordered_count++;
+        clear_dir_complete_and_ordered(old_diri, false);
+      }
+      unlink(old_dentry, dir == old_dentry->dir, false);  // drop dentry, keep dir open if it's the same dir
+    }
+    Inode *diri = dir->parent_inode;
+    diri->dir_ordered_count++;
+    clear_dir_complete_and_ordered(diri, false);
+    dn = link(dir, dname, in, dn);
+  }
+
+  update_dentry_lease(dn, dlease, from, session);
+  return dn;
+}
+
+void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
+{
+  utime_t dttl = from;
+  dttl += (float)dlease->duration_ms / 1000.0;
+
+  assert(dn);
+
+  if (dlease->mask & CEPH_LOCK_DN) {
+    if (dttl > dn->lease_ttl) {
+      ldout(cct, 10) << "got dentry lease on " << dn->name
+                     << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
+      dn->lease_ttl = dttl;
+      dn->lease_mds = session->mds_num;
+      dn->lease_seq = dlease->seq;
+      dn->lease_gen = session->cap_gen;
+    }
+  }
+  dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
+}
+
+
+/*
+ * update MDS location cache for a single inode
+ */
+void Client::update_dir_dist(Inode *in, DirStat *dst)
+{
+  // auth
+  ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
+  if (dst->auth >= 0) {
+    in->fragmap[dst->frag] = dst->auth;
+  } else {
+    in->fragmap.erase(dst->frag);
+  }
+  if (!in->dirfragtree.is_leaf(dst->frag)) {
+    in->dirfragtree.force_to_leaf(cct, dst->frag);
+    _fragmap_remove_non_leaves(in);
+  }
+
+  // replicated
+  in->dir_replicated = !dst->dist.empty();  // FIXME that's just one frag!
+
+  // dist
+  /*
+  if (!st->dirfrag_dist.empty()) {   // FIXME
+    set<int> dist = st->dirfrag_dist.begin()->second;
+    if (dist.empty() && !in->dir_contacts.empty())
+      ldout(cct, 9) << "lost dist spec for " << in->ino
+               << " " << dist << dendl;
+    if (!dist.empty() && in->dir_contacts.empty())
+      ldout(cct, 9) << "got dist spec for " << in->ino
+               << " " << dist << dendl;
+    in->dir_contacts = dist;
+  }
+  */
+}
+
+void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
+{
+  if (diri->flags & I_COMPLETE) {
+    if (complete) {
+      ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
+      diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
+    } else {
+      if (diri->flags & I_DIR_ORDERED) {
+        ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
+        diri->flags &= ~I_DIR_ORDERED;
+      }
+    }
+    if (diri->dir)
+      diri->dir->readdir_cache.clear();
+  }
+}
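+
+// Explanatory note (added for exposition): I_COMPLETE means every dentry
+// of the directory is cached (readdir can be satisfied locally), and
+// I_DIR_ORDERED additionally means the cached dentries still reflect the
+// server-side readdir order, making Dir::readdir_cache usable.  Callers
+// bump dir_release_count and pass complete=true for content changes
+// (clearing both flags), or bump dir_ordered_count and pass
+// complete=false for changes that only perturb ordering.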
+
+/*
+ * insert results from readdir or lssnap into the metadata cache.
+ */
+void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
+
+  MClientReply *reply = request->reply;
+  ConnectionRef con = request->reply->get_connection();
+  uint64_t features = con->get_features();
+
+  dir_result_t *dirp = request->dirp;
+  assert(dirp);
+
+  // the extra buffer list is only set for readdir and lssnap replies
+  bufferlist::iterator p = reply->get_extra_bl().begin();
+  if (!p.end()) {
+    // snapdir?
+    if (request->head.op == CEPH_MDS_OP_LSSNAP) {
+      assert(diri);
+      diri = open_snapdir(diri);
+    }
+
+    // only open dir if we're actually adding stuff to it!
+    Dir *dir = diri->open_dir();
+    assert(dir);
+
+    // dirstat
+    DirStat dst(p);
+    __u32 numdn;
+    __u16 flags;
+    ::decode(numdn, p);
+    ::decode(flags, p);
+
+    bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
+    bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
+
+    frag_t fg = (unsigned)request->head.args.readdir.frag;
+    unsigned readdir_offset = dirp->next_offset;
+    string readdir_start = dirp->last_name;
+    assert(!readdir_start.empty() || readdir_offset == 2);
+
+    unsigned last_hash = 0;
+    if (hash_order) {
+      if (!readdir_start.empty()) {
+        last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
+      } else if (flags & CEPH_READDIR_OFFSET_HASH) {
+        /* mds understands offset_hash */
+        last_hash = (unsigned)request->head.args.readdir.offset_hash;
+      }
+    }
+
+    if (fg != dst.frag) {
+      ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
+      fg = dst.frag;
+      if (!hash_order) {
+        readdir_offset = 2;
+        readdir_start.clear();
+        dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
+      }
+    }
+
+    ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
+                   << ", hash_order=" << hash_order
+                   << ", readdir_start " << readdir_start
+                   << ", last_hash " << last_hash
+                   << ", next_offset " << readdir_offset << dendl;
+
+    if (diri->snapid != CEPH_SNAPDIR &&
+        fg.is_leftmost() && readdir_offset == 2 &&
+        !(hash_order && last_hash)) {
+      dirp->release_count = diri->dir_release_count;
+      dirp->ordered_count = diri->dir_ordered_count;
+      dirp->start_shared_gen = diri->shared_gen;
+      dirp->cache_index = 0;
+    }
+
+    dirp->buffer_frag = fg;
+
+    _readdir_drop_dirp_buffer(dirp);
+    dirp->buffer.reserve(numdn);
+
+    string dname;
+    LeaseStat dlease;
+    for (unsigned i=0; i<numdn; i++) {
+      ::decode(dname, p);
+      ::decode(dlease, p);
+      InodeStat ist(p, features);
+
+      Inode *in = add_update_inode(&ist, request->sent_stamp, session,
+                                   request->perms);
+      Dentry *dn;
+      if (diri->dir->dentries.count(dname)) {
+        Dentry *olddn = diri->dir->dentries[dname];
+        if (olddn->inode != in) {
+          // replace incorrect dentry
+          unlink(olddn, true, true);  // keep dir, dentry
+          dn = link(dir, dname, in, olddn);
+          assert(dn == olddn);
+        } else {
+          // keep existing dn
+          dn = olddn;
+          touch_dn(dn);
+        }
+      } else {
+        // new dn
+        dn = link(dir, dname, in, NULL);
+      }
+
+      update_dentry_lease(dn, &dlease, request->sent_stamp, session);
+      if (hash_order) {
+        unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
+        if (hash != last_hash)
+          readdir_offset = 2;
+        last_hash = hash;
+        dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
+      } else {
+        dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
+      }
+      // add to readdir cache
+      if (dirp->release_count == diri->dir_release_count &&
+          dirp->ordered_count == diri->dir_ordered_count &&
+          dirp->start_shared_gen == diri->shared_gen) {
+        if (dirp->cache_index == dir->readdir_cache.size()) {
+          if (i == 0) {
+            assert(!dirp->inode->is_complete_and_ordered());
+            dir->readdir_cache.reserve(dirp->cache_index + numdn);
+          }
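+          // Explanatory note (added for exposition): cache_index sits
+          // exactly at the end of readdir_cache here, so the dentry is
+          // appended; the branch below overwrites (or, when the dir is
+          // already complete+ordered, merely verifies) an entry when the
+          // index lands inside the vector, and an index past the end
+          // would mean cursor and cache are out of step -- a bug.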
+          dir->readdir_cache.push_back(dn);
+        } else if (dirp->cache_index < dir->readdir_cache.size()) {
+          if (dirp->inode->is_complete_and_ordered())
+            assert(dir->readdir_cache[dirp->cache_index] == dn);
+          else
+            dir->readdir_cache[dirp->cache_index] = dn;
+        } else {
+          assert(0 == "unexpected readdir buffer idx");
+        }
+        dirp->cache_index++;
+      }
+      // add to cached result list
+      dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
+      ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
+    }
+
+    if (numdn > 0)
+      dirp->last_name = dname;
+    if (end)
+      dirp->next_offset = 2;
+    else
+      dirp->next_offset = readdir_offset;
+
+    if (dir->is_empty())
+      close_dir(dir);
+  }
+}
+
+/** insert_trace
+ *
+ * insert a trace from a MDS reply into the cache.
+ */
+Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
+{
+  MClientReply *reply = request->reply;
+  int op = request->get_op();
+
+  ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
+                 << " is_target=" << (int)reply->head.is_target
+                 << " is_dentry=" << (int)reply->head.is_dentry
+                 << dendl;
+
+  bufferlist::iterator p = reply->get_trace_bl().begin();
+  if (request->got_unsafe) {
+    ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
+    assert(p.end());
+    return NULL;
+  }
+
+  if (p.end()) {
+    ldout(cct, 10) << "insert_trace -- no trace" << dendl;
+
+    Dentry *d = request->dentry();
+    if (d) {
+      Inode *diri = d->dir->parent_inode;
+      diri->dir_release_count++;
+      clear_dir_complete_and_ordered(diri, true);
+    }
+
+    if (d && reply->get_result() == 0) {
+      if (op == CEPH_MDS_OP_RENAME) {
+        // rename
+        Dentry *od = request->old_dentry();
+        ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
+        assert(od);
+        unlink(od, true, true);  // keep dir, dentry
+      } else if (op == CEPH_MDS_OP_RMDIR ||
+                 op == CEPH_MDS_OP_UNLINK) {
+        // unlink, rmdir
+        ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
+        unlink(d, true, true);  // keep dir, dentry
+      }
+    }
+    return NULL;
+  }
+
+  ConnectionRef con = request->reply->get_connection();
+  uint64_t features = con->get_features();
+  ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;
+
+  // snap trace
+  SnapRealm *realm = NULL;
+  if (reply->snapbl.length())
+    update_snap_trace(reply->snapbl, &realm);
+
+  ldout(cct, 10) << " hrm "
+                 << " is_target=" << (int)reply->head.is_target
+                 << " is_dentry=" << (int)reply->head.is_dentry
+                 << dendl;
+
+  InodeStat dirst;
+  DirStat dst;
+  string dname;
+  LeaseStat dlease;
+  InodeStat ist;
+
+  if (reply->head.is_dentry) {
+    dirst.decode(p, features);
+    dst.decode(p);
+    ::decode(dname, p);
+    ::decode(dlease, p);
+  }
+
+  Inode *in = 0;
+  if (reply->head.is_target) {
+    ist.decode(p, features);
+    if (cct->_conf->client_debug_getattr_caps) {
+      unsigned wanted = 0;
+      if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
+        wanted = request->head.args.getattr.mask;
+      else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
+        wanted = request->head.args.open.mask;
+
+      if ((wanted & CEPH_CAP_XATTR_SHARED) &&
+          !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
+        assert(0 == "MDS reply does not contain xattrs");
+    }
+
+    in = add_update_inode(&ist, request->sent_stamp, session,
+                          request->perms);
+  }
+
+  Inode *diri = NULL;
+  if (reply->head.is_dentry) {
+    diri = add_update_inode(&dirst, request->sent_stamp, session,
+                            request->perms);
+    update_dir_dist(diri, &dst);  // dir stat info is attached to ..
+
+    if (in) {
+      Dir *dir = diri->open_dir();
+      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
+                          (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
+    } else {
+      Dentry *dn = NULL;
+      if (diri->dir && diri->dir->dentries.count(dname)) {
+        dn = diri->dir->dentries[dname];
+        if (dn->inode) {
+          diri->dir_ordered_count++;
+          clear_dir_complete_and_ordered(diri, false);
+          unlink(dn, true, true);  // keep dir, dentry
+        }
+      }
+      if (dlease.duration_ms > 0) {
+        if (!dn) {
+          Dir *dir = diri->open_dir();
+          dn = link(dir, dname, NULL, NULL);
+        }
+        update_dentry_lease(dn, &dlease, request->sent_stamp, session);
+      }
+    }
+  } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
+             op == CEPH_MDS_OP_MKSNAP) {
+    ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
+    // fake it for snap lookup
+    vinodeno_t vino = ist.vino;
+    vino.snapid = CEPH_SNAPDIR;
+    assert(inode_map.count(vino));
+    diri = inode_map[vino];
+
+    string dname = request->path.last_dentry();
+
+    LeaseStat dlease;
+    dlease.duration_ms = 0;
+
+    if (in) {
+      Dir *dir = diri->open_dir();
+      insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
+    } else {
+      if (diri->dir && diri->dir->dentries.count(dname)) {
+        Dentry *dn = diri->dir->dentries[dname];
+        if (dn->inode)
+          unlink(dn, true, true);  // keep dir, dentry
+      }
+    }
+  }
+
+  if (in) {
+    if (op == CEPH_MDS_OP_READDIR ||
+        op == CEPH_MDS_OP_LSSNAP) {
+      insert_readdir_results(request, session, in);
+    } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
+      // hack: return parent inode instead
+      in = diri;
+    }
+
+    if (request->dentry() == NULL && in != request->inode()) {
+      // pin the target inode if its parent dentry is not pinned
+      request->set_other_inode(in);
+    }
+  }
+
+  if (realm)
+    put_snap_realm(realm);
+
+  request->target = in;
+  return in;
+}
+
+// -------
+
+mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
+{
+  mds_rank_t mds = MDS_RANK_NONE;
+  __u32 hash = 0;
+  bool is_hash = false;
+
+  Inode *in = NULL;
+  Dentry *de = NULL;
+  Cap *cap = NULL;
+
+  if (req->resend_mds >= 0) {
+    mds = req->resend_mds;
+    req->resend_mds = -1;
+    ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
+    goto out;
+  }
+
+  if (cct->_conf->client_use_random_mds)
+    goto random_mds;
+
+  in = req->inode();
+  de = req->dentry();
+  if (in) {
+    ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
+    if (req->path.depth()) {
+      hash = in->hash_dentry_name(req->path[0]);
+      ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
+                     << " on " << req->path[0]
+                     << " => " << hash << dendl;
+      is_hash = true;
+    }
+  } else if (de) {
+    if (de->inode) {
+      in = de->inode.get();
+      ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
+    } else {
+      in = de->dir->parent_inode;
+      hash = in->hash_dentry_name(de->name);
+      ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
+                     << " on " << de->name
+                     << " => " << hash << dendl;
+      is_hash = true;
+    }
+  }
+  if (in) {
+    if (in->snapid != CEPH_NOSNAP) {
+      ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
+      while (in->snapid != CEPH_NOSNAP) {
+        if (in->snapid == CEPH_SNAPDIR)
+          in = in->snapdir_parent.get();
+        else if (!in->dn_set.empty())
+          /* In most cases there will only be one dentry, so getting it
+           * will be the correct action. If there are multiple hard links,
+           * I think the MDS should be able to redirect as needed. */
+          in = in->get_first_parent()->dir->parent_inode;
+        else {
+          ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
+          break;
+        }
+      }
+      is_hash = false;
+    }
+
+    ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
+                   << " hash=" << hash << dendl;
+
+    if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
+      frag_t fg = in->dirfragtree[hash];
+      if (in->fragmap.count(fg)) {
+        mds = in->fragmap[fg];
+        if (phash_diri)
+          *phash_diri = in;
+        ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
+        goto out;
+      }
+    }
+
+    if (req->auth_is_best())
+      cap = in->auth_cap;
+    if (!cap && !in->caps.empty())
+      cap = in->caps.begin()->second;
+    if (!cap)
+      goto random_mds;
+    mds = cap->session->mds_num;
+    ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;
+
+    goto out;
+  }
+
+random_mds:
+  if (mds < 0) {
+    mds = _get_random_up_mds();
+    ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
+  }
+
+out:
+  ldout(cct, 20) << "mds is " << mds << dendl;
+  return mds;
+}
+
+
+void Client::connect_mds_targets(mds_rank_t mds)
+{
+  ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
+  assert(mds_sessions.count(mds));
+  const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
+  for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
+       q != info.export_targets.end();
+       ++q) {
+    if (mds_sessions.count(*q) == 0 &&
+        mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
+      ldout(cct, 10) << "check_mds_sessions opening mds." << mds
+                     << " export target mds." << *q << dendl;
+      _open_mds_session(*q);
+    }
+  }
+}
+
+void Client::dump_mds_sessions(Formatter *f)
+{
+  f->dump_int("id", get_nodeid().v);
+  f->open_array_section("sessions");
+  for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
+    f->open_object_section("session");
+    p->second->dump(f);
+    f->close_section();
+  }
+  f->close_section();
+  f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
+}
+void Client::dump_mds_requests(Formatter *f)
+{
+  for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
+       p != mds_requests.end();
+       ++p) {
+    f->open_object_section("request");
+    p->second->dump(f);
+    f->close_section();
+  }
+}
+
+int Client::verify_reply_trace(int r,
+                               MetaRequest *request, MClientReply *reply,
+                               InodeRef *ptarget, bool *pcreated,
+                               const UserPerm& perms)
+{
+  // check whether this request actually did the create, and set created flag
+  bufferlist extra_bl;
+  inodeno_t created_ino;
+  bool got_created_ino = false;
+  ceph::unordered_map<vinodeno_t, Inode*>::iterator p;
+
+  extra_bl.claim(reply->get_extra_bl());
+  if (extra_bl.length() >= 8) {
+    // if the extra bufferlist has a buffer, we assume its the created inode
+    // and that this request to create succeeded in actually creating
+    // the inode (won the race with other create requests)
+    ::decode(created_ino, extra_bl);
+    got_created_ino = true;
+    ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
+  }
+
+  if (pcreated)
+    *pcreated = got_created_ino;
+
+  if (request->target) {
+    *ptarget = request->target;
+    ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
+  } else {
+    if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
+      (*ptarget) = p->second;
+      ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
+    } else {
+      // we got a traceless reply, and need to look up what we just
+      // created. for now, do this by name.  someday, do this by the
+      // ino... which we know!  FIXME.
+      InodeRef target;
+      Dentry *d = request->dentry();
+      if (d) {
+        if (d->dir) {
+          ldout(cct, 10) << "make_request got traceless reply, looking up #"
+                         << d->dir->parent_inode->ino << "/" << d->name
+                         << " got_ino " << got_created_ino
+                         << " ino " << created_ino
+                         << dendl;
+          r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
+                         &target, perms);
+        } else {
+          // if the dentry is not linked, just do our best. see #5021.
+          assert(0 == "how did this happen? i want logs!");
+        }
+      } else {
+        Inode *in = request->inode();
+        ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
+                       << in->ino << dendl;
+        r = _getattr(in, request->regetattr_mask, perms, true);
+        target = in;
+      }
+      if (r >= 0) {
+        // verify ino returned in reply and trace_dist are the same
+        if (got_created_ino &&
+            created_ino.val != target->ino.val) {
+          ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
+          r = -EINTR;
+        }
+        if (ptarget)
+          ptarget->swap(target);
+      }
+    }
+  }
+
+  return r;
+}
+
+
+/**
+ * make a request
+ *
+ * Blocking helper to make an MDS request.
+ *
+ * If the ptarget flag is set, behavior changes slightly: the caller
+ * expects to get a pointer to the inode we are creating or operating
+ * on.  As a result, we will follow up any traceless mutation reply
+ * with a getattr or lookup to transparently handle a traceless reply
+ * from the MDS (as when the MDS restarts and the client has to replay
+ * a request).
+ *
+ * @param request the MetaRequest to execute
+ * @param perms The user uid/gid to execute as (eventually, full group lists?)
+ * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
+ * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
+ * @param use_mds [optional] prefer a specific mds (-1 for default)
+ * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
+ */
+int Client::make_request(MetaRequest *request,
+                         const UserPerm& perms,
+                         InodeRef *ptarget, bool *pcreated,
+                         mds_rank_t use_mds,
+                         bufferlist *pdirbl)
+{
+  int r = 0;
+
+  // assign a unique tid
+  ceph_tid_t tid = ++last_tid;
+  request->set_tid(tid);
+
+  // and timestamp
+  request->op_stamp = ceph_clock_now();
+
+  // make note
+  mds_requests[tid] = request->get();
+  if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
+    oldest_tid = tid;
+
+  request->set_caller_perms(perms);
+
+  if (cct->_conf->client_inject_fixed_oldest_tid) {
+    ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
+    request->set_oldest_client_tid(1);
+  } else {
+    request->set_oldest_client_tid(oldest_tid);
+  }
+
+  // hack target mds?
+  if (use_mds >= 0)
+    request->resend_mds = use_mds;
+
+  while (1) {
+    if (request->aborted())
+      break;
+
+    if (blacklisted) {
+      request->abort(-EBLACKLISTED);
+      break;
+    }
+
+    // set up wait cond
+    Cond caller_cond;
+    request->caller_cond = &caller_cond;
+
+    // choose mds
+    Inode *hash_diri = NULL;
+    mds_rank_t mds = choose_target_mds(request, &hash_diri);
+    int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
+    if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
+      if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
+        if (hash_diri) {
+          ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
+          _fragmap_remove_stopped_mds(hash_diri, mds);
+        } else {
+          ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
+          request->resend_mds = _get_random_up_mds();
+        }
+      } else {
+        ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
+        wait_on_list(waiting_for_mdsmap);
+      }
+      continue;
+    }
+
+    // open a session?
+    MetaSession *session = NULL;
+    if (!have_open_session(mds)) {
+      session = _get_or_open_mds_session(mds);
+
+      // wait
+      if (session->state == MetaSession::STATE_OPENING) {
+        ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
+        wait_on_context_list(session->waiting_for_open);
+        // Abort requests on REJECT from MDS
+        if (rejected_by_mds.count(mds)) {
+          request->abort(-EPERM);
+          break;
+        }
+        continue;
+      }
+
+      if (!have_open_session(mds))
+        continue;
+    } else {
+      session = mds_sessions[mds];
+    }
+
+    // send request.
+    send_request(request, session);
+
+    // wait for signal
+    ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
+    request->kick = false;
+    while (!request->reply &&         // reply
+           request->resend_mds < 0 && // forward
+           !request->kick)
+      caller_cond.Wait(client_lock);
+    request->caller_cond = NULL;
+
+    // did we get a reply?
+    if (request->reply)
+      break;
+  }
+
+  if (!request->reply) {
+    assert(request->aborted());
+    assert(!request->got_unsafe);
+    r = request->get_abort_code();
+    request->item.remove_myself();
+    unregister_request(request);
+    put_request(request); // ours
+    return r;
+  }
+
+  // got it!
+  MClientReply *reply = request->reply;
+  request->reply = NULL;
+  r = reply->get_result();
+  if (r >= 0)
+    request->success = true;
+
+  // kick dispatcher (we've got it!)
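+  // Explanatory note (added for exposition): the reply was delivered by
+  // the messenger dispatch thread, which signalled our caller_cond above
+  // and is itself parked on dispatch_cond until this caller has taken
+  // ownership of the reply; the Signal() below releases it.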
+  assert(request->dispatch_cond);
+  request->dispatch_cond->Signal();
+  ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
+  request->dispatch_cond = 0;
+
+  if (r >= 0 && ptarget)
+    r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);
+
+  if (pdirbl)
+    pdirbl->claim(reply->get_extra_bl());
+
+  // -- log times --
+  utime_t lat = ceph_clock_now();
+  lat -= request->sent_stamp;
+  ldout(cct, 20) << "lat " << lat << dendl;
+  logger->tinc(l_c_lat, lat);
+  logger->tinc(l_c_reply, lat);
+
+  put_request(request);
+
+  reply->put();
+  return r;
+}
+
+void Client::unregister_request(MetaRequest *req)
+{
+  mds_requests.erase(req->tid);
+  if (req->tid == oldest_tid) {
+    map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
+    while (true) {
+      if (p == mds_requests.end()) {
+        oldest_tid = 0;
+        break;
+      }
+      if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
+        oldest_tid = p->first;
+        break;
+      }
+      ++p;
+    }
+  }
+  put_request(req);
+}
+
+void Client::put_request(MetaRequest *request)
+{
+  if (request->_put()) {
+    int op = -1;
+    if (request->success)
+      op = request->get_op();
+    InodeRef other_in;
+    request->take_other_inode(&other_in);
+    delete request;
+
+    if (other_in &&
+        (op == CEPH_MDS_OP_RMDIR ||
+         op == CEPH_MDS_OP_RENAME ||
+         op == CEPH_MDS_OP_RMSNAP)) {
+      _try_to_trim_inode(other_in.get(), false);
+    }
+  }
+}
+
+int Client::encode_inode_release(Inode *in, MetaRequest *req,
+                         mds_rank_t mds, int drop,
+                         int unless, int force)
+{
+  ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
+           << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
+           << ", have:" << ", force:" << force << ")" << dendl;
+  int released = 0;
+  if (in->caps.count(mds)) {
+    Cap *caps = in->caps[mds];
+    drop &= ~(in->dirty_caps | get_caps_used(in));
+    if ((drop & caps->issued) &&
+        !(unless & caps->issued)) {
+      ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
+      caps->issued &= ~drop;
+      caps->implemented &= ~drop;
+      released = 1;
+      ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
+    } else {
+      released = force;
+    }
+    if (released) {
+      ceph_mds_request_release rel;
+      rel.ino = in->ino;
+      rel.cap_id = caps->cap_id;
+      rel.seq = caps->seq;
+      rel.issue_seq = caps->issue_seq;
+      rel.mseq = caps->mseq;
+      rel.caps = caps->implemented;
+      rel.wanted = caps->wanted;
+      rel.dname_len = 0;
+      rel.dname_seq = 0;
+      req->cap_releases.push_back(MClientRequest::Release(rel,""));
+    }
+  }
+  ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
+           << released << dendl;
+  return released;
+}
+
+void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
+                           mds_rank_t mds, int drop, int unless)
+{
+  ldout(cct, 20) << "encode_dentry_release enter(dn:"
+           << dn << ")" << dendl;
+  int released = 0;
+  if (dn->dir)
+    released = encode_inode_release(dn->dir->parent_inode, req,
+                                    mds, drop, unless, 1);
+  if (released && dn->lease_mds == mds) {
+    ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
+    MClientRequest::Release& rel = req->cap_releases.back();
+    rel.item.dname_len = dn->name.length();
+    rel.item.dname_seq = dn->lease_seq;
+    rel.dname = dn->name;
+  }
+  ldout(cct, 25) << "encode_dentry_release exit(dn:"
+           << dn << ")" << dendl;
+}
+
+
+/*
+ * This requires the MClientRequest *request member to be set.
+ * It will error out horribly without one.
+ * Additionally, if you set any *drop member, you'd better have
+ * set the corresponding dentry!
+ */
+void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
+{
+  ldout(cct, 20) << "encode_cap_releases enter (req: "
+                 << req << ", mds: " << mds << ")" << dendl;
+  if (req->inode_drop && req->inode())
+    encode_inode_release(req->inode(), req,
+                         mds, req->inode_drop,
+                         req->inode_unless);
+
+  if (req->old_inode_drop && req->old_inode())
+    encode_inode_release(req->old_inode(), req,
+                         mds, req->old_inode_drop,
+                         req->old_inode_unless);
+  if (req->other_inode_drop && req->other_inode())
+    encode_inode_release(req->other_inode(), req,
+                         mds, req->other_inode_drop,
+                         req->other_inode_unless);
+
+  if (req->dentry_drop && req->dentry())
+    encode_dentry_release(req->dentry(), req,
+                          mds, req->dentry_drop,
+                          req->dentry_unless);
+
+  if (req->old_dentry_drop && req->old_dentry())
+    encode_dentry_release(req->old_dentry(), req,
+                          mds, req->old_dentry_drop,
+                          req->old_dentry_unless);
+  ldout(cct, 25) << "encode_cap_releases exit (req: "
+                 << req << ", mds " << mds << ")" << dendl;
+}
+
+bool Client::have_open_session(mds_rank_t mds)
+{
+  return
+    mds_sessions.count(mds) &&
+    (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
+     mds_sessions[mds]->state == MetaSession::STATE_STALE);
+}
+
+MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
+{
+  if (mds_sessions.count(mds) == 0)
+    return NULL;
+  MetaSession *s = mds_sessions[mds];
+  if (s->con != con)
+    return NULL;
+  return s;
+}
+
+MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
+{
+  if (mds_sessions.count(mds))
+    return mds_sessions[mds];
+  return _open_mds_session(mds);
+}
+
+/**
+ * Populate a map of strings with client-identifying metadata,
+ * such as the hostname.  Call this once at initialization.
+ */
+void Client::populate_metadata(const std::string &mount_root)
+{
+  // Hostname
+  struct utsname u;
+  int r = uname(&u);
+  if (r >= 0) {
+    metadata["hostname"] = u.nodename;
+    ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
+  } else {
+    ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
+  }
+
+  metadata["pid"] = stringify(getpid());
+
+  // Ceph entity id (the '0' in "client.0")
+  metadata["entity_id"] = cct->_conf->name.get_id();
+
+  // Our mount position
+  if (!mount_root.empty()) {
+    metadata["root"] = mount_root;
+  }
+
+  // Ceph version
+  metadata["ceph_version"] = pretty_version_to_str();
+  metadata["ceph_sha1"] = git_version_to_str();
+
+  // Apply any metadata from the user's configured overrides
+  std::vector<std::string> tokens;
+  get_str_vec(cct->_conf->client_metadata, ",", tokens);
+  for (const auto &i : tokens) {
+    auto eqpos = i.find("=");
+    // Throw out anything that isn't of the form "<str>=<str>"
+    if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
+      lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
+      continue;
+    }
+    metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
+  }
+}
+
+/**
+ * Optionally add or override client metadata fields.
+ */
+void Client::update_metadata(std::string const &k, std::string const &v)
+{
+  Mutex::Locker l(client_lock);
+  assert(initialized);
+
+  if (metadata.count(k)) {
+    ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
+                  << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
+  }
+
+  metadata[k] = v;
+}
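+
+// Illustrative example (added for exposition): with a configuration such
+// as
+//
+//   client_metadata = "rack=r1,host_class=ssd"
+//
+// the loop above yields metadata["rack"] = "r1" and
+// metadata["host_class"] = "ssd"; malformed items like "=x" or "x"
+// (leading '=' or no '=') are rejected by the form check and logged.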
+
+MetaSession *Client::_open_mds_session(mds_rank_t mds)
+{
+  ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
+  assert(mds_sessions.count(mds) == 0);
+  MetaSession *session = new MetaSession;
+  session->mds_num = mds;
+  session->seq = 0;
+  session->inst = mdsmap->get_inst(mds);
+  session->con = messenger->get_connection(session->inst);
+  session->state = MetaSession::STATE_OPENING;
+  session->mds_state = MDSMap::STATE_NULL;
+  mds_sessions[mds] = session;
+
+  // Maybe skip sending a request to open if this MDS daemon
+  // has previously sent us a REJECT.
+  if (rejected_by_mds.count(mds)) {
+    if (rejected_by_mds[mds] == session->inst) {
+      ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
+                       "because we were rejected" << dendl;
+      return session;
+    } else {
+      ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
+                       "rejected us, trying with new inst" << dendl;
+      rejected_by_mds.erase(mds);
+    }
+  }
+
+  MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
+  m->client_meta = metadata;
+  session->con->send_message(m);
+  return session;
+}
+
+void Client::_close_mds_session(MetaSession *s)
+{
+  ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
+  s->state = MetaSession::STATE_CLOSING;
+  s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
+}
+
+void Client::_closed_mds_session(MetaSession *s)
+{
+  s->state = MetaSession::STATE_CLOSED;
+  s->con->mark_down();
+  signal_context_list(s->waiting_for_open);
+  mount_cond.Signal();
+  remove_session_caps(s);
+  kick_requests_closed(s);
+  mds_sessions.erase(s->mds_num);
+  delete s;
+}
+
+void Client::handle_client_session(MClientSession *m)
+{
+  mds_rank_t from = mds_rank_t(m->get_source().num());
+  ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;
+
+  MetaSession *session = _get_mds_session(from, m->get_connection().get());
+  if (!session) {
+    ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
+    m->put();
+    return;
+  }
+
+  switch (m->get_op()) {
+  case CEPH_SESSION_OPEN:
+    renew_caps(session);
+    session->state = MetaSession::STATE_OPEN;
+    if (unmounting)
+      mount_cond.Signal();
+    else
+      connect_mds_targets(from);
+    signal_context_list(session->waiting_for_open);
+    break;
+
+  case CEPH_SESSION_CLOSE:
+    _closed_mds_session(session);
+    break;
+
+  case CEPH_SESSION_RENEWCAPS:
+    if (session->cap_renew_seq == m->get_seq()) {
+      session->cap_ttl =
+        session->last_cap_renew_request + mdsmap->get_session_timeout();
+      wake_inode_waiters(session);
+    }
+    break;
+
+  case CEPH_SESSION_STALE:
+    renew_caps(session);
+    break;
+
+  case CEPH_SESSION_RECALL_STATE:
+    trim_caps(session, m->get_max_caps());
+    break;
+
+  case CEPH_SESSION_FLUSHMSG:
+    session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
+    break;
+
+  case CEPH_SESSION_FORCE_RO:
+    force_session_readonly(session);
+    break;
+
+  case CEPH_SESSION_REJECT:
+    rejected_by_mds[session->mds_num] = session->inst;
+    _closed_mds_session(session);
+
+    break;
+
+  default:
+    ceph_abort();
+  }
+
+  m->put();
+}
+
+bool Client::_any_stale_sessions() const
+{
+  assert(client_lock.is_locked_by_me());
+
+  for (const auto &i : mds_sessions) {
+    if (i.second->state == MetaSession::STATE_STALE) {
+      return true;
+    }
+  }
+
+  return false;
+}
+
+void Client::_kick_stale_sessions()
+{
+  ldout(cct, 1) << "kick_stale_sessions" << dendl;
+
+  for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
+       p != mds_sessions.end(); ) {
+    MetaSession *s = p->second;
+    ++p;
+    if (s->state == MetaSession::STATE_STALE)
+      _closed_mds_session(s);
+  }
+}
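+
+// Summary note (added for exposition): a MetaSession normally moves from
+// STATE_OPENING (_open_mds_session) to STATE_OPEN (on CEPH_SESSION_OPEN);
+// stale sessions (STATE_STALE) are closed and re-established by
+// _kick_stale_sessions above, while _close_mds_session (STATE_CLOSING)
+// and _closed_mds_session (STATE_CLOSED) handle orderly teardown,
+// dropping caps and kicking any queued requests.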
+} + +void Client::send_request(MetaRequest *request, MetaSession *session, + bool drop_cap_releases) +{ + // make the request + mds_rank_t mds = session->mds_num; + ldout(cct, 10) << "send_request rebuilding request " << request->get_tid() + << " for mds." << mds << dendl; + MClientRequest *r = build_client_request(request); + if (request->dentry()) { + r->set_dentry_wanted(); + } + if (request->got_unsafe) { + r->set_replayed_op(); + if (request->target) + r->head.ino = request->target->ino; + } else { + encode_cap_releases(request, mds); + if (drop_cap_releases) // we haven't send cap reconnect yet, drop cap releases + request->cap_releases.clear(); + else + r->releases.swap(request->cap_releases); + } + r->set_mdsmap_epoch(mdsmap->get_epoch()); + if (r->head.op == CEPH_MDS_OP_SETXATTR) { + objecter->with_osdmap([r](const OSDMap& o) { + r->set_osdmap_epoch(o.get_epoch()); + }); + } + + if (request->mds == -1) { + request->sent_stamp = ceph_clock_now(); + ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl; + } + request->mds = mds; + + Inode *in = request->inode(); + if (in && in->caps.count(mds)) + request->sent_on_mseq = in->caps[mds]->mseq; + + session->requests.push_back(&request->item); + + ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl; + session->con->send_message(r); +} + +MClientRequest* Client::build_client_request(MetaRequest *request) +{ + MClientRequest *req = new MClientRequest(request->get_op()); + req->set_tid(request->tid); + req->set_stamp(request->op_stamp); + memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head)); + + // if the filepath's haven't been set, set them! + if (request->path.empty()) { + Inode *in = request->inode(); + Dentry *de = request->dentry(); + if (in) + in->make_nosnap_relative_path(request->path); + else if (de) { + if (de->inode) + de->inode->make_nosnap_relative_path(request->path); + else if (de->dir) { + de->dir->parent_inode->make_nosnap_relative_path(request->path); + request->path.push_dentry(de->name); + } + else ldout(cct, 1) << "Warning -- unable to construct a filepath!" + << " No path, inode, or appropriately-endowed dentry given!" + << dendl; + } else ldout(cct, 1) << "Warning -- unable to construct a filepath!" + << " No path, inode, or dentry given!" + << dendl; + } + req->set_filepath(request->get_filepath()); + req->set_filepath2(request->get_filepath2()); + req->set_data(request->data); + req->set_retry_attempt(request->retry_attempt++); + req->head.num_fwd = request->num_fwd; + const gid_t *_gids; + int gid_count = request->perms.get_gids(&_gids); + req->set_gid_list(gid_count, _gids); + return req; +} + + + +void Client::handle_client_request_forward(MClientRequestForward *fwd) +{ + mds_rank_t mds = mds_rank_t(fwd->get_source().num()); + MetaSession *session = _get_mds_session(mds, fwd->get_connection().get()); + if (!session) { + fwd->put(); + return; + } + ceph_tid_t tid = fwd->get_tid(); + + if (mds_requests.count(tid) == 0) { + ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl; + fwd->put(); + return; + } + + MetaRequest *request = mds_requests[tid]; + assert(request); + + // reset retry counter + request->retry_attempt = 0; + + // request not forwarded, or dest mds has no session. + // resend. + ldout(cct, 10) << "handle_client_request tid " << tid + << " fwd " << fwd->get_num_fwd() + << " to mds." 
<< fwd->get_dest_mds() + << ", resending to " << fwd->get_dest_mds() + << dendl; + + request->mds = -1; + request->item.remove_myself(); + request->num_fwd = fwd->get_num_fwd(); + request->resend_mds = fwd->get_dest_mds(); + request->caller_cond->Signal(); + + fwd->put(); +} + +bool Client::is_dir_operation(MetaRequest *req) +{ + int op = req->get_op(); + if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK || + op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME || + op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR || + op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE) + return true; + return false; +} + +void Client::handle_client_reply(MClientReply *reply) +{ + mds_rank_t mds_num = mds_rank_t(reply->get_source().num()); + MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get()); + if (!session) { + reply->put(); + return; + } + + ceph_tid_t tid = reply->get_tid(); + bool is_safe = reply->is_safe(); + + if (mds_requests.count(tid) == 0) { + lderr(cct) << "handle_client_reply no pending request on tid " << tid + << " safe is:" << is_safe << dendl; + reply->put(); + return; + } + MetaRequest *request = mds_requests.at(tid); + + ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe + << " tid " << tid << dendl; + + if (request->got_unsafe && !is_safe) { + //duplicate response + ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds " + << mds_num << " safe:" << is_safe << dendl; + reply->put(); + return; + } + + if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS + ldout(cct, 20) << "got ESTALE on tid " << request->tid + << " from mds." << request->mds << dendl; + request->send_to_auth = true; + request->resend_mds = choose_target_mds(request); + Inode *in = request->inode(); + if (request->resend_mds >= 0 && + request->resend_mds == request->mds && + (in == NULL || + in->caps.count(request->resend_mds) == 0 || + request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) { + // have to return ESTALE + } else { + request->caller_cond->Signal(); + reply->put(); + return; + } + ldout(cct, 20) << "have to return ESTALE" << dendl; + } + + assert(request->reply == NULL); + request->reply = reply; + insert_trace(request, session); + + // Handle unsafe reply + if (!is_safe) { + request->got_unsafe = true; + session->unsafe_requests.push_back(&request->unsafe_item); + if (is_dir_operation(request)) { + Inode *dir = request->inode(); + assert(dir); + dir->unsafe_ops.push_back(&request->unsafe_dir_item); + } + if (request->target) { + InodeRef &in = request->target; + in->unsafe_ops.push_back(&request->unsafe_target_item); + } + } + + // Only signal the caller once (on the first reply): + // Either its an unsafe reply, or its a safe reply and no unsafe reply was sent. 
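// A compact model of the "signal once" rule stated above, ahead of the
// condition that implements it: the caller is woken on the first reply,
// whether that is the unsafe (applied in memory) or the safe (journaled)
// one. `Request` and `should_signal_caller` are illustrative names only.

namespace reply_sketch {

struct Request {
  bool got_unsafe = false;  // first (unsafe) reply already processed
};

// Mirrors `if (!is_safe || !request->got_unsafe)` below: wake exactly once,
// on whichever reply arrives first.
inline bool should_signal_caller(const Request &r, bool is_safe) {
  return !is_safe || !r.got_unsafe;
}

} // namespace reply_sketch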
+  if (!is_safe || !request->got_unsafe) {
+    Cond cond;
+    request->dispatch_cond = &cond;
+
+    // wake up waiter
+    ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
+    request->caller_cond->Signal();
+
+    // wake for kick back
+    while (request->dispatch_cond) {
+      ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
+      cond.Wait(client_lock);
+    }
+  }
+
+  if (is_safe) {
+    // the filesystem change is committed to disk
+    // we're done, clean up
+    if (request->got_unsafe) {
+      request->unsafe_item.remove_myself();
+      request->unsafe_dir_item.remove_myself();
+      request->unsafe_target_item.remove_myself();
+      signal_cond_list(request->waitfor_safe);
+    }
+    request->item.remove_myself();
+    unregister_request(request);
+  }
+  if (unmounting)
+    mount_cond.Signal();
+}
+
+void Client::_handle_full_flag(int64_t pool)
+{
+  ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
+    << "on " << pool << dendl;
+  // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
+  // to do this rather than blocking, because otherwise when we fill up we
+  // potentially lock caps forever on files with dirty pages, and we need
+  // to be able to release those caps to the MDS so that it can delete files
+  // and free up space.
+  epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
+
+  // For all inodes with layouts in this pool and a pending flush write op
+  // (i.e. one of the ones we will cancel), we've got to purge_set their data
+  // from ObjectCacher so that it doesn't re-issue the write in response to
+  // the ENOSPC error.
+  // Fortunately since we're cancelling everything in a given pool, we don't
+  // need to know which ops belong to which ObjectSet, we can just blow all
+  // the un-flushed cached data away and mark any dirty inodes' async_err
+  // field with -ENOSPC as long as we're sure all the ops we cancelled were
+  // affecting this pool, and all the objectsets we're purging were also
+  // in this pool.
+  for (unordered_map<vinodeno_t, Inode*>::iterator i = inode_map.begin();
+       i != inode_map.end(); ++i)
+  {
+    Inode *inode = i->second;
+    if (inode->oset.dirty_or_tx
+        && (pool == -1 || inode->layout.pool_id == pool)) {
+      ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
+        << " has dirty objects, purging and setting ENOSPC" << dendl;
+      objectcacher->purge_set(&inode->oset);
+      inode->set_async_err(-ENOSPC);
+    }
+  }
+
+  if (cancelled_epoch != (epoch_t)-1) {
+    set_cap_epoch_barrier(cancelled_epoch);
+  }
+}
+
+void Client::handle_osd_map(MOSDMap *m)
+{
+  std::set<entity_addr_t> new_blacklists;
+  objecter->consume_blacklist_events(&new_blacklists);
+
+  const auto myaddr = messenger->get_myaddr();
+  if (!blacklisted && new_blacklists.count(myaddr)) {
+    auto epoch = objecter->with_osdmap([](const OSDMap &o){
+        return o.get_epoch();
+        });
+    lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
+    blacklisted = true;
+    for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
+         p != mds_requests.end(); )
+    {
+      auto req = p->second;
+      ++p;
+      req->abort(-EBLACKLISTED);
+      if (req->caller_cond) {
+        req->kick = true;
+        req->caller_cond->Signal();
+      }
+    }
+
+    // Progress aborts on any requests that were on this waitlist.  Any
+    // requests that were on a waiting_for_open session waitlist
+    // will get kicked during close session below.
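// Before the signal_cond_list() call that follows, a self-contained sketch
// of the abort fan-out above: every in-flight request is failed with one
// error code and its waiter woken so it can observe the abort. Standard
// library only; `PendingRequest` and `abort_all` are illustrative names.

#include <condition_variable>
#include <cstdint>
#include <map>

namespace abort_sketch {

struct PendingRequest {
  int result = 0;
  bool kicked = false;
  std::condition_variable *caller = nullptr;  // waiter, if any
};

inline void abort_all(std::map<uint64_t, PendingRequest*> &requests, int err) {
  for (auto &p : requests) {
    p.second->result = err;        // e.g. -EBLACKLISTED above
    if (p.second->caller) {
      p.second->kicked = true;     // req->kick in the real code
      p.second->caller->notify_all();
    }
  }
}

} // namespace abort_sketch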
+    signal_cond_list(waiting_for_mdsmap);
+
+    // Force-close all sessions: assume this is not abandoning any state
+    // on the MDS side because the MDS will have seen the blacklist too.
+    while(!mds_sessions.empty()) {
+      auto i = mds_sessions.begin();
+      auto session = i->second;
+      _closed_mds_session(session);
+    }
+
+    // Since we know all our OSD ops will fail, cancel them all preemptively,
+    // so that on an unhealthy cluster we can umount promptly even if e.g.
+    // some PGs were inaccessible.
+    objecter->op_cancel_writes(-EBLACKLISTED);
+
+  } else if (blacklisted) {
+    // Handle case where we were blacklisted but no longer are
+    blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
+        return o.is_blacklisted(myaddr);});
+  }
+
+  if (objecter->osdmap_full_flag()) {
+    _handle_full_flag(-1);
+  } else {
+    // Accumulate local list of full pools so that I can drop
+    // the objecter lock before re-entering objecter in
+    // cancel_writes
+    std::vector<int64_t> full_pools;
+
+    objecter->with_osdmap([&full_pools](const OSDMap &o) {
+      for (const auto& kv : o.get_pools()) {
+        if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
+          full_pools.push_back(kv.first);
+        }
+      }
+    });
+
+    for (auto p : full_pools)
+      _handle_full_flag(p);
+
+    // Subscribe to subsequent maps to watch for the full flag going
+    // away.  For the global full flag objecter does this for us, but
+    // it pays no attention to the per-pool full flag so in this branch
+    // we do it ourselves.
+    if (!full_pools.empty()) {
+      objecter->maybe_request_map();
+    }
+  }
+
+  m->put();
+}
+
+
+// ------------------------
+// incoming messages
+
+
+bool Client::ms_dispatch(Message *m)
+{
+  Mutex::Locker l(client_lock);
+  if (!initialized) {
+    ldout(cct, 10) << "inactive, discarding " << *m << dendl;
+    m->put();
+    return true;
+  }
+
+  switch (m->get_type()) {
+    // mounting and mds sessions
+  case CEPH_MSG_MDS_MAP:
+    handle_mds_map(static_cast<MMDSMap*>(m));
+    break;
+  case CEPH_MSG_FS_MAP:
+    handle_fs_map(static_cast<MFSMap*>(m));
+    break;
+  case CEPH_MSG_FS_MAP_USER:
+    handle_fs_map_user(static_cast<MFSMapUser*>(m));
+    break;
+  case CEPH_MSG_CLIENT_SESSION:
+    handle_client_session(static_cast<MClientSession*>(m));
+    break;
+
+  case CEPH_MSG_OSD_MAP:
+    handle_osd_map(static_cast<MOSDMap*>(m));
+    break;
+
+    // requests
+  case CEPH_MSG_CLIENT_REQUEST_FORWARD:
+    handle_client_request_forward(static_cast<MClientRequestForward*>(m));
+    break;
+  case CEPH_MSG_CLIENT_REPLY:
+    handle_client_reply(static_cast<MClientReply*>(m));
+    break;
+
+  case CEPH_MSG_CLIENT_SNAP:
+    handle_snap(static_cast<MClientSnap*>(m));
+    break;
+  case CEPH_MSG_CLIENT_CAPS:
+    handle_caps(static_cast<MClientCaps*>(m));
+    break;
+  case CEPH_MSG_CLIENT_LEASE:
+    handle_lease(static_cast<MClientLease*>(m));
+    break;
+  case MSG_COMMAND_REPLY:
+    if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
+      handle_command_reply(static_cast<MCommandReply*>(m));
+    } else {
+      return false;
+    }
+    break;
+  case CEPH_MSG_CLIENT_QUOTA:
+    handle_quota(static_cast<MClientQuota*>(m));
+    break;
+
+  default:
+    return false;
+  }
+
+  // unmounting?
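// Before the unmounting trim pass that follows, the shape of ms_dispatch()
// above in miniature: claim the message types this dispatcher owns (return
// true after downcasting to the concrete type), and return false so other
// dispatchers can try anything unknown. Types and names here are
// illustrative stand-ins, not the Messenger API.

namespace dispatch_sketch {

struct Message { int type = 0; virtual ~Message() = default; };
struct MapMsg   : Message {};
struct ReplyMsg : Message {};

enum { T_MAP = 1, T_REPLY = 2 };

inline bool dispatch(Message *m) {
  switch (m->type) {
  case T_MAP:   /* handle(static_cast<MapMsg*>(m)); */   return true;
  case T_REPLY: /* handle(static_cast<ReplyMsg*>(m)); */ return true;
  default:      return false;  // not ours; let another dispatcher try
  }
}

} // namespace dispatch_sketch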
+ if (unmounting) { + ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size() + << "+" << inode_map.size() << dendl; + long unsigned size = lru.lru_get_size() + inode_map.size(); + trim_cache(); + if (size < lru.lru_get_size() + inode_map.size()) { + ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl; + mount_cond.Signal(); + } else { + ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size() + << "+" << inode_map.size() << dendl; + } + } + + return true; +} + +void Client::handle_fs_map(MFSMap *m) +{ + fsmap.reset(new FSMap(m->get_fsmap())); + m->put(); + + signal_cond_list(waiting_for_fsmap); + + monclient->sub_got("fsmap", fsmap->get_epoch()); +} + +void Client::handle_fs_map_user(MFSMapUser *m) +{ + fsmap_user.reset(new FSMapUser); + *fsmap_user = m->get_fsmap(); + m->put(); + + monclient->sub_got("fsmap.user", fsmap_user->get_epoch()); + signal_cond_list(waiting_for_fsmap); +} + +void Client::handle_mds_map(MMDSMap* m) +{ + if (m->get_epoch() <= mdsmap->get_epoch()) { + ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() + << " is identical to or older than our " + << mdsmap->get_epoch() << dendl; + m->put(); + return; + } + + ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl; + + std::unique_ptr oldmap(new MDSMap); + oldmap.swap(mdsmap); + + mdsmap->decode(m->get_encoded()); + + // Cancel any commands for missing or laggy GIDs + std::list cancel_ops; + auto &commands = command_table.get_commands(); + for (const auto &i : commands) { + auto &op = i.second; + const mds_gid_t op_mds_gid = op.mds_gid; + if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) { + ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl; + cancel_ops.push_back(i.first); + if (op.outs) { + std::ostringstream ss; + ss << "MDS " << op_mds_gid << " went away"; + *(op.outs) = ss.str(); + } + op.con->mark_down(); + if (op.on_finish) { + op.on_finish->complete(-ETIMEDOUT); + } + } + } + + for (std::list::iterator i = cancel_ops.begin(); + i != cancel_ops.end(); ++i) { + command_table.erase(*i); + } + + // reset session + for (map::iterator p = mds_sessions.begin(); + p != mds_sessions.end(); ) { + mds_rank_t mds = p->first; + MetaSession *session = p->second; + ++p; + + int oldstate = oldmap->get_state(mds); + int newstate = mdsmap->get_state(mds); + if (!mdsmap->is_up(mds)) { + session->con->mark_down(); + } else if (mdsmap->get_inst(mds) != session->inst) { + session->con->mark_down(); + session->inst = mdsmap->get_inst(mds); + // When new MDS starts to take over, notify kernel to trim unused entries + // in its dcache/icache. Hopefully, the kernel will release some unused + // inodes before the new MDS enters reconnect state. 
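// The trim_cache_for_reconnect() call follows this sketch. The loop above
// reacts to per-rank state changes between two MDSMap epochs; a minimal
// model of those decisions, with illustrative state values (the real
// MDSMap::STATE_* constants differ):

namespace mdsmap_sketch {

enum { S_NULL = 0, S_RECONNECT = 1, S_ACTIVE = 2 };  // illustrative only

struct Actions {
  bool reconnect = false;  // send_reconnect()
  bool kick = false;       // kick_requests(), kick_flushing_caps(), ...
  bool close = false;      // _closed_mds_session()
};

inline Actions on_rank_change(int oldstate, int newstate) {
  Actions a;
  if (newstate == S_RECONNECT)
    a.reconnect = true;
  else if (newstate >= S_ACTIVE && oldstate < S_ACTIVE)
    a.kick = true;          // rank became active: resend and wake waiters
  else if (newstate == S_NULL)
    a.close = true;         // rank disappeared (beyond max_mds in the real code)
  return a;
}

} // namespace mdsmap_sketch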
+ trim_cache_for_reconnect(session); + } else if (oldstate == newstate) + continue; // no change + + session->mds_state = newstate; + if (newstate == MDSMap::STATE_RECONNECT) { + session->con = messenger->get_connection(session->inst); + send_reconnect(session); + } else if (newstate >= MDSMap::STATE_ACTIVE) { + if (oldstate < MDSMap::STATE_ACTIVE) { + // kick new requests + kick_requests(session); + kick_flushing_caps(session); + signal_context_list(session->waiting_for_open); + kick_maxsize_requests(session); + wake_inode_waiters(session); + } + connect_mds_targets(mds); + } else if (newstate == MDSMap::STATE_NULL && + mds >= mdsmap->get_max_mds()) { + _closed_mds_session(session); + } + } + + // kick any waiting threads + signal_cond_list(waiting_for_mdsmap); + + m->put(); + + monclient->sub_got("mdsmap", mdsmap->get_epoch()); +} + +void Client::send_reconnect(MetaSession *session) +{ + mds_rank_t mds = session->mds_num; + ldout(cct, 10) << "send_reconnect to mds." << mds << dendl; + + // trim unused caps to reduce MDS's cache rejoin time + trim_cache_for_reconnect(session); + + session->readonly = false; + + if (session->release) { + session->release->put(); + session->release = NULL; + } + + // reset my cap seq number + session->seq = 0; + //connect to the mds' offload targets + connect_mds_targets(mds); + //make sure unsafe requests get saved + resend_unsafe_requests(session); + + MClientReconnect *m = new MClientReconnect; + + // i have an open session. + ceph::unordered_set did_snaprealm; + for (ceph::unordered_map::iterator p = inode_map.begin(); + p != inode_map.end(); + ++p) { + Inode *in = p->second; + if (in->caps.count(mds)) { + ldout(cct, 10) << " caps on " << p->first + << " " << ccap_string(in->caps[mds]->issued) + << " wants " << ccap_string(in->caps_wanted()) + << dendl; + filepath path; + in->make_long_path(path); + ldout(cct, 10) << " path " << path << dendl; + + bufferlist flockbl; + _encode_filelocks(in, flockbl); + + Cap *cap = in->caps[mds]; + cap->seq = 0; // reset seq. + cap->issue_seq = 0; // reset seq. + cap->mseq = 0; // reset seq. + cap->issued = cap->implemented; + + snapid_t snap_follows = 0; + if (!in->cap_snaps.empty()) + snap_follows = in->cap_snaps.begin()->first; + + m->add_cap(p->first.ino, + cap->cap_id, + path.get_ino(), path.get_path(), // ino + in->caps_wanted(), // wanted + cap->issued, // issued + in->snaprealm->ino, + snap_follows, + flockbl); + + if (did_snaprealm.count(in->snaprealm->ino) == 0) { + ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl; + m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent); + did_snaprealm.insert(in->snaprealm->ino); + } + } + } + + early_kick_flushing_caps(session); + + session->con->send_message(m); + + mount_cond.Signal(); +} + + +void Client::kick_requests(MetaSession *session) +{ + ldout(cct, 10) << "kick_requests for mds." 
<< session->mds_num << dendl; + for (map::iterator p = mds_requests.begin(); + p != mds_requests.end(); + ++p) { + MetaRequest *req = p->second; + if (req->got_unsafe) + continue; + if (req->aborted()) { + if (req->caller_cond) { + req->kick = true; + req->caller_cond->Signal(); + } + continue; + } + if (req->retry_attempt > 0) + continue; // new requests only + if (req->mds == session->mds_num) { + send_request(p->second, session); + } + } +} + +void Client::resend_unsafe_requests(MetaSession *session) +{ + for (xlist::iterator iter = session->unsafe_requests.begin(); + !iter.end(); + ++iter) + send_request(*iter, session); + + // also re-send old requests when MDS enters reconnect stage. So that MDS can + // process completed requests in clientreplay stage. + for (map::iterator p = mds_requests.begin(); + p != mds_requests.end(); + ++p) { + MetaRequest *req = p->second; + if (req->got_unsafe) + continue; + if (req->aborted()) + continue; + if (req->retry_attempt == 0) + continue; // old requests only + if (req->mds == session->mds_num) + send_request(req, session, true); + } +} + +void Client::wait_unsafe_requests() +{ + list last_unsafe_reqs; + for (map::iterator p = mds_sessions.begin(); + p != mds_sessions.end(); + ++p) { + MetaSession *s = p->second; + if (!s->unsafe_requests.empty()) { + MetaRequest *req = s->unsafe_requests.back(); + req->get(); + last_unsafe_reqs.push_back(req); + } + } + + for (list::iterator p = last_unsafe_reqs.begin(); + p != last_unsafe_reqs.end(); + ++p) { + MetaRequest *req = *p; + if (req->unsafe_item.is_on_list()) + wait_on_list(req->waitfor_safe); + put_request(req); + } +} + +void Client::kick_requests_closed(MetaSession *session) +{ + ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl; + for (map::iterator p = mds_requests.begin(); + p != mds_requests.end(); ) { + MetaRequest *req = p->second; + ++p; + if (req->mds == session->mds_num) { + if (req->caller_cond) { + req->kick = true; + req->caller_cond->Signal(); + } + req->item.remove_myself(); + if (req->got_unsafe) { + lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl; + req->unsafe_item.remove_myself(); + req->unsafe_dir_item.remove_myself(); + req->unsafe_target_item.remove_myself(); + signal_cond_list(req->waitfor_safe); + unregister_request(req); + } + } + } + assert(session->requests.empty()); + assert(session->unsafe_requests.empty()); +} + + + + +/************ + * leases + */ + +void Client::got_mds_push(MetaSession *s) +{ + s->seq++; + ldout(cct, 10) << " mds." 
<< s->mds_num << " seq now " << s->seq << dendl; + if (s->state == MetaSession::STATE_CLOSING) { + s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq)); + } +} + +void Client::handle_lease(MClientLease *m) +{ + ldout(cct, 10) << "handle_lease " << *m << dendl; + + assert(m->get_action() == CEPH_MDS_LEASE_REVOKE); + + mds_rank_t mds = mds_rank_t(m->get_source().num()); + MetaSession *session = _get_mds_session(mds, m->get_connection().get()); + if (!session) { + m->put(); + return; + } + + got_mds_push(session); + + ceph_seq_t seq = m->get_seq(); + + Inode *in; + vinodeno_t vino(m->get_ino(), CEPH_NOSNAP); + if (inode_map.count(vino) == 0) { + ldout(cct, 10) << " don't have vino " << vino << dendl; + goto revoke; + } + in = inode_map[vino]; + + if (m->get_mask() & CEPH_LOCK_DN) { + if (!in->dir || in->dir->dentries.count(m->dname) == 0) { + ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <dir->dentries[m->dname]; + ldout(cct, 10) << " revoked DN lease on " << dn << dendl; + dn->lease_mds = -1; + } + + revoke: + m->get_connection()->send_message( + new MClientLease( + CEPH_MDS_LEASE_RELEASE, seq, + m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname)); + m->put(); +} + +void Client::put_inode(Inode *in, int n) +{ + ldout(cct, 10) << "put_inode on " << *in << dendl; + int left = in->_put(n); + if (left == 0) { + // release any caps + remove_all_caps(in); + + ldout(cct, 10) << "put_inode deleting " << *in << dendl; + bool unclean = objectcacher->release_set(&in->oset); + assert(!unclean); + inode_map.erase(in->vino()); + if (use_faked_inos()) + _release_faked_ino(in); + + if (in == root) { + root = 0; + root_ancestor = 0; + while (!root_parents.empty()) + root_parents.erase(root_parents.begin()); + } + + delete in; + } +} + +void Client::close_dir(Dir *dir) +{ + Inode *in = dir->parent_inode; + ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl; + assert(dir->is_empty()); + assert(in->dir == dir); + assert(in->dn_set.size() < 2); // dirs can't be hard-linked + if (!in->dn_set.empty()) + in->get_first_parent()->put(); // unpin dentry + + delete in->dir; + in->dir = 0; + put_inode(in); // unpin inode +} + + /** + * Don't call this with in==NULL, use get_or_create for that + * leave dn set to default NULL unless you're trying to add + * a new inode to a pre-created Dentry + */ +Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn) +{ + if (!dn) { + // create a new Dentry + dn = new Dentry; + dn->name = name; + + // link to dir + dn->dir = dir; + dir->dentries[dn->name] = dn; + lru.lru_insert_mid(dn); // mid or top? + + ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in + << " dn " << dn << " (new dn)" << dendl; + } else { + ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in + << " dn " << dn << " (old dn)" << dendl; + } + + if (in) { // link to inode + dn->inode = in; + if (in->is_dir()) { + if (in->dir) + dn->get(); // dir -> dn pin + if (in->ll_ref) + dn->get(); // ll_ref -> dn pin + } + + assert(in->dn_set.count(dn) == 0); + + // only one parent for directories! 
+ if (in->is_dir() && !in->dn_set.empty()) { + Dentry *olddn = in->get_first_parent(); + assert(olddn->dir != dir || olddn->name != name); + Inode *old_diri = olddn->dir->parent_inode; + old_diri->dir_release_count++; + clear_dir_complete_and_ordered(old_diri, true); + unlink(olddn, true, true); // keep dir, dentry + } + + in->dn_set.insert(dn); + + ldout(cct, 20) << "link inode " << in << " parents now " << in->dn_set << dendl; + } + + return dn; +} + +void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry) +{ + InodeRef in; + in.swap(dn->inode); + ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn + << " inode " << dn->inode << dendl; + + // unlink from inode + if (in) { + if (in->is_dir()) { + if (in->dir) + dn->put(); // dir -> dn pin + if (in->ll_ref) + dn->put(); // ll_ref -> dn pin + } + dn->inode = 0; + assert(in->dn_set.count(dn)); + in->dn_set.erase(dn); + ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl; + } + + if (keepdentry) { + dn->lease_mds = -1; + } else { + ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl; + + // unlink from dir + dn->dir->dentries.erase(dn->name); + if (dn->dir->is_empty() && !keepdir) + close_dir(dn->dir); + dn->dir = 0; + + // delete den + lru.lru_remove(dn); + dn->put(); + } +} + +/** + * For asynchronous flushes, check for errors from the IO and + * update the inode if necessary + */ +class C_Client_FlushComplete : public Context { +private: + Client *client; + InodeRef inode; +public: + C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { } + void finish(int r) override { + assert(client->client_lock.is_locked_by_me()); + if (r != 0) { + client_t const whoami = client->whoami; // For the benefit of ldout prefix + ldout(client->cct, 1) << "I/O error from flush on inode " << inode + << " 0x" << std::hex << inode->ino << std::dec + << ": " << r << "(" << cpp_strerror(r) << ")" << dendl; + inode->set_async_err(r); + } + } +}; + + +/**** + * caps + */ + +void Client::get_cap_ref(Inode *in, int cap) +{ + if ((cap & CEPH_CAP_FILE_BUFFER) && + in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) { + ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl; + in->get(); + } + if ((cap & CEPH_CAP_FILE_CACHE) && + in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) { + ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl; + in->get(); + } + in->get_cap_ref(cap); +} + +void Client::put_cap_ref(Inode *in, int cap) +{ + int last = in->put_cap_ref(cap); + if (last) { + int put_nref = 0; + int drop = last & ~in->caps_issued(); + if (in->snapid == CEPH_NOSNAP) { + if ((last & CEPH_CAP_FILE_WR) && + !in->cap_snaps.empty() && + in->cap_snaps.rbegin()->second.writing) { + ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl; + in->cap_snaps.rbegin()->second.writing = 0; + finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in)); + signal_cond_list(in->waitfor_caps); // wake up blocked sync writers + } + if (last & CEPH_CAP_FILE_BUFFER) { + for (auto &p : in->cap_snaps) + p.second.dirty_data = 0; + signal_cond_list(in->waitfor_commit); + ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl; + ++put_nref; + } + } + if (last & CEPH_CAP_FILE_CACHE) { + ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl; + ++put_nref; + } + if (drop) + check_caps(in, 0); + if (put_nref) + put_inode(in, put_nref); + } +} + +int Client::get_caps(Inode *in, 
int need, int want, int *phave, loff_t endoff) +{ + int r = check_pool_perm(in, need); + if (r < 0) + return r; + + while (1) { + int file_wanted = in->caps_file_wanted(); + if ((file_wanted & need) != need) { + ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need) + << " file_wanted " << ccap_string(file_wanted) << ", EBADF " + << dendl; + return -EBADF; + } + + int implemented; + int have = in->caps_issued(&implemented); + + bool waitfor_caps = false; + bool waitfor_commit = false; + + if (have & need & CEPH_CAP_FILE_WR) { + if (endoff > 0 && + (endoff >= (loff_t)in->max_size || + endoff > (loff_t)(in->size << 1)) && + endoff > (loff_t)in->wanted_max_size) { + ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl; + in->wanted_max_size = endoff; + check_caps(in, 0); + } + + if (endoff >= 0 && endoff > (loff_t)in->max_size) { + ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl; + waitfor_caps = true; + } + if (!in->cap_snaps.empty()) { + if (in->cap_snaps.rbegin()->second.writing) { + ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl; + waitfor_caps = true; + } + for (auto &p : in->cap_snaps) { + if (p.second.dirty_data) { + waitfor_commit = true; + break; + } + } + if (waitfor_commit) { + _flush(in, new C_Client_FlushComplete(this, in)); + ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl; + } + } + } + + if (!waitfor_caps && !waitfor_commit) { + if ((have & need) == need) { + int revoking = implemented & ~have; + ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have) + << " need " << ccap_string(need) << " want " << ccap_string(want) + << " revoking " << ccap_string(revoking) + << dendl; + if ((revoking & want) == 0) { + *phave = need | (have & want); + in->get_cap_ref(need); + return 0; + } + } + ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl; + waitfor_caps = true; + } + + if ((need & CEPH_CAP_FILE_WR) && in->auth_cap && + in->auth_cap->session->readonly) + return -EROFS; + + if (in->flags & I_CAP_DROPPED) { + int mds_wanted = in->caps_mds_wanted(); + if ((mds_wanted & need) != need) { + int ret = _renew_caps(in); + if (ret < 0) + return ret; + continue; + } + if ((mds_wanted & file_wanted) == + (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) { + in->flags &= ~I_CAP_DROPPED; + } + } + + if (waitfor_caps) + wait_on_list(in->waitfor_caps); + else if (waitfor_commit) + wait_on_list(in->waitfor_commit); + } +} + +int Client::get_caps_used(Inode *in) +{ + unsigned used = in->caps_used(); + if (!(used & CEPH_CAP_FILE_CACHE) && + !objectcacher->set_is_empty(&in->oset)) + used |= CEPH_CAP_FILE_CACHE; + return used; +} + +void Client::cap_delay_requeue(Inode *in) +{ + ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl; + in->hold_caps_until = ceph_clock_now(); + in->hold_caps_until += cct->_conf->client_caps_release_delay; + delayed_caps.push_back(&in->cap_item); +} + +void Client::send_cap(Inode *in, MetaSession *session, Cap *cap, + bool sync, int used, int want, int retain, + int flush, ceph_tid_t flush_tid) +{ + int held = cap->issued | cap->implemented; + int revoking = cap->implemented & ~cap->issued; + retain &= ~revoking; + int dropping = cap->issued & ~retain; + int op = CEPH_CAP_OP_UPDATE; + + ldout(cct, 10) << "send_cap " << *in + << " mds." << session->mds_num << " seq " << cap->seq + << (sync ? 
" sync " : " async ") + << " used " << ccap_string(used) + << " want " << ccap_string(want) + << " flush " << ccap_string(flush) + << " retain " << ccap_string(retain) + << " held "<< ccap_string(held) + << " revoking " << ccap_string(revoking) + << " dropping " << ccap_string(dropping) + << dendl; + + if (cct->_conf->client_inject_release_failure && revoking) { + const int would_have_issued = cap->issued & retain; + const int would_have_implemented = cap->implemented & (cap->issued | used); + // Simulated bug: + // - tell the server we think issued is whatever they issued plus whatever we implemented + // - leave what we have implemented in place + ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl; + cap->issued = cap->issued | cap->implemented; + + // Make an exception for revoking xattr caps: we are injecting + // failure to release other caps, but allow xattr because client + // will block on xattr ops if it can't release these to MDS (#9800) + const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL; + cap->issued ^= xattr_mask & revoking; + cap->implemented ^= xattr_mask & revoking; + + ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl; + ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl; + } else { + // Normal behaviour + cap->issued &= retain; + cap->implemented &= cap->issued | used; + } + + snapid_t follows = 0; + + if (flush) + follows = in->snaprealm->get_snap_context().seq; + + MClientCaps *m = new MClientCaps(op, + in->ino, + 0, + cap->cap_id, cap->seq, + cap->implemented, + want, + flush, + cap->mseq, + cap_epoch_barrier); + m->caller_uid = in->cap_dirtier_uid; + m->caller_gid = in->cap_dirtier_gid; + + m->head.issue_seq = cap->issue_seq; + m->set_tid(flush_tid); + + m->head.uid = in->uid; + m->head.gid = in->gid; + m->head.mode = in->mode; + + m->head.nlink = in->nlink; + + if (flush & CEPH_CAP_XATTR_EXCL) { + ::encode(in->xattrs, m->xattrbl); + m->head.xattr_version = in->xattr_version; + } + + m->size = in->size; + m->max_size = in->max_size; + m->truncate_seq = in->truncate_seq; + m->truncate_size = in->truncate_size; + m->mtime = in->mtime; + m->atime = in->atime; + m->ctime = in->ctime; + m->btime = in->btime; + m->time_warp_seq = in->time_warp_seq; + m->change_attr = in->change_attr; + if (sync) + m->flags |= CLIENT_CAPS_SYNC; + + if (flush & CEPH_CAP_FILE_WR) { + m->inline_version = in->inline_version; + m->inline_data = in->inline_data; + } + + in->reported_size = in->size; + m->set_snap_follows(follows); + cap->wanted = want; + if (cap == in->auth_cap) { + m->set_max_size(in->wanted_max_size); + in->requested_max_size = in->wanted_max_size; + ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl; + } + + if (!session->flushing_caps_tids.empty()) + m->set_oldest_flush_tid(*session->flushing_caps_tids.begin()); + + session->con->send_message(m); +} + +static bool is_max_size_approaching(Inode *in) +{ + /* mds will adjust max size according to the reported size */ + if (in->flushing_caps & CEPH_CAP_FILE_WR) + return false; + if (in->size >= in->max_size) + return true; + /* half of previous max_size increment has been used */ + if (in->max_size > in->reported_size && + (in->size << 1) >= in->max_size + in->reported_size) + return true; + return false; +} + +/** + * check_caps + * + * Examine currently used and wanted versus held caps. 
Release, flush or ack + * revoked caps to the MDS as appropriate. + * + * @param in the inode to check + * @param flags flags to apply to cap check + */ +void Client::check_caps(Inode *in, unsigned flags) +{ + unsigned wanted = in->caps_wanted(); + unsigned used = get_caps_used(in); + unsigned cap_used; + + if (in->is_dir() && (in->flags & I_COMPLETE)) { + // we do this here because we don't want to drop to Fs (and then + // drop the Fs if we do a create!) if that alone makes us send lookups + // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere + wanted |= CEPH_CAP_FILE_EXCL; + } + + int implemented; + int issued = in->caps_issued(&implemented); + int revoking = implemented & ~issued; + + int retain = wanted | used | CEPH_CAP_PIN; + if (!unmounting) { + if (wanted) + retain |= CEPH_CAP_ANY; + else + retain |= CEPH_CAP_ANY_SHARED; + } + + ldout(cct, 10) << "check_caps on " << *in + << " wanted " << ccap_string(wanted) + << " used " << ccap_string(used) + << " issued " << ccap_string(issued) + << " revoking " << ccap_string(revoking) + << " flags=" << flags + << dendl; + + if (in->snapid != CEPH_NOSNAP) + return; //snap caps last forever, can't write + + if (in->caps.empty()) + return; // guard if at end of func + + if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) && + (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER)) + _release(in); + + if (!in->cap_snaps.empty()) + flush_snaps(in); + + if (flags & CHECK_CAPS_NODELAY) + in->hold_caps_until = utime_t(); + else + cap_delay_requeue(in); + + utime_t now = ceph_clock_now(); + + map::iterator it = in->caps.begin(); + while (it != in->caps.end()) { + mds_rank_t mds = it->first; + Cap *cap = it->second; + ++it; + + MetaSession *session = mds_sessions[mds]; + assert(session); + + cap_used = used; + if (in->auth_cap && cap != in->auth_cap) + cap_used &= ~in->auth_cap->issued; + + revoking = cap->implemented & ~cap->issued; + + ldout(cct, 10) << " cap mds." << mds + << " issued " << ccap_string(cap->issued) + << " implemented " << ccap_string(cap->implemented) + << " revoking " << ccap_string(revoking) << dendl; + + if (in->wanted_max_size > in->max_size && + in->wanted_max_size > in->requested_max_size && + cap == in->auth_cap) + goto ack; + + /* approaching file_max? */ + if ((cap->issued & CEPH_CAP_FILE_WR) && + cap == in->auth_cap && + is_max_size_approaching(in)) { + ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size + << ", reported " << in->reported_size << dendl; + goto ack; + } + + /* completed revocation? */ + if (revoking && (revoking & cap_used) == 0) { + ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl; + goto ack; + } + + /* want more caps from mds? */ + if (wanted & ~(cap->wanted | cap->issued)) + goto ack; + + if (!revoking && unmounting && (cap_used == 0)) + goto ack; + + if (wanted == cap->wanted && // mds knows what we want. + ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like + !in->dirty_caps) // and we have no dirty caps + continue; + + if (now < in->hold_caps_until) { + ldout(cct, 10) << "delaying cap release" << dendl; + continue; + } + + ack: + // re-send old cap/snapcap flushes first. + if (session->mds_state >= MDSMap::STATE_RECONNECT && + session->mds_state < MDSMap::STATE_ACTIVE && + session->early_flushing_caps.count(in) == 0) { + ldout(cct, 20) << " reflushing caps (check_caps) on " << *in + << " to mds." 
<< session->mds_num << dendl; + session->early_flushing_caps.insert(in); + if (in->cap_snaps.size()) + flush_snaps(in, true); + if (in->flushing_caps) + flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS); + } + + int flushing; + ceph_tid_t flush_tid; + if (in->auth_cap == cap && in->dirty_caps) { + flushing = mark_caps_flushing(in, &flush_tid); + } else { + flushing = 0; + flush_tid = 0; + } + + send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted, + retain, flushing, flush_tid); + } +} + + +void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc) +{ + int used = get_caps_used(in); + int dirty = in->caps_dirty(); + ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl; + + if (in->cap_snaps.size() && + in->cap_snaps.rbegin()->second.writing) { + ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl; + return; + } else if (in->caps_dirty() || + (used & CEPH_CAP_FILE_WR) || + (dirty & CEPH_CAP_ANY_WR)) { + const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in)); + assert(capsnapem.second == true); /* element inserted */ + CapSnap &capsnap = capsnapem.first->second; + capsnap.context = old_snapc; + capsnap.issued = in->caps_issued(); + capsnap.dirty = in->caps_dirty(); + + capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER); + + capsnap.uid = in->uid; + capsnap.gid = in->gid; + capsnap.mode = in->mode; + capsnap.btime = in->btime; + capsnap.xattrs = in->xattrs; + capsnap.xattr_version = in->xattr_version; + + if (used & CEPH_CAP_FILE_WR) { + ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl; + capsnap.writing = 1; + } else { + finish_cap_snap(in, capsnap, used); + } + } else { + ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl; + } +} + +void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used) +{ + ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl; + capsnap.size = in->size; + capsnap.mtime = in->mtime; + capsnap.atime = in->atime; + capsnap.ctime = in->ctime; + capsnap.time_warp_seq = in->time_warp_seq; + capsnap.change_attr = in->change_attr; + + capsnap.dirty |= in->caps_dirty(); + + if (capsnap.dirty & CEPH_CAP_FILE_WR) { + capsnap.inline_data = in->inline_data; + capsnap.inline_version = in->inline_version; + } + + if (used & CEPH_CAP_FILE_BUFFER) { + ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used + << " WRBUFFER, delaying" << dendl; + } else { + capsnap.dirty_data = 0; + flush_snaps(in); + } +} + +void Client::_flushed_cap_snap(Inode *in, snapid_t seq) +{ + ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl; + in->cap_snaps.at(seq).dirty_data = 0; + flush_snaps(in); +} + +void Client::flush_snaps(Inode *in, bool all_again) +{ + ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl; + assert(in->cap_snaps.size()); + + // pick auth mds + assert(in->auth_cap); + MetaSession *session = in->auth_cap->session; + int mseq = in->auth_cap->mseq; + + for (auto &p : in->cap_snaps) { + CapSnap &capsnap = p.second; + if (!all_again) { + // only flush once per session + if (capsnap.flush_tid > 0) + continue; + } + + ldout(cct, 10) << "flush_snaps mds." 
<< session->mds_num + << " follows " << p.first + << " size " << capsnap.size + << " mtime " << capsnap.mtime + << " dirty_data=" << capsnap.dirty_data + << " writing=" << capsnap.writing + << " on " << *in << dendl; + if (capsnap.dirty_data || capsnap.writing) + continue; + + if (capsnap.flush_tid == 0) { + capsnap.flush_tid = ++last_flush_tid; + if (!in->flushing_cap_item.is_on_list()) + session->flushing_caps.push_back(&in->flushing_cap_item); + session->flushing_caps_tids.insert(capsnap.flush_tid); + } + + MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq, + cap_epoch_barrier); + if (user_id >= 0) + m->caller_uid = user_id; + if (group_id >= 0) + m->caller_gid = group_id; + + m->set_client_tid(capsnap.flush_tid); + m->head.snap_follows = p.first; + + m->head.caps = capsnap.issued; + m->head.dirty = capsnap.dirty; + + m->head.uid = capsnap.uid; + m->head.gid = capsnap.gid; + m->head.mode = capsnap.mode; + m->btime = capsnap.btime; + + m->size = capsnap.size; + + m->head.xattr_version = capsnap.xattr_version; + ::encode(capsnap.xattrs, m->xattrbl); + + m->ctime = capsnap.ctime; + m->btime = capsnap.btime; + m->mtime = capsnap.mtime; + m->atime = capsnap.atime; + m->time_warp_seq = capsnap.time_warp_seq; + m->change_attr = capsnap.change_attr; + + if (capsnap.dirty & CEPH_CAP_FILE_WR) { + m->inline_version = in->inline_version; + m->inline_data = in->inline_data; + } + + assert(!session->flushing_caps_tids.empty()); + m->set_oldest_flush_tid(*session->flushing_caps_tids.begin()); + + session->con->send_message(m); + } +} + + + +void Client::wait_on_list(list& ls) +{ + Cond cond; + ls.push_back(&cond); + cond.Wait(client_lock); + ls.remove(&cond); +} + +void Client::signal_cond_list(list& ls) +{ + for (list::iterator it = ls.begin(); it != ls.end(); ++it) + (*it)->Signal(); +} + +void Client::wait_on_context_list(list& ls) +{ + Cond cond; + bool done = false; + int r; + ls.push_back(new C_Cond(&cond, &done, &r)); + while (!done) + cond.Wait(client_lock); +} + +void Client::signal_context_list(list& ls) +{ + while (!ls.empty()) { + ls.front()->complete(0); + ls.pop_front(); + } +} + +void Client::wake_inode_waiters(MetaSession *s) +{ + xlist::iterator iter = s->caps.begin(); + while (!iter.end()){ + signal_cond_list((*iter)->inode->waitfor_caps); + ++iter; + } +} + + +// flush dirty data (from objectcache) + +class C_Client_CacheInvalidate : public Context { +private: + Client *client; + vinodeno_t ino; + int64_t offset, length; +public: + C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) : + client(c), offset(off), length(len) { + if (client->use_faked_inos()) + ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP); + else + ino = in->vino(); + } + void finish(int r) override { + // _async_invalidate takes the lock when it needs to, call this back from outside of lock. 
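// Hence the assertion that follows: the invalidate callback must run with
// the client lock released. A minimal sketch of the finisher pattern used
// by async_ino_invalidator (queue under one lock, invoke on another thread
// with no lock held); `MiniFinisher` is an illustrative name, not the Ceph
// Finisher API.

#include <functional>
#include <mutex>
#include <utility>
#include <vector>

namespace finisher_sketch {

struct MiniFinisher {
  std::mutex q_lock;
  std::vector<std::function<void()>> queued;

  void queue(std::function<void()> cb) {       // called under the big lock
    std::lock_guard<std::mutex> l(q_lock);
    queued.push_back(std::move(cb));
  }

  void drain() {                               // runs on the finisher thread
    std::vector<std::function<void()>> todo;
    {
      std::lock_guard<std::mutex> l(q_lock);
      todo.swap(queued);
    }
    for (auto &cb : todo)
      cb();                                    // big lock NOT held here
  }
};

} // namespace finisher_sketch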
+ assert(!client->client_lock.is_locked_by_me()); + client->_async_invalidate(ino, offset, length); + } +}; + +void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len) +{ + if (unmounting) + return; + ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl; + ino_invalidate_cb(callback_handle, ino, off, len); +} + +void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) { + + if (ino_invalidate_cb) + // we queue the invalidate, which calls the callback and decrements the ref + async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len)); +} + +void Client::_invalidate_inode_cache(Inode *in) +{ + ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl; + + // invalidate our userspace inode cache + if (cct->_conf->client_oc) + objectcacher->release_set(&in->oset); + + _schedule_invalidate_callback(in, 0, 0); +} + +void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len) +{ + ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl; + + // invalidate our userspace inode cache + if (cct->_conf->client_oc) { + vector ls; + Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls); + objectcacher->discard_set(&in->oset, ls); + } + + _schedule_invalidate_callback(in, off, len); +} + +bool Client::_release(Inode *in) +{ + ldout(cct, 20) << "_release " << *in << dendl; + if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) { + _invalidate_inode_cache(in); + return true; + } + return false; +} + +bool Client::_flush(Inode *in, Context *onfinish) +{ + ldout(cct, 10) << "_flush " << *in << dendl; + + if (!in->oset.dirty_or_tx) { + ldout(cct, 10) << " nothing to flush" << dendl; + onfinish->complete(0); + return true; + } + + if (objecter->osdmap_pool_full(in->layout.pool_id)) { + ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl; + objectcacher->purge_set(&in->oset); + if (onfinish) { + onfinish->complete(-ENOSPC); + } + return true; + } + + return objectcacher->flush_set(&in->oset, onfinish); +} + +void Client::_flush_range(Inode *in, int64_t offset, uint64_t size) +{ + assert(client_lock.is_locked()); + if (!in->oset.dirty_or_tx) { + ldout(cct, 10) << " nothing to flush" << dendl; + return; + } + + Mutex flock("Client::_flush_range flock"); + Cond cond; + bool safe = false; + Context *onflush = new C_SafeCond(&flock, &cond, &safe); + bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(), + offset, size, onflush); + if (!ret) { + // wait for flush + client_lock.Unlock(); + flock.Lock(); + while (!safe) + cond.Wait(flock); + flock.Unlock(); + client_lock.Lock(); + } +} + +void Client::flush_set_callback(ObjectCacher::ObjectSet *oset) +{ + // Mutex::Locker l(client_lock); + assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ... 
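// A self-contained model of the wait in _flush_range() above: release the
// big client lock, then block on a private mutex/condvar pair until the
// flush completion flips the flag (the C_SafeCond pattern). `FlushWaiter`
// is an illustrative name; std::mutex stands in for ceph::Mutex/Cond.

#include <condition_variable>
#include <mutex>

namespace flushwait_sketch {

struct FlushWaiter {
  std::mutex flock;
  std::condition_variable cond;
  bool safe = false;

  void complete() {                  // invoked by the flush completion
    std::lock_guard<std::mutex> l(flock);
    safe = true;
    cond.notify_all();
  }

  void wait(std::mutex &client_lock) {
    client_lock.unlock();            // never sleep holding the big lock
    std::unique_lock<std::mutex> l(flock);
    cond.wait(l, [this] { return safe; });
    l.unlock();
    client_lock.lock();
  }
};

} // namespace flushwait_sketch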
+ Inode *in = static_cast(oset->parent); + assert(in); + _flushed(in); +} + +void Client::_flushed(Inode *in) +{ + ldout(cct, 10) << "_flushed " << *in << dendl; + + put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER); +} + + + +// checks common to add_update_cap, handle_cap_grant +void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued) +{ + unsigned had = in->caps_issued(); + + if ((issued & CEPH_CAP_FILE_CACHE) && + !(had & CEPH_CAP_FILE_CACHE)) + in->cache_gen++; + + if ((issued & CEPH_CAP_FILE_SHARED) && + !(had & CEPH_CAP_FILE_SHARED)) { + in->shared_gen++; + + if (in->is_dir()) + clear_dir_complete_and_ordered(in, true); + } +} + +void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id, + unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm, + int flags, const UserPerm& cap_perms) +{ + Cap *cap = 0; + mds_rank_t mds = mds_session->mds_num; + if (in->caps.count(mds)) { + cap = in->caps[mds]; + + /* + * auth mds of the inode changed. we received the cap export + * message, but still haven't received the cap import message. + * handle_cap_export() updated the new auth MDS' cap. + * + * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing + * a message that was send before the cap import message. So + * don't remove caps. + */ + if (ceph_seq_cmp(seq, cap->seq) <= 0) { + assert(cap == in->auth_cap); + assert(cap->cap_id == cap_id); + seq = cap->seq; + mseq = cap->mseq; + issued |= cap->issued; + flags |= CEPH_CAP_FLAG_AUTH; + } + } else { + mds_session->num_caps++; + if (!in->is_any_caps()) { + assert(in->snaprealm == 0); + in->snaprealm = get_snap_realm(realm); + in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item); + ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl; + } + in->caps[mds] = cap = new Cap; + + mds_session->caps.push_back(&cap->cap_item); + cap->session = mds_session; + cap->inode = in; + cap->gen = mds_session->cap_gen; + cap_list.push_back(&in->cap_item); + } + + check_cap_issue(in, cap, issued); + + if (flags & CEPH_CAP_FLAG_AUTH) { + if (in->auth_cap != cap && + (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) { + if (in->auth_cap && in->flushing_cap_item.is_on_list()) { + ldout(cct, 10) << "add_update_cap changing auth cap: " + << "add myself to new auth MDS' flushing caps list" << dendl; + adjust_session_flushing_caps(in, in->auth_cap->session, mds_session); + } + in->auth_cap = cap; + } + } + + unsigned old_caps = cap->issued; + cap->cap_id = cap_id; + cap->issued |= issued; + cap->implemented |= issued; + cap->seq = seq; + cap->issue_seq = seq; + cap->mseq = mseq; + cap->latest_perms = cap_perms; + ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued) + << " from mds." << mds + << " on " << *in + << dendl; + + if ((issued & ~old_caps) && in->auth_cap == cap) { + // non-auth MDS is revoking the newly grant caps ? + for (map::iterator it = in->caps.begin(); it != in->caps.end(); ++it) { + if (it->second == cap) + continue; + if (it->second->implemented & ~it->second->issued & issued) { + check_caps(in, CHECK_CAPS_NODELAY); + break; + } + } + } + + if (issued & ~old_caps) + signal_cond_list(in->waitfor_caps); +} + +void Client::remove_cap(Cap *cap, bool queue_release) +{ + Inode *in = cap->inode; + MetaSession *session = cap->session; + mds_rank_t mds = cap->session->mds_num; + + ldout(cct, 10) << "remove_cap mds." 
<< mds << " on " << *in << dendl; + + if (queue_release) { + session->enqueue_cap_release( + in->ino, + cap->cap_id, + cap->issue_seq, + cap->mseq, + cap_epoch_barrier); + } + + if (in->auth_cap == cap) { + if (in->flushing_cap_item.is_on_list()) { + ldout(cct, 10) << " removing myself from flushing_cap list" << dendl; + in->flushing_cap_item.remove_myself(); + } + in->auth_cap = NULL; + } + assert(in->caps.count(mds)); + in->caps.erase(mds); + + cap->cap_item.remove_myself(); + delete cap; + cap = nullptr; + + if (!in->is_any_caps()) { + ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl; + in->snaprealm_item.remove_myself(); + put_snap_realm(in->snaprealm); + in->snaprealm = 0; + } +} + +void Client::remove_all_caps(Inode *in) +{ + while (!in->caps.empty()) + remove_cap(in->caps.begin()->second, true); +} + +void Client::remove_session_caps(MetaSession *s) +{ + ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl; + + while (s->caps.size()) { + Cap *cap = *s->caps.begin(); + Inode *in = cap->inode; + bool dirty_caps = false, cap_snaps = false; + if (in->auth_cap == cap) { + cap_snaps = !in->cap_snaps.empty(); + dirty_caps = in->dirty_caps | in->flushing_caps; + in->wanted_max_size = 0; + in->requested_max_size = 0; + in->flags |= I_CAP_DROPPED; + } + remove_cap(cap, false); + signal_cond_list(in->waitfor_caps); + if (cap_snaps) { + InodeRef tmp_ref(in); + in->cap_snaps.clear(); + } + if (dirty_caps) { + lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl; + if (in->flushing_caps) { + num_flushing_caps--; + in->flushing_cap_tids.clear(); + } + in->flushing_caps = 0; + in->dirty_caps = 0; + put_inode(in); + } + } + s->flushing_caps_tids.clear(); + sync_cond.Signal(); +} + +class C_Client_Remount : public Context { +private: + Client *client; +public: + explicit C_Client_Remount(Client *c) : client(c) {} + void finish(int r) override { + assert (r == 0); + r = client->remount_cb(client->callback_handle); + if (r != 0) { + client_t whoami = client->get_nodeid(); + lderr(client->cct) << "tried to remount (to trim kernel dentries) and got error " + << r << dendl; + if (client->require_remount && !client->unmounting) { + assert(0 == "failed to remount for kernel dentry trimming"); + } + } + } +}; + +void Client::_invalidate_kernel_dcache() +{ + if (unmounting) + return; + if (can_invalidate_dentries && dentry_invalidate_cb && root->dir) { + for (ceph::unordered_map::iterator p = root->dir->dentries.begin(); + p != root->dir->dentries.end(); + ++p) { + if (p->second->inode) + _schedule_invalidate_dentry_callback(p->second, false); + } + } else if (remount_cb) { + // Hacky: + // when remounting a file system, linux kernel trims all unused dentries in the fs + remount_finisher.queue(new C_Client_Remount(this)); + } +} + +void Client::trim_caps(MetaSession *s, int max) +{ + mds_rank_t mds = s->mds_num; + int caps_size = s->caps.size(); + ldout(cct, 10) << "trim_caps mds." << mds << " max " << max + << " caps " << caps_size << dendl; + + int trimmed = 0; + xlist::iterator p = s->caps.begin(); + while ((caps_size - trimmed) > max && !p.end()) { + Cap *cap = *p; + Inode *in = cap->inode; + + // Increment p early because it will be invalidated if cap + // is deleted inside remove_cap + ++p; + + if (in->caps.size() > 1 && cap != in->auth_cap) { + int mine = cap->issued | cap->implemented; + int oissued = in->auth_cap ? 
in->auth_cap->issued : 0; + // disposable non-auth cap + if (!(get_caps_used(in) & ~oissued & mine)) { + ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl; + remove_cap(cap, true); + trimmed++; + } + } else { + ldout(cct, 20) << " trying to trim dentries for " << *in << dendl; + bool all = true; + set::iterator q = in->dn_set.begin(); + InodeRef tmp_ref(in); + while (q != in->dn_set.end()) { + Dentry *dn = *q++; + if (dn->lru_is_expireable()) { + if (can_invalidate_dentries && + dn->dir->parent_inode->ino == MDS_INO_ROOT) { + // Only issue one of these per DN for inodes in root: handle + // others more efficiently by calling for root-child DNs at + // the end of this function. + _schedule_invalidate_dentry_callback(dn, true); + } + trim_dentry(dn); + } else { + ldout(cct, 20) << " not expirable: " << dn->name << dendl; + all = false; + } + } + if (all && in->ino != MDS_INO_ROOT) { + ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl; + trimmed++; + } + } + } + + if (s->caps.size() > max) + _invalidate_kernel_dcache(); +} + +void Client::force_session_readonly(MetaSession *s) +{ + s->readonly = true; + for (xlist::iterator p = s->caps.begin(); !p.end(); ++p) { + Inode *in = (*p)->inode; + if (in->caps_wanted() & CEPH_CAP_FILE_WR) + signal_cond_list(in->waitfor_caps); + } +} + +void Client::mark_caps_dirty(Inode *in, int caps) +{ + ldout(cct, 10) << "mark_caps_dirty " << *in << " " << ccap_string(in->dirty_caps) << " -> " + << ccap_string(in->dirty_caps | caps) << dendl; + if (caps && !in->caps_dirty()) + in->get(); + in->dirty_caps |= caps; +} + +int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid) +{ + MetaSession *session = in->auth_cap->session; + + int flushing = in->dirty_caps; + assert(flushing); + + ceph_tid_t flush_tid = ++last_flush_tid; + in->flushing_cap_tids[flush_tid] = flushing; + + if (!in->flushing_caps) { + ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl; + num_flushing_caps++; + } else { + ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl; + } + + in->flushing_caps |= flushing; + in->dirty_caps = 0; + + if (!in->flushing_cap_item.is_on_list()) + session->flushing_caps.push_back(&in->flushing_cap_item); + session->flushing_caps_tids.insert(flush_tid); + + *ptid = flush_tid; + return flushing; +} + +void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s) +{ + for (auto &p : in->cap_snaps) { + CapSnap &capsnap = p.second; + if (capsnap.flush_tid > 0) { + old_s->flushing_caps_tids.erase(capsnap.flush_tid); + new_s->flushing_caps_tids.insert(capsnap.flush_tid); + } + } + for (map::iterator it = in->flushing_cap_tids.begin(); + it != in->flushing_cap_tids.end(); + ++it) { + old_s->flushing_caps_tids.erase(it->first); + new_s->flushing_caps_tids.insert(it->first); + } + new_s->flushing_caps.push_back(&in->flushing_cap_item); +} + +/* + * Flush all caps back to the MDS. Because the callers generally wait on the + * result of this function (syncfs and umount cases), we set + * CHECK_CAPS_SYNCHRONOUS on the last check_caps call. 
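 *
 * Illustrative shape of that "mark only the final element synchronous"
 * walk (a sketch, not the verbatim loop below):
 *
 *   for (auto it = items.begin(); it != items.end(); ) {
 *     unsigned flags = CHECK_CAPS_NODELAY;
 *     Inode *in = *it;
 *     ++it;
 *     if (it == items.end())
 *       flags |= CHECK_CAPS_SYNCHRONOUS;   // caller waits on this one
 *     check_caps(in, flags);
 *   }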
+ */ +void Client::flush_caps_sync() +{ + ldout(cct, 10) << __func__ << dendl; + xlist::iterator p = delayed_caps.begin(); + while (!p.end()) { + unsigned flags = CHECK_CAPS_NODELAY; + Inode *in = *p; + + ++p; + delayed_caps.pop_front(); + if (p.end() && cap_list.empty()) + flags |= CHECK_CAPS_SYNCHRONOUS; + check_caps(in, flags); + } + + // other caps, too + p = cap_list.begin(); + while (!p.end()) { + unsigned flags = CHECK_CAPS_NODELAY; + Inode *in = *p; + + ++p; + if (p.end()) + flags |= CHECK_CAPS_SYNCHRONOUS; + check_caps(in, flags); + } +} + +void Client::flush_caps(Inode *in, MetaSession *session, bool sync) +{ + ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl; + Cap *cap = in->auth_cap; + assert(cap->session == session); + + for (map::iterator p = in->flushing_cap_tids.begin(); + p != in->flushing_cap_tids.end(); + ++p) { + bool req_sync = false; + + /* If this is a synchronous request, then flush the journal on last one */ + if (sync && (p->first == in->flushing_cap_tids.rbegin()->first)) + req_sync = true; + + send_cap(in, session, cap, req_sync, + (get_caps_used(in) | in->caps_dirty()), + in->caps_wanted(), (cap->issued | cap->implemented), + p->second, p->first); + } +} + +void Client::wait_sync_caps(Inode *in, ceph_tid_t want) +{ + while (in->flushing_caps) { + map::iterator it = in->flushing_cap_tids.begin(); + assert(it != in->flushing_cap_tids.end()); + if (it->first > want) + break; + ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing " + << ccap_string(it->second) << " want " << want + << " last " << it->first << dendl; + wait_on_list(in->waitfor_caps); + } +} + +void Client::wait_sync_caps(ceph_tid_t want) +{ + retry: + ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", " + << num_flushing_caps << " total flushing)" << dendl; + for (map::iterator p = mds_sessions.begin(); + p != mds_sessions.end(); + ++p) { + MetaSession *s = p->second; + if (s->flushing_caps_tids.empty()) + continue; + ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin(); + if (oldest_tid <= want) { + ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid + << " (want " << want << ")" << dendl; + sync_cond.Wait(client_lock); + goto retry; + } + } +} + +void Client::kick_flushing_caps(MetaSession *session) +{ + mds_rank_t mds = session->mds_num; + ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl; + + for (xlist::iterator p = session->flushing_caps.begin(); !p.end(); ++p) { + Inode *in = *p; + if (session->early_flushing_caps.count(in)) + continue; + ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl; + if (in->cap_snaps.size()) + flush_snaps(in, true); + if (in->flushing_caps) + flush_caps(in, session); + } + + session->early_flushing_caps.clear(); +} + +void Client::early_kick_flushing_caps(MetaSession *session) +{ + session->early_flushing_caps.clear(); + + for (xlist::iterator p = session->flushing_caps.begin(); !p.end(); ++p) { + Inode *in = *p; + assert(in->auth_cap); + + // if flushing caps were revoked, we re-send the cap flush in client reconnect + // stage. This guarantees that MDS processes the cap flush message before issuing + // the flushing caps to other client. + if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps) + continue; + + ldout(cct, 20) << " reflushing caps (early_kick) on " << *in + << " to mds." 
<< session->mds_num << dendl;
+
+    session->early_flushing_caps.insert(in);
+
+    if (in->cap_snaps.size())
+      flush_snaps(in, true);
+    if (in->flushing_caps)
+      flush_caps(in, session);
+
+  }
+}
+
+void Client::kick_maxsize_requests(MetaSession *session)
+{
+  xlist<Cap*>::iterator iter = session->caps.begin();
+  while (!iter.end()){
+    (*iter)->inode->requested_max_size = 0;
+    (*iter)->inode->wanted_max_size = 0;
+    signal_cond_list((*iter)->inode->waitfor_caps);
+    ++iter;
+  }
+}
+
+void SnapRealm::build_snap_context()
+{
+  set<snapid_t> snaps;
+  snapid_t max_seq = seq;
+
+  // start with prior_parents?
+  for (unsigned i=0; i<prior_parent_snaps.size(); i++)
+    snaps.insert(prior_parent_snaps[i]);
+
+  // current parent's snaps
+  if (pparent) {
+    const SnapContext& psnapc = pparent->get_snap_context();
+    for (unsigned i=0; i<psnapc.snaps.size(); i++)
+      if (psnapc.snaps[i] >= parent_since)
+        snaps.insert(psnapc.snaps[i]);
+    if (psnapc.seq > max_seq)
+      max_seq = psnapc.seq;
+  }
+
+  // my snaps
+  for (unsigned i=0; i<my_snaps.size(); i++)
+    snaps.insert(my_snaps[i]);
+
+  // ok!
+  cached_snap_context.seq = max_seq;
+  cached_snap_context.snaps.resize(0);
+  cached_snap_context.snaps.reserve(snaps.size());
+  for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
+    cached_snap_context.snaps.push_back(*p);
+}
+
+void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
+{
+  list<SnapRealm*> q;
+  q.push_back(realm);
+
+  while (!q.empty()) {
+    realm = q.front();
+    q.pop_front();
+
+    ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
+    realm->invalidate_cache();
+
+    for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
+         p != realm->pchildren.end();
+         ++p)
+      q.push_back(*p);
+  }
+}
+
+SnapRealm *Client::get_snap_realm(inodeno_t r)
+{
+  SnapRealm *realm = snap_realms[r];
+  if (!realm)
+    snap_realms[r] = realm = new SnapRealm(r);
+  ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
+  realm->nref++;
+  return realm;
+}
+
+SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
+{
+  if (snap_realms.count(r) == 0) {
+    ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
+    return NULL;
+  }
+  SnapRealm *realm = snap_realms[r];
+  ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
+  realm->nref++;
+  return realm;
+}
+
+void Client::put_snap_realm(SnapRealm *realm)
+{
+  ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
+                 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
+  if (--realm->nref == 0) {
+    snap_realms.erase(realm->ino);
+    if (realm->pparent) {
+      realm->pparent->pchildren.erase(realm);
+      put_snap_realm(realm->pparent);
+    }
+    delete realm;
+  }
+}
+
+bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
+{
+  if (realm->parent != parent) {
+    ldout(cct, 10) << "adjust_realm_parent " << *realm
+                   << " " << realm->parent << " -> " << parent << dendl;
+    realm->parent = parent;
+    if (realm->pparent) {
+      realm->pparent->pchildren.erase(realm);
+      put_snap_realm(realm->pparent);
+    }
+    realm->pparent = get_snap_realm(parent);
+    realm->pparent->pchildren.insert(realm);
+    return true;
+  }
+  return false;
+}
+
+static bool has_new_snaps(const SnapContext& old_snapc,
+                          const SnapContext& new_snapc)
+{
+  return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
+}
+
+
+void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
+{
+  SnapRealm *first_realm = NULL;
+  ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;
+
+  map<SnapRealm*, SnapContext> dirty_realms;
+
+  bufferlist::iterator p = bl.begin();
+  while (!p.end()) {
+    SnapRealmInfo info;
+    ::decode(info, p);
+    SnapRealm *realm = get_snap_realm(info.ino());
+
+    bool invalidate = false;
+
+    if (info.seq() > realm->seq) {
+      ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
+                     << dendl;
+
+      if (flush) {
+        // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
+        //  flush me + children
+        list<SnapRealm*> q;
+        q.push_back(realm);
+        while (!q.empty()) {
+          SnapRealm *realm = q.front();
+          q.pop_front();
+
+          for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
+               p != realm->pchildren.end();
+               ++p)
+            q.push_back(*p);
+
+          if (dirty_realms.count(realm) == 0) {
+            realm->nref++;
+            dirty_realms[realm] = realm->get_snap_context();
+          }
+        }
+      }
+
+      // update
+      realm->seq = info.seq();
+      realm->created = info.created();
+      realm->parent_since = info.parent_since();
+      realm->prior_parent_snaps = info.prior_parent_snaps;
+      realm->my_snaps = info.my_snaps;
+      invalidate = true;
+    }
+
+    // _always_ verify parent
+    if (adjust_realm_parent(realm, info.parent()))
+      invalidate = true;
+
+    if (invalidate) {
+      invalidate_snaprealm_and_children(realm);
+      ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
+      ldout(cct, 15) << "  snapc " << realm->get_snap_context() << dendl;
+    } else {
+      ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
+                     << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
+    }
+
+    if (!first_realm)
+      first_realm = realm;
+    else
+      put_snap_realm(realm);
+  }
+
+  for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
+       q != dirty_realms.end();
+       ++q) {
+    SnapRealm *realm = q->first;
+    // if there are new snaps ?
+    if (has_new_snaps(q->second, realm->get_snap_context())) {
+      ldout(cct, 10) << " flushing caps on " << *realm << dendl;
+      xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
+      while (!r.end()) {
+        Inode *in = *r;
+        ++r;
+        queue_cap_snap(in, q->second);
+      }
+    } else {
+      ldout(cct, 10) << " no new snap on " << *realm << dendl;
+    }
+    put_snap_realm(realm);
+  }
+
+  if (realm_ret)
+    *realm_ret = first_realm;
+  else
+    put_snap_realm(first_realm);
+}
+
+void Client::handle_snap(MClientSnap *m)
+{
+  ldout(cct, 10) << "handle_snap " << *m << dendl;
+  mds_rank_t mds = mds_rank_t(m->get_source().num());
+  MetaSession *session = _get_mds_session(mds, m->get_connection().get());
+  if (!session) {
+    m->put();
+    return;
+  }
+
+  got_mds_push(session);
+
+  map<Inode*, SnapContext> to_move;
+  SnapRealm *realm = 0;
+
+  if (m->head.op == CEPH_SNAP_OP_SPLIT) {
+    assert(m->head.split);
+    SnapRealmInfo info;
+    bufferlist::iterator p = m->bl.begin();
+    ::decode(info, p);
+    assert(info.ino() == m->head.split);
+
+    // flush, then move, ino's.
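+    // (A rough sketch of the SPLIT flow, for orientation: the MDS is telling
+    // us that some inodes now belong to a new, more specific snaprealm. Below
+    // we detach each affected inode from its old realm while remembering the
+    // old snap context in to_move, so dirty data can still be flushed against
+    // the old snaps, then re-parent child realms and re-attach the inodes.)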
+ realm = get_snap_realm(info.ino()); + ldout(cct, 10) << " splitting off " << *realm << dendl; + for (vector::iterator p = m->split_inos.begin(); + p != m->split_inos.end(); + ++p) { + vinodeno_t vino(*p, CEPH_NOSNAP); + if (inode_map.count(vino)) { + Inode *in = inode_map[vino]; + if (!in->snaprealm || in->snaprealm == realm) + continue; + if (in->snaprealm->created > info.created()) { + ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm " + << *in->snaprealm << dendl; + continue; + } + ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl; + + + in->snaprealm_item.remove_myself(); + to_move[in] = in->snaprealm->get_snap_context(); + put_snap_realm(in->snaprealm); + } + } + + // move child snaprealms, too + for (vector::iterator p = m->split_realms.begin(); + p != m->split_realms.end(); + ++p) { + ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl; + SnapRealm *child = get_snap_realm_maybe(*p); + if (!child) + continue; + adjust_realm_parent(child, realm->ino); + put_snap_realm(child); + } + } + + update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY); + + if (realm) { + for (auto p = to_move.begin(); p != to_move.end(); ++p) { + Inode *in = p->first; + in->snaprealm = realm; + realm->inodes_with_caps.push_back(&in->snaprealm_item); + realm->nref++; + // queue for snap writeback + if (has_new_snaps(p->second, realm->get_snap_context())) + queue_cap_snap(in, p->second); + } + put_snap_realm(realm); + } + + m->put(); +} + +void Client::handle_quota(MClientQuota *m) +{ + mds_rank_t mds = mds_rank_t(m->get_source().num()); + MetaSession *session = _get_mds_session(mds, m->get_connection().get()); + if (!session) { + m->put(); + return; + } + + got_mds_push(session); + + ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl; + + vinodeno_t vino(m->ino, CEPH_NOSNAP); + if (inode_map.count(vino)) { + Inode *in = NULL; + in = inode_map[vino]; + + if (in) { + in->quota = m->quota; + in->rstat = m->rstat; + } + } + + m->put(); +} + +void Client::handle_caps(MClientCaps *m) +{ + mds_rank_t mds = mds_rank_t(m->get_source().num()); + MetaSession *session = _get_mds_session(mds, m->get_connection().get()); + if (!session) { + m->put(); + return; + } + + if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) { + // Pause RADOS operations until we see the required epoch + objecter->set_epoch_barrier(m->osd_epoch_barrier); + } + + if (m->osd_epoch_barrier > cap_epoch_barrier) { + // Record the barrier so that we will transmit it to MDS when releasing + set_cap_epoch_barrier(m->osd_epoch_barrier); + } + + got_mds_push(session); + + m->clear_payload(); // for if/when we send back to MDS + + Inode *in = 0; + vinodeno_t vino(m->get_ino(), CEPH_NOSNAP); + if (inode_map.count(vino)) + in = inode_map[vino]; + if (!in) { + if (m->get_op() == CEPH_CAP_OP_IMPORT) { + ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl; + session->enqueue_cap_release( + m->get_ino(), + m->get_cap_id(), + m->get_seq(), + m->get_mseq(), + cap_epoch_barrier); + } else { + ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl; + } + m->put(); + + // in case the mds is waiting on e.g. 
a revocation + flush_cap_releases(); + return; + } + + switch (m->get_op()) { + case CEPH_CAP_OP_EXPORT: + return handle_cap_export(session, in, m); + case CEPH_CAP_OP_FLUSHSNAP_ACK: + return handle_cap_flushsnap_ack(session, in, m); + case CEPH_CAP_OP_IMPORT: + handle_cap_import(session, in, m); + } + + if (in->caps.count(mds) == 0) { + ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl; + m->put(); + return; + } + + Cap *cap = in->caps[mds]; + + switch (m->get_op()) { + case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m); + case CEPH_CAP_OP_IMPORT: + case CEPH_CAP_OP_REVOKE: + case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m); + case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m); + default: + m->put(); + } +} + +void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m) +{ + mds_rank_t mds = session->mds_num; + + ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq() + << " IMPORT from mds." << mds << dendl; + + const mds_rank_t peer_mds = mds_rank_t(m->peer.mds); + Cap *cap = NULL; + UserPerm cap_perms; + if (m->peer.cap_id && in->caps.count(peer_mds)) { + cap = in->caps[peer_mds]; + if (cap) { + cap_perms = cap->latest_perms; + } + } + + // add/update it + SnapRealm *realm = NULL; + update_snap_trace(m->snapbl, &realm); + + add_update_cap(in, session, m->get_cap_id(), + m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(), + CEPH_CAP_FLAG_AUTH, cap_perms); + + if (cap && cap->cap_id == m->peer.cap_id) { + remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE)); + } + + if (realm) + put_snap_realm(realm); + + if (in->auth_cap && in->auth_cap->session->mds_num == mds) { + // reflush any/all caps (if we are now the auth_cap) + if (in->cap_snaps.size()) + flush_snaps(in, true); + if (in->flushing_caps) + flush_caps(in, session); + } +} + +void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m) +{ + mds_rank_t mds = session->mds_num; + + ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq() + << " EXPORT from mds." << mds << dendl; + + Cap *cap = NULL; + if (in->caps.count(mds)) + cap = in->caps[mds]; + + const mds_rank_t peer_mds = mds_rank_t(m->peer.mds); + + if (cap && cap->cap_id == m->get_cap_id()) { + if (m->peer.cap_id) { + MetaSession *tsession = _get_or_open_mds_session(peer_mds); + if (in->caps.count(peer_mds)) { + Cap *tcap = in->caps[peer_mds]; + if (tcap->cap_id == m->peer.cap_id && + ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) { + tcap->cap_id = m->peer.cap_id; + tcap->seq = m->peer.seq - 1; + tcap->issue_seq = tcap->seq; + tcap->mseq = m->peer.mseq; + tcap->issued |= cap->issued; + tcap->implemented |= cap->issued; + if (cap == in->auth_cap) + in->auth_cap = tcap; + if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list()) + adjust_session_flushing_caps(in, session, tsession); + } + } else { + add_update_cap(in, tsession, m->peer.cap_id, cap->issued, + m->peer.seq - 1, m->peer.mseq, (uint64_t)-1, + cap == in->auth_cap ? 
CEPH_CAP_FLAG_AUTH : 0, + cap->latest_perms); + } + } else { + if (cap == in->auth_cap) + in->flags |= I_CAP_DROPPED; + } + + remove_cap(cap, false); + } + + m->put(); +} + +void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m) +{ + mds_rank_t mds = session->mds_num; + assert(in->caps[mds]); + + ldout(cct, 10) << "handle_cap_trunc on ino " << *in + << " size " << in->size << " -> " << m->get_size() + << dendl; + + int implemented = 0; + int issued = in->caps_issued(&implemented) | in->caps_dirty(); + issued |= implemented; + update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), + m->get_size(), m->get_change_attr(), m->get_time_warp_seq(), + m->get_ctime(), m->get_mtime(), m->get_atime(), + m->inline_version, m->inline_data, issued); + m->put(); +} + +void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m) +{ + ceph_tid_t flush_ack_tid = m->get_client_tid(); + int dirty = m->get_dirty(); + int cleaned = 0; + int flushed = 0; + + for (map::iterator it = in->flushing_cap_tids.begin(); + it != in->flushing_cap_tids.end(); ) { + if (it->first == flush_ack_tid) + cleaned = it->second; + if (it->first <= flush_ack_tid) { + session->flushing_caps_tids.erase(it->first); + in->flushing_cap_tids.erase(it++); + ++flushed; + continue; + } + cleaned &= ~it->second; + if (!cleaned) + break; + ++it; + } + + ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num + << " cleaned " << ccap_string(cleaned) << " on " << *in + << " with " << ccap_string(dirty) << dendl; + + if (flushed) { + signal_cond_list(in->waitfor_caps); + if (session->flushing_caps_tids.empty() || + *session->flushing_caps_tids.begin() > flush_ack_tid) + sync_cond.Signal(); + } + + if (!dirty) { + in->cap_dirtier_uid = -1; + in->cap_dirtier_gid = -1; + } + + if (!cleaned) { + ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl; + } else { + if (in->flushing_caps) { + ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps) + << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl; + in->flushing_caps &= ~cleaned; + if (in->flushing_caps == 0) { + ldout(cct, 10) << " " << *in << " !flushing" << dendl; + num_flushing_caps--; + if (in->cap_snaps.empty()) + in->flushing_cap_item.remove_myself(); + } + if (!in->caps_dirty()) + put_inode(in); + } + } + + m->put(); +} + + +void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m) +{ + mds_rank_t mds = session->mds_num; + assert(in->caps[mds]); + snapid_t follows = m->get_snap_follows(); + + if (in->cap_snaps.count(follows)) { + CapSnap &capsnap = in->cap_snaps.at(follows); + if (m->get_client_tid() != capsnap.flush_tid) { + ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl; + } else { + ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows + << " on " << *in << dendl; + InodeRef tmp_ref; + if (in->get_num_ref() == 1) + tmp_ref = in; // make sure inode not get freed while erasing item from in->cap_snaps + if (in->flushing_caps == 0 && in->cap_snaps.empty()) + in->flushing_cap_item.remove_myself(); + session->flushing_caps_tids.erase(capsnap.flush_tid); + in->cap_snaps.erase(follows); + } + } else { + ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." 
<< mds << " flushed snap follows " << follows + << " on " << *in << dendl; + // we may not have it if we send multiple FLUSHSNAP requests and (got multiple FLUSHEDSNAPs back) + } + + m->put(); +} + +class C_Client_DentryInvalidate : public Context { +private: + Client *client; + vinodeno_t dirino; + vinodeno_t ino; + string name; +public: + C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) : + client(c), name(dn->name) { + if (client->use_faked_inos()) { + dirino.ino = dn->dir->parent_inode->faked_ino; + if (del) + ino.ino = dn->inode->faked_ino; + } else { + dirino = dn->dir->parent_inode->vino(); + if (del) + ino = dn->inode->vino(); + } + if (!del) + ino.ino = inodeno_t(); + } + void finish(int r) override { + // _async_dentry_invalidate is responsible for its own locking + assert(!client->client_lock.is_locked_by_me()); + client->_async_dentry_invalidate(dirino, ino, name); + } +}; + +void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name) +{ + if (unmounting) + return; + ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino + << " in dir " << dirino << dendl; + dentry_invalidate_cb(callback_handle, dirino, ino, name); +} + +void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del) +{ + if (dentry_invalidate_cb && dn->inode->ll_ref > 0) + async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del)); +} + +void Client::_try_to_trim_inode(Inode *in, bool sched_inval) +{ + int ref = in->get_num_ref(); + + if (in->dir && !in->dir->dentries.empty()) { + for (auto p = in->dir->dentries.begin(); + p != in->dir->dentries.end(); ) { + Dentry *dn = p->second; + ++p; + /* rmsnap removes whole subtree, need trim inodes recursively. + * we don't need to invalidate dentries recursively. because + * invalidating a directory dentry effectively invalidate + * whole subtree */ + if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir()) + _try_to_trim_inode(dn->inode.get(), false); + + if (dn->lru_is_expireable()) + unlink(dn, true, false); // keep dir, drop dentry + } + if (in->dir->dentries.empty()) { + close_dir(in->dir); + --ref; + } + } + + if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) { + InodeRef snapdir = open_snapdir(in); + _try_to_trim_inode(snapdir.get(), false); + --ref; + } + + if (ref > 0 && in->ll_ref > 0 && sched_inval) { + set::iterator q = in->dn_set.begin(); + while (q != in->dn_set.end()) { + Dentry *dn = *q++; + // FIXME: we play lots of unlink/link tricks when handling MDS replies, + // so in->dn_set doesn't always reflect the state of kernel's dcache. + _schedule_invalidate_dentry_callback(dn, true); + unlink(dn, true, true); + } + } +} + +void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m) +{ + mds_rank_t mds = session->mds_num; + int used = get_caps_used(in); + int wanted = in->caps_wanted(); + + const int old_caps = cap->issued; + const int new_caps = m->get_caps(); + ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino() + << " mds." 
<< mds << " seq " << m->get_seq() + << " caps now " << ccap_string(new_caps) + << " was " << ccap_string(old_caps) << dendl; + cap->seq = m->get_seq(); + + in->layout = m->get_layout(); + + // update inode + int implemented = 0; + int issued = in->caps_issued(&implemented) | in->caps_dirty(); + issued |= implemented; + + if ((issued & CEPH_CAP_AUTH_EXCL) == 0) { + in->mode = m->head.mode; + in->uid = m->head.uid; + in->gid = m->head.gid; + in->btime = m->btime; + } + bool deleted_inode = false; + if ((issued & CEPH_CAP_LINK_EXCL) == 0) { + in->nlink = m->head.nlink; + if (in->nlink == 0 && + (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + deleted_inode = true; + } + if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && + m->xattrbl.length() && + m->head.xattr_version > in->xattr_version) { + bufferlist::iterator p = m->xattrbl.begin(); + ::decode(in->xattrs, p); + in->xattr_version = m->head.xattr_version; + } + update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(), + m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(), + m->get_mtime(), m->get_atime(), + m->inline_version, m->inline_data, issued); + + // max_size + if (cap == in->auth_cap && + m->get_max_size() != in->max_size) { + ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl; + in->max_size = m->get_max_size(); + if (in->max_size > in->wanted_max_size) { + in->wanted_max_size = 0; + in->requested_max_size = 0; + } + } + + bool check = false; + if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted) + check = true; + + check_cap_issue(in, cap, new_caps); + + // update caps + if (old_caps & ~new_caps) { + ldout(cct, 10) << " revocation of " << ccap_string(~new_caps & old_caps) << dendl; + cap->issued = new_caps; + cap->implemented |= new_caps; + + if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER) + && !_flush(in, new C_Client_FlushComplete(this, in))) { + // waitin' for flush + } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) { + if (_release(in)) + check = true; + } else { + cap->wanted = 0; // don't let check_caps skip sending a response to MDS + check = true; + } + + } else if (old_caps == new_caps) { + ldout(cct, 10) << " caps unchanged at " << ccap_string(old_caps) << dendl; + } else { + ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl; + cap->issued = new_caps; + cap->implemented |= new_caps; + + if (cap == in->auth_cap) { + // non-auth MDS is revoking the newly grant caps ? 
+ for (map::iterator it = in->caps.begin(); it != in->caps.end(); ++it) { + if (it->second == cap) + continue; + if (it->second->implemented & ~it->second->issued & new_caps) { + check = true; + break; + } + } + } + } + + if (check) + check_caps(in, 0); + + // wake up waiters + if (new_caps) + signal_cond_list(in->waitfor_caps); + + // may drop inode's last ref + if (deleted_inode) + _try_to_trim_inode(in, true); + + m->put(); +} + +int Client::_getgrouplist(gid_t** sgids, uid_t uid, gid_t gid) +{ + // cppcheck-suppress variableScope + int sgid_count; + gid_t *sgid_buf; + + if (getgroups_cb) { + sgid_count = getgroups_cb(callback_handle, &sgid_buf); + if (sgid_count > 0) { + *sgids = sgid_buf; + return sgid_count; + } + } + +#if HAVE_GETGROUPLIST + struct passwd *pw; + pw = getpwuid(uid); + if (pw == NULL) { + ldout(cct, 3) << "getting user entry failed" << dendl; + return -errno; + } + //use PAM to get the group list + // initial number of group entries, defaults to posix standard of 16 + // PAM implementations may provide more than 16 groups.... + sgid_count = 16; + sgid_buf = (gid_t*)malloc(sgid_count * sizeof(gid_t)); + if (sgid_buf == NULL) { + ldout(cct, 3) << "allocating group memory failed" << dendl; + return -ENOMEM; + } + + while (1) { +#if defined(__APPLE__) + if (getgrouplist(pw->pw_name, gid, (int*)sgid_buf, &sgid_count) == -1) { +#else + if (getgrouplist(pw->pw_name, gid, sgid_buf, &sgid_count) == -1) { +#endif + // we need to resize the group list and try again + void *_realloc = NULL; + if ((_realloc = realloc(sgid_buf, sgid_count * sizeof(gid_t))) == NULL) { + ldout(cct, 3) << "allocating group memory failed" << dendl; + free(sgid_buf); + return -ENOMEM; + } + sgid_buf = (gid_t*)_realloc; + continue; + } + // list was successfully retrieved + break; + } + *sgids = sgid_buf; + return sgid_count; +#else + return 0; +#endif +} + +int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want) +{ + if (perms.uid() == 0) + return 0; + + if (perms.uid() != in->uid && (in->mode & S_IRWXG)) { + int ret = _posix_acl_permission(in, perms, want); + if (ret != -EAGAIN) + return ret; + } + + // check permissions before doing anything else + if (!in->check_mode(perms, want)) + return -EACCES; + return 0; +} + +int Client::xattr_permission(Inode *in, const char *name, unsigned want, + const UserPerm& perms) +{ + int r = _getattr_for_perm(in, perms); + if (r < 0) + goto out; + + r = 0; + if (strncmp(name, "system.", 7) == 0) { + if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid)) + r = -EPERM; + } else { + r = inode_permission(in, perms, want); + } +out: + ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl; + return r; +} + +ostream& operator<<(ostream &out, const UserPerm& perm) { + out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")"; + return out; +} + +int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms) +{ + ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl; + int r = _getattr_for_perm(in, perms); + if (r < 0) + goto out; + + if (mask & CEPH_SETATTR_SIZE) { + r = inode_permission(in, perms, MAY_WRITE); + if (r < 0) + goto out; + } + + r = -EPERM; + if (mask & CEPH_SETATTR_UID) { + if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid)) + goto out; + } + if (mask & CEPH_SETATTR_GID) { + if (perms.uid() != 0 && (perms.uid() != in->uid || + (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid))) + goto out; + } + + if (mask & 
CEPH_SETATTR_MODE) { + if (perms.uid() != 0 && perms.uid() != in->uid) + goto out; + + gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid; + if (perms.uid() != 0 && !perms.gid_in_groups(i_gid)) + stx->stx_mode &= ~S_ISGID; + } + + if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME | + CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) { + if (perms.uid() != 0 && perms.uid() != in->uid) { + int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME; + if (!(mask & CEPH_SETATTR_MTIME_NOW)) + check_mask |= CEPH_SETATTR_MTIME; + if (!(mask & CEPH_SETATTR_ATIME_NOW)) + check_mask |= CEPH_SETATTR_ATIME; + if (check_mask & mask) { + goto out; + } else { + r = inode_permission(in, perms, MAY_WRITE); + if (r < 0) + goto out; + } + } + } + r = 0; +out: + ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl; + return r; +} + +int Client::may_open(Inode *in, int flags, const UserPerm& perms) +{ + ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl; + unsigned want = 0; + + if ((flags & O_ACCMODE) == O_WRONLY) + want = MAY_WRITE; + else if ((flags & O_ACCMODE) == O_RDWR) + want = MAY_READ | MAY_WRITE; + else if ((flags & O_ACCMODE) == O_RDONLY) + want = MAY_READ; + if (flags & O_TRUNC) + want |= MAY_WRITE; + + int r = 0; + switch (in->mode & S_IFMT) { + case S_IFLNK: + r = -ELOOP; + goto out; + case S_IFDIR: + if (want & MAY_WRITE) { + r = -EISDIR; + goto out; + } + break; + } + + r = _getattr_for_perm(in, perms); + if (r < 0) + goto out; + + r = inode_permission(in, perms, want); +out: + ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl; + return r; +} + +int Client::may_lookup(Inode *dir, const UserPerm& perms) +{ + ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl; + int r = _getattr_for_perm(dir, perms); + if (r < 0) + goto out; + + r = inode_permission(dir, perms, MAY_EXEC); +out: + ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl; + return r; +} + +int Client::may_create(Inode *dir, const UserPerm& perms) +{ + ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl; + int r = _getattr_for_perm(dir, perms); + if (r < 0) + goto out; + + r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE); +out: + ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl; + return r; +} + +int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms) +{ + ldout(cct, 20) << __func__ << " " << *dir << "; " << "; name " << name << "; " << perms << dendl; + int r = _getattr_for_perm(dir, perms); + if (r < 0) + goto out; + + r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE); + if (r < 0) + goto out; + + /* 'name == NULL' means rmsnap */ + if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) { + InodeRef otherin; + r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms); + if (r < 0) + goto out; + if (dir->uid != perms.uid() && otherin->uid != perms.uid()) + r = -EPERM; + } +out: + ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl; + return r; +} + +int Client::may_hardlink(Inode *in, const UserPerm& perms) +{ + ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl; + int r = _getattr_for_perm(in, perms); + if (r < 0) + goto out; + + if (perms.uid() == 0 || perms.uid() == in->uid) { + r = 0; + goto out; + } + + r = -EPERM; + if (!S_ISREG(in->mode)) + goto out; + + if (in->mode & S_ISUID) + goto out; + + if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) + goto out; + + r = inode_permission(in, perms, MAY_READ | MAY_WRITE); +out: + ldout(cct, 3) << __func__ << " " << in 
<< " = " << r << dendl; + return r; +} + +int Client::_getattr_for_perm(Inode *in, const UserPerm& perms) +{ + int mask = CEPH_STAT_CAP_MODE; + bool force = false; + if (acl_type != NO_ACL) { + mask |= CEPH_STAT_CAP_XATTR; + force = in->xattr_version == 0; + } + return _getattr(in, mask, perms, force); +} + +vinodeno_t Client::_get_vino(Inode *in) +{ + /* The caller must hold the client lock */ + return vinodeno_t(in->ino, in->snapid); +} + +inodeno_t Client::_get_inodeno(Inode *in) +{ + /* The caller must hold the client lock */ + return in->ino; +} + + +/** + * Resolve an MDS spec to a list of MDS daemon GIDs. + * + * The spec is a string representing a GID, rank, filesystem:rank, or name/id. + * It may be '*' in which case it matches all GIDs. + * + * If no error is returned, the `targets` vector will be populated with at least + * one MDS. + */ +int Client::resolve_mds( + const std::string &mds_spec, + std::vector *targets) +{ + assert(fsmap); + assert(targets != nullptr); + + mds_role_t role; + std::stringstream ss; + int role_r = fsmap->parse_role(mds_spec, &role, ss); + if (role_r == 0) { + // We got a role, resolve it to a GID + ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '" + << role << "'" << dendl; + targets->push_back( + fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id); + return 0; + } + + std::string strtol_err; + long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err); + if (strtol_err.empty()) { + // It is a possible GID + const mds_gid_t mds_gid = mds_gid_t(rank_or_gid); + if (fsmap->gid_exists(mds_gid)) { + ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl; + targets->push_back(mds_gid); + } else { + lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map" + << dendl; + return -ENOENT; + } + } else if (mds_spec == "*") { + // It is a wildcard: use all MDSs + const auto mds_info = fsmap->get_mds_info(); + + if (mds_info.empty()) { + lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl; + return -ENOENT; + } + + for (const auto i : mds_info) { + targets->push_back(i.first); + } + } else { + // It did not parse as an integer, it is not a wildcard, it must be a name + const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec); + if (mds_gid == 0) { + lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl; + + lderr(cct) << "FSMap: " << *fsmap << dendl; + + return -ENOENT; + } else { + ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec + << "' to GID " << mds_gid << dendl; + targets->push_back(mds_gid); + } + } + + return 0; +} + + +/** + * Authenticate with mon and establish global ID + */ +int Client::authenticate() +{ + assert(client_lock.is_locked_by_me()); + + if (monclient->is_authenticated()) { + return 0; + } + + client_lock.Unlock(); + int r = monclient->authenticate(cct->_conf->client_mount_timeout); + client_lock.Lock(); + if (r < 0) { + return r; + } + + whoami = monclient->get_global_id(); + messenger->set_myname(entity_name_t::CLIENT(whoami.v)); + + return 0; +} + +int Client::fetch_fsmap(bool user) +{ + int r; + // Retrieve FSMap to enable looking up daemon addresses. We need FSMap + // rather than MDSMap because no one MDSMap contains all the daemons, and + // a `tell` can address any daemon. 
+ version_t fsmap_latest; + do { + C_SaferCond cond; + monclient->get_version("fsmap", &fsmap_latest, NULL, &cond); + client_lock.Unlock(); + r = cond.wait(); + client_lock.Lock(); + } while (r == -EAGAIN); + + if (r < 0) { + lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl; + return r; + } + + ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl; + + if (user) { + if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) { + monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME); + monclient->renew_subs(); + wait_on_list(waiting_for_fsmap); + } + assert(fsmap_user); + assert(fsmap_user->get_epoch() >= fsmap_latest); + } else { + if (!fsmap || fsmap->get_epoch() < fsmap_latest) { + monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME); + monclient->renew_subs(); + wait_on_list(waiting_for_fsmap); + } + assert(fsmap); + assert(fsmap->get_epoch() >= fsmap_latest); + } + ldout(cct, 10) << __func__ << " finished waiting for FSMap version " + << fsmap_latest << dendl; + return 0; +} + +/** + * + * @mds_spec one of ID, rank, GID, "*" + * + */ +int Client::mds_command( + const std::string &mds_spec, + const vector& cmd, + const bufferlist& inbl, + bufferlist *outbl, + string *outs, + Context *onfinish) +{ + Mutex::Locker lock(client_lock); + + if (!initialized) + return -ENOTCONN; + + int r; + r = authenticate(); + if (r < 0) { + return r; + } + + r = fetch_fsmap(false); + if (r < 0) { + return r; + } + + // Look up MDS target(s) of the command + std::vector targets; + r = resolve_mds(mds_spec, &targets); + if (r < 0) { + return r; + } + + // If daemons are laggy, we won't send them commands. If all + // are laggy then we fail. + std::vector non_laggy; + for (const auto gid : targets) { + const auto info = fsmap->get_info_gid(gid); + if (!info.laggy()) { + non_laggy.push_back(gid); + } + } + if (non_laggy.size() == 0) { + *outs = "All targeted MDS daemons are laggy"; + return -ENOENT; + } + + if (metadata.empty()) { + // We are called on an unmounted client, so metadata + // won't be initialized yet. 
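+    // (populate_metadata() presumably fills in the client metadata map --
+    // entity id, hostname, and similar -- that is normally set up at mount
+    // time; the empty string stands in for the mount root we don't have.)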
+ populate_metadata(""); + } + + // Send commands to targets + C_GatherBuilder gather(cct, onfinish); + for (const auto target_gid : non_laggy) { + const auto info = fsmap->get_info_gid(target_gid); + + // Open a connection to the target MDS + entity_inst_t inst = info.get_inst(); + ConnectionRef conn = messenger->get_connection(inst); + + // Generate MDSCommandOp state + auto &op = command_table.start_command(); + + op.on_finish = gather.new_sub(); + op.cmd = cmd; + op.outbl = outbl; + op.outs = outs; + op.inbl = inbl; + op.mds_gid = target_gid; + op.con = conn; + + ldout(cct, 4) << __func__ << ": new command op to " << target_gid + << " tid=" << op.tid << cmd << dendl; + + // Construct and send MCommand + MCommand *m = op.get_message(monclient->get_fsid()); + conn->send_message(m); + } + gather.activate(); + + return 0; +} + +void Client::handle_command_reply(MCommandReply *m) +{ + ceph_tid_t const tid = m->get_tid(); + + ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl; + + if (!command_table.exists(tid)) { + ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl; + m->put(); + return; + } + + auto &op = command_table.get_command(tid); + if (op.outbl) { + op.outbl->claim(m->get_data()); + } + if (op.outs) { + *op.outs = m->rs; + } + + if (op.on_finish) { + op.on_finish->complete(m->r); + } + + command_table.erase(tid); + + m->put(); +} + +// ------------------- +// MOUNT + +int Client::mount(const std::string &mount_root, const UserPerm& perms, + bool require_mds) +{ + Mutex::Locker lock(client_lock); + + if (mounted) { + ldout(cct, 5) << "already mounted" << dendl; + return 0; + } + + int r = authenticate(); + if (r < 0) { + lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl; + return r; + } + + std::string want = "mdsmap"; + const auto &mds_ns = cct->_conf->client_mds_namespace; + if (!mds_ns.empty()) { + r = fetch_fsmap(true); + if (r < 0) + return r; + fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns); + if (cid == FS_CLUSTER_ID_NONE) + return -ENOENT; + + std::ostringstream oss; + oss << want << "." << cid; + want = oss.str(); + } + ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl; + + monclient->sub_want(want, 0, 0); + monclient->renew_subs(); + + tick(); // start tick + + if (require_mds) { + while (1) { + auto availability = mdsmap->is_cluster_available(); + if (availability == MDSMap::STUCK_UNAVAILABLE) { + // Error out + ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl; + return CEPH_FUSE_NO_MDS_UP; + } else if (availability == MDSMap::AVAILABLE) { + // Continue to mount + break; + } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) { + // Else, wait. MDSMonitor will update the map to bring + // us to a conclusion eventually. + wait_on_list(waiting_for_mdsmap); + } else { + // Unexpected value! + ceph_abort(); + } + } + } + + populate_metadata(mount_root.empty() ? 
"/" : mount_root); + + filepath fp(CEPH_INO_ROOT); + if (!mount_root.empty()) { + fp = filepath(mount_root.c_str()); + } + while (true) { + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR); + req->set_filepath(fp); + req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL; + int res = make_request(req, perms); + if (res < 0) { + if (res == -EACCES && root) { + ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl; + break; + } + return res; + } + + if (fp.depth()) + fp.pop_dentry(); + else + break; + } + + assert(root); + _ll_get(root); + + mounted = true; + + // trace? + if (!cct->_conf->client_trace.empty()) { + traceout.open(cct->_conf->client_trace.c_str()); + if (traceout.is_open()) { + ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl; + } else { + ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl; + } + } + + /* + ldout(cct, 3) << "op: // client trace data structs" << dendl; + ldout(cct, 3) << "op: struct stat st;" << dendl; + ldout(cct, 3) << "op: struct utimbuf utim;" << dendl; + ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl; + ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl; + ldout(cct, 3) << "op: map dir_contents;" << dendl; + ldout(cct, 3) << "op: map open_files;" << dendl; + ldout(cct, 3) << "op: int fd;" << dendl; + */ + return 0; +} + +// UNMOUNT + +void Client::_close_sessions() +{ + while (!mds_sessions.empty()) { + // send session closes! + for (map::iterator p = mds_sessions.begin(); + p != mds_sessions.end(); + ++p) { + if (p->second->state != MetaSession::STATE_CLOSING) { + _close_mds_session(p->second); + } + } + + // wait for sessions to close + ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl; + mount_cond.Wait(client_lock); + } +} + +void Client::flush_mdlog_sync() +{ + if (mds_requests.empty()) + return; + for (map::iterator p = mds_sessions.begin(); + p != mds_sessions.end(); + ++p) { + MetaSession *s = p->second; + flush_mdlog(s); + } +} + +void Client::flush_mdlog(MetaSession *session) +{ + // Only send this to Luminous or newer MDS daemons, older daemons + // will crash if they see an unknown CEPH_SESSION_* value in this msg. 
+ const uint64_t features = session->con->get_features(); + if (HAVE_FEATURE(features, SERVER_LUMINOUS)) { + MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG); + session->con->send_message(m); + } +} + + +void Client::unmount() +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return; + + ldout(cct, 2) << "unmounting" << dendl; + unmounting = true; + + flush_mdlog_sync(); // flush the mdlog for pending requests, if any + while (!mds_requests.empty()) { + ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl; + mount_cond.Wait(client_lock); + } + + if (tick_event) + timer.cancel_event(tick_event); + tick_event = 0; + + cwd.reset(); + + // clean up any unclosed files + while (!fd_map.empty()) { + Fh *fh = fd_map.begin()->second; + fd_map.erase(fd_map.begin()); + ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl; + _release_fh(fh); + } + + while (!ll_unclosed_fh_set.empty()) { + set::iterator it = ll_unclosed_fh_set.begin(); + Fh *fh = *it; + ll_unclosed_fh_set.erase(fh); + ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl; + _release_fh(fh); + } + + while (!opened_dirs.empty()) { + dir_result_t *dirp = *opened_dirs.begin(); + ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl; + _closedir(dirp); + } + + _ll_drop_pins(); + + if (blacklisted) { + ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl; + + if (cct->_conf->client_oc) { + // Purge all cached data so that ObjectCacher doesn't get hung up + // trying to flush it. ObjectCacher's behaviour on EBLACKLISTED + // is to just leave things marked dirty + // (http://tracker.ceph.com/issues/9105) + for (const auto &i : inode_map) { + objectcacher->purge_set(&(i.second->oset)); + } + } + + mounted = false; + return; + } + + while (unsafe_sync_write > 0) { + ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl; + mount_cond.Wait(client_lock); + } + + if (cct->_conf->client_oc) { + // flush/release all buffered data + ceph::unordered_map::iterator next; + for (ceph::unordered_map::iterator p = inode_map.begin(); + p != inode_map.end(); + p = next) { + next = p; + ++next; + Inode *in = p->second; + if (!in) { + ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl; + assert(in); + } + if (!in->caps.empty()) { + InodeRef tmp_ref(in); + _release(in); + _flush(in, new C_Client_FlushComplete(this, in)); + } + } + } + + flush_caps_sync(); + wait_sync_caps(last_flush_tid); + + // empty lru cache + trim_cache(); + + while (lru.lru_get_size() > 0 || + !inode_map.empty()) { + ldout(cct, 2) << "cache still has " << lru.lru_get_size() + << "+" << inode_map.size() << " items" + << ", waiting (for caps to release?)" + << dendl; + utime_t until = ceph_clock_now() + utime_t(5, 0); + int r = mount_cond.WaitUntil(client_lock, until); + if (r == ETIMEDOUT) { + dump_cache(NULL); + } + } + assert(lru.lru_get_size() == 0); + assert(inode_map.empty()); + + // stop tracing + if (!cct->_conf->client_trace.empty()) { + ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl; + traceout.close(); + } + + _close_sessions(); + + mounted = false; + + ldout(cct, 2) << "unmounted." 
<< dendl; +} + +void Client::flush_cap_releases() +{ + // send any cap releases + for (map::iterator p = mds_sessions.begin(); + p != mds_sessions.end(); + ++p) { + if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping( + p->first)) { + if (cct->_conf->client_inject_release_failure) { + ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl; + p->second->release->put(); + } else { + p->second->con->send_message(p->second->release); + } + p->second->release = 0; + } + } +} + +void Client::tick() +{ + if (cct->_conf->client_debug_inject_tick_delay > 0) { + sleep(cct->_conf->client_debug_inject_tick_delay); + assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0")); + cct->_conf->apply_changes(NULL); + } + + ldout(cct, 21) << "tick" << dendl; + tick_event = timer.add_event_after( + cct->_conf->client_tick_interval, + new FunctionContext([this](int) { + // Called back via Timer, which takes client_lock for us + assert(client_lock.is_locked_by_me()); + tick(); + })); + utime_t now = ceph_clock_now(); + + if (!mounted && !mds_requests.empty()) { + MetaRequest *req = mds_requests.begin()->second; + if (req->op_stamp + cct->_conf->client_mount_timeout < now) { + req->abort(-ETIMEDOUT); + if (req->caller_cond) { + req->kick = true; + req->caller_cond->Signal(); + } + signal_cond_list(waiting_for_mdsmap); + for (map::iterator p = mds_sessions.begin(); + p != mds_sessions.end(); + ++p) + signal_context_list(p->second->waiting_for_open); + } + } + + if (mdsmap->get_epoch()) { + // renew caps? + utime_t el = now - last_cap_renew; + if (el > mdsmap->get_session_timeout() / 3.0) + renew_caps(); + + flush_cap_releases(); + } + + // delayed caps + xlist::iterator p = delayed_caps.begin(); + while (!p.end()) { + Inode *in = *p; + ++p; + if (in->hold_caps_until > now) + break; + delayed_caps.pop_front(); + cap_list.push_back(&in->cap_item); + check_caps(in, CHECK_CAPS_NODELAY); + } + + trim_cache(true); +} + +void Client::renew_caps() +{ + ldout(cct, 10) << "renew_caps()" << dendl; + last_cap_renew = ceph_clock_now(); + + for (map::iterator p = mds_sessions.begin(); + p != mds_sessions.end(); + ++p) { + ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl; + if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN) + renew_caps(p->second); + } +} + +void Client::renew_caps(MetaSession *session) +{ + ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl; + session->last_cap_renew_request = ceph_clock_now(); + uint64_t seq = ++session->cap_renew_seq; + session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq)); +} + + +// =============================================================== +// high level (POSIXy) interface + +int Client::_do_lookup(Inode *dir, const string& name, int mask, + InodeRef *target, const UserPerm& perms) +{ + int op = dir->snapid == CEPH_SNAPDIR ? 
CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; + MetaRequest *req = new MetaRequest(op); + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + req->set_inode(dir); + if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP) + mask |= DEBUG_GETATTR_CAPS; + req->head.args.getattr.mask = mask; + + ldout(cct, 10) << "_do_lookup on " << path << dendl; + + int r = make_request(req, perms, target); + ldout(cct, 10) << "_do_lookup res is " << r << dendl; + return r; +} + +int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target, + const UserPerm& perms) +{ + int r = 0; + Dentry *dn = NULL; + + if (!dir->is_dir()) { + r = -ENOTDIR; + goto done; + } + + if (dname == "..") { + if (dir->dn_set.empty()) + *target = dir; + else + *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked + goto done; + } + + if (dname == ".") { + *target = dir; + goto done; + } + + if (dname.length() > NAME_MAX) { + r = -ENAMETOOLONG; + goto done; + } + + if (dname == cct->_conf->client_snapdir && + dir->snapid == CEPH_NOSNAP) { + *target = open_snapdir(dir); + goto done; + } + + if (dir->dir && + dir->dir->dentries.count(dname)) { + dn = dir->dir->dentries[dname]; + + ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl + << " seq " << dn->lease_seq + << dendl; + + if (!dn->inode || dn->inode->caps_issued_mask(mask)) { + // is dn lease valid? + utime_t now = ceph_clock_now(); + if (dn->lease_mds >= 0 && + dn->lease_ttl > now && + mds_sessions.count(dn->lease_mds)) { + MetaSession *s = mds_sessions[dn->lease_mds]; + if (s->cap_ttl > now && + s->cap_gen == dn->lease_gen) { + // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to + // make trim_caps() behave. + dir->try_touch_cap(dn->lease_mds); + goto hit_dn; + } + ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen + << " vs lease_gen " << dn->lease_gen << dendl; + } + // dir lease? + if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED)) { + if (dn->cap_shared_gen == dir->shared_gen && + (!dn->inode || dn->inode->caps_issued_mask(mask))) + goto hit_dn; + if (!dn->inode && (dir->flags & I_COMPLETE)) { + ldout(cct, 10) << "_lookup concluded ENOENT locally for " + << *dir << " dn '" << dname << "'" << dendl; + return -ENOENT; + } + } + } else { + ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl; + } + } else { + // can we conclude ENOENT locally? + if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED) && + (dir->flags & I_COMPLETE)) { + ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl; + return -ENOENT; + } + } + + r = _do_lookup(dir, dname, mask, target, perms); + goto done; + + hit_dn: + if (dn->inode) { + *target = dn->inode; + } else { + r = -ENOENT; + } + touch_dn(dn); + + done: + if (r < 0) + ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl; + else + ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl; + return r; +} + +int Client::get_or_create(Inode *dir, const char* name, + Dentry **pdn, bool expect_null) +{ + // lookup + ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl; + dir->open_dir(); + if (dir->dir->dentries.count(name)) { + Dentry *dn = dir->dir->dentries[name]; + + // is dn lease valid? 
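+    // (usable means: the lease's ttl has not passed and the issuing MDS
+    // session is still live with an unchanged cap generation -- exactly
+    // the checks that follow)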
+ utime_t now = ceph_clock_now(); + if (dn->inode && + dn->lease_mds >= 0 && + dn->lease_ttl > now && + mds_sessions.count(dn->lease_mds)) { + MetaSession *s = mds_sessions[dn->lease_mds]; + if (s->cap_ttl > now && + s->cap_gen == dn->lease_gen) { + if (expect_null) + return -EEXIST; + } + } + *pdn = dn; + } else { + // otherwise link up a new one + *pdn = link(dir->dir, name, NULL, NULL); + } + + // success + return 0; +} + +int Client::path_walk(const filepath& origpath, InodeRef *end, + const UserPerm& perms, bool followsym, int mask) +{ + filepath path = origpath; + InodeRef cur; + if (origpath.absolute()) + cur = root; + else + cur = cwd; + assert(cur); + + ldout(cct, 10) << "path_walk " << path << dendl; + + int symlinks = 0; + + unsigned i=0; + while (i < path.depth() && cur) { + int caps = 0; + const string &dname = path[i]; + ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl; + ldout(cct, 20) << " (path is " << path << ")" << dendl; + InodeRef next; + if (cct->_conf->client_permissions) { + int r = may_lookup(cur.get(), perms); + if (r < 0) + return r; + caps = CEPH_CAP_AUTH_SHARED; + } + + /* Get extra requested caps on the last component */ + if (i == (path.depth() - 1)) + caps |= mask; + int r = _lookup(cur.get(), dname, caps, &next, perms); + if (r < 0) + return r; + // only follow trailing symlink if followsym. always follow + // 'directory' symlinks. + if (next && next->is_symlink()) { + symlinks++; + ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl; + if (symlinks > MAXSYMLINKS) { + return -ELOOP; + } + + if (i < path.depth() - 1) { + // dir symlink + // replace consumed components of path with symlink dir target + filepath resolved(next->symlink.c_str()); + resolved.append(path.postfixpath(i + 1)); + path = resolved; + i = 0; + if (next->symlink[0] == '/') { + cur = root; + } + continue; + } else if (followsym) { + if (next->symlink[0] == '/') { + path = next->symlink.c_str(); + i = 0; + // reset position + cur = root; + } else { + filepath more(next->symlink.c_str()); + // we need to remove the symlink component from off of the path + // before adding the target that the symlink points to. remain + // at the same position in the path. 
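+        // (Hypothetical example: walking "a/b" where "b" is a symlink to
+        // "c/d" pops "b" and appends "c/d", so the path becomes "a/c/d" and
+        // the loop resumes at the same depth, now looking at "c".)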
+ path.pop_dentry(); + path.append(more); + } + continue; + } + } + cur.swap(next); + i++; + } + if (!cur) + return -ENOENT; + if (end) + end->swap(cur); + return 0; +} + + +// namespace ops + +int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "link" << std::endl; + tout(cct) << relexisting << std::endl; + tout(cct) << relpath << std::endl; + + if (unmounting) + return -ENOTCONN; + + filepath existing(relexisting); + + InodeRef in, dir; + int r = path_walk(existing, &in, perm, true); + if (r < 0) + return r; + if (std::string(relpath) == "/") { + r = -EEXIST; + return r; + } + filepath path(relpath); + string name = path.last_dentry(); + path.pop_dentry(); + + r = path_walk(path, &dir, perm, true); + if (r < 0) + return r; + if (cct->_conf->client_permissions) { + if (S_ISDIR(in->mode)) { + r = -EPERM; + return r; + } + r = may_hardlink(in.get(), perm); + if (r < 0) + return r; + r = may_create(dir.get(), perm); + if (r < 0) + return r; + } + r = _link(in.get(), dir.get(), name.c_str(), perm); + return r; +} + +int Client::unlink(const char *relpath, const UserPerm& perm) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "unlink" << std::endl; + tout(cct) << relpath << std::endl; + + if (unmounting) + return -ENOTCONN; + + if (std::string(relpath) == "/") + return -EISDIR; + + filepath path(relpath); + string name = path.last_dentry(); + path.pop_dentry(); + InodeRef dir; + int r = path_walk(path, &dir, perm); + if (r < 0) + return r; + if (cct->_conf->client_permissions) { + r = may_delete(dir.get(), name.c_str(), perm); + if (r < 0) + return r; + } + return _unlink(dir.get(), name.c_str(), perm); +} + +int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "rename" << std::endl; + tout(cct) << relfrom << std::endl; + tout(cct) << relto << std::endl; + + if (unmounting) + return -ENOTCONN; + + if (std::string(relfrom) == "/" || std::string(relto) == "/") + return -EBUSY; + + filepath from(relfrom); + filepath to(relto); + string fromname = from.last_dentry(); + from.pop_dentry(); + string toname = to.last_dentry(); + to.pop_dentry(); + + InodeRef fromdir, todir; + int r = path_walk(from, &fromdir, perm); + if (r < 0) + goto out; + r = path_walk(to, &todir, perm); + if (r < 0) + goto out; + + if (cct->_conf->client_permissions) { + int r = may_delete(fromdir.get(), fromname.c_str(), perm); + if (r < 0) + return r; + r = may_delete(todir.get(), toname.c_str(), perm); + if (r < 0 && r != -ENOENT) + return r; + } + r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm); +out: + return r; +} + +// dirs + +int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "mkdir" << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << mode << std::endl; + ldout(cct, 10) << "mkdir: " << relpath << dendl; + + if (unmounting) + return -ENOTCONN; + + if (std::string(relpath) == "/") + return -EEXIST; + + filepath path(relpath); + string name = path.last_dentry(); + path.pop_dentry(); + InodeRef dir; + int r = path_walk(path, &dir, perm); + if (r < 0) + return r; + if (cct->_conf->client_permissions) { + r = may_create(dir.get(), perm); + if (r < 0) + return r; + } + return _mkdir(dir.get(), name.c_str(), mode, perm); +} + +int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + 
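// mkdirs() behaves like "mkdir -p": walk the existing prefix of the path,
+  // then create each remaining component in turn (see the two loops below).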
+  ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
+  tout(cct) << "mkdirs" << std::endl;
+  tout(cct) << relpath << std::endl;
+  tout(cct) << mode << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  //get through existing parts of path
+  filepath path(relpath);
+  unsigned int i;
+  int r = 0, caps = 0;
+  InodeRef cur, next;
+  cur = cwd;
+  for (i=0; i<path.depth(); ++i) {
+    if (cct->_conf->client_permissions) {
+      r = may_lookup(cur.get(), perms);
+      if (r < 0)
+	break;
+      caps = CEPH_CAP_AUTH_SHARED;
+    }
+    r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
+    if (r < 0)
+      break;
+    cur.swap(next);
+  }
+  //check that we have work left to do
+  if (i==path.depth()) return -EEXIST;
+  if (r!=-ENOENT) return r;
+  ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
+  //make new directory at each level
+  for (; i<path.depth(); ++i) {
+    if (cct->_conf->client_permissions) {
+      r = may_create(cur.get(), perms);
+      if (r < 0)
+	return r;
+    }
+    //make new dir
+    r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
+
+    //check proper creation/existence
+    if(-EEXIST == r && i < path.depth() - 1) {
+      r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
+    }
+    if (r < 0)
+      return r;
+    //move to new dir and continue
+    cur.swap(next);
+    ldout(cct, 20) << "mkdirs: successfully created directory "
+		   << filepath(cur->ino).get_path() << dendl;
+  }
+  return 0;
+}
+
+int Client::rmdir(const char *relpath, const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "rmdir" << std::endl;
+  tout(cct) << relpath << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  if (std::string(relpath) == "/")
+    return -EBUSY;
+
+  filepath path(relpath);
+  string name = path.last_dentry();
+  path.pop_dentry();
+  InodeRef dir;
+  int r = path_walk(path, &dir, perms);
+  if (r < 0)
+    return r;
+  if (cct->_conf->client_permissions) {
+    int r = may_delete(dir.get(), name.c_str(), perms);
+    if (r < 0)
+      return r;
+  }
+  return _rmdir(dir.get(), name.c_str(), perms);
+}
+
+int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "mknod" << std::endl;
+  tout(cct) << relpath << std::endl;
+  tout(cct) << mode << std::endl;
+  tout(cct) << rdev << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  if (std::string(relpath) == "/")
+    return -EEXIST;
+
+  filepath path(relpath);
+  string name = path.last_dentry();
+  path.pop_dentry();
+  InodeRef dir;
+  int r = path_walk(path, &dir, perms);
+  if (r < 0)
+    return r;
+  if (cct->_conf->client_permissions) {
+    int r = may_create(dir.get(), perms);
+    if (r < 0)
+      return r;
+  }
+  return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
+}
+
+// symlinks
+
+int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "symlink" << std::endl;
+  tout(cct) << target << std::endl;
+  tout(cct) << relpath << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  if (std::string(relpath) == "/")
+    return -EEXIST;
+
+  filepath path(relpath);
+  string name = path.last_dentry();
+  path.pop_dentry();
+  InodeRef dir;
+  int r = path_walk(path, &dir, perms);
+  if (r < 0)
+    return r;
+  if (cct->_conf->client_permissions) {
+    int r = may_create(dir.get(), perms);
+    if (r < 0)
+      return r;
+  }
+  return _symlink(dir.get(), name.c_str(), target, perms);
+}
+
+int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "readlink" << std::endl;
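+  // as with readlink(2), the target is copied into buf without a
+  // terminating NUL and truncated to at most `size` bytes; the return
+  // value is the byte count (see _readlink below).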
tout(cct) << relpath << std::endl; + + if (unmounting) + return -ENOTCONN; + + filepath path(relpath); + InodeRef in; + int r = path_walk(path, &in, perms, false); + if (r < 0) + return r; + + return _readlink(in.get(), buf, size); +} + +int Client::_readlink(Inode *in, char *buf, size_t size) +{ + if (!in->is_symlink()) + return -EINVAL; + + // copy into buf (at most size bytes) + int r = in->symlink.length(); + if (r > (int)size) + r = size; + memcpy(buf, in->symlink.c_str(), r); + return r; +} + + +// inode stuff + +int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force) +{ + bool yes = in->caps_issued_mask(mask); + + ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl; + if (yes && !force) + return 0; + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR); + filepath path; + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(in); + req->head.args.getattr.mask = mask; + + int res = make_request(req, perms); + ldout(cct, 10) << "_getattr result=" << res << dendl; + return res; +} + +int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms, InodeRef *inp) +{ + int issued = in->caps_issued(); + + ldout(cct, 10) << "_setattr mask " << mask << " issued " << + ccap_string(issued) << dendl; + + if (in->snapid != CEPH_NOSNAP) { + return -EROFS; + } + if ((mask & CEPH_SETATTR_SIZE) && + (unsigned long)stx->stx_size > in->size && + is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size, + perms)) { + return -EDQUOT; + } + + // make the change locally? + if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) || + (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) { + ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid() + << " != cap dirtier " << in->cap_dirtier_uid << ":" + << in->cap_dirtier_gid << ", forcing sync setattr" + << dendl; + /* + * This works because we implicitly flush the caps as part of the + * request, so the cap update check will happen with the writeback + * cap context, and then the setattr check will happen with the + * caller's context. + * + * In reality this pattern is likely pretty rare (different users + * setattr'ing the same file). If that turns out not to be the + * case later, we can build a more complex pipelined cap writeback + * infrastructure... 
+ */ + if (!mask) + mask |= CEPH_SETATTR_CTIME; + goto force_request; + } + + if (!mask) { + // caller just needs us to bump the ctime + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + if (issued & CEPH_CAP_AUTH_EXCL) + mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL); + else if (issued & CEPH_CAP_FILE_EXCL) + mark_caps_dirty(in, CEPH_CAP_FILE_EXCL); + else if (issued & CEPH_CAP_XATTR_EXCL) + mark_caps_dirty(in, CEPH_CAP_XATTR_EXCL); + else + mask |= CEPH_SETATTR_CTIME; + } + + if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) { + bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID); + + mask &= ~CEPH_SETATTR_KILL_SGUID; + + if (mask & CEPH_SETATTR_UID) { + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->uid = stx->stx_uid; + mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL); + mask &= ~CEPH_SETATTR_UID; + kill_sguid = true; + ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl; + } + if (mask & CEPH_SETATTR_GID) { + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->gid = stx->stx_gid; + mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL); + mask &= ~CEPH_SETATTR_GID; + kill_sguid = true; + ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl; + } + + if (mask & CEPH_SETATTR_MODE) { + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777); + mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL); + mask &= ~CEPH_SETATTR_MODE; + ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl; + } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) { + /* Must squash the any setuid/setgid bits with an ownership change */ + in->mode &= ~(S_ISUID|S_ISGID); + mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL); + } + + if (mask & CEPH_SETATTR_BTIME) { + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->btime = utime_t(stx->stx_btime); + mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL); + mask &= ~CEPH_SETATTR_BTIME; + ldout(cct,10) << "changing btime to " << in->btime << dendl; + } + } else if (mask & CEPH_SETATTR_SIZE) { + /* If we don't have Ax, then we must ask the server to clear them on truncate */ + mask |= CEPH_SETATTR_KILL_SGUID; + } + + if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) { + if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) { + if (mask & CEPH_SETATTR_MTIME) + in->mtime = utime_t(stx->stx_mtime); + if (mask & CEPH_SETATTR_ATIME) + in->atime = utime_t(stx->stx_atime); + in->ctime = ceph_clock_now(); + in->cap_dirtier_uid = perms.uid(); + in->cap_dirtier_gid = perms.gid(); + in->time_warp_seq++; + mark_caps_dirty(in, CEPH_CAP_FILE_EXCL); + mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME); + } + } + if (!mask) { + in->change_attr++; + return 0; + } + +force_request: + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR); + + filepath path; + + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(in); + + if (mask & CEPH_SETATTR_KILL_SGUID) { + req->inode_drop |= CEPH_CAP_AUTH_SHARED; + } + if (mask & CEPH_SETATTR_MODE) { + req->head.args.setattr.mode = stx->stx_mode; + req->inode_drop |= CEPH_CAP_AUTH_SHARED; + ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl; + } + if (mask & CEPH_SETATTR_UID) { + req->head.args.setattr.uid = stx->stx_uid; + req->inode_drop |= CEPH_CAP_AUTH_SHARED; + 
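+    // inode_drop lists caps we release as part of the request itself,
+    // saving the MDS a separate revoke round-trip for attributes this
+    // setattr is about to change (same pattern for the fields below).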
+    ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
+  }
+  if (mask & CEPH_SETATTR_GID) {
+    req->head.args.setattr.gid = stx->stx_gid;
+    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
+    ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
+  }
+  if (mask & CEPH_SETATTR_BTIME) {
+    req->head.args.setattr.btime = utime_t(stx->stx_btime);
+    req->inode_drop |= CEPH_CAP_AUTH_SHARED;
+  }
+  if (mask & CEPH_SETATTR_MTIME) {
+    req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
+    req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
+      CEPH_CAP_FILE_WR;
+  }
+  if (mask & CEPH_SETATTR_ATIME) {
+    req->head.args.setattr.atime = utime_t(stx->stx_atime);
+    req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
+      CEPH_CAP_FILE_WR;
+  }
+  if (mask & CEPH_SETATTR_SIZE) {
+    if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
+      req->head.args.setattr.size = stx->stx_size;
+      ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
+    } else { //too big!
+      put_request(req);
+      ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
+      return -EFBIG;
+    }
+    req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
+      CEPH_CAP_FILE_WR;
+  }
+  req->head.args.setattr.mask = mask;
+
+  req->regetattr_mask = mask;
+
+  int res = make_request(req, perms, inp);
+  ldout(cct, 10) << "_setattr result=" << res << dendl;
+  return res;
+}
+
+/* Note that we only care about attrs that setattr cares about */
+void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
+{
+  stx->stx_size = st->st_size;
+  stx->stx_mode = st->st_mode;
+  stx->stx_uid = st->st_uid;
+  stx->stx_gid = st->st_gid;
+  stx->stx_mtime = st->st_mtim;
+  stx->stx_atime = st->st_atim;
+}
+
+int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
+		       const UserPerm& perms, InodeRef *inp)
+{
+  int ret = _do_setattr(in, stx, mask, perms, inp);
+  if (ret < 0)
+    return ret;
+  if (mask & CEPH_SETATTR_MODE)
+    ret = _posix_acl_chmod(in, stx->stx_mode, perms);
+  return ret;
+}
+
+int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
+		      const UserPerm& perms)
+{
+  mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
+	   CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
+	   CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
+	   CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
+  if (cct->_conf->client_permissions) {
+    int r = may_setattr(in.get(), stx, mask, perms);
+    if (r < 0)
+      return r;
+  }
+  return __setattrx(in.get(), stx, mask, perms);
+}
+
+int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
+		     const UserPerm& perms)
+{
+  struct ceph_statx stx;
+
+  stat_to_statx(attr, &stx);
+  mask &= ~CEPH_SETATTR_BTIME;
+
+  if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
+    mask &= ~CEPH_SETATTR_UID;
+  }
+  if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<gid_t>(-1)) {
+    mask &= ~CEPH_SETATTR_GID;
+  }
+
+  return _setattrx(in, &stx, mask, perms);
+}
+
+int Client::setattr(const char *relpath, struct stat *attr, int mask,
+		    const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "setattr" << std::endl;
+  tout(cct) << relpath << std::endl;
+  tout(cct) << mask << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  filepath path(relpath);
+  InodeRef in;
+  int r = path_walk(path, &in, perms);
+  if (r < 0)
+    return r;
+  return _setattr(in, attr, mask, perms);
+}
+
+int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
+		     const UserPerm& perms, int flags)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "setattrx" << std::endl;
+  tout(cct) <<
relpath << std::endl; + tout(cct) << mask << std::endl; + + if (unmounting) + return -ENOTCONN; + + filepath path(relpath); + InodeRef in; + int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW)); + if (r < 0) + return r; + return _setattrx(in, stx, mask, perms); +} + +int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "fsetattr" << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << mask << std::endl; + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif + return _setattr(f->inode, attr, mask, perms); +} + +int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "fsetattr" << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << mask << std::endl; + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif + return _setattrx(f->inode, stx, mask, perms); +} + +int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms, + frag_info_t *dirstat, int mask) +{ + ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl; + Mutex::Locker lock(client_lock); + tout(cct) << "stat" << std::endl; + tout(cct) << relpath << std::endl; + + if (unmounting) + return -ENOTCONN; + + filepath path(relpath); + InodeRef in; + int r = path_walk(path, &in, perms, true, mask); + if (r < 0) + return r; + r = _getattr(in, mask, perms); + if (r < 0) { + ldout(cct, 3) << "stat exit on error!" << dendl; + return r; + } + fill_stat(in, stbuf, dirstat); + ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl; + return r; +} + +unsigned Client::statx_to_mask(unsigned int flags, unsigned int want) +{ + unsigned mask = 0; + + /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */ + if (flags & AT_NO_ATTR_SYNC) + goto out; + + /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */ + mask |= CEPH_CAP_PIN; + if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION)) + mask |= CEPH_CAP_AUTH_SHARED; + if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION)) + mask |= CEPH_CAP_LINK_SHARED; + if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION)) + mask |= CEPH_CAP_FILE_SHARED; + if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME)) + mask |= CEPH_CAP_XATTR_SHARED; +out: + return mask; +} + +int Client::statx(const char *relpath, struct ceph_statx *stx, + const UserPerm& perms, + unsigned int want, unsigned int flags) +{ + ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl; + Mutex::Locker lock(client_lock); + tout(cct) << "statx" << std::endl; + tout(cct) << relpath << std::endl; + + if (unmounting) + return -ENOTCONN; + + filepath path(relpath); + InodeRef in; + + unsigned mask = statx_to_mask(flags, want); + + int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask); + if (r < 0) + return r; + + r = _getattr(in, mask, perms); + if (r < 0) { + ldout(cct, 3) << "statx exit on error!" 
<< dendl; + return r; + } + + fill_statx(in, mask, stx); + ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl; + return r; +} + +int Client::lstat(const char *relpath, struct stat *stbuf, + const UserPerm& perms, frag_info_t *dirstat, int mask) +{ + ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl; + Mutex::Locker lock(client_lock); + tout(cct) << "lstat" << std::endl; + tout(cct) << relpath << std::endl; + + if (unmounting) + return -ENOTCONN; + + filepath path(relpath); + InodeRef in; + // don't follow symlinks + int r = path_walk(path, &in, perms, false, mask); + if (r < 0) + return r; + r = _getattr(in, mask, perms); + if (r < 0) { + ldout(cct, 3) << "lstat exit on error!" << dendl; + return r; + } + fill_stat(in, stbuf, dirstat); + ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl; + return r; +} + +int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat) +{ + ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid + << " mode 0" << oct << in->mode << dec + << " mtime " << in->mtime << " ctime " << in->ctime << dendl; + memset(st, 0, sizeof(struct stat)); + if (use_faked_inos()) + st->st_ino = in->faked_ino; + else + st->st_ino = in->ino; + st->st_dev = in->snapid; + st->st_mode = in->mode; + st->st_rdev = in->rdev; + st->st_nlink = in->nlink; + st->st_uid = in->uid; + st->st_gid = in->gid; + if (in->ctime > in->mtime) { + stat_set_ctime_sec(st, in->ctime.sec()); + stat_set_ctime_nsec(st, in->ctime.nsec()); + } else { + stat_set_ctime_sec(st, in->mtime.sec()); + stat_set_ctime_nsec(st, in->mtime.nsec()); + } + stat_set_atime_sec(st, in->atime.sec()); + stat_set_atime_nsec(st, in->atime.nsec()); + stat_set_mtime_sec(st, in->mtime.sec()); + stat_set_mtime_nsec(st, in->mtime.nsec()); + if (in->is_dir()) { + if (cct->_conf->client_dirsize_rbytes) + st->st_size = in->rstat.rbytes; + else + st->st_size = in->dirstat.size(); + st->st_blocks = 1; + } else { + st->st_size = in->size; + st->st_blocks = (in->size + 511) >> 9; + } + st->st_blksize = MAX(in->layout.stripe_unit, 4096); + + if (dirstat) + *dirstat = in->dirstat; + if (rstat) + *rstat = in->rstat; + + return in->caps_issued(); +} + +void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx) +{ + ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid + << " mode 0" << oct << in->mode << dec + << " mtime " << in->mtime << " ctime " << in->ctime << dendl; + memset(stx, 0, sizeof(struct ceph_statx)); + + /* + * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask + * so that all bits are set. + */ + if (!mask) + mask = ~0; + + /* These are always considered to be available */ + stx->stx_dev = in->snapid; + stx->stx_blksize = MAX(in->layout.stripe_unit, 4096); + + /* Type bits are always set, even when CEPH_STATX_MODE is not */ + stx->stx_mode = S_IFMT & in->mode; + stx->stx_ino = use_faked_inos() ? 
in->faked_ino : (ino_t)in->ino; + stx->stx_rdev = in->rdev; + stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV); + + if (mask & CEPH_CAP_AUTH_SHARED) { + stx->stx_uid = in->uid; + stx->stx_gid = in->gid; + stx->stx_mode = in->mode; + in->btime.to_timespec(&stx->stx_btime); + stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME); + } + + if (mask & CEPH_CAP_LINK_SHARED) { + stx->stx_nlink = in->nlink; + stx->stx_mask |= CEPH_STATX_NLINK; + } + + if (mask & CEPH_CAP_FILE_SHARED) { + + in->atime.to_timespec(&stx->stx_atime); + in->mtime.to_timespec(&stx->stx_mtime); + + if (in->is_dir()) { + if (cct->_conf->client_dirsize_rbytes) + stx->stx_size = in->rstat.rbytes; + else + stx->stx_size = in->dirstat.size(); + stx->stx_blocks = 1; + } else { + stx->stx_size = in->size; + stx->stx_blocks = (in->size + 511) >> 9; + } + stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME| + CEPH_STATX_SIZE|CEPH_STATX_BLOCKS); + } + + /* Change time and change_attr both require all shared caps to view */ + if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) { + stx->stx_version = in->change_attr; + if (in->ctime > in->mtime) + in->ctime.to_timespec(&stx->stx_ctime); + else + in->mtime.to_timespec(&stx->stx_ctime); + stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION); + } + +} + +void Client::touch_dn(Dentry *dn) +{ + lru.lru_touch(dn); +} + +int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "chmod" << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << mode << std::endl; + + if (unmounting) + return -ENOTCONN; + + filepath path(relpath); + InodeRef in; + int r = path_walk(path, &in, perms); + if (r < 0) + return r; + struct stat attr; + attr.st_mode = mode; + return _setattr(in, &attr, CEPH_SETATTR_MODE, perms); +} + +int Client::fchmod(int fd, mode_t mode, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "fchmod" << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << mode << std::endl; + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif + struct stat attr; + attr.st_mode = mode; + return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms); +} + +int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "lchmod" << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << mode << std::endl; + + if (unmounting) + return -ENOTCONN; + + filepath path(relpath); + InodeRef in; + // don't follow symlinks + int r = path_walk(path, &in, perms, false); + if (r < 0) + return r; + struct stat attr; + attr.st_mode = mode; + return _setattr(in, &attr, CEPH_SETATTR_MODE, perms); +} + +int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "chown" << std::endl; + tout(cct) << relpath << std::endl; + tout(cct) << new_uid << std::endl; + tout(cct) << new_gid << std::endl; + + if (unmounting) + return -ENOTCONN; + + filepath path(relpath); + InodeRef in; + int r = path_walk(path, &in, perms); + if (r < 0) + return r; + struct stat attr; + attr.st_uid = new_uid; + attr.st_gid = new_gid; + return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms); +} + +int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms) +{ + Mutex::Locker 
lock(client_lock);
+  tout(cct) << "fchown" << std::endl;
+  tout(cct) << fd << std::endl;
+  tout(cct) << new_uid << std::endl;
+  tout(cct) << new_gid << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  Fh *f = get_filehandle(fd);
+  if (!f)
+    return -EBADF;
+#if defined(__linux__) && defined(O_PATH)
+  if (f->flags & O_PATH)
+    return -EBADF;
+#endif
+  struct stat attr;
+  attr.st_uid = new_uid;
+  attr.st_gid = new_gid;
+  int mask = 0;
+  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
+  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
+  return _setattr(f->inode, &attr, mask, perms);
+}
+
+int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
+		   const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "lchown" << std::endl;
+  tout(cct) << relpath << std::endl;
+  tout(cct) << new_uid << std::endl;
+  tout(cct) << new_gid << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  filepath path(relpath);
+  InodeRef in;
+  // don't follow symlinks
+  int r = path_walk(path, &in, perms, false);
+  if (r < 0)
+    return r;
+  struct stat attr;
+  attr.st_uid = new_uid;
+  attr.st_gid = new_gid;
+  int mask = 0;
+  if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
+  if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
+  return _setattr(in, &attr, mask, perms);
+}
+
+int Client::utime(const char *relpath, struct utimbuf *buf,
+		  const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "utime" << std::endl;
+  tout(cct) << relpath << std::endl;
+  tout(cct) << buf->modtime << std::endl;
+  tout(cct) << buf->actime << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  filepath path(relpath);
+  InodeRef in;
+  int r = path_walk(path, &in, perms);
+  if (r < 0)
+    return r;
+  struct stat attr;
+  stat_set_mtime_sec(&attr, buf->modtime);
+  stat_set_mtime_nsec(&attr, 0);
+  stat_set_atime_sec(&attr, buf->actime);
+  stat_set_atime_nsec(&attr, 0);
+  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
+}
+
+int Client::lutime(const char *relpath, struct utimbuf *buf,
+		   const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "lutime" << std::endl;
+  tout(cct) << relpath << std::endl;
+  tout(cct) << buf->modtime << std::endl;
+  tout(cct) << buf->actime << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  filepath path(relpath);
+  InodeRef in;
+  // don't follow symlinks
+  int r = path_walk(path, &in, perms, false);
+  if (r < 0)
+    return r;
+  struct stat attr;
+  stat_set_mtime_sec(&attr, buf->modtime);
+  stat_set_mtime_nsec(&attr, 0);
+  stat_set_atime_sec(&attr, buf->actime);
+  stat_set_atime_nsec(&attr, 0);
+  return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
+}
+
+int Client::flock(int fd, int operation, uint64_t owner)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "flock" << std::endl;
+  tout(cct) << fd << std::endl;
+  tout(cct) << operation << std::endl;
+  tout(cct) << owner << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  Fh *f = get_filehandle(fd);
+  if (!f)
+    return -EBADF;
+
+  return _flock(f, operation, owner);
+}
+
+int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "opendir" << std::endl;
+  tout(cct) << relpath << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  filepath path(relpath);
+  InodeRef in;
+  int r = path_walk(path, &in, perms, true);
+  if (r < 0)
+    return r;
+  if (cct->_conf->client_permissions) {
+    int r = may_open(in.get(), O_RDONLY, perms);
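+    // (permission gate: with client_permissions set we require read
+    //  access on the directory itself before handing out a dir_result_t)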
+    if (r < 0)
+      return r;
+  }
+  r = _opendir(in.get(), dirpp, perms);
+  /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
+  if (r != -ENOTDIR)
+      tout(cct) << (unsigned long)*dirpp << std::endl;
+  return r;
+}
+
+int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
+{
+  if (!in->is_dir())
+    return -ENOTDIR;
+  *dirpp = new dir_result_t(in, perms);
+  opened_dirs.insert(*dirpp);
+  ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
+  return 0;
+}
+
+
+int Client::closedir(dir_result_t *dir)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "closedir" << std::endl;
+  tout(cct) << (unsigned long)dir << std::endl;
+
+  ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
+  _closedir(dir);
+  return 0;
+}
+
+void Client::_closedir(dir_result_t *dirp)
+{
+  ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
+  if (dirp->inode) {
+    ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
+    dirp->inode.reset();
+  }
+  _readdir_drop_dirp_buffer(dirp);
+  opened_dirs.erase(dirp);
+  delete dirp;
+}
+
+void Client::rewinddir(dir_result_t *dirp)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
+
+  if (unmounting)
+    return;
+
+  dir_result_t *d = static_cast<dir_result_t*>(dirp);
+  _readdir_drop_dirp_buffer(d);
+  d->reset();
+}
+
+loff_t Client::telldir(dir_result_t *dirp)
+{
+  dir_result_t *d = static_cast<dir_result_t*>(dirp);
+  ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
+  return d->offset;
+}
+
+void Client::seekdir(dir_result_t *dirp, loff_t offset)
+{
+  Mutex::Locker lock(client_lock);
+
+  ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;
+
+  if (unmounting)
+    return;
+
+  if (offset == dirp->offset)
+    return;
+
+  if (offset > dirp->offset)
+    dirp->release_count = 0;   // bump if we do a forward seek
+  else
+    dirp->ordered_count = 0;   // disable filling readdir cache
+
+  if (dirp->hash_order()) {
+    if (dirp->offset > offset) {
+      _readdir_drop_dirp_buffer(dirp);
+      dirp->reset();
+    }
+  } else {
+    if (offset == 0 ||
+	dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
+	dirp->offset_low() > dir_result_t::fpos_low(offset)) {
+      _readdir_drop_dirp_buffer(dirp);
+      dirp->reset();
+    }
+  }
+
+  dirp->offset = offset;
+}
+
+
+//struct dirent {
+//  ino_t          d_ino;       /* inode number */
+//  off_t          d_off;       /* offset to the next dirent */
+//  unsigned short d_reclen;    /* length of this record */
+//  unsigned char  d_type;      /* type of file */
+//  char           d_name[256]; /* filename */
+//};
+void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
+{
+  strncpy(de->d_name, name, 255);
+  de->d_name[255] = '\0';
+#ifndef __CYGWIN__
+  de->d_ino = ino;
+#if !defined(DARWIN) && !defined(__FreeBSD__)
+  de->d_off = next_off;
+#endif
+  de->d_reclen = 1;
+  de->d_type = IFTODT(type);
+  ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
+		 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
+#endif
+}
+
+void Client::_readdir_next_frag(dir_result_t *dirp)
+{
+  frag_t fg = dirp->buffer_frag;
+
+  if (fg.is_rightmost()) {
+    ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
+    dirp->set_end();
+    return;
+  }
+
+  // advance
+  fg = fg.next();
+  ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;
+
+  if (dirp->hash_order()) {
+    // keep last_name
+    int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
+    if (dirp->offset < new_offset) // don't decrease offset
+      dirp->offset = new_offset;
+  } else {
+    dirp->last_name.clear();
+    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
+    _readdir_rechoose_frag(dirp);
+  }
+}
+
+void Client::_readdir_rechoose_frag(dir_result_t *dirp)
+{
+  assert(dirp->inode);
+
+  if (dirp->hash_order())
+    return;
+
+  frag_t cur = frag_t(dirp->offset_high());
+  frag_t fg = dirp->inode->dirfragtree[cur.value()];
+  if (fg != cur) {
+    ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
+    dirp->offset = dir_result_t::make_fpos(fg, 2, false);
+    dirp->last_name.clear();
+    dirp->next_offset = 2;
+  }
+}
+
+void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
+{
+  ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
+  dirp->buffer.clear();
+}
+
+int Client::_readdir_get_frag(dir_result_t *dirp)
+{
+  assert(dirp);
+  assert(dirp->inode);
+
+  // get the current frag.
+  frag_t fg;
+  if (dirp->hash_order())
+    fg = dirp->inode->dirfragtree[dirp->offset_high()];
+  else
+    fg = frag_t(dirp->offset_high());
+
+  ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
+		 << " offset " << hex << dirp->offset << dec << dendl;
+
+  int op = CEPH_MDS_OP_READDIR;
+  if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
+    op = CEPH_MDS_OP_LSSNAP;
+
+  InodeRef& diri = dirp->inode;
+
+  MetaRequest *req = new MetaRequest(op);
+  filepath path;
+  diri->make_nosnap_relative_path(path);
+  req->set_filepath(path);
+  req->set_inode(diri.get());
+  req->head.args.readdir.frag = fg;
+  req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
+  if (dirp->last_name.length()) {
+    req->path2.set_path(dirp->last_name.c_str());
+  } else if (dirp->hash_order()) {
+    req->head.args.readdir.offset_hash = dirp->offset_high();
+  }
+  req->dirp = dirp;
+
+  bufferlist dirbl;
+  int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
+
+  if (res == -EAGAIN) {
+    ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
+    _readdir_rechoose_frag(dirp);
+    return _readdir_get_frag(dirp);
+  }
+
+  if (res == 0) {
+    ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
+		   << " size " << dirp->buffer.size() << dendl;
+  } else {
+    ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
+    dirp->set_end();
+  }
+
+  return res;
+}
+
+struct dentry_off_lt {
+  bool operator()(const Dentry* dn, int64_t off) const {
+    return dir_result_t::fpos_cmp(dn->offset, off) < 0;
+  }
+};
+
+int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
+			      int caps, bool getref)
+{
+  assert(client_lock.is_locked());
+  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
+	   << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
+	   << dendl;
+  Dir *dir = dirp->inode->dir;
+
+  if (!dir) {
+    ldout(cct, 10) << " dir is empty" << dendl;
+    dirp->set_end();
+    return 0;
+  }
+
+  vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
+						  dir->readdir_cache.end(),
+						  dirp->offset, dentry_off_lt());
+
+  string dn_name;
+  while (true) {
+    if (!dirp->inode->is_complete_and_ordered())
+      return -EAGAIN;
+    if (pd == dir->readdir_cache.end())
+      break;
+    Dentry *dn = *pd;
+    if (dn->inode == NULL) {
+      ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
+      ++pd;
+      continue;
+    }
+    if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
+      ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
+      ++pd;
+      continue;
+    }
+
+    int r = _getattr(dn->inode, caps, dirp->perms);
+    if (r < 0)
+      return r;
+
+    struct ceph_statx stx;
+    struct dirent de;
+    fill_statx(dn->inode, caps, &stx);
+
+    uint64_t next_off = dn->offset + 1;
+    ++pd;
+    if (pd == dir->readdir_cache.end())
+      next_off = dir_result_t::END;
+
+    Inode *in = NULL;
+    fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
+    if (getref) {
+      in = dn->inode.get();
+      _ll_get(in);
+    }
+
+    dn_name = dn->name; // fill in name while we have lock
+
+    client_lock.Unlock();
+    r = cb(p, &de, &stx, next_off, in);  // _next_ offset
+    client_lock.Lock();
+    ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
+		   << " = " << r << dendl;
+    if (r < 0) {
+      return r;
+    }
+
+    dirp->offset = next_off;
+    if (dirp->at_end())
+      dirp->next_offset = 2;
+    else
+      dirp->next_offset = dirp->offset_low();
+    dirp->last_name = dn_name; // we successfully returned this one; update!
+    if (r > 0)
+      return r;
+  }
+
+  ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
+  dirp->set_end();
+  return 0;
+}
+
+int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
+			 unsigned want, unsigned flags, bool getref)
+{
+  int caps = statx_to_mask(flags, want);
+
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  dir_result_t *dirp = static_cast<dir_result_t*>(d);
+
+  ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
+		 << dec << " at_end=" << dirp->at_end()
+		 << " hash_order=" << dirp->hash_order() << dendl;
+
+  struct dirent de;
+  struct ceph_statx stx;
+  memset(&de, 0, sizeof(de));
+  memset(&stx, 0, sizeof(stx));
+
+  InodeRef& diri = dirp->inode;
+
+  if (dirp->at_end())
+    return 0;
+
+  if (dirp->offset == 0) {
+    ldout(cct, 15) << " including ." << dendl;
+    assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
+    uint64_t next_off = 1;
+
+    int r;
+    r = _getattr(diri, caps, dirp->perms);
+    if (r < 0)
+      return r;
+
+    fill_statx(diri, caps, &stx);
+    fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
+
+    Inode *inode = NULL;
+    if (getref) {
+      inode = diri.get();
+      _ll_get(inode);
+    }
+
+    client_lock.Unlock();
+    r = cb(p, &de, &stx, next_off, inode);
+    client_lock.Lock();
+    if (r < 0)
+      return r;
+
+    dirp->offset = next_off;
+    if (r > 0)
+      return r;
+  }
+  if (dirp->offset == 1) {
+    ldout(cct, 15) << " including .." << dendl;
+    uint64_t next_off = 2;
+    InodeRef in;
+    if (diri->dn_set.empty())
+      in = diri;
+    else
+      in = diri->get_first_parent()->inode;
+
+    int r;
+    r = _getattr(diri, caps, dirp->perms);
+    if (r < 0)
+      return r;
+
+    fill_statx(in, caps, &stx);
+    fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
+
+    Inode *inode = NULL;
+    if (getref) {
+      inode = in.get();
+      _ll_get(inode);
+    }
+
+    client_lock.Unlock();
+    r = cb(p, &de, &stx, next_off, inode);
+    client_lock.Lock();
+    if (r < 0)
+      return r;
+
+    dirp->offset = next_off;
+    if (r > 0)
+      return r;
+  }
+
+  // can we read from our cache?
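+  // Only if this isn't a snapdir, our cached contents are known complete
+  // and ordered, and we still hold CEPH_CAP_FILE_SHARED; otherwise fall
+  // back to fetching dirfrags from the MDS below.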
+ ldout(cct, 10) << "offset " << hex << dirp->offset << dec + << " snapid " << dirp->inode->snapid << " (complete && ordered) " + << dirp->inode->is_complete_and_ordered() + << " issued " << ccap_string(dirp->inode->caps_issued()) + << dendl; + if (dirp->inode->snapid != CEPH_SNAPDIR && + dirp->inode->is_complete_and_ordered() && + dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED)) { + int err = _readdir_cache_cb(dirp, cb, p, caps, getref); + if (err != -EAGAIN) + return err; + } + + while (1) { + if (dirp->at_end()) + return 0; + + bool check_caps = true; + if (!dirp->is_cached()) { + int r = _readdir_get_frag(dirp); + if (r) + return r; + // _readdir_get_frag () may updates dirp->offset if the replied dirfrag is + // different than the requested one. (our dirfragtree was outdated) + check_caps = false; + } + frag_t fg = dirp->buffer_frag; + + ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size() + << " offset " << hex << dirp->offset << dendl; + + for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(), + dirp->offset, dir_result_t::dentry_off_lt()); + it != dirp->buffer.end(); + ++it) { + dir_result_t::dentry &entry = *it; + + uint64_t next_off = entry.offset + 1; + + int r; + if (check_caps) { + r = _getattr(entry.inode, caps, dirp->perms); + if (r < 0) + return r; + } + + fill_statx(entry.inode, caps, &stx); + fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off); + + Inode *inode = NULL; + if (getref) { + inode = entry.inode.get(); + _ll_get(inode); + } + + client_lock.Unlock(); + r = cb(p, &de, &stx, next_off, inode); // _next_ offset + client_lock.Lock(); + + ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec + << " = " << r << dendl; + if (r < 0) + return r; + + dirp->offset = next_off; + if (r > 0) + return r; + } + + if (dirp->next_offset > 2) { + ldout(cct, 10) << " fetching next chunk of this frag" << dendl; + _readdir_drop_dirp_buffer(dirp); + continue; // more! + } + + if (!fg.is_rightmost()) { + // next frag! 
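+      // advance in frag-tree order; for hash-order directories
+      // _readdir_next_frag() only moves the offset forward.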
+      _readdir_next_frag(dirp);
+      continue;
+    }
+
+    if (diri->shared_gen == dirp->start_shared_gen &&
+	diri->dir_release_count == dirp->release_count) {
+      if (diri->dir_ordered_count == dirp->ordered_count) {
+	ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
+	if (diri->dir) {
+	  assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
+	  diri->dir->readdir_cache.resize(dirp->cache_index);
+	}
+	diri->flags |= I_COMPLETE | I_DIR_ORDERED;
+      } else {
+	ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
+	diri->flags |= I_COMPLETE;
+      }
+    }
+
+    dirp->set_end();
+    return 0;
+  }
+  ceph_abort();
+  return 0;
+}
+
+
+int Client::readdir_r(dir_result_t *d, struct dirent *de)
+{
+  return readdirplus_r(d, de, 0, 0, 0, NULL);
+}
+
+/*
+ * readdirplus_r
+ *
+ * returns
+ *  1 if we got a dirent
+ *  0 for end of directory
+ * <0 on error
+ */
+
+struct single_readdir {
+  struct dirent *de;
+  struct ceph_statx *stx;
+  Inode *inode;
+  bool full;
+};
+
+static int _readdir_single_dirent_cb(void *p, struct dirent *de,
+				     struct ceph_statx *stx, off_t off,
+				     Inode *in)
+{
+  single_readdir *c = static_cast<single_readdir *>(p);
+
+  if (c->full)
+    return -1;  // already filled this dirent
+
+  *c->de = *de;
+  if (c->stx)
+    *c->stx = *stx;
+  c->inode = in;
+  c->full = true;
+  return 1;
+}
+
+struct dirent *Client::readdir(dir_result_t *d)
+{
+  int ret;
+  static struct dirent de;
+  single_readdir sr;
+  sr.de = &de;
+  sr.stx = NULL;
+  sr.inode = NULL;
+  sr.full = false;
+
+  // our callback fills the dirent and sets sr.full=true on first
+  // call, and returns -1 the second time around.
+  ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
+  if (ret < -1) {
+    errno = -ret;  // this sucks.
+    return (dirent *) NULL;
+  }
+  if (sr.full) {
+    return &de;
+  }
+  return (dirent *) NULL;
+}
+
+int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
+			  struct ceph_statx *stx, unsigned want,
+			  unsigned flags, Inode **out)
+{
+  single_readdir sr;
+  sr.de = de;
+  sr.stx = stx;
+  sr.inode = NULL;
+  sr.full = false;
+
+  // our callback fills the dirent and sets sr.full=true on first
+  // call, and returns -1 the second time around.
+  int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
+  if (r < -1)
+    return r;
+  if (out)
+    *out = sr.inode;
+  if (sr.full)
+    return 1;
+  return 0;
+}
+
+
+/* getdents */
+struct getdents_result {
+  char *buf;
+  int buflen;
+  int pos;
+  bool fullent;
+};
+
+static int _readdir_getdent_cb(void *p, struct dirent *de,
+			       struct ceph_statx *stx, off_t off, Inode *in)
+{
+  struct getdents_result *c = static_cast<getdents_result *>(p);
+
+  int dlen;
+  if (c->fullent)
+    dlen = sizeof(*de);
+  else
+    dlen = strlen(de->d_name) + 1;
+
+  if (c->pos + dlen > c->buflen)
+    return -1;  // doesn't fit
+
+  if (c->fullent) {
+    memcpy(c->buf + c->pos, de, sizeof(*de));
+  } else {
+    memcpy(c->buf + c->pos, de->d_name, dlen);
+  }
+  c->pos += dlen;
+  return 0;
+}
+
+int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
+{
+  getdents_result gr;
+  gr.buf = buf;
+  gr.buflen = buflen;
+  gr.fullent = fullent;
+  gr.pos = 0;
+
+  int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
+
+  if (r < 0) { // some error
+    if (r == -1) { // buffer ran out of space
+      if (gr.pos) { // but we got some entries already!
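+	// mirror getdents(2): a too-small buffer is only an error
+	// (-ERANGE below) when not even one entry fit.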
+	return gr.pos;
+      } // or we need a larger buffer
+      return -ERANGE;
+    } else { // actual error, return it
+      return r;
+    }
+  }
+  return gr.pos;
+}
+
+
+/* getdir */
+struct getdir_result {
+  list<string> *contents;
+  int num;
+};
+
+static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
+{
+  getdir_result *r = static_cast<getdir_result *>(p);
+
+  r->contents->push_back(de->d_name);
+  r->num++;
+  return 0;
+}
+
+int Client::getdir(const char *relpath, list<string>& contents,
+		   const UserPerm& perms)
+{
+  ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
+  {
+    Mutex::Locker lock(client_lock);
+    tout(cct) << "getdir" << std::endl;
+    tout(cct) << relpath << std::endl;
+  }
+
+  dir_result_t *d;
+  int r = opendir(relpath, &d, perms);
+  if (r < 0)
+    return r;
+
+  getdir_result gr;
+  gr.contents = &contents;
+  gr.num = 0;
+  r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
+
+  closedir(d);
+
+  if (r < 0)
+    return r;
+  return gr.num;
+}
+
+
+/****** file i/o **********/
+int Client::open(const char *relpath, int flags, const UserPerm& perms,
+		 mode_t mode, int stripe_unit, int stripe_count,
+		 int object_size, const char *data_pool)
+{
+  ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "open" << std::endl;
+  tout(cct) << relpath << std::endl;
+  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  Fh *fh = NULL;
+
+#if defined(__linux__) && defined(O_PATH)
+  /* When O_PATH is specified, flags other than O_DIRECTORY and
+   * O_NOFOLLOW are ignored. Please refer to the do_entry_open() function
+   * in the kernel (fs/open.c). */
+  if (flags & O_PATH)
+    flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
+#endif
+
+  filepath path(relpath);
+  InodeRef in;
+  bool created = false;
+  /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
+  bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
+  int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));
+
+  if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
+    return -EEXIST;
+
+#if defined(__linux__) && defined(O_PATH)
+  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
+#else
+  if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
+#endif
+    return -ELOOP;
+
+  if (r == -ENOENT && (flags & O_CREAT)) {
+    filepath dirpath = path;
+    string dname = dirpath.last_dentry();
+    dirpath.pop_dentry();
+    InodeRef dir;
+    r = path_walk(dirpath, &dir, perms, true,
+		  cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
+    if (r < 0)
+      goto out;
+    if (cct->_conf->client_permissions) {
+      r = may_create(dir.get(), perms);
+      if (r < 0)
+	goto out;
+    }
+    r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
+		stripe_count, object_size, data_pool, &created, perms);
+  }
+  if (r < 0)
+    goto out;
+
+  if (!created) {
+    // posix says we can only check permissions of existing files
+    if (cct->_conf->client_permissions) {
+      r = may_open(in.get(), flags, perms);
+      if (r < 0)
+	goto out;
+    }
+  }
+
+  if (!fh)
+    r = _open(in.get(), flags, mode, &fh, perms);
+  if (r >= 0) {
+    // allocate an integer file descriptor
+    assert(fh);
+    r = get_fd();
+    assert(fd_map.count(r) == 0);
+    fd_map[r] = fh;
+  }
+
+ out:
+  tout(cct) << r << std::endl;
+  ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
+  return r;
+}
+
+int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
+{
+  /* Use default file striping parameters */
+  return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
+}
+
+int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
+			const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
+  filepath path(ino);
+  req->set_filepath(path);
+
+  uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
+  char f[30];
+  sprintf(f, "%u", h);
+  filepath path2(dirino);
+  path2.push_dentry(string(f));
+  req->set_filepath2(path2);
+
+  int r = make_request(req, perms, NULL, NULL,
+		       rand() % mdsmap->get_num_in_mds());
+  ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
+  return r;
+}
+
+
+/**
+ * Load inode into local cache.
+ *
+ * If the inode pointer is non-NULL, we also take a reference on
+ * the resulting Inode object in one operation, so that the caller
+ * can safely assume the inode will still be there after return.
+ */
+int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
+  filepath path(ino);
+  req->set_filepath(path);
+
+  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
+  if (r == 0 && inode != NULL) {
+    vinodeno_t vino(ino, CEPH_NOSNAP);
+    unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
+    assert(p != inode_map.end());
+    *inode = p->second;
+    _ll_get(*inode);
+  }
+  ldout(cct, 3) << "lookup_ino exit(" << ino << ") = " << r << dendl;
+  return r;
+}
+
+
+
+/**
+ * Find the parent inode of `ino` and insert it into
+ * our cache. Conditionally also set `parent` to a referenced
+ * Inode* if caller provides non-NULL value.
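+ *
+ * On success with a non-NULL `parent`, the caller owns a reference on
+ * the returned Inode (taken via _ll_get) and should release it when
+ * done, e.g. via ll_forget().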
+ */
+int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  if (!ino->dn_set.empty()) {
+    // if we exposed the parent here, we'd need to check permissions,
+    // but right now we just rely on the MDS doing so in make_request
+    ldout(cct, 3) << "lookup_parent dentry already present" << dendl;
+    return 0;
+  }
+
+  if (ino->is_root()) {
+    *parent = NULL;
+    ldout(cct, 3) << "ino is root, no parent" << dendl;
+    return -EINVAL;
+  }
+
+  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
+  filepath path(ino->ino);
+  req->set_filepath(path);
+
+  InodeRef target;
+  int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
+  // Give caller a reference to the parent ino if they provided a pointer.
+  if (parent != NULL) {
+    if (r == 0) {
+      *parent = target.get();
+      _ll_get(*parent);
+      ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
+    } else {
+      *parent = NULL;
+    }
+  }
+  ldout(cct, 3) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
+  return r;
+}
+
+
+/**
+ * Populate the parent dentry for `ino`, provided it is
+ * a child of `parent`.
+ */
+int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
+{
+  assert(parent->is_dir());
+
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
+  req->set_filepath2(filepath(parent->ino));
+  req->set_filepath(filepath(ino->ino));
+  req->set_inode(ino);
+
+  int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
+  ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
+  return r;
+}
+
+
+Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
+{
+  assert(in);
+  Fh *f = new Fh(in);
+  f->mode = cmode;
+  f->flags = flags;
+
+  // inode
+  f->actor_perms = perms;
+
+  ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;
+
+  if (in->snapid != CEPH_NOSNAP) {
+    in->snap_cap_refs++;
+    ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
+		  << ccap_string(in->caps_issued()) << dendl;
+  }
+
+  const md_config_t *conf = cct->_conf;
+  f->readahead.set_trigger_requests(1);
+  f->readahead.set_min_readahead_size(conf->client_readahead_min);
+  uint64_t max_readahead = Readahead::NO_LIMIT;
+  if (conf->client_readahead_max_bytes) {
+    max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
+  }
+  if (conf->client_readahead_max_periods) {
+    max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
+  }
+  f->readahead.set_max_readahead_size(max_readahead);
+  vector<uint64_t> alignments;
+  alignments.push_back(in->layout.get_period());
+  alignments.push_back(in->layout.stripe_unit);
+  f->readahead.set_alignments(alignments);
+
+  return f;
+}
+
+int Client::_release_fh(Fh *f)
+{
+  //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
+  //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
+  Inode *in = f->inode.get();
+  ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;
+
+  if (in->snapid == CEPH_NOSNAP) {
+    if (in->put_open_ref(f->mode)) {
+      _flush(in, new C_Client_FlushComplete(this, in));
+      check_caps(in, 0);
+    }
+  } else {
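+    // snapshot inodes are pinned via snap_cap_refs (taken in
+    // _create_fh()) rather than normal open refs.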
assert(in->snap_cap_refs > 0); + in->snap_cap_refs--; + } + + _release_filelocks(f); + + // Finally, read any async err (i.e. from flushes) + int err = f->take_async_err(); + if (err != 0) { + ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = " + << cpp_strerror(err) << dendl; + } else { + ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl; + } + + _put_fh(f); + + return err; +} + +void Client::_put_fh(Fh *f) +{ + int left = f->put(); + if (!left) { + delete f; + } +} + +int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp, + const UserPerm& perms) +{ + if (in->snapid != CEPH_NOSNAP && + (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) { + return -EROFS; + } + + // use normalized flags to generate cmode + int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags)); + if (cmode < 0) + return -EINVAL; + int want = ceph_caps_for_mode(cmode); + int result = 0; + + in->get_open_ref(cmode); // make note of pending open, since it effects _wanted_ caps. + + if ((flags & O_TRUNC) == 0 && + in->caps_issued_mask(want)) { + // update wanted? + check_caps(in, CHECK_CAPS_NODELAY); + } else { + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN); + filepath path; + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT); + req->head.args.open.mode = mode; + req->head.args.open.pool = -1; + if (cct->_conf->client_debug_getattr_caps) + req->head.args.open.mask = DEBUG_GETATTR_CAPS; + else + req->head.args.open.mask = 0; + req->head.args.open.old_size = in->size; // for O_TRUNC + req->set_inode(in); + result = make_request(req, perms); + } + + // success? + if (result >= 0) { + if (fhp) + *fhp = _create_fh(in, flags, cmode, perms); + } else { + in->put_open_ref(cmode); + } + + trim_cache(); + + return result; +} + +int Client::_renew_caps(Inode *in) +{ + int wanted = in->caps_file_wanted(); + if (in->is_any_caps() && + ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) { + check_caps(in, CHECK_CAPS_NODELAY); + return 0; + } + + int flags = 0; + if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR)) + flags = O_RDWR; + else if (wanted & CEPH_CAP_FILE_RD) + flags = O_RDONLY; + else if (wanted & CEPH_CAP_FILE_WR) + flags = O_WRONLY; + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN); + filepath path; + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->head.args.open.flags = flags; + req->head.args.open.pool = -1; + if (cct->_conf->client_debug_getattr_caps) + req->head.args.open.mask = DEBUG_GETATTR_CAPS; + else + req->head.args.open.mask = 0; + req->set_inode(in); + + // duplicate in case Cap goes away; not sure if that race is a concern? 
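+  // (make_request may drop client_lock, and the Cap owning these perms
+  //  could go away in the meantime, so take a copy by value.)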
+ const UserPerm *pperm = in->get_best_perms(); + UserPerm perms; + if (pperm != NULL) + perms = *pperm; + int ret = make_request(req, perms); + return ret; +} + +int Client::close(int fd) +{ + ldout(cct, 3) << "close enter(" << fd << ")" << dendl; + Mutex::Locker lock(client_lock); + tout(cct) << "close" << std::endl; + tout(cct) << fd << std::endl; + + if (unmounting) + return -ENOTCONN; + + Fh *fh = get_filehandle(fd); + if (!fh) + return -EBADF; + int err = _release_fh(fh); + fd_map.erase(fd); + put_fd(fd); + ldout(cct, 3) << "close exit(" << fd << ")" << dendl; + return err; +} + + +// ------------ +// read, write + +loff_t Client::lseek(int fd, loff_t offset, int whence) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "lseek" << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << offset << std::endl; + tout(cct) << whence << std::endl; + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif + return _lseek(f, offset, whence); +} + +loff_t Client::_lseek(Fh *f, loff_t offset, int whence) +{ + Inode *in = f->inode.get(); + int r; + + switch (whence) { + case SEEK_SET: + f->pos = offset; + break; + + case SEEK_CUR: + f->pos += offset; + break; + + case SEEK_END: + r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms); + if (r < 0) + return r; + f->pos = in->size + offset; + break; + + default: + ceph_abort(); + } + + ldout(cct, 3) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl; + return f->pos; +} + + +void Client::lock_fh_pos(Fh *f) +{ + ldout(cct, 10) << "lock_fh_pos " << f << dendl; + + if (f->pos_locked || !f->pos_waiters.empty()) { + Cond cond; + f->pos_waiters.push_back(&cond); + ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl; + while (f->pos_locked || f->pos_waiters.front() != &cond) + cond.Wait(client_lock); + ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl; + assert(f->pos_waiters.front() == &cond); + f->pos_waiters.pop_front(); + } + + f->pos_locked = true; +} + +void Client::unlock_fh_pos(Fh *f) +{ + ldout(cct, 10) << "unlock_fh_pos " << f << dendl; + f->pos_locked = false; +} + +int Client::uninline_data(Inode *in, Context *onfinish) +{ + if (!in->inline_data.length()) { + onfinish->complete(0); + return 0; + } + + char oid_buf[32]; + snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino); + object_t oid = oid_buf; + + ObjectOperation create_ops; + create_ops.create(false); + + objecter->mutate(oid, + OSDMap::file_to_object_locator(in->layout), + create_ops, + in->snaprealm->get_snap_context(), + ceph::real_clock::now(), + 0, + NULL); + + bufferlist inline_version_bl; + ::encode(in->inline_version, inline_version_bl); + + ObjectOperation uninline_ops; + uninline_ops.cmpxattr("inline_version", + CEPH_OSD_CMPXATTR_OP_GT, + CEPH_OSD_CMPXATTR_MODE_U64, + inline_version_bl); + bufferlist inline_data = in->inline_data; + uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq); + uninline_ops.setxattr("inline_version", stringify(in->inline_version)); + + objecter->mutate(oid, + OSDMap::file_to_object_locator(in->layout), + uninline_ops, + in->snaprealm->get_snap_context(), + ceph::real_clock::now(), + 0, + onfinish); + + return 0; +} + +// + +// blocking osd interface + +int Client::read(int fd, char *buf, loff_t size, loff_t offset) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "read" << std::endl; + tout(cct) << fd << std::endl; 
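+  // a negative offset means "read from the current file position and
+  // advance it"; see the movepos handling in _read() below.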
+ tout(cct) << size << std::endl; + tout(cct) << offset << std::endl; + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif + bufferlist bl; + int r = _read(f, offset, size, &bl); + ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl; + if (r >= 0) { + bl.copy(0, bl.length(), buf); + r = bl.length(); + } + return r; +} + +int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset) +{ + if (iovcnt < 0) + return -EINVAL; + return _preadv_pwritev(fd, iov, iovcnt, offset, false); +} + +int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl) +{ + const md_config_t *conf = cct->_conf; + Inode *in = f->inode.get(); + + if ((f->mode & CEPH_FILE_MODE_RD) == 0) + return -EBADF; + //bool lazy = f->mode == CEPH_FILE_MODE_LAZY; + + bool movepos = false; + if (offset < 0) { + lock_fh_pos(f); + offset = f->pos; + movepos = true; + } + loff_t start_pos = offset; + + if (in->inline_version == 0) { + int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true); + if (r < 0) { + if (movepos) + unlock_fh_pos(f); + return r; + } + assert(in->inline_version > 0); + } + +retry: + int have; + int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1); + if (r < 0) { + if (movepos) + unlock_fh_pos(f); + return r; + } + if (f->flags & O_DIRECT) + have &= ~CEPH_CAP_FILE_CACHE; + + Mutex uninline_flock("Client::_read_uninline_data flock"); + Cond uninline_cond; + bool uninline_done = false; + int uninline_ret = 0; + Context *onuninline = NULL; + + if (in->inline_version < CEPH_INLINE_NONE) { + if (!(have & CEPH_CAP_FILE_CACHE)) { + onuninline = new C_SafeCond(&uninline_flock, + &uninline_cond, + &uninline_done, + &uninline_ret); + uninline_data(in, onuninline); + } else { + uint32_t len = in->inline_data.length(); + + uint64_t endoff = offset + size; + if (endoff > in->size) + endoff = in->size; + + if (offset < len) { + if (endoff <= len) { + bl->substr_of(in->inline_data, offset, endoff - offset); + } else { + bl->substr_of(in->inline_data, offset, len - offset); + bl->append_zero(endoff - len); + } + } else if ((uint64_t)offset < endoff) { + bl->append_zero(endoff - offset); + } + + goto success; + } + } + + if (!conf->client_debug_force_sync_read && + (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) { + + if (f->flags & O_RSYNC) { + _flush_range(in, offset, size); + } + r = _read_async(f, offset, size, bl); + if (r < 0) + goto done; + } else { + if (f->flags & O_DIRECT) + _flush_range(in, offset, size); + + bool checkeof = false; + r = _read_sync(f, offset, size, bl, &checkeof); + if (r < 0) + goto done; + if (checkeof) { + offset += r; + size -= r; + + put_cap_ref(in, CEPH_CAP_FILE_RD); + have = 0; + // reverify size + r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms); + if (r < 0) + goto done; + + // eof? short read. + if ((uint64_t)offset < in->size) + goto retry; + } + } + +success: + if (movepos) { + // adjust fd pos + f->pos = start_pos + bl->length(); + unlock_fh_pos(f); + } + +done: + // done! 
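+ // Success and error paths both land here so that the uninline
+ // completion below and the CEPH_CAP_FILE_RD reference taken above are
+ // always released before we return.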
+ + if (onuninline) { + client_lock.Unlock(); + uninline_flock.Lock(); + while (!uninline_done) + uninline_cond.Wait(uninline_flock); + uninline_flock.Unlock(); + client_lock.Lock(); + + if (uninline_ret >= 0 || uninline_ret == -ECANCELED) { + in->inline_data.clear(); + in->inline_version = CEPH_INLINE_NONE; + mark_caps_dirty(in, CEPH_CAP_FILE_WR); + check_caps(in, 0); + } else + r = uninline_ret; + } + + if (have) + put_cap_ref(in, CEPH_CAP_FILE_RD); + if (r < 0) { + if (movepos) + unlock_fh_pos(f); + return r; + } else + return bl->length(); +} + +Client::C_Readahead::C_Readahead(Client *c, Fh *f) : + client(c), f(f) { + f->get(); + f->readahead.inc_pending(); +} + +Client::C_Readahead::~C_Readahead() { + f->readahead.dec_pending(); + client->_put_fh(f); +} + +void Client::C_Readahead::finish(int r) { + lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl; + client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE); +} + +int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl) +{ + const md_config_t *conf = cct->_conf; + Inode *in = f->inode.get(); + + ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl; + + // trim read based on file size? + if (off >= in->size) + return 0; + if (len == 0) + return 0; + if (off + len > in->size) { + len = in->size - off; + } + + ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size() + << " max_bytes=" << f->readahead.get_max_readahead_size() + << " max_periods=" << conf->client_readahead_max_periods << dendl; + + // read (and possibly block) + int r, rvalue = 0; + Mutex flock("Client::_read_async flock"); + Cond cond; + bool done = false; + Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue); + r = objectcacher->file_read(&in->oset, &in->layout, in->snapid, + off, len, bl, 0, onfinish); + if (r == 0) { + get_cap_ref(in, CEPH_CAP_FILE_CACHE); + client_lock.Unlock(); + flock.Lock(); + while (!done) + cond.Wait(flock); + flock.Unlock(); + client_lock.Lock(); + put_cap_ref(in, CEPH_CAP_FILE_CACHE); + r = rvalue; + } else { + // it was cached. 
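+ // A non-zero return from file_read() means the read was satisfied
+ // synchronously from the object cache, so the completion will never
+ // fire and must be deleted by us.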
+ delete onfinish;
+ }
+
+ if (f->readahead.get_min_readahead_size() > 0) {
+ pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
+ if (readahead_extent.second > 0) {
+ ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
+ << " (caller wants " << off << "~" << len << ")" << dendl;
+ Context *onfinish2 = new C_Readahead(this, f);
+ int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
+ readahead_extent.first, readahead_extent.second,
+ NULL, 0, onfinish2);
+ if (r2 == 0) {
+ ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
+ get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
+ } else {
+ ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
+ delete onfinish2;
+ }
+ }
+ }
+
+ return r;
+}
+
+int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
+ bool *checkeof)
+{
+ Inode *in = f->inode.get();
+ uint64_t pos = off;
+ int left = len;
+ int read = 0;
+
+ ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;
+
+ Mutex flock("Client::_read_sync flock");
+ Cond cond;
+ while (left > 0) {
+ int r = 0;
+ bool done = false;
+ Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
+ bufferlist tbl;
+
+ int wanted = left;
+ filer->read_trunc(in->ino, &in->layout, in->snapid,
+ pos, left, &tbl, 0,
+ in->truncate_size, in->truncate_seq,
+ onfinish);
+ client_lock.Unlock();
+ flock.Lock();
+ while (!done)
+ cond.Wait(flock);
+ flock.Unlock();
+ client_lock.Lock();
+
+ // if we get ENOENT from OSD, assume 0 bytes returned
+ if (r == -ENOENT)
+ r = 0;
+ if (r < 0)
+ return r;
+ if (tbl.length()) {
+ r = tbl.length();
+
+ read += r;
+ pos += r;
+ left -= r;
+ bl->claim_append(tbl);
+ }
+ // short read?
+ if (r >= 0 && r < wanted) {
+ if (pos < in->size) {
+ // zero up to known EOF
+ int64_t some = in->size - pos;
+ if (some > left)
+ some = left;
+ bufferptr z(some);
+ z.zero();
+ bl->push_back(z);
+ read += some;
+ pos += some;
+ left -= some;
+ if (left == 0)
+ return read;
+ }
+
+ *checkeof = true;
+ return read;
+ }
+ }
+ return read;
+}
+
+
+/*
+ * we keep count of uncommitted sync writes on the inode, so that
+ * fsync can DDRT, i.e. wait for them to commit before returning.
+ */ +void Client::_sync_write_commit(Inode *in) +{ + assert(unsafe_sync_write > 0); + unsafe_sync_write--; + + put_cap_ref(in, CEPH_CAP_FILE_BUFFER); + + ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl; + if (unsafe_sync_write == 0 && unmounting) { + ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl; + mount_cond.Signal(); + } +} + +int Client::write(int fd, const char *buf, loff_t size, loff_t offset) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "write" << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << size << std::endl; + tout(cct) << offset << std::endl; + + if (unmounting) + return -ENOTCONN; + + Fh *fh = get_filehandle(fd); + if (!fh) + return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (fh->flags & O_PATH) + return -EBADF; +#endif + int r = _write(fh, offset, size, buf, NULL, 0); + ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl; + return r; +} + +int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset) +{ + if (iovcnt < 0) + return -EINVAL; + return _preadv_pwritev(fd, iov, iovcnt, offset, true); +} + +int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write) +{ + Mutex::Locker lock(client_lock); + tout(cct) << fd << std::endl; + tout(cct) << offset << std::endl; + + if (unmounting) + return -ENOTCONN; + + Fh *fh = get_filehandle(fd); + if (!fh) + return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (fh->flags & O_PATH) + return -EBADF; +#endif + loff_t totallen = 0; + for (unsigned i = 0; i < iovcnt; i++) { + totallen += iov[i].iov_len; + } + if (write) { + int w = _write(fh, offset, totallen, NULL, iov, iovcnt); + ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl; + return w; + } else { + bufferlist bl; + int r = _read(fh, offset, totallen, &bl); + ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl; + if (r <= 0) + return r; + + int bufoff = 0; + for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) { + /* + * This piece of code aims to handle the case that bufferlist does not have enough data + * to fill in the iov + */ + if (resid < iov[j].iov_len) { + bl.copy(bufoff, resid, (char *)iov[j].iov_base); + break; + } else { + bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base); + } + resid -= iov[j].iov_len; + bufoff += iov[j].iov_len; + } + return r; + } +} + +int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf, + const struct iovec *iov, int iovcnt) +{ + if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large! + return -EFBIG; + + //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl; + Inode *in = f->inode.get(); + + if (objecter->osdmap_pool_full(in->layout.pool_id)) { + return -ENOSPC; + } + + assert(in->snapid == CEPH_NOSNAP); + + // was Fh opened as writeable? + if ((f->mode & CEPH_FILE_MODE_WR) == 0) + return -EBADF; + + // check quota + uint64_t endoff = offset + size; + if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size, + f->actor_perms)) { + return -EDQUOT; + } + + // use/adjust fd pos? + if (offset < 0) { + lock_fh_pos(f); + /* + * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may + * change out from under us. 
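+ * (e.g. another client may append to the file between the moment we
+ * sample f->pos and the moment we are granted the write caps, so
+ * O_APPEND is only best-effort across clients)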
+ */ + if (f->flags & O_APPEND) { + int r = _lseek(f, 0, SEEK_END); + if (r < 0) { + unlock_fh_pos(f); + return r; + } + } + offset = f->pos; + f->pos = offset+size; + unlock_fh_pos(f); + } + + //bool lazy = f->mode == CEPH_FILE_MODE_LAZY; + + ldout(cct, 10) << "cur file size is " << in->size << dendl; + + // time it. + utime_t start = ceph_clock_now(); + + if (in->inline_version == 0) { + int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true); + if (r < 0) + return r; + assert(in->inline_version > 0); + } + + // copy into fresh buffer (since our write may be resub, async) + bufferlist bl; + if (buf) { + if (size > 0) + bl.append(buf, size); + } else if (iov){ + for (int i = 0; i < iovcnt; i++) { + if (iov[i].iov_len > 0) { + bl.append((const char *)iov[i].iov_base, iov[i].iov_len); + } + } + } + + utime_t lat; + uint64_t totalwritten; + int have; + int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED, + CEPH_CAP_FILE_BUFFER, &have, endoff); + if (r < 0) + return r; + + /* clear the setuid/setgid bits, if any */ + if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) { + struct ceph_statx stx = { 0 }; + + put_cap_ref(in, CEPH_CAP_AUTH_SHARED); + r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms); + if (r < 0) + return r; + } else { + put_cap_ref(in, CEPH_CAP_AUTH_SHARED); + } + + if (f->flags & O_DIRECT) + have &= ~CEPH_CAP_FILE_BUFFER; + + ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl; + + Mutex uninline_flock("Client::_write_uninline_data flock"); + Cond uninline_cond; + bool uninline_done = false; + int uninline_ret = 0; + Context *onuninline = NULL; + + if (in->inline_version < CEPH_INLINE_NONE) { + if (endoff > cct->_conf->client_max_inline_size || + endoff > CEPH_INLINE_MAX_SIZE || + !(have & CEPH_CAP_FILE_BUFFER)) { + onuninline = new C_SafeCond(&uninline_flock, + &uninline_cond, + &uninline_done, + &uninline_ret); + uninline_data(in, onuninline); + } else { + get_cap_ref(in, CEPH_CAP_FILE_BUFFER); + + uint32_t len = in->inline_data.length(); + + if (endoff < len) + in->inline_data.copy(endoff, len - endoff, bl); + + if (offset < len) + in->inline_data.splice(offset, len - offset); + else if (offset > len) + in->inline_data.append_zero(offset - len); + + in->inline_data.append(bl); + in->inline_version++; + + put_cap_ref(in, CEPH_CAP_FILE_BUFFER); + + goto success; + } + } + + if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) { + // do buffered write + if (!in->oset.dirty_or_tx) + get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER); + + get_cap_ref(in, CEPH_CAP_FILE_BUFFER); + + // async, caching, non-blocking. 
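+ // file_write() just copies the data into the ObjectCacher and returns;
+ // nothing waits for the OSDs here. Writeback happens later from the
+ // cache's flusher, and the CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_BUFFER
+ // reference taken above when the oset first became dirty keeps the
+ // inode pinned until the cache is clean again.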
+ r = objectcacher->file_write(&in->oset, &in->layout, + in->snaprealm->get_snap_context(), + offset, size, bl, ceph::real_clock::now(), + 0); + put_cap_ref(in, CEPH_CAP_FILE_BUFFER); + + if (r < 0) + goto done; + + // flush cached write if O_SYNC is set on file fh + // O_DSYNC == O_SYNC on linux < 2.6.33 + // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33 + if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) { + _flush_range(in, offset, size); + } + } else { + if (f->flags & O_DIRECT) + _flush_range(in, offset, size); + + // simple, non-atomic sync write + Mutex flock("Client::_write flock"); + Cond cond; + bool done = false; + Context *onfinish = new C_SafeCond(&flock, &cond, &done); + + unsafe_sync_write++; + get_cap_ref(in, CEPH_CAP_FILE_BUFFER); // released by onsafe callback + + filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(), + offset, size, bl, ceph::real_clock::now(), 0, + in->truncate_size, in->truncate_seq, + onfinish); + client_lock.Unlock(); + flock.Lock(); + + while (!done) + cond.Wait(flock); + flock.Unlock(); + client_lock.Lock(); + _sync_write_commit(in); + } + + // if we get here, write was successful, update client metadata +success: + // time + lat = ceph_clock_now(); + lat -= start; + logger->tinc(l_c_wrlat, lat); + + totalwritten = size; + r = (int)totalwritten; + + // extend file? + if (totalwritten + offset > in->size) { + in->size = totalwritten + offset; + mark_caps_dirty(in, CEPH_CAP_FILE_WR); + + if (is_quota_bytes_approaching(in, f->actor_perms)) { + check_caps(in, CHECK_CAPS_NODELAY); + } else if (is_max_size_approaching(in)) { + check_caps(in, 0); + } + + ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl; + } else { + ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl; + } + + // mtime + in->mtime = ceph_clock_now(); + in->change_attr++; + mark_caps_dirty(in, CEPH_CAP_FILE_WR); + +done: + + if (onuninline) { + client_lock.Unlock(); + uninline_flock.Lock(); + while (!uninline_done) + uninline_cond.Wait(uninline_flock); + uninline_flock.Unlock(); + client_lock.Lock(); + + if (uninline_ret >= 0 || uninline_ret == -ECANCELED) { + in->inline_data.clear(); + in->inline_version = CEPH_INLINE_NONE; + mark_caps_dirty(in, CEPH_CAP_FILE_WR); + check_caps(in, 0); + } else + r = uninline_ret; + } + + put_cap_ref(in, CEPH_CAP_FILE_WR); + return r; +} + +int Client::_flush(Fh *f) +{ + Inode *in = f->inode.get(); + int err = f->take_async_err(); + if (err != 0) { + ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = " + << cpp_strerror(err) << dendl; + } else { + ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl; + } + + return err; +} + +int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms) +{ + struct ceph_statx stx; + stx.stx_size = length; + return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms); +} + +int Client::ftruncate(int fd, loff_t length, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "ftruncate" << std::endl; + tout(cct) << fd << std::endl; + tout(cct) << length << std::endl; + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; +#if defined(__linux__) && defined(O_PATH) + if (f->flags & O_PATH) + return -EBADF; +#endif + struct stat attr; + attr.st_size = length; + return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms); +} + +int Client::fsync(int fd, bool 
syncdataonly)
+{
+ Mutex::Locker lock(client_lock);
+ tout(cct) << "fsync" << std::endl;
+ tout(cct) << fd << std::endl;
+ tout(cct) << syncdataonly << std::endl;
+
+ if (unmounting)
+ return -ENOTCONN;
+
+ Fh *f = get_filehandle(fd);
+ if (!f)
+ return -EBADF;
+#if defined(__linux__) && defined(O_PATH)
+ if (f->flags & O_PATH)
+ return -EBADF;
+#endif
+ int r = _fsync(f, syncdataonly);
+ if (r == 0) {
+ // The IOs in this fsync were okay, but maybe something happened
+ // in the background that we should be reporting?
+ r = f->take_async_err();
+ ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly
+ << ") = 0, async_err = " << r << dendl;
+ } else {
+ // Assume that an error we encountered during fsync, even reported
+ // synchronously, would also have applied the error to the Fh, and we
+ // should clear it here to avoid returning the same error again on next
+ // call.
+ ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly << ") = "
+ << r << dendl;
+ f->take_async_err();
+ }
+ return r;
+}
+
+int Client::_fsync(Inode *in, bool syncdataonly)
+{
+ int r = 0;
+ Mutex lock("Client::_fsync::lock");
+ Cond cond;
+ bool done = false;
+ C_SafeCond *object_cacher_completion = NULL;
+ ceph_tid_t flush_tid = 0;
+ InodeRef tmp_ref;
+
+ ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
+
+ if (cct->_conf->client_oc) {
+ object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
+ tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
+ _flush(in, object_cacher_completion);
+ ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
+ }
+
+ if (!syncdataonly && in->dirty_caps) {
+ check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
+ if (in->flushing_caps)
+ flush_tid = last_flush_tid;
+ } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
+
+ if (!syncdataonly && !in->unsafe_ops.empty()) {
+ MetaRequest *req = in->unsafe_ops.back();
+ ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
+
+ req->get();
+ wait_on_list(req->waitfor_safe);
+ put_request(req);
+ }
+
+ if (object_cacher_completion) { // wait on a real reply instead of guessing
+ client_lock.Unlock();
+ lock.Lock();
+ ldout(cct, 15) << "waiting on data to flush" << dendl;
+ while (!done)
+ cond.Wait(lock);
+ lock.Unlock();
+ client_lock.Lock();
+ ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
+ } else {
+ // FIXME: this can starve
+ while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
+ ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
+ << " uncommitted, waiting" << dendl;
+ wait_on_list(in->waitfor_commit);
+ }
+ }
+
+ if (!r) {
+ if (flush_tid > 0)
+ wait_sync_caps(in, flush_tid);
+
+ ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
+ } else {
+ ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! "
+ << cpp_strerror(-r) << dendl;
+ }
+
+ return r;
+}
+
+int Client::_fsync(Fh *f, bool syncdataonly)
+{
+ ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ?
"dataonly)":"data+metadata)") << dendl; + return _fsync(f->inode.get(), syncdataonly); +} + +int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "fstat mask " << hex << mask << dec << std::endl; + tout(cct) << fd << std::endl; + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + int r = _getattr(f->inode, mask, perms); + if (r < 0) + return r; + fill_stat(f->inode, stbuf, NULL); + ldout(cct, 3) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl; + return r; +} + +int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms, + unsigned int want, unsigned int flags) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl; + tout(cct) << fd << std::endl; + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + + unsigned mask = statx_to_mask(flags, want); + + int r = 0; + if (mask && !f->inode->caps_issued_mask(mask)) { + r = _getattr(f->inode, mask, perms); + if (r < 0) { + ldout(cct, 3) << "fstatx exit on error!" << dendl; + return r; + } + } + + fill_statx(f->inode, mask, stx); + ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl; + return r; +} + +// not written yet, but i want to link! + +int Client::chdir(const char *relpath, std::string &new_cwd, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "chdir" << std::endl; + tout(cct) << relpath << std::endl; + + if (unmounting) + return -ENOTCONN; + + filepath path(relpath); + InodeRef in; + int r = path_walk(path, &in, perms); + if (r < 0) + return r; + if (cwd != in) + cwd.swap(in); + ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl; + + _getcwd(new_cwd, perms); + return 0; +} + +void Client::_getcwd(string& dir, const UserPerm& perms) +{ + filepath path; + ldout(cct, 10) << "getcwd " << *cwd << dendl; + + Inode *in = cwd.get(); + while (in != root) { + assert(in->dn_set.size() < 2); // dirs can't be hard-linked + + // A cwd or ancester is unlinked + if (in->dn_set.empty()) { + return; + } + + Dentry *dn = in->get_first_parent(); + + + if (!dn) { + // look it up + ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl; + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME); + filepath path(in->ino); + req->set_filepath(path); + req->set_inode(in); + int res = make_request(req, perms); + if (res < 0) + break; + + // start over + path = filepath(); + in = cwd.get(); + continue; + } + path.push_front_dentry(dn->name); + in = dn->dir->parent_inode; + } + dir = "/"; + dir += path.get_path(); +} + +void Client::getcwd(string& dir, const UserPerm& perms) +{ + Mutex::Locker l(client_lock); + if (!unmounting) + _getcwd(dir, perms); +} + +int Client::statfs(const char *path, struct statvfs *stbuf, + const UserPerm& perms) +{ + Mutex::Locker l(client_lock); + tout(cct) << "statfs" << std::endl; + + if (unmounting) + return -ENOTCONN; + + ceph_statfs stats; + C_SaferCond cond; + + const vector &data_pools = mdsmap->get_data_pools(); + if (data_pools.size() == 1) { + objecter->get_fs_stats(stats, data_pools[0], &cond); + } else { + objecter->get_fs_stats(stats, boost::optional(), &cond); + } + + client_lock.Unlock(); + int rval = cond.wait(); + client_lock.Lock(); + + if (rval < 0) { + ldout(cct, 1) << "underlying call to statfs returned error: " + << cpp_strerror(rval) + << dendl; + return rval; + } + + 
memset(stbuf, 0, sizeof(*stbuf)); + + /* + * we're going to set a block size of 4MB so we can represent larger + * FSes without overflowing. Additionally convert the space + * measurements from KB to bytes while making them in terms of + * blocks. We use 4MB only because it is big enough, and because it + * actually *is* the (ceph) default block size. + */ + const int CEPH_BLOCK_SHIFT = 22; + stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT; + stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT; + stbuf->f_files = stats.num_objects; + stbuf->f_ffree = -1; + stbuf->f_favail = -1; + stbuf->f_fsid = -1; // ?? + stbuf->f_flag = 0; // ?? + stbuf->f_namemax = NAME_MAX; + + // Usually quota_root will == root_ancestor, but if the mount root has no + // quota but we can see a parent of it that does have a quota, we'll + // respect that one instead. + assert(root != nullptr); + Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms); + + // get_quota_root should always give us something + // because client quotas are always enabled + assert(quota_root != nullptr); + + if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) { + + // Skip the getattr if any sessions are stale, as we don't want to + // block `df` if this client has e.g. been evicted, or if the MDS cluster + // is unhealthy. + if (!_any_stale_sessions()) { + int r = _getattr(quota_root, 0, perms, true); + if (r != 0) { + // Ignore return value: error getting latest inode metadata is not a good + // reason to break "df". + lderr(cct) << "Error in getattr on quota root 0x" + << std::hex << quota_root->ino << std::dec + << " statfs result may be outdated" << dendl; + } + } + + // Special case: if there is a size quota set on the Inode acting + // as the root for this client mount, then report the quota status + // as the filesystem statistics. + const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT; + const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT; + // It is possible for a quota to be exceeded: arithmetic here must + // handle case where used > total. + const fsblkcnt_t free = total > used ? total - used : 0; + + stbuf->f_blocks = total; + stbuf->f_bfree = free; + stbuf->f_bavail = free; + } else { + // General case: report the cluster statistics returned from RADOS. Because + // multiple pools may be used without one filesystem namespace via + // layouts, this is the most correct thing we can do. + stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10); + stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10); + stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10); + } + + return rval; +} + +int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep, + struct flock *fl, uint64_t owner, bool removing) +{ + ldout(cct, 10) << "_do_filelock ino " << in->ino + << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock") + << " type " << fl->l_type << " owner " << owner + << " " << fl->l_start << "~" << fl->l_len << dendl; + + int lock_cmd; + if (F_RDLCK == fl->l_type) + lock_cmd = CEPH_LOCK_SHARED; + else if (F_WRLCK == fl->l_type) + lock_cmd = CEPH_LOCK_EXCL; + else if (F_UNLCK == fl->l_type) + lock_cmd = CEPH_LOCK_UNLOCK; + else + return -EIO; + + if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK) + sleep = 0; + + /* + * Set the most significant bit, so that MDS knows the 'owner' + * is sufficient to identify the owner of lock. 
(old code uses + * both 'owner' and 'pid') + */ + owner |= (1ULL << 63); + + MetaRequest *req = new MetaRequest(op); + filepath path; + in->make_nosnap_relative_path(path); + req->set_filepath(path); + req->set_inode(in); + + req->head.args.filelock_change.rule = lock_type; + req->head.args.filelock_change.type = lock_cmd; + req->head.args.filelock_change.owner = owner; + req->head.args.filelock_change.pid = fl->l_pid; + req->head.args.filelock_change.start = fl->l_start; + req->head.args.filelock_change.length = fl->l_len; + req->head.args.filelock_change.wait = sleep; + + int ret; + bufferlist bl; + + if (sleep && switch_interrupt_cb) { + // enable interrupt + switch_interrupt_cb(callback_handle, req->get()); + ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl); + // disable interrupt + switch_interrupt_cb(callback_handle, NULL); + if (ret == 0 && req->aborted()) { + // effect of this lock request has been revoked by the 'lock intr' request + ret = req->get_abort_code(); + } + put_request(req); + } else { + ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl); + } + + if (ret == 0) { + if (op == CEPH_MDS_OP_GETFILELOCK) { + ceph_filelock filelock; + bufferlist::iterator p = bl.begin(); + ::decode(filelock, p); + + if (CEPH_LOCK_SHARED == filelock.type) + fl->l_type = F_RDLCK; + else if (CEPH_LOCK_EXCL == filelock.type) + fl->l_type = F_WRLCK; + else + fl->l_type = F_UNLCK; + + fl->l_whence = SEEK_SET; + fl->l_start = filelock.start; + fl->l_len = filelock.length; + fl->l_pid = filelock.pid; + } else if (op == CEPH_MDS_OP_SETFILELOCK) { + ceph_lock_state_t *lock_state; + if (lock_type == CEPH_LOCK_FCNTL) { + if (!in->fcntl_locks) + in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL); + lock_state = in->fcntl_locks; + } else if (lock_type == CEPH_LOCK_FLOCK) { + if (!in->flock_locks) + in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK); + lock_state = in->flock_locks; + } else { + ceph_abort(); + return -EINVAL; + } + _update_lock_state(fl, owner, lock_state); + + if (!removing) { + if (lock_type == CEPH_LOCK_FCNTL) { + if (!fh->fcntl_locks) + fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL); + lock_state = fh->fcntl_locks; + } else { + if (!fh->flock_locks) + fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK); + lock_state = fh->flock_locks; + } + _update_lock_state(fl, owner, lock_state); + } + } else + ceph_abort(); + } + return ret; +} + +int Client::_interrupt_filelock(MetaRequest *req) +{ + // Set abort code, but do not kick. The abort code prevents the request + // from being re-sent. 
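+ // If the request has already reached an MDS, additionally ask that
+ // MDS to abandon the blocked lock attempt by sending the matching
+ // *_INTR unlock request built below.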
+ req->abort(-EINTR);
+ if (req->mds < 0)
+ return 0; // haven't sent the request
+
+ Inode *in = req->inode();
+
+ int lock_type;
+ if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
+ lock_type = CEPH_LOCK_FLOCK_INTR;
+ else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
+ lock_type = CEPH_LOCK_FCNTL_INTR;
+ else {
+ ceph_abort();
+ return -EINVAL;
+ }
+
+ MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
+ filepath path;
+ in->make_nosnap_relative_path(path);
+ intr_req->set_filepath(path);
+ intr_req->set_inode(in);
+ intr_req->head.args.filelock_change = req->head.args.filelock_change;
+ intr_req->head.args.filelock_change.rule = lock_type;
+ intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
+
+ UserPerm perms(req->get_uid(), req->get_gid());
+ return make_request(intr_req, perms, NULL, NULL, -1);
+}
+
+void Client::_encode_filelocks(Inode *in, bufferlist& bl)
+{
+ if (!in->fcntl_locks && !in->flock_locks)
+ return;
+
+ unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
+ ::encode(nr_fcntl_locks, bl);
+ if (nr_fcntl_locks) {
+ ceph_lock_state_t* lock_state = in->fcntl_locks;
+ for (multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
+ p != lock_state->held_locks.end();
+ ++p)
+ ::encode(p->second, bl);
+ }
+
+ unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
+ ::encode(nr_flock_locks, bl);
+ if (nr_flock_locks) {
+ ceph_lock_state_t* lock_state = in->flock_locks;
+ for (multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
+ p != lock_state->held_locks.end();
+ ++p)
+ ::encode(p->second, bl);
+ }
+
+ ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
+ << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
+}
+
+void Client::_release_filelocks(Fh *fh)
+{
+ if (!fh->fcntl_locks && !fh->flock_locks)
+ return;
+
+ Inode *in = fh->inode.get();
+ ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
+
+ list<pair<int, ceph_filelock> > to_release;
+
+ if (fh->fcntl_locks) {
+ ceph_lock_state_t* lock_state = fh->fcntl_locks;
+ for (multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
+ p != lock_state->held_locks.end();
+ ++p)
+ to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
+ delete fh->fcntl_locks;
+ }
+ if (fh->flock_locks) {
+ ceph_lock_state_t* lock_state = fh->flock_locks;
+ for (multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
+ p != lock_state->held_locks.end();
+ ++p)
+ to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
+ delete fh->flock_locks;
+ }
+
+ if (to_release.empty())
+ return;
+
+ struct flock fl;
+ memset(&fl, 0, sizeof(fl));
+ fl.l_whence = SEEK_SET;
+ fl.l_type = F_UNLCK;
+
+ for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
+ p != to_release.end();
+ ++p) {
+ fl.l_start = p->second.start;
+ fl.l_len = p->second.length;
+ fl.l_pid = p->second.pid;
+ _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
+ p->second.owner, true);
+ }
+}
+
+void Client::_update_lock_state(struct flock *fl, uint64_t owner,
+ ceph_lock_state_t *lock_state)
+{
+ int lock_cmd;
+ if (F_RDLCK == fl->l_type)
+ lock_cmd = CEPH_LOCK_SHARED;
+ else if (F_WRLCK == fl->l_type)
+ lock_cmd = CEPH_LOCK_EXCL;
+ else
+ lock_cmd = CEPH_LOCK_UNLOCK;
+
+ ceph_filelock filelock;
+ filelock.start = fl->l_start;
+ filelock.length = fl->l_len;
+ filelock.client = 0;
+ // see comment in _do_filelock()
+ filelock.owner = owner | (1ULL << 63);
+ filelock.pid = fl->l_pid;
+ filelock.type = lock_cmd;
+
+ if (filelock.type ==
CEPH_LOCK_UNLOCK) { + list activated_locks; + lock_state->remove_lock(filelock, activated_locks); + } else { + bool r = lock_state->add_lock(filelock, false, false, NULL); + assert(r); + } +} + +int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner) +{ + Inode *in = fh->inode.get(); + ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl; + int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner); + return ret; +} + +int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep) +{ + Inode *in = fh->inode.get(); + ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl; + int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner); + ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl; + return ret; +} + +int Client::_flock(Fh *fh, int cmd, uint64_t owner) +{ + Inode *in = fh->inode.get(); + ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl; + + int sleep = !(cmd & LOCK_NB); + cmd &= ~LOCK_NB; + + int type; + switch (cmd) { + case LOCK_SH: + type = F_RDLCK; + break; + case LOCK_EX: + type = F_WRLCK; + break; + case LOCK_UN: + type = F_UNLCK; + break; + default: + return -EINVAL; + } + + struct flock fl; + memset(&fl, 0, sizeof(fl)); + fl.l_type = type; + fl.l_whence = SEEK_SET; + + int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner); + ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl; + return ret; +} + +int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms) +{ + /* Since the only thing this does is wrap a call to statfs, and + statfs takes a lock, it doesn't seem we have a need to split it + out. */ + return statfs(0, stbuf, perms); +} + +void Client::ll_register_callbacks(struct client_callback_args *args) +{ + if (!args) + return; + Mutex::Locker l(client_lock); + ldout(cct, 10) << "ll_register_callbacks cb " << args->handle + << " invalidate_ino_cb " << args->ino_cb + << " invalidate_dentry_cb " << args->dentry_cb + << " getgroups_cb" << args->getgroups_cb + << " switch_interrupt_cb " << args->switch_intr_cb + << " remount_cb " << args->remount_cb + << dendl; + callback_handle = args->handle; + if (args->ino_cb) { + ino_invalidate_cb = args->ino_cb; + async_ino_invalidator.start(); + } + if (args->dentry_cb) { + dentry_invalidate_cb = args->dentry_cb; + async_dentry_invalidator.start(); + } + if (args->switch_intr_cb) { + switch_interrupt_cb = args->switch_intr_cb; + interrupt_finisher.start(); + } + if (args->remount_cb) { + remount_cb = args->remount_cb; + remount_finisher.start(); + } + getgroups_cb = args->getgroups_cb; + umask_cb = args->umask_cb; +} + +int Client::test_dentry_handling(bool can_invalidate) +{ + int r = 0; + + can_invalidate_dentries = can_invalidate; + + if (can_invalidate_dentries) { + assert(dentry_invalidate_cb); + ldout(cct, 1) << "using dentry_invalidate_cb" << dendl; + } else if (remount_cb) { + ldout(cct, 1) << "using remount_cb" << dendl; + int s = remount_cb(callback_handle); + if (s) { + lderr(cct) << "Failed to invoke remount, needed to ensure kernel dcache consistency" + << dendl; + } + if (cct->_conf->client_die_on_failed_remount) { + require_remount = true; + r = s; + } + } else { + lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" 
<< dendl; + if (cct->_conf->client_die_on_failed_remount) + ceph_abort(); + } + return r; +} + +int Client::_sync_fs() +{ + ldout(cct, 10) << "_sync_fs" << dendl; + + // flush file data + Mutex lock("Client::_fsync::lock"); + Cond cond; + bool flush_done = false; + if (cct->_conf->client_oc) + objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done)); + else + flush_done = true; + + // flush caps + flush_caps_sync(); + ceph_tid_t flush_tid = last_flush_tid; + + // wait for unsafe mds requests + wait_unsafe_requests(); + + wait_sync_caps(flush_tid); + + if (!flush_done) { + client_lock.Unlock(); + lock.Lock(); + ldout(cct, 15) << "waiting on data to flush" << dendl; + while (!flush_done) + cond.Wait(lock); + lock.Unlock(); + client_lock.Lock(); + } + + return 0; +} + +int Client::sync_fs() +{ + Mutex::Locker l(client_lock); + + if (unmounting) + return -ENOTCONN; + + return _sync_fs(); +} + +int64_t Client::drop_caches() +{ + Mutex::Locker l(client_lock); + return objectcacher->release_all(); +} + + +int Client::lazyio_propogate(int fd, loff_t offset, size_t count) +{ + Mutex::Locker l(client_lock); + ldout(cct, 3) << "op: client->lazyio_propogate(" << fd + << ", " << offset << ", " << count << ")" << dendl; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + + // for now + _fsync(f, true); + + return 0; +} + +int Client::lazyio_synchronize(int fd, loff_t offset, size_t count) +{ + Mutex::Locker l(client_lock); + ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd + << ", " << offset << ", " << count << ")" << dendl; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + Inode *in = f->inode.get(); + + _fsync(f, true); + if (_release(in)) + check_caps(in, 0); + return 0; +} + + +// ============================= +// snaps + +int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm) +{ + Mutex::Locker l(client_lock); + + if (unmounting) + return -ENOTCONN; + + filepath path(relpath); + InodeRef in; + int r = path_walk(path, &in, perm); + if (r < 0) + return r; + if (cct->_conf->client_permissions) { + r = may_create(in.get(), perm); + if (r < 0) + return r; + } + Inode *snapdir = open_snapdir(in.get()); + return _mkdir(snapdir, name, 0, perm); +} + +int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms) +{ + Mutex::Locker l(client_lock); + + if (unmounting) + return -ENOTCONN; + + filepath path(relpath); + InodeRef in; + int r = path_walk(path, &in, perms); + if (r < 0) + return r; + if (cct->_conf->client_permissions) { + r = may_delete(in.get(), NULL, perms); + if (r < 0) + return r; + } + Inode *snapdir = open_snapdir(in.get()); + return _rmdir(snapdir, name, perms); +} + +// ============================= +// expose caps + +int Client::get_caps_issued(int fd) { + + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + + return f->inode->caps_issued(); +} + +int Client::get_caps_issued(const char *path, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + filepath p(path); + InodeRef in; + int r = path_walk(p, &in, perms, true); + if (r < 0) + return r; + return in->caps_issued(); +} + +// ========================================= +// low level + +Inode *Client::open_snapdir(Inode *diri) +{ + Inode *in; + vinodeno_t vino(diri->ino, CEPH_SNAPDIR); + if (!inode_map.count(vino)) { + in = new Inode(this, vino, &diri->layout); + + in->ino = diri->ino; + in->snapid = CEPH_SNAPDIR; + in->mode 
= diri->mode; + in->uid = diri->uid; + in->gid = diri->gid; + in->mtime = diri->mtime; + in->ctime = diri->ctime; + in->btime = diri->btime; + in->size = diri->size; + in->change_attr = diri->change_attr; + + in->dirfragtree.clear(); + in->snapdir_parent = diri; + diri->flags |= I_SNAPDIR_OPEN; + inode_map[vino] = in; + if (use_faked_inos()) + _assign_faked_ino(in); + ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl; + } else { + in = inode_map[vino]; + ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl; + } + return in; +} + +int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr, + Inode **out, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + vinodeno_t vparent = _get_vino(parent); + ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl; + tout(cct) << "ll_lookup" << std::endl; + tout(cct) << name << std::endl; + + if (unmounting) + return -ENOTCONN; + + int r = 0; + if (!cct->_conf->fuse_default_permissions) { + r = may_lookup(parent, perms); + if (r < 0) + return r; + } + + string dname(name); + InodeRef in; + + r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms); + if (r < 0) { + attr->st_ino = 0; + goto out; + } + + assert(in); + fill_stat(in, attr); + _ll_get(in.get()); + + out: + ldout(cct, 3) << "ll_lookup " << vparent << " " << name + << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; + tout(cct) << attr->st_ino << std::endl; + *out = in.get(); + return r; +} + +int Client::ll_lookupx(Inode *parent, const char *name, Inode **out, + struct ceph_statx *stx, unsigned want, unsigned flags, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + vinodeno_t vparent = _get_vino(parent); + ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl; + tout(cct) << "ll_lookupx" << std::endl; + tout(cct) << name << std::endl; + + if (unmounting) + return -ENOTCONN; + + int r = 0; + if (!cct->_conf->fuse_default_permissions) { + r = may_lookup(parent, perms); + if (r < 0) + return r; + } + + string dname(name); + InodeRef in; + + unsigned mask = statx_to_mask(flags, want); + r = _lookup(parent, dname, mask, &in, perms); + if (r < 0) { + stx->stx_ino = 0; + stx->stx_mask = 0; + } else { + assert(in); + fill_statx(in, mask, stx); + _ll_get(in.get()); + } + + ldout(cct, 3) << "ll_lookupx " << vparent << " " << name + << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl; + tout(cct) << stx->stx_ino << std::endl; + *out = in.get(); + return r; +} + +int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx, + unsigned int want, unsigned int flags, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + filepath fp(name, 0); + InodeRef in; + int rc; + unsigned mask = statx_to_mask(flags, want); + + ldout(cct, 3) << "ll_walk" << name << dendl; + tout(cct) << "ll_walk" << std::endl; + tout(cct) << name << std::endl; + + rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask); + if (rc < 0) { + /* zero out mask, just in case... 
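+ (callers may inspect stx even when the walk fails, so stx_mask must
+ say that no fields are valid)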
*/ + stx->stx_mask = 0; + stx->stx_ino = 0; + *out = NULL; + return rc; + } else { + assert(in); + fill_statx(in, mask, stx); + _ll_get(in.get()); + *out = in.get(); + return 0; + } +} + +void Client::_ll_get(Inode *in) +{ + if (in->ll_ref == 0) { + in->get(); + if (in->is_dir() && !in->dn_set.empty()) { + assert(in->dn_set.size() == 1); // dirs can't be hard-linked + in->get_first_parent()->get(); // pin dentry + } + } + in->ll_get(); + ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl; +} + +int Client::_ll_put(Inode *in, int num) +{ + in->ll_put(num); + ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl; + if (in->ll_ref == 0) { + if (in->is_dir() && !in->dn_set.empty()) { + assert(in->dn_set.size() == 1); // dirs can't be hard-linked + in->get_first_parent()->put(); // unpin dentry + } + put_inode(in); + return 0; + } else { + return in->ll_ref; + } +} + +void Client::_ll_drop_pins() +{ + ldout(cct, 10) << "_ll_drop_pins" << dendl; + ceph::unordered_map::iterator next; + for (ceph::unordered_map::iterator it = inode_map.begin(); + it != inode_map.end(); + it = next) { + Inode *in = it->second; + next = it; + ++next; + if (in->ll_ref) + _ll_put(in, in->ll_ref); + } +} + +bool Client::ll_forget(Inode *in, int count) +{ + Mutex::Locker lock(client_lock); + inodeno_t ino = _get_inodeno(in); + + ldout(cct, 3) << "ll_forget " << ino << " " << count << dendl; + tout(cct) << "ll_forget" << std::endl; + tout(cct) << ino.val << std::endl; + tout(cct) << count << std::endl; + + // Ignore forget if we're no longer mounted + if (unmounting) + return true; + + if (ino == 1) return true; // ignore forget on root. + + bool last = false; + if (in->ll_ref < count) { + ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count + << ", which only has ll_ref=" << in->ll_ref << dendl; + _ll_put(in, in->ll_ref); + last = true; + } else { + if (_ll_put(in, count) == 0) + last = true; + } + + return last; +} + +bool Client::ll_put(Inode *in) +{ + /* ll_forget already takes the lock */ + return ll_forget(in, 1); +} + +snapid_t Client::ll_get_snapid(Inode *in) +{ + Mutex::Locker lock(client_lock); + return in->snapid; +} + +Inode *Client::ll_get_inode(ino_t ino) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return NULL; + + vinodeno_t vino = _map_faked_ino(ino); + unordered_map::iterator p = inode_map.find(vino); + if (p == inode_map.end()) + return NULL; + Inode *in = p->second; + _ll_get(in); + return in; +} + +Inode *Client::ll_get_inode(vinodeno_t vino) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return NULL; + + unordered_map::iterator p = inode_map.find(vino); + if (p == inode_map.end()) + return NULL; + Inode *in = p->second; + _ll_get(in); + return in; +} + +int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms) +{ + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << "ll_getattr " << vino << dendl; + tout(cct) << "ll_getattr" << std::endl; + tout(cct) << vino.ino.val << std::endl; + + if (vino.snapid < CEPH_NOSNAP) + return 0; + else + return _getattr(in, caps, perms); +} + +int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms); + + if (res == 0) + fill_stat(in, attr); + ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl; + return res; +} + +int Client::ll_getattrx(Inode *in, struct 
ceph_statx *stx, unsigned int want, + unsigned int flags, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + int res = 0; + unsigned mask = statx_to_mask(flags, want); + + if (mask && !in->caps_issued_mask(mask)) + res = _ll_getattr(in, mask, perms); + + if (res == 0) + fill_statx(in, mask, stx); + ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl; + return res; +} + +int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms, InodeRef *inp) +{ + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << "ll_setattrx " << vino << " mask " << hex << mask << dec + << dendl; + tout(cct) << "ll_setattrx" << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << stx->stx_mode << std::endl; + tout(cct) << stx->stx_uid << std::endl; + tout(cct) << stx->stx_gid << std::endl; + tout(cct) << stx->stx_size << std::endl; + tout(cct) << stx->stx_mtime << std::endl; + tout(cct) << stx->stx_atime << std::endl; + tout(cct) << stx->stx_btime << std::endl; + tout(cct) << mask << std::endl; + + if (!cct->_conf->fuse_default_permissions) { + int res = may_setattr(in, stx, mask, perms); + if (res < 0) + return res; + } + + mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW); + + return __setattrx(in, stx, mask, perms, inp); +} + +int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + InodeRef target(in); + int res = _ll_setattrx(in, stx, mask, perms, &target); + if (res == 0) { + assert(in == target.get()); + fill_statx(in, in->caps_issued(), stx); + } + + ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl; + return res; +} + +int Client::ll_setattr(Inode *in, struct stat *attr, int mask, + const UserPerm& perms) +{ + struct ceph_statx stx; + stat_to_statx(attr, &stx); + + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + InodeRef target(in); + int res = _ll_setattrx(in, &stx, mask, perms, &target); + if (res == 0) { + assert(in == target.get()); + fill_stat(in, attr); + } + + ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl; + return res; +} + + +// ---------- +// xattrs + +int Client::getxattr(const char *path, const char *name, void *value, size_t size, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + InodeRef in; + int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR); + if (r < 0) + return r; + return _getxattr(in, name, value, size, perms); +} + +int Client::lgetxattr(const char *path, const char *name, void *value, size_t size, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + InodeRef in; + int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR); + if (r < 0) + return r; + return _getxattr(in, name, value, size, perms); +} + +int Client::fgetxattr(int fd, const char *name, void *value, size_t size, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + return _getxattr(f->inode, name, value, size, perms); +} + +int Client::listxattr(const char *path, char *list, size_t size, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + InodeRef in; + int r = Client::path_walk(path, &in, 
perms, true, CEPH_STAT_CAP_XATTR); + if (r < 0) + return r; + return Client::_listxattr(in.get(), list, size, perms); +} + +int Client::llistxattr(const char *path, char *list, size_t size, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + InodeRef in; + int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR); + if (r < 0) + return r; + return Client::_listxattr(in.get(), list, size, perms); +} + +int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + return Client::_listxattr(f->inode.get(), list, size, perms); +} + +int Client::removexattr(const char *path, const char *name, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + InodeRef in; + int r = Client::path_walk(path, &in, perms, true); + if (r < 0) + return r; + return _removexattr(in, name, perms); +} + +int Client::lremovexattr(const char *path, const char *name, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + InodeRef in; + int r = Client::path_walk(path, &in, perms, false); + if (r < 0) + return r; + return _removexattr(in, name, perms); +} + +int Client::fremovexattr(int fd, const char *name, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + return _removexattr(f->inode, name, perms); +} + +int Client::setxattr(const char *path, const char *name, const void *value, + size_t size, int flags, const UserPerm& perms) +{ + _setxattr_maybe_wait_for_osdmap(name, value, size); + + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + InodeRef in; + int r = Client::path_walk(path, &in, perms, true); + if (r < 0) + return r; + return _setxattr(in, name, value, size, flags, perms); +} + +int Client::lsetxattr(const char *path, const char *name, const void *value, + size_t size, int flags, const UserPerm& perms) +{ + _setxattr_maybe_wait_for_osdmap(name, value, size); + + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + InodeRef in; + int r = Client::path_walk(path, &in, perms, false); + if (r < 0) + return r; + return _setxattr(in, name, value, size, flags, perms); +} + +int Client::fsetxattr(int fd, const char *name, const void *value, size_t size, + int flags, const UserPerm& perms) +{ + _setxattr_maybe_wait_for_osdmap(name, value, size); + + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + Fh *f = get_filehandle(fd); + if (!f) + return -EBADF; + return _setxattr(f->inode, name, value, size, flags, perms); +} + +int Client::_getxattr(Inode *in, const char *name, void *value, size_t size, + const UserPerm& perms) +{ + int r; + + const VXattr *vxattr = _match_vxattr(in, name); + if (vxattr) { + r = -ENODATA; + + // Do a force getattr to get the latest quota before returning + // a value to userspace. + r = _getattr(in, 0, perms, true); + if (r != 0) { + // Error from getattr! 
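+ // Propagate the error rather than serving a possibly stale
+ // virtual xattr value.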
+ return r; + } + + // call pointer-to-member function + char buf[256]; + if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) { + r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf)); + } else { + r = -ENODATA; + } + + if (size != 0) { + if (r > (int)size) { + r = -ERANGE; + } else if (r > 0) { + memcpy(value, buf, r); + } + } + goto out; + } + + if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) { + r = -EOPNOTSUPP; + goto out; + } + + r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0); + if (r == 0) { + string n(name); + r = -ENODATA; + if (in->xattrs.count(n)) { + r = in->xattrs[n].length(); + if (r > 0 && size != 0) { + if (size >= (unsigned)r) + memcpy(value, in->xattrs[n].c_str(), r); + else + r = -ERANGE; + } + } + } + out: + ldout(cct, 3) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl; + return r; +} + +int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size, + const UserPerm& perms) +{ + if (cct->_conf->client_permissions) { + int r = xattr_permission(in.get(), name, MAY_READ, perms); + if (r < 0) + return r; + } + return _getxattr(in.get(), name, value, size, perms); +} + +int Client::ll_getxattr(Inode *in, const char *name, void *value, + size_t size, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl; + tout(cct) << "ll_getxattr" << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << name << std::endl; + + if (!cct->_conf->fuse_default_permissions) { + int r = xattr_permission(in, name, MAY_READ, perms); + if (r < 0) + return r; + } + + return _getxattr(in, name, value, size, perms); +} + +int Client::_listxattr(Inode *in, char *name, size_t size, + const UserPerm& perms) +{ + int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0); + if (r == 0) { + for (map::iterator p = in->xattrs.begin(); + p != in->xattrs.end(); + ++p) + r += p->first.length() + 1; + + const VXattr *vxattrs = _get_vxattrs(in); + r += _vxattrs_name_size(vxattrs); + + if (size != 0) { + if (size >= (unsigned)r) { + for (map::iterator p = in->xattrs.begin(); + p != in->xattrs.end(); + ++p) { + memcpy(name, p->first.c_str(), p->first.length()); + name += p->first.length(); + *name = '\0'; + name++; + } + if (vxattrs) { + for (int i = 0; !vxattrs[i].name.empty(); i++) { + const VXattr& vxattr = vxattrs[i]; + if (vxattr.hidden) + continue; + // call pointer-to-member function + if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in)) + continue; + memcpy(name, vxattr.name.c_str(), vxattr.name.length()); + name += vxattr.name.length(); + *name = '\0'; + name++; + } + } + } else + r = -ERANGE; + } + } + ldout(cct, 3) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl; + return r; +} + +int Client::ll_listxattr(Inode *in, char *names, size_t size, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl; + tout(cct) << "ll_listxattr" << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << size << std::endl; + + return _listxattr(in, names, size, perms); +} + +int Client::_do_setxattr(Inode *in, const char *name, const void *value, + size_t size, int flags, const UserPerm& perms) +{ + + int xattr_flags = 0; + if (!value) + 
+    xattr_flags |= CEPH_XATTR_REMOVE;
+  if (flags & XATTR_CREATE)
+    xattr_flags |= CEPH_XATTR_CREATE;
+  if (flags & XATTR_REPLACE)
+    xattr_flags |= CEPH_XATTR_REPLACE;
+
+  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
+  filepath path;
+  in->make_nosnap_relative_path(path);
+  req->set_filepath(path);
+  req->set_string2(name);
+  req->set_inode(in);
+  req->head.args.setxattr.flags = xattr_flags;
+
+  bufferlist bl;
+  bl.append((const char*)value, size);
+  req->set_data(bl);
+
+  int res = make_request(req, perms);
+
+  trim_cache();
+  ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
+    res << dendl;
+  return res;
+}
+
+int Client::_setxattr(Inode *in, const char *name, const void *value,
+                      size_t size, int flags, const UserPerm& perms)
+{
+  if (in->snapid != CEPH_NOSNAP) {
+    return -EROFS;
+  }
+
+  bool posix_acl_xattr = false;
+  if (acl_type == POSIX_ACL)
+    posix_acl_xattr = !strncmp(name, "system.", 7);
+
+  if (strncmp(name, "user.", 5) &&
+      strncmp(name, "security.", 9) &&
+      strncmp(name, "trusted.", 8) &&
+      strncmp(name, "ceph.", 5) &&
+      !posix_acl_xattr)
+    return -EOPNOTSUPP;
+
+  if (posix_acl_xattr) {
+    if (!strcmp(name, ACL_EA_ACCESS)) {
+      mode_t new_mode = in->mode;
+      if (value) {
+        int ret = posix_acl_equiv_mode(value, size, &new_mode);
+        if (ret < 0)
+          return ret;
+        if (ret == 0) {
+          value = NULL;
+          size = 0;
+        }
+        if (new_mode != in->mode) {
+          struct ceph_statx stx;
+          stx.stx_mode = new_mode;
+          ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
+          if (ret < 0)
+            return ret;
+        }
+      }
+    } else if (!strcmp(name, ACL_EA_DEFAULT)) {
+      if (value) {
+        if (!S_ISDIR(in->mode))
+          return -EACCES;
+        int ret = posix_acl_check(value, size);
+        if (ret < 0)
+          return -EINVAL;
+        if (ret == 0) {
+          value = NULL;
+          size = 0;
+        }
+      }
+    } else {
+      return -EOPNOTSUPP;
+    }
+  } else {
+    const VXattr *vxattr = _match_vxattr(in, name);
+    if (vxattr && vxattr->readonly)
+      return -EOPNOTSUPP;
+  }
+
+  return _do_setxattr(in, name, value, size, flags, perms);
+}
+
+int Client::_setxattr(InodeRef &in, const char *name, const void *value,
+                      size_t size, int flags, const UserPerm& perms)
+{
+  if (cct->_conf->client_permissions) {
+    int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
+    if (r < 0)
+      return r;
+  }
+  return _setxattr(in.get(), name, value, size, flags, perms);
+}
+
+int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
+{
+  string tmp;
+  if (name == "layout") {
+    string::iterator begin = value.begin();
+    string::iterator end = value.end();
+    keys_and_values<string::iterator> p;    // create instance of parser
+    std::map<string, string> m;             // map to receive results
+    if (!qi::parse(begin, end, p, m)) {     // returns true if successful
+      return -EINVAL;
+    }
+    if (begin != end)
+      return -EINVAL;
+    for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
+      if (q->first == "pool") {
+        tmp = q->second;
+        break;
+      }
+    }
+  } else if (name == "layout.pool") {
+    tmp = value;
+  }
+
+  if (tmp.length()) {
+    int64_t pool;
+    try {
+      pool = boost::lexical_cast<int64_t>(tmp);
+      if (!osdmap->have_pg_pool(pool))
+        return -ENOENT;
+    } catch (boost::bad_lexical_cast const&) {
+      pool = osdmap->lookup_pg_pool_name(tmp);
+      if (pool < 0) {
+        return -ENOENT;
+      }
+    }
+  }
+
+  return 0;
+}
+
+void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
+{
+  // Setting a layout pool requires the MetaRequest to carry an osdmap epoch.
+  // There is a race where a newly created data pool is not yet known to
+  // either the client or the MDS.
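+  // Illustrative example (names assumed): a caller that runs
+  //   client->setxattr("/f", "ceph.file.layout.pool", "newpool", 7, 0, perms);
+  // immediately after creating "newpool" would otherwise race against the
+  // client's cached osdmap and spuriously fail the pool-existence check.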
+  // Fetching the latest osdmap here lets the MDS quickly decide whether it
+  // needs a newer map itself.
+  if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
+      strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
+    string rest(strstr(name, "layout"));
+    string v((const char*)value, size);
+    int r = objecter->with_osdmap([&](const OSDMap& o) {
+      return _setxattr_check_data_pool(rest, v, &o);
+    });
+
+    if (r == -ENOENT) {
+      C_SaferCond ctx;
+      objecter->wait_for_latest_osdmap(&ctx);
+      ctx.wait();
+    }
+  }
+}
+
+int Client::ll_setxattr(Inode *in, const char *name, const void *value,
+                        size_t size, int flags, const UserPerm& perms)
+{
+  _setxattr_maybe_wait_for_osdmap(name, value, size);
+
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  vinodeno_t vino = _get_vino(in);
+
+  ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
+  tout(cct) << "ll_setxattr" << std::endl;
+  tout(cct) << vino.ino.val << std::endl;
+  tout(cct) << name << std::endl;
+
+  if (!cct->_conf->fuse_default_permissions) {
+    int r = xattr_permission(in, name, MAY_WRITE, perms);
+    if (r < 0)
+      return r;
+  }
+  return _setxattr(in, name, value, size, flags, perms);
+}
+
+int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
+{
+  if (in->snapid != CEPH_NOSNAP) {
+    return -EROFS;
+  }
+
+  // same xattrs supported by kernel client
+  if (strncmp(name, "user.", 5) &&
+      strncmp(name, "system.", 7) &&
+      strncmp(name, "security.", 9) &&
+      strncmp(name, "trusted.", 8) &&
+      strncmp(name, "ceph.", 5))
+    return -EOPNOTSUPP;
+
+  const VXattr *vxattr = _match_vxattr(in, name);
+  if (vxattr && vxattr->readonly)
+    return -EOPNOTSUPP;
+
+  MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
+  filepath path;
+  in->make_nosnap_relative_path(path);
+  req->set_filepath(path);
+  req->set_filepath2(name);
+  req->set_inode(in);
+
+  int res = make_request(req, perms);
+
+  trim_cache();
+  ldout(cct, 3) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
+  return res;
+}
+
+int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
+{
+  if (cct->_conf->client_permissions) {
+    int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
+    if (r < 0)
+      return r;
+  }
+  return _removexattr(in.get(), name, perms);
+}
+
+int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  vinodeno_t vino = _get_vino(in);
+
+  ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
+  tout(cct) << "ll_removexattr" << std::endl;
+  tout(cct) << vino.ino.val << std::endl;
+  tout(cct) << name << std::endl;
+
+  if (!cct->_conf->fuse_default_permissions) {
+    int r = xattr_permission(in, name, MAY_WRITE, perms);
+    if (r < 0)
+      return r;
+  }
+
+  return _removexattr(in, name, perms);
+}
+
+bool Client::_vxattrcb_quota_exists(Inode *in)
+{
+  return in->quota.is_enable();
+}
+size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size,
+                  "max_bytes=%lld max_files=%lld",
+                  (long long int)in->quota.max_bytes,
+                  (long long int)in->quota.max_files);
+}
+size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
+}
+size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
+}
+
+bool Client::_vxattrcb_layout_exists(Inode *in)
+{
+  return in->layout != file_layout_t();
+}
+size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
+{
+  int r = snprintf(val, size,
+      "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
+      (unsigned long long)in->layout.stripe_unit,
+      (unsigned long long)in->layout.stripe_count,
+      (unsigned long long)in->layout.object_size);
+  objecter->with_osdmap([&](const OSDMap& o) {
+      if (o.have_pg_pool(in->layout.pool_id))
+        r += snprintf(val + r, size - r, "%s",
+                      o.get_pool_name(in->layout.pool_id).c_str());
+      else
+        r += snprintf(val + r, size - r, "%" PRIu64,
+                      (uint64_t)in->layout.pool_id);
+    });
+  if (in->layout.pool_ns.length())
+    r += snprintf(val + r, size - r, " pool_namespace=%s",
+                  in->layout.pool_ns.c_str());
+  return r;
+}
+size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
+}
+size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
+}
+size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
+}
+size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
+{
+  size_t r;
+  objecter->with_osdmap([&](const OSDMap& o) {
+      if (o.have_pg_pool(in->layout.pool_id))
+        r = snprintf(val, size, "%s", o.get_pool_name(
+              in->layout.pool_id).c_str());
+      else
+        r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
+    });
+  return r;
+}
+size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
+}
+size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
+}
+size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
+}
+size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
+}
+size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
+}
+size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
+}
+size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
+}
+size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
+}
+size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
+{
+  return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
+                  (long)in->rstat.rctime.nsec());
+}
+
+#define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
+#define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
+
+#define XATTR_NAME_CEPH(_type, _name)                           \
+{                                                               \
+  name: CEPH_XATTR_NAME(_type, _name),                          \
+  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,      \
+  readonly: true,                                               \
+  hidden: false,                                                \
+  exists_cb: NULL,                                              \
+}
+#define XATTR_LAYOUT_FIELD(_type, _name, _field)                \
+{                                                               \
+  name: CEPH_XATTR_NAME2(_type, _name, _field),                 \
+  getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field,     \
+  readonly: false,                                              \
+  hidden: true,                                                 \
+  exists_cb: &Client::_vxattrcb_layout_exists,                  \
+}
+#define XATTR_QUOTA_FIELD(_type, _name)                         \
+{                                                               \
+  name: CEPH_XATTR_NAME(_type, _name),                          \
+  getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name,      \
+  readonly: false,                                              \
+  hidden: true,                                                 \
+  exists_cb: &Client::_vxattrcb_quota_exists,                   \
+}
+
+const Client::VXattr Client::_dir_vxattrs[] = {
+  {
+    name: "ceph.dir.layout",
+    getxattr_cb: &Client::_vxattrcb_layout,
+    readonly: false,
+    hidden: true,
+    exists_cb: &Client::_vxattrcb_layout_exists,
+  },
+  XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
+  XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
+  XATTR_LAYOUT_FIELD(dir, layout, object_size),
+  XATTR_LAYOUT_FIELD(dir, layout, pool),
+  XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
+  XATTR_NAME_CEPH(dir, entries),
+  XATTR_NAME_CEPH(dir, files),
+  XATTR_NAME_CEPH(dir, subdirs),
+  XATTR_NAME_CEPH(dir, rentries),
+  XATTR_NAME_CEPH(dir, rfiles),
+  XATTR_NAME_CEPH(dir, rsubdirs),
+  XATTR_NAME_CEPH(dir, rbytes),
+  XATTR_NAME_CEPH(dir, rctime),
+  {
+    name: "ceph.quota",
+    getxattr_cb: &Client::_vxattrcb_quota,
+    readonly: false,
+    hidden: true,
+    exists_cb: &Client::_vxattrcb_quota_exists,
+  },
+  XATTR_QUOTA_FIELD(quota, max_bytes),
+  XATTR_QUOTA_FIELD(quota, max_files),
+  { name: "" }     /* Required table terminator */
+};
+
+const Client::VXattr Client::_file_vxattrs[] = {
+  {
+    name: "ceph.file.layout",
+    getxattr_cb: &Client::_vxattrcb_layout,
+    readonly: false,
+    hidden: true,
+    exists_cb: &Client::_vxattrcb_layout_exists,
+  },
+  XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
+  XATTR_LAYOUT_FIELD(file, layout, stripe_count),
+  XATTR_LAYOUT_FIELD(file, layout, object_size),
+  XATTR_LAYOUT_FIELD(file, layout, pool),
+  XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
+  { name: "" }     /* Required table terminator */
+};
+
+const Client::VXattr *Client::_get_vxattrs(Inode *in)
+{
+  if (in->is_dir())
+    return _dir_vxattrs;
+  else if (in->is_file())
+    return _file_vxattrs;
+  return NULL;
+}
+
+const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
+{
+  if (strncmp(name, "ceph.", 5) == 0) {
+    const VXattr *vxattr = _get_vxattrs(in);
+    if (vxattr) {
+      while (!vxattr->name.empty()) {
+        if (vxattr->name == name)
+          return vxattr;
+        vxattr++;
+      }
+    }
+  }
+  return NULL;
+}
+
+size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
+{
+  size_t len = 0;
+  while (!vxattr->name.empty()) {
+    if (!vxattr->hidden)
+      len += vxattr->name.length() + 1;
+    vxattr++;
+  }
+  return len;
+}
+
+int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  vinodeno_t vino = _get_vino(in);
+
+  ldout(cct, 3) << "ll_readlink " << vino << dendl;
+  tout(cct) << "ll_readlink" << std::endl;
+  tout(cct) << vino.ino.val << std::endl;
+
+  set<Dentry*>::iterator dn = in->dn_set.begin();
+  while (dn != in->dn_set.end()) {
+    touch_dn(*dn);
+    ++dn;
+  }
+
+  int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
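+  // (Illustrative note: _readlink copies at most buflen bytes of the link
+  //  target and returns the copied length or a negative errno; the buffer
+  //  is not NUL-terminated, matching readlink(2) semantics.)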
+ ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl; + return r; +} + +int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev, + const UserPerm& perms, InodeRef *inp) +{ + ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct + << mode << dec << ", " << rdev << ", uid " << perms.uid() + << ", gid " << perms.gid() << ")" << dendl; + + if (strlen(name) > NAME_MAX) + return -ENAMETOOLONG; + + if (dir->snapid != CEPH_NOSNAP) { + return -EROFS; + } + if (is_quota_files_exceeded(dir, perms)) { + return -EDQUOT; + } + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD); + + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + req->set_inode(dir); + req->head.args.mknod.rdev = rdev; + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + + bufferlist xattrs_bl; + int res = _posix_acl_create(dir, &mode, xattrs_bl, perms); + if (res < 0) + goto fail; + req->head.args.mknod.mode = mode; + if (xattrs_bl.length() > 0) + req->set_data(xattrs_bl); + + Dentry *de; + res = get_or_create(dir, name, &de); + if (res < 0) + goto fail; + req->set_dentry(de); + + res = make_request(req, perms, inp); + + trim_cache(); + + ldout(cct, 3) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl; + return res; + + fail: + put_request(req); + return res; +} + +int Client::ll_mknod(Inode *parent, const char *name, mode_t mode, + dev_t rdev, struct stat *attr, Inode **out, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl; + tout(cct) << "ll_mknod" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << mode << std::endl; + tout(cct) << rdev << std::endl; + + if (!cct->_conf->fuse_default_permissions) { + int r = may_create(parent, perms); + if (r < 0) + return r; + } + + InodeRef in; + int r = _mknod(parent, name, mode, rdev, perms, &in); + if (r == 0) { + fill_stat(in, attr); + _ll_get(in.get()); + } + tout(cct) << attr->st_ino << std::endl; + ldout(cct, 3) << "ll_mknod " << vparent << " " << name + << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; + *out = in.get(); + return r; +} + +int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode, + dev_t rdev, Inode **out, + struct ceph_statx *stx, unsigned want, unsigned flags, + const UserPerm& perms) +{ + unsigned caps = statx_to_mask(flags, want); + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl; + tout(cct) << "ll_mknodx" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << mode << std::endl; + tout(cct) << rdev << std::endl; + + if (!cct->_conf->fuse_default_permissions) { + int r = may_create(parent, perms); + if (r < 0) + return r; + } + + InodeRef in; + int r = _mknod(parent, name, mode, rdev, perms, &in); + if (r == 0) { + fill_statx(in, caps, stx); + _ll_get(in.get()); + } + tout(cct) << stx->stx_ino << std::endl; + ldout(cct, 3) << "ll_mknodx " << vparent << " " << name + << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl; + *out = in.get(); + return r; +} + +int Client::_create(Inode *dir, const char *name, int flags, mode_t mode, + InodeRef *inp, Fh 
**fhp, int stripe_unit, int stripe_count, + int object_size, const char *data_pool, bool *created, + const UserPerm& perms) +{ + ldout(cct, 3) << "_create(" << dir->ino << " " << name << ", 0" << oct << + mode << dec << ")" << dendl; + + if (strlen(name) > NAME_MAX) + return -ENAMETOOLONG; + if (dir->snapid != CEPH_NOSNAP) { + return -EROFS; + } + if (is_quota_files_exceeded(dir, perms)) { + return -EDQUOT; + } + + // use normalized flags to generate cmode + int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags)); + if (cmode < 0) + return -EINVAL; + + int64_t pool_id = -1; + if (data_pool && *data_pool) { + pool_id = objecter->with_osdmap( + std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool); + if (pool_id < 0) + return -EINVAL; + if (pool_id > 0xffffffffll) + return -ERANGE; // bummer! + } + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE); + + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + req->set_inode(dir); + req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT); + + req->head.args.open.stripe_unit = stripe_unit; + req->head.args.open.stripe_count = stripe_count; + req->head.args.open.object_size = object_size; + if (cct->_conf->client_debug_getattr_caps) + req->head.args.open.mask = DEBUG_GETATTR_CAPS; + else + req->head.args.open.mask = 0; + req->head.args.open.pool = pool_id; + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + + mode |= S_IFREG; + bufferlist xattrs_bl; + int res = _posix_acl_create(dir, &mode, xattrs_bl, perms); + if (res < 0) + goto fail; + req->head.args.open.mode = mode; + if (xattrs_bl.length() > 0) + req->set_data(xattrs_bl); + + Dentry *de; + res = get_or_create(dir, name, &de); + if (res < 0) + goto fail; + req->set_dentry(de); + + res = make_request(req, perms, inp, created); + if (res < 0) { + goto reply_error; + } + + /* If the caller passed a value in fhp, do the open */ + if(fhp) { + (*inp)->get_open_ref(cmode); + *fhp = _create_fh(inp->get(), flags, cmode, perms); + } + + reply_error: + trim_cache(); + + ldout(cct, 3) << "create(" << path << ", 0" << oct << mode << dec + << " layout " << stripe_unit + << ' ' << stripe_count + << ' ' << object_size + <<") = " << res << dendl; + return res; + + fail: + put_request(req); + return res; +} + + +int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm, + InodeRef *inp) +{ + ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct + << mode << dec << ", uid " << perm.uid() + << ", gid " << perm.gid() << ")" << dendl; + + if (strlen(name) > NAME_MAX) + return -ENAMETOOLONG; + + if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) { + return -EROFS; + } + if (is_quota_files_exceeded(dir, perm)) { + return -EDQUOT; + } + MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ? 
+ CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR); + + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + req->set_inode(dir); + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + + mode |= S_IFDIR; + bufferlist xattrs_bl; + int res = _posix_acl_create(dir, &mode, xattrs_bl, perm); + if (res < 0) + goto fail; + req->head.args.mkdir.mode = mode; + if (xattrs_bl.length() > 0) + req->set_data(xattrs_bl); + + Dentry *de; + res = get_or_create(dir, name, &de); + if (res < 0) + goto fail; + req->set_dentry(de); + + ldout(cct, 10) << "_mkdir: making request" << dendl; + res = make_request(req, perm, inp); + ldout(cct, 10) << "_mkdir result is " << res << dendl; + + trim_cache(); + + ldout(cct, 3) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl; + return res; + + fail: + put_request(req); + return res; +} + +int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode, + struct stat *attr, Inode **out, const UserPerm& perm) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl; + tout(cct) << "ll_mkdir" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << mode << std::endl; + + if (!cct->_conf->fuse_default_permissions) { + int r = may_create(parent, perm); + if (r < 0) + return r; + } + + InodeRef in; + int r = _mkdir(parent, name, mode, perm, &in); + if (r == 0) { + fill_stat(in, attr); + _ll_get(in.get()); + } + tout(cct) << attr->st_ino << std::endl; + ldout(cct, 3) << "ll_mkdir " << vparent << " " << name + << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; + *out = in.get(); + return r; +} + +int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out, + struct ceph_statx *stx, unsigned want, unsigned flags, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl; + tout(cct) << "ll_mkdirx" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << mode << std::endl; + + if (!cct->_conf->fuse_default_permissions) { + int r = may_create(parent, perms); + if (r < 0) + return r; + } + + InodeRef in; + int r = _mkdir(parent, name, mode, perms, &in); + if (r == 0) { + fill_statx(in, statx_to_mask(flags, want), stx); + _ll_get(in.get()); + } else { + stx->stx_ino = 0; + stx->stx_mask = 0; + } + tout(cct) << stx->stx_ino << std::endl; + ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name + << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl; + *out = in.get(); + return r; +} + +int Client::_symlink(Inode *dir, const char *name, const char *target, + const UserPerm& perms, InodeRef *inp) +{ + ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target + << ", uid " << perms.uid() << ", gid " << perms.gid() << ")" + << dendl; + + if (strlen(name) > NAME_MAX) + return -ENAMETOOLONG; + + if (dir->snapid != CEPH_NOSNAP) { + return -EROFS; + } + if (is_quota_files_exceeded(dir, perms)) { + return -EDQUOT; + } + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK); + + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + req->set_inode(dir); + 
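+  // The new dentry name travels in the filepath set above; the symlink
+  // target goes in string2 of the request (set below).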
req->set_string2(target); + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + + Dentry *de; + int res = get_or_create(dir, name, &de); + if (res < 0) + goto fail; + req->set_dentry(de); + + res = make_request(req, perms, inp); + + trim_cache(); + ldout(cct, 3) << "_symlink(\"" << path << "\", \"" << target << "\") = " << + res << dendl; + return res; + + fail: + put_request(req); + return res; +} + +int Client::ll_symlink(Inode *parent, const char *name, const char *value, + struct stat *attr, Inode **out, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value + << dendl; + tout(cct) << "ll_symlink" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << value << std::endl; + + if (!cct->_conf->fuse_default_permissions) { + int r = may_create(parent, perms); + if (r < 0) + return r; + } + + InodeRef in; + int r = _symlink(parent, name, value, perms, &in); + if (r == 0) { + fill_stat(in, attr); + _ll_get(in.get()); + } + tout(cct) << attr->st_ino << std::endl; + ldout(cct, 3) << "ll_symlink " << vparent << " " << name + << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl; + *out = in.get(); + return r; +} + +int Client::ll_symlinkx(Inode *parent, const char *name, const char *value, + Inode **out, struct ceph_statx *stx, unsigned want, + unsigned flags, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value + << dendl; + tout(cct) << "ll_symlinkx" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << value << std::endl; + + if (!cct->_conf->fuse_default_permissions) { + int r = may_create(parent, perms); + if (r < 0) + return r; + } + + InodeRef in; + int r = _symlink(parent, name, value, perms, &in); + if (r == 0) { + fill_statx(in, statx_to_mask(flags, want), stx); + _ll_get(in.get()); + } + tout(cct) << stx->stx_ino << std::endl; + ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name + << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl; + *out = in.get(); + return r; +} + +int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm) +{ + ldout(cct, 3) << "_unlink(" << dir->ino << " " << name + << " uid " << perm.uid() << " gid " << perm.gid() + << ")" << dendl; + + if (dir->snapid != CEPH_NOSNAP) { + return -EROFS; + } + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK); + + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + + InodeRef otherin; + + Dentry *de; + int res = get_or_create(dir, name, &de); + if (res < 0) + goto fail; + req->set_dentry(de); + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + + res = _lookup(dir, name, 0, &otherin, perm); + if (res < 0) + goto fail; + req->set_other_inode(otherin.get()); + req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + req->set_inode(dir); + + res = make_request(req, perm); + + trim_cache(); + ldout(cct, 3) << "unlink(" << path << ") = " << res << dendl; + return res; + + fail: + put_request(req); + return res; +} + +int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm) +{ + 
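+  // As with the other ll_* entry points: when fuse_default_permissions is
+  // off we do our own permission check (may_delete below) before acting.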
Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl; + tout(cct) << "ll_unlink" << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << name << std::endl; + + if (!cct->_conf->fuse_default_permissions) { + int r = may_delete(in, name, perm); + if (r < 0) + return r; + } + return _unlink(in, name, perm); +} + +int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms) +{ + ldout(cct, 3) << "_rmdir(" << dir->ino << " " << name << " uid " + << perms.uid() << " gid " << perms.gid() << ")" << dendl; + + if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) { + return -EROFS; + } + + MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP:CEPH_MDS_OP_RMDIR); + filepath path; + dir->make_nosnap_relative_path(path); + path.push_dentry(name); + req->set_filepath(path); + + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + + InodeRef in; + + Dentry *de; + int res = get_or_create(dir, name, &de); + if (res < 0) + goto fail; + res = _lookup(dir, name, 0, &in, perms); + if (res < 0) + goto fail; + if (req->get_op() == CEPH_MDS_OP_RMDIR) { + req->set_inode(dir); + req->set_dentry(de); + req->set_other_inode(in.get()); + } else { + unlink(de, true, true); + req->set_other_inode(in.get()); + } + + res = make_request(req, perms); + + trim_cache(); + ldout(cct, 3) << "rmdir(" << path << ") = " << res << dendl; + return res; + + fail: + put_request(req); + return res; +} + +int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vino = _get_vino(in); + + ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl; + tout(cct) << "ll_rmdir" << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << name << std::endl; + + if (!cct->_conf->fuse_default_permissions) { + int r = may_delete(in, name, perms); + if (r < 0) + return r; + } + + return _rmdir(in, name, perms); +} + +int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm) +{ + ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to " + << todir->ino << " " << toname + << " uid " << perm.uid() << " gid " << perm.gid() << ")" + << dendl; + + if (fromdir->snapid != todir->snapid) + return -EXDEV; + + int op = CEPH_MDS_OP_RENAME; + if (fromdir->snapid != CEPH_NOSNAP) { + if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR) + op = CEPH_MDS_OP_RENAMESNAP; + else + return -EROFS; + } + if (fromdir != todir) { + Inode *fromdir_root = + fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm); + Inode *todir_root = + todir->quota.is_enable() ? 
todir : get_quota_root(todir, perm); + if (fromdir_root != todir_root) { + return -EXDEV; + } + } + + InodeRef target; + MetaRequest *req = new MetaRequest(op); + + filepath from; + fromdir->make_nosnap_relative_path(from); + from.push_dentry(fromname); + filepath to; + todir->make_nosnap_relative_path(to); + to.push_dentry(toname); + req->set_filepath(to); + req->set_filepath2(from); + + Dentry *oldde; + int res = get_or_create(fromdir, fromname, &oldde); + if (res < 0) + goto fail; + Dentry *de; + res = get_or_create(todir, toname, &de); + if (res < 0) + goto fail; + + if (op == CEPH_MDS_OP_RENAME) { + req->set_old_dentry(oldde); + req->old_dentry_drop = CEPH_CAP_FILE_SHARED; + req->old_dentry_unless = CEPH_CAP_FILE_EXCL; + + req->set_dentry(de); + req->dentry_drop = CEPH_CAP_FILE_SHARED; + req->dentry_unless = CEPH_CAP_FILE_EXCL; + + InodeRef oldin, otherin; + res = _lookup(fromdir, fromname, 0, &oldin, perm); + if (res < 0) + goto fail; + req->set_old_inode(oldin.get()); + req->old_inode_drop = CEPH_CAP_LINK_SHARED; + + res = _lookup(todir, toname, 0, &otherin, perm); + if (res != 0 && res != -ENOENT) { + goto fail; + } else if (res == 0) { + req->set_other_inode(otherin.get()); + req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL; + } + + req->set_inode(todir); + } else { + // renamesnap reply contains no tracedn, so we need to invalidate + // dentry manually + unlink(oldde, true, true); + unlink(de, true, true); + } + + res = make_request(req, perm, &target); + ldout(cct, 10) << "rename result is " << res << dendl; + + // renamed item from our cache + + trim_cache(); + ldout(cct, 3) << "_rename(" << from << ", " << to << ") = " << res << dendl; + return res; + + fail: + put_request(req); + return res; +} + +int Client::ll_rename(Inode *parent, const char *name, Inode *newparent, + const char *newname, const UserPerm& perm) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vparent = _get_vino(parent); + vinodeno_t vnewparent = _get_vino(newparent); + + ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to " + << vnewparent << " " << newname << dendl; + tout(cct) << "ll_rename" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << vnewparent.ino.val << std::endl; + tout(cct) << newname << std::endl; + + if (!cct->_conf->fuse_default_permissions) { + int r = may_delete(parent, name, perm); + if (r < 0) + return r; + r = may_delete(newparent, newname, perm); + if (r < 0 && r != -ENOENT) + return r; + } + + return _rename(parent, name, newparent, newname, perm); +} + +int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp) +{ + ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname + << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl; + + if (strlen(newname) > NAME_MAX) + return -ENAMETOOLONG; + + if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) { + return -EROFS; + } + if (is_quota_files_exceeded(dir, perm)) { + return -EDQUOT; + } + + MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK); + + filepath path(newname, dir->ino); + req->set_filepath(path); + filepath existing(in->ino); + req->set_filepath2(existing); + + req->set_inode(dir); + req->inode_drop = CEPH_CAP_FILE_SHARED; + req->inode_unless = CEPH_CAP_FILE_EXCL; + + Dentry *de; + int res = get_or_create(dir, newname, &de); + if (res < 0) + goto fail; + req->set_dentry(de); + + res = make_request(req, perm, inp); + 
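+  // (When the caller passes inp, it receives the inode of the new link
+  //  from the MDS reply trace.)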
ldout(cct, 10) << "link result is " << res << dendl; + + trim_cache(); + ldout(cct, 3) << "link(" << existing << ", " << path << ") = " << res << dendl; + return res; + + fail: + put_request(req); + return res; +} + +int Client::ll_link(Inode *in, Inode *newparent, const char *newname, + const UserPerm& perm) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vino = _get_vino(in); + vinodeno_t vnewparent = _get_vino(newparent); + + ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " << + newname << dendl; + tout(cct) << "ll_link" << std::endl; + tout(cct) << vino.ino.val << std::endl; + tout(cct) << vnewparent << std::endl; + tout(cct) << newname << std::endl; + + int r = 0; + InodeRef target; + + if (!cct->_conf->fuse_default_permissions) { + if (S_ISDIR(in->mode)) + return -EPERM; + + r = may_hardlink(in, perm); + if (r < 0) + return r; + + r = may_create(newparent, perm); + if (r < 0) + return r; + } + + return _link(in, newparent, newname, perm, &target); +} + +int Client::ll_num_osds(void) +{ + Mutex::Locker lock(client_lock); + return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds)); +} + +int Client::ll_osdaddr(int osd, uint32_t *addr) +{ + Mutex::Locker lock(client_lock); + + entity_addr_t g; + bool exists = objecter->with_osdmap([&](const OSDMap& o) { + if (!o.exists(osd)) + return false; + g = o.get_addr(osd); + return true; + }); + if (!exists) + return -1; + uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr; + *addr = ntohl(nb_addr); + return 0; +} + +uint32_t Client::ll_stripe_unit(Inode *in) +{ + Mutex::Locker lock(client_lock); + return in->layout.stripe_unit; +} + +uint64_t Client::ll_snap_seq(Inode *in) +{ + Mutex::Locker lock(client_lock); + return in->snaprealm->seq; +} + +int Client::ll_file_layout(Inode *in, file_layout_t *layout) +{ + Mutex::Locker lock(client_lock); + *layout = in->layout; + return 0; +} + +int Client::ll_file_layout(Fh *fh, file_layout_t *layout) +{ + return ll_file_layout(fh->inode.get(), layout); +} + +/* Currently we cannot take advantage of redundancy in reads, since we + would have to go through all possible placement groups (a + potentially quite large number determined by a hash), and use CRUSH + to calculate the appropriate set of OSDs for each placement group, + then index into that. An array with one entry per OSD is much more + tractable and works for demonstration purposes. 
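+
+   A worked example of the block-to-object mapping computed in
+   ll_get_stripe_osd below (hypothetical layout values): with
+   stripe_unit = 1 MiB, stripe_count = 4 and object_size = 4 MiB we get
+   stripes_per_object = 4; blockno 9 then yields stripeno = 2,
+   stripepos = 1, objectsetno = 0 and objectno = 0 * 4 + 1 = 1, i.e. the
+   second object of the first object set.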
+ */
+
+int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
+                              file_layout_t* layout)
+{
+  Mutex::Locker lock(client_lock);
+
+  inodeno_t ino = ll_get_inodeno(in);
+  uint32_t object_size = layout->object_size;
+  uint32_t su = layout->stripe_unit;
+  uint32_t stripe_count = layout->stripe_count;
+  uint64_t stripes_per_object = object_size / su;
+
+  uint64_t stripeno = blockno / stripe_count;    // which horizontal stripe        (Y)
+  uint64_t stripepos = blockno % stripe_count;   // which object in the object set (X)
+  uint64_t objectsetno = stripeno / stripes_per_object;        // which object set
+  uint64_t objectno = objectsetno * stripe_count + stripepos;  // object id
+
+  object_t oid = file_object_t(ino, objectno);
+  return objecter->with_osdmap([&](const OSDMap& o) {
+      ceph_object_layout olayout =
+        o.file_to_object_layout(oid, *layout);
+      pg_t pg = (pg_t)olayout.ol_pgid;
+      vector<int> osds;
+      int primary;
+      o.pg_to_acting_osds(pg, &osds, &primary);
+      return primary;
+    });
+}
+
+/* Return the offset of the block, internal to the object */
+
+uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
+{
+  Mutex::Locker lock(client_lock);
+  file_layout_t *layout=&(in->layout);
+  uint32_t object_size = layout->object_size;
+  uint32_t su = layout->stripe_unit;
+  uint64_t stripes_per_object = object_size / su;
+
+  return (blockno % stripes_per_object) * su;
+}
+
+int Client::ll_opendir(Inode *in, int flags, dir_result_t** dirpp,
+                       const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  vinodeno_t vino = _get_vino(in);
+
+  ldout(cct, 3) << "ll_opendir " << vino << dendl;
+  tout(cct) << "ll_opendir" << std::endl;
+  tout(cct) << vino.ino.val << std::endl;
+
+  if (!cct->_conf->fuse_default_permissions) {
+    int r = may_open(in, flags, perms);
+    if (r < 0)
+      return r;
+  }
+
+  int r = _opendir(in, dirpp, perms);
+  tout(cct) << (unsigned long)*dirpp << std::endl;
+
+  ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
+                << dendl;
+  return r;
+}
+
+int Client::ll_releasedir(dir_result_t *dirp)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
+  tout(cct) << "ll_releasedir" << std::endl;
+  tout(cct) << (unsigned long)dirp << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  _closedir(dirp);
+  return 0;
+}
+
+int Client::ll_fsyncdir(dir_result_t *dirp)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
+  tout(cct) << "ll_fsyncdir" << std::endl;
+  tout(cct) << (unsigned long)dirp << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  return _fsync(dirp->inode.get(), false);
+}
+
+int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
+{
+  assert(!(flags & O_CREAT));
+
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  vinodeno_t vino = _get_vino(in);
+
+  ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
+  tout(cct) << "ll_open" << std::endl;
+  tout(cct) << vino.ino.val << std::endl;
+  tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
+
+  int r;
+  if (!cct->_conf->fuse_default_permissions) {
+    r = may_open(in, flags, perms);
+    if (r < 0)
+      goto out;
+  }
+
+  r = _open(in, flags, 0, fhp /* may be NULL */, perms);
+
+ out:
+  Fh *fhptr = fhp ?
*fhp : NULL; + if (fhptr) { + ll_unclosed_fh_set.insert(fhptr); + } + tout(cct) << (unsigned long)fhptr << std::endl; + ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << + " = " << r << " (" << fhptr << ")" << dendl; + return r; +} + +int Client::_ll_create(Inode *parent, const char *name, mode_t mode, + int flags, InodeRef *in, int caps, Fh **fhp, + const UserPerm& perms) +{ + *fhp = NULL; + + vinodeno_t vparent = _get_vino(parent); + + ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct << + mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid() + << ", gid " << perms.gid() << dendl; + tout(cct) << "ll_create" << std::endl; + tout(cct) << vparent.ino.val << std::endl; + tout(cct) << name << std::endl; + tout(cct) << mode << std::endl; + tout(cct) << ceph_flags_sys2wire(flags) << std::endl; + + bool created = false; + int r = _lookup(parent, name, caps, in, perms); + + if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL)) + return -EEXIST; + + if (r == -ENOENT && (flags & O_CREAT)) { + if (!cct->_conf->fuse_default_permissions) { + r = may_create(parent, perms); + if (r < 0) + goto out; + } + r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created, + perms); + if (r < 0) + goto out; + } + + if (r < 0) + goto out; + + assert(*in); + + ldout(cct, 20) << "_ll_create created = " << created << dendl; + if (!created) { + if (!cct->_conf->fuse_default_permissions) { + r = may_open(in->get(), flags, perms); + if (r < 0) { + if (*fhp) { + int release_r = _release_fh(*fhp); + assert(release_r == 0); // during create, no async data ops should have happened + } + goto out; + } + } + if (*fhp == NULL) { + r = _open(in->get(), flags, mode, fhp, perms); + if (r < 0) + goto out; + } + } + +out: + if (*fhp) { + ll_unclosed_fh_set.insert(*fhp); + } + + ino_t ino = 0; + if (r >= 0) { + Inode *inode = in->get(); + if (use_faked_inos()) + ino = inode->faked_ino; + else + ino = inode->ino; + } + + tout(cct) << (unsigned long)*fhp << std::endl; + tout(cct) << ino << std::endl; + ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct << + mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" << + *fhp << " " << hex << ino << dec << ")" << dendl; + + return r; +} + +int Client::ll_create(Inode *parent, const char *name, mode_t mode, + int flags, struct stat *attr, Inode **outp, Fh **fhp, + const UserPerm& perms) +{ + Mutex::Locker lock(client_lock); + InodeRef in; + + if (unmounting) + return -ENOTCONN; + + int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL, + fhp, perms); + if (r >= 0) { + assert(in); + + // passing an Inode in outp requires an additional ref + if (outp) { + _ll_get(in.get()); + *outp = in.get(); + } + fill_stat(in, attr); + } else { + attr->st_ino = 0; + } + + return r; +} + +int Client::ll_createx(Inode *parent, const char *name, mode_t mode, + int oflags, Inode **outp, Fh **fhp, + struct ceph_statx *stx, unsigned want, unsigned lflags, + const UserPerm& perms) +{ + unsigned caps = statx_to_mask(lflags, want); + Mutex::Locker lock(client_lock); + InodeRef in; + + if (unmounting) + return -ENOTCONN; + + int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms); + if (r >= 0) { + assert(in); + + // passing an Inode in outp requires an additional ref + if (outp) { + _ll_get(in.get()); + *outp = in.get(); + } + fill_statx(in, caps, stx); + } else { + stx->stx_ino = 0; + stx->stx_mask = 0; + } + + return r; +} + +loff_t Client::ll_lseek(Fh *fh, 
loff_t offset, int whence) +{ + Mutex::Locker lock(client_lock); + tout(cct) << "ll_lseek" << std::endl; + tout(cct) << offset << std::endl; + tout(cct) << whence << std::endl; + + if (unmounting) + return -ENOTCONN; + + return _lseek(fh, offset, whence); +} + +int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl) +{ + Mutex::Locker lock(client_lock); + ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << " " << off << "~" << len << dendl; + tout(cct) << "ll_read" << std::endl; + tout(cct) << (unsigned long)fh << std::endl; + tout(cct) << off << std::endl; + tout(cct) << len << std::endl; + + if (unmounting) + return -ENOTCONN; + + return _read(fh, off, len, bl); +} + +int Client::ll_read_block(Inode *in, uint64_t blockid, + char *buf, + uint64_t offset, + uint64_t length, + file_layout_t* layout) +{ + Mutex::Locker lock(client_lock); + + if (unmounting) + return -ENOTCONN; + + vinodeno_t vino = ll_get_vino(in); + object_t oid = file_object_t(vino.ino, blockid); + C_SaferCond onfinish; + bufferlist bl; + + objecter->read(oid, + object_locator_t(layout->pool_id), + offset, + length, + vino.snapid, + &bl, + CEPH_OSD_FLAG_READ, + &onfinish); + + client_lock.Unlock(); + int r = onfinish.wait(); + client_lock.Lock(); + + if (r >= 0) { + bl.copy(0, bl.length(), buf); + r = bl.length(); + } + + return r; +} + +/* It appears that the OSD doesn't return success unless the entire + buffer was written, return the write length on success. */ + +int Client::ll_write_block(Inode *in, uint64_t blockid, + char* buf, uint64_t offset, + uint64_t length, file_layout_t* layout, + uint64_t snapseq, uint32_t sync) +{ + Mutex flock("Client::ll_write_block flock"); + vinodeno_t vino = ll_get_vino(in); + Cond cond; + bool done; + int r = 0; + Context *onsafe = nullptr; + + if (length == 0) { + return -EINVAL; + } + if (true || sync) { + /* if write is stable, the epilogue is waiting on + * flock */ + onsafe = new C_SafeCond(&flock, &cond, &done, &r); + done = false; + } else { + /* if write is unstable, we just place a barrier for + * future commits to wait on */ + /*onsafe = new C_Block_Sync(this, vino.ino, + barrier_interval(offset, offset + length), &r); + */ + done = true; + } + object_t oid = file_object_t(vino.ino, blockid); + SnapContext fakesnap; + bufferptr bp; + if (length > 0) bp = buffer::copy(buf, length); + bufferlist bl; + bl.push_back(bp); + + ldout(cct, 1) << "ll_block_write for " << vino.ino << "." << blockid + << dendl; + + fakesnap.seq = snapseq; + + /* lock just in time */ + client_lock.Lock(); + if (unmounting) { + client_lock.Unlock(); + delete onsafe; + return -ENOTCONN; + } + + objecter->write(oid, + object_locator_t(layout->pool_id), + offset, + length, + fakesnap, + bl, + ceph::real_clock::now(), + 0, + onsafe); + + client_lock.Unlock(); + if (!done /* also !sync */) { + flock.Lock(); + while (! 
done)
+      cond.Wait(flock);
+    flock.Unlock();
+  }
+
+  if (r < 0) {
+    return r;
+  } else {
+    return length;
+  }
+}
+
+int Client::ll_commit_blocks(Inode *in,
+                             uint64_t offset,
+                             uint64_t length)
+{
+    Mutex::Locker lock(client_lock);
+    /*
+    BarrierContext *bctx;
+    vinodeno_t vino = ll_get_vino(in);
+    uint64_t ino = vino.ino;
+
+    ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
+                  << offset << " to " << length << dendl;
+
+    if (length == 0) {
+      return -EINVAL;
+    }
+
+    map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
+    if (p != barriers.end()) {
+      barrier_interval civ(offset, offset + length);
+      p->second->commit_barrier(civ);
+    }
+    */
+    return 0;
+}
+
+int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
+    "~" << len << dendl;
+  tout(cct) << "ll_write" << std::endl;
+  tout(cct) << (unsigned long)fh << std::endl;
+  tout(cct) << off << std::endl;
+  tout(cct) << len << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  int r = _write(fh, off, len, data, NULL, 0);
+  ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
+                << dendl;
+  return r;
+}
+
+int Client::ll_flush(Fh *fh)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
+  tout(cct) << "ll_flush" << std::endl;
+  tout(cct) << (unsigned long)fh << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  return _flush(fh);
+}
+
+int Client::ll_fsync(Fh *fh, bool syncdataonly)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
+  tout(cct) << "ll_fsync" << std::endl;
+  tout(cct) << (unsigned long)fh << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  int r = _fsync(fh, syncdataonly);
+  if (r) {
+    // If we're returning an error, clear it from the FH
+    fh->take_async_err();
+  }
+  return r;
+}
+
+#ifdef FALLOC_FL_PUNCH_HOLE
+
+int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
+{
+  if (offset < 0 || length <= 0)
+    return -EINVAL;
+
+  if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+    return -EOPNOTSUPP;
+
+  if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
+    return -EOPNOTSUPP;
+
+  Inode *in = fh->inode.get();
+
+  if (objecter->osdmap_pool_full(in->layout.pool_id) &&
+      !(mode & FALLOC_FL_PUNCH_HOLE)) {
+    return -ENOSPC;
+  }
+
+  if (in->snapid != CEPH_NOSNAP)
+    return -EROFS;
+
+  if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
+    return -EBADF;
+
+  uint64_t size = offset + length;
+  if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
+      size > in->size &&
+      is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
+    return -EDQUOT;
+  }
+
+  int have;
+  int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
+  if (r < 0)
+    return r;
+
+  Mutex uninline_flock("Client::_fallocate_uninline_data flock");
+  Cond uninline_cond;
+  bool uninline_done = false;
+  int uninline_ret = 0;
+  Context *onuninline = NULL;
+
+  if (mode & FALLOC_FL_PUNCH_HOLE) {
+    if (in->inline_version < CEPH_INLINE_NONE &&
+        (have & CEPH_CAP_FILE_BUFFER)) {
+      bufferlist bl;
+      int len = in->inline_data.length();
+      if (offset < len) {
+        if (offset > 0)
+          in->inline_data.copy(0, offset, bl);
+        int size = length;
+        if (offset + size > len)
+          size = len - offset;
+        if (size > 0)
+          bl.append_zero(size);
+        if (offset + size < len)
+          in->inline_data.copy(offset + size, len - offset - size, bl);
+        in->inline_data = bl;
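+        // the inline blob now has the hole zeroed in place; bump the
+        // version (below) so the change is visible to other clients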
+        in->inline_version++;
+      }
+      in->mtime = ceph_clock_now();
+      in->change_attr++;
+      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+    } else {
+      if (in->inline_version < CEPH_INLINE_NONE) {
+        onuninline = new C_SafeCond(&uninline_flock,
+                                    &uninline_cond,
+                                    &uninline_done,
+                                    &uninline_ret);
+        uninline_data(in, onuninline);
+      }
+
+      Mutex flock("Client::_punch_hole flock");
+      Cond cond;
+      bool done = false;
+      Context *onfinish = new C_SafeCond(&flock, &cond, &done);
+
+      unsafe_sync_write++;
+      get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
+
+      _invalidate_inode_cache(in, offset, length);
+      filer->zero(in->ino, &in->layout,
+                  in->snaprealm->get_snap_context(),
+                  offset, length,
+                  ceph::real_clock::now(),
+                  0, true, onfinish);
+      in->mtime = ceph_clock_now();
+      in->change_attr++;
+      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+
+      client_lock.Unlock();
+      flock.Lock();
+      while (!done)
+        cond.Wait(flock);
+      flock.Unlock();
+      client_lock.Lock();
+      _sync_write_commit(in);
+    }
+  } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+    uint64_t size = offset + length;
+    if (size > in->size) {
+      in->size = size;
+      in->mtime = ceph_clock_now();
+      in->change_attr++;
+      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+
+      if (is_quota_bytes_approaching(in, fh->actor_perms)) {
+        check_caps(in, CHECK_CAPS_NODELAY);
+      } else if (is_max_size_approaching(in)) {
+        check_caps(in, 0);
+      }
+    }
+  }
+
+  if (onuninline) {
+    client_lock.Unlock();
+    uninline_flock.Lock();
+    while (!uninline_done)
+      uninline_cond.Wait(uninline_flock);
+    uninline_flock.Unlock();
+    client_lock.Lock();
+
+    if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
+      in->inline_data.clear();
+      in->inline_version = CEPH_INLINE_NONE;
+      mark_caps_dirty(in, CEPH_CAP_FILE_WR);
+      check_caps(in, 0);
+    } else
+      r = uninline_ret;
+  }
+
+  put_cap_ref(in, CEPH_CAP_FILE_WR);
+  return r;
+}
+#else
+
+int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
+{
+  return -EOPNOTSUPP;
+}
+
+#endif
+
+
+int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
+  tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
+  tout(cct) << (unsigned long)fh << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  return _fallocate(fh, mode, offset, length);
+}
+
+int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
+{
+  Mutex::Locker lock(client_lock);
+  tout(cct) << "fallocate " << " " << fd << " " << mode << " " << offset << " " << length << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  Fh *fh = get_filehandle(fd);
+  if (!fh)
+    return -EBADF;
+#if defined(__linux__) && defined(O_PATH)
+  if (fh->flags & O_PATH)
+    return -EBADF;
+#endif
+  return _fallocate(fh, mode, offset, length);
+}
+
+int Client::ll_release(Fh *fh)
+{
+  Mutex::Locker lock(client_lock);
+  ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
+    dendl;
+  tout(cct) << "ll_release (fh)" << std::endl;
+  tout(cct) << (unsigned long)fh << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  if (ll_unclosed_fh_set.count(fh))
+    ll_unclosed_fh_set.erase(fh);
+  return _release_fh(fh);
+}
+
+int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
+{
+  Mutex::Locker lock(client_lock);
+
+  ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
+  tout(cct) << "ll_getlk (fh)" << (unsigned long)fh << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  return _getlk(fh, fl, owner);
+}
+
+int Client::ll_setlk(Fh *fh,
+                     struct flock *fl, uint64_t owner, int sleep)
+{
+  Mutex::Locker lock(client_lock);
+
+  ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
+  tout(cct) << "ll_setlk (fh)" << (unsigned long)fh << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  return _setlk(fh, fl, owner, sleep);
+}
+
+int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
+{
+  Mutex::Locker lock(client_lock);
+
+  ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
+  tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  return _flock(fh, cmd, owner);
+}
+
+class C_Client_RequestInterrupt : public Context {
+private:
+  Client *client;
+  MetaRequest *req;
+public:
+  C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
+    req->get();
+  }
+  void finish(int r) override {
+    Mutex::Locker l(client->client_lock);
+    assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
+    client->_interrupt_filelock(req);
+    client->put_request(req);
+  }
+};
+
+void Client::ll_interrupt(void *d)
+{
+  MetaRequest *req = static_cast<MetaRequest*>(d);
+  ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
+  tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
+  interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
+}
+
+// =========================================
+// layout
+
+// expose file layouts
+
+int Client::describe_layout(const char *relpath, file_layout_t *lp,
+                            const UserPerm& perms)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  filepath path(relpath);
+  InodeRef in;
+  int r = path_walk(path, &in, perms);
+  if (r < 0)
+    return r;
+
+  *lp = in->layout;
+
+  ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
+  return 0;
+}
+
+int Client::fdescribe_layout(int fd, file_layout_t *lp)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  Fh *f = get_filehandle(fd);
+  if (!f)
+    return -EBADF;
+  Inode *in = f->inode.get();
+
+  *lp = in->layout;
+
+  ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
+  return 0;
+}
+
+int64_t Client::get_default_pool_id()
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  /* first data pool is the default */
+  return mdsmap->get_first_data_pool();
+}
+
+// expose osdmap
+
+int64_t Client::get_pool_id(const char *pool_name)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
+                               pool_name);
+}
+
+string Client::get_pool_name(int64_t pool)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return string();
+
+  return objecter->with_osdmap([pool](const OSDMap& o) {
+      return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
+    });
+}
+
+int Client::get_pool_replication(int64_t pool)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  return objecter->with_osdmap([pool](const OSDMap& o) {
+      return o.have_pg_pool(pool) ?
+        o.get_pg_pool(pool)->get_size() : -ENOENT;
+    });
+}
+
+int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  Fh *f = get_filehandle(fd);
+  if (!f)
+    return -EBADF;
+  Inode *in = f->inode.get();
+
+  vector<ObjectExtent> extents;
+  Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
+  assert(extents.size() == 1);
+
+  objecter->with_osdmap([&](const OSDMap& o) {
+      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
+      o.pg_to_acting_osds(pg, osds);
+    });
+
+  if (osds.empty())
+    return -EINVAL;
+
+  /*
+   * Return the remainder of the extent (stripe unit)
+   *
+   * If length = 1 is passed to Striper::file_to_extents we get a single
+   * extent back, but its length is one so we still need to compute the length
+   * to the end of the stripe unit.
+   *
+   * If length = su then we may get 1 or 2 objects back in the extents vector
+   * which would have to be examined. Even then, the offsets are local to the
+   * object, so matching up to the file offset is extra work.
+   *
+   * It seems simpler to stick with length = 1 and manually compute the
+   * remainder.
+   */
+  if (len) {
+    uint64_t su = in->layout.stripe_unit;
+    *len = su - (off % su);
+  }
+
+  return 0;
+}
+
+int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  if (id < 0)
+    return -EINVAL;
+  return objecter->with_osdmap([&](const OSDMap& o) {
+      return o.crush->get_full_location_ordered(id, path);
+    });
+}
+
+int Client::get_file_stripe_address(int fd, loff_t offset,
+                                    vector<entity_addr_t>& address)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  Fh *f = get_filehandle(fd);
+  if (!f)
+    return -EBADF;
+  Inode *in = f->inode.get();
+
+  // which object?
+  vector<ObjectExtent> extents;
+  Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
+                           in->truncate_size, extents);
+  assert(extents.size() == 1);
+
+  // now we have the object and its 'layout'
+  return objecter->with_osdmap([&](const OSDMap& o) {
+      pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
+      vector<int> osds;
+      o.pg_to_acting_osds(pg, osds);
+      if (osds.empty())
+        return -EINVAL;
+      for (unsigned i = 0; i < osds.size(); i++) {
+        entity_addr_t addr = o.get_addr(osds[i]);
+        address.push_back(addr);
+      }
+      return 0;
+    });
+}
+
+int Client::get_osd_addr(int osd, entity_addr_t& addr)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  return objecter->with_osdmap([&](const OSDMap& o) {
+      if (!o.exists(osd))
+        return -ENOENT;
+
+      addr = o.get_addr(osd);
+      return 0;
+    });
+}
+
+int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
+                             loff_t length, loff_t offset)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  Fh *f = get_filehandle(fd);
+  if (!f)
+    return -EBADF;
+  Inode *in = f->inode.get();
+
+  // map to a list of extents
+  Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
+
+  ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
+  return 0;
+}
+
+
+/*
+ * find an osd with the same ip.  -1 if none.
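+ * Illustrative use (caller-side sketch, names assumed): a client colocated
+ * with an OSD can prefer local reads, e.g.
+ *   int osd = client->get_local_osd();
+ *   if (osd >= 0) { ... prefer reading from the colocated OSD ... }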
+
+
+/*
+ * find an osd with the same ip.  -1 if none.
+ */
+int Client::get_local_osd()
+{
+  Mutex::Locker lock(client_lock);
+
+  if (unmounting)
+    return -ENOTCONN;
+
+  objecter->with_osdmap([this](const OSDMap& o) {
+      if (o.get_epoch() != local_osd_epoch) {
+        local_osd = o.find_osd_on_ip(messenger->get_myaddr());
+        local_osd_epoch = o.get_epoch();
+      }
+    });
+  return local_osd;
+}
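
get_local_osd recomputes its answer only when the OSDMap epoch changes; any map update implicitly invalidates the cached value. The same epoch-keyed memoization in isolation (EpochCache is a hypothetical helper, not a Ceph type):

#include <cstdint>
#include <cstdio>

struct EpochCache {
  uint64_t cached_epoch = 0;   // epoch the value was last computed against
  int      cached_value = -1;

  template <typename F>
  int get(uint64_t current_epoch, F&& recompute) {
    if (current_epoch != cached_epoch) {  // map changed: refresh
      cached_value = recompute();
      cached_epoch = current_epoch;
    }
    return cached_value;
  }
};

int main() {
  EpochCache c;
  int v1 = c.get(7, [] { return 42; });  // epoch 7: computed
  int v2 = c.get(7, [] { return 99; });  // same epoch: cached 42 returned
  std::printf("%d %d\n", v1, v2);        // prints "42 42"
  return 0;
}
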
+
+
+
+
+
+
+// ===============================
+
+void Client::ms_handle_connect(Connection *con)
+{
+  ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
+}
+
+bool Client::ms_handle_reset(Connection *con)
+{
+  ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
+  return false;
+}
+
+void Client::ms_handle_remote_reset(Connection *con)
+{
+  ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
+  Mutex::Locker l(client_lock);
+  switch (con->get_peer_type()) {
+  case CEPH_ENTITY_TYPE_MDS:
+    {
+      // kludge to figure out which mds this is; fixme with a Connection* state
+      mds_rank_t mds = MDS_RANK_NONE;
+      MetaSession *s = NULL;
+      for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
+           p != mds_sessions.end();
+           ++p) {
+        if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
+          mds = p->first;
+          s = p->second;
+        }
+      }
+      if (mds >= 0) {
+        assert(s != NULL);
+        switch (s->state) {
+        case MetaSession::STATE_CLOSING:
+          ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
+          _closed_mds_session(s);
+          break;
+
+        case MetaSession::STATE_OPENING:
+          {
+            ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
+            list<Context*> waiters;
+            waiters.swap(s->waiting_for_open);
+            _closed_mds_session(s);
+            MetaSession *news = _get_or_open_mds_session(mds);
+            news->waiting_for_open.swap(waiters);
+          }
+          break;
+
+        case MetaSession::STATE_OPEN:
+          {
+            const md_config_t *conf = cct->_conf;
+            if (conf->client_reconnect_stale) {
+              ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
+              _closed_mds_session(s);
+            } else {
+              ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
+              s->state = MetaSession::STATE_STALE;
+            }
+          }
+          break;
+
+        case MetaSession::STATE_NEW:
+        case MetaSession::STATE_CLOSED:
+        default:
+          break;
+        }
+      }
+    }
+    break;
+  }
+}
+
+bool Client::ms_handle_refused(Connection *con)
+{
+  ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
+  return false;
+}
+
+bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
+{
+  if (dest_type == CEPH_ENTITY_TYPE_MON)
+    return true;
+  *authorizer = monclient->build_authorizer(dest_type);
+  return true;
+}
+
+Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
+{
+  Inode *cur = in;
+  utime_t now = ceph_clock_now();
+
+  while (cur) {
+    if (cur != in && cur->quota.is_enable())
+      break;
+
+    Inode *parent_in = NULL;
+    if (!cur->dn_set.empty()) {
+      for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
+        Dentry *dn = *p;
+        if (dn->lease_mds >= 0 &&
+            dn->lease_ttl > now &&
+            mds_sessions.count(dn->lease_mds)) {
+          parent_in = dn->dir->parent_inode;
+        } else {
+          Inode *diri = dn->dir->parent_inode;
+          if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
+              diri->shared_gen == dn->cap_shared_gen) {
+            parent_in = dn->dir->parent_inode;
+          }
+        }
+        if (parent_in)
+          break;
+      }
+    } else if (root_parents.count(cur)) {
+      parent_in = root_parents[cur].get();
+    }
+
+    if (parent_in) {
+      cur = parent_in;
+      continue;
+    }
+
+    if (cur == root_ancestor)
+      break;
+
+    // deleted inode
+    if (cur->nlink == 0) {
+      cur = root_ancestor;
+      break;
+    }
+
+    MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
+    filepath path(cur->ino);
+    req->set_filepath(path);
+    req->set_inode(cur);
+
+    InodeRef parent_ref;
+    int ret = make_request(req, perms, &parent_ref);
+    if (ret < 0) {
+      ldout(cct, 1) << __func__ << " " << in->vino()
+                    << " failed to find parent of " << cur->vino()
+                    << " err " << ret << dendl;
+      // FIXME: what to do?
+      cur = root_ancestor;
+      break;
+    }
+
+    now = ceph_clock_now();
+    if (cur == in)
+      cur = parent_ref.get();
+    else
+      cur = in; // start over
+  }
+
+  ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
+  return cur;
+}
+
+/**
+ * Traverse the quota ancestors of the Inode; return true
+ * if any of them satisfies the passed predicate.
+ */
+bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
+                                   std::function<bool (const Inode &in)> test)
+{
+  while (true) {
+    assert(in != NULL);
+    if (test(*in)) {
+      return true;
+    }
+
+    if (in == root_ancestor) {
+      // We're done traversing, drop out
+      return false;
+    } else {
+      // Continue up the tree
+      in = get_quota_root(in, perms);
+    }
+  }
+
+  return false;
+}
+
+bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
+{
+  return check_quota_condition(in, perms,
+      [](const Inode &in) {
+        return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
+      });
+}
+
+bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
+                                     const UserPerm& perms)
+{
+  return check_quota_condition(in, perms,
+      [&new_bytes](const Inode &in) {
+        return in.quota.max_bytes &&
+               (in.rstat.rbytes + new_bytes) > in.quota.max_bytes;
+      });
+}
+
+bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
+{
+  return check_quota_condition(in, perms,
+      [](const Inode &in) {
+        if (in.quota.max_bytes) {
+          if (in.rstat.rbytes >= in.quota.max_bytes) {
+            return true;
+          }
+
+          assert(in.size >= in.reported_size);
+          const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
+          const uint64_t size = in.size - in.reported_size;
+          return (space >> 4) < size;
+        } else {
+          return false;
+        }
+      });
+}
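
The test in is_quota_bytes_approaching fires once the bytes grown since the last size report exceed 1/16 of the space left under the quota (the space >> 4 term). Worked through with illustrative numbers: a 16 GiB quota with 12 GiB already accounted leaves 4 GiB of headroom, so more than 256 MiB of unreported growth triggers an early report:

#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative figures, not taken from a real cluster.
  const uint64_t max_bytes     = 16ULL << 30;   // quota.max_bytes: 16 GiB
  const uint64_t rbytes        = 12ULL << 30;   // accounted usage: 12 GiB
  const uint64_t size          = 300ULL << 20;  // current in-memory size
  const uint64_t reported_size = 0;             // last size reported to the MDS

  const uint64_t space      = max_bytes - rbytes;    // 4 GiB of headroom
  const uint64_t unreported = size - reported_size;  // 300 MiB of growth

  // Same condition as the lambda above: more than 1/16 of the headroom.
  const bool approaching = (space >> 4) < unreported;   // 256 MiB < 300 MiB
  std::printf("approaching=%d\n", approaching ? 1 : 0); // prints 1
  return 0;
}
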
+
+enum {
+  POOL_CHECKED = 1,
+  POOL_CHECKING = 2,
+  POOL_READ = 4,
+  POOL_WRITE = 8,
+};
+
+int Client::check_pool_perm(Inode *in, int need)
+{
+  if (!cct->_conf->client_check_pool_perm)
+    return 0;
+
+  int64_t pool_id = in->layout.pool_id;
+  std::string pool_ns = in->layout.pool_ns;
+  std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
+  int have = 0;
+  while (true) {
+    auto it = pool_perms.find(perm_key);
+    if (it == pool_perms.end())
+      break;
+    if (it->second == POOL_CHECKING) {
+      // avoid concurrent checks
+      wait_on_list(waiting_for_pool_perm);
+    } else {
+      have = it->second;
+      assert(have & POOL_CHECKED);
+      break;
+    }
+  }
+
+  if (!have) {
+    if (in->snapid != CEPH_NOSNAP) {
+      // The pool permission check needs to write to the first object. But
+      // for a snapshot, the head of the first object may already have been
+      // deleted. To avoid creating an orphan object, skip the check for now.
+      return 0;
+    }
+
+    pool_perms[perm_key] = POOL_CHECKING;
+
+    char oid_buf[32];
+    snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
+    object_t oid = oid_buf;
+
+    SnapContext nullsnapc;
+
+    C_SaferCond rd_cond;
+    ObjectOperation rd_op;
+    rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);
+
+    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
+                     nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
+
+    C_SaferCond wr_cond;
+    ObjectOperation wr_op;
+    wr_op.create(true);
+
+    objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
+                     nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
+
+    client_lock.Unlock();
+    int rd_ret = rd_cond.wait();
+    int wr_ret = wr_cond.wait();
+    client_lock.Lock();
+
+    bool errored = false;
+
+    if (rd_ret == 0 || rd_ret == -ENOENT)
+      have |= POOL_READ;
+    else if (rd_ret != -EPERM) {
+      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
+                     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
+      errored = true;
+    }
+
+    if (wr_ret == 0 || wr_ret == -EEXIST)
+      have |= POOL_WRITE;
+    else if (wr_ret != -EPERM) {
+      ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
+                     << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
+      errored = true;
+    }
+
+    if (errored) {
+      // Indeterminate: erase CHECKING state so that subsequent calls re-check.
+      // Raise EIO because the actual error code might be misleading for
+      // a userspace filesystem user.
+      pool_perms.erase(perm_key);
+      signal_cond_list(waiting_for_pool_perm);
+      return -EIO;
+    }
+
+    pool_perms[perm_key] = have | POOL_CHECKED;
+    signal_cond_list(waiting_for_pool_perm);
+  }
+
+  if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
+    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
+                   << " need " << ccap_string(need) << ", but no read perm" << dendl;
+    return -EPERM;
+  }
+  if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
+    ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
+                   << " need " << ccap_string(need) << ", but no write perm" << dendl;
+    return -EPERM;
+  }
+
+  return 0;
+}
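
check_pool_perm infers capabilities purely from the probe error codes: a stat returning 0 or -ENOENT proves read permission, an exclusive create returning 0 or -EEXIST proves write permission, -EPERM proves the lack of one, and any other code leaves the result indeterminate. That decoding in isolation; decode_pool_probe is a hypothetical helper mirroring the logic above, not part of Client.cc:

#include <cerrno>
#include <cstdio>

enum { PROBE_READ = 1, PROBE_WRITE = 2, PROBE_ERROR = -1 };

// Map the (stat, exclusive-create) result pair onto read/write bits.
int decode_pool_probe(int rd_ret, int wr_ret) {
  int have = 0;
  if (rd_ret == 0 || rd_ret == -ENOENT)
    have |= PROBE_READ;     // stat worked, or the object simply doesn't exist yet
  else if (rd_ret != -EPERM)
    return PROBE_ERROR;     // neither success nor a clean permission denial
  if (wr_ret == 0 || wr_ret == -EEXIST)
    have |= PROBE_WRITE;    // create succeeded, or the object already existed
  else if (wr_ret != -EPERM)
    return PROBE_ERROR;
  return have;
}

int main() {
  std::printf("%d\n", decode_pool_probe(-ENOENT, -EEXIST));  // 3: read|write
  std::printf("%d\n", decode_pool_probe(-EPERM, -EPERM));    // 0: no access
  return 0;
}
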
+
+int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
+{
+  if (acl_type == POSIX_ACL) {
+    if (in->xattrs.count(ACL_EA_ACCESS)) {
+      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
+
+      return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
+    }
+  }
+  return -EAGAIN;
+}
+
+int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
+{
+  if (acl_type == NO_ACL)
+    return 0;
+
+  int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
+  if (r < 0)
+    goto out;
+
+  if (acl_type == POSIX_ACL) {
+    if (in->xattrs.count(ACL_EA_ACCESS)) {
+      const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
+      bufferptr acl(access_acl.c_str(), access_acl.length());
+      r = posix_acl_access_chmod(acl, mode);
+      if (r < 0)
+        goto out;
+      r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
+    } else {
+      r = 0;
+    }
+  }
+out:
+  ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
+  return r;
+}
+
+int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
+                              const UserPerm& perms)
+{
+  if (acl_type == NO_ACL)
+    return 0;
+
+  if (S_ISLNK(*mode))
+    return 0;
+
+  int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
+  if (r < 0)
+    goto out;
+
+  if (acl_type == POSIX_ACL) {
+    if (dir->xattrs.count(ACL_EA_DEFAULT)) {
+      map<string, bufferptr> xattrs;
+
+      const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
+      bufferptr acl(default_acl.c_str(), default_acl.length());
+      r = posix_acl_inherit_mode(acl, mode);
+      if (r < 0)
+        goto out;
+
+      if (r > 0) {
+        r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
+        if (r < 0)
+          goto out;
+        if (r > 0)
+          xattrs[ACL_EA_ACCESS] = acl;
+      }
+
+      if (S_ISDIR(*mode))
+        xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
+
+      r = xattrs.size();
+      if (r > 0)
+        ::encode(xattrs, xattrs_bl);
+    } else {
+      if (umask_cb)
+        *mode &= ~umask_cb(callback_handle);
+      r = 0;
+    }
+  }
+out:
+  ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
+  return r;
+}
+
+void Client::set_filer_flags(int flags)
+{
+  Mutex::Locker l(client_lock);
+  assert(flags == 0 ||
+         flags == CEPH_OSD_FLAG_LOCALIZE_READS);
+  objecter->add_global_op_flags(flags);
+}
+
+void Client::clear_filer_flags(int flags)
+{
+  Mutex::Locker l(client_lock);
+  assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
+  objecter->clear_global_op_flag(flags);
+}
+
+/**
+ * This is included in cap release messages, to cause
+ * the MDS to wait until this OSD map epoch. It is necessary
+ * in corner cases where we cancel RADOS ops, so that
+ * nobody else tries to do IO to the same objects in
+ * the same epoch as the cancelled ops.
+ */
+void Client::set_cap_epoch_barrier(epoch_t e)
+{
+  ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
+  cap_epoch_barrier = e;
+}
+
+const char** Client::get_tracked_conf_keys() const
+{
+  static const char* keys[] = {
+    "client_cache_size",
+    "client_cache_mid",
+    "client_acl_type",
+    NULL
+  };
+  return keys;
+}
+
+void Client::handle_conf_change(const struct md_config_t *conf,
+                                const std::set<std::string> &changed)
+{
+  Mutex::Locker lock(client_lock);
+
+  if (changed.count("client_cache_mid")) {
+    lru.lru_set_midpoint(cct->_conf->client_cache_mid);
+  }
+  if (changed.count("client_acl_type")) {
+    acl_type = NO_ACL;
+    if (cct->_conf->client_acl_type == "posix_acl")
+      acl_type = POSIX_ACL;
+  }
+}
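
get_tracked_conf_keys and handle_conf_change implement the two halves of Ceph's config-observer contract: declare the keys you care about, then react to the subset that actually changed. A reduced sketch of the same shape; Observer is a simplified hypothetical type, not the real md_config_obs_t interface:

#include <iostream>
#include <set>
#include <string>

struct Observer {
  // Keys this observer wants notifications for (NULL-terminated, as above).
  const char** tracked_keys() const {
    static const char* keys[] = { "cache_mid", "acl_type", nullptr };
    return keys;
  }
  // Invoked with the tracked keys whose values changed in this update.
  void on_change(const std::set<std::string>& changed) {
    if (changed.count("cache_mid"))
      std::cout << "re-tune LRU midpoint\n";
    if (changed.count("acl_type"))
      std::cout << "re-parse ACL mode\n";
  }
};

int main() {
  Observer o;
  o.on_change({"acl_type"});  // simulate a runtime config update
  return 0;
}
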
+
+void Client::init_groups(UserPerm *perms)
+{
+  gid_t *sgids;
+  int count = _getgrouplist(&sgids, perms->uid(), perms->gid());
+  perms->init_gids(sgids, count);
+}
+
+void intrusive_ptr_add_ref(Inode *in)
+{
+  in->get();
+}
+
+void intrusive_ptr_release(Inode *in)
+{
+  in->client->put_inode(in);
+}
+
+mds_rank_t Client::_get_random_up_mds() const
+{
+  assert(client_lock.is_locked_by_me());
+
+  std::set<mds_rank_t> up;
+  mdsmap->get_up_mds_set(up);
+
+  if (up.empty())
+    return MDS_RANK_NONE;
+  std::set<mds_rank_t>::const_iterator p = up.begin();
+  for (int n = rand() % up.size(); n; n--)
+    ++p;
+  return *p;
+}
+
+
+StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
+  : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
+{
+  monclient->set_messenger(m);
+  objecter->set_client_incarnation(0);
+}
+
+StandaloneClient::~StandaloneClient()
+{
+  delete objecter;
+  objecter = nullptr;
+}
+
+int StandaloneClient::init()
+{
+  timer.init();
+  objectcacher->start();
+  objecter->init();
+
+  client_lock.Lock();
+  assert(!initialized);
+
+  messenger->add_dispatcher_tail(objecter);
+  messenger->add_dispatcher_tail(this);
+
+  monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
+  int r = monclient->init();
+  if (r < 0) {
+    // need to do cleanup because we're in an intermediate init state
+    timer.shutdown();
+    client_lock.Unlock();
+    objecter->shutdown();
+    objectcacher->stop();
+    monclient->shutdown();
+    return r;
+  }
+  objecter->start();
+
+  client_lock.Unlock();
+  _finish_init();
+
+  return 0;
+}
+
+void StandaloneClient::shutdown()
+{
+  Client::shutdown();
+  objecter->shutdown();
+  monclient->shutdown();
+}
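
StandaloneClient::init illustrates a staged bring-up: subsystems start in dependency order, and a failure partway through (here, monclient->init) unwinds only what was already started, roughly in reverse; shutdown then mirrors init. A reduced sketch of that error-path discipline; Subsys is a hypothetical stand-in for timer/objectcacher/monclient:

#include <cstdio>

struct Subsys {
  const char *name;
  bool ok;  // whether start() will succeed
  bool start() {
    std::printf("start %s: %s\n", name, ok ? "ok" : "fail");
    return ok;
  }
  void stop() { std::printf("stop %s\n", name); }
};

// Start a, b, c in order; on failure, stop what already started, in reverse.
int bring_up(Subsys& a, Subsys& b, Subsys& c) {
  a.start();           // assumed infallible, like timer.init()
  b.start();           // assumed infallible, like objectcacher->start()
  if (!c.start()) {    // fallible, like monclient->init()
    b.stop();
    a.stop();
    return -1;
  }
  return 0;
}

int main() {
  Subsys timer{"timer", true}, cacher{"objectcacher", true}, mon{"monclient", false};
  return bring_up(timer, cacher, mon) == -1 ? 0 : 1;
}
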