// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#include <sys/types.h>
#include <sys/param.h>
#include <sys/utsname.h>

#include <boost/lexical_cast.hpp>
#include <boost/fusion/include/std_pair.hpp>

#if defined(__FreeBSD__)
#define XATTR_CREATE  0x1
#define XATTR_REPLACE 0x2
#else
#include <sys/xattr.h>
#endif

#if defined(__linux__)
#include <linux/falloc.h>
#endif

#include <sys/statvfs.h>
44 #include "common/config.h"
45 #include "common/version.h"
48 #include "messages/MClientSession.h"
49 #include "messages/MClientReconnect.h"
50 #include "messages/MClientRequest.h"
51 #include "messages/MClientRequestForward.h"
52 #include "messages/MClientReply.h"
53 #include "messages/MClientCaps.h"
54 #include "messages/MClientLease.h"
55 #include "messages/MClientSnap.h"
56 #include "messages/MCommandReply.h"
57 #include "messages/MOSDMap.h"
58 #include "messages/MClientQuota.h"
59 #include "messages/MClientCapRelease.h"
60 #include "messages/MMDSMap.h"
61 #include "messages/MFSMap.h"
62 #include "messages/MFSMapUser.h"
64 #include "mon/MonClient.h"
66 #include "mds/flock.h"
67 #include "osd/OSDMap.h"
68 #include "osdc/Filer.h"
70 #include "common/Cond.h"
71 #include "common/Mutex.h"
72 #include "common/perf_counters.h"
73 #include "common/admin_socket.h"
74 #include "common/errno.h"
75 #include "include/str_list.h"
77 #define dout_subsys ceph_subsys_client
79 #include "include/lru.h"
80 #include "include/compat.h"
81 #include "include/stringify.h"
87 #include "ClientSnapRealm.h"
89 #include "MetaSession.h"
90 #include "MetaRequest.h"
91 #include "ObjecterWriteback.h"
92 #include "posix_acl.h"
94 #include "include/assert.h"
95 #include "include/stat.h"
97 #include "include/cephfs/ceph_statx.h"
#undef dout_prefix
#define dout_prefix *_dout << "client." << whoami << " "

#define tout(cct) if (!cct->_conf->client_trace.empty()) traceout

// FreeBSD fails to define this
#ifndef O_DSYNC
#define O_DSYNC 0x0
#endif

// Darwin fails to define this
#ifndef O_RSYNC
#define O_RSYNC 0x0
#endif

#define DEBUG_GETATTR_CAPS (CEPH_CAP_XATTR_SHARED)
void client_flush_set_callback(void *p, ObjectCacher::ObjectSet *oset)
{
  Client *client = static_cast<Client*>(p);
  client->flush_set_callback(oset);
}
Client::CommandHook::CommandHook(Client *client) :
  m_client(client)
{
}
bool Client::CommandHook::call(std::string command, cmdmap_t& cmdmap,
                               std::string format, bufferlist& out)
{
  Formatter *f = Formatter::create(format);
  f->open_object_section("result");
  m_client->client_lock.Lock();
  if (command == "mds_requests")
    m_client->dump_mds_requests(f);
  else if (command == "mds_sessions")
    m_client->dump_mds_sessions(f);
  else if (command == "dump_cache")
    m_client->dump_cache(f);
  else if (command == "kick_stale_sessions")
    m_client->_kick_stale_sessions();
  else if (command == "status")
    m_client->dump_status(f);
  else
    assert(0 == "bad command registered");
  m_client->client_lock.Unlock();
  f->close_section();
  f->flush(out);
  delete f;
  return true;
}
dir_result_t::dir_result_t(Inode *in, const UserPerm& perms)
  : inode(in), offset(0), next_offset(2),
    release_count(0), ordered_count(0), cache_index(0), start_shared_gen(0),
    perms(perms)
{ }

void Client::_reset_faked_inos()
{
  ino_t start = 1024;
  free_faked_inos.clear();
  free_faked_inos.insert(start, (uint32_t)-1 - start + 1);
  last_used_faked_ino = 0;
  _use_faked_inos = sizeof(ino_t) < 8 || cct->_conf->client_use_faked_inos;
}
void Client::_assign_faked_ino(Inode *in)
{
  interval_set<ino_t>::const_iterator it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  if (it == free_faked_inos.end() && last_used_faked_ino > 0) {
    last_used_faked_ino = 0;
    it = free_faked_inos.lower_bound(last_used_faked_ino + 1);
  }
  assert(it != free_faked_inos.end());
  if (last_used_faked_ino < it.get_start()) {
    assert(it.get_len() > 0);
    last_used_faked_ino = it.get_start();
  } else {
    ++last_used_faked_ino;
    assert(it.get_start() + it.get_len() > last_used_faked_ino);
  }
  in->faked_ino = last_used_faked_ino;
  free_faked_inos.erase(in->faked_ino);
  faked_ino_map[in->faked_ino] = in->vino();
}
void Client::_release_faked_ino(Inode *in)
{
  free_faked_inos.insert(in->faked_ino);
  faked_ino_map.erase(in->faked_ino);
}

vinodeno_t Client::_map_faked_ino(ino_t ino)
{
  vinodeno_t vino;
  if (ino == root->faked_ino)
    vino = root->vino();
  else if (faked_ino_map.count(ino))
    vino = faked_ino_map[ino];
  else
    vino = vinodeno_t(0, CEPH_NOSNAP);
  ldout(cct, 10) << "map_faked_ino " << ino << " -> " << vino << dendl;
  return vino;
}

vinodeno_t Client::map_faked_ino(ino_t ino)
{
  Mutex::Locker lock(client_lock);
  return _map_faked_ino(ino);
}
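// Illustrative sketch (not part of the original flow): how the faked-ino
// indirection above fits together on platforms where ino_t is 32 bits, or
// when client_use_faked_inos is set:
//
//   Inode *in = ...;                              // some newly added inode
//   _assign_faked_ino(in);                        // grabs the next free 32-bit ino
//   vinodeno_t v = map_faked_ino(in->faked_ino);  // == in->vino()
//   _release_faked_ino(in);                       // returns the ino to free_faked_inos
//
// Callers outside the client only ever see the faked ino; it is translated
// back to the real vinodeno_t before any MDS communication.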
229 Client::Client(Messenger *m, MonClient *mc, Objecter *objecter_)
230 : Dispatcher(m->cct),
231 m_command_hook(this),
232 timer(m->cct, client_lock),
233 callback_handle(NULL),
234 switch_interrupt_cb(NULL),
236 ino_invalidate_cb(NULL),
237 dentry_invalidate_cb(NULL),
240 can_invalidate_dentries(false),
241 require_remount(false),
242 async_ino_invalidator(m->cct),
243 async_dentry_invalidator(m->cct),
244 interrupt_finisher(m->cct),
245 remount_finisher(m->cct),
246 objecter_finisher(m->cct),
248 messenger(m), monclient(mc),
250 whoami(mc->get_global_id()), cap_epoch_barrier(0),
251 last_tid(0), oldest_tid(0), last_flush_tid(1),
253 mounted(false), unmounting(false), blacklisted(false),
254 local_osd(-1), local_osd_epoch(0),
255 unsafe_sync_write(0),
256 client_lock("Client::client_lock")
262 num_flushing_caps = 0;
264 _dir_vxattrs_name_size = _vxattrs_calcu_name_size(_dir_vxattrs);
265 _file_vxattrs_name_size = _vxattrs_calcu_name_size(_file_vxattrs);
267 user_id = cct->_conf->client_mount_uid;
268 group_id = cct->_conf->client_mount_gid;
271 if (cct->_conf->client_acl_type == "posix_acl")
272 acl_type = POSIX_ACL;
274 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
277 free_fd_set.insert(10, 1<<30);
279 mdsmap.reset(new MDSMap);
282 writeback_handler.reset(new ObjecterWriteback(objecter, &objecter_finisher,
284 objectcacher.reset(new ObjectCacher(cct, "libcephfs", *writeback_handler, client_lock,
285 client_flush_set_callback, // all commit callback
287 cct->_conf->client_oc_size,
288 cct->_conf->client_oc_max_objects,
289 cct->_conf->client_oc_max_dirty,
290 cct->_conf->client_oc_target_dirty,
291 cct->_conf->client_oc_max_dirty_age,
293 objecter_finisher.start();
294 filer.reset(new Filer(objecter, &objecter_finisher));
295 objecter->enable_blacklist_events();
}

Client::~Client()
{
  assert(!client_lock.is_locked());

  // It is necessary to hold client_lock, because any inode destruction
  // may call into ObjectCacher, which asserts that its lock (which is
  // client_lock) is held.
  client_lock.Lock();
  tear_down_cache();
  client_lock.Unlock();
}
311 void Client::tear_down_cache()
314 for (ceph::unordered_map<int, Fh*>::iterator it = fd_map.begin();
318 ldout(cct, 1) << "tear_down_cache forcing close of fh " << it->first << " ino " << fh->inode->ino << dendl;
323 while (!opened_dirs.empty()) {
324 dir_result_t *dirp = *opened_dirs.begin();
325 ldout(cct, 1) << "tear_down_cache forcing close of dir " << dirp << " ino " << dirp->inode->ino << dendl;
334 assert(lru.lru_get_size() == 0);
337 assert(inode_map.size() <= 1 + root_parents.size());
338 if (root && inode_map.size() == 1 + root_parents.size()) {
342 while (!root_parents.empty())
343 root_parents.erase(root_parents.begin());
348 assert(inode_map.empty());
351 inodeno_t Client::get_root_ino()
353 Mutex::Locker l(client_lock);
354 if (use_faked_inos())
355 return root->faked_ino;
360 Inode *Client::get_root()
362 Mutex::Locker l(client_lock);
370 void Client::dump_inode(Formatter *f, Inode *in, set<Inode*>& did, bool disconnected)
373 in->make_long_path(path);
374 ldout(cct, 1) << "dump_inode: "
375 << (disconnected ? "DISCONNECTED ":"")
376 << "inode " << in->ino
378 << " ref " << in->get_num_ref()
382 f->open_object_section("inode");
383 f->dump_stream("path") << path;
385 f->dump_int("disconnected", 1);
392 ldout(cct, 1) << " dir " << in->dir << " size " << in->dir->dentries.size() << dendl;
393 for (ceph::unordered_map<string, Dentry*>::iterator it = in->dir->dentries.begin();
394 it != in->dir->dentries.end();
396 ldout(cct, 1) << " " << in->ino << " dn " << it->first << " " << it->second << " ref " << it->second->ref << dendl;
398 f->open_object_section("dentry");
402 if (it->second->inode)
403 dump_inode(f, it->second->inode.get(), did, false);
408 void Client::dump_cache(Formatter *f)
412 ldout(cct, 1) << "dump_cache" << dendl;
415 f->open_array_section("cache");
418 dump_inode(f, root, did, true);
420 // make a second pass to catch anything disconnected
421 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
422 it != inode_map.end();
424 if (did.count(it->second))
426 dump_inode(f, it->second, did, true);
433 void Client::dump_status(Formatter *f)
435 assert(client_lock.is_locked_by_me());
437 ldout(cct, 1) << __func__ << dendl;
439 const epoch_t osd_epoch
440 = objecter->with_osdmap(std::mem_fn(&OSDMap::get_epoch));
443 f->open_object_section("metadata");
444 for (const auto& kv : metadata)
445 f->dump_string(kv.first.c_str(), kv.second);
448 f->dump_int("dentry_count", lru.lru_get_size());
449 f->dump_int("dentry_pinned_count", lru.lru_get_num_pinned());
450 f->dump_int("id", get_nodeid().v);
451 f->dump_int("inode_count", inode_map.size());
452 f->dump_int("mds_epoch", mdsmap->get_epoch());
453 f->dump_int("osd_epoch", osd_epoch);
454 f->dump_int("osd_epoch_barrier", cap_epoch_barrier);
461 objectcacher->start();
464 assert(!initialized);
466 messenger->add_dispatcher_tail(this);
467 client_lock.Unlock();
473 void Client::_finish_init()
477 PerfCountersBuilder plb(cct, "client", l_c_first, l_c_last);
478 plb.add_time_avg(l_c_reply, "reply", "Latency of receiving a reply on metadata request");
479 plb.add_time_avg(l_c_lat, "lat", "Latency of processing a metadata request");
480 plb.add_time_avg(l_c_wrlat, "wrlat", "Latency of a file data write operation");
481 logger.reset(plb.create_perf_counters());
482 cct->get_perfcounters_collection()->add(logger.get());
484 client_lock.Unlock();
486 cct->_conf->add_observer(this);
488 AdminSocket* admin_socket = cct->get_admin_socket();
489 int ret = admin_socket->register_command("mds_requests",
492 "show in-progress mds requests");
494 lderr(cct) << "error registering admin socket command: "
495 << cpp_strerror(-ret) << dendl;
497 ret = admin_socket->register_command("mds_sessions",
500 "show mds session state");
502 lderr(cct) << "error registering admin socket command: "
503 << cpp_strerror(-ret) << dendl;
505 ret = admin_socket->register_command("dump_cache",
508 "show in-memory metadata cache contents");
510 lderr(cct) << "error registering admin socket command: "
511 << cpp_strerror(-ret) << dendl;
513 ret = admin_socket->register_command("kick_stale_sessions",
514 "kick_stale_sessions",
516 "kick sessions that were remote reset");
518 lderr(cct) << "error registering admin socket command: "
519 << cpp_strerror(-ret) << dendl;
521 ret = admin_socket->register_command("status",
524 "show overall client status");
526 lderr(cct) << "error registering admin socket command: "
527 << cpp_strerror(-ret) << dendl;
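  // These commands are served over the client's admin socket. For example
  // (the path is illustrative; the actual asok location depends on the
  // configuration):
  //
  //   ceph daemon /var/run/ceph/ceph-client.<id>.asok mds_sessions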
532 client_lock.Unlock();
535 void Client::shutdown()
537 ldout(cct, 1) << "shutdown" << dendl;
539 // If we were not mounted, but were being used for sending
540 // MDS commands, we may have sessions that need closing.
543 client_lock.Unlock();
545 cct->_conf->remove_observer(this);
547 AdminSocket* admin_socket = cct->get_admin_socket();
548 admin_socket->unregister_command("mds_requests");
549 admin_socket->unregister_command("mds_sessions");
550 admin_socket->unregister_command("dump_cache");
551 admin_socket->unregister_command("kick_stale_sessions");
552 admin_socket->unregister_command("status");
554 if (ino_invalidate_cb) {
555 ldout(cct, 10) << "shutdown stopping cache invalidator finisher" << dendl;
556 async_ino_invalidator.wait_for_empty();
557 async_ino_invalidator.stop();
560 if (dentry_invalidate_cb) {
561 ldout(cct, 10) << "shutdown stopping dentry invalidator finisher" << dendl;
562 async_dentry_invalidator.wait_for_empty();
563 async_dentry_invalidator.stop();
566 if (switch_interrupt_cb) {
567 ldout(cct, 10) << "shutdown stopping interrupt finisher" << dendl;
568 interrupt_finisher.wait_for_empty();
569 interrupt_finisher.stop();
573 ldout(cct, 10) << "shutdown stopping remount finisher" << dendl;
574 remount_finisher.wait_for_empty();
575 remount_finisher.stop();
578 objectcacher->stop(); // outside of client_lock! this does a join.
584 client_lock.Unlock();
586 objecter_finisher.wait_for_empty();
587 objecter_finisher.stop();
590 cct->get_perfcounters_collection()->remove(logger.get());
596 // ===================
597 // metadata cache stuff
599 void Client::trim_cache(bool trim_kernel_dcache)
601 uint64_t max = cct->_conf->client_cache_size;
602 ldout(cct, 20) << "trim_cache size " << lru.lru_get_size() << " max " << max << dendl;
604 while (lru.lru_get_size() != last) {
605 last = lru.lru_get_size();
607 if (!unmounting && lru.lru_get_size() <= max) break;
610 Dentry *dn = static_cast<Dentry*>(lru.lru_get_next_expire());
617 if (trim_kernel_dcache && lru.lru_get_size() > max)
618 _invalidate_kernel_dcache();
621 if (lru.lru_get_size() == 0 && root && root->get_num_ref() == 0 && inode_map.size() == 1 + root_parents.size()) {
622 ldout(cct, 15) << "trim_cache trimmed root " << root << dendl;
626 while (!root_parents.empty())
627 root_parents.erase(root_parents.begin());
633 void Client::trim_cache_for_reconnect(MetaSession *s)
635 mds_rank_t mds = s->mds_num;
636 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds << dendl;
639 list<Dentry*> skipped;
640 while (lru.lru_get_size() > 0) {
641 Dentry *dn = static_cast<Dentry*>(lru.lru_expire());
645 if ((dn->inode && dn->inode->caps.count(mds)) ||
646 dn->dir->parent_inode->caps.count(mds)) {
650 skipped.push_back(dn);
653 for(list<Dentry*>::iterator p = skipped.begin(); p != skipped.end(); ++p)
654 lru.lru_insert_mid(*p);
656 ldout(cct, 20) << "trim_cache_for_reconnect mds." << mds
657 << " trimmed " << trimmed << " dentries" << dendl;
659 if (s->caps.size() > 0)
660 _invalidate_kernel_dcache();
663 void Client::trim_dentry(Dentry *dn)
665 ldout(cct, 15) << "trim_dentry unlinking dn " << dn->name
666 << " in dir " << hex << dn->dir->parent_inode->ino
669 Inode *diri = dn->dir->parent_inode;
670 diri->dir_release_count++;
671 clear_dir_complete_and_ordered(diri, true);
673 unlink(dn, false, false); // drop dir, drop dentry
void Client::update_inode_file_bits(Inode *in,
                                    uint64_t truncate_seq, uint64_t truncate_size,
                                    uint64_t size, uint64_t change_attr,
                                    uint64_t time_warp_seq, utime_t ctime,
                                    utime_t mtime, utime_t atime,
                                    version_t inline_version,
                                    bufferlist& inline_data,
                                    int issued)
{
688 ldout(cct, 10) << "update_inode_file_bits " << *in << " " << ccap_string(issued)
689 << " mtime " << mtime << dendl;
690 ldout(cct, 25) << "truncate_seq: mds " << truncate_seq << " local "
691 << in->truncate_seq << " time_warp_seq: mds " << time_warp_seq
692 << " local " << in->time_warp_seq << dendl;
693 uint64_t prior_size = in->size;
695 if (inline_version > in->inline_version) {
696 in->inline_data = inline_data;
697 in->inline_version = inline_version;
700 /* always take a newer change attr */
701 if (change_attr > in->change_attr)
702 in->change_attr = change_attr;
704 if (truncate_seq > in->truncate_seq ||
705 (truncate_seq == in->truncate_seq && size > in->size)) {
706 ldout(cct, 10) << "size " << in->size << " -> " << size << dendl;
708 in->reported_size = size;
709 if (truncate_seq != in->truncate_seq) {
710 ldout(cct, 10) << "truncate_seq " << in->truncate_seq << " -> "
711 << truncate_seq << dendl;
712 in->truncate_seq = truncate_seq;
713 in->oset.truncate_seq = truncate_seq;
715 // truncate cached file data
716 if (prior_size > size) {
717 _invalidate_inode_cache(in, truncate_size, prior_size - truncate_size);
721 // truncate inline data
722 if (in->inline_version < CEPH_INLINE_NONE) {
723 uint32_t len = in->inline_data.length();
725 in->inline_data.splice(size, len - size);
728 if (truncate_seq >= in->truncate_seq &&
729 in->truncate_size != truncate_size) {
731 ldout(cct, 10) << "truncate_size " << in->truncate_size << " -> "
732 << truncate_size << dendl;
733 in->truncate_size = truncate_size;
734 in->oset.truncate_size = truncate_size;
736 ldout(cct, 0) << "Hmmm, truncate_seq && truncate_size changed on non-file inode!" << dendl;
740 // be careful with size, mtime, atime
741 if (issued & (CEPH_CAP_FILE_EXCL|
743 CEPH_CAP_FILE_BUFFER|
745 CEPH_CAP_XATTR_EXCL)) {
746 ldout(cct, 30) << "Yay have enough caps to look at our times" << dendl;
747 if (ctime > in->ctime)
749 if (time_warp_seq > in->time_warp_seq) {
750 ldout(cct, 10) << "mds time_warp_seq " << time_warp_seq << " on inode " << *in
751 << " is higher than local time_warp_seq "
752 << in->time_warp_seq << dendl;
753 //the mds updated times, so take those!
756 in->time_warp_seq = time_warp_seq;
757 } else if (time_warp_seq == in->time_warp_seq) {
759 if (mtime > in->mtime)
761 if (atime > in->atime)
763 } else if (issued & CEPH_CAP_FILE_EXCL) {
764 //ignore mds values as we have a higher seq
767 ldout(cct, 30) << "Don't have enough caps, just taking mds' time values" << dendl;
768 if (time_warp_seq >= in->time_warp_seq) {
772 in->time_warp_seq = time_warp_seq;
776 ldout(cct, 0) << "WARNING: " << *in << " mds time_warp_seq "
777 << time_warp_seq << " is lower than local time_warp_seq "
783 void Client::_fragmap_remove_non_leaves(Inode *in)
785 for (map<frag_t,int>::iterator p = in->fragmap.begin(); p != in->fragmap.end(); )
786 if (!in->dirfragtree.is_leaf(p->first))
787 in->fragmap.erase(p++);
792 void Client::_fragmap_remove_stopped_mds(Inode *in, mds_rank_t mds)
794 for (auto p = in->fragmap.begin(); p != in->fragmap.end(); )
795 if (p->second == mds)
796 in->fragmap.erase(p++);
801 Inode * Client::add_update_inode(InodeStat *st, utime_t from,
802 MetaSession *session,
803 const UserPerm& request_perms)
806 bool was_new = false;
807 if (inode_map.count(st->vino)) {
808 in = inode_map[st->vino];
809 ldout(cct, 12) << "add_update_inode had " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
811 in = new Inode(this, st->vino, &st->layout);
812 inode_map[st->vino] = in;
814 if (use_faked_inos())
815 _assign_faked_ino(in);
821 } else if (!mounted) {
822 root_parents[root_ancestor] = in;
827 in->ino = st->vino.ino;
828 in->snapid = st->vino.snapid;
829 in->mode = st->mode & S_IFMT;
834 if (in->is_symlink())
835 in->symlink = st->symlink;
838 ldout(cct, 12) << "add_update_inode adding " << *in << " caps " << ccap_string(st->cap.caps) << dendl;
    return in;   // as with readdir returning inodes in different snaprealms (no caps!)
843 // only update inode if mds info is strictly newer, or it is the same and projected (odd).
844 bool updating_inode = false;
846 if (st->version == 0 ||
847 (in->version & ~1) < st->version) {
848 updating_inode = true;
851 issued = in->caps_issued(&implemented) | in->caps_dirty();
852 issued |= implemented;
854 in->version = st->version;
856 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
860 in->btime = st->btime;
863 if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
864 in->nlink = st->nlink;
867 in->dirstat = st->dirstat;
868 in->rstat = st->rstat;
869 in->quota = st->quota;
870 in->layout = st->layout;
873 in->dir_layout = st->dir_layout;
874 ldout(cct, 20) << " dir hash is " << (int)in->dir_layout.dl_dir_hash << dendl;
877 update_inode_file_bits(in, st->truncate_seq, st->truncate_size, st->size,
878 st->change_attr, st->time_warp_seq, st->ctime,
879 st->mtime, st->atime, st->inline_version,
880 st->inline_data, issued);
881 } else if (st->inline_version > in->inline_version) {
882 in->inline_data = st->inline_data;
883 in->inline_version = st->inline_version;
886 if ((in->xattr_version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) &&
887 st->xattrbl.length() &&
888 st->xattr_version > in->xattr_version) {
889 bufferlist::iterator p = st->xattrbl.begin();
890 ::decode(in->xattrs, p);
891 in->xattr_version = st->xattr_version;
894 // move me if/when version reflects fragtree changes.
895 if (in->dirfragtree != st->dirfragtree) {
896 in->dirfragtree = st->dirfragtree;
897 _fragmap_remove_non_leaves(in);
900 if (in->snapid == CEPH_NOSNAP) {
901 add_update_cap(in, session, st->cap.cap_id, st->cap.caps, st->cap.seq,
902 st->cap.mseq, inodeno_t(st->cap.realm), st->cap.flags,
904 if (in->auth_cap && in->auth_cap->session == session)
905 in->max_size = st->max_size;
907 in->snap_caps |= st->cap.caps;
909 // setting I_COMPLETE needs to happen after adding the cap
910 if (updating_inode &&
912 (st->cap.caps & CEPH_CAP_FILE_SHARED) &&
913 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
914 in->dirstat.nfiles == 0 &&
915 in->dirstat.nsubdirs == 0) {
916 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on empty dir " << *in << dendl;
917 in->flags |= I_COMPLETE | I_DIR_ORDERED;
919 ldout(cct, 10) << " dir is open on empty dir " << in->ino << " with "
920 << in->dir->dentries.size() << " entries, marking all dentries null" << dendl;
921 in->dir->readdir_cache.clear();
922 for (auto p = in->dir->dentries.begin();
923 p != in->dir->dentries.end();
925 unlink(p->second, true, true); // keep dir, keep dentry
927 if (in->dir->dentries.empty())
/**
 * insert_dentry_inode - insert + link a single dentry + inode into the
 * metadata cache.
 */
939 Dentry *Client::insert_dentry_inode(Dir *dir, const string& dname, LeaseStat *dlease,
940 Inode *in, utime_t from, MetaSession *session,
944 if (dir->dentries.count(dname))
945 dn = dir->dentries[dname];
947 ldout(cct, 12) << "insert_dentry_inode '" << dname << "' vino " << in->vino()
948 << " in dir " << dir->parent_inode->vino() << " dn " << dn
951 if (dn && dn->inode) {
952 if (dn->inode->vino() == in->vino()) {
954 ldout(cct, 12) << " had dentry " << dname
955 << " with correct vino " << dn->inode->vino()
958 ldout(cct, 12) << " had dentry " << dname
959 << " with WRONG vino " << dn->inode->vino()
961 unlink(dn, true, true); // keep dir, keep dentry
965 if (!dn || !dn->inode) {
966 InodeRef tmp_ref(in);
968 if (old_dentry->dir != dir) {
969 Inode *old_diri = old_dentry->dir->parent_inode;
970 old_diri->dir_ordered_count++;
971 clear_dir_complete_and_ordered(old_diri, false);
      unlink(old_dentry, dir == old_dentry->dir, false); // drop dentry, keep dir open if it's the same dir
975 Inode *diri = dir->parent_inode;
976 diri->dir_ordered_count++;
977 clear_dir_complete_and_ordered(diri, false);
978 dn = link(dir, dname, in, dn);
981 update_dentry_lease(dn, dlease, from, session);
985 void Client::update_dentry_lease(Dentry *dn, LeaseStat *dlease, utime_t from, MetaSession *session)
988 dttl += (float)dlease->duration_ms / 1000.0;
992 if (dlease->mask & CEPH_LOCK_DN) {
993 if (dttl > dn->lease_ttl) {
994 ldout(cct, 10) << "got dentry lease on " << dn->name
995 << " dur " << dlease->duration_ms << "ms ttl " << dttl << dendl;
996 dn->lease_ttl = dttl;
997 dn->lease_mds = session->mds_num;
998 dn->lease_seq = dlease->seq;
999 dn->lease_gen = session->cap_gen;
1002 dn->cap_shared_gen = dn->dir->parent_inode->shared_gen;
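  // Summary of the lease handling above: a CEPH_LOCK_DN lease lets us trust
  // this name -> inode mapping until lease_ttl, provided lease_gen still
  // matches the session's cap_gen when the dentry is checked. Without a
  // dentry lease we fall back to the parent directory's shared_gen
  // (recorded here as cap_shared_gen), i.e. the dentry stays usable while
  // our FILE_SHARED caps on the directory remain valid.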
/*
 * update MDS location cache for a single inode
 */
1009 void Client::update_dir_dist(Inode *in, DirStat *dst)
1012 ldout(cct, 20) << "got dirfrag map for " << in->ino << " frag " << dst->frag << " to mds " << dst->auth << dendl;
1013 if (dst->auth >= 0) {
1014 in->fragmap[dst->frag] = dst->auth;
1016 in->fragmap.erase(dst->frag);
1018 if (!in->dirfragtree.is_leaf(dst->frag)) {
1019 in->dirfragtree.force_to_leaf(cct, dst->frag);
1020 _fragmap_remove_non_leaves(in);
1024 in->dir_replicated = !dst->dist.empty(); // FIXME that's just one frag!
  /*
  if (!st->dirfrag_dist.empty()) {   // FIXME
    set<int> dist = st->dirfrag_dist.begin()->second;
    if (dist.empty() && !in->dir_contacts.empty())
      ldout(cct, 9) << "lost dist spec for " << in->ino
                    << " " << dist << dendl;
    if (!dist.empty() && in->dir_contacts.empty())
      ldout(cct, 9) << "got dist spec for " << in->ino
                    << " " << dist << dendl;
    in->dir_contacts = dist;
  }
  */
}
1041 void Client::clear_dir_complete_and_ordered(Inode *diri, bool complete)
1043 if (diri->flags & I_COMPLETE) {
1045 ldout(cct, 10) << " clearing (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
1046 diri->flags &= ~(I_COMPLETE | I_DIR_ORDERED);
1048 if (diri->flags & I_DIR_ORDERED) {
1049 ldout(cct, 10) << " clearing I_DIR_ORDERED on " << *diri << dendl;
1050 diri->flags &= ~I_DIR_ORDERED;
1054 diri->dir->readdir_cache.clear();
/*
 * insert results from readdir or lssnap into the metadata cache.
 */
1061 void Client::insert_readdir_results(MetaRequest *request, MetaSession *session, Inode *diri) {
1063 MClientReply *reply = request->reply;
1064 ConnectionRef con = request->reply->get_connection();
1065 uint64_t features = con->get_features();
1067 dir_result_t *dirp = request->dirp;
1070 // the extra buffer list is only set for readdir and lssnap replies
1071 bufferlist::iterator p = reply->get_extra_bl().begin();
1074 if (request->head.op == CEPH_MDS_OP_LSSNAP) {
1076 diri = open_snapdir(diri);
1079 // only open dir if we're actually adding stuff to it!
1080 Dir *dir = diri->open_dir();
1090 bool end = ((unsigned)flags & CEPH_READDIR_FRAG_END);
1091 bool hash_order = ((unsigned)flags & CEPH_READDIR_HASH_ORDER);
1093 frag_t fg = (unsigned)request->head.args.readdir.frag;
1094 unsigned readdir_offset = dirp->next_offset;
1095 string readdir_start = dirp->last_name;
1096 assert(!readdir_start.empty() || readdir_offset == 2);
1098 unsigned last_hash = 0;
1100 if (!readdir_start.empty()) {
1101 last_hash = ceph_frag_value(diri->hash_dentry_name(readdir_start));
1102 } else if (flags & CEPH_READDIR_OFFSET_HASH) {
1103 /* mds understands offset_hash */
1104 last_hash = (unsigned)request->head.args.readdir.offset_hash;
1108 if (fg != dst.frag) {
1109 ldout(cct, 10) << "insert_trace got new frag " << fg << " -> " << dst.frag << dendl;
1113 readdir_start.clear();
1114 dirp->offset = dir_result_t::make_fpos(fg, readdir_offset, false);
1118 ldout(cct, 10) << __func__ << " " << numdn << " readdir items, end=" << end
1119 << ", hash_order=" << hash_order
1120 << ", readdir_start " << readdir_start
1121 << ", last_hash " << last_hash
1122 << ", next_offset " << readdir_offset << dendl;
1124 if (diri->snapid != CEPH_SNAPDIR &&
1125 fg.is_leftmost() && readdir_offset == 2 &&
1126 !(hash_order && last_hash)) {
1127 dirp->release_count = diri->dir_release_count;
1128 dirp->ordered_count = diri->dir_ordered_count;
1129 dirp->start_shared_gen = diri->shared_gen;
1130 dirp->cache_index = 0;
1133 dirp->buffer_frag = fg;
1135 _readdir_drop_dirp_buffer(dirp);
1136 dirp->buffer.reserve(numdn);
1140 for (unsigned i=0; i<numdn; i++) {
1142 ::decode(dlease, p);
1143 InodeStat ist(p, features);
1145 ldout(cct, 15) << "" << i << ": '" << dname << "'" << dendl;
1147 Inode *in = add_update_inode(&ist, request->sent_stamp, session,
1150 if (diri->dir->dentries.count(dname)) {
1151 Dentry *olddn = diri->dir->dentries[dname];
1152 if (olddn->inode != in) {
1153 // replace incorrect dentry
1154 unlink(olddn, true, true); // keep dir, dentry
1155 dn = link(dir, dname, in, olddn);
1156 assert(dn == olddn);
1164 dn = link(dir, dname, in, NULL);
1167 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1169 unsigned hash = ceph_frag_value(diri->hash_dentry_name(dname));
1170 if (hash != last_hash)
1173 dn->offset = dir_result_t::make_fpos(hash, readdir_offset++, true);
1175 dn->offset = dir_result_t::make_fpos(fg, readdir_offset++, false);
1177 // add to readdir cache
1178 if (dirp->release_count == diri->dir_release_count &&
1179 dirp->ordered_count == diri->dir_ordered_count &&
1180 dirp->start_shared_gen == diri->shared_gen) {
1181 if (dirp->cache_index == dir->readdir_cache.size()) {
1183 assert(!dirp->inode->is_complete_and_ordered());
1184 dir->readdir_cache.reserve(dirp->cache_index + numdn);
1186 dir->readdir_cache.push_back(dn);
1187 } else if (dirp->cache_index < dir->readdir_cache.size()) {
1188 if (dirp->inode->is_complete_and_ordered())
1189 assert(dir->readdir_cache[dirp->cache_index] == dn);
1191 dir->readdir_cache[dirp->cache_index] = dn;
1193 assert(0 == "unexpected readdir buffer idx");
1195 dirp->cache_index++;
1197 // add to cached result list
1198 dirp->buffer.push_back(dir_result_t::dentry(dn->offset, dname, in));
1199 ldout(cct, 15) << __func__ << " " << hex << dn->offset << dec << ": '" << dname << "' -> " << in->ino << dendl;
1203 dirp->last_name = dname;
1205 dirp->next_offset = 2;
1207 dirp->next_offset = readdir_offset;
1209 if (dir->is_empty())
/*
 * insert a trace from an MDS reply into the cache.
 */
1218 Inode* Client::insert_trace(MetaRequest *request, MetaSession *session)
1220 MClientReply *reply = request->reply;
1221 int op = request->get_op();
1223 ldout(cct, 10) << "insert_trace from " << request->sent_stamp << " mds." << session->mds_num
1224 << " is_target=" << (int)reply->head.is_target
1225 << " is_dentry=" << (int)reply->head.is_dentry
1228 bufferlist::iterator p = reply->get_trace_bl().begin();
1229 if (request->got_unsafe) {
1230 ldout(cct, 10) << "insert_trace -- already got unsafe; ignoring" << dendl;
1236 ldout(cct, 10) << "insert_trace -- no trace" << dendl;
1238 Dentry *d = request->dentry();
1240 Inode *diri = d->dir->parent_inode;
1241 diri->dir_release_count++;
1242 clear_dir_complete_and_ordered(diri, true);
1245 if (d && reply->get_result() == 0) {
1246 if (op == CEPH_MDS_OP_RENAME) {
1248 Dentry *od = request->old_dentry();
1249 ldout(cct, 10) << " unlinking rename src dn " << od << " for traceless reply" << dendl;
1251 unlink(od, true, true); // keep dir, dentry
1252 } else if (op == CEPH_MDS_OP_RMDIR ||
1253 op == CEPH_MDS_OP_UNLINK) {
1255 ldout(cct, 10) << " unlinking unlink/rmdir dn " << d << " for traceless reply" << dendl;
1256 unlink(d, true, true); // keep dir, dentry
1262 ConnectionRef con = request->reply->get_connection();
1263 uint64_t features = con->get_features();
1264 ldout(cct, 10) << " features 0x" << hex << features << dec << dendl;
1267 SnapRealm *realm = NULL;
1268 if (reply->snapbl.length())
1269 update_snap_trace(reply->snapbl, &realm);
1271 ldout(cct, 10) << " hrm "
1272 << " is_target=" << (int)reply->head.is_target
1273 << " is_dentry=" << (int)reply->head.is_dentry
1282 if (reply->head.is_dentry) {
1283 dirst.decode(p, features);
1286 ::decode(dlease, p);
1290 if (reply->head.is_target) {
1291 ist.decode(p, features);
1292 if (cct->_conf->client_debug_getattr_caps) {
1293 unsigned wanted = 0;
1294 if (op == CEPH_MDS_OP_GETATTR || op == CEPH_MDS_OP_LOOKUP)
1295 wanted = request->head.args.getattr.mask;
1296 else if (op == CEPH_MDS_OP_OPEN || op == CEPH_MDS_OP_CREATE)
1297 wanted = request->head.args.open.mask;
1299 if ((wanted & CEPH_CAP_XATTR_SHARED) &&
1300 !(ist.xattr_version > 0 && ist.xattrbl.length() > 0))
1301 assert(0 == "MDS reply does not contain xattrs");
1304 in = add_update_inode(&ist, request->sent_stamp, session,
1309 if (reply->head.is_dentry) {
1310 diri = add_update_inode(&dirst, request->sent_stamp, session,
1312 update_dir_dist(diri, &dst); // dir stat info is attached to ..
1315 Dir *dir = diri->open_dir();
1316 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session,
1317 (op == CEPH_MDS_OP_RENAME) ? request->old_dentry() : NULL);
1320 if (diri->dir && diri->dir->dentries.count(dname)) {
1321 dn = diri->dir->dentries[dname];
1323 diri->dir_ordered_count++;
1324 clear_dir_complete_and_ordered(diri, false);
1325 unlink(dn, true, true); // keep dir, dentry
1328 if (dlease.duration_ms > 0) {
1330 Dir *dir = diri->open_dir();
1331 dn = link(dir, dname, NULL, NULL);
1333 update_dentry_lease(dn, &dlease, request->sent_stamp, session);
1336 } else if (op == CEPH_MDS_OP_LOOKUPSNAP ||
1337 op == CEPH_MDS_OP_MKSNAP) {
1338 ldout(cct, 10) << " faking snap lookup weirdness" << dendl;
1339 // fake it for snap lookup
1340 vinodeno_t vino = ist.vino;
1341 vino.snapid = CEPH_SNAPDIR;
1342 assert(inode_map.count(vino));
1343 diri = inode_map[vino];
1345 string dname = request->path.last_dentry();
1348 dlease.duration_ms = 0;
1351 Dir *dir = diri->open_dir();
1352 insert_dentry_inode(dir, dname, &dlease, in, request->sent_stamp, session);
1354 if (diri->dir && diri->dir->dentries.count(dname)) {
1355 Dentry *dn = diri->dir->dentries[dname];
1357 unlink(dn, true, true); // keep dir, dentry
1363 if (op == CEPH_MDS_OP_READDIR ||
1364 op == CEPH_MDS_OP_LSSNAP) {
1365 insert_readdir_results(request, session, in);
1366 } else if (op == CEPH_MDS_OP_LOOKUPNAME) {
1367 // hack: return parent inode instead
1371 if (request->dentry() == NULL && in != request->inode()) {
1372 // pin the target inode if its parent dentry is not pinned
1373 request->set_other_inode(in);
1378 put_snap_realm(realm);
1380 request->target = in;
1386 mds_rank_t Client::choose_target_mds(MetaRequest *req, Inode** phash_diri)
1388 mds_rank_t mds = MDS_RANK_NONE;
1390 bool is_hash = false;
1396 if (req->resend_mds >= 0) {
1397 mds = req->resend_mds;
1398 req->resend_mds = -1;
1399 ldout(cct, 10) << "choose_target_mds resend_mds specified as mds." << mds << dendl;
1403 if (cct->_conf->client_use_random_mds)
1409 ldout(cct, 20) << "choose_target_mds starting with req->inode " << *in << dendl;
1410 if (req->path.depth()) {
1411 hash = in->hash_dentry_name(req->path[0]);
1412 ldout(cct, 20) << "choose_target_mds inode dir hash is " << (int)in->dir_layout.dl_dir_hash
1413 << " on " << req->path[0]
1414 << " => " << hash << dendl;
1419 in = de->inode.get();
1420 ldout(cct, 20) << "choose_target_mds starting with req->dentry inode " << *in << dendl;
1422 in = de->dir->parent_inode;
1423 hash = in->hash_dentry_name(de->name);
1424 ldout(cct, 20) << "choose_target_mds dentry dir hash is " << (int)in->dir_layout.dl_dir_hash
1425 << " on " << de->name
1426 << " => " << hash << dendl;
1431 if (in->snapid != CEPH_NOSNAP) {
1432 ldout(cct, 10) << "choose_target_mds " << *in << " is snapped, using nonsnap parent" << dendl;
1433 while (in->snapid != CEPH_NOSNAP) {
1434 if (in->snapid == CEPH_SNAPDIR)
1435 in = in->snapdir_parent.get();
1436 else if (!in->dn_set.empty())
1437 /* In most cases there will only be one dentry, so getting it
1438 * will be the correct action. If there are multiple hard links,
1439 * I think the MDS should be able to redirect as needed*/
1440 in = in->get_first_parent()->dir->parent_inode;
1442 ldout(cct, 10) << "got unlinked inode, can't look at parent" << dendl;
1449 ldout(cct, 20) << "choose_target_mds " << *in << " is_hash=" << is_hash
1450 << " hash=" << hash << dendl;
1452 if (is_hash && S_ISDIR(in->mode) && !in->fragmap.empty()) {
1453 frag_t fg = in->dirfragtree[hash];
1454 if (in->fragmap.count(fg)) {
1455 mds = in->fragmap[fg];
1458 ldout(cct, 10) << "choose_target_mds from dirfragtree hash" << dendl;
1463 if (req->auth_is_best())
1465 if (!cap && !in->caps.empty())
1466 cap = in->caps.begin()->second;
1469 mds = cap->session->mds_num;
1470 ldout(cct, 10) << "choose_target_mds from caps on inode " << *in << dendl;
1477 mds = _get_random_up_mds();
1478 ldout(cct, 10) << "did not get mds through better means, so chose random mds " << mds << dendl;
1482 ldout(cct, 20) << "mds is " << mds << dendl;
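// Rough order of preference implemented in choose_target_mds():
//   1. an explicit resend_mds (set by a forward or retry) wins outright;
//   2. otherwise walk from the request's inode/dentry up to a non-snapped
//      inode; if the operation is keyed by a dentry name and we have a
//      fragmap entry for dirfragtree[hash], use that dirfrag's auth mds;
//   3. otherwise use the mds of one of the inode's caps (the auth cap when
//      the request must go to the auth);
//   4. otherwise fall back to a random up mds.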
1487 void Client::connect_mds_targets(mds_rank_t mds)
1489 ldout(cct, 10) << "connect_mds_targets for mds." << mds << dendl;
1490 assert(mds_sessions.count(mds));
1491 const MDSMap::mds_info_t& info = mdsmap->get_mds_info(mds);
1492 for (set<mds_rank_t>::const_iterator q = info.export_targets.begin();
1493 q != info.export_targets.end();
1495 if (mds_sessions.count(*q) == 0 &&
1496 mdsmap->is_clientreplay_or_active_or_stopping(*q)) {
1497 ldout(cct, 10) << "check_mds_sessions opening mds." << mds
1498 << " export target mds." << *q << dendl;
1499 _open_mds_session(*q);
1504 void Client::dump_mds_sessions(Formatter *f)
1506 f->dump_int("id", get_nodeid().v);
1507 f->open_array_section("sessions");
1508 for (map<mds_rank_t,MetaSession*>::const_iterator p = mds_sessions.begin(); p != mds_sessions.end(); ++p) {
1509 f->open_object_section("session");
1514 f->dump_int("mdsmap_epoch", mdsmap->get_epoch());
1516 void Client::dump_mds_requests(Formatter *f)
1518 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
1519 p != mds_requests.end();
1521 f->open_object_section("request");
1527 int Client::verify_reply_trace(int r,
1528 MetaRequest *request, MClientReply *reply,
1529 InodeRef *ptarget, bool *pcreated,
1530 const UserPerm& perms)
1532 // check whether this request actually did the create, and set created flag
1533 bufferlist extra_bl;
1534 inodeno_t created_ino;
1535 bool got_created_ino = false;
1536 ceph::unordered_map<vinodeno_t, Inode*>::iterator p;
1538 extra_bl.claim(reply->get_extra_bl());
1539 if (extra_bl.length() >= 8) {
1540 // if the extra bufferlist has a buffer, we assume its the created inode
1541 // and that this request to create succeeded in actually creating
1542 // the inode (won the race with other create requests)
1543 ::decode(created_ino, extra_bl);
1544 got_created_ino = true;
1545 ldout(cct, 10) << "make_request created ino " << created_ino << dendl;
1549 *pcreated = got_created_ino;
1551 if (request->target) {
1552 *ptarget = request->target;
1553 ldout(cct, 20) << "make_request target is " << *ptarget->get() << dendl;
1555 if (got_created_ino && (p = inode_map.find(vinodeno_t(created_ino, CEPH_NOSNAP))) != inode_map.end()) {
1556 (*ptarget) = p->second;
1557 ldout(cct, 20) << "make_request created, target is " << *ptarget->get() << dendl;
1559 // we got a traceless reply, and need to look up what we just
1560 // created. for now, do this by name. someday, do this by the
1561 // ino... which we know! FIXME.
1563 Dentry *d = request->dentry();
1566 ldout(cct, 10) << "make_request got traceless reply, looking up #"
1567 << d->dir->parent_inode->ino << "/" << d->name
1568 << " got_ino " << got_created_ino
1569 << " ino " << created_ino
1571 r = _do_lookup(d->dir->parent_inode, d->name, request->regetattr_mask,
1574 // if the dentry is not linked, just do our best. see #5021.
1575 assert(0 == "how did this happen? i want logs!");
1578 Inode *in = request->inode();
1579 ldout(cct, 10) << "make_request got traceless reply, forcing getattr on #"
1580 << in->ino << dendl;
1581 r = _getattr(in, request->regetattr_mask, perms, true);
1585 // verify ino returned in reply and trace_dist are the same
1586 if (got_created_ino &&
1587 created_ino.val != target->ino.val) {
1588 ldout(cct, 5) << "create got ino " << created_ino << " but then failed on lookup; EINTR?" << dendl;
1592 ptarget->swap(target);
/**
 * Blocking helper to make an MDS request.
 *
 * If the ptarget flag is set, behavior changes slightly: the caller
 * expects to get a pointer to the inode we are creating or operating
 * on. As a result, we will follow up any traceless mutation reply
 * with a getattr or lookup to transparently handle a traceless reply
 * from the MDS (as when the MDS restarts and the client has to replay
 * a request).
 *
 * @param request the MetaRequest to execute
 * @param perms The user uid/gid to execute as (eventually, full group lists?)
 * @param ptarget [optional] address to store a pointer to the target inode we want to create or operate on
 * @param pcreated [optional; required if ptarget] where to store a bool of whether our create atomically created a file
 * @param use_mds [optional] prefer a specific mds (-1 for default)
 * @param pdirbl [optional; disallowed if ptarget] where to pass extra reply payload to the caller
 */
1620 int Client::make_request(MetaRequest *request,
1621 const UserPerm& perms,
1622 InodeRef *ptarget, bool *pcreated,
1628 // assign a unique tid
1629 ceph_tid_t tid = ++last_tid;
1630 request->set_tid(tid);
1633 request->op_stamp = ceph_clock_now();
1636 mds_requests[tid] = request->get();
1637 if (oldest_tid == 0 && request->get_op() != CEPH_MDS_OP_SETFILELOCK)
1640 request->set_caller_perms(perms);
1642 if (cct->_conf->client_inject_fixed_oldest_tid) {
1643 ldout(cct, 20) << __func__ << " injecting fixed oldest_client_tid(1)" << dendl;
1644 request->set_oldest_client_tid(1);
1646 request->set_oldest_client_tid(oldest_tid);
1651 request->resend_mds = use_mds;
1654 if (request->aborted())
1658 request->abort(-EBLACKLISTED);
1664 request->caller_cond = &caller_cond;
1667 Inode *hash_diri = NULL;
1668 mds_rank_t mds = choose_target_mds(request, &hash_diri);
1669 int mds_state = (mds == MDS_RANK_NONE) ? MDSMap::STATE_NULL : mdsmap->get_state(mds);
1670 if (mds_state != MDSMap::STATE_ACTIVE && mds_state != MDSMap::STATE_STOPPING) {
1671 if (mds_state == MDSMap::STATE_NULL && mds >= mdsmap->get_max_mds()) {
1673 ldout(cct, 10) << " target mds." << mds << " has stopped, remove it from fragmap" << dendl;
1674 _fragmap_remove_stopped_mds(hash_diri, mds);
1676 ldout(cct, 10) << " target mds." << mds << " has stopped, trying a random mds" << dendl;
1677 request->resend_mds = _get_random_up_mds();
1680 ldout(cct, 10) << " target mds." << mds << " not active, waiting for new mdsmap" << dendl;
1681 wait_on_list(waiting_for_mdsmap);
1687 MetaSession *session = NULL;
1688 if (!have_open_session(mds)) {
1689 session = _get_or_open_mds_session(mds);
1692 if (session->state == MetaSession::STATE_OPENING) {
1693 ldout(cct, 10) << "waiting for session to mds." << mds << " to open" << dendl;
1694 wait_on_context_list(session->waiting_for_open);
1695 // Abort requests on REJECT from MDS
1696 if (rejected_by_mds.count(mds)) {
1697 request->abort(-EPERM);
1703 if (!have_open_session(mds))
1706 session = mds_sessions[mds];
1710 send_request(request, session);
1713 ldout(cct, 20) << "awaiting reply|forward|kick on " << &caller_cond << dendl;
1714 request->kick = false;
1715 while (!request->reply && // reply
1716 request->resend_mds < 0 && // forward
1718 caller_cond.Wait(client_lock);
1719 request->caller_cond = NULL;
1721 // did we get a reply?
1726 if (!request->reply) {
1727 assert(request->aborted());
1728 assert(!request->got_unsafe);
1729 r = request->get_abort_code();
1730 request->item.remove_myself();
1731 unregister_request(request);
1732 put_request(request); // ours
1737 MClientReply *reply = request->reply;
1738 request->reply = NULL;
1739 r = reply->get_result();
1741 request->success = true;
1743 // kick dispatcher (we've got it!)
1744 assert(request->dispatch_cond);
1745 request->dispatch_cond->Signal();
1746 ldout(cct, 20) << "sendrecv kickback on tid " << tid << " " << request->dispatch_cond << dendl;
1747 request->dispatch_cond = 0;
1749 if (r >= 0 && ptarget)
1750 r = verify_reply_trace(r, request, reply, ptarget, pcreated, perms);
1753 pdirbl->claim(reply->get_extra_bl());
1756 utime_t lat = ceph_clock_now();
1757 lat -= request->sent_stamp;
1758 ldout(cct, 20) << "lat " << lat << dendl;
1759 logger->tinc(l_c_lat, lat);
1760 logger->tinc(l_c_reply, lat);
1762 put_request(request);
1768 void Client::unregister_request(MetaRequest *req)
1770 mds_requests.erase(req->tid);
1771 if (req->tid == oldest_tid) {
1772 map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.upper_bound(oldest_tid);
1774 if (p == mds_requests.end()) {
1778 if (p->second->get_op() != CEPH_MDS_OP_SETFILELOCK) {
1779 oldest_tid = p->first;
1788 void Client::put_request(MetaRequest *request)
1790 if (request->_put()) {
1792 if (request->success)
1793 op = request->get_op();
1795 request->take_other_inode(&other_in);
1799 (op == CEPH_MDS_OP_RMDIR ||
1800 op == CEPH_MDS_OP_RENAME ||
1801 op == CEPH_MDS_OP_RMSNAP)) {
1802 _try_to_trim_inode(other_in.get(), false);
1807 int Client::encode_inode_release(Inode *in, MetaRequest *req,
1808 mds_rank_t mds, int drop,
1809 int unless, int force)
1811 ldout(cct, 20) << "encode_inode_release enter(in:" << *in << ", req:" << req
1812 << " mds:" << mds << ", drop:" << drop << ", unless:" << unless
1813 << ", have:" << ", force:" << force << ")" << dendl;
1815 if (in->caps.count(mds)) {
1816 Cap *caps = in->caps[mds];
1817 drop &= ~(in->dirty_caps | get_caps_used(in));
1818 if ((drop & caps->issued) &&
1819 !(unless & caps->issued)) {
1820 ldout(cct, 25) << "Dropping caps. Initial " << ccap_string(caps->issued) << dendl;
1821 caps->issued &= ~drop;
1822 caps->implemented &= ~drop;
1824 ldout(cct, 25) << "Now have: " << ccap_string(caps->issued) << dendl;
1829 ceph_mds_request_release rel;
1831 rel.cap_id = caps->cap_id;
1832 rel.seq = caps->seq;
1833 rel.issue_seq = caps->issue_seq;
1834 rel.mseq = caps->mseq;
1835 rel.caps = caps->implemented;
1836 rel.wanted = caps->wanted;
1839 req->cap_releases.push_back(MClientRequest::Release(rel,""));
1842 ldout(cct, 25) << "encode_inode_release exit(in:" << *in << ") released:"
1843 << released << dendl;
1847 void Client::encode_dentry_release(Dentry *dn, MetaRequest *req,
1848 mds_rank_t mds, int drop, int unless)
1850 ldout(cct, 20) << "encode_dentry_release enter(dn:"
1851 << dn << ")" << dendl;
1854 released = encode_inode_release(dn->dir->parent_inode, req,
1855 mds, drop, unless, 1);
1856 if (released && dn->lease_mds == mds) {
1857 ldout(cct, 25) << "preemptively releasing dn to mds" << dendl;
1858 MClientRequest::Release& rel = req->cap_releases.back();
1859 rel.item.dname_len = dn->name.length();
1860 rel.item.dname_seq = dn->lease_seq;
1861 rel.dname = dn->name;
1863 ldout(cct, 25) << "encode_dentry_release exit(dn:"
1864 << dn << ")" << dendl;
/*
 * This requires the MClientRequest *request member to be set.
 * It will error out horribly without one.
 * Additionally, if you set any *drop member, you'd better have
 * set the corresponding dentry!
 */
1874 void Client::encode_cap_releases(MetaRequest *req, mds_rank_t mds)
1876 ldout(cct, 20) << "encode_cap_releases enter (req: "
1877 << req << ", mds: " << mds << ")" << dendl;
1878 if (req->inode_drop && req->inode())
1879 encode_inode_release(req->inode(), req,
1880 mds, req->inode_drop,
1883 if (req->old_inode_drop && req->old_inode())
1884 encode_inode_release(req->old_inode(), req,
1885 mds, req->old_inode_drop,
1886 req->old_inode_unless);
1887 if (req->other_inode_drop && req->other_inode())
1888 encode_inode_release(req->other_inode(), req,
1889 mds, req->other_inode_drop,
1890 req->other_inode_unless);
1892 if (req->dentry_drop && req->dentry())
1893 encode_dentry_release(req->dentry(), req,
1894 mds, req->dentry_drop,
1895 req->dentry_unless);
1897 if (req->old_dentry_drop && req->old_dentry())
1898 encode_dentry_release(req->old_dentry(), req,
1899 mds, req->old_dentry_drop,
1900 req->old_dentry_unless);
  ldout(cct, 25) << "encode_cap_releases exit (req: "
                 << req << ", mds " << mds << ")" << dendl;
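// Background for the two encode_*_release() helpers above: a release entry
// tells the MDS which caps the client is voluntarily giving up as part of
// this request, so the MDS does not have to revoke them separately.
// encode_inode_release() only drops bits that are neither dirty nor in use,
// and encode_dentry_release() additionally attaches the dentry name and
// lease seq when we hold a lease from that mds, letting the MDS drop the
// lease in the same message.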
1905 bool Client::have_open_session(mds_rank_t mds)
1908 mds_sessions.count(mds) &&
1909 (mds_sessions[mds]->state == MetaSession::STATE_OPEN ||
1910 mds_sessions[mds]->state == MetaSession::STATE_STALE);
1913 MetaSession *Client::_get_mds_session(mds_rank_t mds, Connection *con)
1915 if (mds_sessions.count(mds) == 0)
1917 MetaSession *s = mds_sessions[mds];
1923 MetaSession *Client::_get_or_open_mds_session(mds_rank_t mds)
1925 if (mds_sessions.count(mds))
1926 return mds_sessions[mds];
1927 return _open_mds_session(mds);
/**
 * Populate a map of strings with client-identifying metadata,
 * such as the hostname. Call this once at initialization.
 */
1934 void Client::populate_metadata(const std::string &mount_root)
1940 metadata["hostname"] = u.nodename;
1941 ldout(cct, 20) << __func__ << " read hostname '" << u.nodename << "'" << dendl;
1943 ldout(cct, 1) << __func__ << " failed to read hostname (" << cpp_strerror(r) << ")" << dendl;
1946 metadata["pid"] = stringify(getpid());
1948 // Ceph entity id (the '0' in "client.0")
1949 metadata["entity_id"] = cct->_conf->name.get_id();
1951 // Our mount position
1952 if (!mount_root.empty()) {
1953 metadata["root"] = mount_root;
1957 metadata["ceph_version"] = pretty_version_to_str();
1958 metadata["ceph_sha1"] = git_version_to_str();
1960 // Apply any metadata from the user's configured overrides
1961 std::vector<std::string> tokens;
1962 get_str_vec(cct->_conf->client_metadata, ",", tokens);
1963 for (const auto &i : tokens) {
1964 auto eqpos = i.find("=");
1965 // Throw out anything that isn't of the form "<str>=<str>"
1966 if (eqpos == 0 || eqpos == std::string::npos || eqpos == i.size()) {
1967 lderr(cct) << "Invalid metadata keyval pair: '" << i << "'" << dendl;
1970 metadata[i.substr(0, eqpos)] = i.substr(eqpos + 1);
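  // Example of the override parsing above: with a configuration such as
  //   client_metadata = "datacenter=dc1,rack=r42"   (illustrative values)
  // the loop yields metadata["datacenter"] = "dc1" and metadata["rack"] = "r42".
  // Tokens without '=' or with an empty key are rejected with an error.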
/**
 * Optionally add or override client metadata fields.
 */
1977 void Client::update_metadata(std::string const &k, std::string const &v)
1979 Mutex::Locker l(client_lock);
1980 assert(initialized);
1982 if (metadata.count(k)) {
1983 ldout(cct, 1) << __func__ << " warning, overriding metadata field '" << k
1984 << "' from '" << metadata[k] << "' to '" << v << "'" << dendl;
1990 MetaSession *Client::_open_mds_session(mds_rank_t mds)
1992 ldout(cct, 10) << "_open_mds_session mds." << mds << dendl;
1993 assert(mds_sessions.count(mds) == 0);
1994 MetaSession *session = new MetaSession;
1995 session->mds_num = mds;
1997 session->inst = mdsmap->get_inst(mds);
1998 session->con = messenger->get_connection(session->inst);
1999 session->state = MetaSession::STATE_OPENING;
2000 session->mds_state = MDSMap::STATE_NULL;
2001 mds_sessions[mds] = session;
2003 // Maybe skip sending a request to open if this MDS daemon
2004 // has previously sent us a REJECT.
2005 if (rejected_by_mds.count(mds)) {
2006 if (rejected_by_mds[mds] == session->inst) {
2007 ldout(cct, 4) << "_open_mds_session mds." << mds << " skipping "
2008 "because we were rejected" << dendl;
2011 ldout(cct, 4) << "_open_mds_session mds." << mds << " old inst "
2012 "rejected us, trying with new inst" << dendl;
2013 rejected_by_mds.erase(mds);
2017 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_OPEN);
2018 m->client_meta = metadata;
2019 session->con->send_message(m);
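  // Session open is asynchronous: we record STATE_OPENING, send
  // CEPH_SESSION_REQUEST_OPEN with our metadata, and callers block on
  // session->waiting_for_open until handle_client_session() sees the MDS's
  // CEPH_SESSION_OPEN (moving the session to STATE_OPEN) or the session is
  // closed or rejected.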
2023 void Client::_close_mds_session(MetaSession *s)
2025 ldout(cct, 2) << "_close_mds_session mds." << s->mds_num << " seq " << s->seq << dendl;
2026 s->state = MetaSession::STATE_CLOSING;
2027 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2030 void Client::_closed_mds_session(MetaSession *s)
2032 s->state = MetaSession::STATE_CLOSED;
2033 s->con->mark_down();
2034 signal_context_list(s->waiting_for_open);
2035 mount_cond.Signal();
2036 remove_session_caps(s);
2037 kick_requests_closed(s);
2038 mds_sessions.erase(s->mds_num);
2042 void Client::handle_client_session(MClientSession *m)
2044 mds_rank_t from = mds_rank_t(m->get_source().num());
2045 ldout(cct, 10) << "handle_client_session " << *m << " from mds." << from << dendl;
2047 MetaSession *session = _get_mds_session(from, m->get_connection().get());
2049 ldout(cct, 10) << " discarding session message from sessionless mds " << m->get_source_inst() << dendl;
2054 switch (m->get_op()) {
2055 case CEPH_SESSION_OPEN:
2056 renew_caps(session);
2057 session->state = MetaSession::STATE_OPEN;
2059 mount_cond.Signal();
2061 connect_mds_targets(from);
2062 signal_context_list(session->waiting_for_open);
2065 case CEPH_SESSION_CLOSE:
2066 _closed_mds_session(session);
2069 case CEPH_SESSION_RENEWCAPS:
2070 if (session->cap_renew_seq == m->get_seq()) {
2072 session->last_cap_renew_request + mdsmap->get_session_timeout();
2073 wake_inode_waiters(session);
2077 case CEPH_SESSION_STALE:
2078 renew_caps(session);
2081 case CEPH_SESSION_RECALL_STATE:
2082 trim_caps(session, m->get_max_caps());
2085 case CEPH_SESSION_FLUSHMSG:
2086 session->con->send_message(new MClientSession(CEPH_SESSION_FLUSHMSG_ACK, m->get_seq()));
2089 case CEPH_SESSION_FORCE_RO:
2090 force_session_readonly(session);
2093 case CEPH_SESSION_REJECT:
2094 rejected_by_mds[session->mds_num] = session->inst;
2095 _closed_mds_session(session);
2106 bool Client::_any_stale_sessions() const
2108 assert(client_lock.is_locked_by_me());
2110 for (const auto &i : mds_sessions) {
2111 if (i.second->state == MetaSession::STATE_STALE) {
2119 void Client::_kick_stale_sessions()
2121 ldout(cct, 1) << "kick_stale_sessions" << dendl;
2123 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2124 p != mds_sessions.end(); ) {
2125 MetaSession *s = p->second;
2127 if (s->state == MetaSession::STATE_STALE)
2128 _closed_mds_session(s);
2132 void Client::send_request(MetaRequest *request, MetaSession *session,
2133 bool drop_cap_releases)
2136 mds_rank_t mds = session->mds_num;
2137 ldout(cct, 10) << "send_request rebuilding request " << request->get_tid()
2138 << " for mds." << mds << dendl;
2139 MClientRequest *r = build_client_request(request);
2140 if (request->dentry()) {
2141 r->set_dentry_wanted();
2143 if (request->got_unsafe) {
2144 r->set_replayed_op();
2145 if (request->target)
2146 r->head.ino = request->target->ino;
2148 encode_cap_releases(request, mds);
    if (drop_cap_releases) // we haven't sent the cap reconnect yet, drop cap releases
2150 request->cap_releases.clear();
2152 r->releases.swap(request->cap_releases);
2154 r->set_mdsmap_epoch(mdsmap->get_epoch());
2155 if (r->head.op == CEPH_MDS_OP_SETXATTR) {
2156 objecter->with_osdmap([r](const OSDMap& o) {
2157 r->set_osdmap_epoch(o.get_epoch());
2161 if (request->mds == -1) {
2162 request->sent_stamp = ceph_clock_now();
2163 ldout(cct, 20) << "send_request set sent_stamp to " << request->sent_stamp << dendl;
2167 Inode *in = request->inode();
2168 if (in && in->caps.count(mds))
2169 request->sent_on_mseq = in->caps[mds]->mseq;
2171 session->requests.push_back(&request->item);
2173 ldout(cct, 10) << "send_request " << *r << " to mds." << mds << dendl;
2174 session->con->send_message(r);
2177 MClientRequest* Client::build_client_request(MetaRequest *request)
2179 MClientRequest *req = new MClientRequest(request->get_op());
2180 req->set_tid(request->tid);
2181 req->set_stamp(request->op_stamp);
2182 memcpy(&req->head, &request->head, sizeof(ceph_mds_request_head));
  // if the filepaths haven't been set, set them!
2185 if (request->path.empty()) {
2186 Inode *in = request->inode();
2187 Dentry *de = request->dentry();
2189 in->make_nosnap_relative_path(request->path);
2192 de->inode->make_nosnap_relative_path(request->path);
2194 de->dir->parent_inode->make_nosnap_relative_path(request->path);
2195 request->path.push_dentry(de->name);
2197 else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2198 << " No path, inode, or appropriately-endowed dentry given!"
2200 } else ldout(cct, 1) << "Warning -- unable to construct a filepath!"
2201 << " No path, inode, or dentry given!"
2204 req->set_filepath(request->get_filepath());
2205 req->set_filepath2(request->get_filepath2());
2206 req->set_data(request->data);
2207 req->set_retry_attempt(request->retry_attempt++);
2208 req->head.num_fwd = request->num_fwd;
2210 int gid_count = request->perms.get_gids(&_gids);
2211 req->set_gid_list(gid_count, _gids);
2217 void Client::handle_client_request_forward(MClientRequestForward *fwd)
2219 mds_rank_t mds = mds_rank_t(fwd->get_source().num());
2220 MetaSession *session = _get_mds_session(mds, fwd->get_connection().get());
2225 ceph_tid_t tid = fwd->get_tid();
2227 if (mds_requests.count(tid) == 0) {
2228 ldout(cct, 10) << "handle_client_request_forward no pending request on tid " << tid << dendl;
2233 MetaRequest *request = mds_requests[tid];
2236 // reset retry counter
2237 request->retry_attempt = 0;
2239 // request not forwarded, or dest mds has no session.
2241 ldout(cct, 10) << "handle_client_request tid " << tid
2242 << " fwd " << fwd->get_num_fwd()
2243 << " to mds." << fwd->get_dest_mds()
2244 << ", resending to " << fwd->get_dest_mds()
2248 request->item.remove_myself();
2249 request->num_fwd = fwd->get_num_fwd();
2250 request->resend_mds = fwd->get_dest_mds();
2251 request->caller_cond->Signal();
2256 bool Client::is_dir_operation(MetaRequest *req)
2258 int op = req->get_op();
2259 if (op == CEPH_MDS_OP_MKNOD || op == CEPH_MDS_OP_LINK ||
2260 op == CEPH_MDS_OP_UNLINK || op == CEPH_MDS_OP_RENAME ||
2261 op == CEPH_MDS_OP_MKDIR || op == CEPH_MDS_OP_RMDIR ||
2262 op == CEPH_MDS_OP_SYMLINK || op == CEPH_MDS_OP_CREATE)
2267 void Client::handle_client_reply(MClientReply *reply)
2269 mds_rank_t mds_num = mds_rank_t(reply->get_source().num());
2270 MetaSession *session = _get_mds_session(mds_num, reply->get_connection().get());
2276 ceph_tid_t tid = reply->get_tid();
2277 bool is_safe = reply->is_safe();
2279 if (mds_requests.count(tid) == 0) {
2280 lderr(cct) << "handle_client_reply no pending request on tid " << tid
2281 << " safe is:" << is_safe << dendl;
2285 MetaRequest *request = mds_requests.at(tid);
2287 ldout(cct, 20) << "handle_client_reply got a reply. Safe:" << is_safe
2288 << " tid " << tid << dendl;
2290 if (request->got_unsafe && !is_safe) {
2291 //duplicate response
2292 ldout(cct, 0) << "got a duplicate reply on tid " << tid << " from mds "
2293 << mds_num << " safe:" << is_safe << dendl;
2298 if (-ESTALE == reply->get_result()) { // see if we can get to proper MDS
2299 ldout(cct, 20) << "got ESTALE on tid " << request->tid
2300 << " from mds." << request->mds << dendl;
2301 request->send_to_auth = true;
2302 request->resend_mds = choose_target_mds(request);
2303 Inode *in = request->inode();
2304     if (request->resend_mds >= 0 &&
2305 	request->resend_mds == request->mds &&
2306 	(in == NULL ||
2307 	 in->caps.count(request->resend_mds) == 0 ||
2308 	 request->sent_on_mseq == in->caps[request->resend_mds]->mseq)) {
2309 // have to return ESTALE
2311 request->caller_cond->Signal();
2315 ldout(cct, 20) << "have to return ESTALE" << dendl;
2318 assert(request->reply == NULL);
2319 request->reply = reply;
2320 insert_trace(request, session);
2322 // Handle unsafe reply
2324 request->got_unsafe = true;
2325 session->unsafe_requests.push_back(&request->unsafe_item);
2326 if (is_dir_operation(request)) {
2327 Inode *dir = request->inode();
2329 dir->unsafe_ops.push_back(&request->unsafe_dir_item);
2331 if (request->target) {
2332 InodeRef &in = request->target;
2333 in->unsafe_ops.push_back(&request->unsafe_target_item);
2337   // Only signal the caller once (on the first reply):
2338   // Either it's an unsafe reply, or it's a safe reply and no unsafe reply was sent.
2339 if (!is_safe || !request->got_unsafe) {
2341 request->dispatch_cond = &cond;
2344 ldout(cct, 20) << "handle_client_reply signalling caller " << (void*)request->caller_cond << dendl;
2345 request->caller_cond->Signal();
2347 // wake for kick back
2348 while (request->dispatch_cond) {
2349 ldout(cct, 20) << "handle_client_reply awaiting kickback on tid " << tid << " " << &cond << dendl;
2350 cond.Wait(client_lock);
2355 // the filesystem change is committed to disk
2356 // we're done, clean up
2357 if (request->got_unsafe) {
2358 request->unsafe_item.remove_myself();
2359 request->unsafe_dir_item.remove_myself();
2360 request->unsafe_target_item.remove_myself();
2361 signal_cond_list(request->waitfor_safe);
2363 request->item.remove_myself();
2364 unregister_request(request);
2367 mount_cond.Signal();
2370 void Client::_handle_full_flag(int64_t pool)
2372 ldout(cct, 1) << __func__ << ": FULL: cancelling outstanding operations "
2373 << "on " << pool << dendl;
2374 // Cancel all outstanding ops in this pool with -ENOSPC: it is necessary
2375 // to do this rather than blocking, because otherwise when we fill up we
2376 // potentially lock caps forever on files with dirty pages, and we need
2377 // to be able to release those caps to the MDS so that it can delete files
2378 // and free up space.
2379 epoch_t cancelled_epoch = objecter->op_cancel_writes(-ENOSPC, pool);
2381 // For all inodes with layouts in this pool and a pending flush write op
2382 // (i.e. one of the ones we will cancel), we've got to purge_set their data
2383 // from ObjectCacher so that it doesn't re-issue the write in response to
2384 // the ENOSPC error.
2385 // Fortunately since we're cancelling everything in a given pool, we don't
2386 // need to know which ops belong to which ObjectSet, we can just blow all
2387 // the un-flushed cached data away and mark any dirty inodes' async_err
2388 // field with -ENOSPC as long as we're sure all the ops we cancelled were
2389   // affecting this pool, and all the objectsets we're purging were also
2390   // in this pool.
2391 for (unordered_map<vinodeno_t,Inode*>::iterator i = inode_map.begin();
2392 i != inode_map.end(); ++i)
2394 Inode *inode = i->second;
2395 if (inode->oset.dirty_or_tx
2396 && (pool == -1 || inode->layout.pool_id == pool)) {
2397 ldout(cct, 4) << __func__ << ": FULL: inode 0x" << std::hex << i->first << std::dec
2398 << " has dirty objects, purging and setting ENOSPC" << dendl;
2399 objectcacher->purge_set(&inode->oset);
2400 inode->set_async_err(-ENOSPC);
2404 if (cancelled_epoch != (epoch_t)-1) {
2405 set_cap_epoch_barrier(cancelled_epoch);
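// OSD map handling: if the new map shows this client blacklisted, abort all
// in-flight MDS requests with -EBLACKLISTED, force-close every session and
// cancel outstanding OSD writes; if we were already blacklisted, re-check
// whether we still are.  Global or per-pool FULL flags are routed to
// _handle_full_flag().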
2409 void Client::handle_osd_map(MOSDMap *m)
2411 std::set<entity_addr_t> new_blacklists;
2412 objecter->consume_blacklist_events(&new_blacklists);
2414 const auto myaddr = messenger->get_myaddr();
2415 if (!blacklisted && new_blacklists.count(myaddr)) {
2416 auto epoch = objecter->with_osdmap([](const OSDMap &o){
2417 return o.get_epoch();
2419 lderr(cct) << "I was blacklisted at osd epoch " << epoch << dendl;
2421 for (std::map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2422 p != mds_requests.end(); ) {
2423 auto req = p->second;
2425 req->abort(-EBLACKLISTED);
2426 if (req->caller_cond) {
2428 req->caller_cond->Signal();
2432 // Progress aborts on any requests that were on this waitlist. Any
2433 // requests that were on a waiting_for_open session waitlist
2434 // will get kicked during close session below.
2435 signal_cond_list(waiting_for_mdsmap);
2437 // Force-close all sessions: assume this is not abandoning any state
2438 // on the MDS side because the MDS will have seen the blacklist too.
2439 while(!mds_sessions.empty()) {
2440 auto i = mds_sessions.begin();
2441 auto session = i->second;
2442 _closed_mds_session(session);
2445     // Since we know all our OSD ops will fail, cancel them all pre-emptively,
2446 // so that on an unhealthy cluster we can umount promptly even if e.g.
2447 // some PGs were inaccessible.
2448 objecter->op_cancel_writes(-EBLACKLISTED);
2450 } else if (blacklisted) {
2451 // Handle case where we were blacklisted but no longer are
2452 blacklisted = objecter->with_osdmap([myaddr](const OSDMap &o){
2453 return o.is_blacklisted(myaddr);});
2456 if (objecter->osdmap_full_flag()) {
2457 _handle_full_flag(-1);
2459 // Accumulate local list of full pools so that I can drop
2460     // the objecter lock before re-entering objecter in
2461     // _handle_full_flag().
2462 std::vector<int64_t> full_pools;
2464 objecter->with_osdmap([&full_pools](const OSDMap &o) {
2465 for (const auto& kv : o.get_pools()) {
2466 if (kv.second.has_flag(pg_pool_t::FLAG_FULL)) {
2467 full_pools.push_back(kv.first);
2472 for (auto p : full_pools)
2473 _handle_full_flag(p);
2475 // Subscribe to subsequent maps to watch for the full flag going
2476 // away. For the global full flag objecter does this for us, but
2477 // it pays no attention to the per-pool full flag so in this branch
2478 // we do it ourselves.
2479 if (!full_pools.empty()) {
2480 objecter->maybe_request_map();
2488 // ------------------------
2489 // incoming messages
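// Main dispatch entry for incoming messages: runs under client_lock, discards
// messages while the client is not active, and routes each message type to
// its handler below.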
2492 bool Client::ms_dispatch(Message *m)
2494 Mutex::Locker l(client_lock);
2496 ldout(cct, 10) << "inactive, discarding " << *m << dendl;
2501 switch (m->get_type()) {
2502 // mounting and mds sessions
2503 case CEPH_MSG_MDS_MAP:
2504 handle_mds_map(static_cast<MMDSMap*>(m));
2506 case CEPH_MSG_FS_MAP:
2507 handle_fs_map(static_cast<MFSMap*>(m));
2509 case CEPH_MSG_FS_MAP_USER:
2510 handle_fs_map_user(static_cast<MFSMapUser*>(m));
2512 case CEPH_MSG_CLIENT_SESSION:
2513 handle_client_session(static_cast<MClientSession*>(m));
2516 case CEPH_MSG_OSD_MAP:
2517 handle_osd_map(static_cast<MOSDMap*>(m));
2521 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2522 handle_client_request_forward(static_cast<MClientRequestForward*>(m));
2524 case CEPH_MSG_CLIENT_REPLY:
2525 handle_client_reply(static_cast<MClientReply*>(m));
2528 case CEPH_MSG_CLIENT_SNAP:
2529 handle_snap(static_cast<MClientSnap*>(m));
2531 case CEPH_MSG_CLIENT_CAPS:
2532 handle_caps(static_cast<MClientCaps*>(m));
2534 case CEPH_MSG_CLIENT_LEASE:
2535 handle_lease(static_cast<MClientLease*>(m));
2537 case MSG_COMMAND_REPLY:
2538 if (m->get_source().type() == CEPH_ENTITY_TYPE_MDS) {
2539 handle_command_reply(static_cast<MCommandReply*>(m));
2544 case CEPH_MSG_CLIENT_QUOTA:
2545 handle_quota(static_cast<MClientQuota*>(m));
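  // While unmounting, use each incoming message as a chance to trim the
  // cache; if the trim pass shrank it, poke unmount() so it can re-check
  // whether everything has been released.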
2554 ldout(cct, 10) << "unmounting: trim pass, size was " << lru.lru_get_size()
2555 << "+" << inode_map.size() << dendl;
2556     long unsigned size = lru.lru_get_size() + inode_map.size();
2557     trim_cache();
2558     if (size > lru.lru_get_size() + inode_map.size()) {
2559 ldout(cct, 10) << "unmounting: trim pass, cache shrank, poking unmount()" << dendl;
2560 mount_cond.Signal();
2562 ldout(cct, 10) << "unmounting: trim pass, size still " << lru.lru_get_size()
2563 << "+" << inode_map.size() << dendl;
2570 void Client::handle_fs_map(MFSMap *m)
2572 fsmap.reset(new FSMap(m->get_fsmap()));
2575 signal_cond_list(waiting_for_fsmap);
2577 monclient->sub_got("fsmap", fsmap->get_epoch());
2580 void Client::handle_fs_map_user(MFSMapUser *m)
2582 fsmap_user.reset(new FSMapUser);
2583 *fsmap_user = m->get_fsmap();
2586 monclient->sub_got("fsmap.user", fsmap_user->get_epoch());
2587 signal_cond_list(waiting_for_fsmap);
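// MDS map handling: ignore stale epochs, cancel admin commands whose target
// GID went away or became laggy, then walk the existing sessions and react to
// per-rank state changes (mark connections down, send reconnects, kick
// requests and flushing caps when an MDS becomes active).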
2590 void Client::handle_mds_map(MMDSMap* m)
2592 if (m->get_epoch() <= mdsmap->get_epoch()) {
2593 ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch()
2594 << " is identical to or older than our "
2595 << mdsmap->get_epoch() << dendl;
2600 ldout(cct, 1) << "handle_mds_map epoch " << m->get_epoch() << dendl;
2602 std::unique_ptr<MDSMap> oldmap(new MDSMap);
2603 oldmap.swap(mdsmap);
2605 mdsmap->decode(m->get_encoded());
2607 // Cancel any commands for missing or laggy GIDs
2608 std::list<ceph_tid_t> cancel_ops;
2609 auto &commands = command_table.get_commands();
2610 for (const auto &i : commands) {
2611 auto &op = i.second;
2612 const mds_gid_t op_mds_gid = op.mds_gid;
2613 if (mdsmap->is_dne_gid(op_mds_gid) || mdsmap->is_laggy_gid(op_mds_gid)) {
2614 ldout(cct, 1) << __func__ << ": cancelling command op " << i.first << dendl;
2615 cancel_ops.push_back(i.first);
2617 std::ostringstream ss;
2618 ss << "MDS " << op_mds_gid << " went away";
2619 *(op.outs) = ss.str();
2621 op.con->mark_down();
2623 op.on_finish->complete(-ETIMEDOUT);
2628 for (std::list<ceph_tid_t>::iterator i = cancel_ops.begin();
2629 i != cancel_ops.end(); ++i) {
2630 command_table.erase(*i);
2634 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2635 p != mds_sessions.end(); ) {
2636 mds_rank_t mds = p->first;
2637 MetaSession *session = p->second;
2640 int oldstate = oldmap->get_state(mds);
2641 int newstate = mdsmap->get_state(mds);
2642 if (!mdsmap->is_up(mds)) {
2643 session->con->mark_down();
2644 } else if (mdsmap->get_inst(mds) != session->inst) {
2645 session->con->mark_down();
2646 session->inst = mdsmap->get_inst(mds);
2647 // When new MDS starts to take over, notify kernel to trim unused entries
2648 // in its dcache/icache. Hopefully, the kernel will release some unused
2649 // inodes before the new MDS enters reconnect state.
2650 trim_cache_for_reconnect(session);
2651 } else if (oldstate == newstate)
2652 continue; // no change
2654 session->mds_state = newstate;
2655 if (newstate == MDSMap::STATE_RECONNECT) {
2656 session->con = messenger->get_connection(session->inst);
2657 send_reconnect(session);
2658 } else if (newstate >= MDSMap::STATE_ACTIVE) {
2659 if (oldstate < MDSMap::STATE_ACTIVE) {
2660 // kick new requests
2661 kick_requests(session);
2662 kick_flushing_caps(session);
2663 signal_context_list(session->waiting_for_open);
2664 kick_maxsize_requests(session);
2665 wake_inode_waiters(session);
2667 connect_mds_targets(mds);
2668 } else if (newstate == MDSMap::STATE_NULL &&
2669 mds >= mdsmap->get_max_mds()) {
2670 _closed_mds_session(session);
2674 // kick any waiting threads
2675 signal_cond_list(waiting_for_mdsmap);
2679 monclient->sub_got("mdsmap", mdsmap->get_epoch());
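// Rebuild session state on a recovering MDS: trim the cache, resend unsafe
// requests, reset cap sequence numbers, and send an MClientReconnect that
// describes every cap we hold (path, wanted/issued bits, file locks and
// snaprealms) so the MDS can rebuild its session during rejoin.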
2682 void Client::send_reconnect(MetaSession *session)
2684 mds_rank_t mds = session->mds_num;
2685 ldout(cct, 10) << "send_reconnect to mds." << mds << dendl;
2687 // trim unused caps to reduce MDS's cache rejoin time
2688 trim_cache_for_reconnect(session);
2690 session->readonly = false;
2692 if (session->release) {
2693 session->release->put();
2694 session->release = NULL;
2697 // reset my cap seq number
2699 //connect to the mds' offload targets
2700 connect_mds_targets(mds);
2701 //make sure unsafe requests get saved
2702 resend_unsafe_requests(session);
2704 MClientReconnect *m = new MClientReconnect;
2706 // i have an open session.
2707 ceph::unordered_set<inodeno_t> did_snaprealm;
2708 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
2709 p != inode_map.end();
2711 Inode *in = p->second;
2712 if (in->caps.count(mds)) {
2713 ldout(cct, 10) << " caps on " << p->first
2714 << " " << ccap_string(in->caps[mds]->issued)
2715 << " wants " << ccap_string(in->caps_wanted())
2718 in->make_long_path(path);
2719 ldout(cct, 10) << " path " << path << dendl;
2722 _encode_filelocks(in, flockbl);
2724 Cap *cap = in->caps[mds];
2725 cap->seq = 0; // reset seq.
2726 cap->issue_seq = 0; // reset seq.
2727 cap->mseq = 0; // reset seq.
2728 cap->issued = cap->implemented;
2730 snapid_t snap_follows = 0;
2731 if (!in->cap_snaps.empty())
2732 snap_follows = in->cap_snaps.begin()->first;
2734 m->add_cap(p->first.ino,
2736 path.get_ino(), path.get_path(), // ino
2737 in->caps_wanted(), // wanted
2738 cap->issued, // issued
2743 if (did_snaprealm.count(in->snaprealm->ino) == 0) {
2744 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
2745 m->add_snaprealm(in->snaprealm->ino, in->snaprealm->seq, in->snaprealm->parent);
2746 did_snaprealm.insert(in->snaprealm->ino);
2751 early_kick_flushing_caps(session);
2753 session->con->send_message(m);
2755 mount_cond.Signal();
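// kick_requests resends requests aimed at this session's MDS that have not
// been tried yet (and signals any aborted ones); resend_unsafe_requests below
// replays unsafe and previously-sent requests so the MDS can complete them in
// its clientreplay stage.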
2759 void Client::kick_requests(MetaSession *session)
2761 ldout(cct, 10) << "kick_requests for mds." << session->mds_num << dendl;
2762 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2763 p != mds_requests.end();
2765 MetaRequest *req = p->second;
2766 if (req->got_unsafe)
2768 if (req->aborted()) {
2769 if (req->caller_cond) {
2771 req->caller_cond->Signal();
2775 if (req->retry_attempt > 0)
2776 continue; // new requests only
2777 if (req->mds == session->mds_num) {
2778 send_request(p->second, session);
2783 void Client::resend_unsafe_requests(MetaSession *session)
2785 for (xlist<MetaRequest*>::iterator iter = session->unsafe_requests.begin();
2788 send_request(*iter, session);
2790   // also re-send old requests when MDS enters reconnect stage, so that MDS can
2791   // process completed requests in clientreplay stage.
2792 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2793 p != mds_requests.end();
2795 MetaRequest *req = p->second;
2796 if (req->got_unsafe)
2800 if (req->retry_attempt == 0)
2801 continue; // old requests only
2802 if (req->mds == session->mds_num)
2803 send_request(req, session, true);
2807 void Client::wait_unsafe_requests()
2809 list<MetaRequest*> last_unsafe_reqs;
2810 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
2811 p != mds_sessions.end();
2813 MetaSession *s = p->second;
2814 if (!s->unsafe_requests.empty()) {
2815 MetaRequest *req = s->unsafe_requests.back();
2817 last_unsafe_reqs.push_back(req);
2821 for (list<MetaRequest*>::iterator p = last_unsafe_reqs.begin();
2822 p != last_unsafe_reqs.end();
2824 MetaRequest *req = *p;
2825 if (req->unsafe_item.is_on_list())
2826 wait_on_list(req->waitfor_safe);
2831 void Client::kick_requests_closed(MetaSession *session)
2833 ldout(cct, 10) << "kick_requests_closed for mds." << session->mds_num << dendl;
2834 for (map<ceph_tid_t, MetaRequest*>::iterator p = mds_requests.begin();
2835 p != mds_requests.end(); ) {
2836 MetaRequest *req = p->second;
2838 if (req->mds == session->mds_num) {
2839 if (req->caller_cond) {
2841 req->caller_cond->Signal();
2843 req->item.remove_myself();
2844 if (req->got_unsafe) {
2845 lderr(cct) << "kick_requests_closed removing unsafe request " << req->get_tid() << dendl;
2846 req->unsafe_item.remove_myself();
2847 req->unsafe_dir_item.remove_myself();
2848 req->unsafe_target_item.remove_myself();
2849 signal_cond_list(req->waitfor_safe);
2850 unregister_request(req);
2854 assert(session->requests.empty());
2855 assert(session->unsafe_requests.empty());
2865 void Client::got_mds_push(MetaSession *s)
2868 ldout(cct, 10) << " mds." << s->mds_num << " seq now " << s->seq << dendl;
2869 if (s->state == MetaSession::STATE_CLOSING) {
2870 s->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_CLOSE, s->seq));
2874 void Client::handle_lease(MClientLease *m)
2876 ldout(cct, 10) << "handle_lease " << *m << dendl;
2878 assert(m->get_action() == CEPH_MDS_LEASE_REVOKE);
2880 mds_rank_t mds = mds_rank_t(m->get_source().num());
2881 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
2887 got_mds_push(session);
2889 ceph_seq_t seq = m->get_seq();
2892 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
2893 if (inode_map.count(vino) == 0) {
2894 ldout(cct, 10) << " don't have vino " << vino << dendl;
2897 in = inode_map[vino];
2899 if (m->get_mask() & CEPH_LOCK_DN) {
2900 if (!in->dir || in->dir->dentries.count(m->dname) == 0) {
2901 ldout(cct, 10) << " don't have dir|dentry " << m->get_ino() << "/" << m->dname <<dendl;
2904 Dentry *dn = in->dir->dentries[m->dname];
2905 ldout(cct, 10) << " revoked DN lease on " << dn << dendl;
2910 m->get_connection()->send_message(
2912 CEPH_MDS_LEASE_RELEASE, seq,
2913 m->get_mask(), m->get_ino(), m->get_first(), m->get_last(), m->dname));
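// Drop n references on an inode; once the last reference goes away its caps
// are removed, any cached data is released, and the inode is erased from
// inode_map (releasing its faked ino if those are in use).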
2917 void Client::put_inode(Inode *in, int n)
2919 ldout(cct, 10) << "put_inode on " << *in << dendl;
2920 int left = in->_put(n);
2923 remove_all_caps(in);
2925 ldout(cct, 10) << "put_inode deleting " << *in << dendl;
2926 bool unclean = objectcacher->release_set(&in->oset);
2928 inode_map.erase(in->vino());
2929 if (use_faked_inos())
2930 _release_faked_ino(in);
2935 while (!root_parents.empty())
2936 root_parents.erase(root_parents.begin());
2943 void Client::close_dir(Dir *dir)
2945 Inode *in = dir->parent_inode;
2946 ldout(cct, 15) << "close_dir dir " << dir << " on " << in << dendl;
2947 assert(dir->is_empty());
2948 assert(in->dir == dir);
2949 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
2950 if (!in->dn_set.empty())
2951 in->get_first_parent()->put(); // unpin dentry
2955 put_inode(in); // unpin inode
2959  * Don't call this with in==NULL; use get_or_create for that.
2960  * Leave dn set to its default NULL unless you're trying to add
2961  * a new inode to a pre-created Dentry.
2963 Dentry* Client::link(Dir *dir, const string& name, Inode *in, Dentry *dn)
2966 // create a new Dentry
2972 dir->dentries[dn->name] = dn;
2973 lru.lru_insert_mid(dn); // mid or top?
2975 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
2976 << " dn " << dn << " (new dn)" << dendl;
2978 ldout(cct, 15) << "link dir " << dir->parent_inode << " '" << name << "' to inode " << in
2979 << " dn " << dn << " (old dn)" << dendl;
2982 if (in) { // link to inode
2986 dn->get(); // dir -> dn pin
2988 dn->get(); // ll_ref -> dn pin
2991 assert(in->dn_set.count(dn) == 0);
2993 // only one parent for directories!
2994 if (in->is_dir() && !in->dn_set.empty()) {
2995 Dentry *olddn = in->get_first_parent();
2996 assert(olddn->dir != dir || olddn->name != name);
2997 Inode *old_diri = olddn->dir->parent_inode;
2998 old_diri->dir_release_count++;
2999 clear_dir_complete_and_ordered(old_diri, true);
3000 unlink(olddn, true, true); // keep dir, dentry
3003 in->dn_set.insert(dn);
3005 ldout(cct, 20) << "link inode " << in << " parents now " << in->dn_set << dendl;
3011 void Client::unlink(Dentry *dn, bool keepdir, bool keepdentry)
3015 ldout(cct, 15) << "unlink dir " << dn->dir->parent_inode << " '" << dn->name << "' dn " << dn
3016 << " inode " << dn->inode << dendl;
3018 // unlink from inode
3022 dn->put(); // dir -> dn pin
3024 dn->put(); // ll_ref -> dn pin
3027 assert(in->dn_set.count(dn));
3028 in->dn_set.erase(dn);
3029 ldout(cct, 20) << "unlink inode " << in << " parents now " << in->dn_set << dendl;
3035 ldout(cct, 15) << "unlink removing '" << dn->name << "' dn " << dn << dendl;
3038 dn->dir->dentries.erase(dn->name);
3039 if (dn->dir->is_empty() && !keepdir)
3050 * For asynchronous flushes, check for errors from the IO and
3051 * update the inode if necessary
3053 class C_Client_FlushComplete : public Context {
3058 C_Client_FlushComplete(Client *c, Inode *in) : client(c), inode(in) { }
3059 void finish(int r) override {
3060 assert(client->client_lock.is_locked_by_me());
3062 client_t const whoami = client->whoami; // For the benefit of ldout prefix
3063 ldout(client->cct, 1) << "I/O error from flush on inode " << inode
3064 << " 0x" << std::hex << inode->ino << std::dec
3065 << ": " << r << "(" << cpp_strerror(r) << ")" << dendl;
3066 inode->set_async_err(r);
3076 void Client::get_cap_ref(Inode *in, int cap)
3078 if ((cap & CEPH_CAP_FILE_BUFFER) &&
3079 in->cap_refs[CEPH_CAP_FILE_BUFFER] == 0) {
3080 ldout(cct, 5) << "get_cap_ref got first FILE_BUFFER ref on " << *in << dendl;
3083 if ((cap & CEPH_CAP_FILE_CACHE) &&
3084 in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3085 ldout(cct, 5) << "get_cap_ref got first FILE_CACHE ref on " << *in << dendl;
3088 in->get_cap_ref(cap);
3091 void Client::put_cap_ref(Inode *in, int cap)
3093 int last = in->put_cap_ref(cap);
3096 int drop = last & ~in->caps_issued();
3097 if (in->snapid == CEPH_NOSNAP) {
3098 if ((last & CEPH_CAP_FILE_WR) &&
3099 !in->cap_snaps.empty() &&
3100 in->cap_snaps.rbegin()->second.writing) {
3101 ldout(cct, 10) << "put_cap_ref finishing pending cap_snap on " << *in << dendl;
3102 in->cap_snaps.rbegin()->second.writing = 0;
3103 finish_cap_snap(in, in->cap_snaps.rbegin()->second, get_caps_used(in));
3104 signal_cond_list(in->waitfor_caps); // wake up blocked sync writers
3106 if (last & CEPH_CAP_FILE_BUFFER) {
3107 for (auto &p : in->cap_snaps)
3108 p.second.dirty_data = 0;
3109 signal_cond_list(in->waitfor_commit);
3110 ldout(cct, 5) << "put_cap_ref dropped last FILE_BUFFER ref on " << *in << dendl;
3114 if (last & CEPH_CAP_FILE_CACHE) {
3115 ldout(cct, 5) << "put_cap_ref dropped last FILE_CACHE ref on " << *in << dendl;
3121 put_inode(in, put_nref);
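// get_caps waits until the needed capability bits are issued and not being
// revoked: it may grow wanted_max_size for writes that extend the file, wait
// for a max_size grant or for pending cap-snap flushes, and renew caps the MDS
// thinks we dropped.  On success it takes a cap reference and reports the held
// bits through *phave.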
3125 int Client::get_caps(Inode *in, int need, int want, int *phave, loff_t endoff)
3127 int r = check_pool_perm(in, need);
3132 int file_wanted = in->caps_file_wanted();
3133 if ((file_wanted & need) != need) {
3134 ldout(cct, 10) << "get_caps " << *in << " need " << ccap_string(need)
3135 << " file_wanted " << ccap_string(file_wanted) << ", EBADF "
3141 int have = in->caps_issued(&implemented);
3143 bool waitfor_caps = false;
3144 bool waitfor_commit = false;
3146 if (have & need & CEPH_CAP_FILE_WR) {
3148 (endoff >= (loff_t)in->max_size ||
3149 endoff > (loff_t)(in->size << 1)) &&
3150 endoff > (loff_t)in->wanted_max_size) {
3151 ldout(cct, 10) << "wanted_max_size " << in->wanted_max_size << " -> " << endoff << dendl;
3152 in->wanted_max_size = endoff;
3156 if (endoff >= 0 && endoff > (loff_t)in->max_size) {
3157 ldout(cct, 10) << "waiting on max_size, endoff " << endoff << " max_size " << in->max_size << " on " << *in << dendl;
3158 waitfor_caps = true;
3160 if (!in->cap_snaps.empty()) {
3161 if (in->cap_snaps.rbegin()->second.writing) {
3162 ldout(cct, 10) << "waiting on cap_snap write to complete" << dendl;
3163 waitfor_caps = true;
3165 for (auto &p : in->cap_snaps) {
3166 if (p.second.dirty_data) {
3167 waitfor_commit = true;
3171 if (waitfor_commit) {
3172 _flush(in, new C_Client_FlushComplete(this, in));
3173 ldout(cct, 10) << "waiting for WRBUFFER to get dropped" << dendl;
3178 if (!waitfor_caps && !waitfor_commit) {
3179 if ((have & need) == need) {
3180 int revoking = implemented & ~have;
3181 ldout(cct, 10) << "get_caps " << *in << " have " << ccap_string(have)
3182 << " need " << ccap_string(need) << " want " << ccap_string(want)
3183 << " revoking " << ccap_string(revoking)
3185 if ((revoking & want) == 0) {
3186 *phave = need | (have & want);
3187 in->get_cap_ref(need);
3191 ldout(cct, 10) << "waiting for caps " << *in << " need " << ccap_string(need) << " want " << ccap_string(want) << dendl;
3192 waitfor_caps = true;
3195 if ((need & CEPH_CAP_FILE_WR) && in->auth_cap &&
3196 in->auth_cap->session->readonly)
3199 if (in->flags & I_CAP_DROPPED) {
3200 int mds_wanted = in->caps_mds_wanted();
3201 if ((mds_wanted & need) != need) {
3202 int ret = _renew_caps(in);
3207 if ((mds_wanted & file_wanted) ==
3208 (file_wanted & (CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR))) {
3209 in->flags &= ~I_CAP_DROPPED;
3214 wait_on_list(in->waitfor_caps);
3215 else if (waitfor_commit)
3216 wait_on_list(in->waitfor_commit);
3220 int Client::get_caps_used(Inode *in)
3222 unsigned used = in->caps_used();
3223 if (!(used & CEPH_CAP_FILE_CACHE) &&
3224 !objectcacher->set_is_empty(&in->oset))
3225 used |= CEPH_CAP_FILE_CACHE;
3229 void Client::cap_delay_requeue(Inode *in)
3231 ldout(cct, 10) << "cap_delay_requeue on " << *in << dendl;
3232 in->hold_caps_until = ceph_clock_now();
3233 in->hold_caps_until += cct->_conf->client_caps_release_delay;
3234 delayed_caps.push_back(&in->cap_item);
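// send_cap builds the MClientCaps update for one cap: it computes what we are
// retaining versus dropping, optionally simulates a failure to release caps
// (client_inject_release_failure), and ships the current inode metadata plus
// any xattr or inline payload being flushed.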
3237 void Client::send_cap(Inode *in, MetaSession *session, Cap *cap,
3238 bool sync, int used, int want, int retain,
3239 int flush, ceph_tid_t flush_tid)
3241 int held = cap->issued | cap->implemented;
3242 int revoking = cap->implemented & ~cap->issued;
3243 retain &= ~revoking;
3244 int dropping = cap->issued & ~retain;
3245 int op = CEPH_CAP_OP_UPDATE;
3247 ldout(cct, 10) << "send_cap " << *in
3248 << " mds." << session->mds_num << " seq " << cap->seq
3249 << (sync ? " sync " : " async ")
3250 << " used " << ccap_string(used)
3251 << " want " << ccap_string(want)
3252 << " flush " << ccap_string(flush)
3253 << " retain " << ccap_string(retain)
3254 << " held "<< ccap_string(held)
3255 << " revoking " << ccap_string(revoking)
3256 << " dropping " << ccap_string(dropping)
3259 if (cct->_conf->client_inject_release_failure && revoking) {
3260 const int would_have_issued = cap->issued & retain;
3261 const int would_have_implemented = cap->implemented & (cap->issued | used);
3263 // - tell the server we think issued is whatever they issued plus whatever we implemented
3264 // - leave what we have implemented in place
3265 ldout(cct, 20) << __func__ << " injecting failure to release caps" << dendl;
3266 cap->issued = cap->issued | cap->implemented;
3268 // Make an exception for revoking xattr caps: we are injecting
3269 // failure to release other caps, but allow xattr because client
3270 // will block on xattr ops if it can't release these to MDS (#9800)
3271 const int xattr_mask = CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
3272 cap->issued ^= xattr_mask & revoking;
3273 cap->implemented ^= xattr_mask & revoking;
3275 ldout(cct, 20) << __func__ << " issued " << ccap_string(cap->issued) << " vs " << ccap_string(would_have_issued) << dendl;
3276 ldout(cct, 20) << __func__ << " implemented " << ccap_string(cap->implemented) << " vs " << ccap_string(would_have_implemented) << dendl;
3279 cap->issued &= retain;
3280 cap->implemented &= cap->issued | used;
3283 snapid_t follows = 0;
3286 follows = in->snaprealm->get_snap_context().seq;
3288 MClientCaps *m = new MClientCaps(op,
3291 cap->cap_id, cap->seq,
3297 m->caller_uid = in->cap_dirtier_uid;
3298 m->caller_gid = in->cap_dirtier_gid;
3300 m->head.issue_seq = cap->issue_seq;
3301 m->set_tid(flush_tid);
3303 m->head.uid = in->uid;
3304 m->head.gid = in->gid;
3305 m->head.mode = in->mode;
3307 m->head.nlink = in->nlink;
3309 if (flush & CEPH_CAP_XATTR_EXCL) {
3310 ::encode(in->xattrs, m->xattrbl);
3311 m->head.xattr_version = in->xattr_version;
3315 m->max_size = in->max_size;
3316 m->truncate_seq = in->truncate_seq;
3317 m->truncate_size = in->truncate_size;
3318 m->mtime = in->mtime;
3319 m->atime = in->atime;
3320 m->ctime = in->ctime;
3321 m->btime = in->btime;
3322 m->time_warp_seq = in->time_warp_seq;
3323 m->change_attr = in->change_attr;
3325 m->flags |= CLIENT_CAPS_SYNC;
3327 if (flush & CEPH_CAP_FILE_WR) {
3328 m->inline_version = in->inline_version;
3329 m->inline_data = in->inline_data;
3332 in->reported_size = in->size;
3333 m->set_snap_follows(follows);
3335 if (cap == in->auth_cap) {
3336 m->set_max_size(in->wanted_max_size);
3337 in->requested_max_size = in->wanted_max_size;
3338 ldout(cct, 15) << "auth cap, setting max_size = " << in->requested_max_size << dendl;
3341 if (!session->flushing_caps_tids.empty())
3342 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3344 session->con->send_message(m);
3347 static bool is_max_size_approaching(Inode *in)
3349 /* mds will adjust max size according to the reported size */
3350 if (in->flushing_caps & CEPH_CAP_FILE_WR)
3352 if (in->size >= in->max_size)
3354 /* half of previous max_size increment has been used */
3355 if (in->max_size > in->reported_size &&
3356 (in->size << 1) >= in->max_size + in->reported_size)
3364 * Examine currently used and wanted versus held caps. Release, flush or ack
3365 * revoked caps to the MDS as appropriate.
3367 * @param in the inode to check
3368 * @param flags flags to apply to cap check
3370 void Client::check_caps(Inode *in, unsigned flags)
3372 unsigned wanted = in->caps_wanted();
3373 unsigned used = get_caps_used(in);
3376 if (in->is_dir() && (in->flags & I_COMPLETE)) {
3377 // we do this here because we don't want to drop to Fs (and then
3378 // drop the Fs if we do a create!) if that alone makes us send lookups
3379 // to the MDS. Doing it in in->caps_wanted() has knock-on effects elsewhere
3380 wanted |= CEPH_CAP_FILE_EXCL;
3384 int issued = in->caps_issued(&implemented);
3385 int revoking = implemented & ~issued;
3387 int retain = wanted | used | CEPH_CAP_PIN;
3390 retain |= CEPH_CAP_ANY;
3392 retain |= CEPH_CAP_ANY_SHARED;
3395 ldout(cct, 10) << "check_caps on " << *in
3396 << " wanted " << ccap_string(wanted)
3397 << " used " << ccap_string(used)
3398 << " issued " << ccap_string(issued)
3399 << " revoking " << ccap_string(revoking)
3400 << " flags=" << flags
3403 if (in->snapid != CEPH_NOSNAP)
3404 return; //snap caps last forever, can't write
3406 if (in->caps.empty())
3407 return; // guard if at end of func
3409 if ((revoking & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) &&
3410 (used & CEPH_CAP_FILE_CACHE) && !(used & CEPH_CAP_FILE_BUFFER))
3413 if (!in->cap_snaps.empty())
3416 if (flags & CHECK_CAPS_NODELAY)
3417 in->hold_caps_until = utime_t();
3419 cap_delay_requeue(in);
3421 utime_t now = ceph_clock_now();
3423 map<mds_rank_t, Cap*>::iterator it = in->caps.begin();
3424 while (it != in->caps.end()) {
3425 mds_rank_t mds = it->first;
3426 Cap *cap = it->second;
3429 MetaSession *session = mds_sessions[mds];
3433 if (in->auth_cap && cap != in->auth_cap)
3434 cap_used &= ~in->auth_cap->issued;
3436 revoking = cap->implemented & ~cap->issued;
3438 ldout(cct, 10) << " cap mds." << mds
3439 << " issued " << ccap_string(cap->issued)
3440 << " implemented " << ccap_string(cap->implemented)
3441 << " revoking " << ccap_string(revoking) << dendl;
3443 if (in->wanted_max_size > in->max_size &&
3444 in->wanted_max_size > in->requested_max_size &&
3445 cap == in->auth_cap)
3448 /* approaching file_max? */
3449 if ((cap->issued & CEPH_CAP_FILE_WR) &&
3450 cap == in->auth_cap &&
3451 is_max_size_approaching(in)) {
3452 ldout(cct, 10) << "size " << in->size << " approaching max_size " << in->max_size
3453 << ", reported " << in->reported_size << dendl;
3457 /* completed revocation? */
3458 if (revoking && (revoking & cap_used) == 0) {
3459 ldout(cct, 10) << "completed revocation of " << ccap_string(cap->implemented & ~cap->issued) << dendl;
3463 /* want more caps from mds? */
3464 if (wanted & ~(cap->wanted | cap->issued))
3467 if (!revoking && unmounting && (cap_used == 0))
3470 if (wanted == cap->wanted && // mds knows what we want.
3471 ((cap->issued & ~retain) == 0) &&// and we don't have anything we wouldn't like
3472 !in->dirty_caps) // and we have no dirty caps
3475 if (now < in->hold_caps_until) {
3476 ldout(cct, 10) << "delaying cap release" << dendl;
3481 // re-send old cap/snapcap flushes first.
3482 if (session->mds_state >= MDSMap::STATE_RECONNECT &&
3483 session->mds_state < MDSMap::STATE_ACTIVE &&
3484 session->early_flushing_caps.count(in) == 0) {
3485 ldout(cct, 20) << " reflushing caps (check_caps) on " << *in
3486 << " to mds." << session->mds_num << dendl;
3487 session->early_flushing_caps.insert(in);
3488 if (in->cap_snaps.size())
3489 flush_snaps(in, true);
3490 if (in->flushing_caps)
3491 flush_caps(in, session, flags & CHECK_CAPS_SYNCHRONOUS);
3495 ceph_tid_t flush_tid;
3496 if (in->auth_cap == cap && in->dirty_caps) {
3497 flushing = mark_caps_flushing(in, &flush_tid);
3503 send_cap(in, session, cap, flags & CHECK_CAPS_SYNCHRONOUS, cap_used, wanted,
3504 retain, flushing, flush_tid);
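// When the snap context changes under an inode with dirty caps or data,
// capture a CapSnap of its current metadata.  If the file is being written the
// snap is left in "writing" state and finished later from put_cap_ref;
// buffered dirty data likewise delays the flush until writeback completes.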
3509 void Client::queue_cap_snap(Inode *in, SnapContext& old_snapc)
3511 int used = get_caps_used(in);
3512 int dirty = in->caps_dirty();
3513 ldout(cct, 10) << "queue_cap_snap " << *in << " snapc " << old_snapc << " used " << ccap_string(used) << dendl;
3515 if (in->cap_snaps.size() &&
3516 in->cap_snaps.rbegin()->second.writing) {
3517 ldout(cct, 10) << "queue_cap_snap already have pending cap_snap on " << *in << dendl;
3519 } else if (in->caps_dirty() ||
3520 (used & CEPH_CAP_FILE_WR) ||
3521 (dirty & CEPH_CAP_ANY_WR)) {
3522 const auto &capsnapem = in->cap_snaps.emplace(std::piecewise_construct, std::make_tuple(old_snapc.seq), std::make_tuple(in));
3523 assert(capsnapem.second == true); /* element inserted */
3524 CapSnap &capsnap = capsnapem.first->second;
3525 capsnap.context = old_snapc;
3526 capsnap.issued = in->caps_issued();
3527 capsnap.dirty = in->caps_dirty();
3529 capsnap.dirty_data = (used & CEPH_CAP_FILE_BUFFER);
3531 capsnap.uid = in->uid;
3532 capsnap.gid = in->gid;
3533 capsnap.mode = in->mode;
3534 capsnap.btime = in->btime;
3535 capsnap.xattrs = in->xattrs;
3536 capsnap.xattr_version = in->xattr_version;
3538 if (used & CEPH_CAP_FILE_WR) {
3539 ldout(cct, 10) << "queue_cap_snap WR used on " << *in << dendl;
3540 capsnap.writing = 1;
3542 finish_cap_snap(in, capsnap, used);
3545 ldout(cct, 10) << "queue_cap_snap not dirty|writing on " << *in << dendl;
3549 void Client::finish_cap_snap(Inode *in, CapSnap &capsnap, int used)
3551 ldout(cct, 10) << "finish_cap_snap " << *in << " capsnap " << (void *)&capsnap << " used " << ccap_string(used) << dendl;
3552 capsnap.size = in->size;
3553 capsnap.mtime = in->mtime;
3554 capsnap.atime = in->atime;
3555 capsnap.ctime = in->ctime;
3556 capsnap.time_warp_seq = in->time_warp_seq;
3557 capsnap.change_attr = in->change_attr;
3559 capsnap.dirty |= in->caps_dirty();
3561 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3562 capsnap.inline_data = in->inline_data;
3563 capsnap.inline_version = in->inline_version;
3566 if (used & CEPH_CAP_FILE_BUFFER) {
3567 ldout(cct, 10) << "finish_cap_snap " << *in << " cap_snap " << &capsnap << " used " << used
3568 << " WRBUFFER, delaying" << dendl;
3570 capsnap.dirty_data = 0;
3575 void Client::_flushed_cap_snap(Inode *in, snapid_t seq)
3577 ldout(cct, 10) << "_flushed_cap_snap seq " << seq << " on " << *in << dendl;
3578 in->cap_snaps.at(seq).dirty_data = 0;
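// flush_snaps sends a CEPH_CAP_OP_FLUSHSNAP message to the auth MDS for each
// CapSnap that is no longer writing or dirty, assigning a flush_tid the first
// time it is sent and tracking it in the session's flushing_caps_tids.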
3582 void Client::flush_snaps(Inode *in, bool all_again)
3584 ldout(cct, 10) << "flush_snaps on " << *in << " all_again " << all_again << dendl;
3585 assert(in->cap_snaps.size());
3588 assert(in->auth_cap);
3589 MetaSession *session = in->auth_cap->session;
3590 int mseq = in->auth_cap->mseq;
3592 for (auto &p : in->cap_snaps) {
3593 CapSnap &capsnap = p.second;
3595 // only flush once per session
3596 if (capsnap.flush_tid > 0)
3600 ldout(cct, 10) << "flush_snaps mds." << session->mds_num
3601 << " follows " << p.first
3602 << " size " << capsnap.size
3603 << " mtime " << capsnap.mtime
3604 << " dirty_data=" << capsnap.dirty_data
3605 << " writing=" << capsnap.writing
3606 << " on " << *in << dendl;
3607 if (capsnap.dirty_data || capsnap.writing)
3610 if (capsnap.flush_tid == 0) {
3611 capsnap.flush_tid = ++last_flush_tid;
3612 if (!in->flushing_cap_item.is_on_list())
3613 session->flushing_caps.push_back(&in->flushing_cap_item);
3614 session->flushing_caps_tids.insert(capsnap.flush_tid);
3617 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_FLUSHSNAP, in->ino, in->snaprealm->ino, 0, mseq,
3620 m->caller_uid = user_id;
3622 m->caller_gid = group_id;
3624 m->set_client_tid(capsnap.flush_tid);
3625 m->head.snap_follows = p.first;
3627 m->head.caps = capsnap.issued;
3628 m->head.dirty = capsnap.dirty;
3630 m->head.uid = capsnap.uid;
3631 m->head.gid = capsnap.gid;
3632 m->head.mode = capsnap.mode;
3633 m->btime = capsnap.btime;
3635 m->size = capsnap.size;
3637 m->head.xattr_version = capsnap.xattr_version;
3638 ::encode(capsnap.xattrs, m->xattrbl);
3640 m->ctime = capsnap.ctime;
3641 m->btime = capsnap.btime;
3642 m->mtime = capsnap.mtime;
3643 m->atime = capsnap.atime;
3644 m->time_warp_seq = capsnap.time_warp_seq;
3645 m->change_attr = capsnap.change_attr;
3647 if (capsnap.dirty & CEPH_CAP_FILE_WR) {
3648 m->inline_version = in->inline_version;
3649 m->inline_data = in->inline_data;
3652 assert(!session->flushing_caps_tids.empty());
3653 m->set_oldest_flush_tid(*session->flushing_caps_tids.begin());
3655 session->con->send_message(m);
3661 void Client::wait_on_list(list<Cond*>& ls)
3664 ls.push_back(&cond);
3665 cond.Wait(client_lock);
3669 void Client::signal_cond_list(list<Cond*>& ls)
3671 for (list<Cond*>::iterator it = ls.begin(); it != ls.end(); ++it)
3675 void Client::wait_on_context_list(list<Context*>& ls)
3680 ls.push_back(new C_Cond(&cond, &done, &r));
3682 cond.Wait(client_lock);
3685 void Client::signal_context_list(list<Context*>& ls)
3687 while (!ls.empty()) {
3688 ls.front()->complete(0);
3693 void Client::wake_inode_waiters(MetaSession *s)
3695 xlist<Cap*>::iterator iter = s->caps.begin();
3696 while (!iter.end()){
3697 signal_cond_list((*iter)->inode->waitfor_caps);
3703 // flush dirty data (from objectcache)
3705 class C_Client_CacheInvalidate : public Context {
3709 int64_t offset, length;
3711 C_Client_CacheInvalidate(Client *c, Inode *in, int64_t off, int64_t len) :
3712 client(c), offset(off), length(len) {
3713 if (client->use_faked_inos())
3714 ino = vinodeno_t(in->faked_ino, CEPH_NOSNAP);
3718 void finish(int r) override {
3719     // _async_invalidate takes the lock when it needs to; call this back from outside of the lock.
3720 assert(!client->client_lock.is_locked_by_me());
3721 client->_async_invalidate(ino, offset, length);
3725 void Client::_async_invalidate(vinodeno_t ino, int64_t off, int64_t len)
3729 ldout(cct, 10) << "_async_invalidate " << ino << " " << off << "~" << len << dendl;
3730 ino_invalidate_cb(callback_handle, ino, off, len);
3733 void Client::_schedule_invalidate_callback(Inode *in, int64_t off, int64_t len) {
3735 if (ino_invalidate_cb)
3736 // we queue the invalidate, which calls the callback and decrements the ref
3737 async_ino_invalidator.queue(new C_Client_CacheInvalidate(this, in, off, len));
3740 void Client::_invalidate_inode_cache(Inode *in)
3742 ldout(cct, 10) << "_invalidate_inode_cache " << *in << dendl;
3744 // invalidate our userspace inode cache
3745 if (cct->_conf->client_oc)
3746 objectcacher->release_set(&in->oset);
3748 _schedule_invalidate_callback(in, 0, 0);
3751 void Client::_invalidate_inode_cache(Inode *in, int64_t off, int64_t len)
3753 ldout(cct, 10) << "_invalidate_inode_cache " << *in << " " << off << "~" << len << dendl;
3755 // invalidate our userspace inode cache
3756 if (cct->_conf->client_oc) {
3757 vector<ObjectExtent> ls;
3758 Striper::file_to_extents(cct, in->ino, &in->layout, off, len, in->truncate_size, ls);
3759 objectcacher->discard_set(&in->oset, ls);
3762 _schedule_invalidate_callback(in, off, len);
3765 bool Client::_release(Inode *in)
3767 ldout(cct, 20) << "_release " << *in << dendl;
3768 if (in->cap_refs[CEPH_CAP_FILE_CACHE] == 0) {
3769 _invalidate_inode_cache(in);
3775 bool Client::_flush(Inode *in, Context *onfinish)
3777 ldout(cct, 10) << "_flush " << *in << dendl;
3779 if (!in->oset.dirty_or_tx) {
3780 ldout(cct, 10) << " nothing to flush" << dendl;
3781 onfinish->complete(0);
3785 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
3786 ldout(cct, 1) << __func__ << ": FULL, purging for ENOSPC" << dendl;
3787 objectcacher->purge_set(&in->oset);
3789 onfinish->complete(-ENOSPC);
3794 return objectcacher->flush_set(&in->oset, onfinish);
3797 void Client::_flush_range(Inode *in, int64_t offset, uint64_t size)
3799 assert(client_lock.is_locked());
3800 if (!in->oset.dirty_or_tx) {
3801 ldout(cct, 10) << " nothing to flush" << dendl;
3805 Mutex flock("Client::_flush_range flock");
3808 Context *onflush = new C_SafeCond(&flock, &cond, &safe);
3809 bool ret = objectcacher->file_flush(&in->oset, &in->layout, in->snaprealm->get_snap_context(),
3810 offset, size, onflush);
3813 client_lock.Unlock();
3822 void Client::flush_set_callback(ObjectCacher::ObjectSet *oset)
3824 // Mutex::Locker l(client_lock);
3825 assert(client_lock.is_locked()); // will be called via dispatch() -> objecter -> ...
3826 Inode *in = static_cast<Inode *>(oset->parent);
3831 void Client::_flushed(Inode *in)
3833 ldout(cct, 10) << "_flushed " << *in << dendl;
3835 put_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
3840 // checks common to add_update_cap, handle_cap_grant
3841 void Client::check_cap_issue(Inode *in, Cap *cap, unsigned issued)
3843 unsigned had = in->caps_issued();
3845 if ((issued & CEPH_CAP_FILE_CACHE) &&
3846 !(had & CEPH_CAP_FILE_CACHE))
3849 if ((issued & CEPH_CAP_FILE_SHARED) &&
3850 !(had & CEPH_CAP_FILE_SHARED)) {
3854 clear_dir_complete_and_ordered(in, true);
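// add_update_cap records a cap grant from an MDS: it creates the Cap (opening
// the snaprealm on the first cap), merges the newly issued bits, tracks
// auth-cap changes so flushing caps follow the new auth MDS, and wakes any
// waiters when new bits were granted.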
3858 void Client::add_update_cap(Inode *in, MetaSession *mds_session, uint64_t cap_id,
3859 unsigned issued, unsigned seq, unsigned mseq, inodeno_t realm,
3860 int flags, const UserPerm& cap_perms)
3863 mds_rank_t mds = mds_session->mds_num;
3864 if (in->caps.count(mds)) {
3865 cap = in->caps[mds];
3868 * auth mds of the inode changed. we received the cap export
3869 * message, but still haven't received the cap import message.
3870 * handle_cap_export() updated the new auth MDS' cap.
3872 * "ceph_seq_cmp(seq, cap->seq) <= 0" means we are processing
3873 * a message that was send before the cap import message. So
3874 * don't remove caps.
3876 if (ceph_seq_cmp(seq, cap->seq) <= 0) {
3877 assert(cap == in->auth_cap);
3878 assert(cap->cap_id == cap_id);
3881 issued |= cap->issued;
3882 flags |= CEPH_CAP_FLAG_AUTH;
3885 mds_session->num_caps++;
3886 if (!in->is_any_caps()) {
3887 assert(in->snaprealm == 0);
3888 in->snaprealm = get_snap_realm(realm);
3889 in->snaprealm->inodes_with_caps.push_back(&in->snaprealm_item);
3890 ldout(cct, 15) << "add_update_cap first one, opened snaprealm " << in->snaprealm << dendl;
3892 in->caps[mds] = cap = new Cap;
3894 mds_session->caps.push_back(&cap->cap_item);
3895 cap->session = mds_session;
3897 cap->gen = mds_session->cap_gen;
3898 cap_list.push_back(&in->cap_item);
3901 check_cap_issue(in, cap, issued);
3903 if (flags & CEPH_CAP_FLAG_AUTH) {
3904 if (in->auth_cap != cap &&
3905 (!in->auth_cap || ceph_seq_cmp(in->auth_cap->mseq, mseq) < 0)) {
3906 if (in->auth_cap && in->flushing_cap_item.is_on_list()) {
3907 ldout(cct, 10) << "add_update_cap changing auth cap: "
3908 << "add myself to new auth MDS' flushing caps list" << dendl;
3909 adjust_session_flushing_caps(in, in->auth_cap->session, mds_session);
3915 unsigned old_caps = cap->issued;
3916 cap->cap_id = cap_id;
3917 cap->issued |= issued;
3918 cap->implemented |= issued;
3920 cap->issue_seq = seq;
3922 cap->latest_perms = cap_perms;
3923 ldout(cct, 10) << "add_update_cap issued " << ccap_string(old_caps) << " -> " << ccap_string(cap->issued)
3924 << " from mds." << mds
3928 if ((issued & ~old_caps) && in->auth_cap == cap) {
3929     // non-auth MDS is revoking the newly granted caps?
3930 for (map<mds_rank_t,Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
3931 if (it->second == cap)
3933 if (it->second->implemented & ~it->second->issued & issued) {
3934 check_caps(in, CHECK_CAPS_NODELAY);
3940 if (issued & ~old_caps)
3941 signal_cond_list(in->waitfor_caps);
3944 void Client::remove_cap(Cap *cap, bool queue_release)
3946 Inode *in = cap->inode;
3947 MetaSession *session = cap->session;
3948 mds_rank_t mds = cap->session->mds_num;
3950 ldout(cct, 10) << "remove_cap mds." << mds << " on " << *in << dendl;
3952 if (queue_release) {
3953 session->enqueue_cap_release(
3961 if (in->auth_cap == cap) {
3962 if (in->flushing_cap_item.is_on_list()) {
3963 ldout(cct, 10) << " removing myself from flushing_cap list" << dendl;
3964 in->flushing_cap_item.remove_myself();
3966 in->auth_cap = NULL;
3968 assert(in->caps.count(mds));
3969 in->caps.erase(mds);
3971 cap->cap_item.remove_myself();
3975 if (!in->is_any_caps()) {
3976 ldout(cct, 15) << "remove_cap last one, closing snaprealm " << in->snaprealm << dendl;
3977 in->snaprealm_item.remove_myself();
3978 put_snap_realm(in->snaprealm);
3983 void Client::remove_all_caps(Inode *in)
3985 while (!in->caps.empty())
3986 remove_cap(in->caps.begin()->second, true);
3989 void Client::remove_session_caps(MetaSession *s)
3991 ldout(cct, 10) << "remove_session_caps mds." << s->mds_num << dendl;
3993 while (s->caps.size()) {
3994 Cap *cap = *s->caps.begin();
3995 Inode *in = cap->inode;
3996 bool dirty_caps = false, cap_snaps = false;
3997 if (in->auth_cap == cap) {
3998 cap_snaps = !in->cap_snaps.empty();
3999 dirty_caps = in->dirty_caps | in->flushing_caps;
4000 in->wanted_max_size = 0;
4001 in->requested_max_size = 0;
4002 in->flags |= I_CAP_DROPPED;
4004 remove_cap(cap, false);
4005 signal_cond_list(in->waitfor_caps);
4007 InodeRef tmp_ref(in);
4008 in->cap_snaps.clear();
4011 lderr(cct) << "remove_session_caps still has dirty|flushing caps on " << *in << dendl;
4012 if (in->flushing_caps) {
4013 num_flushing_caps--;
4014 in->flushing_cap_tids.clear();
4016 in->flushing_caps = 0;
4021 s->flushing_caps_tids.clear();
4025 class C_Client_Remount : public Context {
4029 explicit C_Client_Remount(Client *c) : client(c) {}
4030 void finish(int r) override {
4032 r = client->remount_cb(client->callback_handle);
4034 client_t whoami = client->get_nodeid();
4035 lderr(client->cct) << "tried to remount (to trim kernel dentries) and got error "
4037 if (client->require_remount && !client->unmounting) {
4038 assert(0 == "failed to remount for kernel dentry trimming");
4044 void Client::_invalidate_kernel_dcache()
4048 if (can_invalidate_dentries && dentry_invalidate_cb && root->dir) {
4049 for (ceph::unordered_map<string, Dentry*>::iterator p = root->dir->dentries.begin();
4050 p != root->dir->dentries.end();
4052 if (p->second->inode)
4053 _schedule_invalidate_dentry_callback(p->second, false);
4055 } else if (remount_cb) {
4057     // when remounting a file system, the linux kernel trims all unused dentries in the fs
4058 remount_finisher.queue(new C_Client_Remount(this));
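// trim_caps tries to bring this session's cap count down toward `max`:
// disposable non-auth caps are dropped outright, dentries pinning other inodes
// are expired (or scheduled for kernel dentry invalidation), and if we are
// still over the limit the whole kernel dcache is invalidated.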
4062 void Client::trim_caps(MetaSession *s, int max)
4064 mds_rank_t mds = s->mds_num;
4065 int caps_size = s->caps.size();
4066 ldout(cct, 10) << "trim_caps mds." << mds << " max " << max
4067 << " caps " << caps_size << dendl;
4070 xlist<Cap*>::iterator p = s->caps.begin();
4071 while ((caps_size - trimmed) > max && !p.end()) {
4073 Inode *in = cap->inode;
4075 // Increment p early because it will be invalidated if cap
4076 // is deleted inside remove_cap
4079 if (in->caps.size() > 1 && cap != in->auth_cap) {
4080 int mine = cap->issued | cap->implemented;
4081 int oissued = in->auth_cap ? in->auth_cap->issued : 0;
4082 // disposable non-auth cap
4083 if (!(get_caps_used(in) & ~oissued & mine)) {
4084 ldout(cct, 20) << " removing unused, unneeded non-auth cap on " << *in << dendl;
4085 remove_cap(cap, true);
4089 ldout(cct, 20) << " trying to trim dentries for " << *in << dendl;
4091 set<Dentry*>::iterator q = in->dn_set.begin();
4092 InodeRef tmp_ref(in);
4093 while (q != in->dn_set.end()) {
4095 if (dn->lru_is_expireable()) {
4096 if (can_invalidate_dentries &&
4097 dn->dir->parent_inode->ino == MDS_INO_ROOT) {
4098 // Only issue one of these per DN for inodes in root: handle
4099 // others more efficiently by calling for root-child DNs at
4100 // the end of this function.
4101 _schedule_invalidate_dentry_callback(dn, true);
4105 ldout(cct, 20) << " not expirable: " << dn->name << dendl;
4109 if (all && in->ino != MDS_INO_ROOT) {
4110 ldout(cct, 20) << __func__ << " counting as trimmed: " << *in << dendl;
4116 if (s->caps.size() > max)
4117 _invalidate_kernel_dcache();
4120 void Client::force_session_readonly(MetaSession *s)
4123 for (xlist<Cap*>::iterator p = s->caps.begin(); !p.end(); ++p) {
4124 Inode *in = (*p)->inode;
4125 if (in->caps_wanted() & CEPH_CAP_FILE_WR)
4126 signal_cond_list(in->waitfor_caps);
4130 void Client::mark_caps_dirty(Inode *in, int caps)
4132 ldout(cct, 10) << "mark_caps_dirty " << *in << " " << ccap_string(in->dirty_caps) << " -> "
4133 << ccap_string(in->dirty_caps | caps) << dendl;
4134 if (caps && !in->caps_dirty())
4136 in->dirty_caps |= caps;
4139 int Client::mark_caps_flushing(Inode *in, ceph_tid_t* ptid)
4141 MetaSession *session = in->auth_cap->session;
4143 int flushing = in->dirty_caps;
4146 ceph_tid_t flush_tid = ++last_flush_tid;
4147 in->flushing_cap_tids[flush_tid] = flushing;
4149 if (!in->flushing_caps) {
4150 ldout(cct, 10) << "mark_caps_flushing " << ccap_string(flushing) << " " << *in << dendl;
4151 num_flushing_caps++;
4153 ldout(cct, 10) << "mark_caps_flushing (more) " << ccap_string(flushing) << " " << *in << dendl;
4156 in->flushing_caps |= flushing;
4159 if (!in->flushing_cap_item.is_on_list())
4160 session->flushing_caps.push_back(&in->flushing_cap_item);
4161 session->flushing_caps_tids.insert(flush_tid);
4167 void Client::adjust_session_flushing_caps(Inode *in, MetaSession *old_s, MetaSession *new_s)
4169 for (auto &p : in->cap_snaps) {
4170 CapSnap &capsnap = p.second;
4171 if (capsnap.flush_tid > 0) {
4172 old_s->flushing_caps_tids.erase(capsnap.flush_tid);
4173 new_s->flushing_caps_tids.insert(capsnap.flush_tid);
4176 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4177 it != in->flushing_cap_tids.end();
4179 old_s->flushing_caps_tids.erase(it->first);
4180 new_s->flushing_caps_tids.insert(it->first);
4182 new_s->flushing_caps.push_back(&in->flushing_cap_item);
4186 * Flush all caps back to the MDS. Because the callers generally wait on the
4187 * result of this function (syncfs and umount cases), we set
4188 * CHECK_CAPS_SYNCHRONOUS on the last check_caps call.
4190 void Client::flush_caps_sync()
4192 ldout(cct, 10) << __func__ << dendl;
4193 xlist<Inode*>::iterator p = delayed_caps.begin();
4195 unsigned flags = CHECK_CAPS_NODELAY;
4199 delayed_caps.pop_front();
4200 if (p.end() && cap_list.empty())
4201 flags |= CHECK_CAPS_SYNCHRONOUS;
4202 check_caps(in, flags);
4206 p = cap_list.begin();
4208 unsigned flags = CHECK_CAPS_NODELAY;
4213 flags |= CHECK_CAPS_SYNCHRONOUS;
4214 check_caps(in, flags);
4218 void Client::flush_caps(Inode *in, MetaSession *session, bool sync)
4220 ldout(cct, 10) << "flush_caps " << in << " mds." << session->mds_num << dendl;
4221 Cap *cap = in->auth_cap;
4222 assert(cap->session == session);
4224 for (map<ceph_tid_t,int>::iterator p = in->flushing_cap_tids.begin();
4225 p != in->flushing_cap_tids.end();
4227 bool req_sync = false;
4229     /* If this is a synchronous request, then flush the journal on the last one */
4230 if (sync && (p->first == in->flushing_cap_tids.rbegin()->first))
4233 send_cap(in, session, cap, req_sync,
4234 (get_caps_used(in) | in->caps_dirty()),
4235 in->caps_wanted(), (cap->issued | cap->implemented),
4236 p->second, p->first);
4240 void Client::wait_sync_caps(Inode *in, ceph_tid_t want)
4242 while (in->flushing_caps) {
4243 map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4244 assert(it != in->flushing_cap_tids.end());
4245 if (it->first > want)
4247 ldout(cct, 10) << "wait_sync_caps on " << *in << " flushing "
4248 << ccap_string(it->second) << " want " << want
4249 << " last " << it->first << dendl;
4250 wait_on_list(in->waitfor_caps);
4254 void Client::wait_sync_caps(ceph_tid_t want)
4257 ldout(cct, 10) << "wait_sync_caps want " << want << " (last is " << last_flush_tid << ", "
4258 << num_flushing_caps << " total flushing)" << dendl;
4259 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
4260 p != mds_sessions.end();
4262 MetaSession *s = p->second;
4263 if (s->flushing_caps_tids.empty())
4265 ceph_tid_t oldest_tid = *s->flushing_caps_tids.begin();
4266 if (oldest_tid <= want) {
4267 ldout(cct, 10) << " waiting on mds." << p->first << " tid " << oldest_tid
4268 << " (want " << want << ")" << dendl;
4269 sync_cond.Wait(client_lock);
4275 void Client::kick_flushing_caps(MetaSession *session)
4277 mds_rank_t mds = session->mds_num;
4278 ldout(cct, 10) << "kick_flushing_caps mds." << mds << dendl;
4280 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4282 if (session->early_flushing_caps.count(in))
4284 ldout(cct, 20) << " reflushing caps on " << *in << " to mds." << mds << dendl;
4285 if (in->cap_snaps.size())
4286 flush_snaps(in, true);
4287 if (in->flushing_caps)
4288 flush_caps(in, session);
4291 session->early_flushing_caps.clear();
4294 void Client::early_kick_flushing_caps(MetaSession *session)
4296 session->early_flushing_caps.clear();
4298 for (xlist<Inode*>::iterator p = session->flushing_caps.begin(); !p.end(); ++p) {
4300 assert(in->auth_cap);
4302 // if flushing caps were revoked, we re-send the cap flush in client reconnect
4303 // stage. This guarantees that MDS processes the cap flush message before issuing
4304   // the flushing caps to other clients.
4305 if ((in->flushing_caps & in->auth_cap->issued) == in->flushing_caps)
4308 ldout(cct, 20) << " reflushing caps (early_kick) on " << *in
4309 << " to mds." << session->mds_num << dendl;
4311 session->early_flushing_caps.insert(in);
4313 if (in->cap_snaps.size())
4314 flush_snaps(in, true);
4315 if (in->flushing_caps)
4316 flush_caps(in, session);
4321 void Client::kick_maxsize_requests(MetaSession *session)
4323 xlist<Cap*>::iterator iter = session->caps.begin();
4324 while (!iter.end()){
4325 (*iter)->inode->requested_max_size = 0;
4326 (*iter)->inode->wanted_max_size = 0;
4327 signal_cond_list((*iter)->inode->waitfor_caps);
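// Rebuild the realm's cached SnapContext by merging prior-parent snaps, the
// current parent's snaps from parent_since onward, and the realm's own snaps,
// listed newest-first, with seq taken as the maximum seen.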
4332 void SnapRealm::build_snap_context()
4334 set<snapid_t> snaps;
4335 snapid_t max_seq = seq;
4337 // start with prior_parents?
4338 for (unsigned i=0; i<prior_parent_snaps.size(); i++)
4339 snaps.insert(prior_parent_snaps[i]);
4341 // current parent's snaps
4343 const SnapContext& psnapc = pparent->get_snap_context();
4344 for (unsigned i=0; i<psnapc.snaps.size(); i++)
4345 if (psnapc.snaps[i] >= parent_since)
4346 snaps.insert(psnapc.snaps[i]);
4347 if (psnapc.seq > max_seq)
4348 max_seq = psnapc.seq;
4352 for (unsigned i=0; i<my_snaps.size(); i++)
4353 snaps.insert(my_snaps[i]);
4356 cached_snap_context.seq = max_seq;
4357 cached_snap_context.snaps.resize(0);
4358 cached_snap_context.snaps.reserve(snaps.size());
4359 for (set<snapid_t>::reverse_iterator p = snaps.rbegin(); p != snaps.rend(); ++p)
4360 cached_snap_context.snaps.push_back(*p);
4363 void Client::invalidate_snaprealm_and_children(SnapRealm *realm)
4368 while (!q.empty()) {
4372 ldout(cct, 10) << "invalidate_snaprealm_and_children " << *realm << dendl;
4373 realm->invalidate_cache();
4375 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4376 p != realm->pchildren.end();
4382 SnapRealm *Client::get_snap_realm(inodeno_t r)
4384 SnapRealm *realm = snap_realms[r];
4386 snap_realms[r] = realm = new SnapRealm(r);
4387 ldout(cct, 20) << "get_snap_realm " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4392 SnapRealm *Client::get_snap_realm_maybe(inodeno_t r)
4394 if (snap_realms.count(r) == 0) {
4395 ldout(cct, 20) << "get_snap_realm_maybe " << r << " fail" << dendl;
4398 SnapRealm *realm = snap_realms[r];
4399 ldout(cct, 20) << "get_snap_realm_maybe " << r << " " << realm << " " << realm->nref << " -> " << (realm->nref + 1) << dendl;
4404 void Client::put_snap_realm(SnapRealm *realm)
4406 ldout(cct, 20) << "put_snap_realm " << realm->ino << " " << realm
4407 << " " << realm->nref << " -> " << (realm->nref - 1) << dendl;
4408 if (--realm->nref == 0) {
4409 snap_realms.erase(realm->ino);
4410 if (realm->pparent) {
4411 realm->pparent->pchildren.erase(realm);
4412 put_snap_realm(realm->pparent);
4418 bool Client::adjust_realm_parent(SnapRealm *realm, inodeno_t parent)
4420 if (realm->parent != parent) {
4421 ldout(cct, 10) << "adjust_realm_parent " << *realm
4422 << " " << realm->parent << " -> " << parent << dendl;
4423 realm->parent = parent;
4424 if (realm->pparent) {
4425 realm->pparent->pchildren.erase(realm);
4426 put_snap_realm(realm->pparent);
4428 realm->pparent = get_snap_realm(parent);
4429 realm->pparent->pchildren.insert(realm);
4435 static bool has_new_snaps(const SnapContext& old_snapc,
4436 const SnapContext& new_snapc)
4438 return !new_snapc.snaps.empty() && new_snapc.snaps[0] > old_snapc.seq;
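// update_snap_trace decodes the snap realm trace the MDS attaches to replies:
// for each realm whose seq advanced we first record the old snap context (so
// dirty caps can be flushed against it), apply the new snaps and parent, and
// invalidate the cached contexts of the realm and its children.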
4442 void Client::update_snap_trace(bufferlist& bl, SnapRealm **realm_ret, bool flush)
4444 SnapRealm *first_realm = NULL;
4445 ldout(cct, 10) << "update_snap_trace len " << bl.length() << dendl;
4447 map<SnapRealm*, SnapContext> dirty_realms;
4449 bufferlist::iterator p = bl.begin();
4453 SnapRealm *realm = get_snap_realm(info.ino());
4455 bool invalidate = false;
4457 if (info.seq() > realm->seq) {
4458 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq() << " > " << realm->seq
4462 // writeback any dirty caps _before_ updating snap list (i.e. with old snap info)
4463 // flush me + children
4466 while (!q.empty()) {
4467 SnapRealm *realm = q.front();
4470 for (set<SnapRealm*>::iterator p = realm->pchildren.begin();
4471 p != realm->pchildren.end();
4475 if (dirty_realms.count(realm) == 0) {
4477 dirty_realms[realm] = realm->get_snap_context();
4483 realm->seq = info.seq();
4484 realm->created = info.created();
4485 realm->parent_since = info.parent_since();
4486 realm->prior_parent_snaps = info.prior_parent_snaps;
4487 realm->my_snaps = info.my_snaps;
4491 // _always_ verify parent
4492 if (adjust_realm_parent(realm, info.parent()))
4496 invalidate_snaprealm_and_children(realm);
4497 ldout(cct, 15) << "update_snap_trace " << *realm << " self|parent updated" << dendl;
4498 ldout(cct, 15) << " snapc " << realm->get_snap_context() << dendl;
4500 ldout(cct, 10) << "update_snap_trace " << *realm << " seq " << info.seq()
4501 << " <= " << realm->seq << " and same parent, SKIPPING" << dendl;
4505 first_realm = realm;
4507 put_snap_realm(realm);
4510 for (map<SnapRealm*, SnapContext>::iterator q = dirty_realms.begin();
4511 q != dirty_realms.end();
4513 SnapRealm *realm = q->first;
4514 // are there new snaps?
4515 if (has_new_snaps(q->second, realm->get_snap_context())) {
4516 ldout(cct, 10) << " flushing caps on " << *realm << dendl;
4517 xlist<Inode*>::iterator r = realm->inodes_with_caps.begin();
4521 queue_cap_snap(in, q->second);
4524 ldout(cct, 10) << " no new snap on " << *realm << dendl;
4526 put_snap_realm(realm);
4530 *realm_ret = first_realm;
4532 put_snap_realm(first_realm);
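// Handle an MClientSnap message from an MDS.  For a SPLIT op, inodes and
// child realms named in the message are moved into the realm being split
// off (remembering each inode's old snap context so cap snaps can be queued
// afterwards); the embedded snap trace is then applied.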
4535 void Client::handle_snap(MClientSnap *m)
4537 ldout(cct, 10) << "handle_snap " << *m << dendl;
4538 mds_rank_t mds = mds_rank_t(m->get_source().num());
4539 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4545 got_mds_push(session);
4547 map<Inode*, SnapContext> to_move;
4548 SnapRealm *realm = 0;
4550 if (m->head.op == CEPH_SNAP_OP_SPLIT) {
4551 assert(m->head.split);
4553 bufferlist::iterator p = m->bl.begin();
4555 assert(info.ino() == m->head.split);
4557 // flush, then move, the inos.
4558 realm = get_snap_realm(info.ino());
4559 ldout(cct, 10) << " splitting off " << *realm << dendl;
4560 for (vector<inodeno_t>::iterator p = m->split_inos.begin();
4561 p != m->split_inos.end();
4563 vinodeno_t vino(*p, CEPH_NOSNAP);
4564 if (inode_map.count(vino)) {
4565 Inode *in = inode_map[vino];
4566 if (!in->snaprealm || in->snaprealm == realm)
4568 if (in->snaprealm->created > info.created()) {
4569 ldout(cct, 10) << " NOT moving " << *in << " from _newer_ realm "
4570 << *in->snaprealm << dendl;
4573 ldout(cct, 10) << " moving " << *in << " from " << *in->snaprealm << dendl;
4576 in->snaprealm_item.remove_myself();
4577 to_move[in] = in->snaprealm->get_snap_context();
4578 put_snap_realm(in->snaprealm);
4582 // move child snaprealms, too
4583 for (vector<inodeno_t>::iterator p = m->split_realms.begin();
4584 p != m->split_realms.end();
4586 ldout(cct, 10) << "adjusting snaprealm " << *p << " parent" << dendl;
4587 SnapRealm *child = get_snap_realm_maybe(*p);
4590 adjust_realm_parent(child, realm->ino);
4591 put_snap_realm(child);
4595 update_snap_trace(m->bl, NULL, m->head.op != CEPH_SNAP_OP_DESTROY);
4598 for (auto p = to_move.begin(); p != to_move.end(); ++p) {
4599 Inode *in = p->first;
4600 in->snaprealm = realm;
4601 realm->inodes_with_caps.push_back(&in->snaprealm_item);
4603 // queue for snap writeback
4604 if (has_new_snaps(p->second, realm->get_snap_context()))
4605 queue_cap_snap(in, p->second);
4607 put_snap_realm(realm);
4613 void Client::handle_quota(MClientQuota *m)
4615 mds_rank_t mds = mds_rank_t(m->get_source().num());
4616 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4622 got_mds_push(session);
4624 ldout(cct, 10) << "handle_quota " << *m << " from mds." << mds << dendl;
4626 vinodeno_t vino(m->ino, CEPH_NOSNAP);
4627 if (inode_map.count(vino)) {
4629 in = inode_map[vino];
4632 in->quota = m->quota;
4633 in->rstat = m->rstat;
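// Entry point for MClientCaps messages: honour any OSD epoch barrier, look
// up the inode and dispatch to the per-op handlers below.  Caps for inodes
// we no longer know about are released back to the MDS (IMPORT) or dropped.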
4640 void Client::handle_caps(MClientCaps *m)
4642 mds_rank_t mds = mds_rank_t(m->get_source().num());
4643 MetaSession *session = _get_mds_session(mds, m->get_connection().get());
4649 if (m->osd_epoch_barrier && !objecter->have_map(m->osd_epoch_barrier)) {
4650 // Pause RADOS operations until we see the required epoch
4651 objecter->set_epoch_barrier(m->osd_epoch_barrier);
4654 if (m->osd_epoch_barrier > cap_epoch_barrier) {
4655 // Record the barrier so that we will transmit it to MDS when releasing
4656 set_cap_epoch_barrier(m->osd_epoch_barrier);
4659 got_mds_push(session);
4661 m->clear_payload(); // for if/when we send back to MDS
4664 vinodeno_t vino(m->get_ino(), CEPH_NOSNAP);
4665 if (inode_map.count(vino))
4666 in = inode_map[vino];
4668 if (m->get_op() == CEPH_CAP_OP_IMPORT) {
4669 ldout(cct, 5) << "handle_caps don't have vino " << vino << " on IMPORT, immediately releasing" << dendl;
4670 session->enqueue_cap_release(
4677 ldout(cct, 5) << "handle_caps don't have vino " << vino << ", dropping" << dendl;
4681 // in case the mds is waiting on e.g. a revocation
4682 flush_cap_releases();
4686 switch (m->get_op()) {
4687 case CEPH_CAP_OP_EXPORT:
4688 return handle_cap_export(session, in, m);
4689 case CEPH_CAP_OP_FLUSHSNAP_ACK:
4690 return handle_cap_flushsnap_ack(session, in, m);
4691 case CEPH_CAP_OP_IMPORT:
4692 handle_cap_import(session, in, m);
4695 if (in->caps.count(mds) == 0) {
4696 ldout(cct, 5) << "handle_caps don't have " << *in << " cap on mds." << mds << dendl;
4701 Cap *cap = in->caps[mds];
4703 switch (m->get_op()) {
4704 case CEPH_CAP_OP_TRUNC: return handle_cap_trunc(session, in, m);
4705 case CEPH_CAP_OP_IMPORT:
4706 case CEPH_CAP_OP_REVOKE:
4707 case CEPH_CAP_OP_GRANT: return handle_cap_grant(session, in, cap, m);
4708 case CEPH_CAP_OP_FLUSH_ACK: return handle_cap_flush_ack(session, in, cap, m);
4714 void Client::handle_cap_import(MetaSession *session, Inode *in, MClientCaps *m)
4716 mds_rank_t mds = session->mds_num;
4718 ldout(cct, 5) << "handle_cap_import ino " << m->get_ino() << " mseq " << m->get_mseq()
4719 << " IMPORT from mds." << mds << dendl;
4721 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
4724 if (m->peer.cap_id && in->caps.count(peer_mds)) {
4725 cap = in->caps[peer_mds];
4727 cap_perms = cap->latest_perms;
4732 SnapRealm *realm = NULL;
4733 update_snap_trace(m->snapbl, &realm);
4735 add_update_cap(in, session, m->get_cap_id(),
4736 m->get_caps(), m->get_seq(), m->get_mseq(), m->get_realm(),
4737 CEPH_CAP_FLAG_AUTH, cap_perms);
4739 if (cap && cap->cap_id == m->peer.cap_id) {
4740 remove_cap(cap, (m->peer.flags & CEPH_CAP_FLAG_RELEASE));
4744 put_snap_realm(realm);
4746 if (in->auth_cap && in->auth_cap->session->mds_num == mds) {
4747 // reflush any/all caps (if we are now the auth_cap)
4748 if (in->cap_snaps.size())
4749 flush_snaps(in, true);
4750 if (in->flushing_caps)
4751 flush_caps(in, session);
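// The sending MDS is exporting our cap.  If a peer MDS is named, fold the
// exported cap's issued bits into (or create) the cap on that peer,
// transferring auth status and flushing state as needed; otherwise just
// remove the cap, noting I_CAP_DROPPED if it was the auth cap.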
4755 void Client::handle_cap_export(MetaSession *session, Inode *in, MClientCaps *m)
4757 mds_rank_t mds = session->mds_num;
4759 ldout(cct, 5) << "handle_cap_export ino " << m->get_ino() << " mseq " << m->get_mseq()
4760 << " EXPORT from mds." << mds << dendl;
4763 if (in->caps.count(mds))
4764 cap = in->caps[mds];
4766 const mds_rank_t peer_mds = mds_rank_t(m->peer.mds);
4768 if (cap && cap->cap_id == m->get_cap_id()) {
4769 if (m->peer.cap_id) {
4770 MetaSession *tsession = _get_or_open_mds_session(peer_mds);
4771 if (in->caps.count(peer_mds)) {
4772 Cap *tcap = in->caps[peer_mds];
4773 if (tcap->cap_id == m->peer.cap_id &&
4774 ceph_seq_cmp(tcap->seq, m->peer.seq) < 0) {
4775 tcap->cap_id = m->peer.cap_id;
4776 tcap->seq = m->peer.seq - 1;
4777 tcap->issue_seq = tcap->seq;
4778 tcap->mseq = m->peer.mseq;
4779 tcap->issued |= cap->issued;
4780 tcap->implemented |= cap->issued;
4781 if (cap == in->auth_cap)
4782 in->auth_cap = tcap;
4783 if (in->auth_cap == tcap && in->flushing_cap_item.is_on_list())
4784 adjust_session_flushing_caps(in, session, tsession);
4787 add_update_cap(in, tsession, m->peer.cap_id, cap->issued,
4788 m->peer.seq - 1, m->peer.mseq, (uint64_t)-1,
4789 cap == in->auth_cap ? CEPH_CAP_FLAG_AUTH : 0,
4793 if (cap == in->auth_cap)
4794 in->flags |= I_CAP_DROPPED;
4797 remove_cap(cap, false);
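// MDS-driven truncate: fold the new size, truncate_seq and timestamps into
// the inode via update_inode_file_bits.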
4803 void Client::handle_cap_trunc(MetaSession *session, Inode *in, MClientCaps *m)
4805 mds_rank_t mds = session->mds_num;
4806 assert(in->caps[mds]);
4808 ldout(cct, 10) << "handle_cap_trunc on ino " << *in
4809 << " size " << in->size << " -> " << m->get_size()
4812 int implemented = 0;
4813 int issued = in->caps_issued(&implemented) | in->caps_dirty();
4814 issued |= implemented;
4815 update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(),
4816 m->get_size(), m->get_change_attr(), m->get_time_warp_seq(),
4817 m->get_ctime(), m->get_mtime(), m->get_atime(),
4818 m->inline_version, m->inline_data, issued);
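// The MDS acked one of our cap flushes: retire every flush tid up to and
// including the acked one, clear the cleaned bits from flushing_caps and
// wake any waiters once nothing remains dirty or flushing.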
4822 void Client::handle_cap_flush_ack(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
4824 ceph_tid_t flush_ack_tid = m->get_client_tid();
4825 int dirty = m->get_dirty();
4829 for (map<ceph_tid_t, int>::iterator it = in->flushing_cap_tids.begin();
4830 it != in->flushing_cap_tids.end(); ) {
4831 if (it->first == flush_ack_tid)
4832 cleaned = it->second;
4833 if (it->first <= flush_ack_tid) {
4834 session->flushing_caps_tids.erase(it->first);
4835 in->flushing_cap_tids.erase(it++);
4839 cleaned &= ~it->second;
4845 ldout(cct, 5) << "handle_cap_flush_ack mds." << session->mds_num
4846 << " cleaned " << ccap_string(cleaned) << " on " << *in
4847 << " with " << ccap_string(dirty) << dendl;
4850 signal_cond_list(in->waitfor_caps);
4851 if (session->flushing_caps_tids.empty() ||
4852 *session->flushing_caps_tids.begin() > flush_ack_tid)
4857 in->cap_dirtier_uid = -1;
4858 in->cap_dirtier_gid = -1;
4862 ldout(cct, 10) << " tid " << m->get_client_tid() << " != any cap bit tids" << dendl;
4864 if (in->flushing_caps) {
4865 ldout(cct, 5) << " flushing_caps " << ccap_string(in->flushing_caps)
4866 << " -> " << ccap_string(in->flushing_caps & ~cleaned) << dendl;
4867 in->flushing_caps &= ~cleaned;
4868 if (in->flushing_caps == 0) {
4869 ldout(cct, 10) << " " << *in << " !flushing" << dendl;
4870 num_flushing_caps--;
4871 if (in->cap_snaps.empty())
4872 in->flushing_cap_item.remove_myself();
4874 if (!in->caps_dirty())
4883 void Client::handle_cap_flushsnap_ack(MetaSession *session, Inode *in, MClientCaps *m)
4885 mds_rank_t mds = session->mds_num;
4886 assert(in->caps[mds]);
4887 snapid_t follows = m->get_snap_follows();
4889 if (in->cap_snaps.count(follows)) {
4890 CapSnap &capsnap = in->cap_snaps.at(follows);
4891 if (m->get_client_tid() != capsnap.flush_tid) {
4892 ldout(cct, 10) << " tid " << m->get_client_tid() << " != " << capsnap.flush_tid << dendl;
4894 ldout(cct, 5) << "handle_cap_flushedsnap mds." << mds << " flushed snap follows " << follows
4895 << " on " << *in << dendl;
4897 if (in->get_num_ref() == 1)
4898 tmp_ref = in; // make sure the inode doesn't get freed while erasing the item from in->cap_snaps
4899 if (in->flushing_caps == 0 && in->cap_snaps.empty())
4900 in->flushing_cap_item.remove_myself();
4901 session->flushing_caps_tids.erase(capsnap.flush_tid);
4902 in->cap_snaps.erase(follows);
4905 ldout(cct, 5) << "handle_cap_flushedsnap DUP(?) mds." << mds << " flushed snap follows " << follows
4906 << " on " << *in << dendl;
4907 // we may not have it if we sent multiple FLUSHSNAP requests and got multiple FLUSHEDSNAPs back
4913 class C_Client_DentryInvalidate : public Context {
4920 C_Client_DentryInvalidate(Client *c, Dentry *dn, bool del) :
4921 client(c), name(dn->name) {
4922 if (client->use_faked_inos()) {
4923 dirino.ino = dn->dir->parent_inode->faked_ino;
4925 ino.ino = dn->inode->faked_ino;
4927 dirino = dn->dir->parent_inode->vino();
4929 ino = dn->inode->vino();
4932 ino.ino = inodeno_t();
4934 void finish(int r) override {
4935 // _async_dentry_invalidate is responsible for its own locking
4936 assert(!client->client_lock.is_locked_by_me());
4937 client->_async_dentry_invalidate(dirino, ino, name);
4941 void Client::_async_dentry_invalidate(vinodeno_t dirino, vinodeno_t ino, string& name)
4945 ldout(cct, 10) << "_async_dentry_invalidate '" << name << "' ino " << ino
4946 << " in dir " << dirino << dendl;
4947 dentry_invalidate_cb(callback_handle, dirino, ino, name);
4950 void Client::_schedule_invalidate_dentry_callback(Dentry *dn, bool del)
4952 if (dentry_invalidate_cb && dn->inode->ll_ref > 0)
4953 async_dentry_invalidator.queue(new C_Client_DentryInvalidate(this, dn, del));
4956 void Client::_try_to_trim_inode(Inode *in, bool sched_inval)
4958 int ref = in->get_num_ref();
4960 if (in->dir && !in->dir->dentries.empty()) {
4961 for (auto p = in->dir->dentries.begin();
4962 p != in->dir->dentries.end(); ) {
4963 Dentry *dn = p->second;
4965 /* rmsnap removes the whole subtree; we need to trim inodes recursively.
4966 * we don't need to invalidate dentries recursively, because
4967 * invalidating a directory dentry effectively invalidates
4969 if (in->snapid != CEPH_NOSNAP && dn->inode && dn->inode->is_dir())
4970 _try_to_trim_inode(dn->inode.get(), false);
4972 if (dn->lru_is_expireable())
4973 unlink(dn, true, false); // keep dir, drop dentry
4975 if (in->dir->dentries.empty()) {
4981 if (ref > 0 && (in->flags & I_SNAPDIR_OPEN)) {
4982 InodeRef snapdir = open_snapdir(in);
4983 _try_to_trim_inode(snapdir.get(), false);
4987 if (ref > 0 && in->ll_ref > 0 && sched_inval) {
4988 set<Dentry*>::iterator q = in->dn_set.begin();
4989 while (q != in->dn_set.end()) {
4991 // FIXME: we play lots of unlink/link tricks when handling MDS replies,
4992 // so in->dn_set doesn't always reflect the state of kernel's dcache.
4993 _schedule_invalidate_dentry_callback(dn, true);
4994 unlink(dn, true, true);
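// Apply a cap GRANT/REVOKE/IMPORT update: refresh whatever inode metadata we
// do not hold EXCL caps for, pick up max_size changes from the auth cap, and
// handle revocation (flushing buffered data first if FILE_BUFFER is being
// taken away) or simply record the newly granted bits.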
4999 void Client::handle_cap_grant(MetaSession *session, Inode *in, Cap *cap, MClientCaps *m)
5001 mds_rank_t mds = session->mds_num;
5002 int used = get_caps_used(in);
5003 int wanted = in->caps_wanted();
5005 const int old_caps = cap->issued;
5006 const int new_caps = m->get_caps();
5007 ldout(cct, 5) << "handle_cap_grant on in " << m->get_ino()
5008 << " mds." << mds << " seq " << m->get_seq()
5009 << " caps now " << ccap_string(new_caps)
5010 << " was " << ccap_string(old_caps) << dendl;
5011 cap->seq = m->get_seq();
5013 in->layout = m->get_layout();
5016 int implemented = 0;
5017 int issued = in->caps_issued(&implemented) | in->caps_dirty();
5018 issued |= implemented;
5020 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
5021 in->mode = m->head.mode;
5022 in->uid = m->head.uid;
5023 in->gid = m->head.gid;
5024 in->btime = m->btime;
5026 bool deleted_inode = false;
5027 if ((issued & CEPH_CAP_LINK_EXCL) == 0) {
5028 in->nlink = m->head.nlink;
5029 if (in->nlink == 0 &&
5030 (new_caps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL)))
5031 deleted_inode = true;
5033 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
5034 m->xattrbl.length() &&
5035 m->head.xattr_version > in->xattr_version) {
5036 bufferlist::iterator p = m->xattrbl.begin();
5037 ::decode(in->xattrs, p);
5038 in->xattr_version = m->head.xattr_version;
5040 update_inode_file_bits(in, m->get_truncate_seq(), m->get_truncate_size(), m->get_size(),
5041 m->get_change_attr(), m->get_time_warp_seq(), m->get_ctime(),
5042 m->get_mtime(), m->get_atime(),
5043 m->inline_version, m->inline_data, issued);
5046 if (cap == in->auth_cap &&
5047 m->get_max_size() != in->max_size) {
5048 ldout(cct, 10) << "max_size " << in->max_size << " -> " << m->get_max_size() << dendl;
5049 in->max_size = m->get_max_size();
5050 if (in->max_size > in->wanted_max_size) {
5051 in->wanted_max_size = 0;
5052 in->requested_max_size = 0;
5057 if (m->get_op() == CEPH_CAP_OP_IMPORT && m->get_wanted() != wanted)
5060 check_cap_issue(in, cap, new_caps);
5063 if (old_caps & ~new_caps) {
5064 ldout(cct, 10) << " revocation of " << ccap_string(~new_caps & old_caps) << dendl;
5065 cap->issued = new_caps;
5066 cap->implemented |= new_caps;
5068 if (((used & ~new_caps) & CEPH_CAP_FILE_BUFFER)
5069 && !_flush(in, new C_Client_FlushComplete(this, in))) {
5070 // waitin' for flush
5071 } else if ((old_caps & ~new_caps) & CEPH_CAP_FILE_CACHE) {
5075 cap->wanted = 0; // don't let check_caps skip sending a response to MDS
5079 } else if (old_caps == new_caps) {
5080 ldout(cct, 10) << " caps unchanged at " << ccap_string(old_caps) << dendl;
5082 ldout(cct, 10) << " grant, new caps are " << ccap_string(new_caps & ~old_caps) << dendl;
5083 cap->issued = new_caps;
5084 cap->implemented |= new_caps;
5086 if (cap == in->auth_cap) {
5087 // is a non-auth MDS revoking the newly granted caps?
5088 for (map<mds_rank_t, Cap*>::iterator it = in->caps.begin(); it != in->caps.end(); ++it) {
5089 if (it->second == cap)
5091 if (it->second->implemented & ~it->second->issued & new_caps) {
5104 signal_cond_list(in->waitfor_caps);
5106 // may drop inode's last ref
5108 _try_to_trim_inode(in, true);
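// Build the supplementary group list for the given uid/gid: prefer the
// registered getgroups callback, otherwise fall back to getgrouplist(3),
// growing the allocated buffer until the whole list fits.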
5113 int Client::_getgrouplist(gid_t** sgids, uid_t uid, gid_t gid)
5115 // cppcheck-suppress variableScope
5120 sgid_count = getgroups_cb(callback_handle, &sgid_buf);
5121 if (sgid_count > 0) {
5127 #if HAVE_GETGROUPLIST
5131 ldout(cct, 3) << "getting user entry failed" << dendl;
5134 // use PAM to get the group list
5135 // initial number of group entries; defaults to the POSIX standard of 16
5136 // PAM implementations may provide more than 16 groups...
5138 sgid_buf = (gid_t*)malloc(sgid_count * sizeof(gid_t));
5139 if (sgid_buf == NULL) {
5140 ldout(cct, 3) << "allocating group memory failed" << dendl;
5145 #if defined(__APPLE__)
5146 if (getgrouplist(pw->pw_name, gid, (int*)sgid_buf, &sgid_count) == -1) {
5148 if (getgrouplist(pw->pw_name, gid, sgid_buf, &sgid_count) == -1) {
5150 // we need to resize the group list and try again
5151 void *_realloc = NULL;
5152 if ((_realloc = realloc(sgid_buf, sgid_count * sizeof(gid_t))) == NULL) {
5153 ldout(cct, 3) << "allocating group memory failed" << dendl;
5157 sgid_buf = (gid_t*)_realloc;
5160 // list was successfully retrieved
5170 int Client::inode_permission(Inode *in, const UserPerm& perms, unsigned want)
5172 if (perms.uid() == 0)
5175 if (perms.uid() != in->uid && (in->mode & S_IRWXG)) {
5176 int ret = _posix_acl_permission(in, perms, want);
5181 // check permissions before doing anything else
5182 if (!in->check_mode(perms, want))
5187 int Client::xattr_permission(Inode *in, const char *name, unsigned want,
5188 const UserPerm& perms)
5190 int r = _getattr_for_perm(in, perms);
5195 if (strncmp(name, "system.", 7) == 0) {
5196 if ((want & MAY_WRITE) && (perms.uid() != 0 && perms.uid() != in->uid))
5199 r = inode_permission(in, perms, want);
5202 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5206 ostream& operator<<(ostream &out, const UserPerm& perm) {
5207 out << "UserPerm(uid: " << perm.uid() << ", gid: " << perm.gid() << ")";
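// The may_*() helpers below perform client-side permission checks (the
// checks backing the client_permissions paths); each refreshes
// mode/ownership via _getattr_for_perm before applying POSIX-style rules.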
5211 int Client::may_setattr(Inode *in, struct ceph_statx *stx, int mask,
5212 const UserPerm& perms)
5214 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5215 int r = _getattr_for_perm(in, perms);
5219 if (mask & CEPH_SETATTR_SIZE) {
5220 r = inode_permission(in, perms, MAY_WRITE);
5226 if (mask & CEPH_SETATTR_UID) {
5227 if (perms.uid() != 0 && (perms.uid() != in->uid || stx->stx_uid != in->uid))
5230 if (mask & CEPH_SETATTR_GID) {
5231 if (perms.uid() != 0 && (perms.uid() != in->uid ||
5232 (!perms.gid_in_groups(stx->stx_gid) && stx->stx_gid != in->gid)))
5236 if (mask & CEPH_SETATTR_MODE) {
5237 if (perms.uid() != 0 && perms.uid() != in->uid)
5240 gid_t i_gid = (mask & CEPH_SETATTR_GID) ? stx->stx_gid : in->gid;
5241 if (perms.uid() != 0 && !perms.gid_in_groups(i_gid))
5242 stx->stx_mode &= ~S_ISGID;
5245 if (mask & (CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME |
5246 CEPH_SETATTR_MTIME | CEPH_SETATTR_ATIME)) {
5247 if (perms.uid() != 0 && perms.uid() != in->uid) {
5248 int check_mask = CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME;
5249 if (!(mask & CEPH_SETATTR_MTIME_NOW))
5250 check_mask |= CEPH_SETATTR_MTIME;
5251 if (!(mask & CEPH_SETATTR_ATIME_NOW))
5252 check_mask |= CEPH_SETATTR_ATIME;
5253 if (check_mask & mask) {
5256 r = inode_permission(in, perms, MAY_WRITE);
5264 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5268 int Client::may_open(Inode *in, int flags, const UserPerm& perms)
5270 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5273 if ((flags & O_ACCMODE) == O_WRONLY)
5275 else if ((flags & O_ACCMODE) == O_RDWR)
5276 want = MAY_READ | MAY_WRITE;
5277 else if ((flags & O_ACCMODE) == O_RDONLY)
5279 if (flags & O_TRUNC)
5283 switch (in->mode & S_IFMT) {
5288 if (want & MAY_WRITE) {
5295 r = _getattr_for_perm(in, perms);
5299 r = inode_permission(in, perms, want);
5301 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5305 int Client::may_lookup(Inode *dir, const UserPerm& perms)
5307 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5308 int r = _getattr_for_perm(dir, perms);
5312 r = inode_permission(dir, perms, MAY_EXEC);
5314 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5318 int Client::may_create(Inode *dir, const UserPerm& perms)
5320 ldout(cct, 20) << __func__ << " " << *dir << "; " << perms << dendl;
5321 int r = _getattr_for_perm(dir, perms);
5325 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5327 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5331 int Client::may_delete(Inode *dir, const char *name, const UserPerm& perms)
5333 ldout(cct, 20) << __func__ << " " << *dir << "; name " << name << "; " << perms << dendl;
5334 int r = _getattr_for_perm(dir, perms);
5338 r = inode_permission(dir, perms, MAY_EXEC | MAY_WRITE);
5342 /* 'name == NULL' means rmsnap */
5343 if (perms.uid() != 0 && name && (dir->mode & S_ISVTX)) {
5345 r = _lookup(dir, name, CEPH_CAP_AUTH_SHARED, &otherin, perms);
5348 if (dir->uid != perms.uid() && otherin->uid != perms.uid())
5352 ldout(cct, 3) << __func__ << " " << dir << " = " << r << dendl;
5356 int Client::may_hardlink(Inode *in, const UserPerm& perms)
5358 ldout(cct, 20) << __func__ << " " << *in << "; " << perms << dendl;
5359 int r = _getattr_for_perm(in, perms);
5363 if (perms.uid() == 0 || perms.uid() == in->uid) {
5369 if (!S_ISREG(in->mode))
5372 if (in->mode & S_ISUID)
5375 if ((in->mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
5378 r = inode_permission(in, perms, MAY_READ | MAY_WRITE);
5380 ldout(cct, 3) << __func__ << " " << in << " = " << r << dendl;
5384 int Client::_getattr_for_perm(Inode *in, const UserPerm& perms)
5386 int mask = CEPH_STAT_CAP_MODE;
5388 if (acl_type != NO_ACL) {
5389 mask |= CEPH_STAT_CAP_XATTR;
5390 force = in->xattr_version == 0;
5392 return _getattr(in, mask, perms, force);
5395 vinodeno_t Client::_get_vino(Inode *in)
5397 /* The caller must hold the client lock */
5398 return vinodeno_t(in->ino, in->snapid);
5401 inodeno_t Client::_get_inodeno(Inode *in)
5403 /* The caller must hold the client lock */
5409 * Resolve an MDS spec to a list of MDS daemon GIDs.
5411 * The spec is a string representing a GID, rank, filesystem:rank, or name/id.
5412 * It may be '*', in which case it matches all GIDs.
5414 * If no error is returned, the `targets` vector will be populated with at least
5417 int Client::resolve_mds(
5418 const std::string &mds_spec,
5419 std::vector<mds_gid_t> *targets)
5422 assert(targets != nullptr);
5425 std::stringstream ss;
5426 int role_r = fsmap->parse_role(mds_spec, &role, ss);
5428 // We got a role, resolve it to a GID
5429 ldout(cct, 10) << __func__ << ": resolved '" << mds_spec << "' to role '"
5430 << role << "'" << dendl;
5432 fsmap->get_filesystem(role.fscid)->mds_map.get_info(role.rank).global_id);
5436 std::string strtol_err;
5437 long long rank_or_gid = strict_strtoll(mds_spec.c_str(), 10, &strtol_err);
5438 if (strtol_err.empty()) {
5439 // It is a possible GID
5440 const mds_gid_t mds_gid = mds_gid_t(rank_or_gid);
5441 if (fsmap->gid_exists(mds_gid)) {
5442 ldout(cct, 10) << __func__ << ": validated GID " << mds_gid << dendl;
5443 targets->push_back(mds_gid);
5445 lderr(cct) << __func__ << ": GID " << mds_gid << " not in MDS map"
5449 } else if (mds_spec == "*") {
5450 // It is a wildcard: use all MDSs
5451 const auto mds_info = fsmap->get_mds_info();
5453 if (mds_info.empty()) {
5454 lderr(cct) << __func__ << ": * passed but no MDS daemons found" << dendl;
5458 for (const auto i : mds_info) {
5459 targets->push_back(i.first);
5462 // It did not parse as an integer, it is not a wildcard, it must be a name
5463 const mds_gid_t mds_gid = fsmap->find_mds_gid_by_name(mds_spec);
5465 lderr(cct) << "MDS ID '" << mds_spec << "' not found" << dendl;
5467 lderr(cct) << "FSMap: " << *fsmap << dendl;
5471 ldout(cct, 10) << __func__ << ": resolved ID '" << mds_spec
5472 << "' to GID " << mds_gid << dendl;
5473 targets->push_back(mds_gid);
5482 * Authenticate with mon and establish global ID
5484 int Client::authenticate()
5486 assert(client_lock.is_locked_by_me());
5488 if (monclient->is_authenticated()) {
5492 client_lock.Unlock();
5493 int r = monclient->authenticate(cct->_conf->client_mount_timeout);
5499 whoami = monclient->get_global_id();
5500 messenger->set_myname(entity_name_t::CLIENT(whoami.v));
5505 int Client::fetch_fsmap(bool user)
5508 // Retrieve FSMap to enable looking up daemon addresses. We need FSMap
5509 // rather than MDSMap because no one MDSMap contains all the daemons, and
5510 // a `tell` can address any daemon.
5511 version_t fsmap_latest;
5514 monclient->get_version("fsmap", &fsmap_latest, NULL, &cond);
5515 client_lock.Unlock();
5518 } while (r == -EAGAIN);
5521 lderr(cct) << "Failed to learn FSMap version: " << cpp_strerror(r) << dendl;
5525 ldout(cct, 10) << __func__ << " learned FSMap version " << fsmap_latest << dendl;
5528 if (!fsmap_user || fsmap_user->get_epoch() < fsmap_latest) {
5529 monclient->sub_want("fsmap.user", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5530 monclient->renew_subs();
5531 wait_on_list(waiting_for_fsmap);
5534 assert(fsmap_user->get_epoch() >= fsmap_latest);
5536 if (!fsmap || fsmap->get_epoch() < fsmap_latest) {
5537 monclient->sub_want("fsmap", fsmap_latest, CEPH_SUBSCRIBE_ONETIME);
5538 monclient->renew_subs();
5539 wait_on_list(waiting_for_fsmap);
5542 assert(fsmap->get_epoch() >= fsmap_latest);
5544 ldout(cct, 10) << __func__ << " finished waiting for FSMap version "
5545 << fsmap_latest << dendl;
5551 * @mds_spec one of ID, rank, GID, "*"
5554 int Client::mds_command(
5555 const std::string &mds_spec,
5556 const vector<string>& cmd,
5557 const bufferlist& inbl,
5562 Mutex::Locker lock(client_lock);
5573 r = fetch_fsmap(false);
5578 // Look up MDS target(s) of the command
5579 std::vector<mds_gid_t> targets;
5580 r = resolve_mds(mds_spec, &targets);
5585 // If daemons are laggy, we won't send them commands. If all
5586 // are laggy then we fail.
5587 std::vector<mds_gid_t> non_laggy;
5588 for (const auto gid : targets) {
5589 const auto info = fsmap->get_info_gid(gid);
5590 if (!info.laggy()) {
5591 non_laggy.push_back(gid);
5594 if (non_laggy.size() == 0) {
5595 *outs = "All targeted MDS daemons are laggy";
5599 if (metadata.empty()) {
5600 // We are called on an unmounted client, so metadata
5601 // won't be initialized yet.
5602 populate_metadata("");
5605 // Send commands to targets
5606 C_GatherBuilder gather(cct, onfinish);
5607 for (const auto target_gid : non_laggy) {
5608 const auto info = fsmap->get_info_gid(target_gid);
5610 // Open a connection to the target MDS
5611 entity_inst_t inst = info.get_inst();
5612 ConnectionRef conn = messenger->get_connection(inst);
5614 // Generate MDSCommandOp state
5615 auto &op = command_table.start_command();
5617 op.on_finish = gather.new_sub();
5622 op.mds_gid = target_gid;
5625 ldout(cct, 4) << __func__ << ": new command op to " << target_gid
5626 << " tid=" << op.tid << cmd << dendl;
5628 // Construct and send MCommand
5629 MCommand *m = op.get_message(monclient->get_fsid());
5630 conn->send_message(m);
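// Match an MCommandReply to its pending command table entry by tid, hand
// back the output data and complete the waiter.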
5637 void Client::handle_command_reply(MCommandReply *m)
5639 ceph_tid_t const tid = m->get_tid();
5641 ldout(cct, 10) << __func__ << ": tid=" << m->get_tid() << dendl;
5643 if (!command_table.exists(tid)) {
5644 ldout(cct, 1) << __func__ << ": unknown tid " << tid << ", dropping" << dendl;
5649 auto &op = command_table.get_command(tid);
5651 op.outbl->claim(m->get_data());
5658 op.on_finish->complete(m->r);
5661 command_table.erase(tid);
5666 // -------------------
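// Mount: authenticate with the monitors, subscribe to the (possibly
// namespace-specific) MDS map, wait until the MDS cluster is available, then
// getattr the mount root to instantiate it.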
5669 int Client::mount(const std::string &mount_root, const UserPerm& perms,
5672 Mutex::Locker lock(client_lock);
5675 ldout(cct, 5) << "already mounted" << dendl;
5679 int r = authenticate();
5681 lderr(cct) << "authentication failed: " << cpp_strerror(r) << dendl;
5685 std::string want = "mdsmap";
5686 const auto &mds_ns = cct->_conf->client_mds_namespace;
5687 if (!mds_ns.empty()) {
5688 r = fetch_fsmap(true);
5691 fs_cluster_id_t cid = fsmap_user->get_fs_cid(mds_ns);
5692 if (cid == FS_CLUSTER_ID_NONE)
5695 std::ostringstream oss;
5696 oss << want << "." << cid;
5699 ldout(cct, 10) << "Subscribing to map '" << want << "'" << dendl;
5701 monclient->sub_want(want, 0, 0);
5702 monclient->renew_subs();
5704 tick(); // start tick
5708 auto availability = mdsmap->is_cluster_available();
5709 if (availability == MDSMap::STUCK_UNAVAILABLE) {
5711 ldout(cct, 10) << "mds cluster unavailable: epoch=" << mdsmap->get_epoch() << dendl;
5712 return CEPH_FUSE_NO_MDS_UP;
5713 } else if (availability == MDSMap::AVAILABLE) {
5714 // Continue to mount
5716 } else if (availability == MDSMap::TRANSIENT_UNAVAILABLE) {
5717 // Else, wait. MDSMonitor will update the map to bring
5718 // us to a conclusion eventually.
5719 wait_on_list(waiting_for_mdsmap);
5721 // Unexpected value!
5727 populate_metadata(mount_root.empty() ? "/" : mount_root);
5729 filepath fp(CEPH_INO_ROOT);
5730 if (!mount_root.empty()) {
5731 fp = filepath(mount_root.c_str());
5734 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
5735 req->set_filepath(fp);
5736 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
5737 int res = make_request(req, perms);
5739 if (res == -EACCES && root) {
5740 ldout(cct, 1) << __func__ << " EACCES on parent of mount point; quotas may not work" << dendl;
5758 if (!cct->_conf->client_trace.empty()) {
5759 traceout.open(cct->_conf->client_trace.c_str());
5760 if (traceout.is_open()) {
5761 ldout(cct, 1) << "opened trace file '" << cct->_conf->client_trace << "'" << dendl;
5763 ldout(cct, 1) << "FAILED to open trace file '" << cct->_conf->client_trace << "'" << dendl;
5768 ldout(cct, 3) << "op: // client trace data structs" << dendl;
5769 ldout(cct, 3) << "op: struct stat st;" << dendl;
5770 ldout(cct, 3) << "op: struct utimbuf utim;" << dendl;
5771 ldout(cct, 3) << "op: int readlinkbuf_len = 1000;" << dendl;
5772 ldout(cct, 3) << "op: char readlinkbuf[readlinkbuf_len];" << dendl;
5773 ldout(cct, 3) << "op: map<string, inode_t*> dir_contents;" << dendl;
5774 ldout(cct, 3) << "op: map<int, int> open_files;" << dendl;
5775 ldout(cct, 3) << "op: int fd;" << dendl;
5782 void Client::_close_sessions()
5784 while (!mds_sessions.empty()) {
5785 // send session closes!
5786 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5787 p != mds_sessions.end();
5789 if (p->second->state != MetaSession::STATE_CLOSING) {
5790 _close_mds_session(p->second);
5794 // wait for sessions to close
5795 ldout(cct, 2) << "waiting for " << mds_sessions.size() << " mds sessions to close" << dendl;
5796 mount_cond.Wait(client_lock);
5800 void Client::flush_mdlog_sync()
5802 if (mds_requests.empty())
5804 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5805 p != mds_sessions.end();
5807 MetaSession *s = p->second;
5812 void Client::flush_mdlog(MetaSession *session)
5814 // Only send this to Luminous or newer MDS daemons, older daemons
5815 // will crash if they see an unknown CEPH_SESSION_* value in this msg.
5816 const uint64_t features = session->con->get_features();
5817 if (HAVE_FEATURE(features, SERVER_LUMINOUS)) {
5818 MClientSession *m = new MClientSession(CEPH_SESSION_REQUEST_FLUSH_MDLOG);
5819 session->con->send_message(m);
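// Unmount: drain outstanding MDS requests, tear down any files and dirs left
// open, flush (or, when blacklisted, purge) cached data and wait for the
// inode/cap cache to empty before declaring the client unmounted.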
5824 void Client::unmount()
5826 Mutex::Locker lock(client_lock);
5831 ldout(cct, 2) << "unmounting" << dendl;
5834 flush_mdlog_sync(); // flush the mdlog for pending requests, if any
5835 while (!mds_requests.empty()) {
5836 ldout(cct, 10) << "waiting on " << mds_requests.size() << " requests" << dendl;
5837 mount_cond.Wait(client_lock);
5841 timer.cancel_event(tick_event);
5846 // clean up any unclosed files
5847 while (!fd_map.empty()) {
5848 Fh *fh = fd_map.begin()->second;
5849 fd_map.erase(fd_map.begin());
5850 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *fh->inode << dendl;
5854 while (!ll_unclosed_fh_set.empty()) {
5855 set<Fh*>::iterator it = ll_unclosed_fh_set.begin();
5857 ll_unclosed_fh_set.erase(fh);
5858 ldout(cct, 0) << " destroyed lost open file " << fh << " on " << *(fh->inode) << dendl;
5862 while (!opened_dirs.empty()) {
5863 dir_result_t *dirp = *opened_dirs.begin();
5864 ldout(cct, 0) << " destroyed lost open dir " << dirp << " on " << *dirp->inode << dendl;
5871 ldout(cct, 0) << " skipping clean shutdown, we are blacklisted" << dendl;
5873 if (cct->_conf->client_oc) {
5874 // Purge all cached data so that ObjectCacher doesn't get hung up
5875 // trying to flush it. ObjectCacher's behaviour on EBLACKLISTED
5876 // is to just leave things marked dirty
5877 // (http://tracker.ceph.com/issues/9105)
5878 for (const auto &i : inode_map) {
5879 objectcacher->purge_set(&(i.second->oset));
5887 while (unsafe_sync_write > 0) {
5888 ldout(cct, 0) << unsafe_sync_write << " unsafe_sync_writes, waiting" << dendl;
5889 mount_cond.Wait(client_lock);
5892 if (cct->_conf->client_oc) {
5893 // flush/release all buffered data
5894 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
5895 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator p = inode_map.begin();
5896 p != inode_map.end();
5900 Inode *in = p->second;
5902 ldout(cct, 0) << "null inode_map entry ino " << p->first << dendl;
5905 if (!in->caps.empty()) {
5906 InodeRef tmp_ref(in);
5908 _flush(in, new C_Client_FlushComplete(this, in));
5914 wait_sync_caps(last_flush_tid);
5919 while (lru.lru_get_size() > 0 ||
5920 !inode_map.empty()) {
5921 ldout(cct, 2) << "cache still has " << lru.lru_get_size()
5922 << "+" << inode_map.size() << " items"
5923 << ", waiting (for caps to release?)"
5925 utime_t until = ceph_clock_now() + utime_t(5, 0);
5926 int r = mount_cond.WaitUntil(client_lock, until);
5927 if (r == ETIMEDOUT) {
5931 assert(lru.lru_get_size() == 0);
5932 assert(inode_map.empty());
5935 if (!cct->_conf->client_trace.empty()) {
5936 ldout(cct, 1) << "closing trace file '" << cct->_conf->client_trace << "'" << dendl;
5944 ldout(cct, 2) << "unmounted." << dendl;
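// Send any queued cap release messages to MDSes that are currently able to
// process them (unless release-failure injection is enabled).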
5947 void Client::flush_cap_releases()
5949 // send any cap releases
5950 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5951 p != mds_sessions.end();
5953 if (p->second->release && mdsmap->is_clientreplay_or_active_or_stopping(
5955 if (cct->_conf->client_inject_release_failure) {
5956 ldout(cct, 20) << __func__ << " injecting failure to send cap release message" << dendl;
5957 p->second->release->put();
5959 p->second->con->send_message(p->second->release);
5961 p->second->release = 0;
5968 if (cct->_conf->client_debug_inject_tick_delay > 0) {
5969 sleep(cct->_conf->client_debug_inject_tick_delay);
5970 assert(0 == cct->_conf->set_val("client_debug_inject_tick_delay", "0"));
5971 cct->_conf->apply_changes(NULL);
5974 ldout(cct, 21) << "tick" << dendl;
5975 tick_event = timer.add_event_after(
5976 cct->_conf->client_tick_interval,
5977 new FunctionContext([this](int) {
5978 // Called back via Timer, which takes client_lock for us
5979 assert(client_lock.is_locked_by_me());
5982 utime_t now = ceph_clock_now();
5984 if (!mounted && !mds_requests.empty()) {
5985 MetaRequest *req = mds_requests.begin()->second;
5986 if (req->op_stamp + cct->_conf->client_mount_timeout < now) {
5987 req->abort(-ETIMEDOUT);
5988 if (req->caller_cond) {
5990 req->caller_cond->Signal();
5992 signal_cond_list(waiting_for_mdsmap);
5993 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
5994 p != mds_sessions.end();
5996 signal_context_list(p->second->waiting_for_open);
6000 if (mdsmap->get_epoch()) {
6002 utime_t el = now - last_cap_renew;
6003 if (el > mdsmap->get_session_timeout() / 3.0)
6006 flush_cap_releases();
6010 xlist<Inode*>::iterator p = delayed_caps.begin();
6014 if (in->hold_caps_until > now)
6016 delayed_caps.pop_front();
6017 cap_list.push_back(&in->cap_item);
6018 check_caps(in, CHECK_CAPS_NODELAY);
6024 void Client::renew_caps()
6026 ldout(cct, 10) << "renew_caps()" << dendl;
6027 last_cap_renew = ceph_clock_now();
6029 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
6030 p != mds_sessions.end();
6032 ldout(cct, 15) << "renew_caps requesting from mds." << p->first << dendl;
6033 if (mdsmap->get_state(p->first) >= MDSMap::STATE_REJOIN)
6034 renew_caps(p->second);
6038 void Client::renew_caps(MetaSession *session)
6040 ldout(cct, 10) << "renew_caps mds." << session->mds_num << dendl;
6041 session->last_cap_renew_request = ceph_clock_now();
6042 uint64_t seq = ++session->cap_renew_seq;
6043 session->con->send_message(new MClientSession(CEPH_SESSION_REQUEST_RENEWCAPS, seq));
6047 // ===============================================================
6048 // high level (POSIXy) interface
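// Send a LOOKUP (or LOOKUPSNAP, inside the snapdir) request for a single
// path component and return the resulting inode in *target.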
6050 int Client::_do_lookup(Inode *dir, const string& name, int mask,
6051 InodeRef *target, const UserPerm& perms)
6053 int op = dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
6054 MetaRequest *req = new MetaRequest(op);
6056 dir->make_nosnap_relative_path(path);
6057 path.push_dentry(name);
6058 req->set_filepath(path);
6059 req->set_inode(dir);
6060 if (cct->_conf->client_debug_getattr_caps && op == CEPH_MDS_OP_LOOKUP)
6061 mask |= DEBUG_GETATTR_CAPS;
6062 req->head.args.getattr.mask = mask;
6064 ldout(cct, 10) << "_do_lookup on " << path << dendl;
6066 int r = make_request(req, perms, target);
6067 ldout(cct, 10) << "_do_lookup res is " << r << dendl;
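// Look up one name under 'dir', answering from the local dcache when the
// dentry lease or the directory's FILE_SHARED cap allows it (including
// concluding ENOENT locally for I_COMPLETE directories), otherwise falling
// back to _do_lookup.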
6071 int Client::_lookup(Inode *dir, const string& dname, int mask, InodeRef *target,
6072 const UserPerm& perms)
6077 if (!dir->is_dir()) {
6082 if (dname == "..") {
6083 if (dir->dn_set.empty())
6086 *target = dir->get_first_parent()->dir->parent_inode; //dirs can't be hard-linked
6095 if (dname.length() > NAME_MAX) {
6100 if (dname == cct->_conf->client_snapdir &&
6101 dir->snapid == CEPH_NOSNAP) {
6102 *target = open_snapdir(dir);
6107 dir->dir->dentries.count(dname)) {
6108 dn = dir->dir->dentries[dname];
6110 ldout(cct, 20) << "_lookup have dn " << dname << " mds." << dn->lease_mds << " ttl " << dn->lease_ttl
6111 << " seq " << dn->lease_seq
6114 if (!dn->inode || dn->inode->caps_issued_mask(mask)) {
6115 // is dn lease valid?
6116 utime_t now = ceph_clock_now();
6117 if (dn->lease_mds >= 0 &&
6118 dn->lease_ttl > now &&
6119 mds_sessions.count(dn->lease_mds)) {
6120 MetaSession *s = mds_sessions[dn->lease_mds];
6121 if (s->cap_ttl > now &&
6122 s->cap_gen == dn->lease_gen) {
6123 // touch this mds's dir cap too, even though we don't _explicitly_ use it here, to
6124 // make trim_caps() behave.
6125 dir->try_touch_cap(dn->lease_mds);
6128 ldout(cct, 20) << " bad lease, cap_ttl " << s->cap_ttl << ", cap_gen " << s->cap_gen
6129 << " vs lease_gen " << dn->lease_gen << dendl;
6132 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
6133 if (dn->cap_shared_gen == dir->shared_gen &&
6134 (!dn->inode || dn->inode->caps_issued_mask(mask)))
6136 if (!dn->inode && (dir->flags & I_COMPLETE)) {
6137 ldout(cct, 10) << "_lookup concluded ENOENT locally for "
6138 << *dir << " dn '" << dname << "'" << dendl;
6143 ldout(cct, 20) << " no cap on " << dn->inode->vino() << dendl;
6146 // can we conclude ENOENT locally?
6147 if (dir->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
6148 (dir->flags & I_COMPLETE)) {
6149 ldout(cct, 10) << "_lookup concluded ENOENT locally for " << *dir << " dn '" << dname << "'" << dendl;
6154 r = _do_lookup(dir, dname, mask, target, perms);
6159 *target = dn->inode;
6167 ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << r << dendl;
6169 ldout(cct, 10) << "_lookup " << *dir << " " << dname << " = " << **target << dendl;
6173 int Client::get_or_create(Inode *dir, const char* name,
6174 Dentry **pdn, bool expect_null)
6177 ldout(cct, 20) << "get_or_create " << *dir << " name " << name << dendl;
6179 if (dir->dir->dentries.count(name)) {
6180 Dentry *dn = dir->dir->dentries[name];
6182 // is dn lease valid?
6183 utime_t now = ceph_clock_now();
6185 dn->lease_mds >= 0 &&
6186 dn->lease_ttl > now &&
6187 mds_sessions.count(dn->lease_mds)) {
6188 MetaSession *s = mds_sessions[dn->lease_mds];
6189 if (s->cap_ttl > now &&
6190 s->cap_gen == dn->lease_gen) {
6197 // otherwise link up a new one
6198 *pdn = link(dir->dir, name, NULL, NULL);
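// Walk 'origpath' one component at a time, checking lookup permission on
// each directory when client_permissions is enabled and resolving symlinks
// (up to MAXSYMLINKS); the final inode is returned in *end.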
6205 int Client::path_walk(const filepath& origpath, InodeRef *end,
6206 const UserPerm& perms, bool followsym, int mask)
6208 filepath path = origpath;
6210 if (origpath.absolute())
6216 ldout(cct, 10) << "path_walk " << path << dendl;
6221 while (i < path.depth() && cur) {
6223 const string &dname = path[i];
6224 ldout(cct, 10) << " " << i << " " << *cur << " " << dname << dendl;
6225 ldout(cct, 20) << " (path is " << path << ")" << dendl;
6227 if (cct->_conf->client_permissions) {
6228 int r = may_lookup(cur.get(), perms);
6231 caps = CEPH_CAP_AUTH_SHARED;
6234 /* Get extra requested caps on the last component */
6235 if (i == (path.depth() - 1))
6237 int r = _lookup(cur.get(), dname, caps, &next, perms);
6240 // only follow trailing symlink if followsym. always follow
6241 // 'directory' symlinks.
6242 if (next && next->is_symlink()) {
6244 ldout(cct, 20) << " symlink count " << symlinks << ", value is '" << next->symlink << "'" << dendl;
6245 if (symlinks > MAXSYMLINKS) {
6249 if (i < path.depth() - 1) {
6251 // replace consumed components of path with symlink dir target
6252 filepath resolved(next->symlink.c_str());
6253 resolved.append(path.postfixpath(i + 1));
6256 if (next->symlink[0] == '/') {
6260 } else if (followsym) {
6261 if (next->symlink[0] == '/') {
6262 path = next->symlink.c_str();
6267 filepath more(next->symlink.c_str());
6268 // we need to remove the symlink component from the path
6269 // before adding the target that the symlink points to, and remain
6270 // at the same position in the path.
6290 int Client::link(const char *relexisting, const char *relpath, const UserPerm& perm)
6292 Mutex::Locker lock(client_lock);
6293 tout(cct) << "link" << std::endl;
6294 tout(cct) << relexisting << std::endl;
6295 tout(cct) << relpath << std::endl;
6300 filepath existing(relexisting);
6303 int r = path_walk(existing, &in, perm, true);
6306 if (std::string(relpath) == "/") {
6310 filepath path(relpath);
6311 string name = path.last_dentry();
6314 r = path_walk(path, &dir, perm, true);
6317 if (cct->_conf->client_permissions) {
6318 if (S_ISDIR(in->mode)) {
6322 r = may_hardlink(in.get(), perm);
6325 r = may_create(dir.get(), perm);
6329 r = _link(in.get(), dir.get(), name.c_str(), perm);
6333 int Client::unlink(const char *relpath, const UserPerm& perm)
6335 Mutex::Locker lock(client_lock);
6336 tout(cct) << "unlink" << std::endl;
6337 tout(cct) << relpath << std::endl;
6342 if (std::string(relpath) == "/")
6345 filepath path(relpath);
6346 string name = path.last_dentry();
6349 int r = path_walk(path, &dir, perm);
6352 if (cct->_conf->client_permissions) {
6353 r = may_delete(dir.get(), name.c_str(), perm);
6357 return _unlink(dir.get(), name.c_str(), perm);
6360 int Client::rename(const char *relfrom, const char *relto, const UserPerm& perm)
6362 Mutex::Locker lock(client_lock);
6363 tout(cct) << "rename" << std::endl;
6364 tout(cct) << relfrom << std::endl;
6365 tout(cct) << relto << std::endl;
6370 if (std::string(relfrom) == "/" || std::string(relto) == "/")
6373 filepath from(relfrom);
6375 string fromname = from.last_dentry();
6377 string toname = to.last_dentry();
6380 InodeRef fromdir, todir;
6381 int r = path_walk(from, &fromdir, perm);
6384 r = path_walk(to, &todir, perm);
6388 if (cct->_conf->client_permissions) {
6389 int r = may_delete(fromdir.get(), fromname.c_str(), perm);
6392 r = may_delete(todir.get(), toname.c_str(), perm);
6393 if (r < 0 && r != -ENOENT)
6396 r = _rename(fromdir.get(), fromname.c_str(), todir.get(), toname.c_str(), perm);
6403 int Client::mkdir(const char *relpath, mode_t mode, const UserPerm& perm)
6405 Mutex::Locker lock(client_lock);
6406 tout(cct) << "mkdir" << std::endl;
6407 tout(cct) << relpath << std::endl;
6408 tout(cct) << mode << std::endl;
6409 ldout(cct, 10) << "mkdir: " << relpath << dendl;
6414 if (std::string(relpath) == "/")
6417 filepath path(relpath);
6418 string name = path.last_dentry();
6421 int r = path_walk(path, &dir, perm);
6424 if (cct->_conf->client_permissions) {
6425 r = may_create(dir.get(), perm);
6429 return _mkdir(dir.get(), name.c_str(), mode, perm);
6432 int Client::mkdirs(const char *relpath, mode_t mode, const UserPerm& perms)
6434 Mutex::Locker lock(client_lock);
6435 ldout(cct, 10) << "Client::mkdirs " << relpath << dendl;
6436 tout(cct) << "mkdirs" << std::endl;
6437 tout(cct) << relpath << std::endl;
6438 tout(cct) << mode << std::endl;
6443 // get through the existing parts of the path
6444 filepath path(relpath);
6446 int r = 0, caps = 0;
6449 for (i=0; i<path.depth(); ++i) {
6450 if (cct->_conf->client_permissions) {
6451 r = may_lookup(cur.get(), perms);
6454 caps = CEPH_CAP_AUTH_SHARED;
6456 r = _lookup(cur.get(), path[i].c_str(), caps, &next, perms);
6461 // check that we have work left to do
6462 if (i==path.depth()) return -EEXIST;
6463 if (r!=-ENOENT) return r;
6464 ldout(cct, 20) << "mkdirs got through " << i << " directories on path " << relpath << dendl;
6465 // make a new directory at each level
6466 for (; i<path.depth(); ++i) {
6467 if (cct->_conf->client_permissions) {
6468 r = may_create(cur.get(), perms);
6473 r = _mkdir(cur.get(), path[i].c_str(), mode, perms, &next);
6475 // check proper creation/existence
6476 if(-EEXIST == r && i < path.depth() - 1) {
6477 r = _lookup(cur.get(), path[i].c_str(), CEPH_CAP_AUTH_SHARED, &next, perms);
6481 // move to the new dir and continue
6483 ldout(cct, 20) << "mkdirs: successfully created directory "
6484 << filepath(cur->ino).get_path() << dendl;
6489 int Client::rmdir(const char *relpath, const UserPerm& perms)
6491 Mutex::Locker lock(client_lock);
6492 tout(cct) << "rmdir" << std::endl;
6493 tout(cct) << relpath << std::endl;
6498 if (std::string(relpath) == "/")
6501 filepath path(relpath);
6502 string name = path.last_dentry();
6505 int r = path_walk(path, &dir, perms);
6508 if (cct->_conf->client_permissions) {
6509 int r = may_delete(dir.get(), name.c_str(), perms);
6513 return _rmdir(dir.get(), name.c_str(), perms);
6516 int Client::mknod(const char *relpath, mode_t mode, const UserPerm& perms, dev_t rdev)
6518 Mutex::Locker lock(client_lock);
6519 tout(cct) << "mknod" << std::endl;
6520 tout(cct) << relpath << std::endl;
6521 tout(cct) << mode << std::endl;
6522 tout(cct) << rdev << std::endl;
6527 if (std::string(relpath) == "/")
6530 filepath path(relpath);
6531 string name = path.last_dentry();
6534 int r = path_walk(path, &dir, perms);
6537 if (cct->_conf->client_permissions) {
6538 int r = may_create(dir.get(), perms);
6542 return _mknod(dir.get(), name.c_str(), mode, rdev, perms);
6547 int Client::symlink(const char *target, const char *relpath, const UserPerm& perms)
6549 Mutex::Locker lock(client_lock);
6550 tout(cct) << "symlink" << std::endl;
6551 tout(cct) << target << std::endl;
6552 tout(cct) << relpath << std::endl;
6557 if (std::string(relpath) == "/")
6560 filepath path(relpath);
6561 string name = path.last_dentry();
6564 int r = path_walk(path, &dir, perms);
6567 if (cct->_conf->client_permissions) {
6568 int r = may_create(dir.get(), perms);
6572 return _symlink(dir.get(), name.c_str(), target, perms);
6575 int Client::readlink(const char *relpath, char *buf, loff_t size, const UserPerm& perms)
6577 Mutex::Locker lock(client_lock);
6578 tout(cct) << "readlink" << std::endl;
6579 tout(cct) << relpath << std::endl;
6584 filepath path(relpath);
6586 int r = path_walk(path, &in, perms, false);
6590 return _readlink(in.get(), buf, size);
6593 int Client::_readlink(Inode *in, char *buf, size_t size)
6595 if (!in->is_symlink())
6598 // copy into buf (at most size bytes)
6599 int r = in->symlink.length();
6602 memcpy(buf, in->symlink.c_str(), r);
6609 int Client::_getattr(Inode *in, int mask, const UserPerm& perms, bool force)
6611 bool yes = in->caps_issued_mask(mask);
6613 ldout(cct, 10) << "_getattr mask " << ccap_string(mask) << " issued=" << yes << dendl;
6617 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_GETATTR);
6619 in->make_nosnap_relative_path(path);
6620 req->set_filepath(path);
6622 req->head.args.getattr.mask = mask;
6624 int res = make_request(req, perms);
6625 ldout(cct, 10) << "_getattr result=" << res << dendl;
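// Core setattr: apply whatever changes our EXCL caps let us make locally
// (marking the caps dirty), and send a SETATTR request to the MDS for
// whatever remains in 'mask'.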
6629 int Client::_do_setattr(Inode *in, struct ceph_statx *stx, int mask,
6630 const UserPerm& perms, InodeRef *inp)
6632 int issued = in->caps_issued();
6634 ldout(cct, 10) << "_setattr mask " << mask << " issued " <<
6635 ccap_string(issued) << dendl;
6637 if (in->snapid != CEPH_NOSNAP) {
6640 if ((mask & CEPH_SETATTR_SIZE) &&
6641 (unsigned long)stx->stx_size > in->size &&
6642 is_quota_bytes_exceeded(in, (unsigned long)stx->stx_size - in->size,
6647 // make the change locally?
6648 if ((in->cap_dirtier_uid >= 0 && perms.uid() != in->cap_dirtier_uid) ||
6649 (in->cap_dirtier_gid >= 0 && perms.gid() != in->cap_dirtier_gid)) {
6650 ldout(cct, 10) << __func__ << " caller " << perms.uid() << ":" << perms.gid()
6651 << " != cap dirtier " << in->cap_dirtier_uid << ":"
6652 << in->cap_dirtier_gid << ", forcing sync setattr"
6655 * This works because we implicitly flush the caps as part of the
6656 * request, so the cap update check will happen with the writeback
6657 * cap context, and then the setattr check will happen with the
6660 * In reality this pattern is likely pretty rare (different users
6661 * setattr'ing the same file). If that turns out not to be the
6662 * case later, we can build a more complex pipelined cap writeback
6666 mask |= CEPH_SETATTR_CTIME;
6671 // caller just needs us to bump the ctime
6672 in->ctime = ceph_clock_now();
6673 in->cap_dirtier_uid = perms.uid();
6674 in->cap_dirtier_gid = perms.gid();
6675 if (issued & CEPH_CAP_AUTH_EXCL)
6676 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6677 else if (issued & CEPH_CAP_FILE_EXCL)
6678 mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
6679 else if (issued & CEPH_CAP_XATTR_EXCL)
6680 mark_caps_dirty(in, CEPH_CAP_XATTR_EXCL);
6682 mask |= CEPH_SETATTR_CTIME;
6685 if (in->caps_issued_mask(CEPH_CAP_AUTH_EXCL)) {
6686 bool kill_sguid = mask & (CEPH_SETATTR_SIZE|CEPH_SETATTR_KILL_SGUID);
6688 mask &= ~CEPH_SETATTR_KILL_SGUID;
6690 if (mask & CEPH_SETATTR_UID) {
6691 in->ctime = ceph_clock_now();
6692 in->cap_dirtier_uid = perms.uid();
6693 in->cap_dirtier_gid = perms.gid();
6694 in->uid = stx->stx_uid;
6695 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6696 mask &= ~CEPH_SETATTR_UID;
6698 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6700 if (mask & CEPH_SETATTR_GID) {
6701 in->ctime = ceph_clock_now();
6702 in->cap_dirtier_uid = perms.uid();
6703 in->cap_dirtier_gid = perms.gid();
6704 in->gid = stx->stx_gid;
6705 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6706 mask &= ~CEPH_SETATTR_GID;
6708 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6711 if (mask & CEPH_SETATTR_MODE) {
6712 in->ctime = ceph_clock_now();
6713 in->cap_dirtier_uid = perms.uid();
6714 in->cap_dirtier_gid = perms.gid();
6715 in->mode = (in->mode & ~07777) | (stx->stx_mode & 07777);
6716 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6717 mask &= ~CEPH_SETATTR_MODE;
6718 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6719 } else if (kill_sguid && S_ISREG(in->mode) && (in->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
6720 /* Must squash any setuid/setgid bits with an ownership change */
6721 in->mode &= ~(S_ISUID|S_ISGID);
6722 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6725 if (mask & CEPH_SETATTR_BTIME) {
6726 in->ctime = ceph_clock_now();
6727 in->cap_dirtier_uid = perms.uid();
6728 in->cap_dirtier_gid = perms.gid();
6729 in->btime = utime_t(stx->stx_btime);
6730 mark_caps_dirty(in, CEPH_CAP_AUTH_EXCL);
6731 mask &= ~CEPH_SETATTR_BTIME;
6732 ldout(cct,10) << "changing btime to " << in->btime << dendl;
6734 } else if (mask & CEPH_SETATTR_SIZE) {
6735 /* If we don't have Ax, then we must ask the server to clear them on truncate */
6736 mask |= CEPH_SETATTR_KILL_SGUID;
6739 if (in->caps_issued_mask(CEPH_CAP_FILE_EXCL)) {
6740 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME)) {
6741 if (mask & CEPH_SETATTR_MTIME)
6742 in->mtime = utime_t(stx->stx_mtime);
6743 if (mask & CEPH_SETATTR_ATIME)
6744 in->atime = utime_t(stx->stx_atime);
6745 in->ctime = ceph_clock_now();
6746 in->cap_dirtier_uid = perms.uid();
6747 in->cap_dirtier_gid = perms.gid();
6748 in->time_warp_seq++;
6749 mark_caps_dirty(in, CEPH_CAP_FILE_EXCL);
6750 mask &= ~(CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME);
6759 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETATTR);
6763 in->make_nosnap_relative_path(path);
6764 req->set_filepath(path);
6767 if (mask & CEPH_SETATTR_KILL_SGUID) {
6768 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6770 if (mask & CEPH_SETATTR_MODE) {
6771 req->head.args.setattr.mode = stx->stx_mode;
6772 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6773 ldout(cct,10) << "changing mode to " << stx->stx_mode << dendl;
6775 if (mask & CEPH_SETATTR_UID) {
6776 req->head.args.setattr.uid = stx->stx_uid;
6777 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6778 ldout(cct,10) << "changing uid to " << stx->stx_uid << dendl;
6780 if (mask & CEPH_SETATTR_GID) {
6781 req->head.args.setattr.gid = stx->stx_gid;
6782 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6783 ldout(cct,10) << "changing gid to " << stx->stx_gid << dendl;
6785 if (mask & CEPH_SETATTR_BTIME) {
6786 req->head.args.setattr.btime = utime_t(stx->stx_btime);
6787 req->inode_drop |= CEPH_CAP_AUTH_SHARED;
6789 if (mask & CEPH_SETATTR_MTIME) {
6790 req->head.args.setattr.mtime = utime_t(stx->stx_mtime);
6791 req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
6794 if (mask & CEPH_SETATTR_ATIME) {
6795 req->head.args.setattr.atime = utime_t(stx->stx_atime);
6796 req->inode_drop |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
6799 if (mask & CEPH_SETATTR_SIZE) {
6800 if ((unsigned long)stx->stx_size < mdsmap->get_max_filesize()) {
6801 req->head.args.setattr.size = stx->stx_size;
6802 ldout(cct,10) << "changing size to " << stx->stx_size << dendl;
6805 ldout(cct,10) << "unable to set size to " << stx->stx_size << ". Too large!" << dendl;
6808 req->inode_drop |= CEPH_CAP_AUTH_SHARED | CEPH_CAP_FILE_RD |
6811 req->head.args.setattr.mask = mask;
6813 req->regetattr_mask = mask;
6815 int res = make_request(req, perms, inp);
6816 ldout(cct, 10) << "_setattr result=" << res << dendl;
6820 /* Note that we only care about attrs that setattr cares about */
6821 void Client::stat_to_statx(struct stat *st, struct ceph_statx *stx)
6823 stx->stx_size = st->st_size;
6824 stx->stx_mode = st->st_mode;
6825 stx->stx_uid = st->st_uid;
6826 stx->stx_gid = st->st_gid;
6827 stx->stx_mtime = st->st_mtim;
6828 stx->stx_atime = st->st_atim;
6831 int Client::__setattrx(Inode *in, struct ceph_statx *stx, int mask,
6832 const UserPerm& perms, InodeRef *inp)
6834 int ret = _do_setattr(in, stx, mask, perms, inp);
6837 if (mask & CEPH_SETATTR_MODE)
6838 ret = _posix_acl_chmod(in, stx->stx_mode, perms);
6842 int Client::_setattrx(InodeRef &in, struct ceph_statx *stx, int mask,
6843 const UserPerm& perms)
6845 mask &= (CEPH_SETATTR_MODE | CEPH_SETATTR_UID |
6846 CEPH_SETATTR_GID | CEPH_SETATTR_MTIME |
6847 CEPH_SETATTR_ATIME | CEPH_SETATTR_SIZE |
6848 CEPH_SETATTR_CTIME | CEPH_SETATTR_BTIME);
6849 if (cct->_conf->client_permissions) {
6850 int r = may_setattr(in.get(), stx, mask, perms);
6854 return __setattrx(in.get(), stx, mask, perms);
6857 int Client::_setattr(InodeRef &in, struct stat *attr, int mask,
6858 const UserPerm& perms)
6860 struct ceph_statx stx;
6862 stat_to_statx(attr, &stx);
6863 mask &= ~CEPH_SETATTR_BTIME;
6865 if ((mask & CEPH_SETATTR_UID) && attr->st_uid == static_cast<uid_t>(-1)) {
6866 mask &= ~CEPH_SETATTR_UID;
6868 if ((mask & CEPH_SETATTR_GID) && attr->st_gid == static_cast<uid_t>(-1)) {
6869 mask &= ~CEPH_SETATTR_GID;
6872 return _setattrx(in, &stx, mask, perms);
6875 int Client::setattr(const char *relpath, struct stat *attr, int mask,
6876 const UserPerm& perms)
6878 Mutex::Locker lock(client_lock);
6879 tout(cct) << "setattr" << std::endl;
6880 tout(cct) << relpath << std::endl;
6881 tout(cct) << mask << std::endl;
6886 filepath path(relpath);
6888 int r = path_walk(path, &in, perms);
6891 return _setattr(in, attr, mask, perms);
6894 int Client::setattrx(const char *relpath, struct ceph_statx *stx, int mask,
6895 const UserPerm& perms, int flags)
6897 Mutex::Locker lock(client_lock);
6898 tout(cct) << "setattrx" << std::endl;
6899 tout(cct) << relpath << std::endl;
6900 tout(cct) << mask << std::endl;
6905 filepath path(relpath);
6907 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW));
6910 return _setattrx(in, stx, mask, perms);
6913 int Client::fsetattr(int fd, struct stat *attr, int mask, const UserPerm& perms)
6915 Mutex::Locker lock(client_lock);
6916 tout(cct) << "fsetattr" << std::endl;
6917 tout(cct) << fd << std::endl;
6918 tout(cct) << mask << std::endl;
6923 Fh *f = get_filehandle(fd);
6926 #if defined(__linux__) && defined(O_PATH)
6927 if (f->flags & O_PATH)
6930 return _setattr(f->inode, attr, mask, perms);
6933 int Client::fsetattrx(int fd, struct ceph_statx *stx, int mask, const UserPerm& perms)
6935 Mutex::Locker lock(client_lock);
6936 tout(cct) << "fsetattr" << std::endl;
6937 tout(cct) << fd << std::endl;
6938 tout(cct) << mask << std::endl;
6943 Fh *f = get_filehandle(fd);
6946 #if defined(__linux__) && defined(O_PATH)
6947 if (f->flags & O_PATH)
6950 return _setattrx(f->inode, stx, mask, perms);
6953 int Client::stat(const char *relpath, struct stat *stbuf, const UserPerm& perms,
6954 frag_info_t *dirstat, int mask)
6956 ldout(cct, 3) << "stat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
6957 Mutex::Locker lock(client_lock);
6958 tout(cct) << "stat" << std::endl;
6959 tout(cct) << relpath << std::endl;
6964 filepath path(relpath);
6966 int r = path_walk(path, &in, perms, true, mask);
6969 r = _getattr(in, mask, perms);
6971 ldout(cct, 3) << "stat exit on error!" << dendl;
6974 fill_stat(in, stbuf, dirstat);
6975 ldout(cct, 3) << "stat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
6979 unsigned Client::statx_to_mask(unsigned int flags, unsigned int want)
6983 /* if NO_ATTR_SYNC is set, then we don't need any -- just use what's in cache */
6984 if (flags & AT_NO_ATTR_SYNC)
6987 /* Always set PIN to distinguish from AT_NO_ATTR_SYNC case */
6988 mask |= CEPH_CAP_PIN;
6989 if (want & (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
6990 mask |= CEPH_CAP_AUTH_SHARED;
6991 if (want & (CEPH_STATX_NLINK|CEPH_STATX_CTIME|CEPH_STATX_VERSION))
6992 mask |= CEPH_CAP_LINK_SHARED;
6993 if (want & (CEPH_STATX_ATIME|CEPH_STATX_MTIME|CEPH_STATX_CTIME|CEPH_STATX_SIZE|CEPH_STATX_BLOCKS|CEPH_STATX_VERSION))
6994 mask |= CEPH_CAP_FILE_SHARED;
6995 if (want & (CEPH_STATX_VERSION|CEPH_STATX_CTIME))
6996 mask |= CEPH_CAP_XATTR_SHARED;
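// For example, a caller asking only for CEPH_STATX_MODE|CEPH_STATX_UID
// ends up requesting CEPH_CAP_PIN|CEPH_CAP_AUTH_SHARED from the MDS.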
7001 int Client::statx(const char *relpath, struct ceph_statx *stx,
7002 const UserPerm& perms,
7003 unsigned int want, unsigned int flags)
7005 ldout(cct, 3) << "statx enter (relpath " << relpath << " want " << want << ")" << dendl;
7006 Mutex::Locker lock(client_lock);
7007 tout(cct) << "statx" << std::endl;
7008 tout(cct) << relpath << std::endl;
7013 filepath path(relpath);
7016 unsigned mask = statx_to_mask(flags, want);
7018 int r = path_walk(path, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
7022 r = _getattr(in, mask, perms);
7024 ldout(cct, 3) << "statx exit on error!" << dendl;
7028 fill_statx(in, mask, stx);
7029 ldout(cct, 3) << "statx exit (relpath " << relpath << " mask " << stx->stx_mask << ")" << dendl;
7033 int Client::lstat(const char *relpath, struct stat *stbuf,
7034 const UserPerm& perms, frag_info_t *dirstat, int mask)
7036 ldout(cct, 3) << "lstat enter (relpath " << relpath << " mask " << mask << ")" << dendl;
7037 Mutex::Locker lock(client_lock);
7038 tout(cct) << "lstat" << std::endl;
7039 tout(cct) << relpath << std::endl;
7044 filepath path(relpath);
7046 // don't follow symlinks
7047 int r = path_walk(path, &in, perms, false, mask);
7050 r = _getattr(in, mask, perms);
7052 ldout(cct, 3) << "lstat exit on error!" << dendl;
7055 fill_stat(in, stbuf, dirstat);
7056 ldout(cct, 3) << "lstat exit (relpath " << relpath << " mask " << mask << ")" << dendl;
7060 int Client::fill_stat(Inode *in, struct stat *st, frag_info_t *dirstat, nest_info_t *rstat)
7062 ldout(cct, 10) << "fill_stat on " << in->ino << " snap/dev" << in->snapid
7063 << " mode 0" << oct << in->mode << dec
7064 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7065 memset(st, 0, sizeof(struct stat));
7066 if (use_faked_inos())
7067 st->st_ino = in->faked_ino;
7069 st->st_ino = in->ino;
7070 st->st_dev = in->snapid;
7071 st->st_mode = in->mode;
7072 st->st_rdev = in->rdev;
7073 st->st_nlink = in->nlink;
7074 st->st_uid = in->uid;
7075 st->st_gid = in->gid;
7076 if (in->ctime > in->mtime) {
7077 stat_set_ctime_sec(st, in->ctime.sec());
7078 stat_set_ctime_nsec(st, in->ctime.nsec());
7080 stat_set_ctime_sec(st, in->mtime.sec());
7081 stat_set_ctime_nsec(st, in->mtime.nsec());
7083 stat_set_atime_sec(st, in->atime.sec());
7084 stat_set_atime_nsec(st, in->atime.nsec());
7085 stat_set_mtime_sec(st, in->mtime.sec());
7086 stat_set_mtime_nsec(st, in->mtime.nsec());
7088 if (cct->_conf->client_dirsize_rbytes)
7089 st->st_size = in->rstat.rbytes;
7091 st->st_size = in->dirstat.size();
7094 st->st_size = in->size;
7095 st->st_blocks = (in->size + 511) >> 9;
7097 st->st_blksize = MAX(in->layout.stripe_unit, 4096);
7100 *dirstat = in->dirstat;
7104 return in->caps_issued();
7107 void Client::fill_statx(Inode *in, unsigned int mask, struct ceph_statx *stx)
7109 ldout(cct, 10) << "fill_statx on " << in->ino << " snap/dev" << in->snapid
7110 << " mode 0" << oct << in->mode << dec
7111 << " mtime " << in->mtime << " ctime " << in->ctime << dendl;
7112 memset(stx, 0, sizeof(struct ceph_statx));
7115 * If mask is 0, then the caller set AT_NO_ATTR_SYNC. Reset the mask
7116 * so that all bits are set.
7121 /* These are always considered to be available */
7122 stx->stx_dev = in->snapid;
7123 stx->stx_blksize = MAX(in->layout.stripe_unit, 4096);
7125 /* Type bits are always set, even when CEPH_STATX_MODE is not */
7126 stx->stx_mode = S_IFMT & in->mode;
7127 stx->stx_ino = use_faked_inos() ? in->faked_ino : (ino_t)in->ino;
7128 stx->stx_rdev = in->rdev;
7129 stx->stx_mask |= (CEPH_STATX_INO|CEPH_STATX_RDEV);
7131 if (mask & CEPH_CAP_AUTH_SHARED) {
7132 stx->stx_uid = in->uid;
7133 stx->stx_gid = in->gid;
7134 stx->stx_mode = in->mode;
7135 in->btime.to_timespec(&stx->stx_btime);
7136 stx->stx_mask |= (CEPH_STATX_MODE|CEPH_STATX_UID|CEPH_STATX_GID|CEPH_STATX_BTIME);
7139 if (mask & CEPH_CAP_LINK_SHARED) {
7140 stx->stx_nlink = in->nlink;
7141 stx->stx_mask |= CEPH_STATX_NLINK;
7144 if (mask & CEPH_CAP_FILE_SHARED) {
7146 in->atime.to_timespec(&stx->stx_atime);
7147 in->mtime.to_timespec(&stx->stx_mtime);
7150 if (cct->_conf->client_dirsize_rbytes)
7151 stx->stx_size = in->rstat.rbytes;
7153 stx->stx_size = in->dirstat.size();
7154 stx->stx_blocks = 1;
7156 stx->stx_size = in->size;
7157 stx->stx_blocks = (in->size + 511) >> 9;
7159 stx->stx_mask |= (CEPH_STATX_ATIME|CEPH_STATX_MTIME|
7160 CEPH_STATX_SIZE|CEPH_STATX_BLOCKS);
7163 /* Change time and change_attr both require all shared caps to view */
7164 if ((mask & CEPH_STAT_CAP_INODE_ALL) == CEPH_STAT_CAP_INODE_ALL) {
7165 stx->stx_version = in->change_attr;
7166 if (in->ctime > in->mtime)
7167 in->ctime.to_timespec(&stx->stx_ctime);
7169 in->mtime.to_timespec(&stx->stx_ctime);
7170 stx->stx_mask |= (CEPH_STATX_CTIME|CEPH_STATX_VERSION);
7175 void Client::touch_dn(Dentry *dn)
7180 int Client::chmod(const char *relpath, mode_t mode, const UserPerm& perms)
7182 Mutex::Locker lock(client_lock);
7183 tout(cct) << "chmod" << std::endl;
7184 tout(cct) << relpath << std::endl;
7185 tout(cct) << mode << std::endl;
7190 filepath path(relpath);
7192 int r = path_walk(path, &in, perms);
7196 attr.st_mode = mode;
7197 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7200 int Client::fchmod(int fd, mode_t mode, const UserPerm& perms)
7202 Mutex::Locker lock(client_lock);
7203 tout(cct) << "fchmod" << std::endl;
7204 tout(cct) << fd << std::endl;
7205 tout(cct) << mode << std::endl;
7210 Fh *f = get_filehandle(fd);
7213 #if defined(__linux__) && defined(O_PATH)
7214 if (f->flags & O_PATH)
7218 attr.st_mode = mode;
7219 return _setattr(f->inode, &attr, CEPH_SETATTR_MODE, perms);
7222 int Client::lchmod(const char *relpath, mode_t mode, const UserPerm& perms)
7224 Mutex::Locker lock(client_lock);
7225 tout(cct) << "lchmod" << std::endl;
7226 tout(cct) << relpath << std::endl;
7227 tout(cct) << mode << std::endl;
7232 filepath path(relpath);
7234 // don't follow symlinks
7235 int r = path_walk(path, &in, perms, false);
7239 attr.st_mode = mode;
7240 return _setattr(in, &attr, CEPH_SETATTR_MODE, perms);
7243 int Client::chown(const char *relpath, uid_t new_uid, gid_t new_gid,
7244 const UserPerm& perms)
7246 Mutex::Locker lock(client_lock);
7247 tout(cct) << "chown" << std::endl;
7248 tout(cct) << relpath << std::endl;
7249 tout(cct) << new_uid << std::endl;
7250 tout(cct) << new_gid << std::endl;
7255 filepath path(relpath);
7257 int r = path_walk(path, &in, perms);
7261 attr.st_uid = new_uid;
7262 attr.st_gid = new_gid;
7263 return _setattr(in, &attr, CEPH_SETATTR_UID|CEPH_SETATTR_GID, perms);
7266 int Client::fchown(int fd, uid_t new_uid, gid_t new_gid, const UserPerm& perms)
7268 Mutex::Locker lock(client_lock);
7269 tout(cct) << "fchown" << std::endl;
7270 tout(cct) << fd << std::endl;
7271 tout(cct) << new_uid << std::endl;
7272 tout(cct) << new_gid << std::endl;
7277 Fh *f = get_filehandle(fd);
7280 #if defined(__linux__) && defined(O_PATH)
7281 if (f->flags & O_PATH)
7285 attr.st_uid = new_uid;
7286 attr.st_gid = new_gid;
7288 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7289 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7290 return _setattr(f->inode, &attr, mask, perms);
7293 int Client::lchown(const char *relpath, uid_t new_uid, gid_t new_gid,
7294 const UserPerm& perms)
7296 Mutex::Locker lock(client_lock);
7297 tout(cct) << "lchown" << std::endl;
7298 tout(cct) << relpath << std::endl;
7299 tout(cct) << new_uid << std::endl;
7300 tout(cct) << new_gid << std::endl;
7305 filepath path(relpath);
7307 // don't follow symlinks
7308 int r = path_walk(path, &in, perms, false);
7312 attr.st_uid = new_uid;
7313 attr.st_gid = new_gid;
7315 if (new_uid != static_cast<uid_t>(-1)) mask |= CEPH_SETATTR_UID;
7316 if (new_gid != static_cast<gid_t>(-1)) mask |= CEPH_SETATTR_GID;
7317 return _setattr(in, &attr, mask, perms);
7320 int Client::utime(const char *relpath, struct utimbuf *buf,
7321 const UserPerm& perms)
7323 Mutex::Locker lock(client_lock);
7324 tout(cct) << "utime" << std::endl;
7325 tout(cct) << relpath << std::endl;
7326 tout(cct) << buf->modtime << std::endl;
7327 tout(cct) << buf->actime << std::endl;
7332 filepath path(relpath);
7334 int r = path_walk(path, &in, perms);
7338 stat_set_mtime_sec(&attr, buf->modtime);
7339 stat_set_mtime_nsec(&attr, 0);
7340 stat_set_atime_sec(&attr, buf->actime);
7341 stat_set_atime_nsec(&attr, 0);
7342 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7345 int Client::lutime(const char *relpath, struct utimbuf *buf,
7346 const UserPerm& perms)
7348 Mutex::Locker lock(client_lock);
7349 tout(cct) << "lutime" << std::endl;
7350 tout(cct) << relpath << std::endl;
7351 tout(cct) << buf->modtime << std::endl;
7352 tout(cct) << buf->actime << std::endl;
7357 filepath path(relpath);
7359 // don't follow symlinks
7360 int r = path_walk(path, &in, perms, false);
7364 stat_set_mtime_sec(&attr, buf->modtime);
7365 stat_set_mtime_nsec(&attr, 0);
7366 stat_set_atime_sec(&attr, buf->actime);
7367 stat_set_atime_nsec(&attr, 0);
7368 return _setattr(in, &attr, CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME, perms);
7371 int Client::flock(int fd, int operation, uint64_t owner)
7373 Mutex::Locker lock(client_lock);
7374 tout(cct) << "flock" << std::endl;
7375 tout(cct) << fd << std::endl;
7376 tout(cct) << operation << std::endl;
7377 tout(cct) << owner << std::endl;
7382 Fh *f = get_filehandle(fd);
7386 return _flock(f, operation, owner);
7389 int Client::opendir(const char *relpath, dir_result_t **dirpp, const UserPerm& perms)
7391 Mutex::Locker lock(client_lock);
7392 tout(cct) << "opendir" << std::endl;
7393 tout(cct) << relpath << std::endl;
7398 filepath path(relpath);
7400 int r = path_walk(path, &in, perms, true);
7403 if (cct->_conf->client_permissions) {
7404 int r = may_open(in.get(), O_RDONLY, perms);
7408 r = _opendir(in.get(), dirpp, perms);
7409 /* if ENOTDIR, dirpp will be an uninitialized pointer and it's very dangerous to access its value */
7411 tout(cct) << (unsigned long)*dirpp << std::endl;
7415 int Client::_opendir(Inode *in, dir_result_t **dirpp, const UserPerm& perms)
7419 *dirpp = new dir_result_t(in, perms);
7420 opened_dirs.insert(*dirpp);
7421 ldout(cct, 3) << "_opendir(" << in->ino << ") = " << 0 << " (" << *dirpp << ")" << dendl;
7426 int Client::closedir(dir_result_t *dir)
7428 Mutex::Locker lock(client_lock);
7429 tout(cct) << "closedir" << std::endl;
7430 tout(cct) << (unsigned long)dir << std::endl;
7432 ldout(cct, 3) << "closedir(" << dir << ") = 0" << dendl;
7437 void Client::_closedir(dir_result_t *dirp)
7439 ldout(cct, 10) << "_closedir(" << dirp << ")" << dendl;
7441 ldout(cct, 10) << "_closedir detaching inode " << dirp->inode << dendl;
7442 dirp->inode.reset();
7444 _readdir_drop_dirp_buffer(dirp);
7445 opened_dirs.erase(dirp);
7449 void Client::rewinddir(dir_result_t *dirp)
7451 Mutex::Locker lock(client_lock);
7452 ldout(cct, 3) << "rewinddir(" << dirp << ")" << dendl;
7457 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7458 _readdir_drop_dirp_buffer(d);
7462 loff_t Client::telldir(dir_result_t *dirp)
7464 dir_result_t *d = static_cast<dir_result_t*>(dirp);
7465 ldout(cct, 3) << "telldir(" << dirp << ") = " << d->offset << dendl;
7469 void Client::seekdir(dir_result_t *dirp, loff_t offset)
7471 Mutex::Locker lock(client_lock);
7473 ldout(cct, 3) << "seekdir(" << dirp << ", " << offset << ")" << dendl;
7478 if (offset == dirp->offset)
7481 if (offset > dirp->offset)
7482 dirp->release_count = 0; // bump if we do a forward seek
7484 dirp->ordered_count = 0; // disable filling readdir cache
7486 if (dirp->hash_order()) {
7487 if (dirp->offset > offset) {
7488 _readdir_drop_dirp_buffer(dirp);
7493 dirp->buffer_frag != frag_t(dir_result_t::fpos_high(offset)) ||
7494 dirp->offset_low() > dir_result_t::fpos_low(offset)) {
7495 _readdir_drop_dirp_buffer(dirp);
7500 dirp->offset = offset;
7505 // ino_t d_ino; /* inode number */
7506 // off_t d_off; /* offset to the next dirent */
7507 // unsigned short d_reclen; /* length of this record */
7508 // unsigned char d_type; /* type of file */
7509 // char d_name[256]; /* filename */
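// fill_dirent() below copies the name, inode number and type into this
// layout; d_off is only populated on platforms that define it (see the
// #if guard inside the function).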
7511 void Client::fill_dirent(struct dirent *de, const char *name, int type, uint64_t ino, loff_t next_off)
7513 strncpy(de->d_name, name, 255);
7514 de->d_name[255] = '\0';
7517 #if !defined(DARWIN) && !defined(__FreeBSD__)
7518 de->d_off = next_off;
7521 de->d_type = IFTODT(type);
7522 ldout(cct, 10) << "fill_dirent '" << de->d_name << "' -> " << inodeno_t(de->d_ino)
7523 << " type " << (int)de->d_type << " w/ next_off " << hex << next_off << dec << dendl;
7527 void Client::_readdir_next_frag(dir_result_t *dirp)
7529 frag_t fg = dirp->buffer_frag;
7531 if (fg.is_rightmost()) {
7532 ldout(cct, 10) << "_readdir_next_frag advance from " << fg << " to END" << dendl;
7539 ldout(cct, 10) << "_readdir_next_frag advance from " << dirp->buffer_frag << " to " << fg << dendl;
7541 if (dirp->hash_order()) {
7543 int64_t new_offset = dir_result_t::make_fpos(fg.value(), 2, true);
7544 if (dirp->offset < new_offset) // don't decrease offset
7545 dirp->offset = new_offset;
7547 dirp->last_name.clear();
7548 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
7549 _readdir_rechoose_frag(dirp);
7553 void Client::_readdir_rechoose_frag(dir_result_t *dirp)
7555 assert(dirp->inode);
7557 if (dirp->hash_order())
7560 frag_t cur = frag_t(dirp->offset_high());
7561 frag_t fg = dirp->inode->dirfragtree[cur.value()];
7563 ldout(cct, 10) << "_readdir_rechoose_frag frag " << cur << " maps to " << fg << dendl;
7564 dirp->offset = dir_result_t::make_fpos(fg, 2, false);
7565 dirp->last_name.clear();
7566 dirp->next_offset = 2;
7570 void Client::_readdir_drop_dirp_buffer(dir_result_t *dirp)
7572 ldout(cct, 10) << "_readdir_drop_dirp_buffer " << dirp << dendl;
7573 dirp->buffer.clear();
7576 int Client::_readdir_get_frag(dir_result_t *dirp)
7579 assert(dirp->inode);
7581 // get the current frag.
7583 if (dirp->hash_order())
7584 fg = dirp->inode->dirfragtree[dirp->offset_high()];
7586 fg = frag_t(dirp->offset_high());
7588 ldout(cct, 10) << "_readdir_get_frag " << dirp << " on " << dirp->inode->ino << " fg " << fg
7589 << " offset " << hex << dirp->offset << dec << dendl;
7591 int op = CEPH_MDS_OP_READDIR;
7592 if (dirp->inode && dirp->inode->snapid == CEPH_SNAPDIR)
7593 op = CEPH_MDS_OP_LSSNAP;
7595 InodeRef& diri = dirp->inode;
7597 MetaRequest *req = new MetaRequest(op);
7599 diri->make_nosnap_relative_path(path);
7600 req->set_filepath(path);
7601 req->set_inode(diri.get());
7602 req->head.args.readdir.frag = fg;
7603 req->head.args.readdir.flags = CEPH_READDIR_REPLY_BITFLAGS;
7604 if (dirp->last_name.length()) {
7605 req->path2.set_path(dirp->last_name.c_str());
7606 } else if (dirp->hash_order()) {
7607 req->head.args.readdir.offset_hash = dirp->offset_high();
7612 int res = make_request(req, dirp->perms, NULL, NULL, -1, &dirbl);
7614 if (res == -EAGAIN) {
7615 ldout(cct, 10) << "_readdir_get_frag got EAGAIN, retrying" << dendl;
7616 _readdir_rechoose_frag(dirp);
7617 return _readdir_get_frag(dirp);
7621 ldout(cct, 10) << "_readdir_get_frag " << dirp << " got frag " << dirp->buffer_frag
7622 << " size " << dirp->buffer.size() << dendl;
7624 ldout(cct, 10) << "_readdir_get_frag got error " << res << ", setting end flag" << dendl;
7631 struct dentry_off_lt {
7632 bool operator()(const Dentry* dn, int64_t off) const {
7633 return dir_result_t::fpos_cmp(dn->offset, off) < 0;
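// Comparator used with std::lower_bound() below to locate the first
// cached dentry whose offset is not less than the readdir position.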
7637 int Client::_readdir_cache_cb(dir_result_t *dirp, add_dirent_cb_t cb, void *p,
7638 int caps, bool getref)
7640 assert(client_lock.is_locked());
7641 ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino
7642 << " last_name " << dirp->last_name << " offset " << hex << dirp->offset << dec
7644 Dir *dir = dirp->inode->dir;
7647 ldout(cct, 10) << " dir is empty" << dendl;
7652 vector<Dentry*>::iterator pd = std::lower_bound(dir->readdir_cache.begin(),
7653 dir->readdir_cache.end(),
7654 dirp->offset, dentry_off_lt());
7658 if (!dirp->inode->is_complete_and_ordered())
7660 if (pd == dir->readdir_cache.end())
7663 if (dn->inode == NULL) {
7664 ldout(cct, 15) << " skipping null '" << dn->name << "'" << dendl;
7668 if (dn->cap_shared_gen != dir->parent_inode->shared_gen) {
7669 ldout(cct, 15) << " skipping mismatch shared gen '" << dn->name << "'" << dendl;
7674 int r = _getattr(dn->inode, caps, dirp->perms);
7678 struct ceph_statx stx;
7680 fill_statx(dn->inode, caps, &stx);
7682 uint64_t next_off = dn->offset + 1;
7684 if (pd == dir->readdir_cache.end())
7685 next_off = dir_result_t::END;
7688 fill_dirent(&de, dn->name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
7690 in = dn->inode.get();
7694 dn_name = dn->name; // fill in name while we have lock
7696 client_lock.Unlock();
7697 r = cb(p, &de, &stx, next_off, in); // _next_ offset
7699 ldout(cct, 15) << " de " << de.d_name << " off " << hex << dn->offset << dec
7700 << " = " << r << dendl;
7705 dirp->offset = next_off;
7707 dirp->next_offset = 2;
7709 dirp->next_offset = dirp->offset_low();
7710 dirp->last_name = dn_name; // we successfully returned this one; update!
7715 ldout(cct, 10) << "_readdir_cache_cb " << dirp << " on " << dirp->inode->ino << " at end" << dendl;
7720 int Client::readdir_r_cb(dir_result_t *d, add_dirent_cb_t cb, void *p,
7721 unsigned want, unsigned flags, bool getref)
7723 int caps = statx_to_mask(flags, want);
7725 Mutex::Locker lock(client_lock);
7730 dir_result_t *dirp = static_cast<dir_result_t*>(d);
7732 ldout(cct, 10) << "readdir_r_cb " << *dirp->inode << " offset " << hex << dirp->offset
7733 << dec << " at_end=" << dirp->at_end()
7734 << " hash_order=" << dirp->hash_order() << dendl;
7737 struct ceph_statx stx;
7738 memset(&de, 0, sizeof(de));
7739 memset(&stx, 0, sizeof(stx));
7741 InodeRef& diri = dirp->inode;
7746 if (dirp->offset == 0) {
7747 ldout(cct, 15) << " including ." << dendl;
7748 assert(diri->dn_set.size() < 2); // can't have multiple hard-links to a dir
7749 uint64_t next_off = 1;
7752 r = _getattr(diri, caps, dirp->perms);
7756 fill_statx(diri, caps, &stx);
7757 fill_dirent(&de, ".", S_IFDIR, stx.stx_ino, next_off);
7759 Inode *inode = NULL;
7765 client_lock.Unlock();
7766 r = cb(p, &de, &stx, next_off, inode);
7771 dirp->offset = next_off;
7775 if (dirp->offset == 1) {
7776 ldout(cct, 15) << " including .." << dendl;
7777 uint64_t next_off = 2;
7779 if (diri->dn_set.empty())
7782 in = diri->get_first_parent()->inode;
7785 r = _getattr(diri, caps, dirp->perms);
7789 fill_statx(in, caps, &stx);
7790 fill_dirent(&de, "..", S_IFDIR, stx.stx_ino, next_off);
7792 Inode *inode = NULL;
7798 client_lock.Unlock();
7799 r = cb(p, &de, &stx, next_off, inode);
7804 dirp->offset = next_off;
7809 // can we read from our cache?
7810 ldout(cct, 10) << "offset " << hex << dirp->offset << dec
7811 << " snapid " << dirp->inode->snapid << " (complete && ordered) "
7812 << dirp->inode->is_complete_and_ordered()
7813 << " issued " << ccap_string(dirp->inode->caps_issued())
7815 if (dirp->inode->snapid != CEPH_SNAPDIR &&
7816 dirp->inode->is_complete_and_ordered() &&
7817 dirp->inode->caps_issued_mask(CEPH_CAP_FILE_SHARED)) {
7818 int err = _readdir_cache_cb(dirp, cb, p, caps, getref);
7827 bool check_caps = true;
7828 if (!dirp->is_cached()) {
7829 int r = _readdir_get_frag(dirp);
7832 // _readdir_get_frag() may update dirp->offset if the replied dirfrag is
7833 // different from the requested one (our dirfragtree was outdated).
7836 frag_t fg = dirp->buffer_frag;
7838 ldout(cct, 10) << "frag " << fg << " buffer size " << dirp->buffer.size()
7839 << " offset " << hex << dirp->offset << dendl;
7841 for (auto it = std::lower_bound(dirp->buffer.begin(), dirp->buffer.end(),
7842 dirp->offset, dir_result_t::dentry_off_lt());
7843 it != dirp->buffer.end();
7845 dir_result_t::dentry &entry = *it;
7847 uint64_t next_off = entry.offset + 1;
7851 r = _getattr(entry.inode, caps, dirp->perms);
7856 fill_statx(entry.inode, caps, &stx);
7857 fill_dirent(&de, entry.name.c_str(), stx.stx_mode, stx.stx_ino, next_off);
7859 Inode *inode = NULL;
7861 inode = entry.inode.get();
7865 client_lock.Unlock();
7866 r = cb(p, &de, &stx, next_off, inode); // _next_ offset
7869 ldout(cct, 15) << " de " << de.d_name << " off " << hex << next_off - 1 << dec
7870 << " = " << r << dendl;
7874 dirp->offset = next_off;
7879 if (dirp->next_offset > 2) {
7880 ldout(cct, 10) << " fetching next chunk of this frag" << dendl;
7881 _readdir_drop_dirp_buffer(dirp);
7885 if (!fg.is_rightmost()) {
7887 _readdir_next_frag(dirp);
7891 if (diri->shared_gen == dirp->start_shared_gen &&
7892 diri->dir_release_count == dirp->release_count) {
7893 if (diri->dir_ordered_count == dirp->ordered_count) {
7894 ldout(cct, 10) << " marking (I_COMPLETE|I_DIR_ORDERED) on " << *diri << dendl;
7896 assert(diri->dir->readdir_cache.size() >= dirp->cache_index);
7897 diri->dir->readdir_cache.resize(dirp->cache_index);
7899 diri->flags |= I_COMPLETE | I_DIR_ORDERED;
7901 ldout(cct, 10) << " marking I_COMPLETE on " << *diri << dendl;
7902 diri->flags |= I_COMPLETE;
7914 int Client::readdir_r(dir_result_t *d, struct dirent *de)
7916 return readdirplus_r(d, de, 0, 0, 0, NULL);
7923 * 1 if we got a dirent
7924 * 0 for end of directory
7928 struct single_readdir {
7930 struct ceph_statx *stx;
7935 static int _readdir_single_dirent_cb(void *p, struct dirent *de,
7936 struct ceph_statx *stx, off_t off,
7939 single_readdir *c = static_cast<single_readdir *>(p);
7942 return -1; // already filled this dirent
7952 struct dirent *Client::readdir(dir_result_t *d)
7955 static struct dirent de;
7962 // our callback fills the dirent and sets sr.full=true on first
7963 // call, and returns -1 the second time around.
7964 ret = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr);
7966 errno = -ret; // this sucks.
7967 return (dirent *) NULL;
7972 return (dirent *) NULL;
7975 int Client::readdirplus_r(dir_result_t *d, struct dirent *de,
7976 struct ceph_statx *stx, unsigned want,
7977 unsigned flags, Inode **out)
7985 // our callback fills the dirent and sets sr.full=true on first
7986 // call, and returns -1 the second time around.
7987 int r = readdir_r_cb(d, _readdir_single_dirent_cb, (void *)&sr, want, flags, out);
7999 struct getdents_result {
8006 static int _readdir_getdent_cb(void *p, struct dirent *de,
8007 struct ceph_statx *stx, off_t off, Inode *in)
8009 struct getdents_result *c = static_cast<getdents_result *>(p);
8015 dlen = strlen(de->d_name) + 1;
8017 if (c->pos + dlen > c->buflen)
8018 return -1; // doesn't fit
8021 memcpy(c->buf + c->pos, de, sizeof(*de));
8023 memcpy(c->buf + c->pos, de->d_name, dlen);
8029 int Client::_getdents(dir_result_t *dir, char *buf, int buflen, bool fullent)
8034 gr.fullent = fullent;
8037 int r = readdir_r_cb(dir, _readdir_getdent_cb, (void *)&gr);
8039 if (r < 0) { // some error
8040 if (r == -1) { // buffer ran out of space
8041 if (gr.pos) { // but we got some entries already!
8043 } // or we need a larger buffer
8045 } else { // actual error, return it
8054 struct getdir_result {
8055 list<string> *contents;
8059 static int _getdir_cb(void *p, struct dirent *de, struct ceph_statx *stx, off_t off, Inode *in)
8061 getdir_result *r = static_cast<getdir_result *>(p);
8063 r->contents->push_back(de->d_name);
8068 int Client::getdir(const char *relpath, list<string>& contents,
8069 const UserPerm& perms)
8071 ldout(cct, 3) << "getdir(" << relpath << ")" << dendl;
8073 Mutex::Locker lock(client_lock);
8074 tout(cct) << "getdir" << std::endl;
8075 tout(cct) << relpath << std::endl;
8079 int r = opendir(relpath, &d, perms);
8084 gr.contents = &contents;
8086 r = readdir_r_cb(d, _getdir_cb, (void *)&gr);
8096 /****** file i/o **********/
8097 int Client::open(const char *relpath, int flags, const UserPerm& perms,
8098 mode_t mode, int stripe_unit, int stripe_count,
8099 int object_size, const char *data_pool)
8101 ldout(cct, 3) << "open enter(" << relpath << ", " << ceph_flags_sys2wire(flags) << "," << mode << ")" << dendl;
8102 Mutex::Locker lock(client_lock);
8103 tout(cct) << "open" << std::endl;
8104 tout(cct) << relpath << std::endl;
8105 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
8112 #if defined(__linux__) && defined(O_PATH)
8113 /* When O_PATH is specified, flags other than O_DIRECTORY
8114 * and O_NOFOLLOW are ignored. Please refer to the do_entry_open()
8115 * function in the kernel (fs/open.c). */
8117 flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
8120 filepath path(relpath);
8122 bool created = false;
8123 /* O_CREAT with O_EXCL enforces O_NOFOLLOW. */
8124 bool followsym = !((flags & O_NOFOLLOW) || ((flags & O_CREAT) && (flags & O_EXCL)));
8125 int r = path_walk(path, &in, perms, followsym, ceph_caps_for_mode(mode));
8127 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
8130 #if defined(__linux__) && defined(O_PATH)
8131 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW) && !(flags & O_PATH))
8133 if (r == 0 && in->is_symlink() && (flags & O_NOFOLLOW))
8137 if (r == -ENOENT && (flags & O_CREAT)) {
8138 filepath dirpath = path;
8139 string dname = dirpath.last_dentry();
8140 dirpath.pop_dentry();
8142 r = path_walk(dirpath, &dir, perms, true,
8143 cct->_conf->client_permissions ? CEPH_CAP_AUTH_SHARED : 0);
8146 if (cct->_conf->client_permissions) {
8147 r = may_create(dir.get(), perms);
8151 r = _create(dir.get(), dname.c_str(), flags, mode, &in, &fh, stripe_unit,
8152 stripe_count, object_size, data_pool, &created, perms);
8158 // posix says we can only check permissions of existing files
8159 if (cct->_conf->client_permissions) {
8160 r = may_open(in.get(), flags, perms);
8167 r = _open(in.get(), flags, mode, &fh, perms);
8169 // allocate an integer file descriptor
8172 assert(fd_map.count(r) == 0);
8177 tout(cct) << r << std::endl;
8178 ldout(cct, 3) << "open exit(" << path << ", " << ceph_flags_sys2wire(flags) << ") = " << r << dendl;
8182 int Client::open(const char *relpath, int flags, const UserPerm& perms, mode_t mode)
8184 /* Use default file striping parameters */
8185 return open(relpath, flags, perms, mode, 0, 0, 0, NULL);
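/*
 * Illustrative usage sketch (hypothetical caller, not part of this file;
 * assumes a mounted Client *client and a UserPerm perms):
 *
 *   int fd = client->open("dir/file", O_RDONLY, perms, 0);
 *   if (fd >= 0) {
 *     char buf[4096];
 *     int n = client->read(fd, buf, sizeof(buf), 0);
 *     client->close(fd);
 *   }
 */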
8188 int Client::lookup_hash(inodeno_t ino, inodeno_t dirino, const char *name,
8189 const UserPerm& perms)
8191 Mutex::Locker lock(client_lock);
8192 ldout(cct, 3) << "lookup_hash enter(" << ino << ", #" << dirino << "/" << name << ")" << dendl;
8197 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPHASH);
8199 req->set_filepath(path);
8201 uint32_t h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, strlen(name));
8203 sprintf(f, "%u", h);
8204 filepath path2(dirino);
8205 path2.push_dentry(string(f));
8206 req->set_filepath2(path2);
8208 int r = make_request(req, perms, NULL, NULL,
8209 rand() % mdsmap->get_num_in_mds());
8210 ldout(cct, 3) << "lookup_hash exit(" << ino << ", #" << dirino << "/" << name << ") = " << r << dendl;
8216 * Load inode into local cache.
8218 * If the inode pointer is non-NULL, take a reference on
8219 * the resulting Inode object in the same operation, so that the caller
8220 * can safely assume the inode will still be there after return.
8222 int Client::lookup_ino(inodeno_t ino, const UserPerm& perms, Inode **inode)
8224 Mutex::Locker lock(client_lock);
8225 ldout(cct, 3) << "lookup_ino enter(" << ino << ")" << dendl;
8230 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPINO);
8232 req->set_filepath(path);
8234 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8235 if (r == 0 && inode != NULL) {
8236 vinodeno_t vino(ino, CEPH_NOSNAP);
8237 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
8238 assert(p != inode_map.end());
8242 ldout(cct, 3) << "lookup_ino exit(" << ino << ") = " << r << dendl;
8249 * Find the parent inode of `ino` and insert it into
8250 * our cache. Conditionally also set `parent` to a referenced
8251 * Inode* if caller provides non-NULL value.
8253 int Client::lookup_parent(Inode *ino, const UserPerm& perms, Inode **parent)
8255 Mutex::Locker lock(client_lock);
8256 ldout(cct, 3) << "lookup_parent enter(" << ino->ino << ")" << dendl;
8261 if (!ino->dn_set.empty()) {
8262 // if we exposed the parent here, we'd need to check permissions,
8263 // but right now we just rely on the MDS doing so in make_request
8264 ldout(cct, 3) << "lookup_parent dentry already present" << dendl;
8268 if (ino->is_root()) {
8270 ldout(cct, 3) << "ino is root, no parent" << dendl;
8274 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPPARENT);
8275 filepath path(ino->ino);
8276 req->set_filepath(path);
8279 int r = make_request(req, perms, &target, NULL, rand() % mdsmap->get_num_in_mds());
8280 // Give caller a reference to the parent ino if they provided a pointer.
8281 if (parent != NULL) {
8283 *parent = target.get();
8285 ldout(cct, 3) << "lookup_parent found parent " << (*parent)->ino << dendl;
8290 ldout(cct, 3) << "lookup_parent exit(" << ino->ino << ") = " << r << dendl;
8296 * Populate the parent dentry for `ino`, provided it is
8297 * a child of `parent`.
8299 int Client::lookup_name(Inode *ino, Inode *parent, const UserPerm& perms)
8301 assert(parent->is_dir());
8303 Mutex::Locker lock(client_lock);
8304 ldout(cct, 3) << "lookup_name enter(" << ino->ino << ")" << dendl;
8309 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
8310 req->set_filepath2(filepath(parent->ino));
8311 req->set_filepath(filepath(ino->ino));
8312 req->set_inode(ino);
8314 int r = make_request(req, perms, NULL, NULL, rand() % mdsmap->get_num_in_mds());
8315 ldout(cct, 3) << "lookup_name exit(" << ino->ino << ") = " << r << dendl;
8320 Fh *Client::_create_fh(Inode *in, int flags, int cmode, const UserPerm& perms)
8328 f->actor_perms = perms;
8330 ldout(cct, 10) << "_create_fh " << in->ino << " mode " << cmode << dendl;
8332 if (in->snapid != CEPH_NOSNAP) {
8333 in->snap_cap_refs++;
8334 ldout(cct, 5) << "open success, fh is " << f << " combined IMMUTABLE SNAP caps "
8335 << ccap_string(in->caps_issued()) << dendl;
8338 const md_config_t *conf = cct->_conf;
8339 f->readahead.set_trigger_requests(1);
8340 f->readahead.set_min_readahead_size(conf->client_readahead_min);
8341 uint64_t max_readahead = Readahead::NO_LIMIT;
8342 if (conf->client_readahead_max_bytes) {
8343 max_readahead = MIN(max_readahead, (uint64_t)conf->client_readahead_max_bytes);
8345 if (conf->client_readahead_max_periods) {
8346 max_readahead = MIN(max_readahead, in->layout.get_period()*(uint64_t)conf->client_readahead_max_periods);
8348 f->readahead.set_max_readahead_size(max_readahead);
8349 vector<uint64_t> alignments;
8350 alignments.push_back(in->layout.get_period());
8351 alignments.push_back(in->layout.stripe_unit);
8352 f->readahead.set_alignments(alignments);
8357 int Client::_release_fh(Fh *f)
8359 //ldout(cct, 3) << "op: client->close(open_files[ " << fh << " ]);" << dendl;
8360 //ldout(cct, 3) << "op: open_files.erase( " << fh << " );" << dendl;
8361 Inode *in = f->inode.get();
8362 ldout(cct, 5) << "_release_fh " << f << " mode " << f->mode << " on " << *in << dendl;
8364 if (in->snapid == CEPH_NOSNAP) {
8365 if (in->put_open_ref(f->mode)) {
8366 _flush(in, new C_Client_FlushComplete(this, in));
8370 assert(in->snap_cap_refs > 0);
8371 in->snap_cap_refs--;
8374 _release_filelocks(f);
8376 // Finally, read any async err (i.e. from flushes)
8377 int err = f->take_async_err();
8379 ldout(cct, 1) << "_release_fh " << f << " on inode " << *in << " caught async_err = "
8380 << cpp_strerror(err) << dendl;
8382 ldout(cct, 10) << "_release_fh " << f << " on inode " << *in << " no async_err state" << dendl;
8390 void Client::_put_fh(Fh *f)
8392 int left = f->put();
8398 int Client::_open(Inode *in, int flags, mode_t mode, Fh **fhp,
8399 const UserPerm& perms)
8401 if (in->snapid != CEPH_NOSNAP &&
8402 (flags & (O_WRONLY | O_RDWR | O_CREAT | O_TRUNC | O_APPEND))) {
8406 // use normalized flags to generate cmode
8407 int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
8410 int want = ceph_caps_for_mode(cmode);
8413 in->get_open_ref(cmode); // make note of pending open, since it affects _wanted_ caps.
8415 if ((flags & O_TRUNC) == 0 &&
8416 in->caps_issued_mask(want)) {
8418 check_caps(in, CHECK_CAPS_NODELAY);
8420 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8422 in->make_nosnap_relative_path(path);
8423 req->set_filepath(path);
8424 req->head.args.open.flags = ceph_flags_sys2wire(flags & ~O_CREAT);
8425 req->head.args.open.mode = mode;
8426 req->head.args.open.pool = -1;
8427 if (cct->_conf->client_debug_getattr_caps)
8428 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8430 req->head.args.open.mask = 0;
8431 req->head.args.open.old_size = in->size; // for O_TRUNC
8433 result = make_request(req, perms);
8439 *fhp = _create_fh(in, flags, cmode, perms);
8441 in->put_open_ref(cmode);
8449 int Client::_renew_caps(Inode *in)
8451 int wanted = in->caps_file_wanted();
8452 if (in->is_any_caps() &&
8453 ((wanted & CEPH_CAP_ANY_WR) == 0 || in->auth_cap)) {
8454 check_caps(in, CHECK_CAPS_NODELAY);
8459 if ((wanted & CEPH_CAP_FILE_RD) && (wanted & CEPH_CAP_FILE_WR))
8461 else if (wanted & CEPH_CAP_FILE_RD)
8463 else if (wanted & CEPH_CAP_FILE_WR)
8466 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_OPEN);
8468 in->make_nosnap_relative_path(path);
8469 req->set_filepath(path);
8470 req->head.args.open.flags = flags;
8471 req->head.args.open.pool = -1;
8472 if (cct->_conf->client_debug_getattr_caps)
8473 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
8475 req->head.args.open.mask = 0;
8478 // duplicate in case Cap goes away; not sure if that race is a concern?
8479 const UserPerm *pperm = in->get_best_perms();
8483 int ret = make_request(req, perms);
8487 int Client::close(int fd)
8489 ldout(cct, 3) << "close enter(" << fd << ")" << dendl;
8490 Mutex::Locker lock(client_lock);
8491 tout(cct) << "close" << std::endl;
8492 tout(cct) << fd << std::endl;
8497 Fh *fh = get_filehandle(fd);
8500 int err = _release_fh(fh);
8503 ldout(cct, 3) << "close exit(" << fd << ")" << dendl;
8511 loff_t Client::lseek(int fd, loff_t offset, int whence)
8513 Mutex::Locker lock(client_lock);
8514 tout(cct) << "lseek" << std::endl;
8515 tout(cct) << fd << std::endl;
8516 tout(cct) << offset << std::endl;
8517 tout(cct) << whence << std::endl;
8522 Fh *f = get_filehandle(fd);
8525 #if defined(__linux__) && defined(O_PATH)
8526 if (f->flags & O_PATH)
8529 return _lseek(f, offset, whence);
8532 loff_t Client::_lseek(Fh *f, loff_t offset, int whence)
8534 Inode *in = f->inode.get();
8547 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8550 f->pos = in->size + offset;
8557 ldout(cct, 3) << "_lseek(" << f << ", " << offset << ", " << whence << ") = " << f->pos << dendl;
8562 void Client::lock_fh_pos(Fh *f)
8564 ldout(cct, 10) << "lock_fh_pos " << f << dendl;
8566 if (f->pos_locked || !f->pos_waiters.empty()) {
8568 f->pos_waiters.push_back(&cond);
8569 ldout(cct, 10) << "lock_fh_pos BLOCKING on " << f << dendl;
8570 while (f->pos_locked || f->pos_waiters.front() != &cond)
8571 cond.Wait(client_lock);
8572 ldout(cct, 10) << "lock_fh_pos UNBLOCKING on " << f << dendl;
8573 assert(f->pos_waiters.front() == &cond);
8574 f->pos_waiters.pop_front();
8577 f->pos_locked = true;
8580 void Client::unlock_fh_pos(Fh *f)
8582 ldout(cct, 10) << "unlock_fh_pos " << f << dendl;
8583 f->pos_locked = false;
8586 int Client::uninline_data(Inode *in, Context *onfinish)
8588 if (!in->inline_data.length()) {
8589 onfinish->complete(0);
8594 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (long long unsigned)in->ino);
8595 object_t oid = oid_buf;
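// Two-step uninline: first make sure the backing RADOS object exists,
// then write the inline data out under a cmpxattr guard on the
// "inline_version" xattr and record the new version with setxattr.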
8597 ObjectOperation create_ops;
8598 create_ops.create(false);
8600 objecter->mutate(oid,
8601 OSDMap::file_to_object_locator(in->layout),
8603 in->snaprealm->get_snap_context(),
8604 ceph::real_clock::now(),
8608 bufferlist inline_version_bl;
8609 ::encode(in->inline_version, inline_version_bl);
8611 ObjectOperation uninline_ops;
8612 uninline_ops.cmpxattr("inline_version",
8613 CEPH_OSD_CMPXATTR_OP_GT,
8614 CEPH_OSD_CMPXATTR_MODE_U64,
8616 bufferlist inline_data = in->inline_data;
8617 uninline_ops.write(0, inline_data, in->truncate_size, in->truncate_seq);
8618 uninline_ops.setxattr("inline_version", stringify(in->inline_version));
8620 objecter->mutate(oid,
8621 OSDMap::file_to_object_locator(in->layout),
8623 in->snaprealm->get_snap_context(),
8624 ceph::real_clock::now(),
8633 // blocking osd interface
8635 int Client::read(int fd, char *buf, loff_t size, loff_t offset)
8637 Mutex::Locker lock(client_lock);
8638 tout(cct) << "read" << std::endl;
8639 tout(cct) << fd << std::endl;
8640 tout(cct) << size << std::endl;
8641 tout(cct) << offset << std::endl;
8646 Fh *f = get_filehandle(fd);
8649 #if defined(__linux__) && defined(O_PATH)
8650 if (f->flags & O_PATH)
8654 int r = _read(f, offset, size, &bl);
8655 ldout(cct, 3) << "read(" << fd << ", " << (void*)buf << ", " << size << ", " << offset << ") = " << r << dendl;
8657 bl.copy(0, bl.length(), buf);
8663 int Client::preadv(int fd, const struct iovec *iov, int iovcnt, loff_t offset)
8667 return _preadv_pwritev(fd, iov, iovcnt, offset, false);
8670 int Client::_read(Fh *f, int64_t offset, uint64_t size, bufferlist *bl)
8672 const md_config_t *conf = cct->_conf;
8673 Inode *in = f->inode.get();
8675 if ((f->mode & CEPH_FILE_MODE_RD) == 0)
8677 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
8679 bool movepos = false;
8685 loff_t start_pos = offset;
8687 if (in->inline_version == 0) {
8688 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
8694 assert(in->inline_version > 0);
8699 int r = get_caps(in, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE, &have, -1);
8705 if (f->flags & O_DIRECT)
8706 have &= ~CEPH_CAP_FILE_CACHE;
8708 Mutex uninline_flock("Client::_read_uninline_data flock");
8710 bool uninline_done = false;
8711 int uninline_ret = 0;
8712 Context *onuninline = NULL;
8714 if (in->inline_version < CEPH_INLINE_NONE) {
8715 if (!(have & CEPH_CAP_FILE_CACHE)) {
8716 onuninline = new C_SafeCond(&uninline_flock,
8720 uninline_data(in, onuninline);
8722 uint32_t len = in->inline_data.length();
8724 uint64_t endoff = offset + size;
8725 if (endoff > in->size)
8729 if (endoff <= len) {
8730 bl->substr_of(in->inline_data, offset, endoff - offset);
8732 bl->substr_of(in->inline_data, offset, len - offset);
8733 bl->append_zero(endoff - len);
8735 } else if ((uint64_t)offset < endoff) {
8736 bl->append_zero(endoff - offset);
8743 if (!conf->client_debug_force_sync_read &&
8744 (conf->client_oc && (have & CEPH_CAP_FILE_CACHE))) {
8746 if (f->flags & O_RSYNC) {
8747 _flush_range(in, offset, size);
8749 r = _read_async(f, offset, size, bl);
8753 if (f->flags & O_DIRECT)
8754 _flush_range(in, offset, size);
8756 bool checkeof = false;
8757 r = _read_sync(f, offset, size, bl, &checkeof);
8764 put_cap_ref(in, CEPH_CAP_FILE_RD);
8767 r = _getattr(in, CEPH_STAT_CAP_SIZE, f->actor_perms);
8772 if ((uint64_t)offset < in->size)
8780 f->pos = start_pos + bl->length();
8788 client_lock.Unlock();
8789 uninline_flock.Lock();
8790 while (!uninline_done)
8791 uninline_cond.Wait(uninline_flock);
8792 uninline_flock.Unlock();
8795 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
8796 in->inline_data.clear();
8797 in->inline_version = CEPH_INLINE_NONE;
8798 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
8805 put_cap_ref(in, CEPH_CAP_FILE_RD);
8811 return bl->length();
8814 Client::C_Readahead::C_Readahead(Client *c, Fh *f) :
8817 f->readahead.inc_pending();
8820 Client::C_Readahead::~C_Readahead() {
8821 f->readahead.dec_pending();
8825 void Client::C_Readahead::finish(int r) {
8826 lgeneric_subdout(client->cct, client, 20) << "client." << client->get_nodeid() << " " << "C_Readahead on " << f->inode << dendl;
8827 client->put_cap_ref(f->inode.get(), CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
8830 int Client::_read_async(Fh *f, uint64_t off, uint64_t len, bufferlist *bl)
8832 const md_config_t *conf = cct->_conf;
8833 Inode *in = f->inode.get();
8835 ldout(cct, 10) << "_read_async " << *in << " " << off << "~" << len << dendl;
8837 // trim read based on file size?
8838 if (off >= in->size)
8842 if (off + len > in->size) {
8843 len = in->size - off;
8846 ldout(cct, 10) << " min_bytes=" << f->readahead.get_min_readahead_size()
8847 << " max_bytes=" << f->readahead.get_max_readahead_size()
8848 << " max_periods=" << conf->client_readahead_max_periods << dendl;
8850 // read (and possibly block)
8852 Mutex flock("Client::_read_async flock");
8855 Context *onfinish = new C_SafeCond(&flock, &cond, &done, &rvalue);
8856 r = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
8857 off, len, bl, 0, onfinish);
8859 get_cap_ref(in, CEPH_CAP_FILE_CACHE);
8860 client_lock.Unlock();
8866 put_cap_ref(in, CEPH_CAP_FILE_CACHE);
8873 if(f->readahead.get_min_readahead_size() > 0) {
8874 pair<uint64_t, uint64_t> readahead_extent = f->readahead.update(off, len, in->size);
8875 if (readahead_extent.second > 0) {
8876 ldout(cct, 20) << "readahead " << readahead_extent.first << "~" << readahead_extent.second
8877 << " (caller wants " << off << "~" << len << ")" << dendl;
8878 Context *onfinish2 = new C_Readahead(this, f);
8879 int r2 = objectcacher->file_read(&in->oset, &in->layout, in->snapid,
8880 readahead_extent.first, readahead_extent.second,
8881 NULL, 0, onfinish2);
8883 ldout(cct, 20) << "readahead initiated, c " << onfinish2 << dendl;
8884 get_cap_ref(in, CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE);
8886 ldout(cct, 20) << "readahead was no-op, already cached" << dendl;
8895 int Client::_read_sync(Fh *f, uint64_t off, uint64_t len, bufferlist *bl,
8898 Inode *in = f->inode.get();
8903 ldout(cct, 10) << "_read_sync " << *in << " " << off << "~" << len << dendl;
8905 Mutex flock("Client::_read_sync flock");
8910 Context *onfinish = new C_SafeCond(&flock, &cond, &done, &r);
8914 filer->read_trunc(in->ino, &in->layout, in->snapid,
8916 in->truncate_size, in->truncate_seq,
8918 client_lock.Unlock();
8925 // if we get ENOENT from OSD, assume 0 bytes returned
8936 bl->claim_append(tbl);
8939 if (r >= 0 && r < wanted) {
8940 if (pos < in->size) {
8941 // zero up to known EOF
8942 int64_t some = in->size - pos;
8964 * we keep count of uncommitted sync writes on the inode, so that
8967 void Client::_sync_write_commit(Inode *in)
8969 assert(unsafe_sync_write > 0);
8970 unsafe_sync_write--;
8972 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
8974 ldout(cct, 15) << "sync_write_commit unsafe_sync_write = " << unsafe_sync_write << dendl;
8975 if (unsafe_sync_write == 0 && unmounting) {
8976 ldout(cct, 10) << "sync_write_commit -- no more unsafe writes, unmount can proceed" << dendl;
8977 mount_cond.Signal();
8981 int Client::write(int fd, const char *buf, loff_t size, loff_t offset)
8983 Mutex::Locker lock(client_lock);
8984 tout(cct) << "write" << std::endl;
8985 tout(cct) << fd << std::endl;
8986 tout(cct) << size << std::endl;
8987 tout(cct) << offset << std::endl;
8992 Fh *fh = get_filehandle(fd);
8995 #if defined(__linux__) && defined(O_PATH)
8996 if (fh->flags & O_PATH)
8999 int r = _write(fh, offset, size, buf, NULL, 0);
9000 ldout(cct, 3) << "write(" << fd << ", \"...\", " << size << ", " << offset << ") = " << r << dendl;
9004 int Client::pwritev(int fd, const struct iovec *iov, int iovcnt, int64_t offset)
9008 return _preadv_pwritev(fd, iov, iovcnt, offset, true);
9011 int Client::_preadv_pwritev(int fd, const struct iovec *iov, unsigned iovcnt, int64_t offset, bool write)
9013 Mutex::Locker lock(client_lock);
9014 tout(cct) << fd << std::endl;
9015 tout(cct) << offset << std::endl;
9020 Fh *fh = get_filehandle(fd);
9023 #if defined(__linux__) && defined(O_PATH)
9024 if (fh->flags & O_PATH)
9027 loff_t totallen = 0;
9028 for (unsigned i = 0; i < iovcnt; i++) {
9029 totallen += iov[i].iov_len;
9032 int w = _write(fh, offset, totallen, NULL, iov, iovcnt);
9033 ldout(cct, 3) << "pwritev(" << fd << ", \"...\", " << totallen << ", " << offset << ") = " << w << dendl;
9037 int r = _read(fh, offset, totallen, &bl);
9038 ldout(cct, 3) << "preadv(" << fd << ", " << offset << ") = " << r << dendl;
9043 for (unsigned j = 0, resid = r; j < iovcnt && resid > 0; j++) {
9045 * This piece of code handles the case where the bufferlist does not have enough data
9046 * to fill the iov
9048 if (resid < iov[j].iov_len) {
9049 bl.copy(bufoff, resid, (char *)iov[j].iov_base);
9052 bl.copy(bufoff, iov[j].iov_len, (char *)iov[j].iov_base);
9054 resid -= iov[j].iov_len;
9055 bufoff += iov[j].iov_len;
9061 int Client::_write(Fh *f, int64_t offset, uint64_t size, const char *buf,
9062 const struct iovec *iov, int iovcnt)
9064 if ((uint64_t)(offset+size) > mdsmap->get_max_filesize()) //too large!
9067 //ldout(cct, 7) << "write fh " << fh << " size " << size << " offset " << offset << dendl;
9068 Inode *in = f->inode.get();
9070 if (objecter->osdmap_pool_full(in->layout.pool_id)) {
9074 assert(in->snapid == CEPH_NOSNAP);
9076 // was Fh opened as writeable?
9077 if ((f->mode & CEPH_FILE_MODE_WR) == 0)
9081 uint64_t endoff = offset + size;
9082 if (endoff > in->size && is_quota_bytes_exceeded(in, endoff - in->size,
9087 // use/adjust fd pos?
9091 * FIXME: this is racy in that we may block _after_ this point waiting for caps, and size may
9092 * change out from under us.
9094 if (f->flags & O_APPEND) {
9095 int r = _lseek(f, 0, SEEK_END);
9102 f->pos = offset+size;
9106 //bool lazy = f->mode == CEPH_FILE_MODE_LAZY;
9108 ldout(cct, 10) << "cur file size is " << in->size << dendl;
9111 utime_t start = ceph_clock_now();
9113 if (in->inline_version == 0) {
9114 int r = _getattr(in, CEPH_STAT_CAP_INLINE_DATA, f->actor_perms, true);
9117 assert(in->inline_version > 0);
9120 // copy into a fresh buffer (since our write may be resubmitted asynchronously)
9124 bl.append(buf, size);
9126 for (int i = 0; i < iovcnt; i++) {
9127 if (iov[i].iov_len > 0) {
9128 bl.append((const char *)iov[i].iov_base, iov[i].iov_len);
9134 uint64_t totalwritten;
9136 int r = get_caps(in, CEPH_CAP_FILE_WR|CEPH_CAP_AUTH_SHARED,
9137 CEPH_CAP_FILE_BUFFER, &have, endoff);
9141 /* clear the setuid/setgid bits, if any */
9142 if (unlikely(in->mode & (S_ISUID|S_ISGID)) && size > 0) {
9143 struct ceph_statx stx = { 0 };
9145 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9146 r = __setattrx(in, &stx, CEPH_SETATTR_KILL_SGUID, f->actor_perms);
9150 put_cap_ref(in, CEPH_CAP_AUTH_SHARED);
9153 if (f->flags & O_DIRECT)
9154 have &= ~CEPH_CAP_FILE_BUFFER;
9156 ldout(cct, 10) << " snaprealm " << *in->snaprealm << dendl;
9158 Mutex uninline_flock("Client::_write_uninline_data flock");
9160 bool uninline_done = false;
9161 int uninline_ret = 0;
9162 Context *onuninline = NULL;
9164 if (in->inline_version < CEPH_INLINE_NONE) {
9165 if (endoff > cct->_conf->client_max_inline_size ||
9166 endoff > CEPH_INLINE_MAX_SIZE ||
9167 !(have & CEPH_CAP_FILE_BUFFER)) {
9168 onuninline = new C_SafeCond(&uninline_flock,
9172 uninline_data(in, onuninline);
9174 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9176 uint32_t len = in->inline_data.length();
9179 in->inline_data.copy(endoff, len - endoff, bl);
9182 in->inline_data.splice(offset, len - offset);
9183 else if (offset > len)
9184 in->inline_data.append_zero(offset - len);
9186 in->inline_data.append(bl);
9187 in->inline_version++;
9189 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9195 if (cct->_conf->client_oc && (have & CEPH_CAP_FILE_BUFFER)) {
9196 // do buffered write
9197 if (!in->oset.dirty_or_tx)
9198 get_cap_ref(in, CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_BUFFER);
9200 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9202 // async, caching, non-blocking.
9203 r = objectcacher->file_write(&in->oset, &in->layout,
9204 in->snaprealm->get_snap_context(),
9205 offset, size, bl, ceph::real_clock::now(),
9207 put_cap_ref(in, CEPH_CAP_FILE_BUFFER);
9212 // flush cached write if O_SYNC is set on file fh
9213 // O_DSYNC == O_SYNC on linux < 2.6.33
9214 // O_SYNC = __O_SYNC | O_DSYNC on linux >= 2.6.33
9215 if ((f->flags & O_SYNC) || (f->flags & O_DSYNC)) {
9216 _flush_range(in, offset, size);
9219 if (f->flags & O_DIRECT)
9220 _flush_range(in, offset, size);
9222 // simple, non-atomic sync write
9223 Mutex flock("Client::_write flock");
9226 Context *onfinish = new C_SafeCond(&flock, &cond, &done);
9228 unsafe_sync_write++;
9229 get_cap_ref(in, CEPH_CAP_FILE_BUFFER); // released by onsafe callback
9231 filer->write_trunc(in->ino, &in->layout, in->snaprealm->get_snap_context(),
9232 offset, size, bl, ceph::real_clock::now(), 0,
9233 in->truncate_size, in->truncate_seq,
9235 client_lock.Unlock();
9242 _sync_write_commit(in);
9245 // if we get here, write was successful, update client metadata
9248 lat = ceph_clock_now();
9250 logger->tinc(l_c_wrlat, lat);
9252 totalwritten = size;
9253 r = (int)totalwritten;
9256 if (totalwritten + offset > in->size) {
9257 in->size = totalwritten + offset;
9258 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
9260 if (is_quota_bytes_approaching(in, f->actor_perms)) {
9261 check_caps(in, CHECK_CAPS_NODELAY);
9262 } else if (is_max_size_approaching(in)) {
9266 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", extending file size" << dendl;
9268 ldout(cct, 7) << "wrote to " << totalwritten+offset << ", leaving file size at " << in->size << dendl;
9272 in->mtime = ceph_clock_now();
9274 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
9279 client_lock.Unlock();
9280 uninline_flock.Lock();
9281 while (!uninline_done)
9282 uninline_cond.Wait(uninline_flock);
9283 uninline_flock.Unlock();
9286 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
9287 in->inline_data.clear();
9288 in->inline_version = CEPH_INLINE_NONE;
9289 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
9295 put_cap_ref(in, CEPH_CAP_FILE_WR);
9299 int Client::_flush(Fh *f)
9301 Inode *in = f->inode.get();
9302 int err = f->take_async_err();
9304 ldout(cct, 1) << __func__ << ": " << f << " on inode " << *in << " caught async_err = "
9305 << cpp_strerror(err) << dendl;
9307 ldout(cct, 10) << __func__ << ": " << f << " on inode " << *in << " no async_err state" << dendl;
9313 int Client::truncate(const char *relpath, loff_t length, const UserPerm& perms)
9315 struct ceph_statx stx;
9316 stx.stx_size = length;
9317 return setattrx(relpath, &stx, CEPH_SETATTR_SIZE, perms);
9320 int Client::ftruncate(int fd, loff_t length, const UserPerm& perms)
9322 Mutex::Locker lock(client_lock);
9323 tout(cct) << "ftruncate" << std::endl;
9324 tout(cct) << fd << std::endl;
9325 tout(cct) << length << std::endl;
9330 Fh *f = get_filehandle(fd);
9333 #if defined(__linux__) && defined(O_PATH)
9334 if (f->flags & O_PATH)
9338 attr.st_size = length;
9339 return _setattr(f->inode, &attr, CEPH_SETATTR_SIZE, perms);
9342 int Client::fsync(int fd, bool syncdataonly)
9344 Mutex::Locker lock(client_lock);
9345 tout(cct) << "fsync" << std::endl;
9346 tout(cct) << fd << std::endl;
9347 tout(cct) << syncdataonly << std::endl;
9352 Fh *f = get_filehandle(fd);
9355 #if defined(__linux__) && defined(O_PATH)
9356 if (f->flags & O_PATH)
9359 int r = _fsync(f, syncdataonly);
9361 // The IOs in this fsync were okay, but maybe something happened
9362 // in the background that we should be reporting?
9363 r = f->take_async_err();
9364 ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly
9365 << ") = 0, async_err = " << r << dendl;
9367 // Assume that an error we encountered during fsync, even reported
9368 // synchronously, would also have applied the error to the Fh, and we
9369 // should clear it here to avoid returning the same error again on next
9371 ldout(cct, 3) << "fsync(" << fd << ", " << syncdataonly << ") = "
9373 f->take_async_err();
9378 int Client::_fsync(Inode *in, bool syncdataonly)
9381 Mutex lock("Client::_fsync::lock");
9384 C_SafeCond *object_cacher_completion = NULL;
9385 ceph_tid_t flush_tid = 0;
9388 ldout(cct, 3) << "_fsync on " << *in << " " << (syncdataonly ? "(dataonly)":"(data+metadata)") << dendl;
9390 if (cct->_conf->client_oc) {
9391 object_cacher_completion = new C_SafeCond(&lock, &cond, &done, &r);
9392 tmp_ref = in; // take a reference; C_SafeCond doesn't and _flush won't either
9393 _flush(in, object_cacher_completion);
9394 ldout(cct, 15) << "using return-valued form of _fsync" << dendl;
9397 if (!syncdataonly && in->dirty_caps) {
9398 check_caps(in, CHECK_CAPS_NODELAY|CHECK_CAPS_SYNCHRONOUS);
9399 if (in->flushing_caps)
9400 flush_tid = last_flush_tid;
9401 } else ldout(cct, 10) << "no metadata needs to commit" << dendl;
9403 if (!syncdataonly && !in->unsafe_ops.empty()) {
9404 MetaRequest *req = in->unsafe_ops.back();
9405 ldout(cct, 15) << "waiting on unsafe requests, last tid " << req->get_tid() << dendl;
9408 wait_on_list(req->waitfor_safe);
9412 if (object_cacher_completion) { // wait on a real reply instead of guessing
9413 client_lock.Unlock();
9415 ldout(cct, 15) << "waiting on data to flush" << dendl;
9420 ldout(cct, 15) << "got " << r << " from flush writeback" << dendl;
9422 // FIXME: this can starve
9423 while (in->cap_refs[CEPH_CAP_FILE_BUFFER] > 0) {
9424 ldout(cct, 10) << "ino " << in->ino << " has " << in->cap_refs[CEPH_CAP_FILE_BUFFER]
9425 << " uncommitted, waiting" << dendl;
9426 wait_on_list(in->waitfor_commit);
9432 wait_sync_caps(in, flush_tid);
9434 ldout(cct, 10) << "ino " << in->ino << " has no uncommitted writes" << dendl;
9436 ldout(cct, 1) << "ino " << in->ino << " failed to commit to disk! "
9437 << cpp_strerror(-r) << dendl;
9443 int Client::_fsync(Fh *f, bool syncdataonly)
9445 ldout(cct, 3) << "_fsync(" << f << ", " << (syncdataonly ? "dataonly)":"data+metadata)") << dendl;
9446 return _fsync(f->inode.get(), syncdataonly);
9449 int Client::fstat(int fd, struct stat *stbuf, const UserPerm& perms, int mask)
9451 Mutex::Locker lock(client_lock);
9452 tout(cct) << "fstat mask " << hex << mask << dec << std::endl;
9453 tout(cct) << fd << std::endl;
9458 Fh *f = get_filehandle(fd);
9461 int r = _getattr(f->inode, mask, perms);
9464 fill_stat(f->inode, stbuf, NULL);
9465 ldout(cct, 3) << "fstat(" << fd << ", " << stbuf << ") = " << r << dendl;
9469 int Client::fstatx(int fd, struct ceph_statx *stx, const UserPerm& perms,
9470 unsigned int want, unsigned int flags)
9472 Mutex::Locker lock(client_lock);
9473 tout(cct) << "fstatx flags " << hex << flags << " want " << want << dec << std::endl;
9474 tout(cct) << fd << std::endl;
9479 Fh *f = get_filehandle(fd);
9483 unsigned mask = statx_to_mask(flags, want);
9486 if (mask && !f->inode->caps_issued_mask(mask)) {
9487 r = _getattr(f->inode, mask, perms);
9489 ldout(cct, 3) << "fstatx exit on error!" << dendl;
9494 fill_statx(f->inode, mask, stx);
9495 ldout(cct, 3) << "fstatx(" << fd << ", " << stx << ") = " << r << dendl;
9499 // not written yet, but i want to link!
9501 int Client::chdir(const char *relpath, std::string &new_cwd,
9502 const UserPerm& perms)
9504 Mutex::Locker lock(client_lock);
9505 tout(cct) << "chdir" << std::endl;
9506 tout(cct) << relpath << std::endl;
9511 filepath path(relpath);
9513 int r = path_walk(path, &in, perms);
9518 ldout(cct, 3) << "chdir(" << relpath << ") cwd now " << cwd->ino << dendl;
9520 _getcwd(new_cwd, perms);
9524 void Client::_getcwd(string& dir, const UserPerm& perms)
9527 ldout(cct, 10) << "getcwd " << *cwd << dendl;
9529 Inode *in = cwd.get();
9530 while (in != root) {
9531 assert(in->dn_set.size() < 2); // dirs can't be hard-linked
9533 // A cwd or ancestor is unlinked
9534 if (in->dn_set.empty()) {
9538 Dentry *dn = in->get_first_parent();
9543 ldout(cct, 10) << "getcwd looking up parent for " << *in << dendl;
9544 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
9545 filepath path(in->ino);
9546 req->set_filepath(path);
9548 int res = make_request(req, perms);
9557 path.push_front_dentry(dn->name);
9558 in = dn->dir->parent_inode;
9561 dir += path.get_path();
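// Note: _getcwd() above rebuilds the path by walking parent dentries from cwd
// toward root; when a parent link is not in the local cache it issues a
// CEPH_MDS_OP_LOOKUPNAME request to recover the dentry name before continuing.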
9564 void Client::getcwd(string& dir, const UserPerm& perms)
9566 Mutex::Locker l(client_lock);
9568 _getcwd(dir, perms);
9571 int Client::statfs(const char *path, struct statvfs *stbuf,
9572 const UserPerm& perms)
9574 Mutex::Locker l(client_lock);
9575 tout(cct) << "statfs" << std::endl;
9583 const vector<int64_t> &data_pools = mdsmap->get_data_pools();
9584 if (data_pools.size() == 1) {
9585 objecter->get_fs_stats(stats, data_pools[0], &cond);
9587 objecter->get_fs_stats(stats, boost::optional<int64_t>(), &cond);
9590 client_lock.Unlock();
9591 int rval = cond.wait();
9595 ldout(cct, 1) << "underlying call to statfs returned error: "
9596 << cpp_strerror(rval)
9601 memset(stbuf, 0, sizeof(*stbuf));
9604 * we're going to set a block size of 4MB so we can represent larger
9605 * FSes without overflowing. Additionally convert the space
9606 * measurements from KB to bytes while making them in terms of
9607 * blocks. We use 4MB only because it is big enough, and because it
9608 * actually *is* the (ceph) default block size.
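// Worked example of the conversion used below: the RADOS statistics are in
// KiB, so shifting right by (CEPH_BLOCK_SHIFT - 10) == 12 divides by 4096,
// i.e. 8388608 KiB (8 GiB) >> 12 == 2048 four-megabyte blocks.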
9610 const int CEPH_BLOCK_SHIFT = 22;
9611 stbuf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
9612 stbuf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
9613 stbuf->f_files = stats.num_objects;
9614 stbuf->f_ffree = -1;
9615 stbuf->f_favail = -1;
9616 stbuf->f_fsid = -1; // ??
9617 stbuf->f_flag = 0; // ??
9618 stbuf->f_namemax = NAME_MAX;
9620 // Usually quota_root will == root_ancestor, but if the mount root has no
9621 // quota and we can see a parent of it that does have a quota, we'll
9622 // respect that one instead.
9623 assert(root != nullptr);
9624 Inode *quota_root = root->quota.is_enable() ? root : get_quota_root(root, perms);
9626 // get_quota_root should always give us something
9627 // because client quotas are always enabled
9628 assert(quota_root != nullptr);
9630 if (quota_root && cct->_conf->client_quota_df && quota_root->quota.max_bytes) {
9632 // Skip the getattr if any sessions are stale, as we don't want to
9633 // block `df` if this client has e.g. been evicted, or if the MDS cluster
9635 if (!_any_stale_sessions()) {
9636 int r = _getattr(quota_root, 0, perms, true);
9638 // Ignore return value: error getting latest inode metadata is not a good
9639 // reason to break "df".
9640 lderr(cct) << "Error in getattr on quota root 0x"
9641 << std::hex << quota_root->ino << std::dec
9642 << " statfs result may be outdated" << dendl;
9646 // Special case: if there is a size quota set on the Inode acting
9647 // as the root for this client mount, then report the quota status
9648 // as the filesystem statistics.
9649 const fsblkcnt_t total = quota_root->quota.max_bytes >> CEPH_BLOCK_SHIFT;
9650 const fsblkcnt_t used = quota_root->rstat.rbytes >> CEPH_BLOCK_SHIFT;
9651 // It is possible for a quota to be exceeded: arithmetic here must
9652 // handle case where used > total.
9653 const fsblkcnt_t free = total > used ? total - used : 0;
9655 stbuf->f_blocks = total;
9656 stbuf->f_bfree = free;
9657 stbuf->f_bavail = free;
9659 // General case: report the cluster statistics returned from RADOS. Because
9660 // multiple pools may be used within one filesystem namespace via
9661 // layouts, this is the most correct thing we can do.
9662 stbuf->f_blocks = stats.kb >> (CEPH_BLOCK_SHIFT - 10);
9663 stbuf->f_bfree = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
9664 stbuf->f_bavail = stats.kb_avail >> (CEPH_BLOCK_SHIFT - 10);
9670 int Client::_do_filelock(Inode *in, Fh *fh, int lock_type, int op, int sleep,
9671 struct flock *fl, uint64_t owner, bool removing)
9673 ldout(cct, 10) << "_do_filelock ino " << in->ino
9674 << (lock_type == CEPH_LOCK_FCNTL ? " fcntl" : " flock")
9675 << " type " << fl->l_type << " owner " << owner
9676 << " " << fl->l_start << "~" << fl->l_len << dendl;
9679 if (F_RDLCK == fl->l_type)
9680 lock_cmd = CEPH_LOCK_SHARED;
9681 else if (F_WRLCK == fl->l_type)
9682 lock_cmd = CEPH_LOCK_EXCL;
9683 else if (F_UNLCK == fl->l_type)
9684 lock_cmd = CEPH_LOCK_UNLOCK;
9688 if (op != CEPH_MDS_OP_SETFILELOCK || lock_cmd == CEPH_LOCK_UNLOCK)
9692 * Set the most significant bit, so that the MDS knows the 'owner'
9693 * is sufficient to identify the owner of the lock. (old code uses
9694 * both 'owner' and 'pid')
9696 owner |= (1ULL << 63);
9698 MetaRequest *req = new MetaRequest(op);
9700 in->make_nosnap_relative_path(path);
9701 req->set_filepath(path);
9704 req->head.args.filelock_change.rule = lock_type;
9705 req->head.args.filelock_change.type = lock_cmd;
9706 req->head.args.filelock_change.owner = owner;
9707 req->head.args.filelock_change.pid = fl->l_pid;
9708 req->head.args.filelock_change.start = fl->l_start;
9709 req->head.args.filelock_change.length = fl->l_len;
9710 req->head.args.filelock_change.wait = sleep;
9715 if (sleep && switch_interrupt_cb) {
9717 switch_interrupt_cb(callback_handle, req->get());
9718 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
9719 // disable interrupt
9720 switch_interrupt_cb(callback_handle, NULL);
9721 if (ret == 0 && req->aborted()) {
9722 // effect of this lock request has been revoked by the 'lock intr' request
9723 ret = req->get_abort_code();
9727 ret = make_request(req, fh->actor_perms, NULL, NULL, -1, &bl);
9731 if (op == CEPH_MDS_OP_GETFILELOCK) {
9732 ceph_filelock filelock;
9733 bufferlist::iterator p = bl.begin();
9734 ::decode(filelock, p);
9736 if (CEPH_LOCK_SHARED == filelock.type)
9737 fl->l_type = F_RDLCK;
9738 else if (CEPH_LOCK_EXCL == filelock.type)
9739 fl->l_type = F_WRLCK;
9741 fl->l_type = F_UNLCK;
9743 fl->l_whence = SEEK_SET;
9744 fl->l_start = filelock.start;
9745 fl->l_len = filelock.length;
9746 fl->l_pid = filelock.pid;
9747 } else if (op == CEPH_MDS_OP_SETFILELOCK) {
9748 ceph_lock_state_t *lock_state;
9749 if (lock_type == CEPH_LOCK_FCNTL) {
9750 if (!in->fcntl_locks)
9751 in->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
9752 lock_state = in->fcntl_locks;
9753 } else if (lock_type == CEPH_LOCK_FLOCK) {
9754 if (!in->flock_locks)
9755 in->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
9756 lock_state = in->flock_locks;
9761 _update_lock_state(fl, owner, lock_state);
9764 if (lock_type == CEPH_LOCK_FCNTL) {
9765 if (!fh->fcntl_locks)
9766 fh->fcntl_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FCNTL);
9767 lock_state = fh->fcntl_locks;
9769 if (!fh->flock_locks)
9770 fh->flock_locks = new ceph_lock_state_t(cct, CEPH_LOCK_FLOCK);
9771 lock_state = fh->flock_locks;
9773 _update_lock_state(fl, owner, lock_state);
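// Note on the interruptible path above: when 'sleep' is set and a
// switch_interrupt_cb is registered, the blocking SETFILELOCK request can be
// interrupted; _interrupt_filelock() (below) then sends a second SETFILELOCK
// with the corresponding *_INTR rule and CEPH_LOCK_UNLOCK type so the MDS
// revokes the pending lock, and the original request returns its abort code.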
9781 int Client::_interrupt_filelock(MetaRequest *req)
9783 // Set abort code, but do not kick. The abort code prevents the request
9784 // from being re-sent.
9787 return 0; // haven't sent the request
9789 Inode *in = req->inode();
9792 if (req->head.args.filelock_change.rule == CEPH_LOCK_FLOCK)
9793 lock_type = CEPH_LOCK_FLOCK_INTR;
9794 else if (req->head.args.filelock_change.rule == CEPH_LOCK_FCNTL)
9795 lock_type = CEPH_LOCK_FCNTL_INTR;
9801 MetaRequest *intr_req = new MetaRequest(CEPH_MDS_OP_SETFILELOCK);
9803 in->make_nosnap_relative_path(path);
9804 intr_req->set_filepath(path);
9805 intr_req->set_inode(in);
9806 intr_req->head.args.filelock_change = req->head.args.filelock_change;
9807 intr_req->head.args.filelock_change.rule = lock_type;
9808 intr_req->head.args.filelock_change.type = CEPH_LOCK_UNLOCK;
9810 UserPerm perms(req->get_uid(), req->get_gid());
9811 return make_request(intr_req, perms, NULL, NULL, -1);
9814 void Client::_encode_filelocks(Inode *in, bufferlist& bl)
9816 if (!in->fcntl_locks && !in->flock_locks)
9819 unsigned nr_fcntl_locks = in->fcntl_locks ? in->fcntl_locks->held_locks.size() : 0;
9820 ::encode(nr_fcntl_locks, bl);
9821 if (nr_fcntl_locks) {
9822 ceph_lock_state_t* lock_state = in->fcntl_locks;
9823 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9824 p != lock_state->held_locks.end();
9826 ::encode(p->second, bl);
9829 unsigned nr_flock_locks = in->flock_locks ? in->flock_locks->held_locks.size() : 0;
9830 ::encode(nr_flock_locks, bl);
9831 if (nr_flock_locks) {
9832 ceph_lock_state_t* lock_state = in->flock_locks;
9833 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9834 p != lock_state->held_locks.end();
9836 ::encode(p->second, bl);
9839 ldout(cct, 10) << "_encode_filelocks ino " << in->ino << ", " << nr_fcntl_locks
9840 << " fcntl locks, " << nr_flock_locks << " flock locks" << dendl;
9843 void Client::_release_filelocks(Fh *fh)
9845 if (!fh->fcntl_locks && !fh->flock_locks)
9848 Inode *in = fh->inode.get();
9849 ldout(cct, 10) << "_release_filelocks " << fh << " ino " << in->ino << dendl;
9851 list<pair<int, ceph_filelock> > to_release;
9853 if (fh->fcntl_locks) {
9854 ceph_lock_state_t* lock_state = fh->fcntl_locks;
9855 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9856 p != lock_state->held_locks.end();
9858 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FCNTL, p->second));
9859 delete fh->fcntl_locks;
9861 if (fh->flock_locks) {
9862 ceph_lock_state_t* lock_state = fh->flock_locks;
9863 for(multimap<uint64_t, ceph_filelock>::iterator p = lock_state->held_locks.begin();
9864 p != lock_state->held_locks.end();
9866 to_release.push_back(pair<int, ceph_filelock>(CEPH_LOCK_FLOCK, p->second));
9867 delete fh->flock_locks;
9870 if (to_release.empty())
9874 memset(&fl, 0, sizeof(fl));
9875 fl.l_whence = SEEK_SET;
9876 fl.l_type = F_UNLCK;
9878 for (list<pair<int, ceph_filelock> >::iterator p = to_release.begin();
9879 p != to_release.end();
9881 fl.l_start = p->second.start;
9882 fl.l_len = p->second.length;
9883 fl.l_pid = p->second.pid;
9884 _do_filelock(in, fh, p->first, CEPH_MDS_OP_SETFILELOCK, 0, &fl,
9885 p->second.owner, true);
9889 void Client::_update_lock_state(struct flock *fl, uint64_t owner,
9890 ceph_lock_state_t *lock_state)
9893 if (F_RDLCK == fl->l_type)
9894 lock_cmd = CEPH_LOCK_SHARED;
9895 else if (F_WRLCK == fl->l_type)
9896 lock_cmd = CEPH_LOCK_EXCL;
9898 lock_cmd = CEPH_LOCK_UNLOCK;
9900 ceph_filelock filelock;
9901 filelock.start = fl->l_start;
9902 filelock.length = fl->l_len;
9903 filelock.client = 0;
9904 // see comment in _do_filelock()
9905 filelock.owner = owner | (1ULL << 63);
9906 filelock.pid = fl->l_pid;
9907 filelock.type = lock_cmd;
9909 if (filelock.type == CEPH_LOCK_UNLOCK) {
9910 list<ceph_filelock> activated_locks;
9911 lock_state->remove_lock(filelock, activated_locks);
9913 bool r = lock_state->add_lock(filelock, false, false, NULL);
9918 int Client::_getlk(Fh *fh, struct flock *fl, uint64_t owner)
9920 Inode *in = fh->inode.get();
9921 ldout(cct, 10) << "_getlk " << fh << " ino " << in->ino << dendl;
9922 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_GETFILELOCK, 0, fl, owner);
9926 int Client::_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
9928 Inode *in = fh->inode.get();
9929 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << dendl;
9930 int ret = _do_filelock(in, fh, CEPH_LOCK_FCNTL, CEPH_MDS_OP_SETFILELOCK, sleep, fl, owner);
9931 ldout(cct, 10) << "_setlk " << fh << " ino " << in->ino << " result=" << ret << dendl;
9935 int Client::_flock(Fh *fh, int cmd, uint64_t owner)
9937 Inode *in = fh->inode.get();
9938 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << dendl;
9940 int sleep = !(cmd & LOCK_NB);
9959 memset(&fl, 0, sizeof(fl));
9961 fl.l_whence = SEEK_SET;
9963 int ret = _do_filelock(in, fh, CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, sleep, &fl, owner);
9964 ldout(cct, 10) << "_flock " << fh << " ino " << in->ino << " result=" << ret << dendl;
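// Note: the flock built above covers the whole file (zeroed struct flock with
// l_whence = SEEK_SET), and LOCK_NB clears 'sleep', which _do_filelock()
// forwards to the MDS as the 'wait' flag so the request does not block when
// the lock cannot be granted immediately.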
9968 int Client::ll_statfs(Inode *in, struct statvfs *stbuf, const UserPerm& perms)
9970 /* Since the only thing this does is wrap a call to statfs, and
9971 statfs takes a lock, it doesn't seem we have a need to split it
9973 return statfs(0, stbuf, perms);
9976 void Client::ll_register_callbacks(struct client_callback_args *args)
9980 Mutex::Locker l(client_lock);
9981 ldout(cct, 10) << "ll_register_callbacks cb " << args->handle
9982 << " invalidate_ino_cb " << args->ino_cb
9983 << " invalidate_dentry_cb " << args->dentry_cb
9984 << " getgroups_cb " << args->getgroups_cb
9985 << " switch_interrupt_cb " << args->switch_intr_cb
9986 << " remount_cb " << args->remount_cb
9988 callback_handle = args->handle;
9990 ino_invalidate_cb = args->ino_cb;
9991 async_ino_invalidator.start();
9993 if (args->dentry_cb) {
9994 dentry_invalidate_cb = args->dentry_cb;
9995 async_dentry_invalidator.start();
9997 if (args->switch_intr_cb) {
9998 switch_interrupt_cb = args->switch_intr_cb;
9999 interrupt_finisher.start();
10001 if (args->remount_cb) {
10002 remount_cb = args->remount_cb;
10003 remount_finisher.start();
10005 getgroups_cb = args->getgroups_cb;
10006 umask_cb = args->umask_cb;
10009 int Client::test_dentry_handling(bool can_invalidate)
10013 can_invalidate_dentries = can_invalidate;
10015 if (can_invalidate_dentries) {
10016 assert(dentry_invalidate_cb);
10017 ldout(cct, 1) << "using dentry_invalidate_cb" << dendl;
10018 } else if (remount_cb) {
10019 ldout(cct, 1) << "using remount_cb" << dendl;
10020 int s = remount_cb(callback_handle);
10022 lderr(cct) << "Failed to invoke remount, needed to ensure kernel dcache consistency"
10025 if (cct->_conf->client_die_on_failed_remount) {
10026 require_remount = true;
10030 lderr(cct) << "no method to invalidate kernel dentry cache; expect issues!" << dendl;
10031 if (cct->_conf->client_die_on_failed_remount)
10037 int Client::_sync_fs()
10039 ldout(cct, 10) << "_sync_fs" << dendl;
10042 Mutex lock("Client::_fsync::lock");
10044 bool flush_done = false;
10045 if (cct->_conf->client_oc)
10046 objectcacher->flush_all(new C_SafeCond(&lock, &cond, &flush_done));
10052 ceph_tid_t flush_tid = last_flush_tid;
10054 // wait for unsafe mds requests
10055 wait_unsafe_requests();
10057 wait_sync_caps(flush_tid);
10060 client_lock.Unlock();
10062 ldout(cct, 15) << "waiting on data to flush" << dendl;
10063 while (!flush_done)
10066 client_lock.Lock();
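// Note on ordering in _sync_fs() above: queue all dirty data with the
// ObjectCacher, record last_flush_tid, wait for unsafe MDS requests and for
// cap flushes up to that tid to be acknowledged, and only then drop
// client_lock and wait for the data flush (flush_done) to complete.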
10072 int Client::sync_fs()
10074 Mutex::Locker l(client_lock);
10082 int64_t Client::drop_caches()
10084 Mutex::Locker l(client_lock);
10085 return objectcacher->release_all();
10089 int Client::lazyio_propogate(int fd, loff_t offset, size_t count)
10091 Mutex::Locker l(client_lock);
10092 ldout(cct, 3) << "op: client->lazyio_propogate(" << fd
10093 << ", " << offset << ", " << count << ")" << dendl;
10095 Fh *f = get_filehandle(fd);
10105 int Client::lazyio_synchronize(int fd, loff_t offset, size_t count)
10107 Mutex::Locker l(client_lock);
10108 ldout(cct, 3) << "op: client->lazyio_synchronize(" << fd
10109 << ", " << offset << ", " << count << ")" << dendl;
10111 Fh *f = get_filehandle(fd);
10114 Inode *in = f->inode.get();
10123 // =============================
10126 int Client::mksnap(const char *relpath, const char *name, const UserPerm& perm)
10128 Mutex::Locker l(client_lock);
10133 filepath path(relpath);
10135 int r = path_walk(path, &in, perm);
10138 if (cct->_conf->client_permissions) {
10139 r = may_create(in.get(), perm);
10143 Inode *snapdir = open_snapdir(in.get());
10144 return _mkdir(snapdir, name, 0, perm);
10147 int Client::rmsnap(const char *relpath, const char *name, const UserPerm& perms)
10149 Mutex::Locker l(client_lock);
10154 filepath path(relpath);
10156 int r = path_walk(path, &in, perms);
10159 if (cct->_conf->client_permissions) {
10160 r = may_delete(in.get(), NULL, perms);
10164 Inode *snapdir = open_snapdir(in.get());
10165 return _rmdir(snapdir, name, perms);
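// Note: mksnap()/rmsnap() above work through the virtual ".snap" directory:
// open_snapdir() (below) returns the synthetic snapdir inode, and
// _mkdir()/_rmdir() turn operations on an inode with snapid == CEPH_SNAPDIR
// into CEPH_MDS_OP_MKSNAP / CEPH_MDS_OP_RMSNAP requests.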
10168 // =============================
10171 int Client::get_caps_issued(int fd) {
10173 Mutex::Locker lock(client_lock);
10178 Fh *f = get_filehandle(fd);
10182 return f->inode->caps_issued();
10185 int Client::get_caps_issued(const char *path, const UserPerm& perms)
10187 Mutex::Locker lock(client_lock);
10194 int r = path_walk(p, &in, perms, true);
10197 return in->caps_issued();
10200 // =========================================
10203 Inode *Client::open_snapdir(Inode *diri)
10206 vinodeno_t vino(diri->ino, CEPH_SNAPDIR);
10207 if (!inode_map.count(vino)) {
10208 in = new Inode(this, vino, &diri->layout);
10210 in->ino = diri->ino;
10211 in->snapid = CEPH_SNAPDIR;
10212 in->mode = diri->mode;
10213 in->uid = diri->uid;
10214 in->gid = diri->gid;
10215 in->mtime = diri->mtime;
10216 in->ctime = diri->ctime;
10217 in->btime = diri->btime;
10218 in->size = diri->size;
10219 in->change_attr = diri->change_attr;
10221 in->dirfragtree.clear();
10222 in->snapdir_parent = diri;
10223 diri->flags |= I_SNAPDIR_OPEN;
10224 inode_map[vino] = in;
10225 if (use_faked_inos())
10226 _assign_faked_ino(in);
10227 ldout(cct, 10) << "open_snapdir created snapshot inode " << *in << dendl;
10229 in = inode_map[vino];
10230 ldout(cct, 10) << "open_snapdir had snapshot inode " << *in << dendl;
10235 int Client::ll_lookup(Inode *parent, const char *name, struct stat *attr,
10236 Inode **out, const UserPerm& perms)
10238 Mutex::Locker lock(client_lock);
10239 vinodeno_t vparent = _get_vino(parent);
10240 ldout(cct, 3) << "ll_lookup " << vparent << " " << name << dendl;
10241 tout(cct) << "ll_lookup" << std::endl;
10242 tout(cct) << name << std::endl;
10248 if (!cct->_conf->fuse_default_permissions) {
10249 r = may_lookup(parent, perms);
10254 string dname(name);
10257 r = _lookup(parent, dname, CEPH_STAT_CAP_INODE_ALL, &in, perms);
10264 fill_stat(in, attr);
10268 ldout(cct, 3) << "ll_lookup " << vparent << " " << name
10269 << " -> " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
10270 tout(cct) << attr->st_ino << std::endl;
10275 int Client::ll_lookupx(Inode *parent, const char *name, Inode **out,
10276 struct ceph_statx *stx, unsigned want, unsigned flags,
10277 const UserPerm& perms)
10279 Mutex::Locker lock(client_lock);
10280 vinodeno_t vparent = _get_vino(parent);
10281 ldout(cct, 3) << "ll_lookupx " << vparent << " " << name << dendl;
10282 tout(cct) << "ll_lookupx" << std::endl;
10283 tout(cct) << name << std::endl;
10289 if (!cct->_conf->fuse_default_permissions) {
10290 r = may_lookup(parent, perms);
10295 string dname(name);
10298 unsigned mask = statx_to_mask(flags, want);
10299 r = _lookup(parent, dname, mask, &in, perms);
10305 fill_statx(in, mask, stx);
10309 ldout(cct, 3) << "ll_lookupx " << vparent << " " << name
10310 << " -> " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
10311 tout(cct) << stx->stx_ino << std::endl;
10316 int Client::ll_walk(const char* name, Inode **out, struct ceph_statx *stx,
10317 unsigned int want, unsigned int flags, const UserPerm& perms)
10319 Mutex::Locker lock(client_lock);
10324 filepath fp(name, 0);
10327 unsigned mask = statx_to_mask(flags, want);
10329 ldout(cct, 3) << "ll_walk " << name << dendl;
10330 tout(cct) << "ll_walk" << std::endl;
10331 tout(cct) << name << std::endl;
10333 rc = path_walk(fp, &in, perms, !(flags & AT_SYMLINK_NOFOLLOW), mask);
10335 /* zero out mask, just in case... */
10342 fill_statx(in, mask, stx);
10349 void Client::_ll_get(Inode *in)
10351 if (in->ll_ref == 0) {
10353 if (in->is_dir() && !in->dn_set.empty()) {
10354 assert(in->dn_set.size() == 1); // dirs can't be hard-linked
10355 in->get_first_parent()->get(); // pin dentry
10359 ldout(cct, 20) << "_ll_get " << in << " " << in->ino << " -> " << in->ll_ref << dendl;
10362 int Client::_ll_put(Inode *in, int num)
10365 ldout(cct, 20) << "_ll_put " << in << " " << in->ino << " " << num << " -> " << in->ll_ref << dendl;
10366 if (in->ll_ref == 0) {
10367 if (in->is_dir() && !in->dn_set.empty()) {
10368 assert(in->dn_set.size() == 1); // dirs can't be hard-linked
10369 in->get_first_parent()->put(); // unpin dentry
10378 void Client::_ll_drop_pins()
10380 ldout(cct, 10) << "_ll_drop_pins" << dendl;
10381 ceph::unordered_map<vinodeno_t, Inode*>::iterator next;
10382 for (ceph::unordered_map<vinodeno_t, Inode*>::iterator it = inode_map.begin();
10383 it != inode_map.end();
10385 Inode *in = it->second;
10389 _ll_put(in, in->ll_ref);
10393 bool Client::ll_forget(Inode *in, int count)
10395 Mutex::Locker lock(client_lock);
10396 inodeno_t ino = _get_inodeno(in);
10398 ldout(cct, 3) << "ll_forget " << ino << " " << count << dendl;
10399 tout(cct) << "ll_forget" << std::endl;
10400 tout(cct) << ino.val << std::endl;
10401 tout(cct) << count << std::endl;
10403 // Ignore forget if we're no longer mounted
10407 if (ino == 1) return true; // ignore forget on root.
10410 if (in->ll_ref < count) {
10411 ldout(cct, 1) << "WARNING: ll_forget on " << ino << " " << count
10412 << ", which only has ll_ref=" << in->ll_ref << dendl;
10413 _ll_put(in, in->ll_ref);
10416 if (_ll_put(in, count) == 0)
10423 bool Client::ll_put(Inode *in)
10425 /* ll_forget already takes the lock */
10426 return ll_forget(in, 1);
10429 snapid_t Client::ll_get_snapid(Inode *in)
10431 Mutex::Locker lock(client_lock);
10435 Inode *Client::ll_get_inode(ino_t ino)
10437 Mutex::Locker lock(client_lock);
10442 vinodeno_t vino = _map_faked_ino(ino);
10443 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10444 if (p == inode_map.end())
10446 Inode *in = p->second;
10451 Inode *Client::ll_get_inode(vinodeno_t vino)
10453 Mutex::Locker lock(client_lock);
10458 unordered_map<vinodeno_t,Inode*>::iterator p = inode_map.find(vino);
10459 if (p == inode_map.end())
10461 Inode *in = p->second;
10466 int Client::_ll_getattr(Inode *in, int caps, const UserPerm& perms)
10468 vinodeno_t vino = _get_vino(in);
10470 ldout(cct, 3) << "ll_getattr " << vino << dendl;
10471 tout(cct) << "ll_getattr" << std::endl;
10472 tout(cct) << vino.ino.val << std::endl;
10474 if (vino.snapid < CEPH_NOSNAP)
10477 return _getattr(in, caps, perms);
10480 int Client::ll_getattr(Inode *in, struct stat *attr, const UserPerm& perms)
10482 Mutex::Locker lock(client_lock);
10487 int res = _ll_getattr(in, CEPH_STAT_CAP_INODE_ALL, perms);
10490 fill_stat(in, attr);
10491 ldout(cct, 3) << "ll_getattr " << _get_vino(in) << " = " << res << dendl;
10495 int Client::ll_getattrx(Inode *in, struct ceph_statx *stx, unsigned int want,
10496 unsigned int flags, const UserPerm& perms)
10498 Mutex::Locker lock(client_lock);
10504 unsigned mask = statx_to_mask(flags, want);
10506 if (mask && !in->caps_issued_mask(mask))
10507 res = _ll_getattr(in, mask, perms);
10510 fill_statx(in, mask, stx);
10511 ldout(cct, 3) << "ll_getattrx " << _get_vino(in) << " = " << res << dendl;
10515 int Client::_ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10516 const UserPerm& perms, InodeRef *inp)
10518 vinodeno_t vino = _get_vino(in);
10520 ldout(cct, 3) << "ll_setattrx " << vino << " mask " << hex << mask << dec
10522 tout(cct) << "ll_setattrx" << std::endl;
10523 tout(cct) << vino.ino.val << std::endl;
10524 tout(cct) << stx->stx_mode << std::endl;
10525 tout(cct) << stx->stx_uid << std::endl;
10526 tout(cct) << stx->stx_gid << std::endl;
10527 tout(cct) << stx->stx_size << std::endl;
10528 tout(cct) << stx->stx_mtime << std::endl;
10529 tout(cct) << stx->stx_atime << std::endl;
10530 tout(cct) << stx->stx_btime << std::endl;
10531 tout(cct) << mask << std::endl;
10533 if (!cct->_conf->fuse_default_permissions) {
10534 int res = may_setattr(in, stx, mask, perms);
10539 mask &= ~(CEPH_SETATTR_MTIME_NOW | CEPH_SETATTR_ATIME_NOW);
10541 return __setattrx(in, stx, mask, perms, inp);
10544 int Client::ll_setattrx(Inode *in, struct ceph_statx *stx, int mask,
10545 const UserPerm& perms)
10547 Mutex::Locker lock(client_lock);
10552 InodeRef target(in);
10553 int res = _ll_setattrx(in, stx, mask, perms, &target);
10555 assert(in == target.get());
10556 fill_statx(in, in->caps_issued(), stx);
10559 ldout(cct, 3) << "ll_setattrx " << _get_vino(in) << " = " << res << dendl;
10563 int Client::ll_setattr(Inode *in, struct stat *attr, int mask,
10564 const UserPerm& perms)
10566 struct ceph_statx stx;
10567 stat_to_statx(attr, &stx);
10569 Mutex::Locker lock(client_lock);
10574 InodeRef target(in);
10575 int res = _ll_setattrx(in, &stx, mask, perms, &target);
10577 assert(in == target.get());
10578 fill_stat(in, attr);
10581 ldout(cct, 3) << "ll_setattr " << _get_vino(in) << " = " << res << dendl;
10589 int Client::getxattr(const char *path, const char *name, void *value, size_t size,
10590 const UserPerm& perms)
10592 Mutex::Locker lock(client_lock);
10598 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10601 return _getxattr(in, name, value, size, perms);
10604 int Client::lgetxattr(const char *path, const char *name, void *value, size_t size,
10605 const UserPerm& perms)
10607 Mutex::Locker lock(client_lock);
10613 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10616 return _getxattr(in, name, value, size, perms);
10619 int Client::fgetxattr(int fd, const char *name, void *value, size_t size,
10620 const UserPerm& perms)
10622 Mutex::Locker lock(client_lock);
10627 Fh *f = get_filehandle(fd);
10630 return _getxattr(f->inode, name, value, size, perms);
10633 int Client::listxattr(const char *path, char *list, size_t size,
10634 const UserPerm& perms)
10636 Mutex::Locker lock(client_lock);
10642 int r = Client::path_walk(path, &in, perms, true, CEPH_STAT_CAP_XATTR);
10645 return Client::_listxattr(in.get(), list, size, perms);
10648 int Client::llistxattr(const char *path, char *list, size_t size,
10649 const UserPerm& perms)
10651 Mutex::Locker lock(client_lock);
10657 int r = Client::path_walk(path, &in, perms, false, CEPH_STAT_CAP_XATTR);
10660 return Client::_listxattr(in.get(), list, size, perms);
10663 int Client::flistxattr(int fd, char *list, size_t size, const UserPerm& perms)
10665 Mutex::Locker lock(client_lock);
10670 Fh *f = get_filehandle(fd);
10673 return Client::_listxattr(f->inode.get(), list, size, perms);
10676 int Client::removexattr(const char *path, const char *name,
10677 const UserPerm& perms)
10679 Mutex::Locker lock(client_lock);
10685 int r = Client::path_walk(path, &in, perms, true);
10688 return _removexattr(in, name, perms);
10691 int Client::lremovexattr(const char *path, const char *name,
10692 const UserPerm& perms)
10694 Mutex::Locker lock(client_lock);
10700 int r = Client::path_walk(path, &in, perms, false);
10703 return _removexattr(in, name, perms);
10706 int Client::fremovexattr(int fd, const char *name, const UserPerm& perms)
10708 Mutex::Locker lock(client_lock);
10713 Fh *f = get_filehandle(fd);
10716 return _removexattr(f->inode, name, perms);
10719 int Client::setxattr(const char *path, const char *name, const void *value,
10720 size_t size, int flags, const UserPerm& perms)
10722 _setxattr_maybe_wait_for_osdmap(name, value, size);
10724 Mutex::Locker lock(client_lock);
10730 int r = Client::path_walk(path, &in, perms, true);
10733 return _setxattr(in, name, value, size, flags, perms);
10736 int Client::lsetxattr(const char *path, const char *name, const void *value,
10737 size_t size, int flags, const UserPerm& perms)
10739 _setxattr_maybe_wait_for_osdmap(name, value, size);
10741 Mutex::Locker lock(client_lock);
10747 int r = Client::path_walk(path, &in, perms, false);
10750 return _setxattr(in, name, value, size, flags, perms);
10753 int Client::fsetxattr(int fd, const char *name, const void *value, size_t size,
10754 int flags, const UserPerm& perms)
10756 _setxattr_maybe_wait_for_osdmap(name, value, size);
10758 Mutex::Locker lock(client_lock);
10763 Fh *f = get_filehandle(fd);
10766 return _setxattr(f->inode, name, value, size, flags, perms);
10769 int Client::_getxattr(Inode *in, const char *name, void *value, size_t size,
10770 const UserPerm& perms)
10774 const VXattr *vxattr = _match_vxattr(in, name);
10778 // Do a force getattr to get the latest quota before returning
10779 // a value to userspace.
10780 r = _getattr(in, 0, perms, true);
10782 // Error from getattr!
10786 // call pointer-to-member function
10788 if (!(vxattr->exists_cb && !(this->*(vxattr->exists_cb))(in))) {
10789 r = (this->*(vxattr->getxattr_cb))(in, buf, sizeof(buf));
10795 if (r > (int)size) {
10797 } else if (r > 0) {
10798 memcpy(value, buf, r);
10804 if (acl_type == NO_ACL && !strncmp(name, "system.", 7)) {
10809 r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10813 if (in->xattrs.count(n)) {
10814 r = in->xattrs[n].length();
10815 if (r > 0 && size != 0) {
10816 if (size >= (unsigned)r)
10817 memcpy(value, in->xattrs[n].c_str(), r);
10824 ldout(cct, 3) << "_getxattr(" << in->ino << ", \"" << name << "\", " << size << ") = " << r << dendl;
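// Note: the two branches above serve different sources: "ceph.*" virtual
// xattrs are generated locally from the vxattr tables (after a forced
// _getattr() so e.g. quota values are fresh), while ordinary xattrs are read
// from in->xattrs once CEPH_STAT_CAP_XATTR metadata has been fetched.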
10828 int Client::_getxattr(InodeRef &in, const char *name, void *value, size_t size,
10829 const UserPerm& perms)
10831 if (cct->_conf->client_permissions) {
10832 int r = xattr_permission(in.get(), name, MAY_READ, perms);
10836 return _getxattr(in.get(), name, value, size, perms);
10839 int Client::ll_getxattr(Inode *in, const char *name, void *value,
10840 size_t size, const UserPerm& perms)
10842 Mutex::Locker lock(client_lock);
10847 vinodeno_t vino = _get_vino(in);
10849 ldout(cct, 3) << "ll_getxattr " << vino << " " << name << " size " << size << dendl;
10850 tout(cct) << "ll_getxattr" << std::endl;
10851 tout(cct) << vino.ino.val << std::endl;
10852 tout(cct) << name << std::endl;
10854 if (!cct->_conf->fuse_default_permissions) {
10855 int r = xattr_permission(in, name, MAY_READ, perms);
10860 return _getxattr(in, name, value, size, perms);
10863 int Client::_listxattr(Inode *in, char *name, size_t size,
10864 const UserPerm& perms)
10866 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
10868 for (map<string,bufferptr>::iterator p = in->xattrs.begin();
10869 p != in->xattrs.end();
10871 r += p->first.length() + 1;
10873 const VXattr *vxattrs = _get_vxattrs(in);
10874 r += _vxattrs_name_size(vxattrs);
10877 if (size >= (unsigned)r) {
10878 for (map<string,bufferptr>::iterator p = in->xattrs.begin();
10879 p != in->xattrs.end();
10881 memcpy(name, p->first.c_str(), p->first.length());
10882 name += p->first.length();
10887 for (int i = 0; !vxattrs[i].name.empty(); i++) {
10888 const VXattr& vxattr = vxattrs[i];
10891 // call pointer-to-member function
10892 if(vxattr.exists_cb && !(this->*(vxattr.exists_cb))(in))
10894 memcpy(name, vxattr.name.c_str(), vxattr.name.length());
10895 name += vxattr.name.length();
10904 ldout(cct, 3) << "_listxattr(" << in->ino << ", " << size << ") = " << r << dendl;
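// Illustrative usage, assuming the usual listxattr(2)-style size convention
// implemented above (path hypothetical):
//   int need = client->listxattr("/some/dir", NULL, 0, perms);  // probe size
//   std::vector<char> names(need > 0 ? need : 0);
//   if (need > 0)
//     need = client->listxattr("/some/dir", names.data(), names.size(), perms);
//   // names now holds the xattr names packed back to back, one terminating
//   // byte per name (r above counts length + 1 for each).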
10908 int Client::ll_listxattr(Inode *in, char *names, size_t size,
10909 const UserPerm& perms)
10911 Mutex::Locker lock(client_lock);
10916 vinodeno_t vino = _get_vino(in);
10918 ldout(cct, 3) << "ll_listxattr " << vino << " size " << size << dendl;
10919 tout(cct) << "ll_listxattr" << std::endl;
10920 tout(cct) << vino.ino.val << std::endl;
10921 tout(cct) << size << std::endl;
10923 return _listxattr(in, names, size, perms);
10926 int Client::_do_setxattr(Inode *in, const char *name, const void *value,
10927 size_t size, int flags, const UserPerm& perms)
10930 int xattr_flags = 0;
10932 xattr_flags |= CEPH_XATTR_REMOVE;
10933 if (flags & XATTR_CREATE)
10934 xattr_flags |= CEPH_XATTR_CREATE;
10935 if (flags & XATTR_REPLACE)
10936 xattr_flags |= CEPH_XATTR_REPLACE;
10938 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SETXATTR);
10940 in->make_nosnap_relative_path(path);
10941 req->set_filepath(path);
10942 req->set_string2(name);
10943 req->set_inode(in);
10944 req->head.args.setxattr.flags = xattr_flags;
10947 bl.append((const char*)value, size);
10950 int res = make_request(req, perms);
10953 ldout(cct, 3) << "_setxattr(" << in->ino << ", \"" << name << "\") = " <<
10958 int Client::_setxattr(Inode *in, const char *name, const void *value,
10959 size_t size, int flags, const UserPerm& perms)
10961 if (in->snapid != CEPH_NOSNAP) {
10965 bool posix_acl_xattr = false;
10966 if (acl_type == POSIX_ACL)
10967 posix_acl_xattr = !strncmp(name, "system.", 7);
10969 if (strncmp(name, "user.", 5) &&
10970 strncmp(name, "security.", 9) &&
10971 strncmp(name, "trusted.", 8) &&
10972 strncmp(name, "ceph.", 5) &&
10974 return -EOPNOTSUPP;
10976 if (posix_acl_xattr) {
10977 if (!strcmp(name, ACL_EA_ACCESS)) {
10978 mode_t new_mode = in->mode;
10980 int ret = posix_acl_equiv_mode(value, size, &new_mode);
10987 if (new_mode != in->mode) {
10988 struct ceph_statx stx;
10989 stx.stx_mode = new_mode;
10990 ret = _do_setattr(in, &stx, CEPH_SETATTR_MODE, perms, NULL);
10995 } else if (!strcmp(name, ACL_EA_DEFAULT)) {
10997 if (!S_ISDIR(in->mode))
10999 int ret = posix_acl_check(value, size);
11008 return -EOPNOTSUPP;
11011 const VXattr *vxattr = _match_vxattr(in, name);
11012 if (vxattr && vxattr->readonly)
11013 return -EOPNOTSUPP;
11016 return _do_setxattr(in, name, value, size, flags, perms);
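// Note: in _setxattr() above, writing the access ACL (ACL_EA_ACCESS) is kept
// consistent with the file mode: posix_acl_equiv_mode() computes the mode bits
// implied by the ACL, and if they differ from in->mode a CEPH_SETATTR_MODE
// setattr is issued via _do_setattr().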
11019 int Client::_setxattr(InodeRef &in, const char *name, const void *value,
11020 size_t size, int flags, const UserPerm& perms)
11022 if (cct->_conf->client_permissions) {
11023 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11027 return _setxattr(in.get(), name, value, size, flags, perms);
11030 int Client::_setxattr_check_data_pool(string& name, string& value, const OSDMap *osdmap)
11033 if (name == "layout") {
11034 string::iterator begin = value.begin();
11035 string::iterator end = value.end();
11036 keys_and_values<string::iterator> p; // create instance of parser
11037 std::map<string, string> m; // map to receive results
11038 if (!qi::parse(begin, end, p, m)) { // returns true if successful
11043 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
11044 if (q->first == "pool") {
11049 } else if (name == "layout.pool") {
11053 if (tmp.length()) {
11056 pool = boost::lexical_cast<unsigned>(tmp);
11057 if (!osdmap->have_pg_pool(pool))
11059 } catch (boost::bad_lexical_cast const&) {
11060 pool = osdmap->lookup_pg_pool_name(tmp);
11070 void Client::_setxattr_maybe_wait_for_osdmap(const char *name, const void *value, size_t size)
11072 // Setting the pool in a layout requires the MetaRequest to carry an osdmap epoch.
11073 // A newly created data pool may not be in either the client's or the MDS's osdmap yet,
11074 // so fetch the latest osdmap first so the MDS can quickly tell whether it needs a newer map.
11075 if (strcmp(name, "ceph.file.layout.pool") == 0 || strcmp(name, "ceph.dir.layout.pool") == 0 ||
11076 strcmp(name, "ceph.file.layout") == 0 || strcmp(name, "ceph.dir.layout") == 0) {
11077 string rest(strstr(name, "layout"));
11078 string v((const char*)value, size);
11079 int r = objecter->with_osdmap([&](const OSDMap& o) {
11080 return _setxattr_check_data_pool(rest, v, &o);
11083 if (r == -ENOENT) {
11085 objecter->wait_for_latest_osdmap(&ctx);
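// Illustrative only (path and pool name hypothetical): layout values use the
// same key=value form that _vxattrcb_layout() emits, e.g.
//   const char *v = "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=mydata";
//   client->setxattr("/some/dir", "ceph.dir.layout", v, strlen(v), 0, perms);
// If the named pool is absent from the client's current osdmap,
// _setxattr_check_data_pool() returns -ENOENT and we block here for a newer
// map before issuing the MDS request.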
11091 int Client::ll_setxattr(Inode *in, const char *name, const void *value,
11092 size_t size, int flags, const UserPerm& perms)
11094 _setxattr_maybe_wait_for_osdmap(name, value, size);
11096 Mutex::Locker lock(client_lock);
11101 vinodeno_t vino = _get_vino(in);
11103 ldout(cct, 3) << "ll_setxattr " << vino << " " << name << " size " << size << dendl;
11104 tout(cct) << "ll_setxattr" << std::endl;
11105 tout(cct) << vino.ino.val << std::endl;
11106 tout(cct) << name << std::endl;
11108 if (!cct->_conf->fuse_default_permissions) {
11109 int r = xattr_permission(in, name, MAY_WRITE, perms);
11113 return _setxattr(in, name, value, size, flags, perms);
11116 int Client::_removexattr(Inode *in, const char *name, const UserPerm& perms)
11118 if (in->snapid != CEPH_NOSNAP) {
11122 // same xattrs supported by kernel client
11123 if (strncmp(name, "user.", 5) &&
11124 strncmp(name, "system.", 7) &&
11125 strncmp(name, "security.", 9) &&
11126 strncmp(name, "trusted.", 8) &&
11127 strncmp(name, "ceph.", 5))
11128 return -EOPNOTSUPP;
11130 const VXattr *vxattr = _match_vxattr(in, name);
11131 if (vxattr && vxattr->readonly)
11132 return -EOPNOTSUPP;
11134 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_RMXATTR);
11136 in->make_nosnap_relative_path(path);
11137 req->set_filepath(path);
11138 req->set_filepath2(name);
11139 req->set_inode(in);
11141 int res = make_request(req, perms);
11144 ldout(cct, 3) << "_removexattr(" << in->ino << ", \"" << name << "\") = " << res << dendl;
11148 int Client::_removexattr(InodeRef &in, const char *name, const UserPerm& perms)
11150 if (cct->_conf->client_permissions) {
11151 int r = xattr_permission(in.get(), name, MAY_WRITE, perms);
11155 return _removexattr(in.get(), name, perms);
11158 int Client::ll_removexattr(Inode *in, const char *name, const UserPerm& perms)
11160 Mutex::Locker lock(client_lock);
11165 vinodeno_t vino = _get_vino(in);
11167 ldout(cct, 3) << "ll_removexattr " << vino << " " << name << dendl;
11168 tout(cct) << "ll_removexattr" << std::endl;
11169 tout(cct) << vino.ino.val << std::endl;
11170 tout(cct) << name << std::endl;
11172 if (!cct->_conf->fuse_default_permissions) {
11173 int r = xattr_permission(in, name, MAY_WRITE, perms);
11178 return _removexattr(in, name, perms);
11181 bool Client::_vxattrcb_quota_exists(Inode *in)
11183 return in->quota.is_enable();
11185 size_t Client::_vxattrcb_quota(Inode *in, char *val, size_t size)
11187 return snprintf(val, size,
11188 "max_bytes=%lld max_files=%lld",
11189 (long long int)in->quota.max_bytes,
11190 (long long int)in->quota.max_files);
11192 size_t Client::_vxattrcb_quota_max_bytes(Inode *in, char *val, size_t size)
11194 return snprintf(val, size, "%lld", (long long int)in->quota.max_bytes);
11196 size_t Client::_vxattrcb_quota_max_files(Inode *in, char *val, size_t size)
11198 return snprintf(val, size, "%lld", (long long int)in->quota.max_files);
11201 bool Client::_vxattrcb_layout_exists(Inode *in)
11203 return in->layout != file_layout_t();
11205 size_t Client::_vxattrcb_layout(Inode *in, char *val, size_t size)
11207 int r = snprintf(val, size,
11208 "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
11209 (unsigned long long)in->layout.stripe_unit,
11210 (unsigned long long)in->layout.stripe_count,
11211 (unsigned long long)in->layout.object_size);
11212 objecter->with_osdmap([&](const OSDMap& o) {
11213 if (o.have_pg_pool(in->layout.pool_id))
11214 r += snprintf(val + r, size - r, "%s",
11215 o.get_pool_name(in->layout.pool_id).c_str());
11217 r += snprintf(val + r, size - r, "%" PRIu64,
11218 (uint64_t)in->layout.pool_id);
11220 if (in->layout.pool_ns.length())
11221 r += snprintf(val + r, size - r, " pool_namespace=%s",
11222 in->layout.pool_ns.c_str());
11225 size_t Client::_vxattrcb_layout_stripe_unit(Inode *in, char *val, size_t size)
11227 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_unit);
11229 size_t Client::_vxattrcb_layout_stripe_count(Inode *in, char *val, size_t size)
11231 return snprintf(val, size, "%lld", (unsigned long long)in->layout.stripe_count);
11233 size_t Client::_vxattrcb_layout_object_size(Inode *in, char *val, size_t size)
11235 return snprintf(val, size, "%lld", (unsigned long long)in->layout.object_size);
11237 size_t Client::_vxattrcb_layout_pool(Inode *in, char *val, size_t size)
11240 objecter->with_osdmap([&](const OSDMap& o) {
11241 if (o.have_pg_pool(in->layout.pool_id))
11242 r = snprintf(val, size, "%s", o.get_pool_name(
11243 in->layout.pool_id).c_str());
11245 r = snprintf(val, size, "%" PRIu64, (uint64_t)in->layout.pool_id);
11249 size_t Client::_vxattrcb_layout_pool_namespace(Inode *in, char *val, size_t size)
11251 return snprintf(val, size, "%s", in->layout.pool_ns.c_str());
11253 size_t Client::_vxattrcb_dir_entries(Inode *in, char *val, size_t size)
11255 return snprintf(val, size, "%lld", (unsigned long long)(in->dirstat.nfiles + in->dirstat.nsubdirs));
11257 size_t Client::_vxattrcb_dir_files(Inode *in, char *val, size_t size)
11259 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nfiles);
11261 size_t Client::_vxattrcb_dir_subdirs(Inode *in, char *val, size_t size)
11263 return snprintf(val, size, "%lld", (unsigned long long)in->dirstat.nsubdirs);
11265 size_t Client::_vxattrcb_dir_rentries(Inode *in, char *val, size_t size)
11267 return snprintf(val, size, "%lld", (unsigned long long)(in->rstat.rfiles + in->rstat.rsubdirs));
11269 size_t Client::_vxattrcb_dir_rfiles(Inode *in, char *val, size_t size)
11271 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rfiles);
11273 size_t Client::_vxattrcb_dir_rsubdirs(Inode *in, char *val, size_t size)
11275 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rsubdirs);
11277 size_t Client::_vxattrcb_dir_rbytes(Inode *in, char *val, size_t size)
11279 return snprintf(val, size, "%lld", (unsigned long long)in->rstat.rbytes);
11281 size_t Client::_vxattrcb_dir_rctime(Inode *in, char *val, size_t size)
11283 return snprintf(val, size, "%ld.%09ld", (long)in->rstat.rctime.sec(),
11284 (long)in->rstat.rctime.nsec());
11287 #define CEPH_XATTR_NAME(_type, _name) "ceph." #_type "." #_name
11288 #define CEPH_XATTR_NAME2(_type, _name, _name2) "ceph." #_type "." #_name "." #_name2
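// The helper macros below stamp out VXattr table entries; for example,
// XATTR_NAME_CEPH(dir, entries) produces an entry whose name is
// "ceph.dir.entries" and whose getxattr_cb is &Client::_vxattrcb_dir_entries,
// which is how each row of the tables further down binds a "ceph.*" name to
// its callback.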
11290 #define XATTR_NAME_CEPH(_type, _name) \
11292 name: CEPH_XATTR_NAME(_type, _name), \
11293 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11298 #define XATTR_LAYOUT_FIELD(_type, _name, _field) \
11300 name: CEPH_XATTR_NAME2(_type, _name, _field), \
11301 getxattr_cb: &Client::_vxattrcb_ ## _name ## _ ## _field, \
11304 exists_cb: &Client::_vxattrcb_layout_exists, \
11306 #define XATTR_QUOTA_FIELD(_type, _name) \
11308 name: CEPH_XATTR_NAME(_type, _name), \
11309 getxattr_cb: &Client::_vxattrcb_ ## _type ## _ ## _name, \
11312 exists_cb: &Client::_vxattrcb_quota_exists, \
11315 const Client::VXattr Client::_dir_vxattrs[] = {
11317 name: "ceph.dir.layout",
11318 getxattr_cb: &Client::_vxattrcb_layout,
11321 exists_cb: &Client::_vxattrcb_layout_exists,
11323 XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
11324 XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
11325 XATTR_LAYOUT_FIELD(dir, layout, object_size),
11326 XATTR_LAYOUT_FIELD(dir, layout, pool),
11327 XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
11328 XATTR_NAME_CEPH(dir, entries),
11329 XATTR_NAME_CEPH(dir, files),
11330 XATTR_NAME_CEPH(dir, subdirs),
11331 XATTR_NAME_CEPH(dir, rentries),
11332 XATTR_NAME_CEPH(dir, rfiles),
11333 XATTR_NAME_CEPH(dir, rsubdirs),
11334 XATTR_NAME_CEPH(dir, rbytes),
11335 XATTR_NAME_CEPH(dir, rctime),
11337 name: "ceph.quota",
11338 getxattr_cb: &Client::_vxattrcb_quota,
11341 exists_cb: &Client::_vxattrcb_quota_exists,
11343 XATTR_QUOTA_FIELD(quota, max_bytes),
11344 XATTR_QUOTA_FIELD(quota, max_files),
11345 { name: "" } /* Required table terminator */
11348 const Client::VXattr Client::_file_vxattrs[] = {
11350 name: "ceph.file.layout",
11351 getxattr_cb: &Client::_vxattrcb_layout,
11354 exists_cb: &Client::_vxattrcb_layout_exists,
11356 XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
11357 XATTR_LAYOUT_FIELD(file, layout, stripe_count),
11358 XATTR_LAYOUT_FIELD(file, layout, object_size),
11359 XATTR_LAYOUT_FIELD(file, layout, pool),
11360 XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
11361 { name: "" } /* Required table terminator */
11364 const Client::VXattr *Client::_get_vxattrs(Inode *in)
11367 return _dir_vxattrs;
11368 else if (in->is_file())
11369 return _file_vxattrs;
11373 const Client::VXattr *Client::_match_vxattr(Inode *in, const char *name)
11375 if (strncmp(name, "ceph.", 5) == 0) {
11376 const VXattr *vxattr = _get_vxattrs(in);
11378 while (!vxattr->name.empty()) {
11379 if (vxattr->name == name)
11388 size_t Client::_vxattrs_calcu_name_size(const VXattr *vxattr)
11391 while (!vxattr->name.empty()) {
11392 if (!vxattr->hidden)
11393 len += vxattr->name.length() + 1;
11399 int Client::ll_readlink(Inode *in, char *buf, size_t buflen, const UserPerm& perms)
11401 Mutex::Locker lock(client_lock);
11406 vinodeno_t vino = _get_vino(in);
11408 ldout(cct, 3) << "ll_readlink " << vino << dendl;
11409 tout(cct) << "ll_readlink" << std::endl;
11410 tout(cct) << vino.ino.val << std::endl;
11412 set<Dentry*>::iterator dn = in->dn_set.begin();
11413 while (dn != in->dn_set.end()) {
11418 int r = _readlink(in, buf, buflen); // FIXME: no permission checking!
11419 ldout(cct, 3) << "ll_readlink " << vino << " = " << r << dendl;
11423 int Client::_mknod(Inode *dir, const char *name, mode_t mode, dev_t rdev,
11424 const UserPerm& perms, InodeRef *inp)
11426 ldout(cct, 3) << "_mknod(" << dir->ino << " " << name << ", 0" << oct
11427 << mode << dec << ", " << rdev << ", uid " << perms.uid()
11428 << ", gid " << perms.gid() << ")" << dendl;
11430 if (strlen(name) > NAME_MAX)
11431 return -ENAMETOOLONG;
11433 if (dir->snapid != CEPH_NOSNAP) {
11436 if (is_quota_files_exceeded(dir, perms)) {
11440 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_MKNOD);
11443 dir->make_nosnap_relative_path(path);
11444 path.push_dentry(name);
11445 req->set_filepath(path);
11446 req->set_inode(dir);
11447 req->head.args.mknod.rdev = rdev;
11448 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11449 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11451 bufferlist xattrs_bl;
11452 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
11455 req->head.args.mknod.mode = mode;
11456 if (xattrs_bl.length() > 0)
11457 req->set_data(xattrs_bl);
11460 res = get_or_create(dir, name, &de);
11463 req->set_dentry(de);
11465 res = make_request(req, perms, inp);
11469 ldout(cct, 3) << "mknod(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
11477 int Client::ll_mknod(Inode *parent, const char *name, mode_t mode,
11478 dev_t rdev, struct stat *attr, Inode **out,
11479 const UserPerm& perms)
11481 Mutex::Locker lock(client_lock);
11486 vinodeno_t vparent = _get_vino(parent);
11488 ldout(cct, 3) << "ll_mknod " << vparent << " " << name << dendl;
11489 tout(cct) << "ll_mknod" << std::endl;
11490 tout(cct) << vparent.ino.val << std::endl;
11491 tout(cct) << name << std::endl;
11492 tout(cct) << mode << std::endl;
11493 tout(cct) << rdev << std::endl;
11495 if (!cct->_conf->fuse_default_permissions) {
11496 int r = may_create(parent, perms);
11502 int r = _mknod(parent, name, mode, rdev, perms, &in);
11504 fill_stat(in, attr);
11507 tout(cct) << attr->st_ino << std::endl;
11508 ldout(cct, 3) << "ll_mknod " << vparent << " " << name
11509 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11514 int Client::ll_mknodx(Inode *parent, const char *name, mode_t mode,
11515 dev_t rdev, Inode **out,
11516 struct ceph_statx *stx, unsigned want, unsigned flags,
11517 const UserPerm& perms)
11519 unsigned caps = statx_to_mask(flags, want);
11520 Mutex::Locker lock(client_lock);
11525 vinodeno_t vparent = _get_vino(parent);
11527 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name << dendl;
11528 tout(cct) << "ll_mknodx" << std::endl;
11529 tout(cct) << vparent.ino.val << std::endl;
11530 tout(cct) << name << std::endl;
11531 tout(cct) << mode << std::endl;
11532 tout(cct) << rdev << std::endl;
11534 if (!cct->_conf->fuse_default_permissions) {
11535 int r = may_create(parent, perms);
11541 int r = _mknod(parent, name, mode, rdev, perms, &in);
11543 fill_statx(in, caps, stx);
11546 tout(cct) << stx->stx_ino << std::endl;
11547 ldout(cct, 3) << "ll_mknodx " << vparent << " " << name
11548 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11553 int Client::_create(Inode *dir, const char *name, int flags, mode_t mode,
11554 InodeRef *inp, Fh **fhp, int stripe_unit, int stripe_count,
11555 int object_size, const char *data_pool, bool *created,
11556 const UserPerm& perms)
11558 ldout(cct, 3) << "_create(" << dir->ino << " " << name << ", 0" << oct <<
11559 mode << dec << ")" << dendl;
11561 if (strlen(name) > NAME_MAX)
11562 return -ENAMETOOLONG;
11563 if (dir->snapid != CEPH_NOSNAP) {
11566 if (is_quota_files_exceeded(dir, perms)) {
11570 // use normalized flags to generate cmode
11571 int cmode = ceph_flags_to_mode(ceph_flags_sys2wire(flags));
11575 int64_t pool_id = -1;
11576 if (data_pool && *data_pool) {
11577 pool_id = objecter->with_osdmap(
11578 std::mem_fn(&OSDMap::lookup_pg_pool_name), data_pool);
11581 if (pool_id > 0xffffffffll)
11582 return -ERANGE; // bummer!
11585 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_CREATE);
11588 dir->make_nosnap_relative_path(path);
11589 path.push_dentry(name);
11590 req->set_filepath(path);
11591 req->set_inode(dir);
11592 req->head.args.open.flags = ceph_flags_sys2wire(flags | O_CREAT);
11594 req->head.args.open.stripe_unit = stripe_unit;
11595 req->head.args.open.stripe_count = stripe_count;
11596 req->head.args.open.object_size = object_size;
11597 if (cct->_conf->client_debug_getattr_caps)
11598 req->head.args.open.mask = DEBUG_GETATTR_CAPS;
11600 req->head.args.open.mask = 0;
11601 req->head.args.open.pool = pool_id;
11602 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11603 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11606 bufferlist xattrs_bl;
11607 int res = _posix_acl_create(dir, &mode, xattrs_bl, perms);
11610 req->head.args.open.mode = mode;
11611 if (xattrs_bl.length() > 0)
11612 req->set_data(xattrs_bl);
11615 res = get_or_create(dir, name, &de);
11618 req->set_dentry(de);
11620 res = make_request(req, perms, inp, created);
11625 /* If the caller passed a value in fhp, do the open */
11627 (*inp)->get_open_ref(cmode);
11628 *fhp = _create_fh(inp->get(), flags, cmode, perms);
11634 ldout(cct, 3) << "create(" << path << ", 0" << oct << mode << dec
11635 << " layout " << stripe_unit
11636 << ' ' << stripe_count
11637 << ' ' << object_size
11638 <<") = " << res << dendl;
11647 int Client::_mkdir(Inode *dir, const char *name, mode_t mode, const UserPerm& perm,
11650 ldout(cct, 3) << "_mkdir(" << dir->ino << " " << name << ", 0" << oct
11651 << mode << dec << ", uid " << perm.uid()
11652 << ", gid " << perm.gid() << ")" << dendl;
11654 if (strlen(name) > NAME_MAX)
11655 return -ENAMETOOLONG;
11657 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
11660 if (is_quota_files_exceeded(dir, perm)) {
11663 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ?
11664 CEPH_MDS_OP_MKSNAP : CEPH_MDS_OP_MKDIR);
11667 dir->make_nosnap_relative_path(path);
11668 path.push_dentry(name);
11669 req->set_filepath(path);
11670 req->set_inode(dir);
11671 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11672 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11675 bufferlist xattrs_bl;
11676 int res = _posix_acl_create(dir, &mode, xattrs_bl, perm);
11679 req->head.args.mkdir.mode = mode;
11680 if (xattrs_bl.length() > 0)
11681 req->set_data(xattrs_bl);
11684 res = get_or_create(dir, name, &de);
11687 req->set_dentry(de);
11689 ldout(cct, 10) << "_mkdir: making request" << dendl;
11690 res = make_request(req, perm, inp);
11691 ldout(cct, 10) << "_mkdir result is " << res << dendl;
11695 ldout(cct, 3) << "_mkdir(" << path << ", 0" << oct << mode << dec << ") = " << res << dendl;
11703 int Client::ll_mkdir(Inode *parent, const char *name, mode_t mode,
11704 struct stat *attr, Inode **out, const UserPerm& perm)
11706 Mutex::Locker lock(client_lock);
11711 vinodeno_t vparent = _get_vino(parent);
11713 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name << dendl;
11714 tout(cct) << "ll_mkdir" << std::endl;
11715 tout(cct) << vparent.ino.val << std::endl;
11716 tout(cct) << name << std::endl;
11717 tout(cct) << mode << std::endl;
11719 if (!cct->_conf->fuse_default_permissions) {
11720 int r = may_create(parent, perm);
11726 int r = _mkdir(parent, name, mode, perm, &in);
11728 fill_stat(in, attr);
11731 tout(cct) << attr->st_ino << std::endl;
11732 ldout(cct, 3) << "ll_mkdir " << vparent << " " << name
11733 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11738 int Client::ll_mkdirx(Inode *parent, const char *name, mode_t mode, Inode **out,
11739 struct ceph_statx *stx, unsigned want, unsigned flags,
11740 const UserPerm& perms)
11742 Mutex::Locker lock(client_lock);
11747 vinodeno_t vparent = _get_vino(parent);
11749 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name << dendl;
11750 tout(cct) << "ll_mkdirx" << std::endl;
11751 tout(cct) << vparent.ino.val << std::endl;
11752 tout(cct) << name << std::endl;
11753 tout(cct) << mode << std::endl;
11755 if (!cct->_conf->fuse_default_permissions) {
11756 int r = may_create(parent, perms);
11762 int r = _mkdir(parent, name, mode, perms, &in);
11764 fill_statx(in, statx_to_mask(flags, want), stx);
11770 tout(cct) << stx->stx_ino << std::endl;
11771 ldout(cct, 3) << "ll_mkdirx " << vparent << " " << name
11772 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11777 int Client::_symlink(Inode *dir, const char *name, const char *target,
11778 const UserPerm& perms, InodeRef *inp)
11780 ldout(cct, 3) << "_symlink(" << dir->ino << " " << name << ", " << target
11781 << ", uid " << perms.uid() << ", gid " << perms.gid() << ")"
11784 if (strlen(name) > NAME_MAX)
11785 return -ENAMETOOLONG;
11787 if (dir->snapid != CEPH_NOSNAP) {
11790 if (is_quota_files_exceeded(dir, perms)) {
11794 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_SYMLINK);
11797 dir->make_nosnap_relative_path(path);
11798 path.push_dentry(name);
11799 req->set_filepath(path);
11800 req->set_inode(dir);
11801 req->set_string2(target);
11802 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11803 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11806 int res = get_or_create(dir, name, &de);
11809 req->set_dentry(de);
11811 res = make_request(req, perms, inp);
11814 ldout(cct, 3) << "_symlink(\"" << path << "\", \"" << target << "\") = " <<
11823 int Client::ll_symlink(Inode *parent, const char *name, const char *value,
11824 struct stat *attr, Inode **out, const UserPerm& perms)
11826 Mutex::Locker lock(client_lock);
11831 vinodeno_t vparent = _get_vino(parent);
11833 ldout(cct, 3) << "ll_symlink " << vparent << " " << name << " -> " << value
11835 tout(cct) << "ll_symlink" << std::endl;
11836 tout(cct) << vparent.ino.val << std::endl;
11837 tout(cct) << name << std::endl;
11838 tout(cct) << value << std::endl;
11840 if (!cct->_conf->fuse_default_permissions) {
11841 int r = may_create(parent, perms);
11847 int r = _symlink(parent, name, value, perms, &in);
11849 fill_stat(in, attr);
11852 tout(cct) << attr->st_ino << std::endl;
11853 ldout(cct, 3) << "ll_symlink " << vparent << " " << name
11854 << " = " << r << " (" << hex << attr->st_ino << dec << ")" << dendl;
11859 int Client::ll_symlinkx(Inode *parent, const char *name, const char *value,
11860 Inode **out, struct ceph_statx *stx, unsigned want,
11861 unsigned flags, const UserPerm& perms)
11863 Mutex::Locker lock(client_lock);
11868 vinodeno_t vparent = _get_vino(parent);
11870 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name << " -> " << value
11872 tout(cct) << "ll_symlinkx" << std::endl;
11873 tout(cct) << vparent.ino.val << std::endl;
11874 tout(cct) << name << std::endl;
11875 tout(cct) << value << std::endl;
11877 if (!cct->_conf->fuse_default_permissions) {
11878 int r = may_create(parent, perms);
11884 int r = _symlink(parent, name, value, perms, &in);
11886 fill_statx(in, statx_to_mask(flags, want), stx);
11889 tout(cct) << stx->stx_ino << std::endl;
11890 ldout(cct, 3) << "ll_symlinkx " << vparent << " " << name
11891 << " = " << r << " (" << hex << stx->stx_ino << dec << ")" << dendl;
11896 int Client::_unlink(Inode *dir, const char *name, const UserPerm& perm)
11898 ldout(cct, 3) << "_unlink(" << dir->ino << " " << name
11899 << " uid " << perm.uid() << " gid " << perm.gid()
11902 if (dir->snapid != CEPH_NOSNAP) {
11906 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_UNLINK);
11909 dir->make_nosnap_relative_path(path);
11910 path.push_dentry(name);
11911 req->set_filepath(path);
11916 int res = get_or_create(dir, name, &de);
11919 req->set_dentry(de);
11920 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11921 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11923 res = _lookup(dir, name, 0, &otherin, perm);
11926 req->set_other_inode(otherin.get());
11927 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
11929 req->set_inode(dir);
11931 res = make_request(req, perm);
11934 ldout(cct, 3) << "unlink(" << path << ") = " << res << dendl;
11942 int Client::ll_unlink(Inode *in, const char *name, const UserPerm& perm)
11944 Mutex::Locker lock(client_lock);
11949 vinodeno_t vino = _get_vino(in);
11951 ldout(cct, 3) << "ll_unlink " << vino << " " << name << dendl;
11952 tout(cct) << "ll_unlink" << std::endl;
11953 tout(cct) << vino.ino.val << std::endl;
11954 tout(cct) << name << std::endl;
11956 if (!cct->_conf->fuse_default_permissions) {
11957 int r = may_delete(in, name, perm);
11961 return _unlink(in, name, perm);
11964 int Client::_rmdir(Inode *dir, const char *name, const UserPerm& perms)
11966 ldout(cct, 3) << "_rmdir(" << dir->ino << " " << name << " uid "
11967 << perms.uid() << " gid " << perms.gid() << ")" << dendl;
11969 if (dir->snapid != CEPH_NOSNAP && dir->snapid != CEPH_SNAPDIR) {
11973 MetaRequest *req = new MetaRequest(dir->snapid == CEPH_SNAPDIR ? CEPH_MDS_OP_RMSNAP:CEPH_MDS_OP_RMDIR);
11975 dir->make_nosnap_relative_path(path);
11976 path.push_dentry(name);
11977 req->set_filepath(path);
11979 req->dentry_drop = CEPH_CAP_FILE_SHARED;
11980 req->dentry_unless = CEPH_CAP_FILE_EXCL;
11981 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
11986 int res = get_or_create(dir, name, &de);
11989 res = _lookup(dir, name, 0, &in, perms);
11992 if (req->get_op() == CEPH_MDS_OP_RMDIR) {
11993 req->set_inode(dir);
11994 req->set_dentry(de);
11995 req->set_other_inode(in.get());
11997 unlink(de, true, true);
11998 req->set_other_inode(in.get());
12001 res = make_request(req, perms);
12004 ldout(cct, 3) << "rmdir(" << path << ") = " << res << dendl;
12012 int Client::ll_rmdir(Inode *in, const char *name, const UserPerm& perms)
12014 Mutex::Locker lock(client_lock);
12019 vinodeno_t vino = _get_vino(in);
12021 ldout(cct, 3) << "ll_rmdir " << vino << " " << name << dendl;
12022 tout(cct) << "ll_rmdir" << std::endl;
12023 tout(cct) << vino.ino.val << std::endl;
12024 tout(cct) << name << std::endl;
12026 if (!cct->_conf->fuse_default_permissions) {
12027 int r = may_delete(in, name, perms);
12032 return _rmdir(in, name, perms);
12035 int Client::_rename(Inode *fromdir, const char *fromname, Inode *todir, const char *toname, const UserPerm& perm)
12037 ldout(cct, 3) << "_rename(" << fromdir->ino << " " << fromname << " to "
12038 << todir->ino << " " << toname
12039 << " uid " << perm.uid() << " gid " << perm.gid() << ")"
12042 if (fromdir->snapid != todir->snapid)
12045 int op = CEPH_MDS_OP_RENAME;
12046 if (fromdir->snapid != CEPH_NOSNAP) {
12047 if (fromdir == todir && fromdir->snapid == CEPH_SNAPDIR)
12048 op = CEPH_MDS_OP_RENAMESNAP;
12052 if (fromdir != todir) {
12053 Inode *fromdir_root =
12054 fromdir->quota.is_enable() ? fromdir : get_quota_root(fromdir, perm);
12055 Inode *todir_root =
12056 todir->quota.is_enable() ? todir : get_quota_root(todir, perm);
12057 if (fromdir_root != todir_root) {
12063 MetaRequest *req = new MetaRequest(op);
12066 fromdir->make_nosnap_relative_path(from);
12067 from.push_dentry(fromname);
12069 todir->make_nosnap_relative_path(to);
12070 to.push_dentry(toname);
12071 req->set_filepath(to);
12072 req->set_filepath2(from);
12075 int res = get_or_create(fromdir, fromname, &oldde);
12079 res = get_or_create(todir, toname, &de);
12083 if (op == CEPH_MDS_OP_RENAME) {
12084 req->set_old_dentry(oldde);
12085 req->old_dentry_drop = CEPH_CAP_FILE_SHARED;
12086 req->old_dentry_unless = CEPH_CAP_FILE_EXCL;
12088 req->set_dentry(de);
12089 req->dentry_drop = CEPH_CAP_FILE_SHARED;
12090 req->dentry_unless = CEPH_CAP_FILE_EXCL;
12092 InodeRef oldin, otherin;
12093 res = _lookup(fromdir, fromname, 0, &oldin, perm);
12096 req->set_old_inode(oldin.get());
12097 req->old_inode_drop = CEPH_CAP_LINK_SHARED;
12099 res = _lookup(todir, toname, 0, &otherin, perm);
12100 if (res != 0 && res != -ENOENT) {
12102 } else if (res == 0) {
12103 req->set_other_inode(otherin.get());
12104 req->other_inode_drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
12107 req->set_inode(todir);
12109 // renamesnap reply contains no tracedn, so we need to invalidate
12111 unlink(oldde, true, true);
12112 unlink(de, true, true);
12115 res = make_request(req, perm, &target);
12116 ldout(cct, 10) << "rename result is " << res << dendl;
12118 // renamed item from our cache
12121 ldout(cct, 3) << "_rename(" << from << ", " << to << ") = " << res << dendl;
12129 int Client::ll_rename(Inode *parent, const char *name, Inode *newparent,
12130 const char *newname, const UserPerm& perm)
12132 Mutex::Locker lock(client_lock);
12137 vinodeno_t vparent = _get_vino(parent);
12138 vinodeno_t vnewparent = _get_vino(newparent);
12140 ldout(cct, 3) << "ll_rename " << vparent << " " << name << " to "
12141 << vnewparent << " " << newname << dendl;
12142 tout(cct) << "ll_rename" << std::endl;
12143 tout(cct) << vparent.ino.val << std::endl;
12144 tout(cct) << name << std::endl;
12145 tout(cct) << vnewparent.ino.val << std::endl;
12146 tout(cct) << newname << std::endl;
12148 if (!cct->_conf->fuse_default_permissions) {
12149 int r = may_delete(parent, name, perm);
12152 r = may_delete(newparent, newname, perm);
12153 if (r < 0 && r != -ENOENT)
12157 return _rename(parent, name, newparent, newname, perm);
12160 int Client::_link(Inode *in, Inode *dir, const char *newname, const UserPerm& perm, InodeRef *inp)
12162 ldout(cct, 3) << "_link(" << in->ino << " to " << dir->ino << " " << newname
12163 << " uid " << perm.uid() << " gid " << perm.gid() << ")" << dendl;
12165 if (strlen(newname) > NAME_MAX)
12166 return -ENAMETOOLONG;
12168 if (in->snapid != CEPH_NOSNAP || dir->snapid != CEPH_NOSNAP) {
12171 if (is_quota_files_exceeded(dir, perm)) {
12175 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LINK);
12177 filepath path(newname, dir->ino);
12178 req->set_filepath(path);
12179 filepath existing(in->ino);
12180 req->set_filepath2(existing);
12182 req->set_inode(dir);
12183 req->inode_drop = CEPH_CAP_FILE_SHARED;
12184 req->inode_unless = CEPH_CAP_FILE_EXCL;
12187 int res = get_or_create(dir, newname, &de);
12190 req->set_dentry(de);
12192 res = make_request(req, perm, inp);
12193 ldout(cct, 10) << "link result is " << res << dendl;
12196 ldout(cct, 3) << "link(" << existing << ", " << path << ") = " << res << dendl;
12204 int Client::ll_link(Inode *in, Inode *newparent, const char *newname,
12205 const UserPerm& perm)
12207 Mutex::Locker lock(client_lock);
12212 vinodeno_t vino = _get_vino(in);
12213 vinodeno_t vnewparent = _get_vino(newparent);
12215 ldout(cct, 3) << "ll_link " << vino << " to " << vnewparent << " " <<
12217 tout(cct) << "ll_link" << std::endl;
12218 tout(cct) << vino.ino.val << std::endl;
12219 tout(cct) << vnewparent << std::endl;
12220 tout(cct) << newname << std::endl;
12225 if (!cct->_conf->fuse_default_permissions) {
12226 if (S_ISDIR(in->mode))
12229 r = may_hardlink(in, perm);
12233 r = may_create(newparent, perm);
12238 return _link(in, newparent, newname, perm, &target);
12241 int Client::ll_num_osds(void)
12243 Mutex::Locker lock(client_lock);
12244 return objecter->with_osdmap(std::mem_fn(&OSDMap::get_num_osds));
12247 int Client::ll_osdaddr(int osd, uint32_t *addr)
12249 Mutex::Locker lock(client_lock);
12252 bool exists = objecter->with_osdmap([&](const OSDMap& o) {
12253 if (!o.exists(osd))
12255 g = o.get_addr(osd);
12260 uint32_t nb_addr = (g.in4_addr()).sin_addr.s_addr;
12261 *addr = ntohl(nb_addr);
12265 uint32_t Client::ll_stripe_unit(Inode *in)
12267 Mutex::Locker lock(client_lock);
12268 return in->layout.stripe_unit;
12271 uint64_t Client::ll_snap_seq(Inode *in)
12273 Mutex::Locker lock(client_lock);
12274 return in->snaprealm->seq;
12277 int Client::ll_file_layout(Inode *in, file_layout_t *layout)
12279 Mutex::Locker lock(client_lock);
12280 *layout = in->layout;
12284 int Client::ll_file_layout(Fh *fh, file_layout_t *layout)
12286 return ll_file_layout(fh->inode.get(), layout);
12289 /* Currently we cannot take advantage of redundancy in reads, since we
12290 would have to go through all possible placement groups (a
12291 potentially quite large number determined by a hash), and use CRUSH
12292 to calculate the appropriate set of OSDs for each placement group,
12293 then index into that. An array with one entry per OSD is much more
12294 tractable and works for demonstration purposes. */
12296 int Client::ll_get_stripe_osd(Inode *in, uint64_t blockno,
12297 file_layout_t* layout)
12299 Mutex::Locker lock(client_lock);
12301 inodeno_t ino = ll_get_inodeno(in);
12302 uint32_t object_size = layout->object_size;
12303 uint32_t su = layout->stripe_unit;
12304 uint32_t stripe_count = layout->stripe_count;
12305 uint64_t stripes_per_object = object_size / su;
12307 uint64_t stripeno = blockno / stripe_count; // which horizontal stripe (Y)
12308 uint64_t stripepos = blockno % stripe_count; // which object in the object set (X)
12309 uint64_t objectsetno = stripeno / stripes_per_object; // which object set
12310 uint64_t objectno = objectsetno * stripe_count + stripepos; // object id
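  /* A worked example of the striping arithmetic above (numbers assumed purely
   * for illustration): with object_size = 4 MiB, stripe_unit = 1 MiB and
   * stripe_count = 4, stripes_per_object = 4.  For blockno = 11:
   *   stripeno    = 11 / 4 = 2   (third horizontal stripe)
   *   stripepos   = 11 % 4 = 3   (fourth object within the object set)
   *   objectsetno = 2 / 4  = 0   (first object set)
   *   objectno    = 0 * 4 + 3 = 3
   * so block 11 maps to object 3 of the file, which the OSDMap lookup below
   * resolves to its placement group and acting OSD set. */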
12312 object_t oid = file_object_t(ino, objectno);
12313 return objecter->with_osdmap([&](const OSDMap& o) {
12314 ceph_object_layout olayout =
12315 o.file_to_object_layout(oid, *layout);
12316 pg_t pg = (pg_t)olayout.ol_pgid;
12319 o.pg_to_acting_osds(pg, &osds, &primary);
12324 /* Return the offset of the block, internal to the object */
12326 uint64_t Client::ll_get_internal_offset(Inode *in, uint64_t blockno)
12328 Mutex::Locker lock(client_lock);
12329 file_layout_t *layout=&(in->layout);
12330 uint32_t object_size = layout->object_size;
12331 uint32_t su = layout->stripe_unit;
12332 uint64_t stripes_per_object = object_size / su;
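  /* Continuing the example used for ll_get_stripe_osd (assumed values:
   * su = 1 MiB, stripes_per_object = 4), block 11 starts
   * (11 % 4) * 1 MiB = 3 MiB into its object. */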
12334 return (blockno % stripes_per_object) * su;
12338 const UserPerm& perms)
12340 Mutex::Locker lock(client_lock);
12345 vinodeno_t vino = _get_vino(in);
12347 ldout(cct, 3) << "ll_opendir " << vino << dendl;
12348 tout(cct) << "ll_opendir" << std::endl;
12349 tout(cct) << vino.ino.val << std::endl;
12351 if (!cct->_conf->fuse_default_permissions) {
12352 int r = may_open(in, flags, perms);
12357 int r = _opendir(in, dirpp, perms);
12358 tout(cct) << (unsigned long)*dirpp << std::endl;
12360 ldout(cct, 3) << "ll_opendir " << vino << " = " << r << " (" << *dirpp << ")"
12365 int Client::ll_releasedir(dir_result_t *dirp)
12367 Mutex::Locker lock(client_lock);
12368 ldout(cct, 3) << "ll_releasedir " << dirp << dendl;
12369 tout(cct) << "ll_releasedir" << std::endl;
12370 tout(cct) << (unsigned long)dirp << std::endl;
12379 int Client::ll_fsyncdir(dir_result_t *dirp)
12381 Mutex::Locker lock(client_lock);
12382 ldout(cct, 3) << "ll_fsyncdir " << dirp << dendl;
12383 tout(cct) << "ll_fsyncdir" << std::endl;
12384 tout(cct) << (unsigned long)dirp << std::endl;
12389 return _fsync(dirp->inode.get(), false);
12392 int Client::ll_open(Inode *in, int flags, Fh **fhp, const UserPerm& perms)
12394 assert(!(flags & O_CREAT));
12396 Mutex::Locker lock(client_lock);
12401 vinodeno_t vino = _get_vino(in);
12403 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) << dendl;
12404 tout(cct) << "ll_open" << std::endl;
12405 tout(cct) << vino.ino.val << std::endl;
12406 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12409 if (!cct->_conf->fuse_default_permissions) {
12410 r = may_open(in, flags, perms);
12415 r = _open(in, flags, 0, fhp /* may be NULL */, perms);
12418 Fh *fhptr = fhp ? *fhp : NULL;
12420 ll_unclosed_fh_set.insert(fhptr);
12422 tout(cct) << (unsigned long)fhptr << std::endl;
12423 ldout(cct, 3) << "ll_open " << vino << " " << ceph_flags_sys2wire(flags) <<
12424 " = " << r << " (" << fhptr << ")" << dendl;
12428 int Client::_ll_create(Inode *parent, const char *name, mode_t mode,
12429 int flags, InodeRef *in, int caps, Fh **fhp,
12430 const UserPerm& perms)
12434 vinodeno_t vparent = _get_vino(parent);
12436 ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
12437 mode << dec << " " << ceph_flags_sys2wire(flags) << ", uid " << perms.uid()
12438 << ", gid " << perms.gid() << dendl;
12439 tout(cct) << "ll_create" << std::endl;
12440 tout(cct) << vparent.ino.val << std::endl;
12441 tout(cct) << name << std::endl;
12442 tout(cct) << mode << std::endl;
12443 tout(cct) << ceph_flags_sys2wire(flags) << std::endl;
12445 bool created = false;
12446 int r = _lookup(parent, name, caps, in, perms);
12448 if (r == 0 && (flags & O_CREAT) && (flags & O_EXCL))
12451 if (r == -ENOENT && (flags & O_CREAT)) {
12452 if (!cct->_conf->fuse_default_permissions) {
12453 r = may_create(parent, perms);
12457 r = _create(parent, name, flags, mode, in, fhp, 0, 0, 0, NULL, &created,
12468 ldout(cct, 20) << "_ll_create created = " << created << dendl;
12470 if (!cct->_conf->fuse_default_permissions) {
12471 r = may_open(in->get(), flags, perms);
12474 int release_r = _release_fh(*fhp);
12475 assert(release_r == 0); // during create, no async data ops should have happened
12480 if (*fhp == NULL) {
12481 r = _open(in->get(), flags, mode, fhp, perms);
12489 ll_unclosed_fh_set.insert(*fhp);
12494 Inode *inode = in->get();
12495 if (use_faked_inos())
12496 ino = inode->faked_ino;
12501 tout(cct) << (unsigned long)*fhp << std::endl;
12502 tout(cct) << ino << std::endl;
12503 ldout(cct, 3) << "_ll_create " << vparent << " " << name << " 0" << oct <<
12504 mode << dec << " " << ceph_flags_sys2wire(flags) << " = " << r << " (" <<
12505 *fhp << " " << hex << ino << dec << ")" << dendl;
12510 int Client::ll_create(Inode *parent, const char *name, mode_t mode,
12511 int flags, struct stat *attr, Inode **outp, Fh **fhp,
12512 const UserPerm& perms)
12514 Mutex::Locker lock(client_lock);
12520 int r = _ll_create(parent, name, mode, flags, &in, CEPH_STAT_CAP_INODE_ALL,
12525 // passing an Inode in outp requires an additional ref
12530 fill_stat(in, attr);
12538 int Client::ll_createx(Inode *parent, const char *name, mode_t mode,
12539 int oflags, Inode **outp, Fh **fhp,
12540 struct ceph_statx *stx, unsigned want, unsigned lflags,
12541 const UserPerm& perms)
12543 unsigned caps = statx_to_mask(lflags, want);
12544 Mutex::Locker lock(client_lock);
12550 int r = _ll_create(parent, name, mode, oflags, &in, caps, fhp, perms);
12554 // passing an Inode in outp requires an additional ref
12559 fill_statx(in, caps, stx);
12568 loff_t Client::ll_lseek(Fh *fh, loff_t offset, int whence)
12570 Mutex::Locker lock(client_lock);
12571 tout(cct) << "ll_lseek" << std::endl;
12572 tout(cct) << offset << std::endl;
12573 tout(cct) << whence << std::endl;
12578 return _lseek(fh, offset, whence);
12581 int Client::ll_read(Fh *fh, loff_t off, loff_t len, bufferlist *bl)
12583 Mutex::Locker lock(client_lock);
12584 ldout(cct, 3) << "ll_read " << fh << " " << fh->inode->ino << " " << off << "~" << len << dendl;
12585 tout(cct) << "ll_read" << std::endl;
12586 tout(cct) << (unsigned long)fh << std::endl;
12587 tout(cct) << off << std::endl;
12588 tout(cct) << len << std::endl;
12593 return _read(fh, off, len, bl);
12596 int Client::ll_read_block(Inode *in, uint64_t blockid,
12600 file_layout_t* layout)
12602 Mutex::Locker lock(client_lock);
12607 vinodeno_t vino = ll_get_vino(in);
12608 object_t oid = file_object_t(vino.ino, blockid);
12609 C_SaferCond onfinish;
12612 objecter->read(oid,
12613 object_locator_t(layout->pool_id),
12618 CEPH_OSD_FLAG_READ,
12621 client_lock.Unlock();
12622 int r = onfinish.wait();
12623 client_lock.Lock();
12626 bl.copy(0, bl.length(), buf);
12633 /* It appears that the OSD doesn't return success unless the entire
12634    buffer was written; return the write length on success. */
12636 int Client::ll_write_block(Inode *in, uint64_t blockid,
12637 char* buf, uint64_t offset,
12638 uint64_t length, file_layout_t* layout,
12639 uint64_t snapseq, uint32_t sync)
12641 Mutex flock("Client::ll_write_block flock");
12642 vinodeno_t vino = ll_get_vino(in);
12646 Context *onsafe = nullptr;
12651 if (true || sync) {
12652 /* if write is stable, the epilogue is waiting on
12654 onsafe = new C_SafeCond(&flock, &cond, &done, &r);
12657 /* if write is unstable, we just place a barrier for
12658 * future commits to wait on */
12659 /*onsafe = new C_Block_Sync(this, vino.ino,
12660 barrier_interval(offset, offset + length), &r);
12664 object_t oid = file_object_t(vino.ino, blockid);
12665 SnapContext fakesnap;
12667 if (length > 0) bp = buffer::copy(buf, length);
12671 ldout(cct, 1) << "ll_write_block for " << vino.ino << "." << blockid
12674 fakesnap.seq = snapseq;
12676 /* lock just in time */
12677 client_lock.Lock();
12679 client_lock.Unlock();
12684 objecter->write(oid,
12685 object_locator_t(layout->pool_id),
12690 ceph::real_clock::now(),
12694 client_lock.Unlock();
12695 if (!done /* also !sync */) {
12709 int Client::ll_commit_blocks(Inode *in,
12713 Mutex::Locker lock(client_lock);
12715 BarrierContext *bctx;
12716 vinodeno_t vino = ll_get_vino(in);
12717 uint64_t ino = vino.ino;
12719 ldout(cct, 1) << "ll_commit_blocks for " << vino.ino << " from "
12720 << offset << " to " << length << dendl;
12726 map<uint64_t, BarrierContext*>::iterator p = barriers.find(ino);
12727 if (p != barriers.end()) {
12728 barrier_interval civ(offset, offset + length);
12729 p->second->commit_barrier(civ);
12735 int Client::ll_write(Fh *fh, loff_t off, loff_t len, const char *data)
12737 Mutex::Locker lock(client_lock);
12738 ldout(cct, 3) << "ll_write " << fh << " " << fh->inode->ino << " " << off <<
12739 "~" << len << dendl;
12740 tout(cct) << "ll_write" << std::endl;
12741 tout(cct) << (unsigned long)fh << std::endl;
12742 tout(cct) << off << std::endl;
12743 tout(cct) << len << std::endl;
12748 int r = _write(fh, off, len, data, NULL, 0);
12749 ldout(cct, 3) << "ll_write " << fh << " " << off << "~" << len << " = " << r
12754 int Client::ll_flush(Fh *fh)
12756 Mutex::Locker lock(client_lock);
12757 ldout(cct, 3) << "ll_flush " << fh << " " << fh->inode->ino << " " << dendl;
12758 tout(cct) << "ll_flush" << std::endl;
12759 tout(cct) << (unsigned long)fh << std::endl;
12767 int Client::ll_fsync(Fh *fh, bool syncdataonly)
12769 Mutex::Locker lock(client_lock);
12770 ldout(cct, 3) << "ll_fsync " << fh << " " << fh->inode->ino << " " << dendl;
12771 tout(cct) << "ll_fsync" << std::endl;
12772 tout(cct) << (unsigned long)fh << std::endl;
12777 int r = _fsync(fh, syncdataonly);
12779 // If we're returning an error, clear it from the FH
12780 fh->take_async_err();
12785 #ifdef FALLOC_FL_PUNCH_HOLE
12787 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
12789 if (offset < 0 || length <= 0)
12792 if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
12793 return -EOPNOTSUPP;
12795 if ((mode & FALLOC_FL_PUNCH_HOLE) && !(mode & FALLOC_FL_KEEP_SIZE))
12796 return -EOPNOTSUPP;
12798 Inode *in = fh->inode.get();
12800 if (objecter->osdmap_pool_full(in->layout.pool_id) &&
12801 !(mode & FALLOC_FL_PUNCH_HOLE)) {
12805 if (in->snapid != CEPH_NOSNAP)
12808 if ((fh->mode & CEPH_FILE_MODE_WR) == 0)
12811 uint64_t size = offset + length;
12812 if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE)) &&
12814 is_quota_bytes_exceeded(in, size - in->size, fh->actor_perms)) {
12819 int r = get_caps(in, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, &have, -1);
12823 Mutex uninline_flock("Client::_fallocate_uninline_data flock");
12824 Cond uninline_cond;
12825 bool uninline_done = false;
12826 int uninline_ret = 0;
12827 Context *onuninline = NULL;
12829 if (mode & FALLOC_FL_PUNCH_HOLE) {
12830 if (in->inline_version < CEPH_INLINE_NONE &&
12831 (have & CEPH_CAP_FILE_BUFFER)) {
12833 int len = in->inline_data.length();
12834 if (offset < len) {
12836 in->inline_data.copy(0, offset, bl);
12838 if (offset + size > len)
12839 size = len - offset;
12841 bl.append_zero(size);
12842 if (offset + size < len)
12843 in->inline_data.copy(offset + size, len - offset - size, bl);
12844 in->inline_data = bl;
12845 in->inline_version++;
12847 in->mtime = ceph_clock_now();
12849 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
12851 if (in->inline_version < CEPH_INLINE_NONE) {
12852 onuninline = new C_SafeCond(&uninline_flock,
12856 uninline_data(in, onuninline);
12859 Mutex flock("Client::_punch_hole flock");
12862 Context *onfinish = new C_SafeCond(&flock, &cond, &done);
12864 unsafe_sync_write++;
12865 get_cap_ref(in, CEPH_CAP_FILE_BUFFER);
12867 _invalidate_inode_cache(in, offset, length);
12868 filer->zero(in->ino, &in->layout,
12869 in->snaprealm->get_snap_context(),
12871 ceph::real_clock::now(),
12872 0, true, onfinish);
12873 in->mtime = ceph_clock_now();
12875 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
12877 client_lock.Unlock();
12882 client_lock.Lock();
12883 _sync_write_commit(in);
12885 } else if (!(mode & FALLOC_FL_KEEP_SIZE)) {
12886 uint64_t size = offset + length;
12887 if (size > in->size) {
12889 in->mtime = ceph_clock_now();
12891 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
12893 if (is_quota_bytes_approaching(in, fh->actor_perms)) {
12894 check_caps(in, CHECK_CAPS_NODELAY);
12895 } else if (is_max_size_approaching(in)) {
12902 client_lock.Unlock();
12903 uninline_flock.Lock();
12904 while (!uninline_done)
12905 uninline_cond.Wait(uninline_flock);
12906 uninline_flock.Unlock();
12907 client_lock.Lock();
12909 if (uninline_ret >= 0 || uninline_ret == -ECANCELED) {
12910 in->inline_data.clear();
12911 in->inline_version = CEPH_INLINE_NONE;
12912 mark_caps_dirty(in, CEPH_CAP_FILE_WR);
12918 put_cap_ref(in, CEPH_CAP_FILE_WR);
12923 int Client::_fallocate(Fh *fh, int mode, int64_t offset, int64_t length)
12925 return -EOPNOTSUPP;
12931 int Client::ll_fallocate(Fh *fh, int mode, loff_t offset, loff_t length)
12933 Mutex::Locker lock(client_lock);
12934 ldout(cct, 3) << "ll_fallocate " << fh << " " << fh->inode->ino << " " << dendl;
12935 tout(cct) << "ll_fallocate " << mode << " " << offset << " " << length << std::endl;
12936 tout(cct) << (unsigned long)fh << std::endl;
12941 return _fallocate(fh, mode, offset, length);
12944 int Client::fallocate(int fd, int mode, loff_t offset, loff_t length)
12946 Mutex::Locker lock(client_lock);
12947 tout(cct) << "fallocate " << " " << fd << mode << " " << offset << " " << length << std::endl;
12952 Fh *fh = get_filehandle(fd);
12955 #if defined(__linux__) && defined(O_PATH)
12956 if (fh->flags & O_PATH)
12959 return _fallocate(fh, mode, offset, length);
12962 int Client::ll_release(Fh *fh)
12964 Mutex::Locker lock(client_lock);
12965 ldout(cct, 3) << "ll_release (fh)" << fh << " " << fh->inode->ino << " " <<
12967 tout(cct) << "ll_release (fh)" << std::endl;
12968 tout(cct) << (unsigned long)fh << std::endl;
12973 if (ll_unclosed_fh_set.count(fh))
12974 ll_unclosed_fh_set.erase(fh);
12975 return _release_fh(fh);
12978 int Client::ll_getlk(Fh *fh, struct flock *fl, uint64_t owner)
12980 Mutex::Locker lock(client_lock);
12982 ldout(cct, 3) << "ll_getlk (fh)" << fh << " " << fh->inode->ino << dendl;
12983 tout(cct) << "ll_getk (fh)" << (unsigned long)fh << std::endl;
12988 return _getlk(fh, fl, owner);
12991 int Client::ll_setlk(Fh *fh, struct flock *fl, uint64_t owner, int sleep)
12993 Mutex::Locker lock(client_lock);
12995 ldout(cct, 3) << "ll_setlk (fh) " << fh << " " << fh->inode->ino << dendl;
12996 tout(cct) << "ll_setk (fh)" << (unsigned long)fh << std::endl;
13001 return _setlk(fh, fl, owner, sleep);
13004 int Client::ll_flock(Fh *fh, int cmd, uint64_t owner)
13006 Mutex::Locker lock(client_lock);
13008 ldout(cct, 3) << "ll_flock (fh) " << fh << " " << fh->inode->ino << dendl;
13009 tout(cct) << "ll_flock (fh)" << (unsigned long)fh << std::endl;
13014 return _flock(fh, cmd, owner);
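/*
 * Sketch of the interrupt path, as suggested by the code below: ll_interrupt()
 * itself does not take client_lock; it only queues a C_Client_RequestInterrupt
 * on interrupt_finisher.  The finisher context then takes client_lock, calls
 * _interrupt_filelock() on the pending CEPH_MDS_OP_SETFILELOCK request and
 * drops the request reference.
 */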
13017 class C_Client_RequestInterrupt : public Context {
13022 C_Client_RequestInterrupt(Client *c, MetaRequest *r) : client(c), req(r) {
13025 void finish(int r) override {
13026 Mutex::Locker l(client->client_lock);
13027 assert(req->head.op == CEPH_MDS_OP_SETFILELOCK);
13028 client->_interrupt_filelock(req);
13029 client->put_request(req);
13033 void Client::ll_interrupt(void *d)
13035 MetaRequest *req = static_cast<MetaRequest*>(d);
13036 ldout(cct, 3) << "ll_interrupt tid " << req->get_tid() << dendl;
13037 tout(cct) << "ll_interrupt tid " << req->get_tid() << std::endl;
13038 interrupt_finisher.queue(new C_Client_RequestInterrupt(this, req));
13041 // =========================================
13044 // expose file layouts
13046 int Client::describe_layout(const char *relpath, file_layout_t *lp,
13047 const UserPerm& perms)
13049 Mutex::Locker lock(client_lock);
13054 filepath path(relpath);
13056 int r = path_walk(path, &in, perms);
13062 ldout(cct, 3) << "describe_layout(" << relpath << ") = 0" << dendl;
13066 int Client::fdescribe_layout(int fd, file_layout_t *lp)
13068 Mutex::Locker lock(client_lock);
13073 Fh *f = get_filehandle(fd);
13076 Inode *in = f->inode.get();
13080 ldout(cct, 3) << "fdescribe_layout(" << fd << ") = 0" << dendl;
13084 int64_t Client::get_default_pool_id()
13086 Mutex::Locker lock(client_lock);
13091 /* first data pool is the default */
13092 return mdsmap->get_first_data_pool();
13097 int64_t Client::get_pool_id(const char *pool_name)
13099 Mutex::Locker lock(client_lock);
13104 return objecter->with_osdmap(std::mem_fn(&OSDMap::lookup_pg_pool_name),
13108 string Client::get_pool_name(int64_t pool)
13110 Mutex::Locker lock(client_lock);
13115 return objecter->with_osdmap([pool](const OSDMap& o) {
13116 return o.have_pg_pool(pool) ? o.get_pool_name(pool) : string();
13120 int Client::get_pool_replication(int64_t pool)
13122 Mutex::Locker lock(client_lock);
13127 return objecter->with_osdmap([pool](const OSDMap& o) {
13128 return o.have_pg_pool(pool) ? o.get_pg_pool(pool)->get_size() : -ENOENT;
13132 int Client::get_file_extent_osds(int fd, loff_t off, loff_t *len, vector<int>& osds)
13134 Mutex::Locker lock(client_lock);
13139 Fh *f = get_filehandle(fd);
13142 Inode *in = f->inode.get();
13144 vector<ObjectExtent> extents;
13145 Striper::file_to_extents(cct, in->ino, &in->layout, off, 1, in->truncate_size, extents);
13146 assert(extents.size() == 1);
13148 objecter->with_osdmap([&](const OSDMap& o) {
13149 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13150 o.pg_to_acting_osds(pg, osds);
13157 * Return the remainder of the extent (stripe unit)
13159 * If length = 1 is passed to Striper::file_to_extents we get a single
13160 * extent back, but its length is one so we still need to compute the length
13161 * to the end of the stripe unit.
13163 * If length = su then we may get 1 or 2 objects back in the extents vector
13164 * which would have to be examined. Even then, the offsets are local to the
13165 * object, so matching up to the file offset is extra work.
13167 * It seems simpler to stick with length = 1 and manually compute the
13171 uint64_t su = in->layout.stripe_unit;
13172 *len = su - (off % su);
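  /* For example (values assumed for illustration), with stripe_unit = 4 MiB
   * and off = 10 MiB, off % su = 2 MiB, so *len = 2 MiB: the extent runs from
   * the given offset to the end of the current stripe unit. */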
13178 int Client::get_osd_crush_location(int id, vector<pair<string, string> >& path)
13180 Mutex::Locker lock(client_lock);
13187 return objecter->with_osdmap([&](const OSDMap& o) {
13188 return o.crush->get_full_location_ordered(id, path);
13192 int Client::get_file_stripe_address(int fd, loff_t offset,
13193 vector<entity_addr_t>& address)
13195 Mutex::Locker lock(client_lock);
13200 Fh *f = get_filehandle(fd);
13203 Inode *in = f->inode.get();
13206 vector<ObjectExtent> extents;
13207 Striper::file_to_extents(cct, in->ino, &in->layout, offset, 1,
13208 in->truncate_size, extents);
13209 assert(extents.size() == 1);
13211 // now we have the object and its 'layout'
13212 return objecter->with_osdmap([&](const OSDMap& o) {
13213 pg_t pg = o.object_locator_to_pg(extents[0].oid, extents[0].oloc);
13215 o.pg_to_acting_osds(pg, osds);
13218 for (unsigned i = 0; i < osds.size(); i++) {
13219 entity_addr_t addr = o.get_addr(osds[i]);
13220 address.push_back(addr);
13226 int Client::get_osd_addr(int osd, entity_addr_t& addr)
13228 Mutex::Locker lock(client_lock);
13233 return objecter->with_osdmap([&](const OSDMap& o) {
13234 if (!o.exists(osd))
13237 addr = o.get_addr(osd);
13242 int Client::enumerate_layout(int fd, vector<ObjectExtent>& result,
13243 loff_t length, loff_t offset)
13245 Mutex::Locker lock(client_lock);
13250 Fh *f = get_filehandle(fd);
13253 Inode *in = f->inode.get();
13255 // map to a list of extents
13256 Striper::file_to_extents(cct, in->ino, &in->layout, offset, length, in->truncate_size, result);
13258 ldout(cct, 3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl;
13264 * find an OSD with the same IP address; return -1 if none.
13266 int Client::get_local_osd()
13268 Mutex::Locker lock(client_lock);
13273 objecter->with_osdmap([this](const OSDMap& o) {
13274 if (o.get_epoch() != local_osd_epoch) {
13275 local_osd = o.find_osd_on_ip(messenger->get_myaddr());
13276 local_osd_epoch = o.get_epoch();
13287 // ===============================
13289 void Client::ms_handle_connect(Connection *con)
13291 ldout(cct, 10) << "ms_handle_connect on " << con->get_peer_addr() << dendl;
13294 bool Client::ms_handle_reset(Connection *con)
13296 ldout(cct, 0) << "ms_handle_reset on " << con->get_peer_addr() << dendl;
13300 void Client::ms_handle_remote_reset(Connection *con)
13302 ldout(cct, 0) << "ms_handle_remote_reset on " << con->get_peer_addr() << dendl;
13303 Mutex::Locker l(client_lock);
13304 switch (con->get_peer_type()) {
13305 case CEPH_ENTITY_TYPE_MDS:
13307 // kludge to figure out which mds this is; fixme with a Connection* state
13308 mds_rank_t mds = MDS_RANK_NONE;
13309 MetaSession *s = NULL;
13310 for (map<mds_rank_t,MetaSession*>::iterator p = mds_sessions.begin();
13311 p != mds_sessions.end();
13313 if (mdsmap->get_addr(p->first) == con->get_peer_addr()) {
13319 assert (s != NULL);
13320 switch (s->state) {
13321 case MetaSession::STATE_CLOSING:
13322 ldout(cct, 1) << "reset from mds we were closing; we'll call that closed" << dendl;
13323 _closed_mds_session(s);
13326 case MetaSession::STATE_OPENING:
13328 ldout(cct, 1) << "reset from mds we were opening; retrying" << dendl;
13329 list<Context*> waiters;
13330 waiters.swap(s->waiting_for_open);
13331 _closed_mds_session(s);
13332 MetaSession *news = _get_or_open_mds_session(mds);
13333 news->waiting_for_open.swap(waiters);
13337 case MetaSession::STATE_OPEN:
13339 const md_config_t *conf = cct->_conf;
13340 if (conf->client_reconnect_stale) {
13341 ldout(cct, 1) << "reset from mds we were open; close mds session for reconnect" << dendl;
13342 _closed_mds_session(s);
13344 ldout(cct, 1) << "reset from mds we were open; mark session as stale" << dendl;
13345 s->state = MetaSession::STATE_STALE;
13350 case MetaSession::STATE_NEW:
13351 case MetaSession::STATE_CLOSED:
13361 bool Client::ms_handle_refused(Connection *con)
13363 ldout(cct, 1) << "ms_handle_refused on " << con->get_peer_addr() << dendl;
13367 bool Client::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
13369 if (dest_type == CEPH_ENTITY_TYPE_MON)
13371 *authorizer = monclient->build_authorizer(dest_type);
13375 Inode *Client::get_quota_root(Inode *in, const UserPerm& perms)
13378 utime_t now = ceph_clock_now();
13381 if (cur != in && cur->quota.is_enable())
13384 Inode *parent_in = NULL;
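  // A cached parent link is only trusted if the dentry still has a valid lease
  // from an MDS we hold a session with, or if the parent directory holds
  // CEPH_CAP_FILE_SHARED and the dentry was cached under the current
  // shared_gen; otherwise the parent is discovered with the LOOKUPNAME
  // request further below.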
13385 if (!cur->dn_set.empty()) {
13386 for (auto p = cur->dn_set.begin(); p != cur->dn_set.end(); ++p) {
13388 if (dn->lease_mds >= 0 &&
13389 dn->lease_ttl > now &&
13390 mds_sessions.count(dn->lease_mds)) {
13391 parent_in = dn->dir->parent_inode;
13393 Inode *diri = dn->dir->parent_inode;
13394 if (diri->caps_issued_mask(CEPH_CAP_FILE_SHARED) &&
13395 diri->shared_gen == dn->cap_shared_gen) {
13396 parent_in = dn->dir->parent_inode;
13402 } else if (root_parents.count(cur)) {
13403 parent_in = root_parents[cur].get();
13411 if (cur == root_ancestor)
13415 if (cur->nlink == 0) {
13416 cur = root_ancestor;
13420 MetaRequest *req = new MetaRequest(CEPH_MDS_OP_LOOKUPNAME);
13421 filepath path(cur->ino);
13422 req->set_filepath(path);
13423 req->set_inode(cur);
13425 InodeRef parent_ref;
13426 int ret = make_request(req, perms, &parent_ref);
13428 ldout(cct, 1) << __func__ << " " << in->vino()
13429 << " failed to find parent of " << cur->vino()
13430 << " err " << ret << dendl;
13431 // FIXME: what to do?
13432 cur = root_ancestor;
13436 now = ceph_clock_now();
13438 cur = parent_ref.get();
13440 cur = in; // start over
13443 ldout(cct, 10) << __func__ << " " << in->vino() << " -> " << cur->vino() << dendl;
13448 * Traverse quota ancestors of the Inode, return true
13449 * if any of them satisfies the passed predicate
13451 bool Client::check_quota_condition(Inode *in, const UserPerm& perms,
13452 std::function<bool (const Inode &in)> test)
13455 assert(in != NULL);
13460 if (in == root_ancestor) {
13461 // We're done traversing, drop out
13464 // Continue up the tree
13465 in = get_quota_root(in, perms);
13472 bool Client::is_quota_files_exceeded(Inode *in, const UserPerm& perms)
13474 return check_quota_condition(in, perms,
13475 [](const Inode &in) {
13476 return in.quota.max_files && in.rstat.rsize() >= in.quota.max_files;
13480 bool Client::is_quota_bytes_exceeded(Inode *in, int64_t new_bytes,
13481 const UserPerm& perms)
13483 return check_quota_condition(in, perms,
13484 [&new_bytes](const Inode &in) {
13485 return in.quota.max_bytes && (in.rstat.rbytes + new_bytes)
13486 > in.quota.max_bytes;
13490 bool Client::is_quota_bytes_approaching(Inode *in, const UserPerm& perms)
13492 return check_quota_condition(in, perms,
13493 [](const Inode &in) {
13494 if (in.quota.max_bytes) {
13495 if (in.rstat.rbytes >= in.quota.max_bytes) {
13499 assert(in.size >= in.reported_size);
13500 const uint64_t space = in.quota.max_bytes - in.rstat.rbytes;
13501 const uint64_t size = in.size - in.reported_size;
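        // Example with assumed numbers: max_bytes = 100 GiB and rbytes = 90 GiB
        // leave space = 10 GiB, so (space >> 4) = 640 MiB; the quota is
        // considered "approaching" once more than 640 MiB has been written
        // beyond the size last reported to the MDS.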
13502 return (space >> 4) < size;
13516 int Client::check_pool_perm(Inode *in, int need)
13518 if (!cct->_conf->client_check_pool_perm)
13521 int64_t pool_id = in->layout.pool_id;
13522 std::string pool_ns = in->layout.pool_ns;
13523 std::pair<int64_t, std::string> perm_key(pool_id, pool_ns);
13526 auto it = pool_perms.find(perm_key);
13527 if (it == pool_perms.end())
13529 if (it->second == POOL_CHECKING) {
13530 // avoid concurrent checks of the same pool
13531 wait_on_list(waiting_for_pool_perm);
13534 assert(have & POOL_CHECKED);
13540 if (in->snapid != CEPH_NOSNAP) {
13541 // pool permission check needs to write to the first object. But for snapshot,
13542 // head of the first object may have already been deleted. To avoid creating
13543 // orphan object, skip the check for now.
13547 pool_perms[perm_key] = POOL_CHECKING;
13550 snprintf(oid_buf, sizeof(oid_buf), "%llx.00000000", (unsigned long long)in->ino);
13551 object_t oid = oid_buf;
13553 SnapContext nullsnapc;
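  // The two probe ops below test pool capability without depending on any
  // existing data: a stat read that may legitimately return -ENOENT probes
  // read access (only -EPERM means unreadable), and an exclusive create that
  // may legitimately return -EEXIST probes write access (only -EPERM means
  // unwritable).  Any other error leaves the result indeterminate.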
13555 C_SaferCond rd_cond;
13556 ObjectOperation rd_op;
13557 rd_op.stat(NULL, (ceph::real_time*)nullptr, NULL);
13559 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), rd_op,
13560 nullsnapc, ceph::real_clock::now(), 0, &rd_cond);
13562 C_SaferCond wr_cond;
13563 ObjectOperation wr_op;
13564 wr_op.create(true);
13566 objecter->mutate(oid, OSDMap::file_to_object_locator(in->layout), wr_op,
13567 nullsnapc, ceph::real_clock::now(), 0, &wr_cond);
13569 client_lock.Unlock();
13570 int rd_ret = rd_cond.wait();
13571 int wr_ret = wr_cond.wait();
13572 client_lock.Lock();
13574 bool errored = false;
13576 if (rd_ret == 0 || rd_ret == -ENOENT)
13578 else if (rd_ret != -EPERM) {
13579 ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
13580 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
13584 if (wr_ret == 0 || wr_ret == -EEXIST)
13585 have |= POOL_WRITE;
13586 else if (wr_ret != -EPERM) {
13587 ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
13588 << " rd_err = " << rd_ret << " wr_err = " << wr_ret << dendl;
13593 // Indeterminate: erase CHECKING state so that subsequent calls re-check.
13594 // Raise EIO because the actual error code might be misleading to the
13595 // userspace filesystem user.
13596 pool_perms.erase(perm_key);
13597 signal_cond_list(waiting_for_pool_perm);
13601 pool_perms[perm_key] = have | POOL_CHECKED;
13602 signal_cond_list(waiting_for_pool_perm);
13605 if ((need & CEPH_CAP_FILE_RD) && !(have & POOL_READ)) {
13606 ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
13607 << " need " << ccap_string(need) << ", but no read perm" << dendl;
13610 if ((need & CEPH_CAP_FILE_WR) && !(have & POOL_WRITE)) {
13611 ldout(cct, 10) << "check_pool_perm on pool " << pool_id << " ns " << pool_ns
13612 << " need " << ccap_string(need) << ", but no write perm" << dendl;
13619 int Client::_posix_acl_permission(Inode *in, const UserPerm& perms, unsigned want)
13621 if (acl_type == POSIX_ACL) {
13622 if (in->xattrs.count(ACL_EA_ACCESS)) {
13623 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13625 return posix_acl_permits(access_acl, in->uid, in->gid, perms, want);
13631 int Client::_posix_acl_chmod(Inode *in, mode_t mode, const UserPerm& perms)
13633 if (acl_type == NO_ACL)
13636 int r = _getattr(in, CEPH_STAT_CAP_XATTR, perms, in->xattr_version == 0);
13640 if (acl_type == POSIX_ACL) {
13641 if (in->xattrs.count(ACL_EA_ACCESS)) {
13642 const bufferptr& access_acl = in->xattrs[ACL_EA_ACCESS];
13643 bufferptr acl(access_acl.c_str(), access_acl.length());
13644 r = posix_acl_access_chmod(acl, mode);
13647 r = _do_setxattr(in, ACL_EA_ACCESS, acl.c_str(), acl.length(), 0, perms);
13653 ldout(cct, 10) << __func__ << " ino " << in->ino << " result=" << r << dendl;
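/*
 * Sketch of the create-time ACL handling below: when the parent directory
 * carries ACL_EA_DEFAULT, the new inode derives its access ACL from it
 * (posix_acl_inherit_mode()/posix_acl_equiv_mode() adjust *mode as a side
 * effect), directories additionally inherit the default ACL itself, and the
 * resulting xattrs are encoded into xattrs_bl.  Without a default ACL only
 * the umask callback is applied to *mode.
 */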
13657 int Client::_posix_acl_create(Inode *dir, mode_t *mode, bufferlist& xattrs_bl,
13658 const UserPerm& perms)
13660 if (acl_type == NO_ACL)
13663 if (S_ISLNK(*mode))
13666 int r = _getattr(dir, CEPH_STAT_CAP_XATTR, perms, dir->xattr_version == 0);
13670 if (acl_type == POSIX_ACL) {
13671 if (dir->xattrs.count(ACL_EA_DEFAULT)) {
13672 map<string, bufferptr> xattrs;
13674 const bufferptr& default_acl = dir->xattrs[ACL_EA_DEFAULT];
13675 bufferptr acl(default_acl.c_str(), default_acl.length());
13676 r = posix_acl_inherit_mode(acl, mode);
13681 r = posix_acl_equiv_mode(acl.c_str(), acl.length(), mode);
13685 xattrs[ACL_EA_ACCESS] = acl;
13688 if (S_ISDIR(*mode))
13689 xattrs[ACL_EA_DEFAULT] = dir->xattrs[ACL_EA_DEFAULT];
13693 ::encode(xattrs, xattrs_bl);
13696 *mode &= ~umask_cb(callback_handle);
13701 ldout(cct, 10) << __func__ << " dir ino " << dir->ino << " result=" << r << dendl;
13705 void Client::set_filer_flags(int flags)
13707 Mutex::Locker l(client_lock);
13708 assert(flags == 0 ||
13709 flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13710 objecter->add_global_op_flags(flags);
13713 void Client::clear_filer_flags(int flags)
13715 Mutex::Locker l(client_lock);
13716 assert(flags == CEPH_OSD_FLAG_LOCALIZE_READS);
13717 objecter->clear_global_op_flag(flags);
13721 * This is included in cap release messages, to cause
13722 * the MDS to wait until this OSD map epoch. It is necessary
13723 * in corner cases where we cancel RADOS ops, so that
13724 * nobody else tries to do IO to the same objects in
13725 * the same epoch as the cancelled ops.
13727 void Client::set_cap_epoch_barrier(epoch_t e)
13729 ldout(cct, 5) << __func__ << " epoch = " << e << dendl;
13730 cap_epoch_barrier = e;
13733 const char** Client::get_tracked_conf_keys() const
13735 static const char* keys[] = {
13736 "client_cache_size",
13737 "client_cache_mid",
13744 void Client::handle_conf_change(const struct md_config_t *conf,
13745 const std::set <std::string> &changed)
13747 Mutex::Locker lock(client_lock);
13749 if (changed.count("client_cache_mid")) {
13750 lru.lru_set_midpoint(cct->_conf->client_cache_mid);
13752 if (changed.count("client_acl_type")) {
13754 if (cct->_conf->client_acl_type == "posix_acl")
13755 acl_type = POSIX_ACL;
13759 void Client::init_groups(UserPerm *perms)
13762 int count = _getgrouplist(&sgids, perms->uid(), perms->gid());
13763 perms->init_gids(sgids, count);
13766 void intrusive_ptr_add_ref(Inode *in)
13771 void intrusive_ptr_release(Inode *in)
13773 in->client->put_inode(in);
13776 mds_rank_t Client::_get_random_up_mds() const
13778 assert(client_lock.is_locked_by_me());
13780 std::set<mds_rank_t> up;
13781 mdsmap->get_up_mds_set(up);
13784 return MDS_RANK_NONE;
13785 std::set<mds_rank_t>::const_iterator p = up.begin();
13786 for (int n = rand() % up.size(); n; n--)
13792 StandaloneClient::StandaloneClient(Messenger *m, MonClient *mc)
13793 : Client(m, mc, new Objecter(m->cct, m, mc, NULL, 0, 0))
13795 monclient->set_messenger(m);
13796 objecter->set_client_incarnation(0);
13799 StandaloneClient::~StandaloneClient()
13802 objecter = nullptr;
13805 int StandaloneClient::init()
13808 objectcacher->start();
13811 client_lock.Lock();
13812 assert(!initialized);
13814 messenger->add_dispatcher_tail(objecter);
13815 messenger->add_dispatcher_tail(this);
13817 monclient->set_want_keys(CEPH_ENTITY_TYPE_MDS | CEPH_ENTITY_TYPE_OSD);
13818 int r = monclient->init();
13820 // need to do cleanup because we're in an intermediate init state
13822 client_lock.Unlock();
13823 objecter->shutdown();
13824 objectcacher->stop();
13825 monclient->shutdown();
13830 client_lock.Unlock();
13836 void StandaloneClient::shutdown()
13838 Client::shutdown();
13839 objecter->shutdown();
13840 monclient->shutdown();