diff --git a/src/ceph/src/mds/MDCache.cc b/src/ceph/src/mds/MDCache.cc
deleted file mode 100644
index b40833f..0000000
--- a/src/ceph/src/mds/MDCache.cc
+++ /dev/null
@@ -1,12476 +0,0 @@
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation.  See file COPYING.
- *
- */
-
-#include <errno.h>
-#include <fstream>
-#include <iostream>
-#include <sstream>
-#include <string>
-#include <map>
-
-#include "MDCache.h"
-#include "MDSRank.h"
-#include "Server.h"
-#include "Locker.h"
-#include "MDLog.h"
-#include "MDBalancer.h"
-#include "Migrator.h"
-#include "ScrubStack.h"
-
-#include "SnapClient.h"
-
-#include "MDSMap.h"
-
-#include "CInode.h"
-#include "CDir.h"
-
-#include "Mutation.h"
-
-#include "include/ceph_fs.h"
-#include "include/filepath.h"
-#include "include/util.h"
-
-#include "msg/Message.h"
-#include "msg/Messenger.h"
-
-#include "common/MemoryModel.h"
-#include "common/errno.h"
-#include "common/perf_counters.h"
-#include "common/safe_io.h"
-
-#include "osdc/Journaler.h"
-#include "osdc/Filer.h"
-
-#include "events/ESubtreeMap.h"
-#include "events/EUpdate.h"
-#include "events/ESlaveUpdate.h"
-#include "events/EImportFinish.h"
-#include "events/EFragment.h"
-#include "events/ECommitted.h"
-#include "events/ESessions.h"
-
-#include "messages/MGenericMessage.h"
-
-#include "messages/MMDSResolve.h"
-#include "messages/MMDSResolveAck.h"
-#include "messages/MMDSCacheRejoin.h"
-
-#include "messages/MDiscover.h"
-#include "messages/MDiscoverReply.h"
-
-//#include "messages/MInodeUpdate.h"
-#include "messages/MDirUpdate.h"
-#include "messages/MCacheExpire.h"
-
-#include "messages/MInodeFileCaps.h"
-
-#include "messages/MLock.h"
-#include "messages/MDentryLink.h"
-#include "messages/MDentryUnlink.h"
-
-#include "messages/MMDSFindIno.h"
-#include "messages/MMDSFindInoReply.h"
-
-#include "messages/MMDSOpenIno.h"
-#include "messages/MMDSOpenInoReply.h"
-
-#include "messages/MClientRequest.h"
-#include "messages/MClientCaps.h"
-#include "messages/MClientSnap.h"
-#include "messages/MClientQuota.h"
-
-#include "messages/MMDSSlaveRequest.h"
-
-#include "messages/MMDSFragmentNotify.h"
-
-#include "messages/MGatherCaps.h"
-
-#include "InoTable.h"
-
-#include "common/Timer.h"
-
-#include "perfglue/heap_profiler.h"
-
-using namespace std;
-
-#include "common/config.h"
-#include "include/assert.h"
-
-#define dout_context g_ceph_context
-#define dout_subsys ceph_subsys_mds
-#undef dout_prefix
-#define dout_prefix _prefix(_dout, mds)
-static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
-  return *_dout << "mds." << mds->get_nodeid() << ".cache ";
-}
-
-set<__s32> SimpleLock::empty_gather_set;
-
-
-/**
- * All non-I/O contexts that require a reference
- * to an MDCache instance descend from this.
- */
-class MDCacheContext : public virtual MDSInternalContextBase {
-protected:
-  MDCache *mdcache;
-  MDSRank *get_mds() override
-  {
-    assert(mdcache != NULL);
-    return mdcache->mds;
-  }
-public:
-  explicit MDCacheContext(MDCache *mdc_) : mdcache(mdc_) {}
-};
-
-
-/**
- * Only for contexts called back from an I/O completion
- *
- * Note: duplication of members wrt MDCacheContext, because
- * it's the lesser of two evils compared with introducing
- * yet another piece of (multiple) inheritance.
- */
-class MDCacheIOContext : public virtual MDSIOContextBase {
-protected:
-  MDCache *mdcache;
-  MDSRank *get_mds() override
-  {
-    assert(mdcache != NULL);
-    return mdcache->mds;
-  }
-public:
-  explicit MDCacheIOContext(MDCache *mdc_) : mdcache(mdc_) {}
-};
-
-class MDCacheLogContext : public virtual MDSLogContextBase {
-protected:
-  MDCache *mdcache;
-  MDSRank *get_mds() override
-  {
-    assert(mdcache != NULL);
-    return mdcache->mds;
-  }
-public:
-  explicit MDCacheLogContext(MDCache *mdc_) : mdcache(mdc_) {}
-};
-
-MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
-  mds(m),
-  filer(m->objecter, m->finisher),
-  exceeded_size_limit(false),
-  recovery_queue(m),
-  stray_manager(m, purge_queue_)
-{
-  migrator.reset(new Migrator(mds, this));
-  root = NULL;
-  myin = NULL;
-  readonly = false;
-
-  stray_index = 0;
-  for (int i = 0; i < NUM_STRAY; ++i) {
-    strays[i] = NULL;
-  }
-
-  num_inodes_with_caps = 0;
-
-  max_dir_commit_size = g_conf->mds_dir_max_commit_size ?
-    (g_conf->mds_dir_max_commit_size << 20) :
-    (0.9 * (g_conf->osd_max_write_size << 20));
-
-  discover_last_tid = 0;
-  open_ino_last_tid = 0;
-  find_ino_peer_last_tid = 0;
-
-  last_cap_id = 0;
-
-  client_lease_durations[0] = 5.0;
-  client_lease_durations[1] = 30.0;
-  client_lease_durations[2] = 300.0;
-
-  resolves_pending = false;
-  rejoins_pending = false;
-  cap_imports_num_opening = 0;
-
-  opening_root = open = false;
-  lru.lru_set_midpoint(cache_mid());
-
-  bottom_lru.lru_set_midpoint(0);
-
-  decayrate.set_halflife(g_conf->mds_decay_halflife);
-
-  did_shutdown_log_cap = false;
-}
-
-MDCache::~MDCache()
-{
-  if (logger) {
-    g_ceph_context->get_perfcounters_collection()->remove(logger.get());
-  }
-}
-
-
-
-void MDCache::log_stat()
-{
-  mds->logger->set(l_mds_inode_max, cache_limit_inodes() == 0 ? INT_MAX : cache_limit_inodes());
-  mds->logger->set(l_mds_inodes, lru.lru_get_size());
-  mds->logger->set(l_mds_inodes_pinned, lru.lru_get_num_pinned());
-  mds->logger->set(l_mds_inodes_top, lru.lru_get_top());
-  mds->logger->set(l_mds_inodes_bottom, lru.lru_get_bot());
-  mds->logger->set(l_mds_inodes_pin_tail, lru.lru_get_pintail());
-  mds->logger->set(l_mds_inodes_with_caps, num_inodes_with_caps);
-  mds->logger->set(l_mds_caps, Capability::count());
-}
-
-
-//
-
-bool MDCache::shutdown()
-{
-  if (lru.lru_get_size() > 0) {
-    dout(7) << "WARNING: mdcache shutdown with non-empty cache" << dendl;
-    //show_cache();
-    show_subtrees();
-    //dump();
-  }
-  return true;
-}
-
-
-// ====================================================================
-// some inode functions
-
-void MDCache::add_inode(CInode *in)
-{
-  // add to lru, inode map
-  assert(inode_map.count(in->vino()) == 0);  // should be no dup inos!
-  inode_map[ in->vino() ] = in;
-
-  if (in->ino() < MDS_INO_SYSTEM_BASE) {
-    if (in->ino() == MDS_INO_ROOT)
-      root = in;
-    else if (in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
-      myin = in;
-    else if (in->is_stray()) {
-      if (MDS_INO_STRAY_OWNER(in->ino()) == mds->get_nodeid()) {
-        strays[MDS_INO_STRAY_INDEX(in->ino())] = in;
-      }
-    }
-    if (in->is_base())
-      base_inodes.insert(in);
-  }
-
-  if (cache_toofull()) {
-    exceeded_size_limit = true;
-  }
-}
-
-void MDCache::remove_inode(CInode *o)
-{
-  dout(14) << "remove_inode " << *o << dendl;
-
-  if (o->get_parent_dn()) {
-    // FIXME: multiple parents?
-    CDentry *dn = o->get_parent_dn();
-    assert(!dn->is_dirty());
-    dn->dir->unlink_inode(dn);  // leave dentry ... FIXME?
-  }
-
-  if (o->is_dirty())
-    o->mark_clean();
-  if (o->is_dirty_parent())
-    o->clear_dirty_parent();
-
-  o->clear_scatter_dirty();
-
-  o->item_open_file.remove_myself();
-
-  if (o->state_test(CInode::STATE_QUEUEDEXPORTPIN))
-    export_pin_queue.erase(o);
-
-  // remove from inode map
-  inode_map.erase(o->vino());
-
-  if (o->ino() < MDS_INO_SYSTEM_BASE) {
-    if (o == root) root = 0;
-    if (o == myin) myin = 0;
-    if (o->is_stray()) {
-      if (MDS_INO_STRAY_OWNER(o->ino()) == mds->get_nodeid()) {
-        strays[MDS_INO_STRAY_INDEX(o->ino())] = 0;
-      }
-    }
-    if (o->is_base())
-      base_inodes.erase(o);
-  }
-
-  // delete it
-  assert(o->get_num_ref() == 0);
-  delete o;
-}
-
-file_layout_t MDCache::gen_default_file_layout(const MDSMap &mdsmap)
-{
-  file_layout_t result = file_layout_t::get_default();
-  result.pool_id = mdsmap.get_first_data_pool();
-  return result;
-}
-
-file_layout_t MDCache::gen_default_log_layout(const MDSMap &mdsmap)
-{
-  file_layout_t result = file_layout_t::get_default();
-  result.pool_id = mdsmap.get_metadata_pool();
-  if (g_conf->mds_log_segment_size > 0) {
-    result.object_size = g_conf->mds_log_segment_size;
-    result.stripe_unit = g_conf->mds_log_segment_size;
-  }
-  return result;
-}
-
-void MDCache::init_layouts()
-{
-  default_file_layout = gen_default_file_layout(*(mds->mdsmap));
-  default_log_layout = gen_default_log_layout(*(mds->mdsmap));
-}
-
-void MDCache::create_unlinked_system_inode(CInode *in, inodeno_t ino,
-                                           int mode) const
-{
-  in->inode.ino = ino;
-  in->inode.version = 1;
-  in->inode.xattr_version = 1;
-  in->inode.mode = 0500 | mode;
-  in->inode.size = 0;
-  in->inode.ctime =
-    in->inode.mtime =
-    in->inode.btime = ceph_clock_now();
-  in->inode.nlink = 1;
-  in->inode.truncate_size = -1ull;
-  in->inode.change_attr = 0;
-  in->inode.export_pin = MDS_RANK_NONE;
-
-  memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
-  if (in->inode.is_dir()) {
-    in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash;
-    ++in->inode.rstat.rsubdirs;
-  } else {
-    in->inode.layout = default_file_layout;
-    ++in->inode.rstat.rfiles;
-  }
-  in->inode.accounted_rstat = in->inode.rstat;
-
-  if (in->is_base()) {
-    if (in->is_root())
-      in->inode_auth = mds_authority_t(mds->get_nodeid(), CDIR_AUTH_UNKNOWN);
-    else
-      in->inode_auth = mds_authority_t(mds_rank_t(in->ino() - MDS_INO_MDSDIR_OFFSET), CDIR_AUTH_UNKNOWN);
-    in->open_snaprealm();  // empty snaprealm
-    assert(!in->snaprealm->parent);  // created its own
-    in->snaprealm->srnode.seq = 1;
-  }
-}
-
-CInode *MDCache::create_system_inode(inodeno_t ino, int mode)
-{
-  dout(0) << "creating system inode with ino:" << ino << dendl;
-  CInode *in = new CInode(this);
-  create_unlinked_system_inode(in, ino, mode);
-  add_inode(in);
-  return in;
-}
-
-CInode *MDCache::create_root_inode()
-{
-  CInode *i = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);
-  i->inode.uid = g_conf->mds_root_ino_uid;
-  i->inode.gid = g_conf->mds_root_ino_gid;
-  i->inode.layout = default_file_layout;
-  i->inode.layout.pool_id = mds->mdsmap->get_first_data_pool();
-  return i;
-}
-
-void MDCache::create_empty_hierarchy(MDSGather *gather)
-{
-  // create root dir
-  CInode *root = create_root_inode();
-
-  // force empty root dir
-  CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
-  adjust_subtree_auth(rootdir, mds->get_nodeid());
-  rootdir->dir_rep = CDir::REP_ALL;  //NONE;
-
-  rootdir->fnode.accounted_fragstat = rootdir->fnode.fragstat;
-  rootdir->fnode.accounted_rstat = rootdir->fnode.rstat;
-
-  root->inode.dirstat = rootdir->fnode.fragstat;
-  root->inode.rstat = rootdir->fnode.rstat;
-  ++root->inode.rstat.rsubdirs;
-  root->inode.accounted_rstat = root->inode.rstat;
-
-  rootdir->mark_complete();
-  rootdir->mark_dirty(rootdir->pre_dirty(), mds->mdlog->get_current_segment());
-  rootdir->commit(0, gather->new_sub());
-
-  root->store(gather->new_sub());
-}
-
-void MDCache::create_mydir_hierarchy(MDSGather *gather)
-{
-  // create mds dir
-  CInode *my = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR);
-
-  CDir *mydir = my->get_or_open_dirfrag(this, frag_t());
-  adjust_subtree_auth(mydir, mds->get_nodeid());
-
-  LogSegment *ls = mds->mdlog->get_current_segment();
-
-  // stray dir
-  for (int i = 0; i < NUM_STRAY; ++i) {
-    CInode *stray = create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR);
-    CDir *straydir = stray->get_or_open_dirfrag(this, frag_t());
-    stringstream name;
-    name << "stray" << i;
-    CDentry *sdn = mydir->add_primary_dentry(name.str(), stray);
-    sdn->_mark_dirty(mds->mdlog->get_current_segment());
-
-    stray->inode.dirstat = straydir->fnode.fragstat;
-
-    mydir->fnode.rstat.add(stray->inode.rstat);
-    mydir->fnode.fragstat.nsubdirs++;
-    // save them
-    straydir->mark_complete();
-    straydir->mark_dirty(straydir->pre_dirty(), ls);
-    straydir->commit(0, gather->new_sub());
-    stray->_mark_dirty_parent(ls, true);
-    stray->store_backtrace(gather->new_sub());
-  }
-
-  mydir->fnode.accounted_fragstat = mydir->fnode.fragstat;
-  mydir->fnode.accounted_rstat = mydir->fnode.rstat;
-
-  myin->inode.dirstat = mydir->fnode.fragstat;
-  myin->inode.rstat = mydir->fnode.rstat;
-  ++myin->inode.rstat.rsubdirs;
-  myin->inode.accounted_rstat = myin->inode.rstat;
-
-  mydir->mark_complete();
-  mydir->mark_dirty(mydir->pre_dirty(), ls);
-  mydir->commit(0, gather->new_sub());
-
-  myin->store(gather->new_sub());
-}
-
-struct C_MDC_CreateSystemFile : public MDCacheLogContext {
-  MutationRef mut;
-  CDentry *dn;
-  version_t dpv;
-  MDSInternalContextBase *fin;
-  C_MDC_CreateSystemFile(MDCache *c, MutationRef& mu, CDentry *d, version_t v, MDSInternalContextBase *f) :
-    MDCacheLogContext(c), mut(mu), dn(d), dpv(v), fin(f) {}
-  void finish(int r) override {
-    mdcache->_create_system_file_finish(mut, dn, dpv, fin);
-  }
-};
-
-void MDCache::_create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin)
-{
-  dout(10) << "_create_system_file " << name << " in " << *dir << dendl;
-  CDentry *dn = dir->add_null_dentry(name);
-
-  dn->push_projected_linkage(in);
-  version_t dpv = dn->pre_dirty();
-
-  CDir *mdir = 0;
-  if (in->inode.is_dir()) {
-    in->inode.rstat.rsubdirs = 1;
-
-    mdir = in->get_or_open_dirfrag(this, frag_t());
-    mdir->mark_complete();
-    mdir->pre_dirty();
-  } else
-    in->inode.rstat.rfiles = 1;
-  in->inode.version = dn->pre_dirty();
-
-  SnapRealm *realm = dir->get_inode()->find_snaprealm();
-  dn->first = in->first = realm->get_newest_seq() + 1;
-
-  MutationRef mut(new MutationImpl());
-
-  // force some locks. hacky.
-  mds->locker->wrlock_force(&dir->inode->filelock, mut);
-  mds->locker->wrlock_force(&dir->inode->nestlock, mut);
-
-  mut->ls = mds->mdlog->get_current_segment();
-  EUpdate *le = new EUpdate(mds->mdlog, "create system file");
-  mds->mdlog->start_entry(le);
-
-  if (!in->is_mdsdir()) {
-    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
-    le->metablob.add_primary_dentry(dn, in, true);
-  } else {
-    predirty_journal_parents(mut, &le->metablob, in, dir, PREDIRTY_DIR, 1);
-    journal_dirty_inode(mut.get(), &le->metablob, in);
-    dn->push_projected_linkage(in->ino(), in->d_type());
-    le->metablob.add_remote_dentry(dn, true, in->ino(), in->d_type());
-    le->metablob.add_root(true, in);
-  }
-  if (mdir)
-    le->metablob.add_new_dir(mdir);  // dirty AND complete AND new
-
-  mds->mdlog->submit_entry(le, new C_MDC_CreateSystemFile(this, mut, dn, dpv, fin));
-  mds->mdlog->flush();
-}
-
-void MDCache::_create_system_file_finish(MutationRef& mut, CDentry *dn, version_t dpv, MDSInternalContextBase *fin)
-{
-  dout(10) << "_create_system_file_finish " << *dn << dendl;
-
-  dn->pop_projected_linkage();
-  dn->mark_dirty(dpv, mut->ls);
-
-  CInode *in = dn->get_linkage()->get_inode();
-  in->inode.version--;
-  in->mark_dirty(in->inode.version + 1, mut->ls);
-
-  if (in->inode.is_dir()) {
-    CDir *dir = in->get_dirfrag(frag_t());
-    assert(dir);
-    dir->mark_dirty(1, mut->ls);
-    dir->mark_new(mut->ls);
-  }
-
-  mut->apply();
-  mds->locker->drop_locks(mut.get());
-  mut->cleanup();
-
-  fin->complete(0);
-
-  //if (dir && MDS_INO_IS_MDSDIR(in->ino()))
-  //migrator->export_dir(dir, (int)in->ino() - MDS_INO_MDSDIR_OFFSET);
-}
-
-
-
-struct C_MDS_RetryOpenRoot : public MDSInternalContext {
-  MDCache *cache;
-  explicit C_MDS_RetryOpenRoot(MDCache *c) : MDSInternalContext(c->mds), cache(c) {}
-  void finish(int r) override {
-    if (r < 0) {
-      // If we can't open root, something disastrous has happened: mark
-      // this rank damaged for operator intervention. Note that
-      // it is not okay to call suicide() here because we are in
-      // a Finisher callback.
-      cache->mds->damaged();
-      ceph_abort();  // damaged should never return
-    } else {
-      cache->open_root();
-    }
-  }
-};
-
-void MDCache::open_root_inode(MDSInternalContextBase *c)
-{
-  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
-    CInode *in;
-    in = create_system_inode(MDS_INO_ROOT, S_IFDIR|0755);  // initially inaccurate!
-    in->fetch(c);
-  } else {
-    discover_base_ino(MDS_INO_ROOT, c, mds->mdsmap->get_root());
-  }
-}
-
-void MDCache::open_mydir_inode(MDSInternalContextBase *c)
-{
-  MDSGatherBuilder gather(g_ceph_context);
-
-  CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
-  in->fetch(gather.new_sub());
-
-  gather.set_finisher(c);
-  gather.activate();
-}
-
-void MDCache::open_root()
-{
-  dout(10) << "open_root" << dendl;
-
-  if (!root) {
-    open_root_inode(new C_MDS_RetryOpenRoot(this));
-    return;
-  }
-  if (mds->get_nodeid() == mds->mdsmap->get_root()) {
-    assert(root->is_auth());
-    CDir *rootdir = root->get_or_open_dirfrag(this, frag_t());
-    assert(rootdir);
-    if (!rootdir->is_subtree_root())
-      adjust_subtree_auth(rootdir, mds->get_nodeid());
-    if (!rootdir->is_complete()) {
-      rootdir->fetch(new C_MDS_RetryOpenRoot(this));
-      return;
-    }
-  } else {
-    assert(!root->is_auth());
-    CDir *rootdir = root->get_dirfrag(frag_t());
-    if (!rootdir) {
-      open_remote_dirfrag(root, frag_t(), new C_MDS_RetryOpenRoot(this));
-      return;
-    }
-  }
-
-  if (!myin) {
-    CInode *in = create_system_inode(MDS_INO_MDSDIR(mds->get_nodeid()), S_IFDIR|0755);  // initially inaccurate!
-    in->fetch(new C_MDS_RetryOpenRoot(this));
-    return;
-  }
-  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
-  assert(mydir);
-  adjust_subtree_auth(mydir, mds->get_nodeid());
-
-  populate_mydir();
-}
-
-void MDCache::populate_mydir()
-{
-  assert(myin);
-  CDir *mydir = myin->get_or_open_dirfrag(this, frag_t());
-  assert(mydir);
-
-  dout(10) << "populate_mydir " << *mydir << dendl;
-
-  if (!mydir->is_complete()) {
-    mydir->fetch(new C_MDS_RetryOpenRoot(this));
-    return;
-  }
-
-  if (mydir->get_version() == 0 && mydir->state_test(CDir::STATE_BADFRAG)) {
-    // A missing dirfrag, we will recreate it. Before that, we must dirty
-    // it before dirtying any of the strays we create within it.
-    mds->clog->warn() << "fragment " << mydir->dirfrag() << " was unreadable, "
-      "recreating it now";
-    LogSegment *ls = mds->mdlog->get_current_segment();
-    mydir->state_clear(CDir::STATE_BADFRAG);
-    mydir->mark_complete();
-    mydir->mark_dirty(mydir->pre_dirty(), ls);
-  }
-
-  // open or create stray
-  uint64_t num_strays = 0;
-  for (int i = 0; i < NUM_STRAY; ++i) {
-    stringstream name;
-    name << "stray" << i;
-    CDentry *straydn = mydir->lookup(name.str());
-
-    // allow for older fs's with stray instead of stray0
-    if (straydn == NULL && i == 0)
-      straydn = mydir->lookup("stray");
-
-    if (!straydn || !straydn->get_linkage()->get_inode()) {
-      _create_system_file(mydir, name.str().c_str(), create_system_inode(MDS_INO_STRAY(mds->get_nodeid(), i), S_IFDIR),
-                          new C_MDS_RetryOpenRoot(this));
-      return;
-    }
-    assert(straydn);
-    assert(strays[i]);
-    // we make multiple passes through this method; make sure we only pin each stray once.
-    if (!strays[i]->state_test(CInode::STATE_STRAYPINNED)) {
-      strays[i]->get(CInode::PIN_STRAY);
-      strays[i]->state_set(CInode::STATE_STRAYPINNED);
-      strays[i]->get_stickydirs();
-    }
-    dout(20) << " stray num " << i << " is " << *strays[i] << dendl;
-
-    // open all frags
-    list<frag_t> ls;
-    strays[i]->dirfragtree.get_leaves(ls);
-    for (list<frag_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
-      frag_t fg = *p;
-      CDir *dir = strays[i]->get_dirfrag(fg);
-      if (!dir) {
-        dir = strays[i]->get_or_open_dirfrag(this, fg);
-      }
-
-      // DamageTable applies special handling to strays: it will
-      // have damaged() us out if one is damaged.
-      assert(!dir->state_test(CDir::STATE_BADFRAG));
-
-      if (dir->get_version() == 0) {
-        dir->fetch(new C_MDS_RetryOpenRoot(this));
-        return;
-      }
-
-      if (dir->get_frag_size() > 0)
-        num_strays += dir->get_frag_size();
-    }
-  }
-
-  stray_manager.set_num_strays(num_strays);
-
-  // okay!
- dout(10) << "populate_mydir done" << dendl; - assert(!open); - open = true; - mds->queue_waiters(waiting_for_open); - - scan_stray_dir(); -} - -void MDCache::open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *fin) -{ - discover_base_ino(ino, fin, mds_rank_t(ino & (MAX_MDS-1))); -} - -CDir *MDCache::get_stray_dir(CInode *in) -{ - string straydname; - in->name_stray_dentry(straydname); - - CInode *strayi = get_stray(); - assert(strayi); - frag_t fg = strayi->pick_dirfrag(straydname); - CDir *straydir = strayi->get_dirfrag(fg); - assert(straydir); - return straydir; -} - -CDentry *MDCache::get_or_create_stray_dentry(CInode *in) -{ - CDir *straydir = get_stray_dir(in); - string straydname; - in->name_stray_dentry(straydname); - CDentry *straydn = straydir->lookup(straydname); - if (!straydn) { - straydn = straydir->add_null_dentry(straydname); - straydn->mark_new(); - } else { - assert(straydn->get_projected_linkage()->is_null()); - } - - straydn->state_set(CDentry::STATE_STRAY); - return straydn; -} - - - -MDSCacheObject *MDCache::get_object(MDSCacheObjectInfo &info) -{ - // inode? - if (info.ino) - return get_inode(info.ino, info.snapid); - - // dir or dentry. - CDir *dir = get_dirfrag(info.dirfrag); - if (!dir) return 0; - - if (info.dname.length()) - return dir->lookup(info.dname, info.snapid); - else - return dir; -} - - - - -// ==================================================================== -// subtree management - -void MDCache::list_subtrees(list& ls) -{ - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) - ls.push_back(p->first); -} - -/* - * adjust the dir_auth of a subtree. - * merge with parent and/or child subtrees, if is it appropriate. - * merge can ONLY happen if both parent and child have unambiguous auth. - */ -void MDCache::adjust_subtree_auth(CDir *dir, mds_authority_t auth) -{ - dout(7) << "adjust_subtree_auth " << dir->get_dir_auth() << " -> " << auth - << " on " << *dir << dendl; - - show_subtrees(); - - CDir *root; - if (dir->inode->is_base()) { - root = dir; // bootstrap hack. - if (subtrees.count(root) == 0) { - subtrees[root]; - root->get(CDir::PIN_SUBTREE); - } - } else { - root = get_subtree_root(dir); // subtree root - } - assert(root); - assert(subtrees.count(root)); - dout(7) << " current root is " << *root << dendl; - - if (root == dir) { - // i am already a subtree. - dir->set_dir_auth(auth); - } else { - // i am a new subtree. - dout(10) << " new subtree at " << *dir << dendl; - assert(subtrees.count(dir) == 0); - subtrees[dir]; // create empty subtree bounds list for me. - dir->get(CDir::PIN_SUBTREE); - - // set dir_auth - dir->set_dir_auth(auth); - - // move items nested beneath me, under me. - set::iterator p = subtrees[root].begin(); - while (p != subtrees[root].end()) { - set::iterator next = p; - ++next; - if (get_subtree_root((*p)->get_parent_dir()) == dir) { - // move under me - dout(10) << " claiming child bound " << **p << dendl; - subtrees[dir].insert(*p); - subtrees[root].erase(p); - } - p = next; - } - - // i am a bound of the parent subtree. - subtrees[root].insert(dir); - - // i am now the subtree root. 
-    root = dir;
-
-    // adjust recursive pop counters
-    if (dir->is_auth()) {
-      utime_t now = ceph_clock_now();
-      CDir *p = dir->get_parent_dir();
-      while (p) {
-        p->pop_auth_subtree.sub(now, decayrate, dir->pop_auth_subtree);
-        if (p->is_subtree_root()) break;
-        p = p->inode->get_parent_dir();
-      }
-    }
-  }
-
-  show_subtrees();
-}
-
-
-void MDCache::try_subtree_merge(CDir *dir)
-{
-  dout(7) << "try_subtree_merge " << *dir << dendl;
-  assert(subtrees.count(dir));
-  set<CDir*> oldbounds = subtrees[dir];
-
-  set<CInode*> to_eval;
-  // try merge at my root
-  try_subtree_merge_at(dir, &to_eval);
-
-  // try merge at my old bounds
-  for (auto bound : oldbounds)
-    try_subtree_merge_at(bound, &to_eval);
-
-  if (!(mds->is_any_replay() || mds->is_resolve())) {
-    for (auto in : to_eval)
-      eval_subtree_root(in);
-  }
-}
-
-class C_MDC_SubtreeMergeWB : public MDCacheLogContext {
-  CInode *in;
-  MutationRef mut;
-public:
-  C_MDC_SubtreeMergeWB(MDCache *mdc, CInode *i, MutationRef& m) : MDCacheLogContext(mdc), in(i), mut(m) {}
-  void finish(int r) override {
-    mdcache->subtree_merge_writebehind_finish(in, mut);
-  }
-};
-
-void MDCache::try_subtree_merge_at(CDir *dir, set<CInode*> *to_eval)
-{
-  dout(10) << "try_subtree_merge_at " << *dir << dendl;
-  assert(subtrees.count(dir));
-
-  // merge with parent?
-  CDir *parent = dir;
-  if (!dir->inode->is_base())
-    parent = get_subtree_root(dir->get_parent_dir());
-
-  if (parent != dir &&                              // we have a parent,
-      parent->dir_auth == dir->dir_auth &&          // auth matches,
-      dir->dir_auth.second == CDIR_AUTH_UNKNOWN &&  // auth is unambiguous,
-      !dir->state_test(CDir::STATE_EXPORTBOUND) &&  // not an exportbound,
-      !dir->state_test(CDir::STATE_AUXSUBTREE)) {   // not aux subtree
-    // merge with parent.
-    dout(10) << " subtree merge at " << *dir << dendl;
-    dir->set_dir_auth(CDIR_AUTH_DEFAULT);
-
-    // move our bounds under the parent
-    for (set<CDir*>::iterator p = subtrees[dir].begin();
-         p != subtrees[dir].end();
-         ++p)
-      subtrees[parent].insert(*p);
-
-    // we are no longer a subtree or bound
-    dir->put(CDir::PIN_SUBTREE);
-    subtrees.erase(dir);
-    subtrees[parent].erase(dir);
-
-    // adjust popularity?
-    if (dir->is_auth()) {
-      utime_t now = ceph_clock_now();
-      CDir *p = dir->get_parent_dir();
-      while (p) {
-        p->pop_auth_subtree.add(now, decayrate, dir->pop_auth_subtree);
-        if (p->is_subtree_root()) break;
-        p = p->inode->get_parent_dir();
-      }
-    }
-
-    if (to_eval && dir->get_inode()->is_auth())
-      to_eval->insert(dir->get_inode());
-
-    show_subtrees(15);
-  }
-}
-
-void MDCache::subtree_merge_writebehind_finish(CInode *in, MutationRef& mut)
-{
-  dout(10) << "subtree_merge_writebehind_finish on " << in << dendl;
-  in->pop_and_dirty_projected_inode(mut->ls);
-
-  mut->apply();
-  mds->locker->drop_locks(mut.get());
-  mut->cleanup();
-
-  in->auth_unpin(this);
-}
-
-void MDCache::eval_subtree_root(CInode *diri)
-{
-  // evaluate subtree inode filelock?
-  // (we should scatter the filelock on subtree bounds)
-  assert(diri->is_auth());
-  mds->locker->try_eval(diri, CEPH_LOCK_IFILE | CEPH_LOCK_INEST);
-}
-
-
-void MDCache::adjust_bounded_subtree_auth(CDir *dir, set<CDir*>& bounds, mds_authority_t auth)
-{
-  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
-          << " on " << *dir
-          << " bounds " << bounds
-          << dendl;
-
-  show_subtrees();
-
-  CDir *root;
-  if (dir->ino() == MDS_INO_ROOT) {
-    root = dir;  // bootstrap hack.
-    if (subtrees.count(root) == 0) {
-      subtrees[root];
-      root->get(CDir::PIN_SUBTREE);
-    }
-  } else {
-    root = get_subtree_root(dir);  // subtree root
-  }
-  assert(root);
-  assert(subtrees.count(root));
-  dout(7) << " current root is " << *root << dendl;
-
-  mds_authority_t oldauth = dir->authority();
-
-  if (root == dir) {
-    // i am already a subtree.
-    dir->set_dir_auth(auth);
-  } else {
-    // i am a new subtree.
-    dout(10) << " new subtree at " << *dir << dendl;
-    assert(subtrees.count(dir) == 0);
-    subtrees[dir];  // create empty subtree bounds list for me.
-    dir->get(CDir::PIN_SUBTREE);
-
-    // set dir_auth
-    dir->set_dir_auth(auth);
-
-    // move items nested beneath me, under me.
-    set<CDir*>::iterator p = subtrees[root].begin();
-    while (p != subtrees[root].end()) {
-      set<CDir*>::iterator next = p;
-      ++next;
-      if (get_subtree_root((*p)->get_parent_dir()) == dir) {
-        // move under me
-        dout(10) << " claiming child bound " << **p << dendl;
-        subtrees[dir].insert(*p);
-        subtrees[root].erase(p);
-      }
-      p = next;
-    }
-
-    // i am a bound of the parent subtree.
-    subtrees[root].insert(dir);
-
-    // i am now the subtree root.
-    root = dir;
-  }
-
-  set<CInode*> to_eval;
-
-  // verify/adjust bounds.
-  // - these may be new, or
-  // - beneath existing ambiguous bounds (which will be collapsed),
-  // - but NOT beneath unambiguous bounds.
-  for (set<CDir*>::iterator p = bounds.begin();
-       p != bounds.end();
-       ++p) {
-    CDir *bound = *p;
-
-    // new bound?
-    if (subtrees[dir].count(bound) == 0) {
-      if (get_subtree_root(bound) == dir) {
-        dout(10) << " new bound " << *bound << ", adjusting auth back to old " << oldauth << dendl;
-        adjust_subtree_auth(bound, oldauth);  // otherwise, adjust at bound.
-      }
-      else {
-        dout(10) << " want bound " << *bound << dendl;
-        CDir *t = get_subtree_root(bound->get_parent_dir());
-        if (subtrees[t].count(bound) == 0) {
-          assert(t != dir);
-          dout(10) << " new bound " << *bound << dendl;
-          adjust_subtree_auth(bound, t->authority());
-        }
-        // make sure it's nested beneath ambiguous subtree(s)
-        while (1) {
-          while (subtrees[dir].count(t) == 0)
-            t = get_subtree_root(t->get_parent_dir());
-          dout(10) << " swallowing intervening subtree at " << *t << dendl;
-          adjust_subtree_auth(t, auth);
-          try_subtree_merge_at(t, &to_eval);
-          t = get_subtree_root(bound->get_parent_dir());
-          if (t == dir) break;
-        }
-      }
-    }
-    else {
-      dout(10) << " already have bound " << *bound << dendl;
-    }
-  }
-  // merge stray bounds?
-  while (!subtrees[dir].empty()) {
-    set<CDir*> copy = subtrees[dir];
-    for (set<CDir*>::iterator p = copy.begin(); p != copy.end(); ++p) {
-      if (bounds.count(*p) == 0) {
-        CDir *stray = *p;
-        dout(10) << " swallowing extra subtree at " << *stray << dendl;
-        adjust_subtree_auth(stray, auth);
-        try_subtree_merge_at(stray, &to_eval);
-      }
-    }
-    // swallowing subtree may add new subtree bounds
-    if (copy == subtrees[dir])
-      break;
-  }
-
-  // bound should now match.
-  verify_subtree_bounds(dir, bounds);
-
-  show_subtrees();
-
-  if (!(mds->is_any_replay() || mds->is_resolve())) {
-    for (auto in : to_eval)
-      eval_subtree_root(in);
-  }
-}
-
-
-/*
- * return a set of CDir*'s that correspond to the given bound set. Only adjust
- * fragmentation as necessary to get an equivalent bounding set. That is, only
- * split if one of our frags spans the provided bounding set. Never merge.
- */
-void MDCache::get_force_dirfrag_bound_set(vector<dirfrag_t>& dfs, set<CDir*>& bounds)
-{
-  dout(10) << "get_force_dirfrag_bound_set " << dfs << dendl;
-
-  // sort by ino
-  map<inodeno_t, fragset_t> byino;
-  for (vector<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
-    byino[p->ino].insert(p->frag);
-  dout(10) << " by ino: " << byino << dendl;
-
-  for (map<inodeno_t, fragset_t>::iterator p = byino.begin(); p != byino.end(); ++p) {
-    CInode *diri = get_inode(p->first);
-    if (!diri)
-      continue;
-    dout(10) << " checking fragset " << p->second.get() << " on " << *diri << dendl;
-
-    fragtree_t tmpdft;
-    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
-      tmpdft.force_to_leaf(g_ceph_context, *q);
-
-    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q) {
-      frag_t fg = *q;
-      list<frag_t> fgls;
-      diri->dirfragtree.get_leaves_under(fg, fgls);
-      if (fgls.empty()) {
-        bool all = true;
-        frag_t approx_fg = diri->dirfragtree[fg.value()];
-        list<frag_t> ls;
-        tmpdft.get_leaves_under(approx_fg, ls);
-        for (list<frag_t>::iterator r = ls.begin(); r != ls.end(); ++r) {
-          if (p->second.get().count(*r) == 0) {
-            // not bound, so the resolve message is from auth MDS of the dirfrag
-            force_dir_fragment(diri, *r);
-            all = false;
-          }
-        }
-        if (all)
-          fgls.push_back(approx_fg);
-        else
-          diri->dirfragtree.get_leaves_under(fg, fgls);
-      }
-      dout(10) << " frag " << fg << " contains " << fgls << dendl;
-      for (list<frag_t>::iterator r = fgls.begin(); r != fgls.end(); ++r) {
-        CDir *dir = diri->get_dirfrag(*r);
-        if (dir)
-          bounds.insert(dir);
-      }
-    }
-  }
-}
-
-void MDCache::adjust_bounded_subtree_auth(CDir *dir, vector<dirfrag_t>& bound_dfs, mds_authority_t auth)
-{
-  dout(7) << "adjust_bounded_subtree_auth " << dir->get_dir_auth() << " -> " << auth
-          << " on " << *dir << " bound_dfs " << bound_dfs << dendl;
-
-  set<CDir*> bounds;
-  get_force_dirfrag_bound_set(bound_dfs, bounds);
-  adjust_bounded_subtree_auth(dir, bounds, auth);
-}
-
-void MDCache::map_dirfrag_set(list<dirfrag_t>& dfs, set<CDir*>& result)
-{
-  dout(10) << "map_dirfrag_set " << dfs << dendl;
-
-  // group by inode
-  map<inodeno_t, fragset_t> ino_fragset;
-  for (list<dirfrag_t>::iterator p = dfs.begin(); p != dfs.end(); ++p)
-    ino_fragset[p->ino].insert(p->frag);
-
-  // get frags
-  for (map<inodeno_t, fragset_t>::iterator p = ino_fragset.begin();
-       p != ino_fragset.end();
-       ++p) {
-    CInode *in = get_inode(p->first);
-    if (!in)
-      continue;
-
-    list<frag_t> fglist;
-    for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
-      in->dirfragtree.get_leaves_under(*q, fglist);
-
-    dout(15) << "map_dirfrag_set " << p->second << " -> " << fglist
-             << " on " << *in << dendl;
-
-    for (list<frag_t>::iterator q = fglist.begin(); q != fglist.end(); ++q) {
-      CDir *dir = in->get_dirfrag(*q);
-      if (dir)
-        result.insert(dir);
-    }
-  }
-}
-
-
-
-CDir *MDCache::get_subtree_root(CDir *dir)
-{
-  // find the underlying dir that delegates (or is about to delegate) auth
-  while (true) {
-    if (dir->is_subtree_root())
-      return dir;
-    dir = dir->get_inode()->get_parent_dir();
-    if (!dir)
-      return 0;  // none
-  }
-}
-
-CDir *MDCache::get_projected_subtree_root(CDir *dir)
-{
-  // find the underlying dir that delegates (or is about to delegate) auth
-  while (true) {
-    if (dir->is_subtree_root())
-      return dir;
-    dir = dir->get_inode()->get_projected_parent_dir();
-    if (!dir)
-      return 0;  // none
-  }
-}
-
-void MDCache::remove_subtree(CDir *dir)
-{
-  dout(10) << "remove_subtree " << *dir << dendl;
-  assert(subtrees.count(dir));
-  assert(subtrees[dir].empty());
-  subtrees.erase(dir);
-  dir->put(CDir::PIN_SUBTREE);
-  if (dir->get_parent_dir()) {
-    CDir *p = get_subtree_root(dir->get_parent_dir());
-    assert(subtrees[p].count(dir));
-    subtrees[p].erase(dir);
-  }
-}
-
-void MDCache::get_subtree_bounds(CDir *dir, set<CDir*>& bounds)
-{
-  assert(subtrees.count(dir));
-  bounds = subtrees[dir];
-}
-
-void MDCache::get_wouldbe_subtree_bounds(CDir *dir, set<CDir*>& bounds)
-{
-  if (subtrees.count(dir)) {
-    // just copy them, dir is a subtree.
-    get_subtree_bounds(dir, bounds);
-  } else {
-    // find them
-    CDir *root = get_subtree_root(dir);
-    for (set<CDir*>::iterator p = subtrees[root].begin();
-         p != subtrees[root].end();
-         ++p) {
-      CDir *t = *p;
-      while (t != root) {
-        t = t->get_parent_dir();
-        assert(t);
-        if (t == dir) {
-          bounds.insert(*p);
-          continue;
-        }
-      }
-    }
-  }
-}
-
-void MDCache::verify_subtree_bounds(CDir *dir, const set<CDir*>& bounds)
-{
-  // for debugging only.
-  assert(subtrees.count(dir));
-  if (bounds != subtrees[dir]) {
-    dout(0) << "verify_subtree_bounds failed" << dendl;
-    set<CDir*> b = bounds;
-    for (auto &cd : subtrees[dir]) {
-      if (bounds.count(cd)) {
-        b.erase(cd);
-        continue;
-      }
-      dout(0) << " missing bound " << *cd << dendl;
-    }
-    for (const auto &cd : b)
-      dout(0) << " extra bound " << *cd << dendl;
-  }
-  assert(bounds == subtrees[dir]);
-}
-
-void MDCache::verify_subtree_bounds(CDir *dir, const list<dirfrag_t>& bounds)
-{
-  // for debugging only.
-  assert(subtrees.count(dir));
-
-  // make sure that any bounds i do have are properly noted as such.
-  int failed = 0;
-  for (const auto &fg : bounds) {
-    CDir *bd = get_dirfrag(fg);
-    if (!bd) continue;
-    if (subtrees[dir].count(bd) == 0) {
-      dout(0) << "verify_subtree_bounds failed: extra bound " << *bd << dendl;
-      failed++;
-    }
-  }
-  assert(failed == 0);
-}
-
-void MDCache::project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir)
-{
-  dout(10) << "project_subtree_rename " << *diri << " from " << *olddir
-           << " to " << *newdir << dendl;
-  projected_subtree_renames[diri].push_back(pair<CDir*,CDir*>(olddir, newdir));
-}
-
-void MDCache::adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop)
-{
-  dout(10) << "adjust_subtree_after_rename " << *diri << " from " << *olddir << dendl;
-
-  //show_subtrees();
-
-  CDir *newdir = diri->get_parent_dir();
-
-  if (pop) {
-    map<CInode*, list<pair<CDir*,CDir*> > >::iterator p = projected_subtree_renames.find(diri);
-    assert(p != projected_subtree_renames.end());
-    assert(!p->second.empty());
-    assert(p->second.front().first == olddir);
-    assert(p->second.front().second == newdir);
-    p->second.pop_front();
-    if (p->second.empty())
-      projected_subtree_renames.erase(p);
-  }
-
-  // adjust subtree
-  list<CDir*> dfls;
-  // make sure subtree dirfrags are at the front of the list
-  diri->get_subtree_dirfrags(dfls);
-  diri->get_nested_dirfrags(dfls);
-  for (list<CDir*>::iterator p = dfls.begin(); p != dfls.end(); ++p) {
-    CDir *dir = *p;
-
-    dout(10) << "dirfrag " << *dir << dendl;
-    CDir *oldparent = get_subtree_root(olddir);
-    dout(10) << " old parent " << *oldparent << dendl;
-    CDir *newparent = get_subtree_root(newdir);
-    dout(10) << " new parent " << *newparent << dendl;
-
-    if (oldparent == newparent) {
-      dout(10) << "parent unchanged for " << *dir << " at " << *oldparent << dendl;
-      continue;
-    }
-
-    if (dir->is_subtree_root()) {
-      // children are fine. change parent.
-      dout(10) << "moving " << *dir << " from " << *oldparent << " to " << *newparent << dendl;
-      assert(subtrees[oldparent].count(dir));
-      subtrees[oldparent].erase(dir);
-      assert(subtrees.count(newparent));
-      subtrees[newparent].insert(dir);
-      // caller is responsible for 'eval diri'
-      try_subtree_merge_at(dir, NULL);
-    } else {
-      // mid-subtree.
-
-      // see if any old bounds move to the new parent.
-      list<CDir*> tomove;
-      for (set<CDir*>::iterator p = subtrees[oldparent].begin();
-           p != subtrees[oldparent].end();
-           ++p) {
-        CDir *bound = *p;
-        CDir *broot = get_subtree_root(bound->get_parent_dir());
-        if (broot != oldparent) {
-          assert(broot == newparent);
-          tomove.push_back(bound);
-        }
-      }
-      for (list<CDir*>::iterator p = tomove.begin(); p != tomove.end(); ++p) {
-        CDir *bound = *p;
-        dout(10) << "moving bound " << *bound << " from " << *oldparent << " to " << *newparent << dendl;
-        subtrees[oldparent].erase(bound);
-        subtrees[newparent].insert(bound);
-      }
-
-      // did auth change?
-      if (oldparent->authority() != newparent->authority()) {
-        adjust_subtree_auth(dir, oldparent->authority());
-        // caller is responsible for 'eval diri'
-        try_subtree_merge_at(dir, NULL);
-      }
-    }
-  }
-
-  show_subtrees();
-}
-
-
-void MDCache::get_fullauth_subtrees(set<CDir*>& s)
-{
-  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
-       p != subtrees.end();
-       ++p) {
-    CDir *root = p->first;
-    if (root->is_full_dir_auth())
-      s.insert(root);
-  }
-}
-void MDCache::get_auth_subtrees(set<CDir*>& s)
-{
-  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
-       p != subtrees.end();
-       ++p) {
-    CDir *root = p->first;
-    if (root->is_auth())
-      s.insert(root);
-  }
-}
-
-
-// count.
-
-int MDCache::num_subtrees()
-{
-  return subtrees.size();
-}
-
-int MDCache::num_subtrees_fullauth()
-{
-  int n = 0;
-  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
-       p != subtrees.end();
-       ++p) {
-    CDir *root = p->first;
-    if (root->is_full_dir_auth())
-      n++;
-  }
-  return n;
-}
-
-int MDCache::num_subtrees_fullnonauth()
-{
-  int n = 0;
-  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
-       p != subtrees.end();
-       ++p) {
-    CDir *root = p->first;
-    if (root->is_full_dir_nonauth())
-      n++;
-  }
-  return n;
-}
-
-
-
-// ===================================
-// journal and snap/cow helpers
-
-
-/*
- * find first inode in cache that follows given snapid. otherwise, return current.
- */
-CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows)
-{
-  dout(10) << "pick_inode_snap follows " << follows << " on " << *in << dendl;
-  assert(in->last == CEPH_NOSNAP);
-
-  SnapRealm *realm = in->find_snaprealm();
-  const set<snapid_t>& snaps = realm->get_snaps();
-  dout(10) << " realm " << *realm << " " << *realm->inode << dendl;
-  dout(10) << " snaps " << snaps << dendl;
-
-  if (snaps.empty())
-    return in;
-
-  for (set<snapid_t>::const_iterator p = snaps.upper_bound(follows);  // first item > follows
-       p != snaps.end();
-       ++p) {
-    CInode *t = get_inode(in->ino(), *p);
-    if (t) {
-      in = t;
-      dout(10) << "pick_inode_snap snap " << *p << " found " << *in << dendl;
-      break;
-    }
-  }
-  return in;
-}
-
-
-/*
- * note: i'm currently cheating wrt dirty and inode.version on cow
- * items. instead of doing a full dir predirty, i just take the
- * original item's version, and set the dirty flag (via
- * mutation::add_cow_{inode,dentry}() and mutation::apply(). that
- * means a special case in the dir commit clean sweep assertions.
- * bah.
- */
-CInode *MDCache::cow_inode(CInode *in, snapid_t last)
-{
-  assert(last >= in->first);
-
-  SnapRealm *realm = in->find_snaprealm();
-  const set<snapid_t>& snaps = realm->get_snaps();
-
-  // make sure snap inode's last match existing snapshots.
-  // MDCache::pick_inode_snap() requires this.
-  snapid_t last_snap = last;
-  if (snaps.count(last) == 0) {
-    set<snapid_t>::const_iterator p = snaps.upper_bound(last);
-    if (p != snaps.begin()) {
-      --p;
-      if (*p >= in->first)
-        last_snap = *p;
-    }
-  }
-
-  CInode *oldin = new CInode(this, true, in->first, last_snap);
-  oldin->inode = *in->get_previous_projected_inode();
-  oldin->symlink = in->symlink;
-  oldin->xattrs = *in->get_previous_projected_xattrs();
-  oldin->inode.trim_client_ranges(last);
-
-  if (in->first < in->oldest_snap)
-    in->oldest_snap = in->first;
-
-  in->first = last+1;
-
-  dout(10) << "cow_inode " << *in << " to " << *oldin << dendl;
-  add_inode(oldin);
-
-  if (in->last != CEPH_NOSNAP) {
-    CInode *head_in = get_inode(in->ino());
-    assert(head_in);
-    if (head_in->split_need_snapflush(oldin, in)) {
-      oldin->client_snap_caps = in->client_snap_caps;
-      for (compact_map<int64_t, set<client_t> >::iterator p = in->client_snap_caps.begin();
-           p != in->client_snap_caps.end();
-           ++p) {
-        SimpleLock *lock = oldin->get_lock(p->first);
-        assert(lock);
-        for (auto q = p->second.begin(); q != p->second.end(); ++q) {
-          oldin->auth_pin(lock);
-          lock->set_state(LOCK_SNAP_SYNC);  // gathering
-          lock->get_wrlock(true);
-        }
-      }
-    }
-    return oldin;
-  }
-
-  // clone caps?
-  for (map<client_t, Capability*>::iterator p = in->client_caps.begin();
-       p != in->client_caps.end();
-       ++p) {
-    client_t client = p->first;
-    Capability *cap = p->second;
-    int issued = cap->issued();
-    if ((issued & CEPH_CAP_ANY_WR) &&
-        cap->client_follows < last) {
-      // note in oldin
-      for (int i = 0; i < num_cinode_locks; i++) {
-        if (issued & cinode_lock_info[i].wr_caps) {
-          int lockid = cinode_lock_info[i].lock;
-          SimpleLock *lock = oldin->get_lock(lockid);
-          assert(lock);
-          oldin->client_snap_caps[lockid].insert(client);
-          oldin->auth_pin(lock);
-          lock->set_state(LOCK_SNAP_SYNC);  // gathering
-          lock->get_wrlock(true);
-          dout(10) << " client." << client << " cap " << ccap_string(issued & cinode_lock_info[i].wr_caps)
-                   << " wrlock lock " << *lock << " on " << *oldin << dendl;
-        }
-      }
-      cap->client_follows = last;
-
-      // we need snapflushes for any intervening snaps
-      dout(10) << " snaps " << snaps << dendl;
-      for (set<snapid_t>::const_iterator q = snaps.lower_bound(oldin->first);
-           q != snaps.end() && *q <= last;
-           ++q) {
-        in->add_need_snapflush(oldin, *q, client);
-      }
-    } else {
-      dout(10) << " ignoring client." << client << " cap follows " << cap->client_follows << dendl;
-    }
-  }
-
-  return oldin;
-}
-
-void MDCache::journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob,
-                                 CDentry *dn, snapid_t follows,
-                                 CInode **pcow_inode, CDentry::linkage_t *dnl)
-{
-  if (!dn) {
-    dout(10) << "journal_cow_dentry got null CDentry, returning" << dendl;
-    return;
-  }
-  dout(10) << "journal_cow_dentry follows " << follows << " on " << *dn << dendl;
-  assert(dn->is_auth());
-
-  // nothing to cow on a null dentry, fix caller
-  if (!dnl)
-    dnl = dn->get_projected_linkage();
-  assert(!dnl->is_null());
-
-  if (dnl->is_primary() && dnl->get_inode()->is_multiversion()) {
-    // multiversion inode.
-    CInode *in = dnl->get_inode();
-    SnapRealm *realm = NULL;
-
-    if (in->get_projected_parent_dn() != dn) {
-      assert(follows == CEPH_NOSNAP);
-      realm = dn->dir->inode->find_snaprealm();
-      snapid_t dir_follows = realm->get_newest_snap();
-
-      if (dir_follows+1 > dn->first) {
-        snapid_t oldfirst = dn->first;
-        dn->first = dir_follows+1;
-        if (realm->has_snaps_in_range(oldfirst, dir_follows)) {
-          CDentry *olddn = dn->dir->add_remote_dentry(dn->name, in->ino(), in->d_type(),
-                                                      oldfirst, dir_follows);
-          olddn->pre_dirty();
-          dout(10) << " olddn " << *olddn << dendl;
-          metablob->add_remote_dentry(olddn, true);
-          mut->add_cow_dentry(olddn);
-          // FIXME: adjust link count here? hmm.
-
-          if (dir_follows+1 > in->first)
-            in->cow_old_inode(dir_follows, false);
-        }
-      }
-
-      if (in->snaprealm) {
-        realm = in->snaprealm;
-        follows = realm->get_newest_seq();
-      } else
-        follows = dir_follows;
-    } else {
-      realm = in->find_snaprealm();
-      if (follows == CEPH_NOSNAP)
-        follows = realm->get_newest_seq();
-    }
-
-    // already cloned?
-    if (follows < in->first) {
-      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *in << dendl;
-      return;
-    }
-
-    if (!realm->has_snaps_in_range(in->first, follows)) {
-      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *in << dendl;
-      in->first = follows + 1;
-      return;
-    }
-
-    in->cow_old_inode(follows, false);
-
-  } else {
-    SnapRealm *realm = dn->dir->inode->find_snaprealm();
-    if (follows == CEPH_NOSNAP)
-      follows = realm->get_newest_seq();
-
-    // already cloned?
-    if (follows < dn->first) {
-      dout(10) << "journal_cow_dentry follows " << follows << " < first on " << *dn << dendl;
-      return;
-    }
-
-    // update dn.first before adding old dentry to cdir's map
-    snapid_t oldfirst = dn->first;
-    dn->first = follows+1;
-
-    CInode *in = dnl->is_primary() ? dnl->get_inode() : NULL;
-
-    if (!realm->has_snaps_in_range(oldfirst, follows)) {
-      dout(10) << "journal_cow_dentry no snapshot follows " << follows << " on " << *dn << dendl;
-      if (in)
-        in->first = follows+1;
-      return;
-    }
-
-    dout(10) << " dn " << *dn << dendl;
-    if (in) {
-      CInode *oldin = cow_inode(in, follows);
-      mut->add_cow_inode(oldin);
-      if (pcow_inode)
-        *pcow_inode = oldin;
-      CDentry *olddn = dn->dir->add_primary_dentry(dn->name, oldin, oldfirst, oldin->last);
-      oldin->inode.version = olddn->pre_dirty();
-      dout(10) << " olddn " << *olddn << dendl;
-      bool need_snapflush = !oldin->client_snap_caps.empty();
-      if (need_snapflush)
-        mut->ls->open_files.push_back(&oldin->item_open_file);
-      metablob->add_primary_dentry(olddn, 0, true, false, false, need_snapflush);
-      mut->add_cow_dentry(olddn);
-    } else {
-      assert(dnl->is_remote());
-      CDentry *olddn = dn->dir->add_remote_dentry(dn->name, dnl->get_remote_ino(), dnl->get_remote_d_type(),
-                                                  oldfirst, follows);
-      olddn->pre_dirty();
-      dout(10) << " olddn " << *olddn << dendl;
-      metablob->add_remote_dentry(olddn, true);
-      mut->add_cow_dentry(olddn);
-    }
-  }
-}
-
-
-void MDCache::journal_cow_inode(MutationRef& mut, EMetaBlob *metablob,
-                                CInode *in, snapid_t follows,
-                                CInode **pcow_inode)
-{
-  dout(10) << "journal_cow_inode follows " << follows << " on " << *in << dendl;
-  CDentry *dn = in->get_projected_parent_dn();
-  journal_cow_dentry(mut.get(), metablob, dn, follows, pcow_inode);
-}
-
-void MDCache::journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows)
-{
-  if (in->is_base()) {
-    metablob->add_root(true, in, in->get_projected_inode());
-  } else {
-    if (follows == CEPH_NOSNAP && in->last != CEPH_NOSNAP)
-      follows = in->first - 1;
-    CDentry *dn = in->get_projected_parent_dn();
-    if (!dn->get_projected_linkage()->is_null())  // no need to cow a null dentry
-      journal_cow_dentry(mut, metablob, dn, follows);
-    if (in->get_projected_inode()->is_backtrace_updated()) {
-      bool dirty_pool = in->get_projected_inode()->layout.pool_id !=
-                        in->get_previous_projected_inode()->layout.pool_id;
-      metablob->add_primary_dentry(dn, in, true, true, dirty_pool);
-    } else {
-      metablob->add_primary_dentry(dn, in, true);
-    }
-  }
-}
-
-
-
-// nested ---------------------------------------------------------------
-
-void MDCache::project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first,
-                                          int linkunlink, SnapRealm *prealm)
-{
-  CDentry *parentdn = cur->get_projected_parent_dn();
-  inode_t *curi = cur->get_projected_inode();
-
-  if (cur->first > first)
-    first = cur->first;
-
-  dout(10) << "projected_rstat_inode_to_frag first " << first << " linkunlink " << linkunlink
-           << " " << *cur << dendl;
-  dout(20) << " frag head is [" << parent->first << ",head] " << dendl;
-  dout(20) << " inode update is [" << first << "," << cur->last << "]" << dendl;
-
-  /*
-   * FIXME. this incompletely propagates rstats to _old_ parents
-   * (i.e. shortly after a directory rename). but we need full
-   * blown hard link backpointers to make this work properly...
-   */
-  snapid_t floor = parentdn->first;
-  dout(20) << " floor of " << floor << " from parent dn " << *parentdn << dendl;
-
-  if (!prealm)
-    prealm = parent->inode->find_snaprealm();
-  const set<snapid_t> snaps = prealm->get_snaps();
-
-  if (cur->last != CEPH_NOSNAP) {
-    assert(cur->dirty_old_rstats.empty());
-    set<snapid_t>::const_iterator q = snaps.lower_bound(MAX(first, floor));
-    if (q == snaps.end() || *q > cur->last)
-      return;
-  }
-
-  if (cur->last >= floor) {
-    bool update = true;
-    if (cur->state_test(CInode::STATE_AMBIGUOUSAUTH) && cur->is_auth()) {
-      // rename src inode is not projected in the slave rename prep case. so we should
-      // avoid updating the inode.
-      assert(linkunlink < 0);
-      assert(cur->is_frozen_inode());
-      update = false;
-    }
-    _project_rstat_inode_to_frag(*curi, MAX(first, floor), cur->last, parent,
-                                 linkunlink, update);
-  }
-
-  if (g_conf->mds_snap_rstat) {
-    for (compact_set<snapid_t>::iterator p = cur->dirty_old_rstats.begin();
-         p != cur->dirty_old_rstats.end();
-         ++p) {
-      old_inode_t& old = cur->old_inodes[*p];
-      snapid_t ofirst = MAX(old.first, floor);
-      set<snapid_t>::const_iterator q = snaps.lower_bound(ofirst);
-      if (q == snaps.end() || *q > *p)
-        continue;
-      if (*p >= floor)
-        _project_rstat_inode_to_frag(old.inode, ofirst, *p, parent, 0, false);
-    }
-  }
-  cur->dirty_old_rstats.clear();
-}
-
-
-void MDCache::_project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last,
-                                           CDir *parent, int linkunlink, bool update_inode)
-{
-  dout(10) << "_project_rstat_inode_to_frag [" << ofirst << "," << last << "]" << dendl;
-  dout(20) << " inode rstat " << inode.rstat << dendl;
-  dout(20) << " inode accounted_rstat " << inode.accounted_rstat << dendl;
-  nest_info_t delta;
-  if (linkunlink == 0) {
-    delta.add(inode.rstat);
-    delta.sub(inode.accounted_rstat);
-  } else if (linkunlink < 0) {
-    delta.sub(inode.accounted_rstat);
-  } else {
-    delta.add(inode.rstat);
-  }
-  dout(20) << " delta " << delta << dendl;
-
-  if (update_inode)
-    inode.accounted_rstat = inode.rstat;
-
-  while (last >= ofirst) {
-    /*
-     * pick fnode version to update. at each iteration, we want to
-     * pick a segment ending in 'last' to update. split as necessary
-     * to make that work. then, adjust first up so that we only
-     * update one segment at a time. then loop to cover the whole
-     * [ofirst,last] interval.
-     */
-    nest_info_t *prstat;
-    snapid_t first;
-    fnode_t *pf = parent->get_projected_fnode();
-    if (last == CEPH_NOSNAP) {
-      if (g_conf->mds_snap_rstat)
-        first = MAX(ofirst, parent->first);
-      else
-        first = parent->first;
-      prstat = &pf->rstat;
-      dout(20) << " projecting to head [" << first << "," << last << "] " << *prstat << dendl;
-
-      if (first > parent->first &&
-          !(pf->rstat == pf->accounted_rstat)) {
-        dout(10) << " target snapped and not fully accounted, cow to dirty_old_rstat ["
-                 << parent->first << "," << (first-1) << "] "
-                 << " " << *prstat << "/" << pf->accounted_rstat
-                 << dendl;
-        parent->dirty_old_rstat[first-1].first = parent->first;
-        parent->dirty_old_rstat[first-1].rstat = pf->rstat;
-        parent->dirty_old_rstat[first-1].accounted_rstat = pf->accounted_rstat;
-      }
-      parent->first = first;
-    } else if (!g_conf->mds_snap_rstat) {
-      // drop snapshots' rstats
-      break;
-    } else if (last >= parent->first) {
-      first = parent->first;
-      parent->dirty_old_rstat[last].first = first;
-      parent->dirty_old_rstat[last].rstat = pf->rstat;
-      parent->dirty_old_rstat[last].accounted_rstat = pf->accounted_rstat;
-      prstat = &parent->dirty_old_rstat[last].rstat;
-      dout(10) << " projecting to newly split dirty_old_fnode [" << first << "," << last << "] "
-               << " " << *prstat << "/" << pf->accounted_rstat << dendl;
-    } else {
-      // be careful, dirty_old_rstat is a _sparse_ map.
-      // sorry, this is ugly.
-      first = ofirst;
-
-      // find any intersection with last
-      compact_map<snapid_t, old_rstat_t>::iterator p = parent->dirty_old_rstat.lower_bound(last);
-      if (p == parent->dirty_old_rstat.end()) {
-        dout(20) << " no dirty_old_rstat with last >= last " << last << dendl;
-        if (!parent->dirty_old_rstat.empty() && parent->dirty_old_rstat.rbegin()->first >= first) {
-          dout(20) << " last dirty_old_rstat ends at " << parent->dirty_old_rstat.rbegin()->first << dendl;
-          first = parent->dirty_old_rstat.rbegin()->first+1;
-        }
-      } else {
-        // *p last is >= last
-        if (p->second.first <= last) {
-          // *p intersects [first,last]
-          if (p->second.first < first) {
-            dout(10) << " splitting off left bit [" << p->second.first << "," << first-1 << "]" << dendl;
-            parent->dirty_old_rstat[first-1] = p->second;
-            p->second.first = first;
-          }
-          if (p->second.first > first)
-            first = p->second.first;
-          if (last < p->first) {
-            dout(10) << " splitting off right bit [" << last+1 << "," << p->first << "]" << dendl;
-            parent->dirty_old_rstat[last] = p->second;
-            p->second.first = last+1;
-          }
-        } else {
-          // *p is to the _right_ of [first,last]
-          p = parent->dirty_old_rstat.lower_bound(first);
-          // new *p last is >= first
-          if (p->second.first <= last &&  // new *p isn't also to the right, and
-              p->first >= first) {        // it intersects our first bit,
-            dout(10) << " staying to the right of [" << p->second.first << "," << p->first << "]..." << dendl;
-            first = p->first+1;
-          }
-          dout(10) << " projecting to new dirty_old_rstat [" << first << "," << last << "]" << dendl;
-        }
-      }
-      dout(20) << " projecting to dirty_old_rstat [" << first << "," << last << "]" << dendl;
-      parent->dirty_old_rstat[last].first = first;
-      prstat = &parent->dirty_old_rstat[last].rstat;
-    }
-
-    // apply
-    dout(20) << " project to [" << first << "," << last << "] " << *prstat << dendl;
-    assert(last >= first);
-    prstat->add(delta);
-    if (update_inode)
-      inode.accounted_rstat = inode.rstat;
-    dout(20) << " result [" << first << "," << last << "] " << *prstat << " " << *parent << dendl;
-
-    last = first-1;
-  }
-}
-
-void MDCache::project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat,
-                                          snapid_t ofirst, snapid_t last,
-                                          CInode *pin, bool cow_head)
-{
-  dout(10) << "project_rstat_frag_to_inode [" << ofirst << "," << last << "]" << dendl;
-  dout(20) << " frag rstat " << rstat << dendl;
-  dout(20) << " frag accounted_rstat " << accounted_rstat << dendl;
-  nest_info_t delta = rstat;
-  delta.sub(accounted_rstat);
-  dout(20) << " delta " << delta << dendl;
-
-  while (last >= ofirst) {
-    inode_t *pi;
-    snapid_t first;
-    if (last == pin->last) {
-      pi = pin->get_projected_inode();
-      first = MAX(ofirst, pin->first);
-      if (first > pin->first) {
-        old_inode_t& old = pin->cow_old_inode(first-1, cow_head);
-        dout(20) << " cloned old_inode rstat is " << old.inode.rstat << dendl;
-      }
-    } else {
-      if (last >= pin->first) {
-        first = pin->first;
-        pin->cow_old_inode(last, cow_head);
-      } else {
-        // our life is easier here because old_inodes is not sparse
-        // (although it may not begin at snapid 1)
-        compact_map<snapid_t, old_inode_t>::iterator p = pin->old_inodes.lower_bound(last);
-        if (p == pin->old_inodes.end()) {
-          dout(10) << " no old_inode <= " << last << ", done." << dendl;
-          break;
-        }
-        first = p->second.first;
-        if (first > last) {
-          dout(10) << " oldest old_inode is [" << first << "," << p->first << "], done." << dendl;
-          //assert(p == pin->old_inodes.begin());
-          break;
-        }
-        if (p->first > last) {
-          dout(10) << " splitting right old_inode [" << first << "," << p->first << "] to ["
-                   << (last+1) << "," << p->first << "]" << dendl;
-          pin->old_inodes[last] = p->second;
-          p->second.first = last+1;
-          pin->dirty_old_rstats.insert(p->first);
-        }
-      }
-      if (first < ofirst) {
-        dout(10) << " splitting left old_inode [" << first << "," << last << "] to ["
-                 << first << "," << ofirst-1 << "]" << dendl;
-        pin->old_inodes[ofirst-1] = pin->old_inodes[last];
-        pin->dirty_old_rstats.insert(ofirst-1);
-        pin->old_inodes[last].first = first = ofirst;
-      }
-      pi = &pin->old_inodes[last].inode;
-      pin->dirty_old_rstats.insert(last);
-    }
-    dout(20) << " projecting to [" << first << "," << last << "] " << pi->rstat << dendl;
-    pi->rstat.add(delta);
-    dout(20) << " result [" << first << "," << last << "] " << pi->rstat << dendl;
-
-    last = first-1;
-  }
-}
-
-void MDCache::broadcast_quota_to_client(CInode *in)
-{
-  if (!in->is_auth() || in->is_frozen())
-    return;
-
-  inode_t *i = in->get_projected_inode();
-
-  if (!i->quota.is_enable())
-    return;
-
-  for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
-       it != in->client_caps.end();
-       ++it) {
-    Session *session = mds->get_session(it->first);
-    if (!session || !session->connection ||
-        !session->connection->has_feature(CEPH_FEATURE_MDS_QUOTA))
-      continue;
-
-    Capability *cap = it->second;
-    if (cap->last_rbytes == i->rstat.rbytes &&
-        cap->last_rsize == i->rstat.rsize())
-      continue;
-
-    if (i->quota.max_files > 0) {
-      if (i->rstat.rsize() >= i->quota.max_files)
-        goto update;
-
-      if ((abs(cap->last_rsize - i->quota.max_files) >> 4) <
-          abs(cap->last_rsize - i->rstat.rsize()))
-        goto update;
-    }
-
-    if (i->quota.max_bytes > 0) {
-      if (i->rstat.rbytes > i->quota.max_bytes - (i->quota.max_bytes >> 3))
-        goto update;
-
-      if ((abs(cap->last_rbytes - i->quota.max_bytes) >> 4) <
-          abs(cap->last_rbytes - i->rstat.rbytes))
-        goto update;
-    }
-
-    continue;
-
-update:
-    cap->last_rsize = i->rstat.rsize();
-    cap->last_rbytes = i->rstat.rbytes;
-
-    MClientQuota *msg = new MClientQuota();
-    msg->ino = in->ino();
-    msg->rstat = i->rstat;
-    msg->quota = i->quota;
-    mds->send_message_client_counted(msg, session->connection);
-  }
-  for (const auto &it : in->get_replicas()) {
-    MGatherCaps *msg = new MGatherCaps;
-    msg->ino = in->ino();
-    mds->send_message_mds(msg, it.first);
-  }
-}
-
-/*
- * NOTE: we _have_ to delay the scatter if we are called during a
- * rejoin, because we can't twiddle locks between when the
- * rejoin_(weak|strong) is received and when we send the rejoin_ack.
- * normally, this isn't a problem: a recover mds doesn't twiddle locks
- * (no requests), and a survivor acks immediately. _except_ that
- * during rejoin_(weak|strong) processing, we may complete a lock
- * gather, and do a scatter_writebehind.. and we _can't_ twiddle the
- * scatterlock state in that case or the lock states will get out of
- * sync between the auth and replica.
- *
- * the simple solution is to never do the scatter here. instead, put
- * the scatterlock on a list if it isn't already wrlockable. this is
- * probably the best plan anyway, since we avoid too many
- * scatters/locks under normal usage.
- */
-/*
- * some notes on dirlock/nestlock scatterlock semantics:
- *
- * the fragstat (dirlock) will never be updated without
- * dirlock+nestlock wrlock held by the caller.
- *
- * the rstat (nestlock) _may_ get updated without a wrlock when nested
- * data is pushed up the tree. this could be changed with some
this could be changed with some
- * restructuring here, but in its current form we ensure that the
- * fragstat+rstat _always_ reflect an accurate summation over the dir
- * frag, which is nice.  and, we only need to track frags that need to
- * be nudged (and not inodes with pending rstat changes that need to
- * be pushed into the frag).  a consequence of this is that the
- * accounted_rstat on scatterlock sync may not match our current
- * rstat.  this is normal and expected.
- */
-void MDCache::predirty_journal_parents(MutationRef mut, EMetaBlob *blob,
-                                       CInode *in, CDir *parent,
-                                       int flags, int linkunlink,
-                                       snapid_t cfollows)
-{
-  bool primary_dn = flags & PREDIRTY_PRIMARY;
-  bool do_parent_mtime = flags & PREDIRTY_DIR;
-  bool shallow = flags & PREDIRTY_SHALLOW;
-
-  assert(mds->mdlog->entry_is_open());
-
-  // make sure stamp is set
-  if (mut->get_mds_stamp() == utime_t())
-    mut->set_mds_stamp(ceph_clock_now());
-
-  if (in->is_base())
-    return;
-
-  dout(10) << "predirty_journal_parents"
-           << (do_parent_mtime ? " do_parent_mtime":"")
-           << " linkunlink=" << linkunlink
-           << (primary_dn ? " primary_dn":" remote_dn")
-           << (shallow ? " SHALLOW":"")
-           << " follows " << cfollows
-           << " " << *in << dendl;
-
-  if (!parent) {
-    assert(primary_dn);
-    parent = in->get_projected_parent_dn()->get_dir();
-  }
-
-  if (flags == 0 && linkunlink == 0) {
-    dout(10) << " no flags/linkunlink, just adding dir context to blob(s)" << dendl;
-    blob->add_dir_context(parent);
-    return;
-  }
-
-  // build list of inodes to wrlock, dirty, and update
-  list<CInode*> lsi;
-  CInode *cur = in;
-  CDentry *parentdn = NULL;
-  bool first = true;
-  while (parent) {
-    //assert(cur->is_auth() || !primary_dn);  // this breaks the rename auth twiddle hack
-    assert(parent->is_auth());
-
-    // opportunistically adjust parent dirfrag
-    CInode *pin = parent->get_inode();
-
-    // inode -> dirfrag
-    mut->auth_pin(parent);
-    mut->add_projected_fnode(parent);
-
-    fnode_t *pf = parent->project_fnode();
-    pf->version = parent->pre_dirty();
-
-    if (do_parent_mtime || linkunlink) {
-      assert(mut->wrlocks.count(&pin->filelock));
-      assert(mut->wrlocks.count(&pin->nestlock));
-      assert(cfollows == CEPH_NOSNAP);
-
-      // update stale fragstat/rstat?
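-#if 0
-      // Illustrative sketch (editor's addition, not in the original source):
-      // the fragstat/accounted_fragstat pairing behind the resync+update
-      // below.  A dirfrag accumulates stats in fragstat; accounted_fragstat
-      // records what has already been folded into the parent inode's
-      // dirstat, so propagation only ever applies the delta.  toy_stats is
-      // a hypothetical stand-in for frag_info_t.
-      struct toy_stats { long nfiles, nsubdirs; };
-      void toy_propagate(toy_stats& fragstat, toy_stats& accounted,
-                         toy_stats& parent_dirstat) {
-        parent_dirstat.nfiles   += fragstat.nfiles   - accounted.nfiles;
-        parent_dirstat.nsubdirs += fragstat.nsubdirs - accounted.nsubdirs;
-        accounted = fragstat;   // the delta is now accounted for
-      }
-#endif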
- parent->resync_accounted_fragstat(); - parent->resync_accounted_rstat(); - - if (do_parent_mtime) { - pf->fragstat.mtime = mut->get_op_stamp(); - pf->fragstat.change_attr++; - dout(10) << "predirty_journal_parents bumping change_attr to " << pf->fragstat.change_attr << " on " << parent << dendl; - if (pf->fragstat.mtime > pf->rstat.rctime) { - dout(10) << "predirty_journal_parents updating mtime on " << *parent << dendl; - pf->rstat.rctime = pf->fragstat.mtime; - } else { - dout(10) << "predirty_journal_parents updating mtime UNDERWATER on " << *parent << dendl; - } - } - if (linkunlink) { - dout(10) << "predirty_journal_parents updating size on " << *parent << dendl; - if (in->is_dir()) { - pf->fragstat.nsubdirs += linkunlink; - //pf->rstat.rsubdirs += linkunlink; - } else { - pf->fragstat.nfiles += linkunlink; - //pf->rstat.rfiles += linkunlink; - } - } - } - - // rstat - if (!primary_dn) { - // don't update parent this pass - } else if (!linkunlink && !(pin->nestlock.can_wrlock(-1) && - pin->versionlock.can_wrlock())) { - dout(20) << " unwritable parent nestlock " << pin->nestlock - << ", marking dirty rstat on " << *cur << dendl; - cur->mark_dirty_rstat(); - } else { - // if we don't hold a wrlock reference on this nestlock, take one, - // because we are about to write into the dirfrag fnode and that needs - // to commit before the lock can cycle. - if (linkunlink) { - assert(pin->nestlock.get_num_wrlocks() || mut->is_slave()); - } - - if (mut->wrlocks.count(&pin->nestlock) == 0) { - dout(10) << " taking wrlock on " << pin->nestlock << " on " << *pin << dendl; - mds->locker->wrlock_force(&pin->nestlock, mut); - } - - // now we can project the inode rstat diff the dirfrag - SnapRealm *prealm = pin->find_snaprealm(); - - snapid_t follows = cfollows; - if (follows == CEPH_NOSNAP) - follows = prealm->get_newest_seq(); - - snapid_t first = follows+1; - - // first, if the frag is stale, bring it back in sync. - parent->resync_accounted_rstat(); - - // now push inode rstats into frag - project_rstat_inode_to_frag(cur, parent, first, linkunlink, prealm); - cur->clear_dirty_rstat(); - } - - bool stop = false; - if (!pin->is_auth() || (!mut->is_auth_pinned(pin) && !pin->can_auth_pin())) { - dout(10) << "predirty_journal_parents !auth or ambig or can't authpin on " << *pin << dendl; - stop = true; - } - - // delay propagating until later? - if (!stop && !first && - g_conf->mds_dirstat_min_interval > 0) { - double since_last_prop = mut->get_mds_stamp() - pin->last_dirstat_prop; - if (since_last_prop < g_conf->mds_dirstat_min_interval) { - dout(10) << "predirty_journal_parents last prop " << since_last_prop - << " < " << g_conf->mds_dirstat_min_interval - << ", stopping" << dendl; - stop = true; - } else { - dout(10) << "predirty_journal_parents last prop " << since_last_prop << " ago, continuing" << dendl; - } - } - - // can cast only because i'm passing nowait=true in the sole user - MDRequestRef mdmut = static_cast(mut.get()); - if (!stop && - mut->wrlocks.count(&pin->nestlock) == 0 && - (!pin->versionlock.can_wrlock() || // make sure we can take versionlock, too - //true - !mds->locker->wrlock_start(&pin->nestlock, mdmut, true) - )) { // ** do not initiate.. see above comment ** - dout(10) << "predirty_journal_parents can't wrlock one of " << pin->versionlock << " or " << pin->nestlock - << " on " << *pin << dendl; - stop = true; - } - if (stop) { - dout(10) << "predirty_journal_parents stop. 
marking nestlock on " << *pin << dendl; - mds->locker->mark_updated_scatterlock(&pin->nestlock); - mut->ls->dirty_dirfrag_nest.push_back(&pin->item_dirty_dirfrag_nest); - mut->add_updated_lock(&pin->nestlock); - if (do_parent_mtime || linkunlink) { - mds->locker->mark_updated_scatterlock(&pin->filelock); - mut->ls->dirty_dirfrag_dir.push_back(&pin->item_dirty_dirfrag_dir); - mut->add_updated_lock(&pin->filelock); - } - break; - } - if (!mut->wrlocks.count(&pin->versionlock)) - mds->locker->local_wrlock_grab(&pin->versionlock, mut); - - assert(mut->wrlocks.count(&pin->nestlock) || - mut->is_slave()); - - pin->last_dirstat_prop = mut->get_mds_stamp(); - - // dirfrag -> diri - mut->auth_pin(pin); - mut->add_projected_inode(pin); - lsi.push_front(pin); - - pin->pre_cow_old_inode(); // avoid cow mayhem! - - inode_t *pi = pin->project_inode(); - pi->version = pin->pre_dirty(); - - // dirstat - if (do_parent_mtime || linkunlink) { - dout(20) << "predirty_journal_parents add_delta " << pf->fragstat << dendl; - dout(20) << "predirty_journal_parents - " << pf->accounted_fragstat << dendl; - bool touched_mtime = false, touched_chattr = false; - pi->dirstat.add_delta(pf->fragstat, pf->accounted_fragstat, &touched_mtime, &touched_chattr); - pf->accounted_fragstat = pf->fragstat; - if (touched_mtime) - pi->mtime = pi->ctime = pi->dirstat.mtime; - if (touched_chattr) - pi->change_attr = pi->dirstat.change_attr; - dout(20) << "predirty_journal_parents gives " << pi->dirstat << " on " << *pin << dendl; - - if (parent->get_frag() == frag_t()) { // i.e., we are the only frag - if (pi->dirstat.size() < 0) - assert(!"negative dirstat size" == g_conf->mds_verify_scatter); - if (pi->dirstat.size() != pf->fragstat.size()) { - mds->clog->error() << "unmatched fragstat size on single dirfrag " - << parent->dirfrag() << ", inode has " << pi->dirstat - << ", dirfrag has " << pf->fragstat; - - // trust the dirfrag for now - pi->dirstat = pf->fragstat; - - assert(!"unmatched fragstat size" == g_conf->mds_verify_scatter); - } - } - } - - /* - * the rule here is to follow the _oldest_ parent with dirty rstat - * data. if we don't propagate all data, we add ourselves to the - * nudge list. that way all rstat data will (eventually) get - * pushed up the tree. - * - * actually, no. for now, silently drop rstats for old parents. we need - * hard link backpointers to do the above properly. - */ - - // stop? - if (pin->is_base()) - break; - parentdn = pin->get_projected_parent_dn(); - assert(parentdn); - - // rstat - dout(10) << "predirty_journal_parents frag->inode on " << *parent << dendl; - - // first, if the frag is stale, bring it back in sync. 
- parent->resync_accounted_rstat(); - - if (g_conf->mds_snap_rstat) { - for (compact_map::iterator p = parent->dirty_old_rstat.begin(); - p != parent->dirty_old_rstat.end(); - ++p) - project_rstat_frag_to_inode(p->second.rstat, p->second.accounted_rstat, p->second.first, - p->first, pin, true);//false); - } - parent->dirty_old_rstat.clear(); - project_rstat_frag_to_inode(pf->rstat, pf->accounted_rstat, parent->first, CEPH_NOSNAP, pin, true);//false); - - pf->accounted_rstat = pf->rstat; - - if (parent->get_frag() == frag_t()) { // i.e., we are the only frag - if (pi->rstat.rbytes != pf->rstat.rbytes) { - mds->clog->error() << "unmatched rstat rbytes on single dirfrag " - << parent->dirfrag() << ", inode has " << pi->rstat - << ", dirfrag has " << pf->rstat; - - // trust the dirfrag for now - pi->rstat = pf->rstat; - - assert(!"unmatched rstat rbytes" == g_conf->mds_verify_scatter); - } - } - - parent->check_rstats(); - broadcast_quota_to_client(pin); - // next parent! - cur = pin; - parent = parentdn->get_dir(); - linkunlink = 0; - do_parent_mtime = false; - primary_dn = true; - first = false; - } - - // now, stick it in the blob - assert(parent); - assert(parent->is_auth()); - blob->add_dir_context(parent); - blob->add_dir(parent, true); - for (list::iterator p = lsi.begin(); - p != lsi.end(); - ++p) { - CInode *cur = *p; - journal_dirty_inode(mut.get(), blob, cur); - } - -} - - - - - -// =================================== -// slave requests - - -/* - * some handlers for master requests with slaves. we need to make - * sure slaves journal commits before we forget we mastered them and - * remove them from the uncommitted_masters map (used during recovery - * to commit|abort slaves). - */ -struct C_MDC_CommittedMaster : public MDCacheLogContext { - metareqid_t reqid; - C_MDC_CommittedMaster(MDCache *s, metareqid_t r) : MDCacheLogContext(s), reqid(r) {} - void finish(int r) override { - mdcache->_logged_master_commit(reqid); - } -}; - -void MDCache::log_master_commit(metareqid_t reqid) -{ - dout(10) << "log_master_commit " << reqid << dendl; - uncommitted_masters[reqid].committing = true; - mds->mdlog->start_submit_entry(new ECommitted(reqid), - new C_MDC_CommittedMaster(this, reqid)); -} - -void MDCache::_logged_master_commit(metareqid_t reqid) -{ - dout(10) << "_logged_master_commit " << reqid << dendl; - assert(uncommitted_masters.count(reqid)); - uncommitted_masters[reqid].ls->uncommitted_masters.erase(reqid); - mds->queue_waiters(uncommitted_masters[reqid].waiters); - uncommitted_masters.erase(reqid); -} - -// while active... - -void MDCache::committed_master_slave(metareqid_t r, mds_rank_t from) -{ - dout(10) << "committed_master_slave mds." << from << " on " << r << dendl; - assert(uncommitted_masters.count(r)); - uncommitted_masters[r].slaves.erase(from); - if (!uncommitted_masters[r].recovering && uncommitted_masters[r].slaves.empty()) - log_master_commit(r); -} - -void MDCache::logged_master_update(metareqid_t reqid) -{ - dout(10) << "logged_master_update " << reqid << dendl; - assert(uncommitted_masters.count(reqid)); - uncommitted_masters[reqid].safe = true; - if (pending_masters.count(reqid)) { - pending_masters.erase(reqid); - if (pending_masters.empty()) - process_delayed_resolve(); - } -} - -/* - * Master may crash after receiving all slaves' commit acks, but before journalling - * the final commit. Slaves may crash after journalling the slave commit, but before - * sending commit ack to the master. Commit masters with no uncommitted slave when - * resolve finishes. 
- */ -void MDCache::finish_committed_masters() -{ - for (map::iterator p = uncommitted_masters.begin(); - p != uncommitted_masters.end(); - ++p) { - p->second.recovering = false; - if (!p->second.committing && p->second.slaves.empty()) { - dout(10) << "finish_committed_masters " << p->first << dendl; - log_master_commit(p->first); - } - } -} - -/* - * at end of resolve... we must journal a commit|abort for all slave - * updates, before moving on. - * - * this is so that the master can safely journal ECommitted on ops it - * masters when it reaches up:active (all other recovering nodes must - * complete resolve before that happens). - */ -struct C_MDC_SlaveCommit : public MDCacheLogContext { - mds_rank_t from; - metareqid_t reqid; - C_MDC_SlaveCommit(MDCache *c, int f, metareqid_t r) : MDCacheLogContext(c), from(f), reqid(r) {} - void finish(int r) override { - mdcache->_logged_slave_commit(from, reqid); - } -}; - -void MDCache::_logged_slave_commit(mds_rank_t from, metareqid_t reqid) -{ - dout(10) << "_logged_slave_commit from mds." << from << " " << reqid << dendl; - - // send a message - MMDSSlaveRequest *req = new MMDSSlaveRequest(reqid, 0, MMDSSlaveRequest::OP_COMMITTED); - mds->send_message_mds(req, from); -} - - - - - - -// ==================================================================== -// import map, recovery - -void MDCache::_move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent, - map >& subtrees) -{ - if (subtrees.count(oldparent)) { - vector& v = subtrees[oldparent]; - dout(10) << " removing " << df << " from " << oldparent << " bounds " << v << dendl; - for (vector::iterator it = v.begin(); it != v.end(); ++it) - if (*it == df) { - v.erase(it); - break; - } - } - if (subtrees.count(newparent)) { - vector& v = subtrees[newparent]; - dout(10) << " adding " << df << " to " << newparent << " bounds " << v << dendl; - v.push_back(df); - } -} - -ESubtreeMap *MDCache::create_subtree_map() -{ - dout(10) << "create_subtree_map " << num_subtrees() << " subtrees, " - << num_subtrees_fullauth() << " fullauth" - << dendl; - - show_subtrees(); - - ESubtreeMap *le = new ESubtreeMap(); - mds->mdlog->_start_entry(le); - - map dirs_to_add; - - if (myin) { - CDir* mydir = myin->get_dirfrag(frag_t()); - dirs_to_add[mydir->dirfrag()] = mydir; - } - - // include all auth subtrees, and their bounds. - // and a spanning tree to tie it to the root. - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - - // journal subtree as "ours" if we are - // me, -2 - // me, me - // me, !me (may be importing and ambiguous!) 
- - // so not - // !me, * - if (dir->get_dir_auth().first != mds->get_nodeid()) - continue; - - if (migrator->is_ambiguous_import(dir->dirfrag()) || - my_ambiguous_imports.count(dir->dirfrag())) { - dout(15) << " ambig subtree " << *dir << dendl; - le->ambiguous_subtrees.insert(dir->dirfrag()); - } else { - dout(15) << " subtree " << *dir << dendl; - } - - dirs_to_add[dir->dirfrag()] = dir; - le->subtrees[dir->dirfrag()].clear(); - - - // bounds - for (set::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CDir *bound = *q; - dout(15) << " subtree bound " << *bound << dendl; - dirs_to_add[bound->dirfrag()] = bound; - le->subtrees[dir->dirfrag()].push_back(bound->dirfrag()); - } - } - - // apply projected renames - for (map > >::iterator p = projected_subtree_renames.begin(); - p != projected_subtree_renames.end(); - ++p) { - for (list >::iterator q = p->second.begin(); q != p->second.end(); ++q) { - CInode *diri = p->first; - CDir *olddir = q->first; - CDir *newdir = q->second; - dout(10) << " adjusting for projected rename of " << *diri << " to " << *newdir << dendl; - - list dfls; - diri->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { - CDir *dir = *p; - dout(10) << "dirfrag " << dir->dirfrag() << " " << *dir << dendl; - CDir *oldparent = get_projected_subtree_root(olddir); - dout(10) << " old parent " << oldparent->dirfrag() << " " << *oldparent << dendl; - CDir *newparent = get_projected_subtree_root(newdir); - dout(10) << " new parent " << newparent->dirfrag() << " " << *newparent << dendl; - - if (oldparent == newparent) { - dout(10) << "parent unchanged for " << dir->dirfrag() << " at " - << oldparent->dirfrag() << dendl; - continue; - } - - if (dir->is_subtree_root()) { - if (le->subtrees.count(newparent->dirfrag()) && - oldparent->get_dir_auth() != newparent->get_dir_auth()) - dirs_to_add[dir->dirfrag()] = dir; - // children are fine. change parent. - _move_subtree_map_bound(dir->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(), - le->subtrees); - } else { - // mid-subtree. - - if (oldparent->get_dir_auth() != newparent->get_dir_auth()) { - dout(10) << " creating subtree for " << dir->dirfrag() << dendl; - // if oldparent is auth, subtree is mine; include it. - if (le->subtrees.count(oldparent->dirfrag())) { - dirs_to_add[dir->dirfrag()] = dir; - le->subtrees[dir->dirfrag()].clear(); - } - // if newparent is auth, subtree is a new bound - if (le->subtrees.count(newparent->dirfrag())) { - dirs_to_add[dir->dirfrag()] = dir; - le->subtrees[newparent->dirfrag()].push_back(dir->dirfrag()); // newparent is auth; new bound - } - newparent = dir; - } - - // see if any old bounds move to the new parent. - for (set::iterator p = subtrees[oldparent].begin(); - p != subtrees[oldparent].end(); - ++p) { - CDir *bound = *p; - if (dir->contains(bound->get_parent_dir())) - _move_subtree_map_bound(bound->dirfrag(), oldparent->dirfrag(), newparent->dirfrag(), - le->subtrees); - } - } - } - } - } - - // simplify the journaled map. our in memory map may have more - // subtrees than needed due to migrations that are just getting - // started or just completing. but on replay, the "live" map will - // be simple and we can do a straight comparison. 
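-#if 0
-  // Illustrative sketch (editor's addition, not in the original source):
-  // the simplification performed by the loop below, shown standalone on a
-  // toy map.  If an unambiguous subtree B appears as a bound of subtree A,
-  // A swallows B: A inherits B's bounds and B's own entry is dropped; the
-  // appended bounds are rescanned, so chains of nested subtrees collapse.
-  void toy_simplify(map<dirfrag_t, vector<dirfrag_t> >& subtrees) {
-    for (auto& s : subtrees) {
-      unsigned i = 0;
-      while (i < s.second.size()) {
-        auto q = subtrees.find(s.second[i]);
-        if (q != subtrees.end() && q->first != s.first) {
-          s.second.insert(s.second.end(), q->second.begin(), q->second.end());
-          subtrees.erase(q);                     // drop the swallowed subtree
-          s.second.erase(s.second.begin() + i);  // drop it as a bound, rescan
-        } else {
-          ++i;
-        }
-      }
-    }
-  }
-#endif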
-  for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = le->subtrees.begin(); p != le->subtrees.end(); ++p) {
-    if (le->ambiguous_subtrees.count(p->first))
-      continue;
-    unsigned i = 0;
-    while (i < p->second.size()) {
-      dirfrag_t b = p->second[i];
-      if (le->subtrees.count(b) &&
-          le->ambiguous_subtrees.count(b) == 0) {
-        vector<dirfrag_t>& bb = le->subtrees[b];
-        dout(10) << "simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
-        for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
-          p->second.push_back(*r);
-        dirs_to_add.erase(b);
-        le->subtrees.erase(b);
-        p->second.erase(p->second.begin() + i);
-      } else {
-        ++i;
-      }
-    }
-  }
-
-  for (auto p : dirs_to_add) {
-    CDir *dir = p.second;
-    le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
-    le->metablob.add_dir(dir, false);
-  }
-
-  dout(15) << " subtrees " << le->subtrees << dendl;
-  dout(15) << " ambiguous_subtrees " << le->ambiguous_subtrees << dendl;
-
-  //le->metablob.print(cout);
-  le->expire_pos = mds->mdlog->journaler->get_expire_pos();
-  return le;
-}
-
-void MDCache::dump_resolve_status(Formatter *f) const
-{
-  f->open_object_section("resolve_status");
-  f->dump_stream("resolve_gather") << resolve_gather;
-  f->dump_stream("resolve_ack_gather") << resolve_ack_gather;
-  f->close_section();
-}
-
-void MDCache::resolve_start(MDSInternalContext *resolve_done_)
-{
-  dout(10) << "resolve_start" << dendl;
-  assert(!resolve_done);
-  resolve_done.reset(resolve_done_);
-
-  if (mds->mdsmap->get_root() != mds->get_nodeid()) {
-    // if we don't have the root dir, adjust it to UNKNOWN.  during
-    // resolve we want mds0 to explicitly claim the portion of it that
-    // it owns, so that anything beyond its bounds gets left as
-    // unknown.
-    CDir *rootdir = root->get_dirfrag(frag_t());
-    if (rootdir)
-      adjust_subtree_auth(rootdir, CDIR_AUTH_UNKNOWN);
-  }
-  resolve_gather = recovery_set;
-}
-
-void MDCache::send_resolves()
-{
-  send_slave_resolves();
-  if (!resolve_ack_gather.empty()) {
-    dout(10) << "send_resolves still waiting for resolve ack from ("
-             << resolve_ack_gather << ")" << dendl;
-    return;
-  }
-  if (!need_resolve_rollback.empty()) {
-    dout(10) << "send_resolves still waiting for rollback to commit on ("
-             << need_resolve_rollback << ")" << dendl;
-    return;
-  }
-  send_subtree_resolves();
-}
-
-void MDCache::send_slave_resolves()
-{
-  dout(10) << "send_slave_resolves" << dendl;
-
-  map<mds_rank_t, MMDSResolve*> resolves;
-
-  if (mds->is_resolve()) {
-    for (map<mds_rank_t, map<metareqid_t, MDSlaveUpdate*> >::iterator p = uncommitted_slave_updates.begin();
-         p != uncommitted_slave_updates.end();
-         ++p) {
-      resolves[p->first] = new MMDSResolve;
-      for (map<metareqid_t, MDSlaveUpdate*>::iterator q = p->second.begin();
-           q != p->second.end();
-           ++q) {
-        dout(10) << " including uncommitted " << q->first << dendl;
-        resolves[p->first]->add_slave_request(q->first, false);
-      }
-    }
-  } else {
-    set<mds_rank_t> resolve_set;
-    mds->mdsmap->get_mds_set(resolve_set, MDSMap::STATE_RESOLVE);
-    for (ceph::unordered_map<metareqid_t, MDRequestRef>::iterator p = active_requests.begin();
-         p != active_requests.end();
-         ++p) {
-      MDRequestRef& mdr = p->second;
-      if (!mdr->is_slave())
-        continue;
-      if (!mdr->slave_did_prepare() && !mdr->committing) {
-        continue;
-      }
-      mds_rank_t master = mdr->slave_to_mds;
-      if (resolve_set.count(master) || is_ambiguous_slave_update(p->first, master)) {
-        dout(10) << " including uncommitted " << *mdr << dendl;
-        if (!resolves.count(master))
-          resolves[master] = new MMDSResolve;
-        if (!mdr->committing &&
-            mdr->has_more() && mdr->more()->is_inode_exporter) {
-          // re-send cap exports
-          CInode *in = mdr->more()->rename_inode;
-          map<client_t, Capability::Export> cap_map;
-          in->export_client_caps(cap_map);
-          bufferlist bl;
-          ::encode(in->ino(), bl);
-          ::encode(cap_map, bl);
-          resolves[master]->add_slave_request(p->first, bl);
-        } else {
-          resolves[master]->add_slave_request(p->first, mdr->committing);
-        }
-      }
-    }
-  }
-
-  for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
-       p != resolves.end();
-       ++p) {
-    dout(10) << "sending slave resolve to mds." << p->first << dendl;
-    mds->send_message_mds(p->second, p->first);
-    resolve_ack_gather.insert(p->first);
-  }
-}
-
-void MDCache::send_subtree_resolves()
-{
-  dout(10) << "send_subtree_resolves" << dendl;
-
-  if (migrator->is_exporting() || migrator->is_importing()) {
-    dout(7) << "send_subtree_resolves waiting, imports/exports still in progress" << dendl;
-    migrator->show_importing();
-    migrator->show_exporting();
-    resolves_pending = true;
-    return;  // not now
-  }
-
-  map<mds_rank_t, MMDSResolve*> resolves;
-  for (set<mds_rank_t>::iterator p = recovery_set.begin();
-       p != recovery_set.end();
-       ++p) {
-    if (*p == mds->get_nodeid())
-      continue;
-    if (mds->is_resolve() || mds->mdsmap->is_resolve(*p))
-      resolves[*p] = new MMDSResolve;
-  }
-
-  map<dirfrag_t, vector<dirfrag_t> > my_subtrees;
-  map<dirfrag_t, vector<dirfrag_t> > my_ambig_imports;
-
-  // known
-  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
-       p != subtrees.end();
-       ++p) {
-    CDir *dir = p->first;
-
-    // only our subtrees
-    if (dir->authority().first != mds->get_nodeid())
-      continue;
-
-    if (mds->is_resolve() && my_ambiguous_imports.count(dir->dirfrag()))
-      continue;  // we'll add it below
-
-    if (migrator->is_ambiguous_import(dir->dirfrag())) {
-      // ambiguous (mid-import)
-      set<CDir*> bounds;
-      get_subtree_bounds(dir, bounds);
-      vector<dirfrag_t> dfls;
-      for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
-        dfls.push_back((*q)->dirfrag());
-
-      my_ambig_imports[dir->dirfrag()] = dfls;
-      dout(10) << " ambig " << dir->dirfrag() << " " << dfls << dendl;
-    } else {
-      // not ambiguous.
-      for (map<mds_rank_t, MMDSResolve*>::iterator q = resolves.begin();
-           q != resolves.end();
-           ++q)
-        resolves[q->first]->add_subtree(dir->dirfrag());
-      // bounds too
-      vector<dirfrag_t> dfls;
-      for (set<CDir*>::iterator q = subtrees[dir].begin();
-           q != subtrees[dir].end();
-           ++q) {
-        CDir *bound = *q;
-        dfls.push_back(bound->dirfrag());
-      }
-
-      my_subtrees[dir->dirfrag()] = dfls;
-      dout(10) << " claim " << dir->dirfrag() << " " << dfls << dendl;
-    }
-  }
-
-  // ambiguous
-  for (map<dirfrag_t, vector<dirfrag_t> >::iterator p = my_ambiguous_imports.begin();
-       p != my_ambiguous_imports.end();
-       ++p) {
-    my_ambig_imports[p->first] = p->second;
-    dout(10) << " ambig " << p->first << " " << p->second << dendl;
-  }
-
-  // simplify the claimed subtrees.
-  for (auto p = my_subtrees.begin(); p != my_subtrees.end(); ++p) {
-    unsigned i = 0;
-    while (i < p->second.size()) {
-      dirfrag_t b = p->second[i];
-      if (my_subtrees.count(b)) {
-        vector<dirfrag_t>& bb = my_subtrees[b];
-        dout(10) << " simplify: " << p->first << " swallowing " << b << " with bounds " << bb << dendl;
-        for (vector<dirfrag_t>::iterator r = bb.begin(); r != bb.end(); ++r)
-          p->second.push_back(*r);
-        my_subtrees.erase(b);
-        p->second.erase(p->second.begin() + i);
-      } else {
-        ++i;
-      }
-    }
-  }
-
-  // send
-  for (map<mds_rank_t, MMDSResolve*>::iterator p = resolves.begin();
-       p != resolves.end();
-       ++p) {
-    MMDSResolve* m = p->second;
-    m->subtrees = my_subtrees;
-    m->ambiguous_imports = my_ambig_imports;
-    dout(10) << "sending subtree resolve to mds." << p->first << dendl;
-    mds->send_message_mds(m, p->first);
-  }
-  resolves_pending = false;
-}
-
-void MDCache::handle_mds_failure(mds_rank_t who)
-{
-  dout(7) << "handle_mds_failure mds." << who << dendl;
-
-  dout(1) << "handle_mds_failure mds."
<< who << " : recovery peers are " << recovery_set << dendl; - - resolve_gather.insert(who); - discard_delayed_resolve(who); - ambiguous_slave_updates.erase(who); - - rejoin_gather.insert(who); - rejoin_sent.erase(who); // i need to send another - rejoin_ack_sent.erase(who); // i need to send another - rejoin_ack_gather.erase(who); // i'll need/get another. - - dout(10) << " resolve_gather " << resolve_gather << dendl; - dout(10) << " resolve_ack_gather " << resolve_ack_gather << dendl; - dout(10) << " rejoin_sent " << rejoin_sent << dendl; - dout(10) << " rejoin_gather " << rejoin_gather << dendl; - dout(10) << " rejoin_ack_gather " << rejoin_ack_gather << dendl; - - - // tell the migrator too. - migrator->handle_mds_failure_or_stop(who); - - // tell the balancer too. - mds->balancer->handle_mds_failure(who); - - // clean up any requests slave to/from this node - list finish; - for (ceph::unordered_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - MDRequestRef& mdr = p->second; - // slave to the failed node? - if (mdr->slave_to_mds == who) { - if (mdr->slave_did_prepare()) { - dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl; - if (is_ambiguous_slave_update(p->first, mdr->slave_to_mds)) - remove_ambiguous_slave_update(p->first, mdr->slave_to_mds); - - if (!mdr->more()->waiting_on_slave.empty()) { - assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid()); - // will rollback, no need to wait - if (mdr->slave_request) { - mdr->slave_request->put(); - mdr->slave_request = 0; - } - mdr->more()->waiting_on_slave.clear(); - } - } else if (!mdr->committing) { - dout(10) << " slave request " << *mdr << " has no prepare, finishing up" << dendl; - if (mdr->slave_request || mdr->slave_rolling_back()) - mdr->aborted = true; - else - finish.push_back(mdr); - } - } - - if (mdr->is_slave() && mdr->slave_did_prepare()) { - if (mdr->more()->waiting_on_slave.count(who)) { - assert(mdr->more()->srcdn_auth_mds == mds->get_nodeid()); - dout(10) << " slave request " << *mdr << " no longer need rename notity ack from mds." - << who << dendl; - mdr->more()->waiting_on_slave.erase(who); - if (mdr->more()->waiting_on_slave.empty() && mdr->slave_request) - mds->queue_waiter(new C_MDS_RetryRequest(this, mdr)); - } - - if (mdr->more()->srcdn_auth_mds == who && - mds->mdsmap->is_clientreplay_or_active_or_stopping(mdr->slave_to_mds)) { - // rename srcdn's auth mds failed, resolve even I'm a survivor. - dout(10) << " slave request " << *mdr << " uncommitted, will resolve shortly" << dendl; - add_ambiguous_slave_update(p->first, mdr->slave_to_mds); - } - } else if (mdr->slave_request) { - MMDSSlaveRequest *slave_req = mdr->slave_request; - // FIXME: Slave rename request can arrive after we notice mds failure. - // This can cause mds to crash (does not affect integrity of FS). - if (slave_req->get_op() == MMDSSlaveRequest::OP_RENAMEPREP && - slave_req->srcdn_auth == who) - slave_req->mark_interrupted(); - } - - // failed node is slave? - if (mdr->is_master() && !mdr->committing) { - if (mdr->more()->srcdn_auth_mds == who) { - dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds." 
- << who << " to recover" << dendl; - assert(mdr->more()->witnessed.count(who) == 0); - if (mdr->more()->is_ambiguous_auth) - mdr->clear_ambiguous_auth(); - // rename srcdn's auth mds failed, all witnesses will rollback - mdr->more()->witnessed.clear(); - pending_masters.erase(p->first); - } - - if (mdr->more()->witnessed.count(who)) { - mds_rank_t srcdn_auth = mdr->more()->srcdn_auth_mds; - if (srcdn_auth >= 0 && mdr->more()->waiting_on_slave.count(srcdn_auth)) { - dout(10) << " master request " << *mdr << " waiting for rename srcdn's auth mds." - << mdr->more()->srcdn_auth_mds << " to reply" << dendl; - // waiting for the slave (rename srcdn's auth mds), delay sending resolve ack - // until either the request is committing or the slave also fails. - assert(mdr->more()->waiting_on_slave.size() == 1); - pending_masters.insert(p->first); - } else { - dout(10) << " master request " << *mdr << " no longer witnessed by slave mds." - << who << " to recover" << dendl; - if (srcdn_auth >= 0) - assert(mdr->more()->witnessed.count(srcdn_auth) == 0); - - // discard this peer's prepare (if any) - mdr->more()->witnessed.erase(who); - } - } - - if (mdr->more()->waiting_on_slave.count(who)) { - dout(10) << " master request " << *mdr << " waiting for slave mds." << who - << " to recover" << dendl; - // retry request when peer recovers - mdr->more()->waiting_on_slave.erase(who); - if (mdr->more()->waiting_on_slave.empty()) - mds->wait_for_active_peer(who, new C_MDS_RetryRequest(this, mdr)); - } - - if (mdr->locking && mdr->locking_target_mds == who) - mdr->finish_locking(mdr->locking); - } - } - - for (map::iterator p = uncommitted_masters.begin(); - p != uncommitted_masters.end(); - ++p) { - // The failed MDS may have already committed the slave update - if (p->second.slaves.count(who)) { - p->second.recovering = true; - p->second.slaves.erase(who); - } - } - - while (!finish.empty()) { - dout(10) << "cleaning up slave request " << *finish.front() << dendl; - request_finish(finish.front()); - finish.pop_front(); - } - - kick_find_ino_peers(who); - kick_open_ino_peers(who); - - for (map::iterator p = fragments.begin(); - p != fragments.end(); ) { - dirfrag_t df = p->first; - fragment_info_t& info = p->second; - ++p; - if (info.is_fragmenting()) - continue; - dout(10) << "cancelling fragment " << df << " bit " << info.bits << dendl; - list dirs; - info.dirs.swap(dirs); - fragments.erase(df); - fragment_unmark_unfreeze_dirs(dirs); - } - - // MDCache::shutdown_export_strays() always exports strays to mds.0 - if (who == mds_rank_t(0)) - shutdown_exported_strays.clear(); - - show_subtrees(); -} - -/* - * handle_mds_recovery - called on another node's transition - * from resolve -> active. - */ -void MDCache::handle_mds_recovery(mds_rank_t who) -{ - dout(7) << "handle_mds_recovery mds." << who << dendl; - - // exclude all discover waiters. 
kick_discovers() will do the job
-  static const uint64_t i_mask = CInode::WAIT_ANY_MASK & ~CInode::WAIT_DIR;
-  static const uint64_t d_mask = CDir::WAIT_ANY_MASK & ~CDir::WAIT_DENTRY;
-
-  list<MDSInternalContextBase*> waiters;
-
-  // wake up any waiters in their subtrees
-  for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin();
-       p != subtrees.end();
-       ++p) {
-    CDir *dir = p->first;
-
-    if (dir->authority().first != who ||
-        dir->authority().second == mds->get_nodeid())
-      continue;
-    assert(!dir->is_auth());
-
-    // wake any waiters
-    list<CDir*> q;
-    q.push_back(dir);
-
-    while (!q.empty()) {
-      CDir *d = q.front();
-      q.pop_front();
-      d->take_waiting(d_mask, waiters);
-
-      // inode waiters too
-      for (CDir::map_t::iterator p = d->items.begin();
-           p != d->items.end();
-           ++p) {
-        CDentry *dn = p->second;
-        CDentry::linkage_t *dnl = dn->get_linkage();
-        if (dnl->is_primary()) {
-          dnl->get_inode()->take_waiting(i_mask, waiters);
-
-          // recurse?
-          list<CDir*> ls;
-          dnl->get_inode()->get_dirfrags(ls);
-          for (list<CDir*>::iterator p = ls.begin();
-               p != ls.end();
-               ++p) {
-            CDir *subdir = *p;
-            if (!subdir->is_subtree_root())
-              q.push_back(subdir);
-          }
-        }
-      }
-    }
-  }
-
-  kick_open_ino_peers(who);
-  kick_find_ino_peers(who);
-
-  // queue them up.
-  mds->queue_waiters(waiters);
-}
-
-void MDCache::set_recovery_set(set<mds_rank_t>& s)
-{
-  dout(7) << "set_recovery_set " << s << dendl;
-  recovery_set = s;
-}
-
-
-/*
- * during resolve state, we share resolves to determine who
- * is authoritative for which trees.  we expect to get a resolve
- * from _everyone_ in the recovery_set (the mds cluster at the time of
- * the first failure).
- *
- * This function puts the passed message before returning
- */
-void MDCache::handle_resolve(MMDSResolve *m)
-{
-  dout(7) << "handle_resolve from " << m->get_source() << dendl;
-  mds_rank_t from = mds_rank_t(m->get_source().num());
-
-  if (mds->get_state() < MDSMap::STATE_RESOLVE) {
-    if (mds->get_want_state() == CEPH_MDS_STATE_RESOLVE) {
-      mds->wait_for_resolve(new C_MDS_RetryMessage(mds, m));
-      return;
-    }
-    // wait until we reach the resolve stage!
-    m->put();
-    return;
-  }
-
-  discard_delayed_resolve(from);
-
-  // ambiguous slave requests?
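-#if 0
-  // Illustrative sketch (editor's addition, not in the original source):
-  // the commit/abort rule applied to the sender's ambiguous slave updates
-  // below.  The master's journal is authoritative: if the request is still
-  // in uncommitted_masters, the master journaled the update before failing
-  // and the slave must COMMIT; otherwise it was never logged and the slave
-  // must ABORT and roll back.  Hypothetical standalone helper:
-  bool toy_should_commit(const set<metareqid_t>& uncommitted_master_reqs,
-                         metareqid_t reqid) {
-    return uncommitted_master_reqs.count(reqid) > 0;
-  }
-#endif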
- if (!m->slave_requests.empty()) { - if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) { - for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) { - if (uncommitted_masters.count(p->first) && !uncommitted_masters[p->first].safe) { - assert(!p->second.committing); - pending_masters.insert(p->first); - } - } - - if (!pending_masters.empty()) { - dout(10) << " still have pending updates, delay processing slave resolve" << dendl; - delayed_resolve[from] = m; - return; - } - } - - MMDSResolveAck *ack = new MMDSResolveAck; - for (auto p = m->slave_requests.begin(); p != m->slave_requests.end(); ++p) { - if (uncommitted_masters.count(p->first)) { //mds->sessionmap.have_completed_request(p->first)) { - // COMMIT - if (p->second.committing) { - // already committing, waiting for the OP_COMMITTED slave reply - dout(10) << " already committing slave request " << *p << " noop "<< dendl; - } else { - dout(10) << " ambiguous slave request " << *p << " will COMMIT" << dendl; - ack->add_commit(p->first); - } - uncommitted_masters[p->first].slaves.insert(from); // wait for slave OP_COMMITTED before we log ECommitted - - if (p->second.inode_caps.length() > 0) { - // slave wants to export caps (rename) - assert(mds->is_resolve()); - - inodeno_t ino; - map cap_exports; - bufferlist::iterator q = p->second.inode_caps.begin(); - ::decode(ino, q); - ::decode(cap_exports, q); - - assert(get_inode(ino)); - - for (map::iterator q = cap_exports.begin(); - q != cap_exports.end(); - ++q) { - Capability::Import& im = rejoin_imported_caps[from][ino][q->first]; - im.cap_id = ++last_cap_id; // assign a new cap ID - im.issue_seq = 1; - im.mseq = q->second.mseq; - } - - // will process these caps in rejoin stage - rejoin_slave_exports[ino].first = from; - rejoin_slave_exports[ino].second.swap(cap_exports); - - // send information of imported caps back to slave - ::encode(rejoin_imported_caps[from][ino], ack->commit[p->first]); - } - } else { - // ABORT - dout(10) << " ambiguous slave request " << *p << " will ABORT" << dendl; - assert(!p->second.committing); - ack->add_abort(p->first); - } - } - mds->send_message(ack, m->get_connection()); - m->put(); - return; - } - - if (!resolve_ack_gather.empty() || !need_resolve_rollback.empty()) { - dout(10) << "delay processing subtree resolve" << dendl; - delayed_resolve[from] = m; - return; - } - - bool survivor = false; - // am i a surviving ambiguous importer? - if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) { - survivor = true; - // check for any import success/failure (from this node) - map >::iterator p = my_ambiguous_imports.begin(); - while (p != my_ambiguous_imports.end()) { - map >::iterator next = p; - ++next; - CDir *dir = get_dirfrag(p->first); - assert(dir); - dout(10) << "checking ambiguous import " << *dir << dendl; - if (migrator->is_importing(dir->dirfrag()) && - migrator->get_import_peer(dir->dirfrag()) == from) { - assert(migrator->get_import_state(dir->dirfrag()) == Migrator::IMPORT_ACKING); - - // check if sender claims the subtree - bool claimed_by_sender = false; - for (map >::iterator q = m->subtrees.begin(); - q != m->subtrees.end(); - ++q) { - // an ambiguous import won't race with a refragmentation; it's appropriate to force here. - CDir *base = get_force_dirfrag(q->first, false); - if (!base || !base->contains(dir)) - continue; // base not dir or an ancestor of dir, clearly doesn't claim dir. 
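-#if 0
-      // Illustrative sketch (editor's addition, not in the original
-      // source): the containment test this loop performs.  The sender
-      // claims dir iff one of its claimed subtrees contains dir and no
-      // bound of that subtree also contains dir, i.e. dir lies strictly
-      // inside the bounded region.  Hypothetical standalone helper, using
-      // the same CDir::contains() ancestry test as the code below:
-      bool toy_claims(CDir *base, const set<CDir*>& bounds, CDir *dir) {
-        if (!base || !base->contains(dir))
-          return false;              // not under the claimed base at all
-        for (CDir *bound : bounds)
-          if (bound->contains(dir))
-            return false;            // at/under a bound: outside the region
-        return true;
-      }
-#endif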
-
-        bool inside = true;
-        set<CDir*> bounds;
-        get_force_dirfrag_bound_set(q->second, bounds);
-        for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) {
-          CDir *bound = *p;
-          if (bound->contains(dir)) {
-            inside = false;  // nope, bound is dir or parent of dir, not inside.
-            break;
-          }
-        }
-        if (inside)
-          claimed_by_sender = true;
-      }
-
-      my_ambiguous_imports.erase(p);  // no longer ambiguous.
-      if (claimed_by_sender) {
-        dout(7) << "ambiguous import failed on " << *dir << dendl;
-        migrator->import_reverse(dir);
-      } else {
-        dout(7) << "ambiguous import succeeded on " << *dir << dendl;
-        migrator->import_finish(dir, true);
-      }
-    }
-    p = next;
-  }
-  }
-
-  // update my dir_auth values
-  // need to do this on recovering nodes _and_ bystanders (to resolve ambiguous
-  // migrations between other nodes)
-  for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->subtrees.begin();
-       pi != m->subtrees.end();
-       ++pi) {
-    dout(10) << "peer claims " << pi->first << " bounds " << pi->second << dendl;
-    CDir *dir = get_force_dirfrag(pi->first, !survivor);
-    if (!dir)
-      continue;
-    adjust_bounded_subtree_auth(dir, pi->second, from);
-    try_subtree_merge(dir);
-  }
-
-  show_subtrees();
-
-  // note ambiguous imports too
-  for (map<dirfrag_t, vector<dirfrag_t> >::iterator pi = m->ambiguous_imports.begin();
-       pi != m->ambiguous_imports.end();
-       ++pi) {
-    dout(10) << "noting ambiguous import on " << pi->first << " bounds " << pi->second << dendl;
-    other_ambiguous_imports[from][pi->first].swap( pi->second );
-  }
-
-  // did i get them all?
-  resolve_gather.erase(from);
-
-  maybe_resolve_finish();
-
-  m->put();
-}
-
-void MDCache::process_delayed_resolve()
-{
-  dout(10) << "process_delayed_resolve" << dendl;
-  map<mds_rank_t, MMDSResolve*> tmp;
-  tmp.swap(delayed_resolve);
-  for (map<mds_rank_t, MMDSResolve*>::iterator p = tmp.begin(); p != tmp.end(); ++p)
-    handle_resolve(p->second);
-}
-
-void MDCache::discard_delayed_resolve(mds_rank_t who)
-{
-  if (delayed_resolve.count(who)) {
-    delayed_resolve[who]->put();
-    delayed_resolve.erase(who);
-  }
-}
-
-void MDCache::maybe_resolve_finish()
-{
-  assert(resolve_ack_gather.empty());
-  assert(need_resolve_rollback.empty());
-
-  if (!resolve_gather.empty()) {
-    dout(10) << "maybe_resolve_finish still waiting for resolves ("
-             << resolve_gather << ")" << dendl;
-    return;
-  }
-
-  dout(10) << "maybe_resolve_finish got all resolves+resolve_acks, done."
-           << dendl;
-  disambiguate_my_imports();
-  finish_committed_masters();
-
-  if (resolve_done) {
-    assert(mds->is_resolve());
-    trim_unlinked_inodes();
-    recalc_auth_bits(false);
-    resolve_done.release()->complete(0);
-  } else {
-    maybe_send_pending_rejoins();
-  }
-}
-
-/* This function puts the passed message before returning */
-void MDCache::handle_resolve_ack(MMDSResolveAck *ack)
-{
-  dout(10) << "handle_resolve_ack " << *ack << " from " << ack->get_source() << dendl;
-  mds_rank_t from = mds_rank_t(ack->get_source().num());
-
-  if (!resolve_ack_gather.count(from) ||
-      mds->mdsmap->get_state(from) < MDSMap::STATE_RESOLVE) {
-    ack->put();
-    return;
-  }
-
-  if (ambiguous_slave_updates.count(from)) {
-    assert(mds->mdsmap->is_clientreplay_or_active_or_stopping(from));
-    assert(mds->is_clientreplay() || mds->is_active() || mds->is_stopping());
-  }
-
-  for (map<metareqid_t, bufferlist>::iterator p = ack->commit.begin();
-       p != ack->commit.end();
-       ++p) {
-    dout(10) << " commit on slave " << p->first << dendl;
-
-    if (ambiguous_slave_updates.count(from)) {
-      remove_ambiguous_slave_update(p->first, from);
-      continue;
-    }
-
-    if (mds->is_resolve()) {
-      // replay
-      MDSlaveUpdate *su = get_uncommitted_slave_update(p->first, from);
-      assert(su);
-
-      // log commit
-      mds->mdlog->start_submit_entry(new ESlaveUpdate(mds->mdlog, "unknown", p->first, from,
-                                                      ESlaveUpdate::OP_COMMIT, su->origop),
-                                     new C_MDC_SlaveCommit(this, from, p->first));
-      mds->mdlog->flush();
-
-      finish_uncommitted_slave_update(p->first, from);
-    } else {
-      MDRequestRef mdr = request_get(p->first);
-      // information about master imported caps
-      if (p->second.length() > 0)
-        mdr->more()->inode_import.claim(p->second);
-
-      assert(mdr->slave_request == 0);  // shouldn't be doing anything!
-      request_finish(mdr);
-    }
-  }
-
-  for (vector<metareqid_t>::iterator p = ack->abort.begin();
-       p != ack->abort.end();
-       ++p) {
-    dout(10) << " abort on slave " << *p << dendl;
-
-    if (mds->is_resolve()) {
-      MDSlaveUpdate *su = get_uncommitted_slave_update(*p, from);
-      assert(su);
-
-      // perform rollback (and journal a rollback entry)
-      // note: this will hold up the resolve a bit, until the rollback entries journal.
-      MDRequestRef null_ref;
-      switch (su->origop) {
-      case ESlaveUpdate::LINK:
-        mds->server->do_link_rollback(su->rollback, from, null_ref);
-        break;
-      case ESlaveUpdate::RENAME:
-        mds->server->do_rename_rollback(su->rollback, from, null_ref);
-        break;
-      case ESlaveUpdate::RMDIR:
-        mds->server->do_rmdir_rollback(su->rollback, from, null_ref);
-        break;
-      default:
-        ceph_abort();
-      }
-    } else {
-      MDRequestRef mdr = request_get(*p);
-      mdr->aborted = true;
-      if (mdr->slave_request) {
-        if (mdr->slave_did_prepare())  // journaling slave prepare ?
- add_rollback(*p, from); - } else { - request_finish(mdr); - } - } - } - - if (!ambiguous_slave_updates.count(from)) - resolve_ack_gather.erase(from); - if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) { - send_subtree_resolves(); - process_delayed_resolve(); - } - - ack->put(); -} - -void MDCache::add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate *su) -{ - assert(uncommitted_slave_updates[master].count(reqid) == 0); - uncommitted_slave_updates[master][reqid] = su; - for(set::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) - uncommitted_slave_rename_olddir[*p]++; - for(set::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) - uncommitted_slave_unlink[*p]++; -} - -void MDCache::finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master) -{ - assert(uncommitted_slave_updates[master].count(reqid)); - MDSlaveUpdate* su = uncommitted_slave_updates[master][reqid]; - - uncommitted_slave_updates[master].erase(reqid); - if (uncommitted_slave_updates[master].empty()) - uncommitted_slave_updates.erase(master); - // discard the non-auth subtree we renamed out of - for(set::iterator p = su->olddirs.begin(); p != su->olddirs.end(); ++p) { - CInode *diri = *p; - map::iterator it = uncommitted_slave_rename_olddir.find(diri); - assert(it != uncommitted_slave_rename_olddir.end()); - it->second--; - if (it->second == 0) { - uncommitted_slave_rename_olddir.erase(it); - list ls; - diri->get_dirfrags(ls); - for (list::iterator q = ls.begin(); q != ls.end(); ++q) { - CDir *root = get_subtree_root(*q); - if (root->get_dir_auth() == CDIR_AUTH_UNDEF) { - try_trim_non_auth_subtree(root); - if (*q != root) - break; - } - } - } else - assert(it->second > 0); - } - // removed the inodes that were unlinked by slave update - for(set::iterator p = su->unlinked.begin(); p != su->unlinked.end(); ++p) { - CInode *in = *p; - map::iterator it = uncommitted_slave_unlink.find(in); - assert(it != uncommitted_slave_unlink.end()); - it->second--; - if (it->second == 0) { - uncommitted_slave_unlink.erase(it); - if (!in->get_projected_parent_dn()) - mds->mdcache->remove_inode_recursive(in); - } else - assert(it->second > 0); - } - delete su; -} - -MDSlaveUpdate* MDCache::get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master) -{ - - MDSlaveUpdate* su = NULL; - if (uncommitted_slave_updates.count(master) && - uncommitted_slave_updates[master].count(reqid)) { - su = uncommitted_slave_updates[master][reqid]; - assert(su); - } - return su; -} - -void MDCache::finish_rollback(metareqid_t reqid) { - assert(need_resolve_rollback.count(reqid)); - if (mds->is_resolve()) - finish_uncommitted_slave_update(reqid, need_resolve_rollback[reqid]); - need_resolve_rollback.erase(reqid); - if (resolve_ack_gather.empty() && need_resolve_rollback.empty()) { - send_subtree_resolves(); - process_delayed_resolve(); - } -} - -void MDCache::disambiguate_other_imports() -{ - dout(10) << "disambiguate_other_imports" << dendl; - - bool recovering = !(mds->is_clientreplay() || mds->is_active() || mds->is_stopping()); - // other nodes' ambiguous imports - for (map > >::iterator p = other_ambiguous_imports.begin(); - p != other_ambiguous_imports.end(); - ++p) { - mds_rank_t who = p->first; - dout(10) << "ambiguous imports for mds." 
<< who << dendl; - - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - dout(10) << " ambiguous import " << q->first << " bounds " << q->second << dendl; - // an ambiguous import will not race with a refragmentation; it's appropriate to force here. - CDir *dir = get_force_dirfrag(q->first, recovering); - if (!dir) continue; - - if (dir->is_ambiguous_auth() || // works for me_ambig or if i am a surviving bystander - dir->authority() == CDIR_AUTH_UNDEF) { // resolving - dout(10) << " mds." << who << " did import " << *dir << dendl; - adjust_bounded_subtree_auth(dir, q->second, who); - try_subtree_merge(dir); - } else { - dout(10) << " mds." << who << " did not import " << *dir << dendl; - } - } - } - other_ambiguous_imports.clear(); -} - -void MDCache::disambiguate_my_imports() -{ - dout(10) << "disambiguate_my_imports" << dendl; - - if (!mds->is_resolve()) { - assert(my_ambiguous_imports.empty()); - return; - } - - disambiguate_other_imports(); - - // my ambiguous imports - mds_authority_t me_ambig(mds->get_nodeid(), mds->get_nodeid()); - while (!my_ambiguous_imports.empty()) { - map >::iterator q = my_ambiguous_imports.begin(); - - CDir *dir = get_dirfrag(q->first); - assert(dir); - - if (dir->authority() != me_ambig) { - dout(10) << "ambiguous import auth known, must not be me " << *dir << dendl; - cancel_ambiguous_import(dir); - - mds->mdlog->start_submit_entry(new EImportFinish(dir, false)); - - // subtree may have been swallowed by another node claiming dir - // as their own. - CDir *root = get_subtree_root(dir); - if (root != dir) - dout(10) << " subtree root is " << *root << dendl; - assert(root->dir_auth.first != mds->get_nodeid()); // no us! - try_trim_non_auth_subtree(root); - } else { - dout(10) << "ambiguous import auth unclaimed, must be me " << *dir << dendl; - finish_ambiguous_import(q->first); - mds->mdlog->start_submit_entry(new EImportFinish(dir, true)); - } - } - assert(my_ambiguous_imports.empty()); - mds->mdlog->flush(); - - // verify all my subtrees are unambiguous! 
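-#if 0
-  // Illustrative sketch (editor's addition, not in the original source):
-  // what "ambiguous" means in the check below.  dir_auth is a
-  // (first,second) pair of ranks; the second member is only set while a
-  // migration is in flight (exporter,importer) and is otherwise the
-  // "unknown" sentinel (see CDIR_AUTH_UNKNOWN).  After disambiguation,
-  // every subtree must again name a single authoritative rank.
-  bool toy_is_ambiguous(const mds_authority_t& dir_auth) {
-    return dir_auth.second != CDIR_AUTH_UNKNOWN;  // both ends set => mid-migration
-  }
-#endif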
- for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - if (dir->is_ambiguous_dir_auth()) { - dout(0) << "disambiguate_imports uh oh, dir_auth is still ambiguous for " << *dir << dendl; - } - assert(!dir->is_ambiguous_dir_auth()); - } - - show_subtrees(); -} - - -void MDCache::add_ambiguous_import(dirfrag_t base, const vector& bounds) -{ - assert(my_ambiguous_imports.count(base) == 0); - my_ambiguous_imports[base] = bounds; -} - - -void MDCache::add_ambiguous_import(CDir *base, const set& bounds) -{ - // make a list - vector binos; - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) - binos.push_back((*p)->dirfrag()); - - // note: this can get called twice if the exporter fails during recovery - if (my_ambiguous_imports.count(base->dirfrag())) - my_ambiguous_imports.erase(base->dirfrag()); - - add_ambiguous_import(base->dirfrag(), binos); -} - -void MDCache::cancel_ambiguous_import(CDir *dir) -{ - dirfrag_t df = dir->dirfrag(); - assert(my_ambiguous_imports.count(df)); - dout(10) << "cancel_ambiguous_import " << df - << " bounds " << my_ambiguous_imports[df] - << " " << *dir - << dendl; - my_ambiguous_imports.erase(df); -} - -void MDCache::finish_ambiguous_import(dirfrag_t df) -{ - assert(my_ambiguous_imports.count(df)); - vector bounds; - bounds.swap(my_ambiguous_imports[df]); - my_ambiguous_imports.erase(df); - - dout(10) << "finish_ambiguous_import " << df - << " bounds " << bounds - << dendl; - CDir *dir = get_dirfrag(df); - assert(dir); - - // adjust dir_auth, import maps - adjust_bounded_subtree_auth(dir, bounds, mds->get_nodeid()); - try_subtree_merge(dir); -} - -void MDCache::remove_inode_recursive(CInode *in) -{ - dout(10) << "remove_inode_recursive " << *in << dendl; - list ls; - in->get_dirfrags(ls); - list::iterator p = ls.begin(); - while (p != ls.end()) { - CDir *subdir = *p++; - - dout(10) << " removing dirfrag " << subdir << dendl; - CDir::map_t::iterator q = subdir->items.begin(); - while (q != subdir->items.end()) { - CDentry *dn = q->second; - ++q; - CDentry::linkage_t *dnl = dn->get_linkage(); - if (dnl->is_primary()) { - CInode *tin = dnl->get_inode(); - subdir->unlink_inode(dn, false); - remove_inode_recursive(tin); - } - subdir->remove_dentry(dn); - } - - if (subdir->is_subtree_root()) - remove_subtree(subdir); - in->close_dirfrag(subdir->dirfrag().frag); - } - remove_inode(in); -} - -bool MDCache::expire_recursive( - CInode *in, - map& expiremap) -{ - assert(!in->is_auth()); - - dout(10) << __func__ << ":" << *in << dendl; - - // Recurse into any dirfrags beneath this inode - list ls; - in->get_dirfrags(ls); - for (auto subdir : ls) { - if (!in->is_mdsdir() && subdir->is_subtree_root()) { - dout(10) << __func__ << ": stray still has subtree " << *in << dendl; - return true; - } - - for (auto &it : subdir->items) { - CDentry *dn = it.second; - CDentry::linkage_t *dnl = dn->get_linkage(); - if (dnl->is_primary()) { - CInode *tin = dnl->get_inode(); - - /* Remote strays with linkage (i.e. 
hardlinks) should not be - * expired, because they may be the target of - * a rename() as the owning MDS shuts down */ - if (!tin->is_stray() && tin->inode.nlink) { - dout(10) << __func__ << ": stray still has linkage " << *tin << dendl; - return true; - } - - const bool abort = expire_recursive(tin, expiremap); - if (abort) { - return true; - } - } - if (dn->lru_is_expireable()) { - trim_dentry(dn, expiremap); - } else { - dout(10) << __func__ << ": stray dn is not expireable " << *dn << dendl; - return true; - } - } - } - - return false; -} - -void MDCache::trim_unlinked_inodes() -{ - dout(7) << "trim_unlinked_inodes" << dendl; - list q; - for (ceph::unordered_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - if (in->get_parent_dn() == NULL && !in->is_base()) { - dout(7) << " will trim from " << *in << dendl; - q.push_back(in); - } - } - for (list::iterator p = q.begin(); p != q.end(); ++p) - remove_inode_recursive(*p); -} - -/** recalc_auth_bits() - * once subtree auth is disambiguated, we need to adjust all the - * auth and dirty bits in our cache before moving on. - */ -void MDCache::recalc_auth_bits(bool replay) -{ - dout(7) << "recalc_auth_bits " << (replay ? "(replay)" : "") << dendl; - - if (root) { - root->inode_auth.first = mds->mdsmap->get_root(); - bool auth = mds->get_nodeid() == root->inode_auth.first; - if (auth) { - root->state_set(CInode::STATE_AUTH); - } else { - root->state_clear(CInode::STATE_AUTH); - if (!replay) - root->state_set(CInode::STATE_REJOINING); - } - } - - set subtree_inodes; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - if (p->first->dir_auth.first == mds->get_nodeid()) - subtree_inodes.insert(p->first->inode); - } - - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - if (p->first->inode->is_mdsdir()) { - CInode *in = p->first->inode; - bool auth = in->ino() == MDS_INO_MDSDIR(mds->get_nodeid()); - if (auth) { - in->state_set(CInode::STATE_AUTH); - } else { - in->state_clear(CInode::STATE_AUTH); - if (!replay) - in->state_set(CInode::STATE_REJOINING); - } - } - - list dfq; // dirfrag queue - dfq.push_back(p->first); - - bool auth = p->first->authority().first == mds->get_nodeid(); - dout(10) << " subtree auth=" << auth << " for " << *p->first << dendl; - - while (!dfq.empty()) { - CDir *dir = dfq.front(); - dfq.pop_front(); - - // dir - if (auth) { - dir->state_set(CDir::STATE_AUTH); - } else { - dir->state_clear(CDir::STATE_AUTH); - if (!replay) { - // close empty non-auth dirfrag - if (!dir->is_subtree_root() && dir->get_num_any() == 0) { - dir->inode->close_dirfrag(dir->get_frag()); - continue; - } - dir->state_set(CDir::STATE_REJOINING); - dir->state_clear(CDir::STATE_COMPLETE); - if (dir->is_dirty()) - dir->mark_clean(); - } - } - - // dentries in this dir - for (CDir::map_t::iterator q = dir->items.begin(); - q != dir->items.end(); - ++q) { - // dn - CDentry *dn = q->second; - CDentry::linkage_t *dnl = dn->get_linkage(); - if (auth) { - dn->state_set(CDentry::STATE_AUTH); - } else { - dn->state_clear(CDentry::STATE_AUTH); - if (!replay) { - dn->state_set(CDentry::STATE_REJOINING); - if (dn->is_dirty()) - dn->mark_clean(); - } - } - - if (dnl->is_primary()) { - // inode - CInode *in = dnl->get_inode(); - if (auth) { - in->state_set(CInode::STATE_AUTH); - } else { - in->state_clear(CInode::STATE_AUTH); - if (!replay) { - in->state_set(CInode::STATE_REJOINING); - if (in->is_dirty()) - in->mark_clean(); - if (in->is_dirty_parent()) - 
in->clear_dirty_parent();
-            // avoid touching scatterlocks for our subtree roots!
-            if (subtree_inodes.count(in) == 0)
-              in->clear_scatter_dirty();
-          }
-        }
-        // recurse?
-        if (in->is_dir())
-          in->get_nested_dirfrags(dfq);
-      }
-    }
-  }
-  }
-
-  show_subtrees();
-  show_cache();
-}
-
-
-
-// ===========================================================================
-// REJOIN
-
-/*
- * notes on scatterlock recovery:
- *
- * - recovering inode replica sends scatterlock data for any subtree
- *   roots (the only ones that are possibly dirty).
- *
- * - surviving auth incorporates any provided scatterlock data.  any
- *   pending gathers are then finished, as with the other lock types.
- *
- * that takes care of surviving auth + (recovering replica)*.
- *
- * - surviving replica sends strong_inode, which includes current
- *   scatterlock state, AND any dirty scatterlock data.  this
- *   provides the recovering auth with everything it might need.
- *
- * - recovering auth must pick initial scatterlock state based on
- *   (weak|strong) rejoins.
- *     - always assimilate scatterlock data (it can't hurt)
- *     - any surviving replica in SCATTER state -> SCATTER.  otherwise, SYNC.
- *     - include base inode in ack for all inodes that saw scatterlock content
- *
- * also, for scatter gather,
- *
- * - auth increments {frag,r}stat.version on completion of any gather.
- *
- * - auth incorporates changes in a gather _only_ if the version
- *   matches.
- *
- * - replica discards changes any time the scatterlock syncs, and
- *   after recovery.
- */
-
-void MDCache::dump_rejoin_status(Formatter *f) const
-{
-  f->open_object_section("rejoin_status");
-  f->dump_stream("rejoin_gather") << rejoin_gather;
-  f->dump_stream("rejoin_ack_gather") << rejoin_ack_gather;
-  f->dump_unsigned("num_opening_inodes", cap_imports_num_opening);
-  f->close_section();
-}
-
-void MDCache::rejoin_start(MDSInternalContext *rejoin_done_)
-{
-  dout(10) << "rejoin_start" << dendl;
-  assert(!rejoin_done);
-  rejoin_done.reset(rejoin_done_);
-
-  rejoin_gather = recovery_set;
-  // need to finish opening cap inodes before sending cache rejoins
-  rejoin_gather.insert(mds->get_nodeid());
-  process_imported_caps();
-}
-
-/*
- * rejoin phase!
- *
- * this initiates rejoin.  it should be called before we get any
- * rejoin or rejoin_ack messages (or else mdsmap distribution is broken).
- *
- * we start out by sending rejoins to everyone in the recovery set.
- *
- * if we are in rejoin state, send for all regions in our cache.
- * if we are active|stopping, send only to nodes that are rejoining.
- */
-void MDCache::rejoin_send_rejoins()
-{
-  dout(10) << "rejoin_send_rejoins with recovery_set " << recovery_set << dendl;
-
-  if (rejoin_gather.count(mds->get_nodeid())) {
-    dout(7) << "rejoin_send_rejoins still processing imported caps, delaying" << dendl;
-    rejoins_pending = true;
-    return;
-  }
-  if (!resolve_gather.empty()) {
-    dout(7) << "rejoin_send_rejoins still waiting for resolves ("
-            << resolve_gather << ")" << dendl;
-    rejoins_pending = true;
-    return;
-  }
-
-  assert(!migrator->is_importing());
-  assert(!migrator->is_exporting());
-
-  if (!mds->is_rejoin()) {
-    disambiguate_other_imports();
-  }
-
-  map<mds_rank_t, MMDSCacheRejoin*> rejoins;
-
-
-  // if i am rejoining, send a rejoin to everyone.
-  // otherwise, just send to others who are rejoining.
-  for (set<mds_rank_t>::iterator p = recovery_set.begin();
-       p != recovery_set.end();
-       ++p) {
-    if (*p == mds->get_nodeid()) continue;  // nothing to myself!
-    if (rejoin_sent.count(*p)) continue;    // already sent a rejoin to this node!
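-#if 0
-    // Illustrative sketch (editor's addition, not in the original source):
-    // the weak/strong choice made just below.  A recovering (rejoin-state)
-    // MDS sends weak rejoins to every peer, declaring only what it holds;
-    // a survivor sends strong rejoins, and only to peers that are
-    // themselves rejoining.  Hypothetical helper:
-    int toy_pick_rejoin_op(bool i_am_rejoining, bool peer_is_rejoining) {
-      if (i_am_rejoining)
-        return MMDSCacheRejoin::OP_WEAK;    // recovering: weak declaration
-      if (peer_is_rejoining)
-        return MMDSCacheRejoin::OP_STRONG;  // survivor -> recovering peer
-      return -1;                            // two survivors: nothing to send
-    }
-#endif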
- if (mds->is_rejoin()) - rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_WEAK); - else if (mds->mdsmap->is_rejoin(*p)) - rejoins[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_STRONG); - } - - if (mds->is_rejoin()) { - map > client_exports; - for (auto p = cap_exports.begin(); p != cap_exports.end(); ++p) { - assert(cap_export_targets.count(p->first)); - mds_rank_t target = cap_export_targets[p->first]; - if (rejoins.count(target) == 0) - continue; - rejoins[target]->cap_exports[p->first] = p->second; - for (auto q = p->second.begin(); q != p->second.end(); ++q) - client_exports[q->first].insert(target); - } - for (map >::iterator p = client_exports.begin(); - p != client_exports.end(); - ++p) { - entity_inst_t inst = mds->sessionmap.get_inst(entity_name_t::CLIENT(p->first.v)); - for (set::iterator q = p->second.begin(); q != p->second.end(); ++q) - rejoins[*q]->client_map[p->first] = inst; - } - } - - - // check all subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - assert(dir->is_subtree_root()); - if (dir->is_ambiguous_dir_auth()) { - // exporter is recovering, importer is survivor. - assert(rejoins.count(dir->authority().first)); - assert(!rejoins.count(dir->authority().second)); - continue; - } - - // my subtree? - if (dir->is_auth()) - continue; // skip my own regions! - - mds_rank_t auth = dir->get_dir_auth().first; - assert(auth >= 0); - if (rejoins.count(auth) == 0) - continue; // don't care about this node's subtrees - - rejoin_walk(dir, rejoins[auth]); - } - - // rejoin root inodes, too - for (map::iterator p = rejoins.begin(); - p != rejoins.end(); - ++p) { - if (mds->is_rejoin()) { - // weak - if (p->first == 0 && root) { - p->second->add_weak_inode(root->vino()); - if (root->is_dirty_scattered()) { - dout(10) << " sending scatterlock state on root " << *root << dendl; - p->second->add_scatterlock_state(root); - } - } - if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) { - if (in) - p->second->add_weak_inode(in->vino()); - } - } else { - // strong - if (p->first == 0 && root) { - p->second->add_strong_inode(root->vino(), - root->get_replica_nonce(), - root->get_caps_wanted(), - root->filelock.get_state(), - root->nestlock.get_state(), - root->dirfragtreelock.get_state()); - root->state_set(CInode::STATE_REJOINING); - if (root->is_dirty_scattered()) { - dout(10) << " sending scatterlock state on root " << *root << dendl; - p->second->add_scatterlock_state(root); - } - } - - if (CInode *in = get_inode(MDS_INO_MDSDIR(p->first))) { - p->second->add_strong_inode(in->vino(), - in->get_replica_nonce(), - in->get_caps_wanted(), - in->filelock.get_state(), - in->nestlock.get_state(), - in->dirfragtreelock.get_state()); - in->state_set(CInode::STATE_REJOINING); - } - } - } - - if (!mds->is_rejoin()) { - // i am survivor. send strong rejoin. 
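-#if 0
-    // Illustrative sketch (editor's addition, not in the original source):
-    // the shape of the re-declaration that follows.  For each active
-    // master request, a survivor re-announces state it holds on the
-    // recovering peer's objects (remote auth pins, xlocks, remote
-    // wrlocks) so the peer can rebuild its pin/lock tables.  toy_decl and
-    // held_on_peer are hypothetical stand-ins for the MDRequest
-    // bookkeeping and the add_*_authpin/xlock calls on the rejoin message.
-    struct toy_decl { metareqid_t reqid; int lock_or_pin_type; };
-    void toy_redeclare(const map<metareqid_t, vector<toy_decl> >& held_on_peer,
-                       vector<toy_decl>& rejoin_payload) {
-      for (const auto& r : held_on_peer)   // each in-flight master request
-        for (const auto& d : r.second)     // each pin/lock held on the peer
-          rejoin_payload.push_back(d);     // re-announce it in the rejoin
-    }
-#endif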
- // note request remote_auth_pins, xlocks - for (ceph::unordered_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - MDRequestRef& mdr = p->second; - if (mdr->is_slave()) - continue; - // auth pins - for (map::iterator q = mdr->remote_auth_pins.begin(); - q != mdr->remote_auth_pins.end(); - ++q) { - if (!q->first->is_auth()) { - assert(q->second == q->first->authority().first); - if (rejoins.count(q->second) == 0) continue; - MMDSCacheRejoin *rejoin = rejoins[q->second]; - - dout(15) << " " << *mdr << " authpin on " << *q->first << dendl; - MDSCacheObjectInfo i; - q->first->set_object_info(i); - if (i.ino) - rejoin->add_inode_authpin(vinodeno_t(i.ino, i.snapid), mdr->reqid, mdr->attempt); - else - rejoin->add_dentry_authpin(i.dirfrag, i.dname, i.snapid, mdr->reqid, mdr->attempt); - - if (mdr->has_more() && mdr->more()->is_remote_frozen_authpin && - mdr->more()->rename_inode == q->first) - rejoin->add_inode_frozen_authpin(vinodeno_t(i.ino, i.snapid), - mdr->reqid, mdr->attempt); - } - } - // xlocks - for (set::iterator q = mdr->xlocks.begin(); - q != mdr->xlocks.end(); - ++q) { - if (!(*q)->get_parent()->is_auth()) { - mds_rank_t who = (*q)->get_parent()->authority().first; - if (rejoins.count(who) == 0) continue; - MMDSCacheRejoin *rejoin = rejoins[who]; - - dout(15) << " " << *mdr << " xlock on " << **q << " " << *(*q)->get_parent() << dendl; - MDSCacheObjectInfo i; - (*q)->get_parent()->set_object_info(i); - if (i.ino) - rejoin->add_inode_xlock(vinodeno_t(i.ino, i.snapid), (*q)->get_type(), - mdr->reqid, mdr->attempt); - else - rejoin->add_dentry_xlock(i.dirfrag, i.dname, i.snapid, - mdr->reqid, mdr->attempt); - } - } - // remote wrlocks - for (map::iterator q = mdr->remote_wrlocks.begin(); - q != mdr->remote_wrlocks.end(); - ++q) { - mds_rank_t who = q->second; - if (rejoins.count(who) == 0) continue; - MMDSCacheRejoin *rejoin = rejoins[who]; - - dout(15) << " " << *mdr << " wrlock on " << q->second - << " " << q->first->get_parent() << dendl; - MDSCacheObjectInfo i; - q->first->get_parent()->set_object_info(i); - assert(i.ino); - rejoin->add_inode_wrlock(vinodeno_t(i.ino, i.snapid), q->first->get_type(), - mdr->reqid, mdr->attempt); - } - } - } - - // send the messages - for (map::iterator p = rejoins.begin(); - p != rejoins.end(); - ++p) { - assert(rejoin_sent.count(p->first) == 0); - assert(rejoin_ack_gather.count(p->first) == 0); - rejoin_sent.insert(p->first); - rejoin_ack_gather.insert(p->first); - mds->send_message_mds(p->second, p->first); - } - rejoin_ack_gather.insert(mds->get_nodeid()); // we need to complete rejoin_gather_finish, too - rejoins_pending = false; - - // nothing? - if (mds->is_rejoin() && rejoins.empty()) { - dout(10) << "nothing to rejoin" << dendl; - rejoin_gather_finish(); - } -} - - -/** - * rejoin_walk - build rejoin declarations for a subtree - * - * @param dir subtree root - * @param rejoin rejoin message - * - * from a rejoining node: - * weak dirfrag - * weak dentries (w/ connectivity) - * - * from a surviving node: - * strong dirfrag - * strong dentries (no connectivity!) 
- * strong inodes - */ -void MDCache::rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin) -{ - dout(10) << "rejoin_walk " << *dir << dendl; - - list nested; // finish this dir, then do nested items - - if (mds->is_rejoin()) { - // WEAK - rejoin->add_weak_dirfrag(dir->dirfrag()); - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - CDentry::linkage_t *dnl = dn->get_linkage(); - dout(15) << " add_weak_primary_dentry " << *dn << dendl; - assert(dnl->is_primary()); - CInode *in = dnl->get_inode(); - assert(dnl->get_inode()->is_dir()); - rejoin->add_weak_primary_dentry(dir->ino(), dn->name.c_str(), dn->first, dn->last, in->ino()); - in->get_nested_dirfrags(nested); - if (in->is_dirty_scattered()) { - dout(10) << " sending scatterlock state on " << *in << dendl; - rejoin->add_scatterlock_state(in); - } - } - } else { - // STRONG - dout(15) << " add_strong_dirfrag " << *dir << dendl; - rejoin->add_strong_dirfrag(dir->dirfrag(), dir->get_replica_nonce(), dir->get_dir_rep()); - dir->state_set(CDir::STATE_REJOINING); - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - CDentry::linkage_t *dnl = dn->get_linkage(); - dout(15) << " add_strong_dentry " << *dn << dendl; - rejoin->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last, - dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0), - dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0), - dnl->is_remote() ? dnl->get_remote_d_type():0, - dn->get_replica_nonce(), - dn->lock.get_state()); - dn->state_set(CDentry::STATE_REJOINING); - if (dnl->is_primary()) { - CInode *in = dnl->get_inode(); - dout(15) << " add_strong_inode " << *in << dendl; - rejoin->add_strong_inode(in->vino(), - in->get_replica_nonce(), - in->get_caps_wanted(), - in->filelock.get_state(), - in->nestlock.get_state(), - in->dirfragtreelock.get_state()); - in->state_set(CInode::STATE_REJOINING); - in->get_nested_dirfrags(nested); - if (in->is_dirty_scattered()) { - dout(10) << " sending scatterlock state on " << *in << dendl; - rejoin->add_scatterlock_state(in); - } - } - } - } - - // recurse into nested dirs - for (list::iterator p = nested.begin(); - p != nested.end(); - ++p) - rejoin_walk(*p, rejoin); -} - - -/* - * i got a rejoin. - * - reply with the lockstate - * - * if i am active|stopping, - * - remove source from replica list for everything not referenced here. - * This function puts the passed message before returning. - */ -void MDCache::handle_cache_rejoin(MMDSCacheRejoin *m) -{ - dout(7) << "handle_cache_rejoin " << *m << " from " << m->get_source() - << " (" << m->get_payload().length() << " bytes)" - << dendl; - - switch (m->op) { - case MMDSCacheRejoin::OP_WEAK: - handle_cache_rejoin_weak(m); - break; - case MMDSCacheRejoin::OP_STRONG: - handle_cache_rejoin_strong(m); - break; - case MMDSCacheRejoin::OP_ACK: - handle_cache_rejoin_ack(m); - break; - - default: - ceph_abort(); - } - m->put(); -} - - -/* - * handle_cache_rejoin_weak - * - * the sender - * - is recovering from their journal. - * - may have incorrect (out of date) inode contents - * - will include weak dirfrag if sender is dirfrag auth and parent inode auth is recipient - * - * if the sender didn't trim_non_auth(), they - * - may have incorrect (out of date) dentry/inode linkage - * - may have deleted/purged inodes - * and i may have to go to disk to get accurate inode contents. yuck. 
- * This function DOES NOT put the passed message before returning
- */
-void MDCache::handle_cache_rejoin_weak(MMDSCacheRejoin *weak)
-{
-  mds_rank_t from = mds_rank_t(weak->get_source().num());
-
-  // possible response(s)
-  MMDSCacheRejoin *ack = 0;        // if survivor
-  set<vinodeno_t> acked_inodes;    // if survivor
-  set<SimpleLock *> gather_locks;  // if survivor
-  bool survivor = false;           // am i a survivor?
-
-  if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping()) {
-    survivor = true;
-    dout(10) << "i am a survivor, and will ack immediately" << dendl;
-    ack = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK);
-
-    map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
-
-    // check cap exports
-    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
-      CInode *in = get_inode(p->first);
-      assert(!in || in->is_auth());
-      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
-	dout(10) << " claiming cap import " << p->first << " client." << q->first << " on " << *in << dendl;
-	Capability *cap = rejoin_import_cap(in, q->first, q->second, from);
-	Capability::Import& im = imported_caps[p->first][q->first];
-	if (cap) {
-	  im.cap_id = cap->get_cap_id();
-	  im.issue_seq = cap->get_last_seq();
-	  im.mseq = cap->get_mseq();
-	} else {
-	  // all are zero
-	}
-      }
-      mds->locker->eval(in, CEPH_CAP_LOCKS, true);
-    }
-
-    ::encode(imported_caps, ack->imported_caps);
-  } else {
-    assert(mds->is_rejoin());
-
-    // we may have already received a strong rejoin from the sender.
-    rejoin_scour_survivor_replicas(from, NULL, acked_inodes, gather_locks);
-    assert(gather_locks.empty());
-
-    // check cap exports.
-    rejoin_client_map.insert(weak->client_map.begin(), weak->client_map.end());
-
-    for (auto p = weak->cap_exports.begin(); p != weak->cap_exports.end(); ++p) {
-      CInode *in = get_inode(p->first);
-      assert(in && in->is_auth());
-      // note
-      for (auto q = p->second.begin(); q != p->second.end(); ++q) {
-	dout(10) << " claiming cap import " << p->first << " client." << q->first << dendl;
-	cap_imports[p->first][q->first][from] = q->second;
-      }
-    }
-  }
-
-  // assimilate any potentially dirty scatterlock state
-  for (map<inodeno_t,MMDSCacheRejoin::lock_bls>::iterator p = weak->inode_scatterlocks.begin();
-       p != weak->inode_scatterlocks.end();
-       ++p) {
-    CInode *in = get_inode(p->first);
-    assert(in);
-    in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file);
-    in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest);
-    in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft);
-    if (!survivor)
-      rejoin_potential_updated_scatterlocks.insert(in);
-  }
-
-  // recovering peer may send incorrect dirfrags here.  we need to
-  // infer which dirfrag they meant.  the ack will include a
-  // strong_dirfrag that will set them straight on the fragmentation.
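-  // for example (hypothetical frags, for illustration only): if the
-  // sender names frag 10* but we have since split it, then
-  // dirfragtree.get_leaves_under(10*) below returns the current leaves
-  // (100*, 101*, ...), and each leaf CDir is replicated/acked instead.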
- - // walk weak map - set dirs_to_share; - for (set::iterator p = weak->weak_dirfrags.begin(); - p != weak->weak_dirfrags.end(); - ++p) { - CInode *diri = get_inode(p->ino); - if (!diri) - dout(0) << " missing dir ino " << p->ino << dendl; - assert(diri); - - list ls; - if (diri->dirfragtree.is_leaf(p->frag)) { - ls.push_back(p->frag); - } else { - diri->dirfragtree.get_leaves_under(p->frag, ls); - if (ls.empty()) - ls.push_back(diri->dirfragtree[p->frag.value()]); - } - for (list::iterator q = ls.begin(); q != ls.end(); ++q) { - frag_t fg = *q; - CDir *dir = diri->get_dirfrag(fg); - if (!dir) { - dout(0) << " missing dir for " << p->frag << " (which maps to " << fg << ") on " << *diri << dendl; - continue; - } - assert(dir); - if (dirs_to_share.count(dir)) { - dout(10) << " already have " << p->frag << " -> " << fg << " " << *dir << dendl; - } else { - dirs_to_share.insert(dir); - unsigned nonce = dir->add_replica(from); - dout(10) << " have " << p->frag << " -> " << fg << " " << *dir << dendl; - if (ack) { - ack->add_strong_dirfrag(dir->dirfrag(), nonce, dir->dir_rep); - ack->add_dirfrag_base(dir); - } - } - } - } - - for (map >::iterator p = weak->weak.begin(); - p != weak->weak.end(); - ++p) { - CInode *diri = get_inode(p->first); - if (!diri) - dout(0) << " missing dir ino " << p->first << dendl; - assert(diri); - - // weak dentries - CDir *dir = 0; - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - // locate proper dirfrag. - // optimize for common case (one dirfrag) to avoid dirs_to_share set check - frag_t fg = diri->pick_dirfrag(q->first.name); - if (!dir || dir->get_frag() != fg) { - dir = diri->get_dirfrag(fg); - if (!dir) - dout(0) << " missing dir frag " << fg << " on " << *diri << dendl; - assert(dir); - assert(dirs_to_share.count(dir)); - } - - // and dentry - CDentry *dn = dir->lookup(q->first.name, q->first.snapid); - assert(dn); - CDentry::linkage_t *dnl = dn->get_linkage(); - assert(dnl->is_primary()); - - if (survivor && dn->is_replica(from)) - dentry_remove_replica(dn, from, gather_locks); - unsigned dnonce = dn->add_replica(from); - dout(10) << " have " << *dn << dendl; - if (ack) - ack->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last, - dnl->get_inode()->ino(), inodeno_t(0), 0, - dnonce, dn->lock.get_replica_state()); - - // inode - CInode *in = dnl->get_inode(); - assert(in); - - if (survivor && in->is_replica(from)) - inode_remove_replica(in, from, true, gather_locks); - unsigned inonce = in->add_replica(from); - dout(10) << " have " << *in << dendl; - - // scatter the dirlock, just in case? - if (!survivor && in->is_dir() && in->has_subtree_root_dirfrag()) - in->filelock.set_state(LOCK_MIX); - - if (ack) { - acked_inodes.insert(in->vino()); - ack->add_inode_base(in, mds->mdsmap->get_up_features()); - bufferlist bl; - in->_encode_locks_state_for_rejoin(bl, from); - ack->add_inode_locks(in, inonce, bl); - } - } - } - - // weak base inodes? (root, stray, etc.) - for (set::iterator p = weak->weak_inodes.begin(); - p != weak->weak_inodes.end(); - ++p) { - CInode *in = get_inode(*p); - assert(in); // hmm fixme wrt stray? 
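-  // base inodes (root, mdsdir, strays) have no parent dentry, so they are
-  // replicated directly here, with the same ack/nonce bookkeeping as the
-  // weak dentries above.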
- if (survivor && in->is_replica(from)) - inode_remove_replica(in, from, true, gather_locks); - unsigned inonce = in->add_replica(from); - dout(10) << " have base " << *in << dendl; - - if (ack) { - acked_inodes.insert(in->vino()); - ack->add_inode_base(in, mds->mdsmap->get_up_features()); - bufferlist bl; - in->_encode_locks_state_for_rejoin(bl, from); - ack->add_inode_locks(in, inonce, bl); - } - } - - assert(rejoin_gather.count(from)); - rejoin_gather.erase(from); - if (survivor) { - // survivor. do everything now. - for (map::iterator p = weak->inode_scatterlocks.begin(); - p != weak->inode_scatterlocks.end(); - ++p) { - CInode *in = get_inode(p->first); - assert(in); - dout(10) << " including base inode (due to potential scatterlock update) " << *in << dendl; - acked_inodes.insert(in->vino()); - ack->add_inode_base(in, mds->mdsmap->get_up_features()); - } - - rejoin_scour_survivor_replicas(from, ack, acked_inodes, gather_locks); - mds->send_message(ack, weak->get_connection()); - - for (set::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) { - if (!(*p)->is_stable()) - mds->locker->eval_gather(*p); - } - } else { - // done? - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; - } - } -} - -class C_MDC_RejoinGatherFinish : public MDCacheContext { -public: - explicit C_MDC_RejoinGatherFinish(MDCache *c) : MDCacheContext(c) {} - void finish(int r) override { - mdcache->rejoin_gather_finish(); - } -}; - -/* - * rejoin_scour_survivor_replica - remove source from replica list on unmentioned objects - * - * all validated replicas are acked with a strong nonce, etc. if that isn't in the - * ack, the replica dne, and we can remove it from our replica maps. - */ -void MDCache::rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack, - set& acked_inodes, - set& gather_locks) -{ - dout(10) << "rejoin_scour_survivor_replicas from mds." << from << dendl; - - for (ceph::unordered_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - - // inode? 
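-    // scour rule: anything this peer replicated but that the ack does not
-    // mention no longer exists on the peer; drop the replica and let the
-    // caller re-evaluate any lock gathers that were waiting on it.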
- if (in->is_auth() && - in->is_replica(from) && - (ack == NULL || acked_inodes.count(p->second->vino()) == 0)) { - inode_remove_replica(in, from, false, gather_locks); - dout(10) << " rem " << *in << dendl; - } - - if (!in->is_dir()) continue; - - list dfs; - in->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); - p != dfs.end(); - ++p) { - CDir *dir = *p; - if (!dir->is_auth()) - continue; - - if (dir->is_replica(from) && - (ack == NULL || ack->strong_dirfrags.count(dir->dirfrag()) == 0)) { - dir->remove_replica(from); - dout(10) << " rem " << *dir << dendl; - } - - // dentries - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - - if (dn->is_replica(from) && - (ack == NULL || - ack->strong_dentries.count(dir->dirfrag()) == 0 || - ack->strong_dentries[dir->dirfrag()].count(string_snap_t(dn->name, dn->last)) == 0)) { - dentry_remove_replica(dn, from, gather_locks); - dout(10) << " rem " << *dn << dendl; - } - } - } - } -} - - -CInode *MDCache::rejoin_invent_inode(inodeno_t ino, snapid_t last) -{ - CInode *in = new CInode(this, true, 1, last); - in->inode.ino = ino; - in->state_set(CInode::STATE_REJOINUNDEF); - add_inode(in); - rejoin_undef_inodes.insert(in); - dout(10) << " invented " << *in << dendl; - return in; -} - -CDir *MDCache::rejoin_invent_dirfrag(dirfrag_t df) -{ - CInode *in = get_inode(df.ino); - if (!in) - in = rejoin_invent_inode(df.ino, CEPH_NOSNAP); - if (!in->is_dir()) { - assert(in->state_test(CInode::STATE_REJOINUNDEF)); - in->inode.mode = S_IFDIR; - in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash; - } - CDir *dir = in->get_or_open_dirfrag(this, df.frag); - dir->state_set(CDir::STATE_REJOINUNDEF); - rejoin_undef_dirfrags.insert(dir); - dout(10) << " invented " << *dir << dendl; - return dir; -} - -/* This functions DOES NOT put the passed message before returning */ -void MDCache::handle_cache_rejoin_strong(MMDSCacheRejoin *strong) -{ - mds_rank_t from = mds_rank_t(strong->get_source().num()); - - // only a recovering node will get a strong rejoin. - assert(mds->is_rejoin()); - - // assimilate any potentially dirty scatterlock state - for (map::iterator p = strong->inode_scatterlocks.begin(); - p != strong->inode_scatterlocks.end(); - ++p) { - CInode *in = get_inode(p->first); - assert(in); - in->decode_lock_state(CEPH_LOCK_IFILE, p->second.file); - in->decode_lock_state(CEPH_LOCK_INEST, p->second.nest); - in->decode_lock_state(CEPH_LOCK_IDFT, p->second.dft); - rejoin_potential_updated_scatterlocks.insert(in); - } - - rejoin_unlinked_inodes[from].clear(); - - // surviving peer may send incorrect dirfrag here (maybe they didn't - // get the fragment notify, or maybe we rolled back?). we need to - // infer the right frag and get them with the program. somehow. - // we don't normally send ACK.. so we'll need to bundle this with - // MISSING or something. - - // strong dirfrags/dentries. - // also process auth_pins, xlocks. 
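-  // processing order: strong dirfrags, then their dentries, then strong
-  // inodes.  objects we no longer hold are invented as REJOINUNDEF
-  // placeholders and fetched later by open_undef_inodes_dirfrags().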
- for (map::iterator p = strong->strong_dirfrags.begin(); - p != strong->strong_dirfrags.end(); - ++p) { - CInode *diri = get_inode(p->first.ino); - if (!diri) - diri = rejoin_invent_inode(p->first.ino, CEPH_NOSNAP); - CDir *dir = diri->get_dirfrag(p->first.frag); - bool refragged = false; - if (dir) { - dout(10) << " have " << *dir << dendl; - } else { - if (diri->state_test(CInode::STATE_REJOINUNDEF)) - dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), frag_t())); - else if (diri->dirfragtree.is_leaf(p->first.frag)) - dir = rejoin_invent_dirfrag(p->first); - } - if (dir) { - dir->add_replica(from, p->second.nonce); - dir->dir_rep = p->second.dir_rep; - } else { - dout(10) << " frag " << p->first << " doesn't match dirfragtree " << *diri << dendl; - list ls; - diri->dirfragtree.get_leaves_under(p->first.frag, ls); - if (ls.empty()) - ls.push_back(diri->dirfragtree[p->first.frag.value()]); - dout(10) << " maps to frag(s) " << ls << dendl; - for (list::iterator q = ls.begin(); q != ls.end(); ++q) { - CDir *dir = diri->get_dirfrag(*q); - if (!dir) - dir = rejoin_invent_dirfrag(dirfrag_t(diri->ino(), *q)); - else - dout(10) << " have(approx) " << *dir << dendl; - dir->add_replica(from, p->second.nonce); - dir->dir_rep = p->second.dir_rep; - } - refragged = true; - } - - map& dmap = strong->strong_dentries[p->first]; - for (map::iterator q = dmap.begin(); - q != dmap.end(); - ++q) { - CDentry *dn; - if (!refragged) - dn = dir->lookup(q->first.name, q->first.snapid); - else { - frag_t fg = diri->pick_dirfrag(q->first.name); - dir = diri->get_dirfrag(fg); - assert(dir); - dn = dir->lookup(q->first.name, q->first.snapid); - } - if (!dn) { - if (q->second.is_remote()) { - dn = dir->add_remote_dentry(q->first.name, q->second.remote_ino, q->second.remote_d_type, - q->second.first, q->first.snapid); - } else if (q->second.is_null()) { - dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid); - } else { - CInode *in = get_inode(q->second.ino, q->first.snapid); - if (!in) in = rejoin_invent_inode(q->second.ino, q->first.snapid); - dn = dir->add_primary_dentry(q->first.name, in, q->second.first, q->first.snapid); - } - dout(10) << " invented " << *dn << dendl; - } - CDentry::linkage_t *dnl = dn->get_linkage(); - - // dn auth_pin? - if (strong->authpinned_dentries.count(p->first) && - strong->authpinned_dentries[p->first].count(q->first)) { - for (list::iterator r = strong->authpinned_dentries[p->first][q->first].begin(); - r != strong->authpinned_dentries[p->first][q->first].end(); - ++r) { - dout(10) << " dn authpin by " << *r << " on " << *dn << dendl; - - // get/create slave mdrequest - MDRequestRef mdr; - if (have_request(r->reqid)) - mdr = request_get(r->reqid); - else - mdr = request_start_slave(r->reqid, r->attempt, strong); - mdr->auth_pin(dn); - } - } - - // dn xlock? - if (strong->xlocked_dentries.count(p->first) && - strong->xlocked_dentries[p->first].count(q->first)) { - MMDSCacheRejoin::slave_reqid r = strong->xlocked_dentries[p->first][q->first]; - dout(10) << " dn xlock by " << r << " on " << *dn << dendl; - MDRequestRef mdr = request_get(r.reqid); // should have this from auth_pin above. 
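-	// replay the survivor's xlock locally: take the dentry versionlock
-	// first, then force dn->lock into LOCK_XLOCK on behalf of the slave
-	// request, so our lock state matches what the survivor holds.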
-	assert(mdr->is_auth_pinned(dn));
-	if (!mdr->xlocks.count(&dn->versionlock)) {
-	  assert(dn->versionlock.can_xlock_local());
-	  dn->versionlock.get_xlock(mdr, mdr->get_client());
-	  mdr->xlocks.insert(&dn->versionlock);
-	  mdr->locks.insert(&dn->versionlock);
-	}
-	if (dn->lock.is_stable())
-	  dn->auth_pin(&dn->lock);
-	dn->lock.set_state(LOCK_XLOCK);
-	dn->lock.get_xlock(mdr, mdr->get_client());
-	mdr->xlocks.insert(&dn->lock);
-	mdr->locks.insert(&dn->lock);
-      }
-
-      dn->add_replica(from, q->second.nonce);
-      dout(10) << " have " << *dn << dendl;
-
-      if (dnl->is_primary()) {
-	if (q->second.is_primary()) {
-	  if (vinodeno_t(q->second.ino, q->first.snapid) != dnl->get_inode()->vino()) {
-	    // the survivor missed MDentryUnlink+MDentryLink messages ?
-	    assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
-	    CInode *in = get_inode(q->second.ino, q->first.snapid);
-	    assert(in);
-	    assert(in->get_parent_dn());
-	    rejoin_unlinked_inodes[from].insert(in);
-	    dout(7) << " sender has primary dentry but wrong inode" << dendl;
-	  }
-	} else {
-	  // the survivor missed MDentryLink message ?
-	  assert(strong->strong_inodes.count(dnl->get_inode()->vino()) == 0);
-	  dout(7) << " sender doesn't have primary dentry" << dendl;
-	}
-      } else {
-	if (q->second.is_primary()) {
-	  // the survivor missed MDentryUnlink message ?
-	  CInode *in = get_inode(q->second.ino, q->first.snapid);
-	  assert(in);
-	  assert(in->get_parent_dn());
-	  rejoin_unlinked_inodes[from].insert(in);
-	  dout(7) << " sender has primary dentry but we don't" << dendl;
-	}
-      }
-    }
-  }
-
-  for (map<vinodeno_t, MMDSCacheRejoin::inode_strong>::iterator p = strong->strong_inodes.begin();
-       p != strong->strong_inodes.end();
-       ++p) {
-    CInode *in = get_inode(p->first);
-    assert(in);
-    in->add_replica(from, p->second.nonce);
-    dout(10) << " have " << *in << dendl;
-
-    MMDSCacheRejoin::inode_strong &is = p->second;
-
-    // caps_wanted
-    if (is.caps_wanted) {
-      in->mds_caps_wanted[from] = is.caps_wanted;
-      dout(15) << " inode caps_wanted " << ccap_string(is.caps_wanted)
-	       << " on " << *in << dendl;
-    }
-
-    // scatterlocks?
-    //  infer state from replica state:
-    //   * go to MIX if they might have wrlocks
-    //   * go to LOCK if they are LOCK (just because identify_files_to_recover might start twiddling filelock)
-    in->filelock.infer_state_from_strong_rejoin(is.filelock, !in->is_dir()); // maybe also go to LOCK
-    in->nestlock.infer_state_from_strong_rejoin(is.nestlock, false);
-    in->dirfragtreelock.infer_state_from_strong_rejoin(is.dftlock, false);
-
-    // auth pin?
-    if (strong->authpinned_inodes.count(in->vino())) {
-      for (list<MMDSCacheRejoin::slave_reqid>::iterator r = strong->authpinned_inodes[in->vino()].begin();
-	   r != strong->authpinned_inodes[in->vino()].end();
-	   ++r) {
-	dout(10) << " inode authpin by " << *r << " on " << *in << dendl;
-
-	// get/create slave mdrequest
-	MDRequestRef mdr;
-	if (have_request(r->reqid))
-	  mdr = request_get(r->reqid);
-	else
-	  mdr = request_start_slave(r->reqid, r->attempt, strong);
-	if (strong->frozen_authpin_inodes.count(in->vino())) {
-	  assert(!in->get_num_auth_pins());
-	  mdr->freeze_auth_pin(in);
-	} else {
-	  assert(!in->is_frozen_auth_pin());
-	}
-	mdr->auth_pin(in);
-      }
-    }
-    // xlock(s)?
-    if (strong->xlocked_inodes.count(in->vino())) {
-      for (map<__s32, MMDSCacheRejoin::slave_reqid>::iterator q = strong->xlocked_inodes[in->vino()].begin();
-	   q != strong->xlocked_inodes[in->vino()].end();
-	   ++q) {
-	SimpleLock *lock = in->get_lock(q->first);
-	dout(10) << " inode xlock by " << q->second << " on " << *lock << " on " << *in << dendl;
-	MDRequestRef mdr = request_get(q->second.reqid);  // should have this from auth_pin above.
- assert(mdr->is_auth_pinned(in)); - if (!mdr->xlocks.count(&in->versionlock)) { - assert(in->versionlock.can_xlock_local()); - in->versionlock.get_xlock(mdr, mdr->get_client()); - mdr->xlocks.insert(&in->versionlock); - mdr->locks.insert(&in->versionlock); - } - if (lock->is_stable()) - in->auth_pin(lock); - lock->set_state(LOCK_XLOCK); - if (lock == &in->filelock) - in->loner_cap = -1; - lock->get_xlock(mdr, mdr->get_client()); - mdr->xlocks.insert(lock); - mdr->locks.insert(lock); - } - } - } - // wrlock(s)? - for (map > >::iterator p = strong->wrlocked_inodes.begin(); - p != strong->wrlocked_inodes.end(); - ++p) { - CInode *in = get_inode(p->first); - for (map >::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - SimpleLock *lock = in->get_lock(q->first); - for (list::iterator r = q->second.begin(); - r != q->second.end(); - ++r) { - dout(10) << " inode wrlock by " << *r << " on " << *lock << " on " << *in << dendl; - MDRequestRef mdr = request_get(r->reqid); // should have this from auth_pin above. - if (in->is_auth()) - assert(mdr->is_auth_pinned(in)); - lock->set_state(LOCK_MIX); - if (lock == &in->filelock) - in->loner_cap = -1; - lock->get_wrlock(true); - mdr->wrlocks.insert(lock); - mdr->locks.insert(lock); - } - } - } - - // done? - assert(rejoin_gather.count(from)); - rejoin_gather.erase(from); - if (rejoin_gather.empty()) { - rejoin_gather_finish(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" << dendl; - } -} - -/* This functions DOES NOT put the passed message before returning */ -void MDCache::handle_cache_rejoin_ack(MMDSCacheRejoin *ack) -{ - dout(7) << "handle_cache_rejoin_ack from " << ack->get_source() << dendl; - mds_rank_t from = mds_rank_t(ack->get_source().num()); - - // for sending cache expire message - set isolated_inodes; - set refragged_inodes; - - // dirs - for (map::iterator p = ack->strong_dirfrags.begin(); - p != ack->strong_dirfrags.end(); - ++p) { - // we may have had incorrect dir fragmentation; refragment based - // on what they auth tells us. - CDir *dir = get_dirfrag(p->first); - if (!dir) { - dir = get_force_dirfrag(p->first, false); - if (dir) - refragged_inodes.insert(dir->get_inode()); - } - if (!dir) { - CInode *diri = get_inode(p->first.ino); - if (!diri) { - // barebones inode; the full inode loop below will clean up. - diri = new CInode(this, false); - diri->inode.ino = p->first.ino; - diri->inode.mode = S_IFDIR; - diri->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash; - add_inode(diri); - if (MDS_INO_MDSDIR(from) == p->first.ino) { - diri->inode_auth = mds_authority_t(from, CDIR_AUTH_UNKNOWN); - dout(10) << " add inode " << *diri << dendl; - } else { - diri->inode_auth = CDIR_AUTH_DEFAULT; - isolated_inodes.insert(diri); - dout(10) << " unconnected dirfrag " << p->first << dendl; - } - } - // barebones dirfrag; the full dirfrag loop below will clean up. 
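-	// the placeholder only needs to exist so replicas and subtree auth
-	// can hang off it; the dirfrag_bases and inode_base payloads decoded
-	// later in this function supply the real contents.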
- dir = diri->add_dirfrag(new CDir(diri, p->first.frag, this, false)); - if (MDS_INO_MDSDIR(from) == p->first.ino || - (dir->authority() != CDIR_AUTH_UNDEF && - dir->authority().first != from)) - adjust_subtree_auth(dir, from); - dout(10) << " add dirfrag " << *dir << dendl; - } - - dir->set_replica_nonce(p->second.nonce); - dir->state_clear(CDir::STATE_REJOINING); - dout(10) << " got " << *dir << dendl; - - // dentries - map& dmap = ack->strong_dentries[p->first]; - for (map::iterator q = dmap.begin(); - q != dmap.end(); - ++q) { - CDentry *dn = dir->lookup(q->first.name, q->first.snapid); - if(!dn) - dn = dir->add_null_dentry(q->first.name, q->second.first, q->first.snapid); - - CDentry::linkage_t *dnl = dn->get_linkage(); - - assert(dn->last == q->first.snapid); - if (dn->first != q->second.first) { - dout(10) << " adjust dn.first " << dn->first << " -> " << q->second.first << " on " << *dn << dendl; - dn->first = q->second.first; - } - - // may have bad linkage if we missed dentry link/unlink messages - if (dnl->is_primary()) { - CInode *in = dnl->get_inode(); - if (!q->second.is_primary() || - vinodeno_t(q->second.ino, q->first.snapid) != in->vino()) { - dout(10) << " had bad linkage for " << *dn << ", unlinking " << *in << dendl; - dir->unlink_inode(dn); - } - } else if (dnl->is_remote()) { - if (!q->second.is_remote() || - q->second.remote_ino != dnl->get_remote_ino() || - q->second.remote_d_type != dnl->get_remote_d_type()) { - dout(10) << " had bad linkage for " << *dn << dendl; - dir->unlink_inode(dn); - } - } else { - if (!q->second.is_null()) - dout(10) << " had bad linkage for " << *dn << dendl; - } - - // hmm, did we have the proper linkage here? - if (dnl->is_null() && !q->second.is_null()) { - if (q->second.is_remote()) { - dn->dir->link_remote_inode(dn, q->second.remote_ino, q->second.remote_d_type); - } else { - CInode *in = get_inode(q->second.ino, q->first.snapid); - if (!in) { - // barebones inode; assume it's dir, the full inode loop below will clean up. 
- in = new CInode(this, false, q->second.first, q->first.snapid); - in->inode.ino = q->second.ino; - in->inode.mode = S_IFDIR; - in->inode.dir_layout.dl_dir_hash = g_conf->mds_default_dir_hash; - add_inode(in); - dout(10) << " add inode " << *in << dendl; - } else if (in->get_parent_dn()) { - dout(10) << " had bad linkage for " << *(in->get_parent_dn()) - << ", unlinking " << *in << dendl; - in->get_parent_dir()->unlink_inode(in->get_parent_dn()); - } - dn->dir->link_primary_inode(dn, in); - isolated_inodes.erase(in); - } - } - - dn->set_replica_nonce(q->second.nonce); - dn->lock.set_state_rejoin(q->second.lock, rejoin_waiters); - dn->state_clear(CDentry::STATE_REJOINING); - dout(10) << " got " << *dn << dendl; - } - } - - for (set::iterator p = refragged_inodes.begin(); - p != refragged_inodes.end(); - ++p) { - list ls; - (*p)->get_nested_dirfrags(ls); - for (list::iterator q = ls.begin(); q != ls.end(); ++q) { - if ((*q)->is_auth() || ack->strong_dirfrags.count((*q)->dirfrag())) - continue; - assert((*q)->get_num_any() == 0); - (*p)->close_dirfrag((*q)->get_frag()); - } - } - - // full dirfrags - for (map::iterator p = ack->dirfrag_bases.begin(); - p != ack->dirfrag_bases.end(); - ++p) { - CDir *dir = get_dirfrag(p->first); - assert(dir); - bufferlist::iterator q = p->second.begin(); - dir->_decode_base(q); - dout(10) << " got dir replica " << *dir << dendl; - } - - // full inodes - bufferlist::iterator p = ack->inode_base.begin(); - while (!p.end()) { - inodeno_t ino; - snapid_t last; - bufferlist basebl; - ::decode(ino, p); - ::decode(last, p); - ::decode(basebl, p); - CInode *in = get_inode(ino, last); - assert(in); - bufferlist::iterator q = basebl.begin(); - in->_decode_base(q); - dout(10) << " got inode base " << *in << dendl; - } - - // inodes - p = ack->inode_locks.begin(); - //dout(10) << "inode_locks len " << ack->inode_locks.length() << " is " << ack->inode_locks << dendl; - while (!p.end()) { - inodeno_t ino; - snapid_t last; - __u32 nonce; - bufferlist lockbl; - ::decode(ino, p); - ::decode(last, p); - ::decode(nonce, p); - ::decode(lockbl, p); - - CInode *in = get_inode(ino, last); - assert(in); - in->set_replica_nonce(nonce); - bufferlist::iterator q = lockbl.begin(); - in->_decode_locks_rejoin(q, rejoin_waiters, rejoin_eval_locks); - in->state_clear(CInode::STATE_REJOINING); - dout(10) << " got inode locks " << *in << dendl; - } - - // FIXME: This can happen if entire subtree, together with the inode subtree root - // belongs to, were trimmed between sending cache rejoin and receiving rejoin ack. - assert(isolated_inodes.empty()); - - map > peer_imported; - bufferlist::iterator bp = ack->imported_caps.begin(); - ::decode(peer_imported, bp); - - for (map >::iterator p = peer_imported.begin(); - p != peer_imported.end(); - ++p) { - assert(cap_exports.count(p->first)); - assert(cap_export_targets.count(p->first)); - assert(cap_export_targets[p->first] == from); - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - assert(cap_exports[p->first].count(q->first)); - - dout(10) << " exporting caps for client." << q->first << " ino " << p->first << dendl; - Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v)); - assert(session); - - // mark client caps stale. - MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, - cap_exports[p->first][q->first].capinfo.cap_id, 0, - mds->get_osd_epoch_barrier()); - m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq, - (q->second.cap_id > 0 ? 
from : -1), 0); - mds->send_message_client_counted(m, session); - - cap_exports[p->first].erase(q->first); - } - assert(cap_exports[p->first].empty()); - } - - // done? - assert(rejoin_ack_gather.count(from)); - rejoin_ack_gather.erase(from); - if (mds->is_rejoin()) { - - if (rejoin_gather.empty()) { - // eval unstable scatter locks after all wrlocks are rejoined. - while (!rejoin_eval_locks.empty()) { - SimpleLock *lock = rejoin_eval_locks.front(); - rejoin_eval_locks.pop_front(); - if (!lock->is_stable()) - mds->locker->eval_gather(lock); - } - } - - if (rejoin_gather.empty() && // make sure we've gotten our FULL inodes, too. - rejoin_ack_gather.empty()) { - // finally, kickstart past snap parent opens - open_snap_parents(); - } else { - dout(7) << "still need rejoin from (" << rejoin_gather << ")" - << ", rejoin_ack from (" << rejoin_ack_gather << ")" << dendl; - } - } else { - // survivor. - mds->queue_waiters(rejoin_waiters); - } -} - -/** - * rejoin_trim_undef_inodes() -- remove REJOINUNDEF flagged inodes - * - * FIXME: wait, can this actually happen? a survivor should generate cache trim - * messages that clean these guys up... - */ -void MDCache::rejoin_trim_undef_inodes() -{ - dout(10) << "rejoin_trim_undef_inodes" << dendl; - - while (!rejoin_undef_inodes.empty()) { - set::iterator p = rejoin_undef_inodes.begin(); - CInode *in = *p; - rejoin_undef_inodes.erase(p); - - in->clear_replica_map(); - - // close out dirfrags - if (in->is_dir()) { - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); - p != dfls.end(); - ++p) { - CDir *dir = *p; - dir->clear_replica_map(); - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dn->clear_replica_map(); - - dout(10) << " trimming " << *dn << dendl; - dir->remove_dentry(dn); - } - - dout(10) << " trimming " << *dir << dendl; - in->close_dirfrag(dir->dirfrag().frag); - } - } - - CDentry *dn = in->get_parent_dn(); - if (dn) { - dn->clear_replica_map(); - dout(10) << " trimming " << *dn << dendl; - dn->dir->remove_dentry(dn); - } else { - dout(10) << " trimming " << *in << dendl; - remove_inode(in); - } - } - - assert(rejoin_undef_inodes.empty()); -} - -void MDCache::rejoin_gather_finish() -{ - dout(10) << "rejoin_gather_finish" << dendl; - assert(mds->is_rejoin()); - - if (open_undef_inodes_dirfrags()) - return; - - if (process_imported_caps()) - return; - - choose_lock_states_and_reconnect_caps(); - - identify_files_to_recover(); - rejoin_send_acks(); - - // signal completion of fetches, rejoin_gather_finish, etc. - assert(rejoin_ack_gather.count(mds->get_nodeid())); - rejoin_ack_gather.erase(mds->get_nodeid()); - - // did we already get our acks too? 
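-  // two gathers drain in parallel: rejoin_gather tracks rejoin messages we
-  // still expect, rejoin_ack_gather tracks acks (our own rank was erased
-  // just above).  past-snap-parent opens start only once both are empty.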
- if (rejoin_ack_gather.empty()) { - // finally, kickstart past snap parent opens - open_snap_parents(); - } -} - -class C_MDC_RejoinOpenInoFinish: public MDCacheContext { - inodeno_t ino; -public: - C_MDC_RejoinOpenInoFinish(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {} - void finish(int r) override { - mdcache->rejoin_open_ino_finish(ino, r); - } -}; - -void MDCache::rejoin_open_ino_finish(inodeno_t ino, int ret) -{ - dout(10) << "open_caps_inode_finish ino " << ino << " ret " << ret << dendl; - - if (ret < 0) { - cap_imports_missing.insert(ino); - } else if (ret == mds->get_nodeid()) { - assert(get_inode(ino)); - } else { - auto p = cap_imports.find(ino); - assert(p != cap_imports.end()); - for (auto q = p->second.begin(); q != p->second.end(); ++q) { - assert(q->second.count(MDS_RANK_NONE)); - assert(q->second.size() == 1); - rejoin_export_caps(p->first, q->first, q->second[MDS_RANK_NONE], ret); - } - cap_imports.erase(p); - } - - assert(cap_imports_num_opening > 0); - cap_imports_num_opening--; - - if (cap_imports_num_opening == 0) { - if (rejoin_gather.empty()) - rejoin_gather_finish(); - else if (rejoin_gather.count(mds->get_nodeid())) - process_imported_caps(); - } -} - -class C_MDC_RejoinSessionsOpened : public MDCacheLogContext { -public: - map client_map; - map sseqmap; - - C_MDC_RejoinSessionsOpened(MDCache *c, map& cm) : - MDCacheLogContext(c), client_map(cm) {} - void finish(int r) override { - assert(r == 0); - mdcache->rejoin_open_sessions_finish(client_map, sseqmap); - } -}; - -void MDCache::rejoin_open_sessions_finish(map client_map, - map& sseqmap) -{ - dout(10) << "rejoin_open_sessions_finish" << dendl; - mds->server->finish_force_open_sessions(client_map, sseqmap); - if (rejoin_gather.empty()) - rejoin_gather_finish(); -} - -bool MDCache::process_imported_caps() -{ - dout(10) << "process_imported_caps" << dendl; - - for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) { - CInode *in = get_inode(p->first); - if (in) { - assert(in->is_auth()); - cap_imports_missing.erase(p->first); - continue; - } - if (cap_imports_missing.count(p->first) > 0) - continue; - - cap_imports_num_opening++; - dout(10) << " opening missing ino " << p->first << dendl; - open_ino(p->first, (int64_t)-1, new C_MDC_RejoinOpenInoFinish(this, p->first), false); - } - - if (cap_imports_num_opening > 0) - return true; - - // called by rejoin_gather_finish() ? - if (rejoin_gather.count(mds->get_nodeid()) == 0) { - // if sessions for imported caps are all open ? 
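-    // cap reconnect requires a Session for every client named in the weak
-    // rejoins.  if any is missing, journal an ESessions entry that
-    // force-opens them and resume from the log-commit callback
-    // (C_MDC_RejoinSessionsOpened) before touching the caps.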
- for (map::iterator p = rejoin_client_map.begin(); - p != rejoin_client_map.end(); - ++p) { - if (!mds->sessionmap.have_session(entity_name_t::CLIENT(p->first.v))) { - C_MDC_RejoinSessionsOpened *finish = new C_MDC_RejoinSessionsOpened(this, rejoin_client_map); - version_t pv = mds->server->prepare_force_open_sessions(rejoin_client_map, finish->sseqmap); - ESessions *le = new ESessions(pv, rejoin_client_map); - mds->mdlog->start_submit_entry(le, finish); - mds->mdlog->flush(); - rejoin_client_map.clear(); - return true; - } - } - rejoin_client_map.clear(); - - // process caps that were exported by slave rename - for (map > >::iterator p = rejoin_slave_exports.begin(); - p != rejoin_slave_exports.end(); - ++p) { - CInode *in = get_inode(p->first); - assert(in); - for (map::iterator q = p->second.second.begin(); - q != p->second.second.end(); - ++q) { - Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v)); - assert(session); - - Capability *cap = in->get_client_cap(q->first); - if (!cap) - cap = in->add_client_cap(q->first, session); - cap->merge(q->second, true); - - Capability::Import& im = rejoin_imported_caps[p->second.first][p->first][q->first]; - assert(cap->get_last_seq() == im.issue_seq); - assert(cap->get_mseq() == im.mseq); - cap->set_cap_id(im.cap_id); - // send cap import because we assigned a new cap ID - do_cap_import(session, in, cap, q->second.cap_id, q->second.seq, q->second.mseq - 1, - p->second.first, CEPH_CAP_FLAG_AUTH); - } - } - rejoin_slave_exports.clear(); - rejoin_imported_caps.clear(); - - // process cap imports - // ino -> client -> frommds -> capex - for (auto p = cap_imports.begin(); p != cap_imports.end(); ) { - CInode *in = get_inode(p->first); - if (!in) { - dout(10) << " still missing ino " << p->first - << ", will try again after replayed client requests" << dendl; - ++p; - continue; - } - assert(in->is_auth()); - for (auto q = p->second.begin(); q != p->second.end(); ++q) { - Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v)); - assert(session); - for (auto r = q->second.begin(); r != q->second.end(); ++r) { - Capability *cap = in->reconnect_cap(q->first, r->second, session); - add_reconnected_cap(q->first, in->ino(), r->second); - if (r->first >= 0) { - if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists - cap->inc_mseq(); - do_cap_import(session, in, cap, r->second.capinfo.cap_id, 0, 0, r->first, 0); - - Capability::Import& im = rejoin_imported_caps[r->first][p->first][q->first]; - im.cap_id = cap->get_cap_id(); - im.issue_seq = cap->get_last_seq(); - im.mseq = cap->get_mseq(); - } - } - } - cap_imports.erase(p++); // remove and move on - } - } else { - trim_non_auth(); - - rejoin_gather.erase(mds->get_nodeid()); - maybe_send_pending_rejoins(); - - if (rejoin_gather.empty() && rejoin_ack_gather.count(mds->get_nodeid())) - rejoin_gather_finish(); - } - return false; -} - -void MDCache::check_realm_past_parents(SnapRealm *realm, bool reconnect) -{ - // are this realm's parents fully open? - if (realm->have_past_parents_open()) { - dout(10) << " have past snap parents for realm " << *realm - << " on " << *realm->inode << dendl; - if (reconnect) { - // finish off client snaprealm reconnects? 
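-      // each queued (client, seq) pair is completed via
-      // finish_snaprealm_reconnect(), which sends a CEPH_SNAP_OP_UPDATE
-      // only if the client's snap seq is older than the realm's.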
- auto p = reconnected_snaprealms.find(realm->inode->ino()); - if (p != reconnected_snaprealms.end()) { - for (auto q = p->second.begin(); q != p->second.end(); ++q) - finish_snaprealm_reconnect(q->first, realm, q->second); - reconnected_snaprealms.erase(p); - } - } - } else { - if (!missing_snap_parents.count(realm->inode)) { - dout(10) << " MISSING past snap parents for realm " << *realm - << " on " << *realm->inode << dendl; - realm->inode->get(CInode::PIN_OPENINGSNAPPARENTS); - missing_snap_parents[realm->inode].size(); // just to get it into the map! - } else { - dout(10) << " (already) MISSING past snap parents for realm " << *realm - << " on " << *realm->inode << dendl; - } - } -} - -void MDCache::rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, - client_t client, snapid_t snap_follows) -{ - dout(10) << "rebuild_need_snapflush " << snap_follows << " on " << *head_in << dendl; - - const set& snaps = realm->get_snaps(); - snapid_t follows = snap_follows; - - while (true) { - CInode *in = pick_inode_snap(head_in, follows); - if (in == head_in) - break; - dout(10) << " need snapflush from client." << client << " on " << *in << dendl; - - /* TODO: we can check the reconnected/flushing caps to find - * which locks need gathering */ - for (int i = 0; i < num_cinode_locks; i++) { - int lockid = cinode_lock_info[i].lock; - SimpleLock *lock = in->get_lock(lockid); - assert(lock); - in->client_snap_caps[lockid].insert(client); - in->auth_pin(lock); - lock->set_state(LOCK_SNAP_SYNC); - lock->get_wrlock(true); - } - - for (auto p = snaps.lower_bound(in->first); - p != snaps.end() && *p <= in->last; - ++p) { - head_in->add_need_snapflush(in, *p, client); - } - - follows = in->last; - } -} - -/* - * choose lock states based on reconnected caps - */ -void MDCache::choose_lock_states_and_reconnect_caps() -{ - dout(10) << "choose_lock_states_and_reconnect_caps" << dendl; - - map splits; - - for (ceph::unordered_map::iterator i = inode_map.begin(); - i != inode_map.end(); - ++i) { - CInode *in = i->second; - - if (in->last != CEPH_NOSNAP) - continue; - - if (in->is_auth() && !in->is_base() && in->inode.is_dirty_rstat()) - in->mark_dirty_rstat(); - - auto p = reconnected_caps.find(in->ino()); - - int dirty_caps = 0; - if (p != reconnected_caps.end()) { - for (const auto &it : p->second) - dirty_caps |= it.second.dirty_caps; - } - in->choose_lock_states(dirty_caps); - dout(15) << " chose lock states on " << *in << dendl; - - SnapRealm *realm = in->find_snaprealm(); - - check_realm_past_parents(realm, realm == in->snaprealm); - - if (p != reconnected_caps.end()) { - bool missing_snap_parent = false; - // also, make sure client's cap is in the correct snaprealm. - for (auto q = p->second.begin(); q != p->second.end(); ++q) { - if (q->second.snap_follows > 0 && q->second.snap_follows < in->first - 1) { - if (realm->have_past_parents_open()) { - rebuild_need_snapflush(in, realm, q->first, q->second.snap_follows); - } else { - missing_snap_parent = true; - } - } - - if (q->second.realm_ino == realm->inode->ino()) { - dout(15) << " client." << q->first << " has correct realm " << q->second.realm_ino << dendl; - } else { - dout(15) << " client." << q->first << " has wrong realm " << q->second.realm_ino - << " != " << realm->inode->ino() << dendl; - if (realm->have_past_parents_open()) { - // ok, include in a split message _now_. - prepare_realm_split(realm, q->first, in->ino(), splits); - } else { - // send the split later. 
- missing_snap_parent = true; - } - } - } - if (missing_snap_parent) - missing_snap_parents[realm->inode].insert(in); - } - } - - send_snaps(splits); -} - -void MDCache::prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino, - map& splits) -{ - MClientSnap *snap; - if (splits.count(client) == 0) { - splits[client] = snap = new MClientSnap(CEPH_SNAP_OP_SPLIT); - snap->head.split = realm->inode->ino(); - realm->build_snap_trace(snap->bl); - - for (set::iterator p = realm->open_children.begin(); - p != realm->open_children.end(); - ++p) - snap->split_realms.push_back((*p)->inode->ino()); - - } else - snap = splits[client]; - snap->split_inos.push_back(ino); -} - -void MDCache::send_snaps(map& splits) -{ - dout(10) << "send_snaps" << dendl; - - for (map::iterator p = splits.begin(); - p != splits.end(); - ++p) { - Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v)); - if (session) { - dout(10) << " client." << p->first - << " split " << p->second->head.split - << " inos " << p->second->split_inos - << dendl; - mds->send_message_client_counted(p->second, session); - } else { - dout(10) << " no session for client." << p->first << dendl; - p->second->put(); - } - } - splits.clear(); -} - - -/* - * remove any items from logsegment open_file lists that don't have - * any caps - */ -void MDCache::clean_open_file_lists() -{ - dout(10) << "clean_open_file_lists" << dendl; - - for (map::iterator p = mds->mdlog->segments.begin(); - p != mds->mdlog->segments.end(); - ++p) { - LogSegment *ls = p->second; - - elist::iterator q = ls->open_files.begin(member_offset(CInode, item_open_file)); - while (!q.end()) { - CInode *in = *q; - ++q; - if (in->last == CEPH_NOSNAP) { - if (!in->is_any_caps_wanted()) { - dout(10) << " unlisting unwanted/capless inode " << *in << dendl; - in->item_open_file.remove_myself(); - } - } else if (in->last != CEPH_NOSNAP) { - if (in->client_snap_caps.empty()) { - dout(10) << " unlisting flushed snap inode " << *in << dendl; - in->item_open_file.remove_myself(); - } - } - } - } -} - - - -Capability* MDCache::rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds) -{ - dout(10) << "rejoin_import_cap for client." << client << " from mds." << frommds - << " on " << *in << dendl; - Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v)); - if (!session) { - dout(10) << " no session for client." << client << dendl; - return NULL; - } - - Capability *cap = in->reconnect_cap(client, icr, session); - - if (frommds >= 0) { - if (cap->get_last_seq() == 0) // don't increase mseq if cap already exists - cap->inc_mseq(); - do_cap_import(session, in, cap, icr.capinfo.cap_id, 0, 0, frommds, 0); - } - - return cap; -} - -void MDCache::export_remaining_imported_caps() -{ - dout(10) << "export_remaining_imported_caps" << dendl; - - stringstream warn_str; - - for (auto p = cap_imports.begin(); p != cap_imports.end(); ++p) { - warn_str << " ino " << p->first << "\n"; - for (auto q = p->second.begin(); q != p->second.end(); ++q) { - Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v)); - if (session) { - // mark client caps stale. 
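-	// a cap EXPORT with no peer (-1) tells the client these caps were
-	// not reconnected anywhere, so it should give them up instead of
-	// waiting for an import that will never come.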
- MClientCaps *stale = new MClientCaps(CEPH_CAP_OP_EXPORT, p->first, 0, 0, 0, mds->get_osd_epoch_barrier()); - stale->set_cap_peer(0, 0, 0, -1, 0); - mds->send_message_client_counted(stale, q->first); - } - } - - mds->heartbeat_reset(); - } - - for (map >::iterator p = cap_reconnect_waiters.begin(); - p != cap_reconnect_waiters.end(); - ++p) - mds->queue_waiters(p->second); - - cap_imports.clear(); - cap_reconnect_waiters.clear(); - - if (warn_str.peek() != EOF) { - mds->clog->warn() << "failed to reconnect caps for missing inodes:"; - mds->clog->warn(warn_str); - } -} - -void MDCache::try_reconnect_cap(CInode *in, Session *session) -{ - client_t client = session->info.get_client(); - const cap_reconnect_t *rc = get_replay_cap_reconnect(in->ino(), client); - if (rc) { - in->reconnect_cap(client, *rc, session); - dout(10) << "try_reconnect_cap client." << client - << " reconnect wanted " << ccap_string(rc->capinfo.wanted) - << " issue " << ccap_string(rc->capinfo.issued) - << " on " << *in << dendl; - remove_replay_cap_reconnect(in->ino(), client); - - if (in->is_replicated()) { - mds->locker->try_eval(in, CEPH_CAP_LOCKS); - } else { - int dirty_caps = 0; - auto p = reconnected_caps.find(in->ino()); - if (p != reconnected_caps.end()) { - auto q = p->second.find(client); - if (q != p->second.end()) - dirty_caps = q->second.dirty_caps; - } - in->choose_lock_states(dirty_caps); - dout(15) << " chose lock states on " << *in << dendl; - } - - map >::iterator it = - cap_reconnect_waiters.find(in->ino()); - if (it != cap_reconnect_waiters.end()) { - mds->queue_waiters(it->second); - cap_reconnect_waiters.erase(it); - } - } -} - - - -// ------- -// cap imports and delayed snap parent opens - -void MDCache::do_cap_import(Session *session, CInode *in, Capability *cap, - uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq, - int peer, int p_flags) -{ - client_t client = session->info.inst.name.num(); - SnapRealm *realm = in->find_snaprealm(); - if (realm->have_past_parents_open()) { - dout(10) << "do_cap_import " << session->info.inst.name << " mseq " << cap->get_mseq() << " on " << *in << dendl; - if (cap->get_last_seq() == 0) // reconnected cap - cap->inc_last_seq(); - cap->set_last_issue(); - cap->set_last_issue_stamp(ceph_clock_now()); - cap->clear_new(); - MClientCaps *reap = new MClientCaps(CEPH_CAP_OP_IMPORT, - in->ino(), - realm->inode->ino(), - cap->get_cap_id(), cap->get_last_seq(), - cap->pending(), cap->wanted(), 0, - cap->get_mseq(), mds->get_osd_epoch_barrier()); - in->encode_cap_message(reap, cap); - realm->build_snap_trace(reap->snapbl); - reap->set_cap_peer(p_cap_id, p_seq, p_mseq, peer, p_flags); - mds->send_message_client_counted(reap, session); - } else { - dout(10) << "do_cap_import missing past snap parents, delaying " << session->info.inst.name << " mseq " - << cap->get_mseq() << " on " << *in << dendl; - in->auth_pin(this); - cap->inc_suppress(); - delayed_imported_caps[client].insert(in); - missing_snap_parents[in].size(); - } -} - -void MDCache::do_delayed_cap_imports() -{ - dout(10) << "do_delayed_cap_imports" << dendl; - - assert(delayed_imported_caps.empty()); -} - -struct C_MDC_OpenSnapParents : public MDCacheContext { - explicit C_MDC_OpenSnapParents(MDCache *c) : MDCacheContext(c) {} - void finish(int r) override { - mdcache->open_snap_parents(); - } -}; - -void MDCache::open_snap_parents() -{ - dout(10) << "open_snap_parents" << dendl; - - map splits; - MDSGatherBuilder gather(g_ceph_context); - - auto p = missing_snap_parents.begin(); - while (p != 
missing_snap_parents.end()) { - CInode *in = p->first; - assert(in->snaprealm); - if (in->snaprealm->open_parents(gather.new_sub())) { - dout(10) << " past parents now open on " << *in << dendl; - - for (CInode *child : p->second) { - auto q = reconnected_caps.find(child->ino()); - assert(q != reconnected_caps.end()); - for (auto r = q->second.begin(); r != q->second.end(); ++r) { - if (r->second.snap_follows > 0 && r->second.snap_follows < in->first - 1) { - rebuild_need_snapflush(child, in->snaprealm, r->first, r->second.snap_follows); - } - // make sure client's cap is in the correct snaprealm. - if (r->second.realm_ino != in->ino()) { - prepare_realm_split(in->snaprealm, r->first, child->ino(), splits); - } - } - } - - missing_snap_parents.erase(p++); - - in->put(CInode::PIN_OPENINGSNAPPARENTS); - - // finish off client snaprealm reconnects? - map >::iterator q = reconnected_snaprealms.find(in->ino()); - if (q != reconnected_snaprealms.end()) { - for (map::iterator r = q->second.begin(); - r != q->second.end(); - ++r) - finish_snaprealm_reconnect(r->first, in->snaprealm, r->second); - reconnected_snaprealms.erase(q); - } - } else { - dout(10) << " opening past parents on " << *in << dendl; - ++p; - } - } - - send_snaps(splits); - - if (gather.has_subs()) { - dout(10) << "open_snap_parents - waiting for " - << gather.num_subs_remaining() << dendl; - gather.set_finisher(new C_MDC_OpenSnapParents(this)); - gather.activate(); - } else { - if (!reconnected_snaprealms.empty()) { - stringstream warn_str; - for (map >::iterator p = reconnected_snaprealms.begin(); - p != reconnected_snaprealms.end(); - ++p) { - warn_str << " unconnected snaprealm " << p->first << "\n"; - for (map::iterator q = p->second.begin(); - q != p->second.end(); - ++q) - warn_str << " client." 
<< q->first << " snapid " << q->second << "\n"; - } - mds->clog->warn() << "open_snap_parents has:"; - mds->clog->warn(warn_str); - } - assert(rejoin_waiters.empty()); - assert(missing_snap_parents.empty()); - dout(10) << "open_snap_parents - all open" << dendl; - do_delayed_cap_imports(); - - assert(rejoin_done); - rejoin_done.release()->complete(0); - reconnected_caps.clear(); - } -} - -bool MDCache::open_undef_inodes_dirfrags() -{ - dout(10) << "open_undef_inodes_dirfrags " - << rejoin_undef_inodes.size() << " inodes " - << rejoin_undef_dirfrags.size() << " dirfrags" << dendl; - - set fetch_queue = rejoin_undef_dirfrags; - - for (set::iterator p = rejoin_undef_inodes.begin(); - p != rejoin_undef_inodes.end(); - ++p) { - CInode *in = *p; - assert(!in->is_base()); - fetch_queue.insert(in->get_parent_dir()); - } - - if (fetch_queue.empty()) - return false; - - MDSGatherBuilder gather(g_ceph_context, new C_MDC_RejoinGatherFinish(this)); - for (set::iterator p = fetch_queue.begin(); - p != fetch_queue.end(); - ++p) { - CDir *dir = *p; - CInode *diri = dir->get_inode(); - if (diri->state_test(CInode::STATE_REJOINUNDEF)) - continue; - if (dir->state_test(CDir::STATE_REJOINUNDEF)) - assert(diri->dirfragtree.is_leaf(dir->get_frag())); - dir->fetch(gather.new_sub()); - } - assert(gather.has_subs()); - gather.activate(); - return true; -} - -void MDCache::opened_undef_inode(CInode *in) { - dout(10) << "opened_undef_inode " << *in << dendl; - rejoin_undef_inodes.erase(in); - if (in->is_dir()) { - // FIXME: re-hash dentries if necessary - assert(in->inode.dir_layout.dl_dir_hash == g_conf->mds_default_dir_hash); - if (in->has_dirfrags() && !in->dirfragtree.is_leaf(frag_t())) { - CDir *dir = in->get_dirfrag(frag_t()); - assert(dir); - rejoin_undef_dirfrags.erase(dir); - in->force_dirfrags(); - list ls; - in->get_dirfrags(ls); - for (list::iterator p = ls.begin(); p != ls.end(); ++p) - rejoin_undef_dirfrags.insert(*p); - } - } -} - -void MDCache::finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq) -{ - if (seq < realm->get_newest_seq()) { - dout(10) << "finish_snaprealm_reconnect client." << client << " has old seq " << seq << " < " - << realm->get_newest_seq() - << " on " << *realm << dendl; - // send an update - Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v)); - if (session) { - MClientSnap *snap = new MClientSnap(CEPH_SNAP_OP_UPDATE); - realm->build_snap_trace(snap->bl); - mds->send_message_client_counted(snap, session); - } else { - dout(10) << " ...or not, no session for this client!" << dendl; - } - } else { - dout(10) << "finish_snaprealm_reconnect client." 
<< client << " up to date" - << " on " << *realm << dendl; - } -} - - - -void MDCache::rejoin_send_acks() -{ - dout(7) << "rejoin_send_acks" << dendl; - - // replicate stray - for (map >::iterator p = rejoin_unlinked_inodes.begin(); - p != rejoin_unlinked_inodes.end(); - ++p) { - for (set::iterator q = p->second.begin(); - q != p->second.end(); - ++q) { - CInode *in = *q; - dout(7) << " unlinked inode " << *in << dendl; - // inode expired - if (!in->is_replica(p->first)) - continue; - while (1) { - CDentry *dn = in->get_parent_dn(); - if (dn->is_replica(p->first)) - break; - dn->add_replica(p->first); - CDir *dir = dn->get_dir(); - if (dir->is_replica(p->first)) - break; - dir->add_replica(p->first); - in = dir->get_inode(); - if (in->is_replica(p->first)) - break; - in->add_replica(p->first); - if (in->is_base()) - break; - } - } - } - rejoin_unlinked_inodes.clear(); - - // send acks to everyone in the recovery set - map acks; - for (set::iterator p = recovery_set.begin(); - p != recovery_set.end(); - ++p) { - if (rejoin_ack_sent.count(*p)) - continue; - acks[*p] = new MMDSCacheRejoin(MMDSCacheRejoin::OP_ACK); - } - - rejoin_ack_sent = recovery_set; - - // walk subtrees - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - CDir *dir = p->first; - if (!dir->is_auth()) - continue; - dout(10) << "subtree " << *dir << dendl; - - // auth items in this subtree - list dq; - dq.push_back(dir); - - while (!dq.empty()) { - CDir *dir = dq.front(); - dq.pop_front(); - - // dir - for (auto &r : dir->get_replicas()) { - auto it = acks.find(r.first); - if (it == acks.end()) - continue; - it->second->add_strong_dirfrag(dir->dirfrag(), ++r.second, dir->dir_rep); - it->second->add_dirfrag_base(dir); - } - - for (CDir::map_t::iterator q = dir->items.begin(); - q != dir->items.end(); - ++q) { - CDentry *dn = q->second; - CDentry::linkage_t *dnl = dn->get_linkage(); - - // inode - CInode *in = NULL; - if (dnl->is_primary()) - in = dnl->get_inode(); - - // dentry - for (auto &r : dn->get_replicas()) { - auto it = acks.find(r.first); - if (it == acks.end()) - continue; - it->second->add_strong_dentry(dir->dirfrag(), dn->name, dn->first, dn->last, - dnl->is_primary() ? dnl->get_inode()->ino():inodeno_t(0), - dnl->is_remote() ? dnl->get_remote_ino():inodeno_t(0), - dnl->is_remote() ? dnl->get_remote_d_type():0, - ++r.second, - dn->lock.get_replica_state()); - // peer missed MDentrylink message ? - if (in && !in->is_replica(r.first)) - in->add_replica(r.first); - } - - if (!in) - continue; - - for (auto &r : in->get_replicas()) { - auto it = acks.find(r.first); - if (it == acks.end()) - continue; - it->second->add_inode_base(in, mds->mdsmap->get_up_features()); - bufferlist bl; - in->_encode_locks_state_for_rejoin(bl, r.first); - it->second->add_inode_locks(in, ++r.second, bl); - } - - // subdirs in this subtree? 
- in->get_nested_dirfrags(dq); - } - } - } - - // base inodes too - if (root && root->is_auth()) - for (auto &r : root->get_replicas()) { - auto it = acks.find(r.first); - if (it == acks.end()) - continue; - it->second->add_inode_base(root, mds->mdsmap->get_up_features()); - bufferlist bl; - root->_encode_locks_state_for_rejoin(bl, r.first); - it->second->add_inode_locks(root, ++r.second, bl); - } - if (myin) - for (auto &r : myin->get_replicas()) { - auto it = acks.find(r.first); - if (it == acks.end()) - continue; - it->second->add_inode_base(myin, mds->mdsmap->get_up_features()); - bufferlist bl; - myin->_encode_locks_state_for_rejoin(bl, r.first); - it->second->add_inode_locks(myin, ++r.second, bl); - } - - // include inode base for any inodes whose scatterlocks may have updated - for (set<CInode*>::iterator p = rejoin_potential_updated_scatterlocks.begin(); - p != rejoin_potential_updated_scatterlocks.end(); - ++p) { - CInode *in = *p; - for (const auto &r : in->get_replicas()) { - auto it = acks.find(r.first); - if (it == acks.end()) - continue; - it->second->add_inode_base(in, mds->mdsmap->get_up_features()); - } - } - - // send acks - for (auto p = acks.begin(); p != acks.end(); ++p) { - ::encode(rejoin_imported_caps[p->first], p->second->imported_caps); - mds->send_message_mds(p->second, p->first); - } - - rejoin_imported_caps.clear(); -} - -class C_MDC_ReIssueCaps : public MDCacheContext { - CInode *in; -public: - C_MDC_ReIssueCaps(MDCache *mdc, CInode *i) : - MDCacheContext(mdc), in(i) - { - in->get(CInode::PIN_PTRWAITER); - } - void finish(int r) override { - if (!mdcache->mds->locker->eval(in, CEPH_CAP_LOCKS)) - mdcache->mds->locker->issue_caps(in); - in->put(CInode::PIN_PTRWAITER); - } -}; - -void MDCache::reissue_all_caps() -{ - dout(10) << "reissue_all_caps" << dendl; - - for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - if (in->is_head() && in->is_any_caps()) { - // called by MDSRank::active_start(). There shouldn't be any frozen subtree. - if (in->is_frozen_inode()) { - in->add_waiter(CInode::WAIT_UNFREEZE, new C_MDC_ReIssueCaps(this, in)); - continue; - } - if (!mds->locker->eval(in, CEPH_CAP_LOCKS)) - mds->locker->issue_caps(in); - } - } -} - - -// =============================================================================== - -struct C_MDC_QueuedCow : public MDCacheContext { - CInode *in; - MutationRef mut; - C_MDC_QueuedCow(MDCache *mdc, CInode *i, MutationRef& m) : - MDCacheContext(mdc), in(i), mut(m) {} - void finish(int r) override { - mdcache->_queued_file_recover_cow(in, mut); - } -}; - - -void MDCache::queue_file_recover(CInode *in) -{ - dout(10) << "queue_file_recover " << *in << dendl; - assert(in->is_auth()); - - // cow?
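- // note: the snap copy-on-write path below is commented out in this version; we fall through and queue the inode for recovery directly.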
- /* - SnapRealm *realm = in->find_snaprealm(); - set<snapid_t> s = realm->get_snaps(); - while (!s.empty() && *s.begin() < in->first) - s.erase(s.begin()); - while (!s.empty() && *s.rbegin() > in->last) - s.erase(*s.rbegin()); - dout(10) << " snaps in [" << in->first << "," << in->last << "] are " << s << dendl; - if (s.size() > 1) { - inode_t *pi = in->project_inode(); - pi->version = in->pre_dirty(); - - auto mut(std::make_shared<MutationImpl>()); - mut->ls = mds->mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mds->mdlog, "queue_file_recover cow"); - mds->mdlog->start_entry(le); - predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); - - s.erase(*s.begin()); - while (!s.empty()) { - snapid_t snapid = *s.begin(); - CInode *cow_inode = 0; - journal_cow_inode(mut, &le->metablob, in, snapid-1, &cow_inode); - assert(cow_inode); - recovery_queue.enqueue(cow_inode); - s.erase(*s.begin()); - } - - in->parent->first = in->first; - le->metablob.add_primary_dentry(in->parent, in, true); - mds->mdlog->submit_entry(le, new C_MDC_QueuedCow(this, in, mut)); - mds->mdlog->flush(); - } - */ - - recovery_queue.enqueue(in); -} - -void MDCache::_queued_file_recover_cow(CInode *in, MutationRef& mut) -{ - in->pop_and_dirty_projected_inode(mut->ls); - mut->apply(); - mds->locker->drop_locks(mut.get()); - mut->cleanup(); -} - - -/* - * called after recovery to recover file sizes for previously opened (for write) - * files. that is, those where max_size > size. - */ -void MDCache::identify_files_to_recover() -{ - dout(10) << "identify_files_to_recover" << dendl; - for (ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - if (!in->is_auth()) - continue; - - if (in->last != CEPH_NOSNAP) - continue; - - // Only normal files need file size recovery - if (!in->is_file()) { - continue; - } - - bool recover = false; - for (map<client_t,client_writeable_range_t>::iterator p = in->inode.client_ranges.begin(); - p != in->inode.client_ranges.end(); - ++p) { - Capability *cap = in->get_client_cap(p->first); - if (!cap) { - dout(10) << " client."
<< p->first << " has range " << p->second << " but no cap on " << *in << dendl; - recover = true; - break; - } - } - - if (recover) { - if (in->filelock.is_stable()) { - in->auth_pin(&in->filelock); - } else { - assert(in->filelock.get_state() == LOCK_XLOCKSNAP); - } - in->filelock.set_state(LOCK_PRE_SCAN); - rejoin_recover_q.push_back(in); - } else { - rejoin_check_q.push_back(in); - } - } -} - -void MDCache::start_files_to_recover() -{ - for (CInode *in : rejoin_check_q) { - if (in->filelock.get_state() == LOCK_XLOCKSNAP) - mds->locker->issue_caps(in); - mds->locker->check_inode_max_size(in); - } - rejoin_check_q.clear(); - for (CInode *in : rejoin_recover_q) { - mds->locker->file_recover(&in->filelock); - } - if (!rejoin_recover_q.empty()) { - rejoin_recover_q.clear(); - do_file_recover(); - } -} - -void MDCache::do_file_recover() -{ - recovery_queue.advance(); -} - -// =============================================================================== - - -// ---------------------------- -// truncate - -class C_MDC_RetryTruncate : public MDCacheContext { - CInode *in; - LogSegment *ls; -public: - C_MDC_RetryTruncate(MDCache *c, CInode *i, LogSegment *l) : - MDCacheContext(c), in(i), ls(l) {} - void finish(int r) override { - mdcache->_truncate_inode(in, ls); - } -}; - -void MDCache::truncate_inode(CInode *in, LogSegment *ls) -{ - inode_t *pi = in->get_projected_inode(); - dout(10) << "truncate_inode " - << pi->truncate_from << " -> " << pi->truncate_size - << " on " << *in - << dendl; - - ls->truncating_inodes.insert(in); - in->get(CInode::PIN_TRUNCATING); - in->auth_pin(this); - - if (!in->client_need_snapflush.empty() && - (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) { - assert(in->filelock.is_xlocked()); - in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls)); - mds->locker->issue_caps(in); - return; - } - - _truncate_inode(in, ls); -} - -struct C_IO_MDC_TruncateFinish : public MDCacheIOContext { - CInode *in; - LogSegment *ls; - C_IO_MDC_TruncateFinish(MDCache *c, CInode *i, LogSegment *l) : - MDCacheIOContext(c), in(i), ls(l) {} - void finish(int r) override { - assert(r == 0 || r == -ENOENT); - mdcache->truncate_inode_finish(in, ls); - } -}; - -void MDCache::_truncate_inode(CInode *in, LogSegment *ls) -{ - inode_t *pi = &in->inode; - dout(10) << "_truncate_inode " - << pi->truncate_from << " -> " << pi->truncate_size - << " on " << *in << dendl; - - assert(pi->is_truncating()); - assert(pi->truncate_size < (1ULL << 63)); - assert(pi->truncate_from < (1ULL << 63)); - assert(pi->truncate_size < pi->truncate_from); - - - SnapRealm *realm = in->find_snaprealm(); - SnapContext nullsnap; - const SnapContext *snapc; - if (realm) { - dout(10) << " realm " << *realm << dendl; - snapc = &realm->get_snap_context(); - } else { - dout(10) << " NO realm, using null context" << dendl; - snapc = &nullsnap; - assert(in->last == CEPH_NOSNAP); - } - dout(10) << "_truncate_inode snapc " << snapc << " on " << *in << dendl; - filer.truncate(in->inode.ino, &in->inode.layout, *snapc, - pi->truncate_size, pi->truncate_from-pi->truncate_size, - pi->truncate_seq, ceph::real_time::min(), 0, - new C_OnFinisher(new C_IO_MDC_TruncateFinish(this, in, ls), - mds->finisher)); -} - -struct C_MDC_TruncateLogged : public MDCacheLogContext { - CInode *in; - MutationRef mut; - C_MDC_TruncateLogged(MDCache *m, CInode *i, MutationRef& mu) : - MDCacheLogContext(m), in(i), mut(mu) {} - void finish(int r) override { - mdcache->truncate_inode_logged(in, mut); - } -}; - -void 
MDCache::truncate_inode_finish(CInode *in, LogSegment *ls) -{ - dout(10) << "truncate_inode_finish " << *in << dendl; - - set<CInode*>::iterator p = ls->truncating_inodes.find(in); - assert(p != ls->truncating_inodes.end()); - ls->truncating_inodes.erase(p); - - // update - inode_t *pi = in->project_inode(); - pi->version = in->pre_dirty(); - pi->truncate_from = 0; - pi->truncate_pending--; - - MutationRef mut(new MutationImpl()); - mut->ls = mds->mdlog->get_current_segment(); - mut->add_projected_inode(in); - - EUpdate *le = new EUpdate(mds->mdlog, "truncate finish"); - mds->mdlog->start_entry(le); - CDentry *dn = in->get_projected_parent_dn(); - le->metablob.add_dir_context(dn->get_dir()); - le->metablob.add_primary_dentry(dn, in, true); - le->metablob.add_truncate_finish(in->ino(), ls->seq); - - journal_dirty_inode(mut.get(), &le->metablob, in); - mds->mdlog->submit_entry(le, new C_MDC_TruncateLogged(this, in, mut)); - - // flush immediately if there are readers/writers waiting - if (in->is_waiter_for(CInode::WAIT_TRUNC) || - (in->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR))) - mds->mdlog->flush(); -} - -void MDCache::truncate_inode_logged(CInode *in, MutationRef& mut) -{ - dout(10) << "truncate_inode_logged " << *in << dendl; - mut->apply(); - mds->locker->drop_locks(mut.get()); - mut->cleanup(); - - in->put(CInode::PIN_TRUNCATING); - in->auth_unpin(this); - - list<MDSInternalContextBase*> waiters; - in->take_waiting(CInode::WAIT_TRUNC, waiters); - mds->queue_waiters(waiters); -} - - -void MDCache::add_recovered_truncate(CInode *in, LogSegment *ls) -{ - dout(20) << "add_recovered_truncate " << *in << " in log segment " - << ls->seq << "/" << ls->offset << dendl; - ls->truncating_inodes.insert(in); - in->get(CInode::PIN_TRUNCATING); -} - -void MDCache::remove_recovered_truncate(CInode *in, LogSegment *ls) -{ - dout(20) << "remove_recovered_truncate " << *in << " in log segment " - << ls->seq << "/" << ls->offset << dendl; - // if we have the logseg the truncate started in, it must be in our list.
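- // (inverse of add_recovered_truncate(): drop the segment's entry and the PIN_TRUNCATING ref taken there)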
- set<CInode*>::iterator p = ls->truncating_inodes.find(in); - assert(p != ls->truncating_inodes.end()); - ls->truncating_inodes.erase(p); - in->put(CInode::PIN_TRUNCATING); -} - -void MDCache::start_recovered_truncates() -{ - dout(10) << "start_recovered_truncates" << dendl; - for (map<uint64_t,LogSegment*>::iterator p = mds->mdlog->segments.begin(); - p != mds->mdlog->segments.end(); - ++p) { - LogSegment *ls = p->second; - for (set<CInode*>::iterator q = ls->truncating_inodes.begin(); - q != ls->truncating_inodes.end(); - ++q) { - CInode *in = *q; - in->auth_pin(this); - - if (!in->client_need_snapflush.empty() && - (in->get_caps_issued() & CEPH_CAP_FILE_BUFFER)) { - assert(in->filelock.is_stable()); - in->filelock.set_state(LOCK_XLOCKDONE); - in->auth_pin(&in->filelock); - in->filelock.set_xlock_snap_sync(new C_MDC_RetryTruncate(this, in, ls)); - // start_files_to_recover will revoke caps - continue; - } - _truncate_inode(in, ls); - } - } -} - - - - - - -// ================================================================================ -// cache trimming - -void MDCache::trim_lru(uint64_t count, map<mds_rank_t, MCacheExpire*> &expiremap) -{ - bool is_standby_replay = mds->is_standby_replay(); - std::vector<CDentry*> unexpirables; - uint64_t trimmed = 0; - - dout(7) << "trim_lru trimming " << count - << " items from LRU" - << " size=" << lru.lru_get_size() - << " mid=" << lru.lru_get_top() - << " pintail=" << lru.lru_get_pintail() - << " pinned=" << lru.lru_get_num_pinned() - << dendl; - - for (;;) { - CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire()); - if (!dn) - break; - if (trim_dentry(dn, expiremap)) { - unexpirables.push_back(dn); - } else { - trimmed++; - } - } - - for (auto &dn : unexpirables) { - bottom_lru.lru_insert_mid(dn); - } - unexpirables.clear(); - - // trim dentries from the LRU until count is reached - while (cache_toofull() || count > 0) { - CDentry *dn = static_cast<CDentry*>(lru.lru_expire()); - if (!dn) { - break; - } - if ((is_standby_replay && dn->get_linkage()->inode && - dn->get_linkage()->inode->item_open_file.is_on_list())) { - unexpirables.push_back(dn); - } else if (trim_dentry(dn, expiremap)) { - unexpirables.push_back(dn); - } else { - trimmed++; - if (count > 0) count--; - } - } - - for (auto &dn : unexpirables) { - lru.lru_insert_mid(dn); - } - unexpirables.clear(); - - dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl; -} - -/* - * note: only called while MDS is active or stopping... NOT during recovery. - * however, we may expire a replica whose authority is recovering.
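- * - * rough sequence (summary of the code below): advance delayed stray evals, - * trim_lru(), expire non-auth non-bound subtrees, then (when stopping) - * root/mdsdir and other ranks' base inodes, and finally batch one - * MCacheExpire per peer rank and send them.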
- * - * @param count is number of dentries to try to expire - */ -bool MDCache::trim(uint64_t count) -{ - uint64_t used = cache_size(); - uint64_t limit = cache_limit_memory(); - map<mds_rank_t, MCacheExpire*> expiremap; - - dout(7) << "trim bytes_used=" << bytes2str(used) - << " limit=" << bytes2str(limit) - << " reservation=" << cache_reservation() - << "% count=" << count << dendl; - - // process delayed eval_stray() - stray_manager.advance_delayed(); - - trim_lru(count, expiremap); - - // trim non-auth, non-bound subtrees - for (auto p = subtrees.begin(); p != subtrees.end();) { - CDir *dir = p->first; - ++p; - CInode *diri = dir->get_inode(); - if (dir->is_auth()) { - if (!diri->is_auth() && !diri->is_base() && - dir->get_num_head_items() == 0) { - if (dir->state_test(CDir::STATE_EXPORTING) || - !(mds->is_active() || mds->is_stopping()) || - dir->is_freezing() || dir->is_frozen()) - continue; - - migrator->export_empty_import(dir); - } - } else { - if (!diri->is_auth()) { - if (dir->get_num_ref() > 1) // only subtree pin - continue; - list<CDir*> ls; - diri->get_subtree_dirfrags(ls); - if (diri->get_num_ref() > (int)ls.size()) // only pinned by subtrees - continue; - - // don't trim subtree root if its auth MDS is recovering. - // This simplifies the cache rejoin code. - if (dir->is_subtree_root() && - rejoin_ack_gather.count(dir->get_dir_auth().first)) - continue; - trim_dirfrag(dir, 0, expiremap); - } - } - } - - // trim root? - if (mds->is_stopping() && root) { - list<CDir*> ls; - root->get_dirfrags(ls); - for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *dir = *p; - if (dir->get_num_ref() == 1) // subtree pin - trim_dirfrag(dir, 0, expiremap); - } - if (root->get_num_ref() == 0) - trim_inode(0, root, 0, expiremap); - } - - std::set<mds_rank_t> stopping; - mds->mdsmap->get_mds_set(stopping, MDSMap::STATE_STOPPING); - stopping.erase(mds->get_nodeid()); - for (auto rank : stopping) { - CInode* mdsdir_in = get_inode(MDS_INO_MDSDIR(rank)); - if (!mdsdir_in) - continue; - - if (expiremap.count(rank) == 0) { - expiremap[rank] = new MCacheExpire(mds->get_nodeid()); - } - - dout(20) << __func__ << ": try expiring " << *mdsdir_in << " for stopping mds."
<< mds << dendl; - - const bool aborted = expire_recursive(mdsdir_in, expiremap); - if (!aborted) { - dout(20) << __func__ << ": successfully expired mdsdir" << dendl; - list<CDir*> ls; - mdsdir_in->get_dirfrags(ls); - for (auto dir : ls) { - if (dir->get_num_ref() == 1) // subtree pin - trim_dirfrag(dir, dir, expiremap); - } - if (mdsdir_in->get_num_ref() == 0) - trim_inode(NULL, mdsdir_in, NULL, expiremap); - } else { - dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl; - } - } - - // Other rank's base inodes (when I'm stopping) - if (mds->is_stopping()) { - for (set<CInode*>::iterator p = base_inodes.begin(); - p != base_inodes.end(); ++p) { - if (MDS_INO_MDSDIR_OWNER((*p)->ino()) != mds->get_nodeid()) { - dout(20) << __func__ << ": maybe trimming base: " << *(*p) << dendl; - if ((*p)->get_num_ref() == 0) { - trim_inode(NULL, *p, NULL, expiremap); - } - } - } - } - - // send any expire messages - send_expire_messages(expiremap); - - return true; -} - -void MDCache::send_expire_messages(map<mds_rank_t, MCacheExpire*>& expiremap) -{ - // send expires - for (map<mds_rank_t, MCacheExpire*>::iterator it = expiremap.begin(); - it != expiremap.end(); - ++it) { - if (mds->is_cluster_degraded() && - (mds->mdsmap->get_state(it->first) < MDSMap::STATE_REJOIN || - (mds->mdsmap->get_state(it->first) == MDSMap::STATE_REJOIN && - rejoin_sent.count(it->first) == 0))) { - it->second->put(); - continue; - } - dout(7) << "sending cache_expire to " << it->first << dendl; - mds->send_message_mds(it->second, it->first); - } -} - - -bool MDCache::trim_dentry(CDentry *dn, map<mds_rank_t, MCacheExpire*>& expiremap) -{ - dout(12) << "trim_dentry " << *dn << dendl; - - CDentry::linkage_t *dnl = dn->get_linkage(); - - CDir *dir = dn->get_dir(); - assert(dir); - - CDir *con = get_subtree_root(dir); - if (con) - dout(12) << " in container " << *con << dendl; - else { - dout(12) << " no container; under a not-yet-linked dir" << dendl; - assert(dn->is_auth()); - } - - // If replica dentry is not readable, it's likely we will receive - // MDentryLink/MDentryUnlink message soon (It's possible we first - // receive a MDentryUnlink message, then MDentryLink message) - // MDentryLink message only replicates an inode, so we should - // avoid trimming the inode's parent dentry. This is because - // unconnected replicas are problematic for subtree migration. - if (!dn->is_auth() && !dn->lock.can_read(-1) && - !dn->get_dir()->get_inode()->is_stray()) - return true; - - // adjust the dir state - // NOTE: we can safely remove a clean, null dentry without affecting - // directory completeness. - // (check this _before_ we unlink the inode, below!) - bool clear_complete = false; - if (!(dnl->is_null() && dn->is_clean())) - clear_complete = true; - - // unlink the dentry - if (dnl->is_remote()) { - // just unlink. - dir->unlink_inode(dn, false); - } else if (dnl->is_primary()) { - // expire the inode, too. - CInode *in = dnl->get_inode(); - assert(in); - if (trim_inode(dn, in, con, expiremap)) - return true; // purging stray instead of trimming - } else { - assert(dnl->is_null()); - } - - if (!dn->is_auth()) { - // notify dentry authority. - mds_authority_t auth = dn->authority(); - - for (int p=0; p<2; p++) { - mds_rank_t a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. - - dout(12) << " sending expire to mds."
<< a << " on " << *dn << dendl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_dentry(con->dirfrag(), dir->dirfrag(), dn->name, dn->last, dn->get_replica_nonce()); - } - } - - // remove dentry - if (dn->last == CEPH_NOSNAP && dir->is_auth()) - dir->add_to_bloom(dn); - dir->remove_dentry(dn); - - if (clear_complete) - dir->state_clear(CDir::STATE_COMPLETE); - - if (mds->logger) mds->logger->inc(l_mds_inodes_expired); - return false; -} - - -void MDCache::trim_dirfrag(CDir *dir, CDir *con, map& expiremap) -{ - dout(15) << "trim_dirfrag " << *dir << dendl; - - if (dir->is_subtree_root()) { - assert(!dir->is_auth() || - (!dir->is_replicated() && dir->inode->is_base())); - remove_subtree(dir); // remove from subtree map - } - assert(dir->get_num_ref() == 0); - - CInode *in = dir->get_inode(); - - if (!dir->is_auth()) { - mds_authority_t auth = dir->authority(); - - // was this an auth delegation? (if so, slightly modified container) - dirfrag_t condf; - if (dir->is_subtree_root()) { - dout(12) << " subtree root, container is " << *dir << dendl; - con = dir; - condf = dir->dirfrag(); - } else { - condf = con->dirfrag(); - } - - for (int p=0; p<2; p++) { - mds_rank_t a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. - - dout(12) << " sending expire to mds." << a << " on " << *dir << dendl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_dir(condf, dir->dirfrag(), dir->replica_nonce); - } - } - - in->close_dirfrag(dir->dirfrag().frag); -} - -/** - * Try trimming an inode from the cache - * - * @return true if the inode is still in cache, else false if it was trimmed - */ -bool MDCache::trim_inode(CDentry *dn, CInode *in, CDir *con, map& expiremap) -{ - dout(15) << "trim_inode " << *in << dendl; - assert(in->get_num_ref() == 0); - - if (in->is_dir()) { - // If replica inode's dirfragtreelock is not readable, it's likely - // some dirfrags of the inode are being fragmented and we will receive - // MMDSFragmentNotify soon. MMDSFragmentNotify only replicates the new - // dirfrags, so we should avoid trimming these dirfrags' parent inode. - // This is because that unconnected replicas are problematic for - // subtree migration. - // - if (!in->is_auth() && !in->dirfragtreelock.can_read(-1)) - return true; - - // DIR - list dfls; - in->get_dirfrags(dfls); - for (list::iterator p = dfls.begin(); p != dfls.end(); ++p) { - CDir *dir = *p; - assert(!dir->is_subtree_root()); - trim_dirfrag(dir, con ? con:dir, expiremap); // if no container (e.g. root dirfrag), use *p - } - } - - // INODE - if (in->is_auth()) { - // eval stray after closing dirfrags - if (dn && !dn->state_test(CDentry::STATE_PURGING)) { - maybe_eval_stray(in); - if (dn->state_test(CDentry::STATE_PURGING) || dn->get_num_ref() > 0) - return true; - } - } else { - mds_authority_t auth = in->authority(); - - dirfrag_t df; - if (con) - df = con->dirfrag(); - else - df = dirfrag_t(0,frag_t()); // must be a root or stray inode. 
- - for (int p=0; p<2; p++) { - mds_rank_t a = auth.first; - if (p) a = auth.second; - if (a < 0 || (p == 1 && auth.second == auth.first)) break; - if (con && mds->get_nodeid() == auth.second && - con->is_importing()) break; // don't send any expire while importing. - if (a == mds->get_nodeid()) continue; // on export, ignore myself. - - dout(12) << " sending expire to mds." << a << " on " << *in << dendl; - assert(a != mds->get_nodeid()); - if (expiremap.count(a) == 0) - expiremap[a] = new MCacheExpire(mds->get_nodeid()); - expiremap[a]->add_inode(df, in->vino(), in->get_replica_nonce()); - } - } - - /* - if (in->is_auth()) { - if (in->hack_accessed) - mds->logger->inc("outt"); - else { - mds->logger->inc("outut"); - mds->logger->fset("oututl", ceph_clock_now() - in->hack_load_stamp); - } - } - */ - - // unlink - if (dn) - dn->get_dir()->unlink_inode(dn, false); - remove_inode(in); - return false; -} - - -/** - * trim_non_auth - remove any non-auth items from our cache - * - * this reduces the amount of non-auth metadata in our cache, reducing the - * load incurred by the rejoin phase. - * - * the only non-auth items that remain are those that are needed to - * attach our own subtrees to the root. - * - * when we are done, all dentries will be in the top bit of the lru. - * - * why we have to do this: - * we may not have accurate linkage for non-auth items. which means we will - * not know which subtree it falls into, and can not be sure to declare it to the - * correct authority. - */ -void MDCache::trim_non_auth() -{ - dout(7) << "trim_non_auth" << dendl; - - // temporarily pin all subtree roots - for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) - p->first->get(CDir::PIN_SUBTREETEMP); - - list<CDentry*> auth_list; - - // trim non-auth items from the lru - for (;;) { - CDentry *dn = NULL; - if (bottom_lru.lru_get_size() > 0) - dn = static_cast<CDentry*>(bottom_lru.lru_expire()); - if (!dn && lru.lru_get_size() > 0) - dn = static_cast<CDentry*>(lru.lru_expire()); - if (!dn) - break; - - CDentry::linkage_t *dnl = dn->get_linkage(); - - if (dn->is_auth()) { - // add back into lru (at the top) - auth_list.push_back(dn); - - if (dnl->is_remote() && dnl->get_inode() && !dnl->get_inode()->is_auth()) - dn->unlink_remote(dnl); - } else { - // non-auth. expire. - CDir *dir = dn->get_dir(); - assert(dir); - - // unlink the dentry - dout(10) << " removing " << *dn << dendl; - if (dnl->is_remote()) { - dir->unlink_inode(dn, false); - } - else if (dnl->is_primary()) { - CInode *in = dnl->get_inode(); - dout(10) << " removing " << *in << dendl; - list<CDir*> ls; - in->get_dirfrags(ls); - for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *subdir = *p; - assert(!subdir->is_subtree_root()); - in->close_dirfrag(subdir->dirfrag().frag); - } - dir->unlink_inode(dn, false); - remove_inode(in); - } - else { - assert(dnl->is_null()); - } - - assert(!dir->has_bloom()); - dir->remove_dentry(dn); - // adjust the dir state - dir->state_clear(CDir::STATE_COMPLETE); // dir incomplete! - // close empty non-auth dirfrag - if (!dir->is_subtree_root() && dir->get_num_any() == 0) - dir->inode->close_dirfrag(dir->get_frag()); - } - } - - for (auto dn : auth_list) { - if (dn->state_test(CDentry::STATE_BOTTOMLRU)) - bottom_lru.lru_insert_mid(dn); - else - lru.lru_insert_top(dn); - } - - // move everything in the pintail to the top bit of the lru.
- lru.lru_touch_entire_pintail(); - - // unpin all subtrees - for (map<CDir*, set<CDir*> >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) - p->first->put(CDir::PIN_SUBTREETEMP); - - if (lru.lru_get_size() == 0 && - bottom_lru.lru_get_size() == 0) { - // root, stray, etc.? - ceph::unordered_map<vinodeno_t,CInode*>::iterator p = inode_map.begin(); - while (p != inode_map.end()) { - ceph::unordered_map<vinodeno_t,CInode*>::iterator next = p; - ++next; - CInode *in = p->second; - if (!in->is_auth()) { - list<CDir*> ls; - in->get_dirfrags(ls); - for (list<CDir*>::iterator p = ls.begin(); - p != ls.end(); - ++p) { - dout(10) << " removing " << **p << dendl; - assert((*p)->get_num_ref() == 1); // SUBTREE - remove_subtree((*p)); - in->close_dirfrag((*p)->dirfrag().frag); - } - dout(10) << " removing " << *in << dendl; - assert(!in->get_parent_dn()); - assert(in->get_num_ref() == 0); - remove_inode(in); - } - p = next; - } - } - - show_subtrees(); -} - -/** - * Recursively trim the subtree rooted at directory to remove all - * CInodes/CDentrys/CDirs that aren't links to remote MDSes, or ancestors - * of those links. This is used to clear invalid data out of the cache. - * Note that it doesn't clear the passed-in directory, since that's not - * always safe. - */ -bool MDCache::trim_non_auth_subtree(CDir *dir) -{ - dout(10) << "trim_non_auth_subtree(" << dir << ") " << *dir << dendl; - - bool keep_dir = !can_trim_non_auth_dirfrag(dir); - - CDir::map_t::iterator j = dir->begin(); - CDir::map_t::iterator i = j; - while (j != dir->end()) { - i = j++; - CDentry *dn = i->second; - dout(10) << "trim_non_auth_subtree(" << dir << ") Checking dentry " << dn << dendl; - CDentry::linkage_t *dnl = dn->get_linkage(); - if (dnl->is_primary()) { // check for subdirectories, etc - CInode *in = dnl->get_inode(); - bool keep_inode = false; - if (in->is_dir()) { - list<CDir*> subdirs; - in->get_dirfrags(subdirs); - for (list<CDir*>::iterator subdir = subdirs.begin(); - subdir != subdirs.end(); - ++subdir) { - if ((*subdir)->is_subtree_root()) { - keep_inode = true; - dout(10) << "trim_non_auth_subtree(" << dir << ") keeping " << **subdir << dendl; - } else { - if (trim_non_auth_subtree(*subdir)) - keep_inode = true; - else { - in->close_dirfrag((*subdir)->get_frag()); - dir->state_clear(CDir::STATE_COMPLETE); // now incomplete! - } - } - } - - } - if (!keep_inode) { // remove it! - dout(20) << "trim_non_auth_subtree(" << dir << ") removing inode " << in << " with dentry " << dn << dendl; - dir->unlink_inode(dn, false); - remove_inode(in); - assert(!dir->has_bloom()); - dir->remove_dentry(dn); - } else { - dout(20) << "trim_non_auth_subtree(" << dir << ") keeping inode " << in << " with dentry " << dn << dendl; - dn->state_clear(CDentry::STATE_AUTH); - in->state_clear(CInode::STATE_AUTH); - } - } else if (keep_dir && dnl->is_null()) { // keep null dentry for slave rollback - dout(20) << "trim_non_auth_subtree(" << dir << ") keeping dentry " << dn << dendl; - } else { // just remove it - dout(20) << "trim_non_auth_subtree(" << dir << ") removing dentry " << dn << dendl; - if (dnl->is_remote()) - dir->unlink_inode(dn, false); - dir->remove_dentry(dn); - } - } - dir->state_clear(CDir::STATE_AUTH); - /** - * We've now checked all our children and deleted those that need it. - * Now return to caller, and tell them if *we're* a keeper. - */ - return keep_dir || dir->get_num_any(); -} - -/* - * during replay, when we determine a subtree is no longer ours, we - * try to trim it from our cache. because subtrees must be connected - * to the root, the fact that we can trim this tree may mean that our - * children or parents can also be trimmed.
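- * try_trim_non_auth_subtree() below does that: it first closes empty - * non-auth child bounds, then trims this subtree, and if the whole tree - * went away it keeps walking up through non-auth parents until it reaches - * an auth parent or a base inode.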
- */ -void MDCache::try_trim_non_auth_subtree(CDir *dir) -{ - dout(10) << "try_trim_nonauth_subtree " << *dir << dendl; - - // can we now trim child subtrees? - set<CDir*> bounds; - get_subtree_bounds(dir, bounds); - for (set<CDir*>::iterator p = bounds.begin(); p != bounds.end(); ++p) { - CDir *bd = *p; - if (bd->get_dir_auth().first != mds->get_nodeid() && // we are not auth - bd->get_num_any() == 0 && // and empty - can_trim_non_auth_dirfrag(bd)) { - CInode *bi = bd->get_inode(); - dout(10) << " closing empty non-auth child subtree " << *bd << dendl; - remove_subtree(bd); - bd->mark_clean(); - bi->close_dirfrag(bd->get_frag()); - } - } - - if (trim_non_auth_subtree(dir)) { - // keep - try_subtree_merge(dir); - } else { - // can we trim this subtree (and possibly our ancestors) too? - while (true) { - CInode *diri = dir->get_inode(); - if (diri->is_base()) { - if (!diri->is_root() && diri->authority().first != mds->get_nodeid()) { - dout(10) << " closing empty non-auth subtree " << *dir << dendl; - remove_subtree(dir); - dir->mark_clean(); - diri->close_dirfrag(dir->get_frag()); - - dout(10) << " removing " << *diri << dendl; - assert(!diri->get_parent_dn()); - assert(diri->get_num_ref() == 0); - remove_inode(diri); - } - break; - } - - CDir *psub = get_subtree_root(diri->get_parent_dir()); - dout(10) << " parent subtree is " << *psub << dendl; - if (psub->get_dir_auth().first == mds->get_nodeid()) - break; // we are auth, keep. - - dout(10) << " closing empty non-auth subtree " << *dir << dendl; - remove_subtree(dir); - dir->mark_clean(); - diri->close_dirfrag(dir->get_frag()); - - dout(10) << " parent subtree also non-auth: " << *psub << dendl; - if (trim_non_auth_subtree(psub)) - break; - dir = psub; - } - } - - show_subtrees(); -} - -void MDCache::standby_trim_segment(LogSegment *ls) -{ - ls->new_dirfrags.clear_list(); - ls->open_files.clear_list(); - - while (!ls->dirty_dirfrags.empty()) { - CDir *dir = ls->dirty_dirfrags.front(); - dir->mark_clean(); - } - while (!ls->dirty_inodes.empty()) { - CInode *in = ls->dirty_inodes.front(); - in->mark_clean(); - } - while (!ls->dirty_dentries.empty()) { - CDentry *dn = ls->dirty_dentries.front(); - dn->mark_clean(); - } - while (!ls->dirty_parent_inodes.empty()) { - CInode *in = ls->dirty_parent_inodes.front(); - in->clear_dirty_parent(); - } - while (!ls->dirty_dirfrag_dir.empty()) { - CInode *in = ls->dirty_dirfrag_dir.front(); - in->filelock.remove_dirty(); - } - while (!ls->dirty_dirfrag_nest.empty()) { - CInode *in = ls->dirty_dirfrag_nest.front(); - in->nestlock.remove_dirty(); - } - while (!ls->dirty_dirfrag_dirfragtree.empty()) { - CInode *in = ls->dirty_dirfrag_dirfragtree.front(); - in->dirfragtreelock.remove_dirty(); - } -} - -/* This function DOES put the passed message before returning */ -void MDCache::handle_cache_expire(MCacheExpire *m) -{ - mds_rank_t from = mds_rank_t(m->get_from()); - - dout(7) << "cache_expire from mds." << from << dendl; - - if (mds->get_state() < MDSMap::STATE_REJOIN) { - m->put(); - return; - } - - set<SimpleLock *> gather_locks; - // loop over realms - for (map<dirfrag_t,MCacheExpire::realm>::iterator p = m->realms.begin(); - p != m->realms.end(); - ++p) { - // check container? - if (p->first.ino > 0) { - CInode *expired_inode = get_inode(p->first.ino); - assert(expired_inode); // we had better have this.
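- // note: if the containing dirfrag is mid-export (roughly the WARNING through NOTIFYING window) we are about to lose authority, so the expire is parked in delayed_expire and later replayed or dropped (see process_delayed_expire/discard_delayed_expire).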
- CDir *parent_dir = expired_inode->get_approx_dirfrag(p->first.frag); - assert(parent_dir); - - int export_state = -1; - if (parent_dir->is_auth() && parent_dir->is_exporting()) { - export_state = migrator->get_export_state(parent_dir); - assert(export_state >= 0); - } - - if (!parent_dir->is_auth() || - (export_state != -1 && - ((export_state == Migrator::EXPORT_WARNING && - migrator->export_has_warned(parent_dir,from)) || - export_state == Migrator::EXPORT_EXPORTING || - export_state == Migrator::EXPORT_LOGGINGFINISH || - (export_state == Migrator::EXPORT_NOTIFYING && - !migrator->export_has_notified(parent_dir,from))))) { - - // not auth. - dout(7) << "delaying nonauth|warned expires for " << *parent_dir << dendl; - assert(parent_dir->is_frozen_tree_root()); - - // make a message container - if (delayed_expire[parent_dir].count(from) == 0) - delayed_expire[parent_dir][from] = new MCacheExpire(from); - - // merge these expires into it - delayed_expire[parent_dir][from]->add_realm(p->first, p->second); - continue; - } - assert(export_state <= Migrator::EXPORT_PREPPING || - (export_state == Migrator::EXPORT_WARNING && - !migrator->export_has_warned(parent_dir, from))); - - dout(7) << "expires for " << *parent_dir << dendl; - } else { - dout(7) << "containerless expires (root, stray inodes)" << dendl; - } - - // INODES - for (map<vinodeno_t,uint32_t>::iterator it = p->second.inodes.begin(); - it != p->second.inodes.end(); - ++it) { - CInode *in = get_inode(it->first); - unsigned nonce = it->second; - - if (!in) { - dout(0) << " inode expire on " << it->first << " from " << from - << ", don't have it" << dendl; - assert(in); - } - assert(in->is_auth()); - dout(20) << __func__ << ": expiring inode " << *in << dendl; - - // check nonce - if (nonce == in->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << " inode expire on " << *in << " from mds." << from - << " cached_by was " << in->get_replicas() << dendl; - inode_remove_replica(in, from, false, gather_locks); - } - else { - // this is an old nonce, ignore expire. - dout(7) << " inode expire on " << *in << " from mds." << from - << " with old nonce " << nonce - << " (current " << in->get_replica_nonce(from) << "), dropping" - << dendl; - } - } - - // DIRS - for (map<dirfrag_t,uint32_t>::iterator it = p->second.dirs.begin(); - it != p->second.dirs.end(); - ++it) { - CDir *dir = get_dirfrag(it->first); - unsigned nonce = it->second; - - if (!dir) { - CInode *diri = get_inode(it->first.ino); - if (diri) { - if (mds->is_rejoin() && - rejoin_ack_gather.count(mds->get_nodeid()) && // haven't sent rejoin ack yet - !diri->is_replica(from)) { - list<CDir*> ls; - diri->get_nested_dirfrags(ls); - dout(7) << " dir expire on dirfrag " << it->first << " from mds." << from - << " while rejoining, inode isn't replicated" << dendl; - for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) { - dir = *q; - if (dir->is_replica(from)) { - dout(7) << " dir expire on " << *dir << " from mds." << from << dendl; - dir->remove_replica(from); - } - } - continue; - } - CDir *other = diri->get_approx_dirfrag(it->first.frag); - if (other) { - dout(7) << " dir expire on dirfrag " << it->first << " from mds."
<< from - << " have " << *other << ", mismatched frags, dropping" << dendl; - continue; - } - } - dout(0) << " dir expire on " << it->first << " from " << from - << ", don't have it" << dendl; - assert(dir); - } - dout(20) << __func__ << ": expiring dirfrag " << *dir << dendl; - - assert(dir->is_auth()); - - // check nonce - if (nonce == dir->get_replica_nonce(from)) { - // remove from our cached_by - dout(7) << " dir expire on " << *dir << " from mds." << from - << " replicas was " << dir->get_replicas() << dendl; - dir->remove_replica(from); - } - else { - // this is an old nonce, ignore expire. - dout(7) << " dir expire on " << *dir << " from mds." << from - << " with old nonce " << nonce << " (current " << dir->get_replica_nonce(from) - << "), dropping" << dendl; - } - } - - // DENTRIES - for (map,uint32_t> >::iterator pd = p->second.dentries.begin(); - pd != p->second.dentries.end(); - ++pd) { - dout(10) << " dn expires in dir " << pd->first << dendl; - CInode *diri = get_inode(pd->first.ino); - assert(diri); - CDir *dir = diri->get_dirfrag(pd->first.frag); - - if (!dir) { - dout(0) << " dn expires on " << pd->first << " from " << from - << ", must have refragmented" << dendl; - } else { - assert(dir->is_auth()); - } - - for (map,uint32_t>::iterator p = pd->second.begin(); - p != pd->second.end(); - ++p) { - unsigned nonce = p->second; - CDentry *dn; - - if (dir) { - dn = dir->lookup(p->first.first, p->first.second); - } else { - // which dirfrag for this dentry? - CDir *dir = diri->get_dirfrag(diri->pick_dirfrag(p->first.first)); - assert(dir); - assert(dir->is_auth()); - dn = dir->lookup(p->first.first, p->first.second); - } - - if (!dn) { - if (dir) - dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << " in " << *dir << dendl; - else - dout(0) << " missing dentry for " << p->first.first << " snap " << p->first.second << dendl; - } - assert(dn); - - if (nonce == dn->get_replica_nonce(from)) { - dout(7) << " dentry_expire on " << *dn << " from mds." << from << dendl; - dentry_remove_replica(dn, from, gather_locks); - } - else { - dout(7) << " dentry_expire on " << *dn << " from mds." << from - << " with old nonce " << nonce << " (current " << dn->get_replica_nonce(from) - << "), dropping" << dendl; - } - } - } - } - - // done - m->put(); - - for (set::iterator p = gather_locks.begin(); p != gather_locks.end(); ++p) { - if (!(*p)->is_stable()) - mds->locker->eval_gather(*p); - } -} - -void MDCache::process_delayed_expire(CDir *dir) -{ - dout(7) << "process_delayed_expire on " << *dir << dendl; - for (map::iterator p = delayed_expire[dir].begin(); - p != delayed_expire[dir].end(); - ++p) - handle_cache_expire(p->second); - delayed_expire.erase(dir); -} - -void MDCache::discard_delayed_expire(CDir *dir) -{ - dout(7) << "discard_delayed_expire on " << *dir << dendl; - for (map::iterator p = delayed_expire[dir].begin(); - p != delayed_expire[dir].end(); - ++p) - p->second->put(); - delayed_expire.erase(dir); -} - -void MDCache::inode_remove_replica(CInode *in, mds_rank_t from, bool rejoin, - set& gather_locks) -{ - in->remove_replica(from); - in->mds_caps_wanted.erase(from); - - // note: this code calls _eval more often than it needs to! 
- // fix lock - if (in->authlock.remove_replica(from)) gather_locks.insert(&in->authlock); - if (in->linklock.remove_replica(from)) gather_locks.insert(&in->linklock); - if (in->snaplock.remove_replica(from)) gather_locks.insert(&in->snaplock); - if (in->xattrlock.remove_replica(from)) gather_locks.insert(&in->xattrlock); - if (in->flocklock.remove_replica(from)) gather_locks.insert(&in->flocklock); - if (in->policylock.remove_replica(from)) gather_locks.insert(&in->policylock); - - // If 'rejoin' is true and the scatter lock is in LOCK_MIX_* state. - // Don't remove the recovering mds from lock's gathering list because - // it may hold rejoined wrlocks. - if (in->dirfragtreelock.remove_replica(from, rejoin)) gather_locks.insert(&in->dirfragtreelock); - if (in->filelock.remove_replica(from, rejoin)) gather_locks.insert(&in->filelock); - if (in->nestlock.remove_replica(from, rejoin)) gather_locks.insert(&in->nestlock); -} - -void MDCache::dentry_remove_replica(CDentry *dn, mds_rank_t from, set<SimpleLock *>& gather_locks) -{ - dn->remove_replica(from); - - // fix lock - if (dn->lock.remove_replica(from)) - gather_locks.insert(&dn->lock); - - // Replicated strays might now be eligible for purge - CDentry::linkage_t *dnl = dn->get_linkage(); - if (dnl->is_primary()) { - maybe_eval_stray(dnl->get_inode()); - } -} - -void MDCache::trim_client_leases() -{ - utime_t now = ceph_clock_now(); - - dout(10) << "trim_client_leases" << dendl; - - for (int pool=0; pool<client_lease_pools; pool++) { - int before = client_leases[pool].size(); - while (!client_leases[pool].empty()) { - ClientLease *r = client_leases[pool].front(); - if (r->ttl > now) break; - CDentry *dn = static_cast<CDentry*>(r->parent); - dout(10) << " expiring client." << r->client << " lease of " << *dn << dendl; - dn->remove_client_lease(r, mds->locker); - } - int after = client_leases[pool].size(); - dout(10) << "trim_client_leases pool " << pool << " trimmed " - << (before-after) << " leases, " << after << " left" << dendl; - } -} - - -void MDCache::check_memory_usage() -{ - static MemoryModel mm(g_ceph_context); - static MemoryModel::snap last; - mm.sample(&last); - static MemoryModel::snap baseline = last; - - // check client caps - assert(CInode::count() == inode_map.size()); - double caps_per_inode = 0.0; - if (CInode::count()) - caps_per_inode = (double)Capability::count() / (double)CInode::count(); - - dout(2) << "check_memory_usage" - << " total " << last.get_total() - << ", rss " << last.get_rss() - << ", heap " << last.get_heap() - << ", baseline " << baseline.get_heap() - << ", buffers " << (buffer::get_total_alloc() >> 10) - << ", " << num_inodes_with_caps << " / " << CInode::count() << " inodes have caps" - << ", " << Capability::count() << " caps, " << caps_per_inode << " caps per inode" - << dendl; - - mds->update_mlogger(); - mds->mlogger->set(l_mdm_rss, last.get_rss()); - mds->mlogger->set(l_mdm_heap, last.get_heap()); - - if (cache_toofull()) { - last_recall_state = ceph_clock_now(); - mds->server->recall_client_state(); - } - - // If the cache size had exceeded its limit, but we're back in bounds - // now, free any unused pool memory so that our memory usage isn't - // permanently bloated.
- if (exceeded_size_limit && !cache_toofull()) { - // Only do this once we are back in bounds: otherwise the releases would - // slow down whatever process caused us to exceed bounds to begin with - if (ceph_using_tcmalloc()) { - dout(2) << "check_memory_usage: releasing unused space from tcmalloc" - << dendl; - ceph_heap_release_free_memory(); - } - exceeded_size_limit = false; - } -} - - - -// ========================================================================================= -// shutdown - -class C_MDC_ShutdownCheck : public MDCacheContext { -public: - explicit C_MDC_ShutdownCheck(MDCache *m) : MDCacheContext(m) {} - void finish(int) override { - mdcache->shutdown_check(); - } -}; - -void MDCache::shutdown_check() -{ - dout(0) << "shutdown_check at " << ceph_clock_now() << dendl; - - // cache - char old_val[32] = { 0 }; - char *o = old_val; - g_conf->get_val("debug_mds", &o, sizeof(old_val)); - g_conf->set_val("debug_mds", "10"); - g_conf->apply_changes(NULL); - show_cache(); - g_conf->set_val("debug_mds", old_val); - g_conf->apply_changes(NULL); - mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this)); - - // this - dout(0) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl; - dout(0) << "log len " << mds->mdlog->get_num_events() << dendl; - - - if (mds->objecter->is_active()) { - dout(0) << "objecter still active" << dendl; - mds->objecter->dump_active(); - } -} - - -void MDCache::shutdown_start() -{ - dout(2) << "shutdown_start" << dendl; - - if (g_conf->mds_shutdown_check) - mds->timer.add_event_after(g_conf->mds_shutdown_check, new C_MDC_ShutdownCheck(this)); - - // g_conf->debug_mds = 10; -} - - - -bool MDCache::shutdown_pass() -{ - dout(7) << "shutdown_pass" << dendl; - - if (mds->is_stopped()) { - dout(7) << " already shut down" << dendl; - show_cache(); - show_subtrees(); - return true; - } - - // empty stray dir - if (!shutdown_export_strays()) { - dout(7) << "waiting for strays to migrate" << dendl; - return false; - } - - // drop our reference to our stray dir inode - for (int i = 0; i < NUM_STRAY; ++i) { - if (strays[i] && - strays[i]->state_test(CInode::STATE_STRAYPINNED)) { - strays[i]->state_clear(CInode::STATE_STRAYPINNED); - strays[i]->put(CInode::PIN_STRAY); - strays[i]->put_stickydirs(); - } - } - - // trim cache - trim(UINT64_MAX); - dout(5) << "lru size now " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl; - - // SUBTREES - int num_auth_subtree = 0; - if (!subtrees.empty() && - mds->get_nodeid() != 0 && - migrator->get_export_queue_size() == 0) { - dout(7) << "looking for subtrees to export to mds0" << dendl; - list<CDir*> ls; - for (map<CDir*, set<CDir*> >::iterator it = subtrees.begin(); - it != subtrees.end(); - ++it) { - CDir *dir = it->first; - if (dir->get_inode()->is_mdsdir()) - continue; - if (dir->is_auth()) { - num_auth_subtree++; - if (dir->is_frozen() || - dir->is_freezing() || - dir->is_ambiguous_dir_auth() || - dir->state_test(CDir::STATE_EXPORTING)) - continue; - ls.push_back(dir); - } - } - for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *dir = *p; - mds_rank_t dest = dir->get_inode()->authority().first; - if (dest > 0 && !mds->mdsmap->is_active(dest)) - dest = 0; - dout(7) << "sending " << *dir << " back to mds."
<< dest << dendl; - migrator->export_dir_nicely(dir, dest); - } - } - - if (num_auth_subtree > 0) { - dout(7) << "still have " << num_auth_subtree << " auth subtrees" << dendl; - show_subtrees(); - return false; - } - - // close out any sessions (and open files!) before we try to trim the log, etc. - if (mds->sessionmap.have_unclosed_sessions()) { - if (!mds->server->terminating_sessions) - mds->server->terminate_sessions(); - return false; - } - - CDir *mydir = myin ? myin->get_dirfrag(frag_t()) : NULL; - if (mydir && !mydir->is_subtree_root()) - mydir = NULL; - - // subtrees map not empty yet? - if (subtrees.size() > (mydir ? 1 : 0)) { - dout(7) << "still have " << num_subtrees() << " subtrees" << dendl; - show_subtrees(); - migrator->show_importing(); - migrator->show_exporting(); - if (!migrator->is_importing() && !migrator->is_exporting()) - show_cache(); - return false; - } - assert(!migrator->is_exporting()); - assert(!migrator->is_importing()); - - // flush what we can from the log - mds->mdlog->trim(0); - if (mds->mdlog->get_num_segments() > 1) { - dout(7) << "still >1 segments, waiting for log to trim" << dendl; - return false; - } - - if ((myin && myin->is_auth_pinned()) || - (mydir && mydir->is_auth_pinned())) { - dout(7) << "still have auth pinned objects" << dendl; - return false; - } - - // (only do this once!) - if (!mds->mdlog->is_capped()) { - dout(7) << "capping the log" << dendl; - mds->mdlog->cap(); - mds->mdlog->trim(); - } - - if (!mds->mdlog->empty()) { - dout(7) << "waiting for log to flush.. " << mds->mdlog->get_num_events() - << " in " << mds->mdlog->get_num_segments() << " segments" << dendl; - return false; - } - - if (!did_shutdown_log_cap) { - // flush journal header - dout(7) << "writing header for (now-empty) journal" << dendl; - assert(mds->mdlog->empty()); - mds->mdlog->write_head(0); - // NOTE: filer active checker below will block us until this completes. - did_shutdown_log_cap = true; - return false; - } - - // filer active? - if (mds->objecter->is_active()) { - dout(7) << "objecter still active" << dendl; - mds->objecter->dump_active(); - return false; - } - - // trim what we can from the cache - if (lru.lru_get_size() > 0 || bottom_lru.lru_get_size() > 0) { - dout(7) << "there's still stuff in the cache: " << lru.lru_get_size() << "/" << bottom_lru.lru_get_size() << dendl; - show_cache(); - //dump(); - return false; - } - - // make mydir subtree go away - if (mydir) { - if (mydir->get_num_ref() > 1) { // subtree pin - dout(7) << "there's still reference to mydir " << *mydir << dendl; - show_cache(); - return false; - } - - remove_subtree(mydir); - myin->close_dirfrag(mydir->get_frag()); - } - assert(subtrees.empty()); - - if (myin) - remove_inode(myin); - - // done! - dout(2) << "shutdown done." 
<< dendl; - return true; -} - -bool MDCache::shutdown_export_strays() -{ - if (mds->get_nodeid() == 0) - return true; - - dout(10) << "shutdown_export_strays" << dendl; - - bool mds0_active = mds->mdsmap->is_active(mds_rank_t(0)); - - bool done = true; - - list<CDir*> dfs; - for (int i = 0; i < NUM_STRAY; ++i) { - if (!strays[i]) { - continue; - } - strays[i]->get_dirfrags(dfs); - } - - for (std::list<CDir*>::iterator dfs_i = dfs.begin(); - dfs_i != dfs.end(); ++dfs_i) - { - CDir *dir = *dfs_i; - - if (!dir->is_complete()) { - dir->fetch(0); - done = false; - if (!mds0_active) - break; - } - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - CDentry::linkage_t *dnl = dn->get_linkage(); - if (dnl->is_null()) - continue; - done = false; - if (!mds0_active) - break; - - if (dn->state_test(CDentry::STATE_PURGING)) { - // Don't try to migrate anything that is actually - // being purged right now - continue; - } - - if (shutdown_exported_strays.count(dnl->get_inode()->ino()) == 0) { - shutdown_exported_strays.insert(dnl->get_inode()->ino()); - stray_manager.migrate_stray(dn, mds_rank_t(0)); // send to root! - } else { - dout(10) << "already exporting " << *dn << dendl; - } - } - } - - return done; -} - -// ========= messaging ============== - -/* This function DOES put the passed message before returning */ -void MDCache::dispatch(Message *m) -{ - switch (m->get_type()) { - - // RESOLVE - case MSG_MDS_RESOLVE: - handle_resolve(static_cast<MMDSResolve*>(m)); - break; - case MSG_MDS_RESOLVEACK: - handle_resolve_ack(static_cast<MMDSResolveAck*>(m)); - break; - - // REJOIN - case MSG_MDS_CACHEREJOIN: - handle_cache_rejoin(static_cast<MMDSCacheRejoin*>(m)); - break; - - case MSG_MDS_DISCOVER: - handle_discover(static_cast<MDiscover*>(m)); - break; - case MSG_MDS_DISCOVERREPLY: - handle_discover_reply(static_cast<MDiscoverReply*>(m)); - break; - - case MSG_MDS_DIRUPDATE: - handle_dir_update(static_cast<MDirUpdate*>(m)); - break; - - case MSG_MDS_CACHEEXPIRE: - handle_cache_expire(static_cast<MCacheExpire*>(m)); - break; - - case MSG_MDS_DENTRYLINK: - handle_dentry_link(static_cast<MDentryLink*>(m)); - break; - case MSG_MDS_DENTRYUNLINK: - handle_dentry_unlink(static_cast<MDentryUnlink*>(m)); - break; - - case MSG_MDS_FRAGMENTNOTIFY: - handle_fragment_notify(static_cast<MMDSFragmentNotify*>(m)); - break; - - case MSG_MDS_FINDINO: - handle_find_ino(static_cast<MMDSFindIno*>(m)); - break; - case MSG_MDS_FINDINOREPLY: - handle_find_ino_reply(static_cast<MMDSFindInoReply*>(m)); - break; - - case MSG_MDS_OPENINO: - handle_open_ino(static_cast<MMDSOpenIno*>(m)); - break; - case MSG_MDS_OPENINOREPLY: - handle_open_ino_reply(static_cast<MMDSOpenInoReply*>(m)); - break; - - default: - derr << "cache unknown message " << m->get_type() << dendl; - assert(0 == "cache unknown message"); - } -} - -MDSInternalContextBase *MDCache::_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin) -{ - if (mdr) { - dout(20) << "_get_waiter retryrequest" << dendl; - return new C_MDS_RetryRequest(this, mdr); - } else if (req) { - dout(20) << "_get_waiter retrymessage" << dendl; - return new C_MDS_RetryMessage(mds, req); - } else { - return fin; - } -} - -int MDCache::path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, // who - const filepath& path, // what - vector<CDentry*> *pdnvec, // result - CInode **pin, - int onfail) -{ - bool discover = (onfail == MDS_TRAVERSE_DISCOVER); - bool null_okay = (onfail == MDS_TRAVERSE_DISCOVERXLOCK); - bool forward = (onfail == MDS_TRAVERSE_FORWARD); - - assert(mdr || req || fin); - assert(!forward || mdr || req); // forward requires a request - - snapid_t snapid = CEPH_NOSNAP; - if (mdr) - mdr->snapid = snapid; - - client_t client
= (mdr && mdr->reqid.name.is_client()) ? mdr->reqid.name.num() : -1; - - if (mds->logger) mds->logger->inc(l_mds_traverse); - - dout(7) << "traverse: opening base ino " << path.get_ino() << " snap " << snapid << dendl; - CInode *cur = get_inode(path.get_ino()); - if (cur == NULL) { - if (MDS_INO_IS_MDSDIR(path.get_ino())) - open_foreign_mdsdir(path.get_ino(), _get_waiter(mdr, req, fin)); - else { - //ceph_abort(); // hrm.. broken - return -ESTALE; - } - return 1; - } - if (cur->state_test(CInode::STATE_PURGING)) - return -ESTALE; - - // make sure snaprealm are open... - if (mdr && cur->snaprealm && !cur->snaprealm->is_open() && - !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) { - return 1; - } - - // start trace - if (pdnvec) - pdnvec->clear(); - if (pin) - *pin = cur; - - unsigned depth = 0; - while (depth < path.depth()) { - dout(12) << "traverse: path seg depth " << depth << " '" << path[depth] - << "' snapid " << snapid << dendl; - - if (!cur->is_dir()) { - dout(7) << "traverse: " << *cur << " not a dir " << dendl; - return -ENOTDIR; - } - - // walk into snapdir? - if (path[depth].length() == 0) { - dout(10) << "traverse: snapdir" << dendl; - if (!mdr) - return -EINVAL; - snapid = CEPH_SNAPDIR; - mdr->snapid = snapid; - depth++; - continue; - } - // walk thru snapdir? - if (snapid == CEPH_SNAPDIR) { - if (!mdr) - return -EINVAL; - SnapRealm *realm = cur->find_snaprealm(); - snapid = realm->resolve_snapname(path[depth], cur->ino()); - dout(10) << "traverse: snap " << path[depth] << " -> " << snapid << dendl; - if (!snapid) - return -ENOENT; - mdr->snapid = snapid; - depth++; - continue; - } - - // open dir - frag_t fg = cur->pick_dirfrag(path[depth]); - CDir *curdir = cur->get_dirfrag(fg); - if (!curdir) { - if (cur->is_auth()) { - // parent dir frozen_dir? - if (cur->is_frozen()) { - dout(7) << "traverse: " << *cur << " is frozen, waiting" << dendl; - cur->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin)); - return 1; - } - curdir = cur->get_or_open_dirfrag(this, fg); - } else { - // discover? - dout(10) << "traverse: need dirfrag " << fg << ", doing discover from " << *cur << dendl; - discover_path(cur, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin), - null_okay); - if (mds->logger) mds->logger->inc(l_mds_traverse_discover); - return 1; - } - } - assert(curdir); - -#ifdef MDS_VERIFY_FRAGSTAT - if (curdir->is_complete()) - curdir->verify_fragstat(); -#endif - - // frozen? - /* - if (curdir->is_frozen()) { - // doh! - // FIXME: traverse is allowed? - dout(7) << "traverse: " << *curdir << " is frozen, waiting" << dendl; - curdir->add_waiter(CDir::WAIT_UNFREEZE, _get_waiter(mdr, req, fin)); - if (onfinish) delete onfinish; - return 1; - } - */ - - // Before doing dirfrag->dn lookup, compare with DamageTable's - // record of which dentries were unreadable - if (mds->damage_table.is_dentry_damaged(curdir, path[depth], snapid)) { - dout(4) << "traverse: stopped lookup at damaged dentry " - << *curdir << "/" << path[depth] << " snap=" << snapid << dendl; - return -EIO; - } - - // dentry - CDentry *dn = curdir->lookup(path[depth], snapid); - CDentry::linkage_t *dnl = dn ? dn->get_projected_linkage() : 0; - - // null and last_bit and xlocked by me? - if (dnl && dnl->is_null() && null_okay) { - dout(10) << "traverse: hit null dentry at tail of traverse, succeeding" << dendl; - if (pdnvec) - pdnvec->push_back(dn); - if (pin) - *pin = 0; - break; // done! 
- } - - if (dnl && - dn->lock.is_xlocked() && - dn->lock.get_xlock_by() != mdr && - !dn->lock.can_read(client) && - (dnl->is_null() || forward)) { - dout(10) << "traverse: xlocked dentry at " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin)); - if (mds->logger) mds->logger->inc(l_mds_traverse_lock); - mds->mdlog->flush(); - return 1; - } - - // can we conclude ENOENT? - if (dnl && dnl->is_null()) { - if (dn->lock.can_read(client) || - (dn->lock.is_xlocked() && dn->lock.get_xlock_by() == mdr)) { - dout(10) << "traverse: miss on null+readable dentry " << path[depth] << " " << *dn << dendl; - if (pdnvec) { - if (depth == path.depth() - 1) - pdnvec->push_back(dn); - else - pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref(); - } - return -ENOENT; - } else { - dout(10) << "miss on dentry " << *dn << ", can't read due to lock" << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, _get_waiter(mdr, req, fin)); - return 1; - } - } - - if (dnl && !dnl->is_null()) { - CInode *in = dnl->get_inode(); - - // do we have inode? - if (!in) { - assert(dnl->is_remote()); - // do i have it? - in = get_inode(dnl->get_remote_ino()); - if (in) { - dout(7) << "linking in remote in " << *in << dendl; - dn->link_remote(dnl, in); - } else { - dout(7) << "remote link to " << dnl->get_remote_ino() << ", which i don't have" << dendl; - assert(mdr); // we shouldn't hit non-primary dentries doing a non-mdr traversal! - if (mds->damage_table.is_remote_damaged(dnl->get_remote_ino())) { - dout(4) << "traverse: remote dentry points to damaged ino " - << *dn << dendl; - return -EIO; - } - open_remote_dentry(dn, true, _get_waiter(mdr, req, fin), - (null_okay && depth == path.depth() - 1)); - if (mds->logger) mds->logger->inc(l_mds_traverse_remote_ino); - return 1; - } - } - - cur = in; - // make sure snaprealm are open... - if (mdr && cur->snaprealm && !cur->snaprealm->is_open() && - !cur->snaprealm->open_parents(_get_waiter(mdr, req, fin))) { - return 1; - } - - // add to trace, continue. - touch_inode(cur); - if (pdnvec) - pdnvec->push_back(dn); - if (pin) - *pin = cur; - depth++; - continue; - } - - - // MISS. dentry doesn't exist. - dout(12) << "traverse: miss on dentry " << path[depth] << " in " << *curdir << dendl; - - if (curdir->is_auth()) { - // dentry is mine. - if (curdir->is_complete() || - (snapid == CEPH_NOSNAP && - curdir->has_bloom() && - !curdir->is_in_bloom(path[depth]))){ - // file not found - if (pdnvec) { - // instantiate a null dn? 
- if (depth < path.depth()-1){ - dout(20) << " didn't traverse full path; not returning pdnvec" << dendl; - dn = NULL; - } else if (dn) { - ceph_abort(); // should have fallen out in ->is_null() check above - } else if (curdir->is_frozen()) { - dout(20) << " not adding null to frozen dir " << dendl; - } else if (snapid < CEPH_MAXSNAP) { - dout(20) << " not adding null for snapid " << snapid << dendl; - } else { - // create a null dentry - dn = curdir->add_null_dentry(path[depth]); - dout(20) << " added null " << *dn << dendl; - } - if (dn) - pdnvec->push_back(dn); - else - pdnvec->clear(); // do not confuse likes of rdlock_path_pin_ref(); - } - return -ENOENT; - } else { - - // Check DamageTable for missing fragments before trying to fetch - // this - if (mds->damage_table.is_dirfrag_damaged(curdir)) { - dout(4) << "traverse: damaged dirfrag " << *curdir - << ", blocking fetch" << dendl; - return -EIO; - } - - // directory isn't complete; reload - dout(7) << "traverse: incomplete dir contents for " << *cur << ", fetching" << dendl; - touch_inode(cur); - curdir->fetch(_get_waiter(mdr, req, fin), path[depth]); - if (mds->logger) mds->logger->inc(l_mds_traverse_dir_fetch); - return 1; - } - } else { - // dirfrag/dentry is not mine. - mds_authority_t dauth = curdir->authority(); - - if (forward && - snapid && mdr && mdr->client_request && - (int)depth < mdr->client_request->get_num_fwd()) { - dout(7) << "traverse: snap " << snapid << " and depth " << depth - << " < fwd " << mdr->client_request->get_num_fwd() - << ", discovering instead of forwarding" << dendl; - discover = true; - } - - if ((discover || null_okay)) { - dout(7) << "traverse: discover from " << path[depth] << " from " << *curdir << dendl; - discover_path(curdir, snapid, path.postfixpath(depth), _get_waiter(mdr, req, fin), - null_okay); - if (mds->logger) mds->logger->inc(l_mds_traverse_discover); - return 1; - } - if (forward) { - // forward - dout(7) << "traverse: not auth for " << path << " in " << *curdir << dendl; - - if (curdir->is_ambiguous_auth()) { - // wait - dout(7) << "traverse: waiting for single auth in " << *curdir << dendl; - curdir->add_waiter(CDir::WAIT_SINGLEAUTH, _get_waiter(mdr, req, fin)); - return 1; - } - - dout(7) << "traverse: forwarding, not auth for " << *curdir << dendl; - - if (mdr) - request_forward(mdr, dauth.first); - else - mds->forward_message_mds(req, dauth.first); - - if (mds->logger) mds->logger->inc(l_mds_traverse_forward); - assert(fin == NULL); - return 2; - } - } - - ceph_abort(); // i shouldn't get here - } - - // success. 
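// ---- editor's note --------------------------------------------------------
// path_traverse reports through a small integer convention rather than
// exceptions: r < 0 is an errno (-ENOENT, -ENOTDIR, -EIO, -ESTALE), r == 0
// means the trace completed, r == 1 means a waiter was queued and the stored
// continuation will re-drive the request, and r == 2 means the request was
// forwarded to another MDS. A minimal caller honoring that convention
// (fake_traverse is an invented stand-in, not Ceph's API):

#include <cerrno>
#include <cstdio>

int fake_traverse() { return -ENOENT; }  // pretend the path was a miss

void dispatch() {
  int r = fake_traverse();
  if (r > 0)
    return;           // 1 = waiting, 2 = forwarded: nothing more to do here
  if (r < 0) {
    std::printf("reply to client with errno %d\n", -r);
    return;
  }
  std::printf("trace complete; continue with the resolved path\n");
}
// ----------------------------------------------------------------------------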
- if (mds->logger) mds->logger->inc(l_mds_traverse_hit); - dout(10) << "path_traverse finish on snapid " << snapid << dendl; - if (mdr) - assert(mdr->snapid == snapid); - return 0; -} - -CInode *MDCache::cache_traverse(const filepath& fp) -{ - dout(10) << "cache_traverse " << fp << dendl; - - CInode *in; - if (fp.get_ino()) - in = get_inode(fp.get_ino()); - else - in = root; - if (!in) - return NULL; - - for (unsigned i = 0; i < fp.depth(); i++) { - const string& dname = fp[i]; - frag_t fg = in->pick_dirfrag(dname); - dout(20) << " " << i << " " << dname << " frag " << fg << " from " << *in << dendl; - CDir *curdir = in->get_dirfrag(fg); - if (!curdir) - return NULL; - CDentry *dn = curdir->lookup(dname, CEPH_NOSNAP); - if (!dn) - return NULL; - in = dn->get_linkage()->get_inode(); - if (!in) - return NULL; - } - dout(10) << " got " << *in << dendl; - return in; -} - - -/** - * open_remote_dir -- open up a remote dirfrag - * - * @param diri base inode - * @param approxfg approximate fragment. - * @param fin completion callback - */ -void MDCache::open_remote_dirfrag(CInode *diri, frag_t approxfg, MDSInternalContextBase *fin) -{ - dout(10) << "open_remote_dir on " << *diri << dendl; - assert(diri->is_dir()); - assert(!diri->is_auth()); - assert(diri->get_dirfrag(approxfg) == 0); - - discover_dir_frag(diri, approxfg, fin); -} - - -/** - * get_dentry_inode - get or open inode - * - * @param dn the dentry - * @param mdr current request - * - * will return inode for primary, or link up/open up remote link's inode as necessary. - * If it's not available right now, puts mdr on wait list and returns null. - */ -CInode *MDCache::get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected) -{ - CDentry::linkage_t *dnl; - if (projected) - dnl = dn->get_projected_linkage(); - else - dnl = dn->get_linkage(); - - assert(!dnl->is_null()); - - if (dnl->is_primary()) - return dnl->inode; - - assert(dnl->is_remote()); - CInode *in = get_inode(dnl->get_remote_ino()); - if (in) { - dout(7) << "get_dentry_inode linking in remote in " << *in << dendl; - dn->link_remote(dnl, in); - return in; - } else { - dout(10) << "get_dentry_inode on remote dn, opening inode for " << *dn << dendl; - open_remote_dentry(dn, projected, new C_MDS_RetryRequest(this, mdr)); - return 0; - } -} - -struct C_MDC_OpenRemoteDentry : public MDCacheContext { - CDentry *dn; - inodeno_t ino; - MDSInternalContextBase *onfinish; - bool want_xlocked; - C_MDC_OpenRemoteDentry(MDCache *m, CDentry *d, inodeno_t i, MDSInternalContextBase *f, bool wx) : - MDCacheContext(m), dn(d), ino(i), onfinish(f), want_xlocked(wx) { - dn->get(MDSCacheObject::PIN_PTRWAITER); - } - void finish(int r) override { - mdcache->_open_remote_dentry_finish(dn, ino, onfinish, want_xlocked, r); - dn->put(MDSCacheObject::PIN_PTRWAITER); - } -}; - -void MDCache::open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, bool want_xlocked) -{ - dout(10) << "open_remote_dentry " << *dn << dendl; - CDentry::linkage_t *dnl = projected ? dn->get_projected_linkage() : dn->get_linkage(); - inodeno_t ino = dnl->get_remote_ino(); - int64_t pool = dnl->get_remote_d_type() == DT_DIR ? 
mds->mdsmap->get_metadata_pool() : -1; - open_ino(ino, pool, - new C_MDC_OpenRemoteDentry(this, dn, ino, fin, want_xlocked), true, want_xlocked); // backtrace -} - -void MDCache::_open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin, - bool want_xlocked, int r) -{ - if (r < 0) { - CDentry::linkage_t *dnl = dn->get_projected_linkage(); - if (dnl->is_remote() && dnl->get_remote_ino() == ino) { - dout(0) << "open_remote_dentry_finish bad remote dentry " << *dn << dendl; - dn->state_set(CDentry::STATE_BADREMOTEINO); - - std::string path; - CDir *dir = dn->get_dir(); - if (dir) { - dir->get_inode()->make_path_string(path); - path = path + "/" + dn->get_name(); - } - - bool fatal = mds->damage_table.notify_remote_damaged(ino, path); - if (fatal) { - mds->damaged(); - ceph_abort(); // unreachable, damaged() respawns us - } - } else { - r = 0; - } - } - fin->complete(r < 0 ? r : 0); -} - - -void MDCache::make_trace(vector& trace, CInode *in) -{ - // empty trace if we're a base inode - if (in->is_base()) - return; - - CInode *parent = in->get_parent_inode(); - assert(parent); - make_trace(trace, parent); - - CDentry *dn = in->get_parent_dn(); - dout(15) << "make_trace adding " << *dn << dendl; - trace.push_back(dn); -} - - -// ------------------------------------------------------------------------------- -// Open inode by inode number - -class C_IO_MDC_OpenInoBacktraceFetched : public MDCacheIOContext { - inodeno_t ino; - public: - bufferlist bl; - C_IO_MDC_OpenInoBacktraceFetched(MDCache *c, inodeno_t i) : - MDCacheIOContext(c), ino(i) {} - void finish(int r) override { - mdcache->_open_ino_backtrace_fetched(ino, bl, r); - } -}; - -struct C_MDC_OpenInoTraverseDir : public MDCacheContext { - inodeno_t ino; - MMDSOpenIno *msg; - bool parent; - public: - C_MDC_OpenInoTraverseDir(MDCache *c, inodeno_t i, MMDSOpenIno *m, bool p) : - MDCacheContext(c), ino(i), msg(m), parent(p) {} - void finish(int r) override { - if (r < 0 && !parent) - r = -EAGAIN; - if (msg) { - mdcache->handle_open_ino(msg, r); - return; - } - assert(mdcache->opening_inodes.count(ino)); - mdcache->_open_ino_traverse_dir(ino, mdcache->opening_inodes[ino], r); - } -}; - -struct C_MDC_OpenInoParentOpened : public MDCacheContext { - inodeno_t ino; - public: - C_MDC_OpenInoParentOpened(MDCache *c, inodeno_t i) : MDCacheContext(c), ino(i) {} - void finish(int r) override { - mdcache->_open_ino_parent_opened(ino, r); - } -}; - -void MDCache::_open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err) -{ - dout(10) << "_open_ino_backtrace_fetched ino " << ino << " errno " << err << dendl; - - assert(opening_inodes.count(ino)); - open_ino_info_t& info = opening_inodes[ino]; - - CInode *in = get_inode(ino); - if (in) { - dout(10) << " found cached " << *in << dendl; - open_ino_finish(ino, info, in->authority().first); - return; - } - - inode_backtrace_t backtrace; - if (err == 0) { - try { - ::decode(backtrace, bl); - } catch (const buffer::error &decode_exc) { - derr << "corrupt backtrace on ino 0x" << std::hex << ino - << std::dec << ": " << decode_exc << dendl; - open_ino_finish(ino, info, -EIO); - return; - } - if (backtrace.pool != info.pool && backtrace.pool != -1) { - dout(10) << " old object in pool " << info.pool - << ", retrying pool " << backtrace.pool << dendl; - info.pool = backtrace.pool; - C_IO_MDC_OpenInoBacktraceFetched *fin = - new C_IO_MDC_OpenInoBacktraceFetched(this, ino); - fetch_backtrace(ino, info.pool, fin->bl, - new C_OnFinisher(fin, mds->finisher)); - return; - } - } else if
(err == -ENOENT) { - int64_t meta_pool = mds->mdsmap->get_metadata_pool(); - if (info.pool != meta_pool) { - dout(10) << " no object in pool " << info.pool - << ", retrying pool " << meta_pool << dendl; - info.pool = meta_pool; - C_IO_MDC_OpenInoBacktraceFetched *fin = - new C_IO_MDC_OpenInoBacktraceFetched(this, ino); - fetch_backtrace(ino, info.pool, fin->bl, - new C_OnFinisher(fin, mds->finisher)); - return; - } - err = 0; // backtrace.ancestors.empty() is checked below - } - - if (err == 0) { - if (backtrace.ancestors.empty()) { - dout(10) << " got empty backtrace " << dendl; - err = -EIO; - } else if (!info.ancestors.empty()) { - if (info.ancestors[0] == backtrace.ancestors[0]) { - dout(10) << " got same parents " << info.ancestors[0] << " 2 times" << dendl; - err = -EINVAL; - } else { - info.last_err = 0; - } - } - } - if (err) { - dout(0) << " failed to open ino " << ino << " err " << err << "/" << info.last_err << dendl; - if (info.last_err) - err = info.last_err; - open_ino_finish(ino, info, err); - return; - } - - dout(10) << " got backtrace " << backtrace << dendl; - info.ancestors = backtrace.ancestors; - - _open_ino_traverse_dir(ino, info, 0); -} - -void MDCache::_open_ino_parent_opened(inodeno_t ino, int ret) -{ - dout(10) << "_open_ino_parent_opened ino " << ino << " ret " << ret << dendl; - - assert(opening_inodes.count(ino)); - open_ino_info_t& info = opening_inodes[ino]; - - CInode *in = get_inode(ino); - if (in) { - dout(10) << " found cached " << *in << dendl; - open_ino_finish(ino, info, in->authority().first); - return; - } - - if (ret == mds->get_nodeid()) { - _open_ino_traverse_dir(ino, info, 0); - } else { - if (ret >= 0) { - mds_rank_t checked_rank = mds_rank_t(ret); - info.check_peers = true; - info.auth_hint = checked_rank; - info.checked.erase(checked_rank); - } - do_open_ino(ino, info, ret); - } -} - -void MDCache::_open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int ret) -{ - dout(10) << __func__ << ": ino " << ino << " ret " << ret << dendl; - - CInode *in = get_inode(ino); - if (in) { - dout(10) << " found cached " << *in << dendl; - open_ino_finish(ino, info, in->authority().first); - return; - } - - if (ret) { - do_open_ino(ino, info, ret); - return; - } - - mds_rank_t hint = info.auth_hint; - ret = open_ino_traverse_dir(ino, NULL, info.ancestors, - info.discover, info.want_xlocked, &hint); - if (ret > 0) - return; - if (hint != mds->get_nodeid()) - info.auth_hint = hint; - do_open_ino(ino, info, ret); -} - -void MDCache::_open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent) -{ - if (dir->state_test(CDir::STATE_REJOINUNDEF)) - assert(dir->get_inode()->dirfragtree.is_leaf(dir->get_frag())); - dir->fetch(new C_MDC_OpenInoTraverseDir(this, ino, m, parent)); -} - -int MDCache::open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, - vector& ancestors, - bool discover, bool want_xlocked, mds_rank_t *hint) -{ - dout(10) << "open_ino_traverse_dir ino " << ino << " " << ancestors << dendl; - int err = 0; - for (unsigned i = 0; i < ancestors.size(); i++) { - CInode *diri = get_inode(ancestors[i].dirino); - - if (!diri) { - if (discover && MDS_INO_IS_MDSDIR(ancestors[i].dirino)) { - open_foreign_mdsdir(ancestors[i].dirino, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0)); - return 1; - } - continue; - } - - if (diri->state_test(CInode::STATE_REJOINUNDEF)) { - CDir *dir = diri->get_parent_dir(); - while (dir->state_test(CDir::STATE_REJOINUNDEF) && - dir->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) - dir = 
dir->get_inode()->get_parent_dir(); - _open_ino_fetch_dir(ino, m, dir, i == 0); - return 1; - } - - if (!diri->is_dir()) { - dout(10) << " " << *diri << " is not dir" << dendl; - if (i == 0) - err = -ENOTDIR; - break; - } - - string &name = ancestors[i].dname; - frag_t fg = diri->pick_dirfrag(name); - CDir *dir = diri->get_dirfrag(fg); - if (!dir) { - if (diri->is_auth()) { - if (diri->is_frozen()) { - dout(10) << " " << *diri << " is frozen, waiting " << dendl; - diri->add_waiter(CDir::WAIT_UNFREEZE, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0)); - return 1; - } - dir = diri->get_or_open_dirfrag(this, fg); - } else if (discover) { - open_remote_dirfrag(diri, fg, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0)); - return 1; - } - } - if (dir) { - inodeno_t next_ino = i > 0 ? ancestors[i - 1].dirino : ino; - CDentry *dn = dir->lookup(name); - CDentry::linkage_t *dnl = dn ? dn->get_linkage() : NULL; - if (dir->is_auth()) { - if (dnl && dnl->is_primary() && - dnl->get_inode()->state_test(CInode::STATE_REJOINUNDEF)) { - dout(10) << " fetching undef " << *dnl->get_inode() << dendl; - _open_ino_fetch_dir(ino, m, dir, i == 0); - return 1; - } - - if (!dnl && !dir->is_complete() && - (!dir->has_bloom() || dir->is_in_bloom(name))) { - dout(10) << " fetching incomplete " << *dir << dendl; - _open_ino_fetch_dir(ino, m, dir, i == 0); - return 1; - } - - dout(10) << " no ino " << next_ino << " in " << *dir << dendl; - if (i == 0) - err = -ENOENT; - } else if (discover) { - if (!dnl) { - filepath path(name, 0); - discover_path(dir, CEPH_NOSNAP, path, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0), - (i == 0 && want_xlocked)); - return 1; - } - if (dnl->is_null() && !dn->lock.can_read(-1)) { - dout(10) << " null " << *dn << " is not readable, waiting" << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDC_OpenInoTraverseDir(this, ino, m, i == 0)); - return 1; - } - dout(10) << " no ino " << next_ino << " in " << *dir << dendl; - if (i == 0) - err = -ENOENT; - } - } - if (hint && i == 0) - *hint = dir ? dir->authority().first : diri->authority().first; - break; - } - return err; -} - -void MDCache::open_ino_finish(inodeno_t ino, open_ino_info_t& info, int ret) -{ - dout(10) << "open_ino_finish ino " << ino << " ret " << ret << dendl; - - list waiters; - waiters.swap(info.waiters); - opening_inodes.erase(ino); - finish_contexts(g_ceph_context, waiters, ret); -} - -void MDCache::do_open_ino(inodeno_t ino, open_ino_info_t& info, int err) -{ - if (err < 0 && err != -EAGAIN) { - info.checked.clear(); - info.checking = MDS_RANK_NONE; - info.check_peers = true; - info.fetch_backtrace = true; - if (info.discover) { - info.discover = false; - info.ancestors.clear(); - } - if (err != -ENOENT && err != -ENOTDIR) - info.last_err = err; - } - - if (info.check_peers || info.discover) { - if (info.discover) { - // got backtrace from peer, but failed to find inode. 
re-check peers - info.discover = false; - info.ancestors.clear(); - info.checked.clear(); - } - info.check_peers = false; - info.checking = MDS_RANK_NONE; - do_open_ino_peer(ino, info); - } else if (info.fetch_backtrace) { - info.check_peers = true; - info.fetch_backtrace = false; - info.checking = mds->get_nodeid(); - info.checked.clear(); - C_IO_MDC_OpenInoBacktraceFetched *fin = - new C_IO_MDC_OpenInoBacktraceFetched(this, ino); - fetch_backtrace(ino, info.pool, fin->bl, - new C_OnFinisher(fin, mds->finisher)); - } else { - assert(!info.ancestors.empty()); - info.checking = mds->get_nodeid(); - open_ino(info.ancestors[0].dirino, mds->mdsmap->get_metadata_pool(), - new C_MDC_OpenInoParentOpened(this, ino), info.want_replica); - } -} - -void MDCache::do_open_ino_peer(inodeno_t ino, open_ino_info_t& info) -{ - set all, active; - mds->mdsmap->get_mds_set(all); - mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active); - if (mds->get_state() == MDSMap::STATE_REJOIN) - mds->mdsmap->get_mds_set(active, MDSMap::STATE_REJOIN); - - dout(10) << "do_open_ino_peer " << ino << " active " << active - << " all " << all << " checked " << info.checked << dendl; - - mds_rank_t peer = MDS_RANK_NONE; - if (info.auth_hint >= 0) { - if (active.count(info.auth_hint)) { - peer = info.auth_hint; - info.auth_hint = MDS_RANK_NONE; - } - } else { - for (set::iterator p = active.begin(); p != active.end(); ++p) - if (*p != mds->get_nodeid() && info.checked.count(*p) == 0) { - peer = *p; - break; - } - } - if (peer < 0) { - all.erase(mds->get_nodeid()); - if (all != info.checked) { - dout(10) << " waiting for more peers to be active" << dendl; - } else { - dout(10) << " all MDS peers have been checked " << dendl; - do_open_ino(ino, info, 0); - } - } else { - info.checking = peer; - vector *pa = NULL; - // got backtrace from peer or backtrace just fetched - if (info.discover || !info.fetch_backtrace) - pa = &info.ancestors; - mds->send_message_mds(new MMDSOpenIno(info.tid, ino, pa), peer); - } -} - -void MDCache::handle_open_ino(MMDSOpenIno *m, int err) -{ - if (mds->get_state() < MDSMap::STATE_REJOIN && - mds->get_want_state() != CEPH_MDS_STATE_REJOIN) { - m->put(); - return; - } - - dout(10) << "handle_open_ino " << *m << " err " << err << dendl; - - inodeno_t ino = m->ino; - MMDSOpenInoReply *reply; - CInode *in = get_inode(ino); - if (in) { - dout(10) << " have " << *in << dendl; - reply = new MMDSOpenInoReply(m->get_tid(), ino, mds_rank_t(0)); - if (in->is_auth()) { - touch_inode(in); - while (1) { - CDentry *pdn = in->get_parent_dn(); - if (!pdn) - break; - CInode *diri = pdn->get_dir()->get_inode(); - reply->ancestors.push_back(inode_backpointer_t(diri->ino(), pdn->name, - in->inode.version)); - in = diri; - } - } else { - reply->hint = in->authority().first; - } - } else if (err < 0) { - reply = new MMDSOpenInoReply(m->get_tid(), ino, MDS_RANK_NONE, err); - } else { - mds_rank_t hint = MDS_RANK_NONE; - int ret = open_ino_traverse_dir(ino, m, m->ancestors, false, false, &hint); - if (ret > 0) - return; - reply = new MMDSOpenInoReply(m->get_tid(), ino, hint, ret); - } - m->get_connection()->send_message(reply); - m->put(); -} - -void MDCache::handle_open_ino_reply(MMDSOpenInoReply *m) -{ - dout(10) << "handle_open_ino_reply " << *m << dendl; - - inodeno_t ino = m->ino; - mds_rank_t from = mds_rank_t(m->get_source().num()); - auto it = opening_inodes.find(ino); - if (it != opening_inodes.end() && it->second.checking == from) { - open_ino_info_t& info = it->second; - info.checking = MDS_RANK_NONE; - 
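// ---- editor's note --------------------------------------------------------
// do_open_ino_peer above probes one rank at a time: if an auth hint is set,
// use it only when that rank is active (otherwise wait for it); with no
// hint, take the lowest active rank not yet checked; and when nothing is
// left, the caller either waits for more ranks to come up or gives up. The
// selection rule as a standalone sketch (ranks simplified to int):

#include <set>

const int RANK_NONE = -1;

int pick_peer(const std::set<int>& active, const std::set<int>& checked,
              int self, int& auth_hint) {
  if (auth_hint >= 0) {
    if (!active.count(auth_hint))
      return RANK_NONE;       // hinted rank not active yet: keep waiting
    int p = auth_hint;
    auth_hint = RANK_NONE;    // consume the hint
    return p;
  }
  for (int r : active)
    if (r != self && !checked.count(r))
      return r;               // first active, unchecked peer
  return RANK_NONE;           // caller decides: wait for peers or finish
}
// ----------------------------------------------------------------------------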
info.checked.insert(from); - - CInode *in = get_inode(ino); - if (in) { - dout(10) << " found cached " << *in << dendl; - open_ino_finish(ino, info, in->authority().first); - } else if (!m->ancestors.empty()) { - dout(10) << " found ino " << ino << " on mds." << from << dendl; - if (!info.want_replica) { - open_ino_finish(ino, info, from); - m->put(); - return; - } - - info.ancestors = m->ancestors; - info.auth_hint = from; - info.checking = mds->get_nodeid(); - info.discover = true; - _open_ino_traverse_dir(ino, info, 0); - } else if (m->error) { - dout(10) << " error " << m->error << " from mds." << from << dendl; - do_open_ino(ino, info, m->error); - } else { - if (m->hint >= 0 && m->hint != mds->get_nodeid()) { - info.auth_hint = m->hint; - info.checked.erase(m->hint); - } - do_open_ino_peer(ino, info); - } - } - m->put(); -} - -void MDCache::kick_open_ino_peers(mds_rank_t who) -{ - dout(10) << "kick_open_ino_peers mds." << who << dendl; - - for (map::iterator p = opening_inodes.begin(); - p != opening_inodes.end(); - ++p) { - open_ino_info_t& info = p->second; - if (info.checking == who) { - dout(10) << " kicking ino " << p->first << " who was checking mds." << who << dendl; - info.checking = MDS_RANK_NONE; - do_open_ino_peer(p->first, info); - } else if (info.checking == MDS_RANK_NONE) { - dout(10) << " kicking ino " << p->first << " who was waiting" << dendl; - do_open_ino_peer(p->first, info); - } - } -} - -void MDCache::open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase* fin, - bool want_replica, bool want_xlocked) -{ - dout(10) << "open_ino " << ino << " pool " << pool << " want_replica " - << want_replica << dendl; - - if (opening_inodes.count(ino)) { - open_ino_info_t& info = opening_inodes[ino]; - if (want_replica) { - info.want_replica = true; - if (want_xlocked && !info.want_xlocked) { - if (!info.ancestors.empty()) { - CInode *diri = get_inode(info.ancestors[0].dirino); - if (diri) { - frag_t fg = diri->pick_dirfrag(info.ancestors[0].dname); - CDir *dir = diri->get_dirfrag(fg); - if (dir && !dir->is_auth()) { - filepath path(info.ancestors[0].dname, 0); - discover_path(dir, CEPH_NOSNAP, path, NULL, true); - } - } - } - info.want_xlocked = true; - } - } - info.waiters.push_back(fin); - } else { - open_ino_info_t& info = opening_inodes[ino]; - info.want_replica = want_replica; - info.want_xlocked = want_xlocked; - info.tid = ++open_ino_last_tid; - info.pool = pool >= 0 ? pool : default_file_layout.pool_id; - info.waiters.push_back(fin); - do_open_ino(ino, info, 0); - } -} - -/* ---------------------------- */ - -/* - * search for a given inode on MDS peers. optionally start with the given node. 
- - - TODO - - recover from mds node failure, recovery - - traverse path - - */ -void MDCache::find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint) -{ - dout(5) << "find_ino_peers " << ino << " hint " << hint << dendl; - assert(!have_inode(ino)); - - ceph_tid_t tid = ++find_ino_peer_last_tid; - find_ino_peer_info_t& fip = find_ino_peer[tid]; - fip.ino = ino; - fip.tid = tid; - fip.fin = c; - fip.hint = hint; - _do_find_ino_peer(fip); -} - -void MDCache::_do_find_ino_peer(find_ino_peer_info_t& fip) -{ - set all, active; - mds->mdsmap->get_mds_set(all); - mds->mdsmap->get_clientreplay_or_active_or_stopping_mds_set(active); - - dout(10) << "_do_find_ino_peer " << fip.tid << " " << fip.ino - << " active " << active << " all " << all - << " checked " << fip.checked - << dendl; - - mds_rank_t m = MDS_RANK_NONE; - if (fip.hint >= 0) { - m = fip.hint; - fip.hint = MDS_RANK_NONE; - } else { - for (set::iterator p = active.begin(); p != active.end(); ++p) - if (*p != mds->get_nodeid() && - fip.checked.count(*p) == 0) { - m = *p; - break; - } - } - if (m == MDS_RANK_NONE) { - all.erase(mds->get_nodeid()); - if (all != fip.checked) { - dout(10) << "_do_find_ino_peer waiting for more peers to be active" << dendl; - } else { - dout(10) << "_do_find_ino_peer failed on " << fip.ino << dendl; - fip.fin->complete(-ESTALE); - find_ino_peer.erase(fip.tid); - } - } else { - fip.checking = m; - mds->send_message_mds(new MMDSFindIno(fip.tid, fip.ino), m); - } -} - -void MDCache::handle_find_ino(MMDSFindIno *m) -{ - if (mds->get_state() < MDSMap::STATE_REJOIN) { - m->put(); - return; - } - - dout(10) << "handle_find_ino " << *m << dendl; - MMDSFindInoReply *r = new MMDSFindInoReply(m->tid); - CInode *in = get_inode(m->ino); - if (in) { - in->make_path(r->path); - dout(10) << " have " << r->path << " " << *in << dendl; - } - m->get_connection()->send_message(r); - m->put(); -} - - -void MDCache::handle_find_ino_reply(MMDSFindInoReply *m) -{ - map::iterator p = find_ino_peer.find(m->tid); - if (p != find_ino_peer.end()) { - dout(10) << "handle_find_ino_reply " << *m << dendl; - find_ino_peer_info_t& fip = p->second; - - // success? - if (get_inode(fip.ino)) { - dout(10) << "handle_find_ino_reply successfully found " << fip.ino << dendl; - mds->queue_waiter(fip.fin); - find_ino_peer.erase(p); - m->put(); - return; - } - - mds_rank_t from = mds_rank_t(m->get_source().num()); - if (fip.checking == from) - fip.checking = MDS_RANK_NONE; - fip.checked.insert(from); - - if (!m->path.empty()) { - // we got a path! - vector trace; - MDRequestRef null_ref; - int r = path_traverse(null_ref, m, NULL, m->path, &trace, NULL, MDS_TRAVERSE_DISCOVER); - if (r > 0) - return; - dout(0) << "handle_find_ino_reply failed with " << r << " on " << m->path - << ", retrying" << dendl; - fip.checked.clear(); - _do_find_ino_peer(fip); - } else { - // nope, continue. - _do_find_ino_peer(fip); - } - } else { - dout(10) << "handle_find_ino_reply tid " << m->tid << " dne" << dendl; - } - m->put(); -} - -void MDCache::kick_find_ino_peers(mds_rank_t who) -{ - // find_ino_peers requests we should move on from - for (map::iterator p = find_ino_peer.begin(); - p != find_ino_peer.end(); - ++p) { - find_ino_peer_info_t& fip = p->second; - if (fip.checking == who) { - dout(10) << "kicking find_ino_peer " << fip.tid << " who was checking mds." 
<< who << dendl; - fip.checking = MDS_RANK_NONE; - _do_find_ino_peer(fip); - } else if (fip.checking == MDS_RANK_NONE) { - dout(10) << "kicking find_ino_peer " << fip.tid << " who was waiting" << dendl; - _do_find_ino_peer(fip); - } - } -} - -/* ---------------------------- */ - -int MDCache::get_num_client_requests() -{ - int count = 0; - for (ceph::unordered_map::iterator p = active_requests.begin(); - p != active_requests.end(); - ++p) { - MDRequestRef& mdr = p->second; - if (mdr->reqid.name.is_client() && !mdr->is_slave()) - count++; - } - return count; -} - -/* This function takes over the reference to the passed Message */ -MDRequestRef MDCache::request_start(MClientRequest *req) -{ - // did we win a forward race against a slave? - if (active_requests.count(req->get_reqid())) { - MDRequestRef& mdr = active_requests[req->get_reqid()]; - assert(mdr); - if (mdr->is_slave()) { - dout(10) << "request_start already had " << *mdr << ", waiting for finish" << dendl; - mdr->more()->waiting_for_finish.push_back(new C_MDS_RetryMessage(mds, req)); - } else { - dout(10) << "request_start already processing " << *mdr << ", dropping new msg" << dendl; - req->put(); - } - return MDRequestRef(); - } - - // register new client request - MDRequestImpl::Params params; - params.reqid = req->get_reqid(); - params.attempt = req->get_num_fwd(); - params.client_req = req; - params.initiated = req->get_recv_stamp(); - params.throttled = req->get_throttle_stamp(); - params.all_read = req->get_recv_complete_stamp(); - params.dispatched = req->get_dispatch_stamp(); - - MDRequestRef mdr = - mds->op_tracker.create_request(params); - active_requests[params.reqid] = mdr; - mdr->set_op_stamp(req->get_stamp()); - dout(7) << "request_start " << *mdr << dendl; - return mdr; -} - -MDRequestRef MDCache::request_start_slave(metareqid_t ri, __u32 attempt, Message *m) -{ - int by = m->get_source().num(); - MDRequestImpl::Params params; - params.reqid = ri; - params.attempt = attempt; - params.triggering_slave_req = m; - params.slave_to = by; - params.initiated = m->get_recv_stamp(); - params.throttled = m->get_throttle_stamp(); - params.all_read = m->get_recv_complete_stamp(); - params.dispatched = m->get_dispatch_stamp(); - MDRequestRef mdr = - mds->op_tracker.create_request(params); - assert(active_requests.count(mdr->reqid) == 0); - active_requests[mdr->reqid] = mdr; - dout(7) << "request_start_slave " << *mdr << " by mds." << by << dendl; - return mdr; -} - -MDRequestRef MDCache::request_start_internal(int op) -{ - MDRequestImpl::Params params; - params.reqid.name = entity_name_t::MDS(mds->get_nodeid()); - params.reqid.tid = mds->issue_tid(); - params.initiated = ceph_clock_now(); - params.internal_op = op; - MDRequestRef mdr = - mds->op_tracker.create_request(params); - - assert(active_requests.count(mdr->reqid) == 0); - active_requests[mdr->reqid] = mdr; - dout(7) << "request_start_internal " << *mdr << " op " << op << dendl; - return mdr; -} - -MDRequestRef MDCache::request_get(metareqid_t rid) -{ - ceph::unordered_map::iterator p = active_requests.find(rid); - assert(p != active_requests.end()); - dout(7) << "request_get " << rid << " " << *p->second << dendl; - return p->second; -} - -void MDCache::request_finish(MDRequestRef& mdr) -{ - dout(7) << "request_finish " << *mdr << dendl; - mdr->mark_event("finishing request"); - - // slave finisher? 
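// ---- editor's note --------------------------------------------------------
// Every in-flight operation -- client, slave or internal -- lives in
// active_requests keyed by its metareqid from request_start*() until
// request_cleanup() erases it; that single map is what makes the dedup of
// re-forwarded client requests work. A toy registry with the same
// start/finish discipline (types invented for illustration):

#include <cassert>
#include <map>
#include <memory>
#include <string>

struct Request { std::string id; };
using Ref = std::shared_ptr<Request>;

struct Registry {
  std::map<std::string, Ref> active;

  Ref start(const std::string& id) {
    if (active.count(id))           // lost a forward race: drop duplicate
      return nullptr;
    auto r = std::make_shared<Request>(Request{id});
    active[id] = r;
    return r;
  }
  void finish(const Ref& r) {
    assert(active.count(r->id));
    active.erase(r->id);            // mirrors request_cleanup()
  }
};
// ----------------------------------------------------------------------------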
- if (mdr->has_more() && mdr->more()->slave_commit) { - Context *fin = mdr->more()->slave_commit; - mdr->more()->slave_commit = 0; - int ret; - if (mdr->aborted) { - mdr->aborted = false; - ret = -1; - mdr->more()->slave_rolling_back = true; - } else { - ret = 0; - mdr->committing = true; - } - fin->complete(ret); // this must re-call request_finish. - return; - } - - switch(mdr->internal_op) { - case CEPH_MDS_OP_FRAGMENTDIR: - logger->inc(l_mdss_ireq_fragmentdir); - break; - case CEPH_MDS_OP_EXPORTDIR: - logger->inc(l_mdss_ireq_exportdir); - break; - case CEPH_MDS_OP_ENQUEUE_SCRUB: - logger->inc(l_mdss_ireq_enqueue_scrub); - break; - case CEPH_MDS_OP_FLUSH: - logger->inc(l_mdss_ireq_flush); - break; - case CEPH_MDS_OP_REPAIR_FRAGSTATS: - logger->inc(l_mdss_ireq_fragstats); - break; - case CEPH_MDS_OP_REPAIR_INODESTATS: - logger->inc(l_mdss_ireq_inodestats); - break; - } - - request_cleanup(mdr); -} - - -void MDCache::request_forward(MDRequestRef& mdr, mds_rank_t who, int port) -{ - mdr->mark_event("forwarding request"); - if (mdr->client_request && mdr->client_request->get_source().is_client()) { - dout(7) << "request_forward " << *mdr << " to mds." << who << " req " - << *mdr->client_request << dendl; - mds->forward_message_mds(mdr->client_request, who); - mdr->client_request = 0; - if (mds->logger) mds->logger->inc(l_mds_forward); - } else if (mdr->internal_op >= 0) { - dout(10) << "request_forward on internal op; cancelling" << dendl; - mdr->internal_op_finish->complete(-EXDEV); - } else { - dout(7) << "request_forward drop " << *mdr << " req " << *mdr->client_request - << " was from mds" << dendl; - } - request_cleanup(mdr); -} - - -void MDCache::dispatch_request(MDRequestRef& mdr) -{ - if (mdr->client_request) { - mds->server->dispatch_client_request(mdr); - } else if (mdr->slave_request) { - mds->server->dispatch_slave_request(mdr); - } else { - switch (mdr->internal_op) { - case CEPH_MDS_OP_FRAGMENTDIR: - dispatch_fragment_dir(mdr); - break; - case CEPH_MDS_OP_EXPORTDIR: - migrator->dispatch_export_dir(mdr, 0); - break; - case CEPH_MDS_OP_ENQUEUE_SCRUB: - enqueue_scrub_work(mdr); - break; - case CEPH_MDS_OP_FLUSH: - flush_dentry_work(mdr); - break; - case CEPH_MDS_OP_REPAIR_FRAGSTATS: - repair_dirfrag_stats_work(mdr); - break; - case CEPH_MDS_OP_REPAIR_INODESTATS: - repair_inode_stats_work(mdr); - break; - default: - ceph_abort(); - } - } -} - - -void MDCache::request_drop_foreign_locks(MDRequestRef& mdr) -{ - if (!mdr->has_more()) - return; - - // clean up slaves - // (will implicitly drop remote dn pins) - for (set::iterator p = mdr->more()->slaves.begin(); - p != mdr->more()->slaves.end(); - ++p) { - MMDSSlaveRequest *r = new MMDSSlaveRequest(mdr->reqid, mdr->attempt, - MMDSSlaveRequest::OP_FINISH); - - if (mdr->killed && !mdr->committing) { - r->mark_abort(); - } else if (mdr->more()->srcdn_auth_mds == *p && - mdr->more()->inode_import.length() > 0) { - // information about rename imported caps - r->inode_export.claim(mdr->more()->inode_import); - } - - mds->send_message_mds(r, *p); - } - - /* strip foreign xlocks out of lock lists, since the OP_FINISH drops them - * implicitly. Note that we don't call the finishers -- there shouldn't - * be any on a remote lock and the request finish wakes up all - * the waiters anyway! 
*/ - set::iterator p = mdr->xlocks.begin(); - while (p != mdr->xlocks.end()) { - if ((*p)->get_parent()->is_auth()) - ++p; - else { - dout(10) << "request_drop_foreign_locks forgetting lock " << **p - << " on " << *(*p)->get_parent() << dendl; - (*p)->put_xlock(); - mdr->locks.erase(*p); - mdr->xlocks.erase(p++); - } - } - - map::iterator q = mdr->remote_wrlocks.begin(); - while (q != mdr->remote_wrlocks.end()) { - dout(10) << "request_drop_foreign_locks forgetting remote_wrlock " << *q->first - << " on mds." << q->second - << " on " << *(q->first)->get_parent() << dendl; - mdr->locks.erase(q->first); - mdr->remote_wrlocks.erase(q++); - } - - mdr->more()->slaves.clear(); /* we no longer have requests out to them, and - * leaving them in can cause double-notifies as - * this function can get called more than once */ -} - -void MDCache::request_drop_non_rdlocks(MDRequestRef& mdr) -{ - request_drop_foreign_locks(mdr); - mds->locker->drop_non_rdlocks(mdr.get()); -} - -void MDCache::request_drop_locks(MDRequestRef& mdr) -{ - request_drop_foreign_locks(mdr); - mds->locker->drop_locks(mdr.get()); -} - -void MDCache::request_cleanup(MDRequestRef& mdr) -{ - dout(15) << "request_cleanup " << *mdr << dendl; - - if (mdr->has_more()) { - if (mdr->more()->is_ambiguous_auth) - mdr->clear_ambiguous_auth(); - if (!mdr->more()->waiting_for_finish.empty()) - mds->queue_waiters(mdr->more()->waiting_for_finish); - } - - request_drop_locks(mdr); - - // drop (local) auth pins - mdr->drop_local_auth_pins(); - - // drop stickydirs - for (set::iterator p = mdr->stickydirs.begin(); - p != mdr->stickydirs.end(); - ++p) - (*p)->put_stickydirs(); - - mds->locker->kick_cap_releases(mdr); - - // drop cache pins - mdr->drop_pins(); - - // remove from session - mdr->item_session_request.remove_myself(); - - // remove from map - active_requests.erase(mdr->reqid); - - if (mds->logger) - log_stat(); - - mdr->mark_event("cleaned up request"); -} - -void MDCache::request_kill(MDRequestRef& mdr) -{ - // rollback slave requests is tricky. just let the request proceed. - if (mdr->done_locking && mdr->has_more() && - (!mdr->more()->witnessed.empty() || !mdr->more()->waiting_on_slave.empty())) { - dout(10) << "request_kill " << *mdr << " -- already started slave requests, no-op" << dendl; - - assert(mdr->used_prealloc_ino == 0); - assert(mdr->prealloc_inos.empty()); - - mdr->session = NULL; - mdr->item_session_request.remove_myself(); - return; - } - - mdr->killed = true; - mdr->mark_event("killing request"); - - if (mdr->committing) { - dout(10) << "request_kill " << *mdr << " -- already committing, no-op" << dendl; - } else { - dout(10) << "request_kill " << *mdr << dendl; - request_cleanup(mdr); - } -} - -// ------------------------------------------------------------------------------- -// SNAPREALMS - -struct C_MDC_snaprealm_create_finish : public MDCacheLogContext { - MDRequestRef mdr; - MutationRef mut; - CInode *in; - C_MDC_snaprealm_create_finish(MDCache *c, MDRequestRef& m, - MutationRef& mu, CInode *i) : - MDCacheLogContext(c), mdr(m), mut(mu), in(i) {} - void finish(int r) override { - mdcache->_snaprealm_create_finish(mdr, mut, in); - } -}; - -void MDCache::snaprealm_create(MDRequestRef& mdr, CInode *in) -{ - dout(10) << "snaprealm_create " << *in << dendl; - assert(!in->snaprealm); - - // allocate an id.. 
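// ---- editor's note --------------------------------------------------------
// snaprealm_create below is written as a resumable step, a pattern used
// throughout this file: the first pass finds no snap-table transaction id
// (stid), asks the snap client to prepare one and parks a retry; the
// re-driven pass sees the stid cached in the request and proceeds to
// journal. A reduced sketch of that "prepare once, then continue" shape
// (all names invented; the async machinery is collapsed to a callback):

#include <cstdint>
#include <functional>

struct Op { uint64_t stid = 0; };

// Pretend-async table prepare: allocates an id, then re-drives the caller.
void prepare_stid(Op& op, std::function<void()> retry) {
  op.stid = 42;
  retry();
}

void create_realm(Op& op) {
  if (!op.stid) {
    prepare_stid(op, [&op] { create_realm(op); });  // first pass: go async
    return;
  }
  // second pass: stid is pinned in the request; journal the create event
}
// ----------------------------------------------------------------------------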
- if (!mdr->more()->stid) { - mds->snapclient->prepare_create_realm(in->ino(), &mdr->more()->stid, &mdr->more()->snapidbl, - new C_MDS_RetryRequest(this, mdr)); - return; - } - - MutationRef mut(new MutationImpl()); - mut->ls = mds->mdlog->get_current_segment(); - EUpdate *le = new EUpdate(mds->mdlog, "snaprealm_create"); - mds->mdlog->start_entry(le); - - le->metablob.add_table_transaction(TABLE_SNAP, mdr->more()->stid); - - inode_t *pi = in->project_inode(); - pi->version = in->pre_dirty(); - pi->rstat.rsnaprealms++; - - bufferlist::iterator p = mdr->more()->snapidbl.begin(); - snapid_t seq; - ::decode(seq, p); - - sr_t *newsnap = in->project_snaprealm(seq); - newsnap->seq = seq; - newsnap->last_created = seq; - - predirty_journal_parents(mut, &le->metablob, in, 0, PREDIRTY_PRIMARY); - journal_cow_inode(mut, &le->metablob, in); - le->metablob.add_primary_dentry(in->get_projected_parent_dn(), in, true); - - mds->server->submit_mdlog_entry(le, - new C_MDC_snaprealm_create_finish(this, mdr, - mut, in), - mdr, __func__); - mds->mdlog->flush(); -} - - -void MDCache::do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend) -{ - dout(10) << "do_realm_invalidate_and_update_notify " << *in->snaprealm << " " << *in << dendl; - - vector split_inos; - vector split_realms; - - if (snapop == CEPH_SNAP_OP_SPLIT) { - // notify clients of update|split - for (elist::iterator p = in->snaprealm->inodes_with_caps.begin(member_offset(CInode, item_caps)); - !p.end(); ++p) - split_inos.push_back((*p)->ino()); - - for (set::iterator p = in->snaprealm->open_children.begin(); - p != in->snaprealm->open_children.end(); - ++p) - split_realms.push_back((*p)->inode->ino()); - } - - bufferlist snapbl; - in->snaprealm->build_snap_trace(snapbl); - - set past_children; - map updates; - list q; - q.push_back(in->snaprealm); - while (!q.empty()) { - SnapRealm *realm = q.front(); - q.pop_front(); - - dout(10) << " realm " << *realm << " on " << *realm->inode << dendl; - realm->invalidate_cached_snaps(); - - for (map* >::iterator p = realm->client_caps.begin(); - p != realm->client_caps.end(); - ++p) { - assert(!p->second->empty()); - if (!nosend && updates.count(p->first) == 0) { - MClientSnap *update = new MClientSnap(snapop); - update->head.split = in->ino(); - update->split_inos = split_inos; - update->split_realms = split_realms; - update->bl = snapbl; - updates[p->first] = update; - } - } - - if (snapop == CEPH_SNAP_OP_UPDATE || snapop == CEPH_SNAP_OP_DESTROY) { - for (set::iterator p = realm->open_past_children.begin(); - p != realm->open_past_children.end(); - ++p) - past_children.insert(*p); - } - - // notify for active children, too. 
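// ---- editor's note --------------------------------------------------------
// The update/split notification walks the snaprealm tree breadth-first:
// invalidate the cached snap set at each realm, gather one client message
// per client holding caps there, then enqueue the open children. The
// traversal skeleton, self-contained (Realm is a stand-in type):

#include <deque>
#include <set>

struct Realm {
  std::set<Realm*> open_children;
  bool snaps_valid = true;
};

void invalidate_subtree(Realm* root) {
  std::deque<Realm*> q = {root};
  while (!q.empty()) {
    Realm* realm = q.front();
    q.pop_front();
    realm->snaps_valid = false;           // invalidate_cached_snaps()
    for (Realm* child : realm->open_children)
      q.push_back(child);                 // notify active children too
  }
}
// ----------------------------------------------------------------------------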
- dout(10) << " " << realm << " open_children are " << realm->open_children << dendl; - for (set::iterator p = realm->open_children.begin(); - p != realm->open_children.end(); - ++p) - q.push_back(*p); - } - - if (!nosend) - send_snaps(updates); - - // notify past children and their descendants if we update/delete old snapshots - for (set::iterator p = past_children.begin(); - p != past_children.end(); - ++p) - q.push_back(*p); - - while (!q.empty()) { - SnapRealm *realm = q.front(); - q.pop_front(); - - realm->invalidate_cached_snaps(); - - for (set::iterator p = realm->open_children.begin(); - p != realm->open_children.end(); - ++p) { - if (past_children.count(*p) == 0) - q.push_back(*p); - } - - for (set::iterator p = realm->open_past_children.begin(); - p != realm->open_past_children.end(); - ++p) { - if (past_children.count(*p) == 0) { - q.push_back(*p); - past_children.insert(*p); - } - } - } - - if (snapop == CEPH_SNAP_OP_DESTROY) { - // eval stray inodes if we delete snapshot from their past ancestor snaprealm - for (set::iterator p = past_children.begin(); - p != past_children.end(); - ++p) - maybe_eval_stray((*p)->inode, true); - } -} - -void MDCache::_snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in) -{ - dout(10) << "_snaprealm_create_finish " << *in << dendl; - - // apply - in->pop_and_dirty_projected_inode(mut->ls); - mut->apply(); - mds->locker->drop_locks(mut.get()); - mut->cleanup(); - - // tell table we've committed - mds->snapclient->commit(mdr->more()->stid, mut->ls); - - // create - bufferlist::iterator p = mdr->more()->snapidbl.begin(); - snapid_t seq; - ::decode(seq, p); - - in->open_snaprealm(); - in->snaprealm->srnode.seq = seq; - in->snaprealm->srnode.created = seq; - bool ok = in->snaprealm->_open_parents(NULL); - assert(ok); - - do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT); - - /* - static int count = 5; - if (--count == 0) - ceph_abort(); // hack test test ********** - */ - - // done. 
- mdr->more()->stid = 0; // caller will likely need to reuse this - dispatch_request(mdr); -} - - -// ------------------------------------------------------------------------------- -// STRAYS - -struct C_MDC_RetryScanStray : public MDCacheContext { - dirfrag_t next; - C_MDC_RetryScanStray(MDCache *c, dirfrag_t n) : MDCacheContext(c), next(n) { } - void finish(int r) override { - mdcache->scan_stray_dir(next); - } -}; - -void MDCache::scan_stray_dir(dirfrag_t next) -{ - dout(10) << "scan_stray_dir " << next << dendl; - - list ls; - for (int i = 0; i < NUM_STRAY; ++i) { - if (strays[i]->ino() < next.ino) - continue; - strays[i]->get_dirfrags(ls); - } - - for (list::iterator p = ls.begin(); p != ls.end(); ++p) { - CDir *dir = *p; - if (dir->dirfrag() < next) - continue; - if (!dir->is_complete()) { - dir->fetch(new C_MDC_RetryScanStray(this, dir->dirfrag())); - return; - } - for (CDir::map_t::iterator q = dir->items.begin(); q != dir->items.end(); ++q) { - CDentry *dn = q->second; - dn->state_set(CDentry::STATE_STRAY); - CDentry::linkage_t *dnl = dn->get_projected_linkage(); - if (dnl->is_primary()) { - CInode *in = dnl->get_inode(); - if (in->inode.nlink == 0) - in->state_set(CInode::STATE_ORPHAN); - maybe_eval_stray(in); - } - } - } -} - -void MDCache::fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin) -{ - object_t oid = CInode::get_object_name(ino, frag_t(), ""); - mds->objecter->getxattr(oid, object_locator_t(pool), "parent", CEPH_NOSNAP, &bl, 0, fin); -} - - - - - -// ======================================================================================== -// DISCOVER -/* - - - for all discovers (except base_inos, e.g. root, stray), waiters are attached - to the parent metadata object in the cache (pinning it). - - - all discovers are tracked by tid, so that we can ignore potentially dup replies. - -*/ - -void MDCache::_send_discover(discover_info_t& d) -{ - MDiscover *dis = new MDiscover(d.ino, d.frag, d.snap, d.want_path, - d.want_base_dir, d.want_xlocked); - dis->set_tid(d.tid); - mds->send_message_mds(dis, d.mds); -} - -void MDCache::discover_base_ino(inodeno_t want_ino, - MDSInternalContextBase *onfinish, - mds_rank_t from) -{ - dout(7) << "discover_base_ino " << want_ino << " from mds." << from << dendl; - if (waiting_for_base_ino[from].count(want_ino) == 0) { - discover_info_t& d = _create_discover(from); - d.ino = want_ino; - _send_discover(d); - } - waiting_for_base_ino[from][want_ino].push_back(onfinish); -} - - -void MDCache::discover_dir_frag(CInode *base, - frag_t approx_fg, - MDSInternalContextBase *onfinish, - mds_rank_t from) -{ - if (from < 0) - from = base->authority().first; - - dirfrag_t df(base->ino(), approx_fg); - dout(7) << "discover_dir_frag " << df - << " from mds." 
<< from << dendl; - - if (!base->is_waiting_for_dir(approx_fg) || !onfinish) { - discover_info_t& d = _create_discover(from); - d.pin_base(base); - d.ino = base->ino(); - d.frag = approx_fg; - d.want_base_dir = true; - _send_discover(d); - } - - if (onfinish) - base->add_dir_waiter(approx_fg, onfinish); -} - -struct C_MDC_RetryDiscoverPath : public MDCacheContext { - CInode *base; - snapid_t snapid; - filepath path; - mds_rank_t from; - C_MDC_RetryDiscoverPath(MDCache *c, CInode *b, snapid_t s, filepath &p, mds_rank_t f) : - MDCacheContext(c), base(b), snapid(s), path(p), from(f) {} - void finish(int r) override { - mdcache->discover_path(base, snapid, path, 0, from); - } -}; - -void MDCache::discover_path(CInode *base, - snapid_t snap, - filepath want_path, - MDSInternalContextBase *onfinish, - bool want_xlocked, - mds_rank_t from) -{ - if (from < 0) - from = base->authority().first; - - dout(7) << "discover_path " << base->ino() << " " << want_path << " snap " << snap << " from mds." << from - << (want_xlocked ? " want_xlocked":"") - << dendl; - - if (base->is_ambiguous_auth()) { - dout(10) << " waiting for single auth on " << *base << dendl; - if (!onfinish) - onfinish = new C_MDC_RetryDiscoverPath(this, base, snap, want_path, from); - base->add_waiter(CInode::WAIT_SINGLEAUTH, onfinish); - return; - } else if (from == mds->get_nodeid()) { - list finished; - base->take_waiting(CInode::WAIT_DIR, finished); - mds->queue_waiters(finished); - return; - } - - frag_t fg = base->pick_dirfrag(want_path[0]); - if ((want_xlocked && want_path.depth() == 1) || - !base->is_waiting_for_dir(fg) || !onfinish) { - discover_info_t& d = _create_discover(from); - d.ino = base->ino(); - d.pin_base(base); - d.frag = fg; - d.snap = snap; - d.want_path = want_path; - d.want_base_dir = true; - d.want_xlocked = want_xlocked; - _send_discover(d); - } - - // register + wait - if (onfinish) - base->add_dir_waiter(fg, onfinish); -} - -struct C_MDC_RetryDiscoverPath2 : public MDCacheContext { - CDir *base; - snapid_t snapid; - filepath path; - C_MDC_RetryDiscoverPath2(MDCache *c, CDir *b, snapid_t s, filepath &p) : - MDCacheContext(c), base(b), snapid(s), path(p) {} - void finish(int r) override { - mdcache->discover_path(base, snapid, path, 0); - } -}; - -void MDCache::discover_path(CDir *base, - snapid_t snap, - filepath want_path, - MDSInternalContextBase *onfinish, - bool want_xlocked) -{ - mds_rank_t from = base->authority().first; - - dout(7) << "discover_path " << base->dirfrag() << " " << want_path << " snap " << snap << " from mds." << from - << (want_xlocked ? 
" want_xlocked":"") - << dendl; - - if (base->is_ambiguous_auth()) { - dout(7) << " waiting for single auth on " << *base << dendl; - if (!onfinish) - onfinish = new C_MDC_RetryDiscoverPath2(this, base, snap, want_path); - base->add_waiter(CDir::WAIT_SINGLEAUTH, onfinish); - return; - } else if (from == mds->get_nodeid()) { - list finished; - base->take_sub_waiting(finished); - mds->queue_waiters(finished); - return; - } - - if ((want_xlocked && want_path.depth() == 1) || - !base->is_waiting_for_dentry(want_path[0].c_str(), snap) || !onfinish) { - discover_info_t& d = _create_discover(from); - d.ino = base->ino(); - d.pin_base(base->inode); - d.frag = base->get_frag(); - d.snap = snap; - d.want_path = want_path; - d.want_base_dir = false; - d.want_xlocked = want_xlocked; - _send_discover(d); - } - - // register + wait - if (onfinish) - base->add_dentry_waiter(want_path[0], snap, onfinish); -} - -void MDCache::kick_discovers(mds_rank_t who) -{ - for (map::iterator p = discovers.begin(); - p != discovers.end(); - ++p) { - if (p->second.mds != who) - continue; - _send_discover(p->second); - } -} - - -/* This function DOES put the passed message before returning */ -void MDCache::handle_discover(MDiscover *dis) -{ - mds_rank_t whoami = mds->get_nodeid(); - mds_rank_t from = mds_rank_t(dis->get_source().num()); - - assert(from != whoami); - - if (mds->get_state() <= MDSMap::STATE_REJOIN) { - if (mds->get_state() < MDSMap::STATE_REJOIN && - mds->get_want_state() < CEPH_MDS_STATE_REJOIN) { - dis->put(); - return; - } - - // proceed if requester is in the REJOIN stage, the request is from parallel_fetch(). - // delay processing request from survivor because we may not yet choose lock states. - if (!mds->mdsmap->is_rejoin(from)) { - dout(0) << "discover_reply not yet active(|still rejoining), delaying" << dendl; - mds->wait_for_replay(new C_MDS_RetryMessage(mds, dis)); - return; - } - } - - - CInode *cur = 0; - MDiscoverReply *reply = new MDiscoverReply(dis); - - snapid_t snapid = dis->get_snapid(); - - // get started. - if (MDS_INO_IS_BASE(dis->get_base_ino()) && - !dis->wants_base_dir() && dis->get_want().depth() == 0) { - // wants root - dout(7) << "handle_discover from mds." << from - << " wants base + " << dis->get_want().get_path() - << " snap " << snapid - << dendl; - - cur = get_inode(dis->get_base_ino()); - assert(cur); - - // add root - reply->starts_with = MDiscoverReply::INODE; - replicate_inode(cur, from, reply->trace, mds->mdsmap->get_up_features()); - dout(10) << "added base " << *cur << dendl; - } - else { - // there's a base inode - cur = get_inode(dis->get_base_ino(), snapid); - if (!cur && snapid != CEPH_NOSNAP) { - cur = get_inode(dis->get_base_ino()); - if (cur && !cur->is_multiversion()) - cur = NULL; // nope! - } - - if (!cur) { - dout(7) << "handle_discover mds." << from - << " don't have base ino " << dis->get_base_ino() << "." << snapid - << dendl; - if (!dis->wants_base_dir() && dis->get_want().depth() > 0) - reply->set_error_dentry(dis->get_dentry(0)); - reply->set_flag_error_dir(); - } else if (dis->wants_base_dir()) { - dout(7) << "handle_discover mds." << from - << " wants basedir+" << dis->get_want().get_path() - << " has " << *cur - << dendl; - } else { - dout(7) << "handle_discover mds." << from - << " wants " << dis->get_want().get_path() - << " has " << *cur - << dendl; - } - } - - assert(reply); - - // add content - // do some fidgeting to include a dir if they asked for the base dir, or just root. 
- for (unsigned i = 0; - cur && (i < dis->get_want().depth() || dis->get_want().depth() == 0); - i++) { - - // -- figure out the dir - - // is *cur even a dir at all? - if (!cur->is_dir()) { - dout(7) << *cur << " not a dir" << dendl; - reply->set_flag_error_dir(); - break; - } - - // pick frag - frag_t fg; - if (dis->get_want().depth()) { - // dentry specifies - fg = cur->pick_dirfrag(dis->get_dentry(i)); - } else { - // requester explicity specified the frag - assert(dis->wants_base_dir() || MDS_INO_IS_BASE(dis->get_base_ino())); - fg = dis->get_base_dir_frag(); - if (!cur->dirfragtree.is_leaf(fg)) - fg = cur->dirfragtree[fg.value()]; - } - CDir *curdir = cur->get_dirfrag(fg); - - if ((!curdir && !cur->is_auth()) || - (curdir && !curdir->is_auth())) { - - /* before: - * ONLY set flag if empty!! - * otherwise requester will wake up waiter(s) _and_ continue with discover, - * resulting in duplicate discovers in flight, - * which can wreak havoc when discovering rename srcdn (which may move) - */ - - if (reply->is_empty()) { - // only hint if empty. - // someday this could be better, but right now the waiter logic isn't smart enough. - - // hint - if (curdir) { - dout(7) << " not dirfrag auth, setting dir_auth_hint for " << *curdir << dendl; - reply->set_dir_auth_hint(curdir->authority().first); - } else { - dout(7) << " dirfrag not open, not inode auth, setting dir_auth_hint for " - << *cur << dendl; - reply->set_dir_auth_hint(cur->authority().first); - } - - // note error dentry, if any - // NOTE: important, as it allows requester to issue an equivalent discover - // to whomever we hint at. - if (dis->get_want().depth() > i) - reply->set_error_dentry(dis->get_dentry(i)); - } - - break; - } - - if (!curdir) { // open dir? - if (cur->is_frozen()) { - if (!reply->is_empty()) { - dout(7) << *cur << " is frozen, non-empty reply, stopping" << dendl; - break; - } - dout(7) << *cur << " is frozen, empty reply, waiting" << dendl; - cur->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); - reply->put(); - return; - } - curdir = cur->get_or_open_dirfrag(this, fg); - } else if (curdir->is_frozen_tree() || - (curdir->is_frozen_dir() && fragment_are_all_frozen(curdir))) { - if (!reply->is_empty()) { - dout(7) << *curdir << " is frozen, non-empty reply, stopping" << dendl; - break; - } - if (dis->wants_base_dir() && dis->get_base_dir_frag() != curdir->get_frag()) { - dout(7) << *curdir << " is frozen, dirfrag mismatch, stopping" << dendl; - reply->set_flag_error_dir(); - break; - } - dout(7) << *curdir << " is frozen, empty reply, waiting" << dendl; - curdir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); - reply->put(); - return; - } - - // add dir - if (curdir->get_version() == 0) { - // fetch newly opened dir - } else if (reply->is_empty() && !dis->wants_base_dir()) { - dout(7) << "handle_discover not adding unwanted base dir " << *curdir << dendl; - // make sure the base frag is correct, though, in there was a refragment since the - // original request was sent. - reply->set_base_dir_frag(curdir->get_frag()); - } else { - assert(!curdir->is_ambiguous_auth()); // would be frozen. 
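// ---- editor's note --------------------------------------------------------
// Every obstacle in the walk below (frozen dir or inode, xlocked dentry,
// incomplete dir) is handled by one rule: with an *empty* reply, park the
// message on a waiter and send nothing; with a *non-empty* reply, stop and
// ship the partial trace so the requester can re-discover the remainder.
// That recurring shape, reduced to a sketch (all helpers are stand-ins):

bool blocked()        { return false; }  // obstacle on the current item?
void wait_here()      {}                 // re-queue request on a waiter list
void add_item()       {}                 // replicate the item into the reply
void send_partial(int){}                 // ship what was gathered

void build_reply(int want) {
  int gathered = 0;
  for (int i = 0; i < want; ++i) {
    if (blocked()) {
      if (gathered == 0) { wait_here(); return; } // empty: wait, send nothing
      break;                                      // non-empty: stop early
    }
    add_item();
    ++gathered;
  }
  send_partial(gathered);
}
// ----------------------------------------------------------------------------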
- if (!reply->trace.length()) - reply->starts_with = MDiscoverReply::DIR; - replicate_dir(curdir, from, reply->trace); - dout(7) << "handle_discover added dir " << *curdir << dendl; - } - - // lookup - CDentry *dn = 0; - if (curdir->get_version() == 0) { - // fetch newly opened dir - assert(!curdir->has_bloom()); - } else if (dis->get_want().depth() > 0) { - // lookup dentry - dn = curdir->lookup(dis->get_dentry(i), snapid); - } else - break; // done! - - // incomplete dir? - if (!dn) { - if (!curdir->is_complete() && - (!curdir->has_bloom() || curdir->is_in_bloom(dis->get_dentry(i)))) { - // readdir - dout(7) << "incomplete dir contents for " << *curdir << ", fetching" << dendl; - if (reply->is_empty()) { - // fetch and wait - curdir->fetch(new C_MDS_RetryMessage(mds, dis), - dis->wants_base_dir() && curdir->get_version() == 0); - reply->put(); - return; - } else { - // initiate fetch, but send what we have so far - curdir->fetch(0); - break; - } - } - - // send null dentry - dout(7) << "dentry " << dis->get_dentry(i) << " dne, returning null in " - << *curdir << dendl; - dn = curdir->add_null_dentry(dis->get_dentry(i)); - } - assert(dn); - - // don't add replica to purging dentry/inode - if (dn->state_test(CDentry::STATE_PURGING)) { - if (reply->is_empty()) - reply->set_flag_error_dn(dis->get_dentry(i)); - break; - } - - CDentry::linkage_t *dnl = dn->get_linkage(); - - // xlocked dentry? - // ...always block on non-tail items (they are unrelated) - // ...allow xlocked tail disocvery _only_ if explicitly requested - bool tailitem = (dis->get_want().depth() == 0) || (i == dis->get_want().depth() - 1); - if (dn->lock.is_xlocked()) { - // is this the last (tail) item in the discover traversal? - if (tailitem && dis->wants_xlocked()) { - dout(7) << "handle_discover allowing discovery of xlocked tail " << *dn << dendl; - } else if (reply->is_empty()) { - dout(7) << "handle_discover blocking on xlocked " << *dn << dendl; - dn->lock.add_waiter(SimpleLock::WAIT_RD, new C_MDS_RetryMessage(mds, dis)); - reply->put(); - return; - } else { - dout(7) << "handle_discover non-empty reply, xlocked tail " << *dn << dendl; - break; - } - } - - // frozen inode? - if (dnl->is_primary() && dnl->get_inode()->is_frozen_inode()) { - if (tailitem && dis->wants_xlocked()) { - dout(7) << "handle_discover allowing discovery of frozen tail " << *dnl->get_inode() << dendl; - } else if (reply->is_empty()) { - dout(7) << *dnl->get_inode() << " is frozen, empty reply, waiting" << dendl; - dnl->get_inode()->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryMessage(mds, dis)); - reply->put(); - return; - } else { - dout(7) << *dnl->get_inode() << " is frozen, non-empty reply, stopping" << dendl; - break; - } - } - - // add dentry - if (!reply->trace.length()) - reply->starts_with = MDiscoverReply::DENTRY; - replicate_dentry(dn, from, reply->trace); - dout(7) << "handle_discover added dentry " << *dn << dendl; - - if (!dnl->is_primary()) break; // stop on null or remote link. - - // add inode - CInode *next = dnl->get_inode(); - assert(next->is_auth()); - - replicate_inode(next, from, reply->trace, mds->mdsmap->get_up_features()); - dout(7) << "handle_discover added inode " << *next << dendl; - - // descend, keep going. - cur = next; - continue; - } - - // how did we do? - assert(!reply->is_empty()); - dout(7) << "handle_discover sending result back to asker mds." 
<< from << dendl; - mds->send_message(reply, dis->get_connection()); - - dis->put(); -} - -/* This function DOES put the passed message before returning */ -void MDCache::handle_discover_reply(MDiscoverReply *m) -{ - /* - if (mds->get_state() < MDSMap::STATE_ACTIVE) { - dout(0) << "discover_reply NOT ACTIVE YET" << dendl; - m->put(); - return; - } - */ - dout(7) << "discover_reply " << *m << dendl; - if (m->is_flag_error_dir()) - dout(7) << " flag error, dir" << dendl; - if (m->is_flag_error_dn()) - dout(7) << " flag error, dentry = " << m->get_error_dentry() << dendl; - - list finished, error; - mds_rank_t from = mds_rank_t(m->get_source().num()); - - // starting point - CInode *cur = get_inode(m->get_base_ino()); - bufferlist::iterator p = m->trace.begin(); - - int next = m->starts_with; - - // decrement discover counters - if (m->get_tid()) { - map::iterator p = discovers.find(m->get_tid()); - if (p != discovers.end()) { - dout(10) << " found tid " << m->get_tid() << dendl; - discovers.erase(p); - } else { - dout(10) << " tid " << m->get_tid() << " not found, must be dup reply" << dendl; - } - } - - // discover may start with an inode - if (!p.end() && next == MDiscoverReply::INODE) { - cur = add_replica_inode(p, NULL, finished); - dout(7) << "discover_reply got base inode " << *cur << dendl; - assert(cur->is_base()); - - next = MDiscoverReply::DIR; - - // take waiters? - if (cur->is_base() && - waiting_for_base_ino[from].count(cur->ino())) { - finished.swap(waiting_for_base_ino[from][cur->ino()]); - waiting_for_base_ino[from].erase(cur->ino()); - } - } - assert(cur); - - // loop over discover results. - // indexes follow each ([[dir] dentry] inode) - // can start, end with any type. - while (!p.end()) { - // dir - frag_t fg; - CDir *curdir = 0; - if (next == MDiscoverReply::DIR) { - curdir = add_replica_dir(p, cur, mds_rank_t(m->get_source().num()), finished); - if (cur->ino() == m->get_base_ino() && curdir->get_frag() != m->get_base_dir_frag()) { - assert(m->get_wanted_base_dir()); - cur->take_dir_waiting(m->get_base_dir_frag(), finished); - } - } else { - // note: this can only happen our first way around this loop. - if (p.end() && m->is_flag_error_dn()) { - fg = cur->pick_dirfrag(m->get_error_dentry()); - curdir = cur->get_dirfrag(fg); - } else - curdir = cur->get_dirfrag(m->get_base_dir_frag()); - } - - if (p.end()) - break; - - // dentry - CDentry *dn = add_replica_dentry(p, curdir, finished); - - if (p.end()) - break; - - // inode - cur = add_replica_inode(p, dn, finished); - - next = MDiscoverReply::DIR; - } - - // dir error? - // or dir_auth hint? - if (m->is_flag_error_dir() && !cur->is_dir()) { - // not a dir. - cur->take_waiting(CInode::WAIT_DIR, error); - } else if (m->is_flag_error_dir() || m->get_dir_auth_hint() != CDIR_AUTH_UNKNOWN) { - mds_rank_t who = m->get_dir_auth_hint(); - if (who == mds->get_nodeid()) who = -1; - if (who >= 0) - dout(7) << " dir_auth_hint is " << m->get_dir_auth_hint() << dendl; - - - if (m->get_wanted_base_dir()) { - frag_t fg = m->get_base_dir_frag(); - CDir *dir = cur->get_dirfrag(fg); - - if (cur->is_waiting_for_dir(fg)) { - if (cur->is_auth()) - cur->take_waiting(CInode::WAIT_DIR, finished); - else if (dir || !cur->dirfragtree.is_leaf(fg)) - cur->take_dir_waiting(fg, finished); - else - discover_dir_frag(cur, fg, 0, who); - } else - dout(7) << " doing nothing, nobody is waiting for dir" << dendl; - } - - // try again? 
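// ---- editor's note --------------------------------------------------------
// Each outbound discover is tracked in a tid-keyed map, and the reply path
// above erases that entry; a reply whose tid is already gone is logged as a
// duplicate, and processing continues anyway because the add_replica_*
// helpers below are idempotent upserts. The dedup in miniature (sketch):

#include <cstdint>
#include <map>

struct DiscoverInfo { int target_mds = 0; };
std::map<uint64_t, DiscoverInfo> pending_discovers;

bool note_reply(uint64_t tid) {
  auto it = pending_discovers.find(tid);
  if (it == pending_discovers.end())
    return false;               // dup/stale reply: counter already cleared
  pending_discovers.erase(it);  // first reply for this tid
  return true;
}
// ----------------------------------------------------------------------------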
- if (m->get_error_dentry().length()) { - frag_t fg = cur->pick_dirfrag(m->get_error_dentry()); - CDir *dir = cur->get_dirfrag(fg); - // wanted a dentry - if (dir && dir->is_waiting_for_dentry(m->get_error_dentry(), m->get_wanted_snapid())) { - if (dir->is_auth() || dir->lookup(m->get_error_dentry())) { - dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(), - m->get_wanted_snapid(), finished); - } else { - filepath relpath(m->get_error_dentry(), 0); - discover_path(dir, m->get_wanted_snapid(), relpath, 0, m->get_wanted_xlocked()); - } - } else - dout(7) << " doing nothing, have dir but nobody is waiting on dentry " - << m->get_error_dentry() << dendl; - } - } else if (m->is_flag_error_dn()) { - frag_t fg = cur->pick_dirfrag(m->get_error_dentry()); - CDir *dir = cur->get_dirfrag(fg); - if (dir) { - if (dir->is_auth()) { - dir->take_sub_waiting(finished); - } else { - dir->take_dentry_waiting(m->get_error_dentry(), m->get_wanted_snapid(), - m->get_wanted_snapid(), error); - } - } - } - - // waiters - finish_contexts(g_ceph_context, error, -ENOENT); // finish errors directly - mds->queue_waiters(finished); - - // done - m->put(); -} - - - -// ---------------------------- -// REPLICAS - -CDir *MDCache::add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from, - list& finished) -{ - dirfrag_t df; - ::decode(df, p); - - assert(diri->ino() == df.ino); - - // add it (_replica_) - CDir *dir = diri->get_dirfrag(df.frag); - - if (dir) { - // had replica. update w/ new nonce. - dir->decode_replica(p); - dout(7) << "add_replica_dir had " << *dir << " nonce " << dir->replica_nonce << dendl; - } else { - // force frag to leaf in the diri tree - if (!diri->dirfragtree.is_leaf(df.frag)) { - dout(7) << "add_replica_dir forcing frag " << df.frag << " to leaf in the fragtree " - << diri->dirfragtree << dendl; - diri->dirfragtree.force_to_leaf(g_ceph_context, df.frag); - } - - // add replica. - dir = diri->add_dirfrag( new CDir(diri, df.frag, this, false) ); - dir->decode_replica(p); - - // is this a dir_auth delegation boundary? - if (from != diri->authority().first || - diri->is_ambiguous_auth() || - diri->is_base()) - adjust_subtree_auth(dir, from); - - dout(7) << "add_replica_dir added " << *dir << " nonce " << dir->replica_nonce << dendl; - - // get waiters - diri->take_dir_waiting(df.frag, finished); - } - - return dir; -} - -CDentry *MDCache::add_replica_dentry(bufferlist::iterator& p, CDir *dir, list& finished) -{ - string name; - snapid_t last; - ::decode(name, p); - ::decode(last, p); - - CDentry *dn = dir->lookup(name, last); - - // have it? 
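- // two cases follow: a dentry replica we already hold is only refreshed
- // (decode_replica(p, false)), while an unknown one is instantiated as a
- // null dentry first and then populated (decode_replica(p, true))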
- if (dn) { - dn->decode_replica(p, false); - dout(7) << "add_replica_dentry had " << *dn << dendl; - } else { - dn = dir->add_null_dentry(name, 1 /* this will get updated below */, last); - dn->decode_replica(p, true); - dout(7) << "add_replica_dentry added " << *dn << dendl; - } - - dir->take_dentry_waiting(name, dn->first, dn->last, finished); - - return dn; -} - -CInode *MDCache::add_replica_inode(bufferlist::iterator& p, CDentry *dn, list& finished) -{ - inodeno_t ino; - snapid_t last; - ::decode(ino, p); - ::decode(last, p); - CInode *in = get_inode(ino, last); - if (!in) { - in = new CInode(this, false, 1, last); - in->decode_replica(p, true); - add_inode(in); - if (in->ino() == MDS_INO_ROOT) - in->inode_auth.first = 0; - else if (in->is_mdsdir()) - in->inode_auth.first = in->ino() - MDS_INO_MDSDIR_OFFSET; - dout(10) << "add_replica_inode added " << *in << dendl; - if (dn) { - assert(dn->get_linkage()->is_null()); - dn->dir->link_primary_inode(dn, in); - } - } else { - in->decode_replica(p, false); - dout(10) << "add_replica_inode had " << *in << dendl; - } - - if (dn) { - if (!dn->get_linkage()->is_primary() || dn->get_linkage()->get_inode() != in) - dout(10) << "add_replica_inode different linkage in dentry " << *dn << dendl; - } - - return in; -} - - -void MDCache::replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl) -{ - uint64_t features = mds->mdsmap->get_up_features(); - replicate_inode(get_myin(), who, bl, features); - replicate_dir(straydn->get_dir()->inode->get_parent_dn()->get_dir(), who, bl); - replicate_dentry(straydn->get_dir()->inode->get_parent_dn(), who, bl); - replicate_inode(straydn->get_dir()->inode, who, bl, features); - replicate_dir(straydn->get_dir(), who, bl); - replicate_dentry(straydn, who, bl); -} - -CDentry *MDCache::add_replica_stray(bufferlist &bl, mds_rank_t from) -{ - list finished; - bufferlist::iterator p = bl.begin(); - - CInode *mdsin = add_replica_inode(p, NULL, finished); - CDir *mdsdir = add_replica_dir(p, mdsin, from, finished); - CDentry *straydirdn = add_replica_dentry(p, mdsdir, finished); - CInode *strayin = add_replica_inode(p, straydirdn, finished); - CDir *straydir = add_replica_dir(p, strayin, from, finished); - CDentry *straydn = add_replica_dentry(p, straydir, finished); - if (!finished.empty()) - mds->queue_waiters(finished); - - return straydn; -} - - -int MDCache::send_dir_updates(CDir *dir, bool bcast) -{ - // this is an FYI, re: replication - - set who; - if (bcast) { - mds->get_mds_map()->get_active_mds_set(who); - } else { - for (const auto &p : dir->get_replicas()) { - who.insert(p.first); - } - } - - dout(7) << "sending dir_update on " << *dir << " bcast " << bcast << " to " << who << dendl; - - filepath path; - dir->inode->make_path(path); - - mds_rank_t whoami = mds->get_nodeid(); - for (set::iterator it = who.begin(); - it != who.end(); - ++it) { - if (*it == whoami) continue; - //if (*it == except) continue; - dout(7) << "sending dir_update on " << *dir << " to " << *it << dendl; - - mds->send_message_mds(new MDirUpdate(mds->get_nodeid(), - dir->dirfrag(), - dir->dir_rep, - dir->dir_rep_by, - path, - bcast), - *it); - } - - return 0; -} - -/* This function DOES put the passed message before returning */ -void MDCache::handle_dir_update(MDirUpdate *m) -{ - dirfrag_t df = m->get_dirfrag(); - CDir *dir = get_dirfrag(df); - if (!dir) { - dout(5) << "dir_update on " << df << ", don't have it" << dendl; - - // discover it? - if (m->should_discover()) { - // only try once! 
- // this is key to avoid a fragtree update race, among other things.
- m->inc_tried_discover();
- vector<CDentry*> trace;
- CInode *in;
- filepath path = m->get_path();
- dout(5) << "trying discover on dir_update for " << path << dendl;
- MDRequestRef null_ref;
- int r = path_traverse(null_ref, m, NULL, path, &trace, &in, MDS_TRAVERSE_DISCOVER);
- if (r > 0)
- return;
- if (r == 0 &&
- in->ino() == df.ino &&
- in->get_approx_dirfrag(df.frag) == NULL) {
- open_remote_dirfrag(in, df.frag, new C_MDS_RetryMessage(mds, m));
- return;
- }
- }
-
- m->put();
- return;
- }
-
- if (!m->has_tried_discover()) {
- // Update if it already exists. Otherwise it got updated by discover reply.
- dout(5) << "dir_update on " << *dir << dendl;
- dir->dir_rep = m->get_dir_rep();
- dir->dir_rep_by = m->get_dir_rep_by();
- }
-
- // done
- m->put();
-}
-
-
-
-
-
-// LINK
-
-void MDCache::send_dentry_link(CDentry *dn, MDRequestRef& mdr)
-{
- dout(7) << "send_dentry_link " << *dn << dendl;
-
- CDir *subtree = get_subtree_root(dn->get_dir());
- for (const auto &p : dn->get_replicas()) {
- // don't tell (rename) witnesses; they already know
- if (mdr.get() && mdr->more()->witnessed.count(p.first))
- continue;
- if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN ||
- (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN &&
- rejoin_gather.count(p.first)))
- continue;
- CDentry::linkage_t *dnl = dn->get_linkage();
- MDentryLink *m = new MDentryLink(subtree->dirfrag(), dn->get_dir()->dirfrag(),
- dn->name, dnl->is_primary());
- if (dnl->is_primary()) {
- dout(10) << " primary " << *dnl->get_inode() << dendl;
- replicate_inode(dnl->get_inode(), p.first, m->bl,
- mds->mdsmap->get_up_features());
- } else if (dnl->is_remote()) {
- inodeno_t ino = dnl->get_remote_ino();
- __u8 d_type = dnl->get_remote_d_type();
- dout(10) << " remote " << ino << " " << d_type << dendl;
- ::encode(ino, m->bl);
- ::encode(d_type, m->bl);
- } else
- ceph_abort(); // aie, bad caller!
- mds->send_message_mds(m, p.first);
- }
-}
-
-/* This function DOES put the passed message before returning */
-void MDCache::handle_dentry_link(MDentryLink *m)
-{
-
- CDentry *dn = NULL;
- CDir *dir = get_dirfrag(m->get_dirfrag());
- if (!dir) {
- dout(7) << "handle_dentry_link don't have dirfrag " << m->get_dirfrag() << dendl;
- } else {
- dn = dir->lookup(m->get_dn());
- if (!dn) {
- dout(7) << "handle_dentry_link don't have dentry " << *dir << " dn " << m->get_dn() << dendl;
- } else {
- dout(7) << "handle_dentry_link on " << *dn << dendl;
- CDentry::linkage_t *dnl = dn->get_linkage();
-
- assert(!dn->is_auth());
- assert(dnl->is_null());
- }
- }
-
- bufferlist::iterator p = m->bl.begin();
- list<MDSInternalContextBase*> finished;
- if (dn) {
- if (m->get_is_primary()) {
- // primary link.
- add_replica_inode(p, dn, finished);
- } else {
- // remote link, easy enough.
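- // a remote link travels as the bare (ino, d_type) pair encoded by
- // send_dentry_link() above; a minimal sketch of the wire order the
- // decode below must mirror:
- //
- // ::encode(ino, m->bl); // inodeno_t of the link target
- // ::encode(d_type, m->bl); // __u8 DT_* type byte, as used by readdir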
- inodeno_t ino; - __u8 d_type; - ::decode(ino, p); - ::decode(d_type, p); - dir->link_remote_inode(dn, ino, d_type); - } - } else { - ceph_abort(); - } - - if (!finished.empty()) - mds->queue_waiters(finished); - - m->put(); - return; -} - - -// UNLINK - -void MDCache::send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr) -{ - dout(10) << "send_dentry_unlink " << *dn << dendl; - // share unlink news with replicas - set replicas; - dn->list_replicas(replicas); - if (straydn) - straydn->list_replicas(replicas); - for (set::iterator it = replicas.begin(); - it != replicas.end(); - ++it) { - // don't tell (rmdir) witnesses; they already know - if (mdr.get() && mdr->more()->witnessed.count(*it)) - continue; - - if (mds->mdsmap->get_state(*it) < MDSMap::STATE_REJOIN || - (mds->mdsmap->get_state(*it) == MDSMap::STATE_REJOIN && - rejoin_gather.count(*it))) - continue; - - MDentryUnlink *unlink = new MDentryUnlink(dn->get_dir()->dirfrag(), dn->name); - if (straydn) - replicate_stray(straydn, *it, unlink->straybl); - mds->send_message_mds(unlink, *it); - } -} - -/* This function DOES put the passed message before returning */ -void MDCache::handle_dentry_unlink(MDentryUnlink *m) -{ - // straydn - CDentry *straydn = NULL; - if (m->straybl.length()) - straydn = add_replica_stray(m->straybl, mds_rank_t(m->get_source().num())); - - CDir *dir = get_dirfrag(m->get_dirfrag()); - if (!dir) { - dout(7) << "handle_dentry_unlink don't have dirfrag " << m->get_dirfrag() << dendl; - } else { - CDentry *dn = dir->lookup(m->get_dn()); - if (!dn) { - dout(7) << "handle_dentry_unlink don't have dentry " << *dir << " dn " << m->get_dn() << dendl; - } else { - dout(7) << "handle_dentry_unlink on " << *dn << dendl; - CDentry::linkage_t *dnl = dn->get_linkage(); - - // open inode? - if (dnl->is_primary()) { - CInode *in = dnl->get_inode(); - dn->dir->unlink_inode(dn); - assert(straydn); - straydn->dir->link_primary_inode(straydn, in); - - // in->first is lazily updated on replica; drag it forward so - // that we always keep it in sync with the dnq - assert(straydn->first >= in->first); - in->first = straydn->first; - - // update subtree map? - if (in->is_dir()) - adjust_subtree_after_rename(in, dir, false); - - // send caps to auth (if we're not already) - if (in->is_any_caps() && - !in->state_test(CInode::STATE_EXPORTINGCAPS)) - migrator->export_caps(in); - - straydn = NULL; - } else { - assert(!straydn); - assert(dnl->is_remote()); - dn->dir->unlink_inode(dn); - } - assert(dnl->is_null()); - } - } - - // race with trim_dentry() - if (straydn) { - assert(straydn->get_num_ref() == 0); - assert(straydn->get_linkage()->is_null()); - map expiremap; - trim_dentry(straydn, expiremap); - send_expire_messages(expiremap); - } - - m->put(); - return; -} - - - - - - -// =================================================================== - - - -// =================================================================== -// FRAGMENT - - -/** - * adjust_dir_fragments -- adjust fragmentation for a directory - * - * @param diri directory inode - * @param basefrag base fragment - * @param bits bit adjustment. positive for split, negative for merge. 
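- * A positive bits splits basefrag into 2^bits deeper fragments; a
- * negative bits merges the 2^|bits| fragments under basefrag back into
- * basefrag itself (e.g. bits == -1 folds a pair of siblings into their
- * parent).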
- */ -void MDCache::adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, - list& resultfrags, - list& waiters, - bool replay) -{ - dout(10) << "adjust_dir_fragments " << basefrag << " " << bits - << " on " << *diri << dendl; - - list srcfrags; - diri->get_dirfrags_under(basefrag, srcfrags); - - adjust_dir_fragments(diri, srcfrags, basefrag, bits, resultfrags, waiters, replay); -} - -CDir *MDCache::force_dir_fragment(CInode *diri, frag_t fg, bool replay) -{ - CDir *dir = diri->get_dirfrag(fg); - if (dir) - return dir; - - dout(10) << "force_dir_fragment " << fg << " on " << *diri << dendl; - - list src, result; - list waiters; - - // split a parent? - frag_t parent = diri->dirfragtree.get_branch_or_leaf(fg); - while (1) { - CDir *pdir = diri->get_dirfrag(parent); - if (pdir) { - int split = fg.bits() - parent.bits(); - dout(10) << " splitting parent by " << split << " " << *pdir << dendl; - src.push_back(pdir); - adjust_dir_fragments(diri, src, parent, split, result, waiters, replay); - dir = diri->get_dirfrag(fg); - if (dir) { - dout(10) << "force_dir_fragment result " << *dir << dendl; - break; - } - } - if (parent == frag_t()) - break; - frag_t last = parent; - parent = parent.parent(); - dout(10) << " " << last << " parent is " << parent << dendl; - } - - if (!dir) { - // hoover up things under fg? - diri->get_dirfrags_under(fg, src); - if (src.empty()) { - dout(10) << "force_dir_fragment no frags under " << fg << dendl; - } else { - dout(10) << " will combine frags under " << fg << ": " << src << dendl; - adjust_dir_fragments(diri, src, fg, 0, result, waiters, replay); - dir = result.front(); - dout(10) << "force_dir_fragment result " << *dir << dendl; - } - } - if (!replay) - mds->queue_waiters(waiters); - return dir; -} - -void MDCache::adjust_dir_fragments(CInode *diri, - list& srcfrags, - frag_t basefrag, int bits, - list& resultfrags, - list& waiters, - bool replay) -{ - dout(10) << "adjust_dir_fragments " << basefrag << " bits " << bits - << " srcfrags " << srcfrags - << " on " << *diri << dendl; - - // adjust fragtree - // yuck. we may have discovered the inode while it was being fragmented. - if (!diri->dirfragtree.is_leaf(basefrag)) - diri->dirfragtree.force_to_leaf(g_ceph_context, basefrag); - - if (bits > 0) - diri->dirfragtree.split(basefrag, bits); - dout(10) << " new fragtree is " << diri->dirfragtree << dendl; - - if (srcfrags.empty()) - return; - - // split - CDir *parent_dir = diri->get_parent_dir(); - CDir *parent_subtree = 0; - if (parent_dir) - parent_subtree = get_subtree_root(parent_dir); - - if (bits > 0) { - // SPLIT - assert(srcfrags.size() == 1); - CDir *dir = srcfrags.front(); - - dir->split(bits, resultfrags, waiters, replay); - - // did i change the subtree map? - if (dir->is_subtree_root()) { - // new frags are now separate subtrees - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - ++p) - subtrees[*p].clear(); // new frag is now its own subtree - - // was i a bound? - if (parent_subtree) { - assert(subtrees[parent_subtree].count(dir)); - subtrees[parent_subtree].erase(dir); - for (list::iterator p = resultfrags.begin(); - p != resultfrags.end(); - ++p) { - assert((*p)->is_subtree_root()); - subtrees[parent_subtree].insert(*p); - } - } - - // adjust my bounds. 
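- // (a former bound of the split dir may now live under any one of the
- // result frags, so each bound is re-homed beneath whichever subtree
- // root now contains its parent dir, i.e. the get_subtree_root()
- // lookup below)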
- set bounds; - bounds.swap(subtrees[dir]); - subtrees.erase(dir); - for (set::iterator p = bounds.begin(); - p != bounds.end(); - ++p) { - CDir *frag = get_subtree_root((*p)->get_parent_dir()); - subtrees[frag].insert(*p); - } - - show_subtrees(10); - - // dir has no PIN_SUBTREE; CDir::purge_stolen() drops it. - dir->dir_auth = CDIR_AUTH_DEFAULT; - } - - diri->close_dirfrag(dir->get_frag()); - - } else { - // MERGE - - // are my constituent bits subtrees? if so, i will be too. - // (it's all or none, actually.) - bool any_subtree = false; - for (CDir *dir : srcfrags) { - if (dir->is_subtree_root()) { - any_subtree = true; - break; - } - } - set new_bounds; - if (any_subtree) { - for (CDir *dir : srcfrags) { - // this simplifies the code that find subtrees underneath the dirfrag - if (!dir->is_subtree_root()) { - dir->state_set(CDir::STATE_AUXSUBTREE); - adjust_subtree_auth(dir, mds->get_nodeid()); - } - } - - for (CDir *dir : srcfrags) { - assert(dir->is_subtree_root()); - dout(10) << " taking srcfrag subtree bounds from " << *dir << dendl; - map >::iterator q = subtrees.find(dir); - set::iterator r = q->second.begin(); - while (r != subtrees[dir].end()) { - new_bounds.insert(*r); - subtrees[dir].erase(r++); - } - subtrees.erase(q); - - // remove myself as my parent's bound - if (parent_subtree) - subtrees[parent_subtree].erase(dir); - } - } - - // merge - CDir *f = new CDir(diri, basefrag, this, srcfrags.front()->is_auth()); - f->merge(srcfrags, waiters, replay); - - if (any_subtree) { - assert(f->is_subtree_root()); - subtrees[f].swap(new_bounds); - if (parent_subtree) - subtrees[parent_subtree].insert(f); - - show_subtrees(10); - } - - resultfrags.push_back(f); - } -} - - -class C_MDC_FragmentFrozen : public MDSInternalContext { - MDCache *mdcache; - MDRequestRef mdr; -public: - C_MDC_FragmentFrozen(MDCache *m, MDRequestRef& r) : - MDSInternalContext(m->mds), mdcache(m), mdr(r) {} - void finish(int r) override { - mdcache->fragment_frozen(mdr, r); - } -}; - -bool MDCache::can_fragment(CInode *diri, list& dirs) -{ - if (is_readonly()) { - dout(7) << "can_fragment: read-only FS, no fragmenting for now" << dendl; - return false; - } - if (mds->is_cluster_degraded()) { - dout(7) << "can_fragment: cluster degraded, no fragmenting for now" << dendl; - return false; - } - if (diri->get_parent_dir() && - diri->get_parent_dir()->get_inode()->is_stray()) { - dout(7) << "can_fragment: i won't merge|split anything in stray" << dendl; - return false; - } - if (diri->is_mdsdir() || diri->is_stray() || diri->ino() == MDS_INO_CEPH) { - dout(7) << "can_fragment: i won't fragment the mdsdir or straydir or .ceph" << dendl; - return false; - } - - if (diri->scrub_is_in_progress()) { - dout(7) << "can_fragment: scrub in progress" << dendl; - return false; - } - - for (list::iterator p = dirs.begin(); p != dirs.end(); ++p) { - CDir *dir = *p; - if (dir->state_test(CDir::STATE_FRAGMENTING)) { - dout(7) << "can_fragment: already fragmenting " << *dir << dendl; - return false; - } - if (!dir->is_auth()) { - dout(7) << "can_fragment: not auth on " << *dir << dendl; - return false; - } - if (dir->is_bad()) { - dout(7) << "can_fragment: bad dirfrag " << *dir << dendl; - return false; - } - if (dir->is_frozen() || - dir->is_freezing()) { - dout(7) << "can_fragment: can't merge, freezing|frozen. wait for other exports to finish first." 
<< dendl;
- return false;
- }
- }
-
- return true;
-}
-
-void MDCache::split_dir(CDir *dir, int bits)
-{
- dout(7) << __func__ << " " << *dir << " bits " << bits << dendl;
- assert(dir->is_auth());
- CInode *diri = dir->inode;
-
- list<CDir*> dirs;
- dirs.push_back(dir);
-
- if (!can_fragment(diri, dirs)) {
- dout(7) << __func__ << " cannot fragment right now, dropping" << dendl;
- return;
- }
-
- if (dir->frag.bits() + bits > 24) {
- dout(7) << __func__ << " frag bits > 24, dropping" << dendl;
- return;
- }
-
- MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
- mdr->more()->fragment_base = dir->dirfrag();
-
- assert(fragments.count(dir->dirfrag()) == 0);
- fragment_info_t& info = fragments[dir->dirfrag()];
- info.mdr = mdr;
- info.dirs.push_back(dir);
- info.bits = bits;
- info.last_cum_auth_pins_change = ceph_clock_now();
-
- fragment_freeze_dirs(dirs);
- // initial mark+complete pass
- fragment_mark_and_complete(mdr);
-}
-
-void MDCache::merge_dir(CInode *diri, frag_t frag)
-{
- dout(7) << "merge_dir to " << frag << " on " << *diri << dendl;
-
- list<CDir*> dirs;
- if (!diri->get_dirfrags_under(frag, dirs)) {
- dout(7) << "don't have all frags under " << frag << " for " << *diri << dendl;
- return;
- }
-
- if (diri->dirfragtree.is_leaf(frag)) {
- dout(10) << " " << frag << " already a leaf for " << *diri << dendl;
- return;
- }
-
- if (!can_fragment(diri, dirs))
- return;
-
- CDir *first = dirs.front();
- int bits = first->get_frag().bits() - frag.bits();
- dout(10) << " we are merging by " << bits << " bits" << dendl;
-
- dirfrag_t basedirfrag(diri->ino(), frag);
- MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FRAGMENTDIR);
- mdr->more()->fragment_base = basedirfrag;
-
- assert(fragments.count(basedirfrag) == 0);
- fragment_info_t& info = fragments[basedirfrag];
- info.mdr = mdr;
- info.dirs = dirs;
- info.bits = -bits;
- info.last_cum_auth_pins_change = ceph_clock_now();
-
- fragment_freeze_dirs(dirs);
- // initial mark+complete pass
- fragment_mark_and_complete(mdr);
-}
-
-void MDCache::fragment_freeze_dirs(list<CDir*>& dirs)
-{
- for (list<CDir*>::iterator p = dirs.begin(); p != dirs.end(); ++p) {
- CDir *dir = *p;
- dir->auth_pin(dir); // until we mark and complete them
- dir->state_set(CDir::STATE_FRAGMENTING);
- dir->freeze_dir();
- assert(dir->is_freezing_dir());
- }
-}
-
-class C_MDC_FragmentMarking : public MDCacheContext {
- MDRequestRef mdr;
-public:
- C_MDC_FragmentMarking(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {}
- void finish(int r) override {
- mdcache->fragment_mark_and_complete(mdr);
- }
-};
-
-void MDCache::fragment_mark_and_complete(MDRequestRef& mdr)
-{
- dirfrag_t basedirfrag = mdr->more()->fragment_base;
- map<dirfrag_t,fragment_info_t>::iterator it = fragments.find(basedirfrag);
- if (it == fragments.end() || it->second.mdr != mdr) {
- dout(7) << "fragment_mark_and_complete " << basedirfrag << " must have aborted" << dendl;
- request_finish(mdr);
- return;
- }
-
- fragment_info_t& info = it->second;
- CInode *diri = info.dirs.front()->get_inode();
- dout(10) << "fragment_mark_and_complete " << info.dirs << " on " << *diri << dendl;
-
- MDSGatherBuilder gather(g_ceph_context);
-
- for (list<CDir*>::iterator p = info.dirs.begin();
- p != info.dirs.end();
- ++p) {
- CDir *dir = *p;
-
- bool ready = true;
- if (!dir->is_complete()) {
- dout(15) << " fetching incomplete " << *dir << dendl;
- dir->fetch(gather.new_sub(), true); // ignore authpinnability
- ready = false;
- } else if (dir->get_frag() == frag_t()) {
- // The COMPLETE flag gets lost if we fragment a new dirfrag, then
rollback - // the operation. To avoid CDir::fetch() complaining about missing object, - // we commit new dirfrag first. - if (dir->state_test(CDir::STATE_CREATING)) { - dout(15) << " waiting until new dir gets journaled " << *dir << dendl; - dir->add_waiter(CDir::WAIT_CREATED, gather.new_sub()); - ready = false; - } else if (dir->is_new()) { - dout(15) << " committing new " << *dir << dendl; - assert(dir->is_dirty()); - dir->commit(0, gather.new_sub(), true); - ready = false; - } - } - if (!ready) - continue; - - if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) { - dout(15) << " marking " << *dir << dendl; - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dn->get(CDentry::PIN_FRAGMENTING); - assert(!dn->state_test(CDentry::STATE_FRAGMENTING)); - dn->state_set(CDentry::STATE_FRAGMENTING); - } - dir->state_set(CDir::STATE_DNPINNEDFRAG); - dir->auth_unpin(dir); - } else { - dout(15) << " already marked " << *dir << dendl; - } - } - if (gather.has_subs()) { - gather.set_finisher(new C_MDC_FragmentMarking(this, mdr)); - gather.activate(); - return; - } - - for (list::iterator p = info.dirs.begin(); - p != info.dirs.end(); - ++p) { - CDir *dir = *p; - if (!dir->is_frozen_dir()) { - assert(dir->is_freezing_dir()); - dir->add_waiter(CDir::WAIT_FROZEN, gather.new_sub()); - } - } - if (gather.has_subs()) { - gather.set_finisher(new C_MDC_FragmentFrozen(this, mdr)); - gather.activate(); - // flush log so that request auth_pins are retired - mds->mdlog->flush(); - return; - } - - fragment_frozen(mdr, 0); -} - -void MDCache::fragment_unmark_unfreeze_dirs(list& dirs) -{ - dout(10) << "fragment_unmark_unfreeze_dirs " << dirs << dendl; - for (list::iterator p = dirs.begin(); p != dirs.end(); ++p) { - CDir *dir = *p; - dout(10) << " frag " << *dir << dendl; - - assert(dir->state_test(CDir::STATE_FRAGMENTING)); - dir->state_clear(CDir::STATE_FRAGMENTING); - - if (dir->state_test(CDir::STATE_DNPINNEDFRAG)) { - dir->state_clear(CDir::STATE_DNPINNEDFRAG); - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - assert(dn->state_test(CDentry::STATE_FRAGMENTING)); - dn->state_clear(CDentry::STATE_FRAGMENTING); - dn->put(CDentry::PIN_FRAGMENTING); - } - } else { - dir->auth_unpin(dir); - } - - dir->unfreeze_dir(); - } -} - -bool MDCache::fragment_are_all_frozen(CDir *dir) -{ - assert(dir->is_frozen_dir()); - map::iterator p; - for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0)); - p != fragments.end() && p->first.ino == dir->ino(); - ++p) { - if (p->first.frag.contains(dir->get_frag())) - return p->second.all_frozen; - } - ceph_abort(); - return false; -} - -void MDCache::fragment_freeze_inc_num_waiters(CDir *dir) -{ - map::iterator p; - for (p = fragments.lower_bound(dirfrag_t(dir->ino(), 0)); - p != fragments.end() && p->first.ino == dir->ino(); - ++p) { - if (p->first.frag.contains(dir->get_frag())) { - p->second.num_remote_waiters++; - return; - } - } - ceph_abort(); -} - -void MDCache::find_stale_fragment_freeze() -{ - dout(10) << "find_stale_fragment_freeze" << dendl; - // see comment in Migrator::find_stale_export_freeze() - utime_t now = ceph_clock_now(); - utime_t cutoff = now; - cutoff -= g_conf->mds_freeze_tree_timeout; - - for (map::iterator p = fragments.begin(); - p != fragments.end(); ) { - dirfrag_t df = p->first; - fragment_info_t& info = p->second; - ++p; - if (info.all_frozen) - continue; - CDir *dir; - int total_auth_pins = 0; - for (list::iterator q = 
info.dirs.begin(); - q != info.dirs.end(); - ++q) { - dir = *q; - if (!dir->state_test(CDir::STATE_DNPINNEDFRAG)) { - total_auth_pins = -1; - break; - } - if (dir->is_frozen_dir()) - continue; - total_auth_pins += dir->get_auth_pins() + dir->get_dir_auth_pins(); - } - if (total_auth_pins < 0) - continue; - if (info.last_cum_auth_pins != total_auth_pins) { - info.last_cum_auth_pins = total_auth_pins; - info.last_cum_auth_pins_change = now; - continue; - } - if (info.last_cum_auth_pins_change >= cutoff) - continue; - dir = info.dirs.front(); - if (info.num_remote_waiters > 0 || - (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) { - dout(10) << " cancel fragmenting " << df << " bit " << info.bits << dendl; - list dirs; - info.dirs.swap(dirs); - fragments.erase(df); - fragment_unmark_unfreeze_dirs(dirs); - } - } -} - -class C_MDC_FragmentPrep : public MDCacheLogContext { - MDRequestRef mdr; -public: - C_MDC_FragmentPrep(MDCache *m, MDRequestRef& r) : MDCacheLogContext(m), mdr(r) {} - void finish(int r) override { - mdcache->_fragment_logged(mdr); - } -}; - -class C_MDC_FragmentStore : public MDCacheContext { - MDRequestRef mdr; -public: - C_MDC_FragmentStore(MDCache *m, MDRequestRef& r) : MDCacheContext(m), mdr(r) {} - void finish(int r) override { - mdcache->_fragment_stored(mdr); - } -}; - -class C_MDC_FragmentCommit : public MDCacheLogContext { - dirfrag_t basedirfrag; - list resultfrags; -public: - C_MDC_FragmentCommit(MDCache *m, dirfrag_t df, list& l) : - MDCacheLogContext(m), basedirfrag(df), resultfrags(l) {} - void finish(int r) override { - mdcache->_fragment_committed(basedirfrag, resultfrags); - } -}; - -class C_IO_MDC_FragmentFinish : public MDCacheIOContext { - dirfrag_t basedirfrag; - list resultfrags; -public: - C_IO_MDC_FragmentFinish(MDCache *m, dirfrag_t f, list& l) : - MDCacheIOContext(m), basedirfrag(f) { - resultfrags.swap(l); - } - void finish(int r) override { - assert(r == 0 || r == -ENOENT); - mdcache->_fragment_finish(basedirfrag, resultfrags); - } -}; - -void MDCache::fragment_frozen(MDRequestRef& mdr, int r) -{ - dirfrag_t basedirfrag = mdr->more()->fragment_base; - map::iterator it = fragments.find(basedirfrag); - if (it == fragments.end() || it->second.mdr != mdr) { - dout(7) << "fragment_frozen " << basedirfrag << " must have aborted" << dendl; - request_finish(mdr); - return; - } - - assert(r == 0); - fragment_info_t& info = it->second; - dout(10) << "fragment_frozen " << basedirfrag.frag << " by " << info.bits - << " on " << info.dirs.front()->get_inode() << dendl; - - info.all_frozen = true; - dispatch_fragment_dir(mdr); -} - -void MDCache::dispatch_fragment_dir(MDRequestRef& mdr) -{ - dirfrag_t basedirfrag = mdr->more()->fragment_base; - map::iterator it = fragments.find(basedirfrag); - if (it == fragments.end() || it->second.mdr != mdr) { - dout(7) << "dispatch_fragment_dir " << basedirfrag << " must have aborted" << dendl; - request_finish(mdr); - return; - } - - fragment_info_t& info = it->second; - CInode *diri = info.dirs.front()->get_inode(); - - dout(10) << "dispatch_fragment_dir " << basedirfrag << " bits " << info.bits - << " on " << *diri << dendl; - if (!mdr->aborted) { - set rdlocks, wrlocks, xlocks; - wrlocks.insert(&diri->dirfragtreelock); - // prevent a racing gather on any other scatterlocks too - wrlocks.insert(&diri->nestlock); - wrlocks.insert(&diri->filelock); - if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true)) - if (!mdr->aborted) - return; - } - - if (mdr->aborted) { - dout(10) << " can't 
auth_pin " << *diri << ", requeuing dir " - << info.dirs.front()->dirfrag() << dendl; - if (info.bits > 0) - mds->balancer->queue_split(info.dirs.front(), false); - else - mds->balancer->queue_merge(info.dirs.front()); - fragment_unmark_unfreeze_dirs(info.dirs); - fragments.erase(it); - request_finish(mdr); - return; - } - - mdr->ls = mds->mdlog->get_current_segment(); - EFragment *le = new EFragment(mds->mdlog, EFragment::OP_PREPARE, basedirfrag, info.bits); - mds->mdlog->start_entry(le); - - for (list::iterator p = info.dirs.begin(); p != info.dirs.end(); ++p) { - CDir *dir = *p; - dirfrag_rollback rollback; - rollback.fnode = dir->fnode; - le->add_orig_frag(dir->get_frag(), &rollback); - } - - // refragment - list waiters; - adjust_dir_fragments(diri, info.dirs, basedirfrag.frag, info.bits, - info.resultfrags, waiters, false); - if (g_conf->mds_debug_frag) - diri->verify_dirfrags(); - mds->queue_waiters(waiters); - - for (list::iterator p = le->orig_frags.begin(); p != le->orig_frags.end(); ++p) - assert(!diri->dirfragtree.is_leaf(*p)); - - le->metablob.add_dir_context(*info.resultfrags.begin()); - for (list::iterator p = info.resultfrags.begin(); - p != info.resultfrags.end(); - ++p) { - if (diri->is_auth()) { - le->metablob.add_fragmented_dir(*p, false, false); - } else { - (*p)->state_set(CDir::STATE_DIRTYDFT); - le->metablob.add_fragmented_dir(*p, false, true); - } - } - - // dft lock - if (diri->is_auth()) { - // journal dirfragtree - inode_t *pi = diri->project_inode(); - pi->version = diri->pre_dirty(); - journal_dirty_inode(mdr.get(), &le->metablob, diri); - } else { - mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock); - mdr->ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree); - mdr->add_updated_lock(&diri->dirfragtreelock); - } - - /* - // filelock - mds->locker->mark_updated_scatterlock(&diri->filelock); - mut->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir); - mut->add_updated_lock(&diri->filelock); - - // dirlock - mds->locker->mark_updated_scatterlock(&diri->nestlock); - mut->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest); - mut->add_updated_lock(&diri->nestlock); - */ - - add_uncommitted_fragment(basedirfrag, info.bits, le->orig_frags, mdr->ls); - mds->server->submit_mdlog_entry(le, new C_MDC_FragmentPrep(this, mdr), - mdr, __func__); - mds->mdlog->flush(); -} - -void MDCache::_fragment_logged(MDRequestRef& mdr) -{ - dirfrag_t basedirfrag = mdr->more()->fragment_base; - map::iterator it = fragments.find(basedirfrag); - assert(it != fragments.end()); - fragment_info_t &info = it->second; - CInode *diri = info.resultfrags.front()->get_inode(); - - dout(10) << "fragment_logged " << basedirfrag << " bits " << info.bits - << " on " << *diri << dendl; - - if (diri->is_auth()) - diri->pop_and_dirty_projected_inode(mdr->ls); - - mdr->apply(); // mark scatterlock - - // store resulting frags - MDSGatherBuilder gather(g_ceph_context, new C_MDC_FragmentStore(this, mdr)); - - for (list::iterator p = info.resultfrags.begin(); - p != info.resultfrags.end(); - ++p) { - CDir *dir = *p; - dout(10) << " storing result frag " << *dir << dendl; - - // freeze and store them too - dir->auth_pin(this); - dir->state_set(CDir::STATE_FRAGMENTING); - dir->commit(0, gather.new_sub(), true); // ignore authpinnability - } - - gather.activate(); -} - -void MDCache::_fragment_stored(MDRequestRef& mdr) -{ - dirfrag_t basedirfrag = mdr->more()->fragment_base; - map::iterator it = fragments.find(basedirfrag); - assert(it != 
fragments.end()); - fragment_info_t &info = it->second; - CInode *diri = info.resultfrags.front()->get_inode(); - - dout(10) << "fragment_stored " << basedirfrag << " bits " << info.bits - << " on " << *diri << dendl; - - // tell peers - CDir *first = *info.resultfrags.begin(); - for (const auto &p : first->get_replicas()) { - if (mds->mdsmap->get_state(p.first) < MDSMap::STATE_REJOIN || - (mds->mdsmap->get_state(p.first) == MDSMap::STATE_REJOIN && - rejoin_gather.count(p.first))) - continue; - - MMDSFragmentNotify *notify = new MMDSFragmentNotify(basedirfrag, info.bits); - - // freshly replicate new dirs to peers - for (list::iterator q = info.resultfrags.begin(); - q != info.resultfrags.end(); - ++q) - replicate_dir(*q, p.first, notify->basebl); - - mds->send_message_mds(notify, p.first); - } - - // journal commit - EFragment *le = new EFragment(mds->mdlog, EFragment::OP_COMMIT, basedirfrag, info.bits); - mds->mdlog->start_submit_entry(le, new C_MDC_FragmentCommit(this, basedirfrag, - info.resultfrags)); - - mds->locker->drop_locks(mdr.get()); - - // unfreeze resulting frags - for (list::iterator p = info.resultfrags.begin(); - p != info.resultfrags.end(); - ++p) { - CDir *dir = *p; - dout(10) << " result frag " << *dir << dendl; - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - assert(dn->state_test(CDentry::STATE_FRAGMENTING)); - dn->state_clear(CDentry::STATE_FRAGMENTING); - dn->put(CDentry::PIN_FRAGMENTING); - } - - // unfreeze - dir->unfreeze_dir(); - } - - fragments.erase(it); - request_finish(mdr); -} - -void MDCache::_fragment_committed(dirfrag_t basedirfrag, list& resultfrags) -{ - dout(10) << "fragment_committed " << basedirfrag << dendl; - map::iterator it = uncommitted_fragments.find(basedirfrag); - assert(it != uncommitted_fragments.end()); - ufragment &uf = it->second; - - // remove old frags - C_GatherBuilder gather( - g_ceph_context, - new C_OnFinisher( - new C_IO_MDC_FragmentFinish(this, basedirfrag, resultfrags), - mds->finisher)); - - SnapContext nullsnapc; - object_locator_t oloc(mds->mdsmap->get_metadata_pool()); - for (list::iterator p = uf.old_frags.begin(); - p != uf.old_frags.end(); - ++p) { - object_t oid = CInode::get_object_name(basedirfrag.ino, *p, ""); - ObjectOperation op; - if (*p == frag_t()) { - // backtrace object - dout(10) << " truncate orphan dirfrag " << oid << dendl; - op.truncate(0); - op.omap_clear(); - } else { - dout(10) << " removing orphan dirfrag " << oid << dendl; - op.remove(); - } - mds->objecter->mutate(oid, oloc, op, nullsnapc, - ceph::real_clock::now(), - 0, gather.new_sub()); - } - - assert(gather.has_subs()); - gather.activate(); -} - -void MDCache::_fragment_finish(dirfrag_t basedirfrag, list& resultfrags) -{ - dout(10) << "fragment_finish " << basedirfrag << "resultfrags.size=" - << resultfrags.size() << dendl; - map::iterator it = uncommitted_fragments.find(basedirfrag); - assert(it != uncommitted_fragments.end()); - ufragment &uf = it->second; - - // unmark & auth_unpin - for (const auto &dir : resultfrags) { - dir->state_clear(CDir::STATE_FRAGMENTING); - dir->auth_unpin(this); - - // In case the resulting fragments are beyond the split size, - // we might need to split them again right away (they could - // have been taking inserts between unfreezing and getting - // here) - mds->balancer->maybe_fragment(dir, false); - } - - if (mds->logger) { - if (resultfrags.size() > 1) { - mds->logger->inc(l_mds_dir_split); - } else { - mds->logger->inc(l_mds_dir_merge); - } 
- } - - EFragment *le = new EFragment(mds->mdlog, EFragment::OP_FINISH, basedirfrag, uf.bits); - mds->mdlog->start_submit_entry(le); - - finish_uncommitted_fragment(basedirfrag, EFragment::OP_FINISH); -} - -/* This function DOES put the passed message before returning */ -void MDCache::handle_fragment_notify(MMDSFragmentNotify *notify) -{ - dout(10) << "handle_fragment_notify " << *notify << " from " << notify->get_source() << dendl; - - if (mds->get_state() < MDSMap::STATE_REJOIN) { - notify->put(); - return; - } - - CInode *diri = get_inode(notify->get_ino()); - if (diri) { - frag_t base = notify->get_basefrag(); - int bits = notify->get_bits(); - -/* - if ((bits < 0 && diri->dirfragtree.is_leaf(base)) || - (bits > 0 && !diri->dirfragtree.is_leaf(base))) { - dout(10) << " dft " << diri->dirfragtree << " state doesn't match " << base << " by " << bits - << ", must have found out during resolve/rejoin? ignoring. " << *diri << dendl; - notify->put(); - return; - } -*/ - - // refragment - list waiters; - list resultfrags; - adjust_dir_fragments(diri, base, bits, resultfrags, waiters, false); - if (g_conf->mds_debug_frag) - diri->verify_dirfrags(); - - for (list::iterator p = resultfrags.begin(); p != resultfrags.end(); ++p) - diri->take_dir_waiting((*p)->get_frag(), waiters); - - // add new replica dirs values - bufferlist::iterator p = notify->basebl.begin(); - while (!p.end()) - add_replica_dir(p, diri, mds_rank_t(notify->get_source().num()), waiters); - - mds->queue_waiters(waiters); - } else { - ceph_abort(); - } - - notify->put(); -} - -void MDCache::add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list& old_frags, - LogSegment *ls, bufferlist *rollback) -{ - dout(10) << "add_uncommitted_fragment: base dirfrag " << basedirfrag << " bits " << bits << dendl; - assert(!uncommitted_fragments.count(basedirfrag)); - ufragment& uf = uncommitted_fragments[basedirfrag]; - uf.old_frags = old_frags; - uf.bits = bits; - uf.ls = ls; - ls->uncommitted_fragments.insert(basedirfrag); - if (rollback) - uf.rollback.swap(*rollback); -} - -void MDCache::finish_uncommitted_fragment(dirfrag_t basedirfrag, int op) -{ - dout(10) << "finish_uncommitted_fragments: base dirfrag " << basedirfrag - << " op " << EFragment::op_name(op) << dendl; - map::iterator it = uncommitted_fragments.find(basedirfrag); - if (it != uncommitted_fragments.end()) { - ufragment& uf = it->second; - if (op != EFragment::OP_FINISH && !uf.old_frags.empty()) { - uf.committed = true; - } else { - uf.ls->uncommitted_fragments.erase(basedirfrag); - mds->queue_waiters(uf.waiters); - uncommitted_fragments.erase(it); - } - } -} - -void MDCache::rollback_uncommitted_fragment(dirfrag_t basedirfrag, list& old_frags) -{ - dout(10) << "rollback_uncommitted_fragment: base dirfrag " << basedirfrag - << " old_frags (" << old_frags << ")" << dendl; - map::iterator it = uncommitted_fragments.find(basedirfrag); - if (it != uncommitted_fragments.end()) { - ufragment& uf = it->second; - if (!uf.old_frags.empty()) { - uf.old_frags.swap(old_frags); - uf.committed = true; - } else { - uf.ls->uncommitted_fragments.erase(basedirfrag); - uncommitted_fragments.erase(it); - } - } -} - -void MDCache::rollback_uncommitted_fragments() -{ - dout(10) << "rollback_uncommitted_fragments: " << uncommitted_fragments.size() << " pending" << dendl; - for (map::iterator p = uncommitted_fragments.begin(); - p != uncommitted_fragments.end(); - ++p) { - ufragment &uf = p->second; - CInode *diri = get_inode(p->first.ino); - assert(diri); - - if (uf.committed) { - list 
frags; - diri->get_dirfrags_under(p->first.frag, frags); - for (list::iterator q = frags.begin(); q != frags.end(); ++q) { - CDir *dir = *q; - dir->auth_pin(this); - dir->state_set(CDir::STATE_FRAGMENTING); - } - _fragment_committed(p->first, frags); - continue; - } - - dout(10) << " rolling back " << p->first << " refragment by " << uf.bits << " bits" << dendl; - - LogSegment *ls = mds->mdlog->get_current_segment(); - EFragment *le = new EFragment(mds->mdlog, EFragment::OP_ROLLBACK, p->first, uf.bits); - mds->mdlog->start_entry(le); - bool diri_auth = (diri->authority() != CDIR_AUTH_UNDEF); - - list old_frags; - diri->dirfragtree.get_leaves_under(p->first.frag, old_frags); - - list resultfrags; - if (uf.old_frags.empty()) { - // created by old format EFragment - list waiters; - adjust_dir_fragments(diri, p->first.frag, -uf.bits, resultfrags, waiters, true); - } else { - bufferlist::iterator bp = uf.rollback.begin(); - for (list::iterator q = uf.old_frags.begin(); q != uf.old_frags.end(); ++q) { - CDir *dir = force_dir_fragment(diri, *q); - resultfrags.push_back(dir); - - dirfrag_rollback rollback; - ::decode(rollback, bp); - - dir->set_version(rollback.fnode.version); - dir->fnode = rollback.fnode; - - dir->_mark_dirty(ls); - - if (!(dir->fnode.rstat == dir->fnode.accounted_rstat)) { - dout(10) << " dirty nestinfo on " << *dir << dendl; - mds->locker->mark_updated_scatterlock(&dir->inode->nestlock); - ls->dirty_dirfrag_nest.push_back(&dir->inode->item_dirty_dirfrag_nest); - } - if (!(dir->fnode.fragstat == dir->fnode.accounted_fragstat)) { - dout(10) << " dirty fragstat on " << *dir << dendl; - mds->locker->mark_updated_scatterlock(&dir->inode->filelock); - ls->dirty_dirfrag_dir.push_back(&dir->inode->item_dirty_dirfrag_dir); - } - - le->add_orig_frag(dir->get_frag()); - le->metablob.add_dir_context(dir); - if (diri_auth) { - le->metablob.add_fragmented_dir(dir, true, false); - } else { - dout(10) << " dirty dirfragtree on " << *dir << dendl; - dir->state_set(CDir::STATE_DIRTYDFT); - le->metablob.add_fragmented_dir(dir, true, true); - } - } - } - - if (diri_auth) { - diri->project_inode()->version = diri->pre_dirty(); - diri->pop_and_dirty_projected_inode(ls); // hacky - le->metablob.add_primary_dentry(diri->get_projected_parent_dn(), diri, true); - } else { - mds->locker->mark_updated_scatterlock(&diri->dirfragtreelock); - ls->dirty_dirfrag_dirfragtree.push_back(&diri->item_dirty_dirfrag_dirfragtree); - } - - if (g_conf->mds_debug_frag) - diri->verify_dirfrags(); - - for (list::iterator q = old_frags.begin(); q != old_frags.end(); ++q) - assert(!diri->dirfragtree.is_leaf(*q)); - - for (list::iterator q = resultfrags.begin(); q != resultfrags.end(); ++q) { - CDir *dir = *q; - dir->auth_pin(this); - dir->state_set(CDir::STATE_FRAGMENTING); - } - - mds->mdlog->submit_entry(le); - - uf.old_frags.swap(old_frags); - _fragment_committed(p->first, resultfrags); - } -} - -void MDCache::force_readonly() -{ - if (is_readonly()) - return; - - dout(1) << "force file system read-only" << dendl; - mds->clog->warn() << "force file system read-only"; - - set_readonly(); - - mds->server->force_clients_readonly(); - - // revoke write caps - for (ceph::unordered_map::iterator p = inode_map.begin(); - p != inode_map.end(); - ++p) { - CInode *in = p->second; - if (in->is_head()) - mds->locker->eval(in, CEPH_CAP_LOCKS); - } - - mds->mdlog->flush(); -} - - -// ============================================================== -// debug crap - -void MDCache::show_subtrees(int dbl) -{ - if 
(g_conf->mds_thrash_exports) - dbl += 15; - - //dout(10) << "show_subtrees" << dendl; - - if (!g_conf->subsys.should_gather(ceph_subsys_mds, dbl)) - return; // i won't print anything. - - if (subtrees.empty()) { - dout(dbl) << "show_subtrees - no subtrees" << dendl; - return; - } - - // root frags - list basefrags; - for (set::iterator p = base_inodes.begin(); - p != base_inodes.end(); - ++p) - (*p)->get_dirfrags(basefrags); - //dout(15) << "show_subtrees, base dirfrags " << basefrags << dendl; - dout(15) << "show_subtrees" << dendl; - - // queue stuff - list > q; - string indent; - set seen; - - // calc max depth - for (list::iterator p = basefrags.begin(); p != basefrags.end(); ++p) - q.push_back(pair(*p, 0)); - - set subtrees_seen; - - int depth = 0; - while (!q.empty()) { - CDir *dir = q.front().first; - int d = q.front().second; - q.pop_front(); - - if (subtrees.count(dir) == 0) continue; - - subtrees_seen.insert(dir); - - if (d > depth) depth = d; - - // sanity check - //dout(25) << "saw depth " << d << " " << *dir << dendl; - if (seen.count(dir)) dout(0) << "aah, already seen " << *dir << dendl; - assert(seen.count(dir) == 0); - seen.insert(dir); - - // nested items? - if (!subtrees[dir].empty()) { - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) { - //dout(25) << " saw sub " << **p << dendl; - q.push_front(pair(*p, d+1)); - } - } - } - - - // print tree - for (list::iterator p = basefrags.begin(); p != basefrags.end(); ++p) - q.push_back(pair(*p, 0)); - - while (!q.empty()) { - CDir *dir = q.front().first; - int d = q.front().second; - q.pop_front(); - - if (subtrees.count(dir) == 0) continue; - - // adjust indenter - while ((unsigned)d < indent.size()) - indent.resize(d); - - // pad - string pad = "______________________________________"; - pad.resize(depth*2+1-indent.size()); - if (!subtrees[dir].empty()) - pad[0] = '.'; // parent - - - string auth; - if (dir->is_auth()) - auth = "auth "; - else - auth = " rep "; - - char s[10]; - if (dir->get_dir_auth().second == CDIR_AUTH_UNKNOWN) - snprintf(s, sizeof(s), "%2d ", int(dir->get_dir_auth().first)); - else - snprintf(s, sizeof(s), "%2d,%2d", int(dir->get_dir_auth().first), int(dir->get_dir_auth().second)); - - // print - dout(dbl) << indent << "|_" << pad << s << " " << auth << *dir << dendl; - - if (dir->ino() == MDS_INO_ROOT) - assert(dir->inode == root); - if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid())) - assert(dir->inode == myin); - if (dir->inode->is_stray() && (MDS_INO_STRAY_OWNER(dir->ino()) == mds->get_nodeid())) - assert(strays[MDS_INO_STRAY_INDEX(dir->ino())] == dir->inode); - - // nested items? - if (!subtrees[dir].empty()) { - // more at my level? - if (!q.empty() && q.front().second == d) - indent += "| "; - else - indent += " "; - - for (set::iterator p = subtrees[dir].begin(); - p != subtrees[dir].end(); - ++p) - q.push_front(pair(*p, d+2)); - } - } - - // verify there isn't stray crap in subtree map - int lost = 0; - for (map >::iterator p = subtrees.begin(); - p != subtrees.end(); - ++p) { - if (subtrees_seen.count(p->first)) continue; - dout(10) << "*** stray/lost entry in subtree map: " << *p->first << dendl; - lost++; - } - assert(lost == 0); -} - - -void MDCache::show_cache() -{ - dout(7) << "show_cache" << dendl; - - for (ceph::unordered_map::iterator it = inode_map.begin(); - it != inode_map.end(); - ++it) { - // unlinked? - if (!it->second->parent) - dout(7) << " unlinked " << *it->second << dendl; - - // dirfrags? 
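- // (walks each open dirfrag of the inode, printing its dentries and any
- // primary-linked inode beneath them, i.e. the cache hierarchy
- // inode -> dirfrag -> dentry -> inode)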
- list dfs; - it->second->get_dirfrags(dfs); - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - dout(7) << " dirfrag " << *dir << dendl; - - for (CDir::map_t::iterator p = dir->items.begin(); - p != dir->items.end(); - ++p) { - CDentry *dn = p->second; - dout(7) << " dentry " << *dn << dendl; - CDentry::linkage_t *dnl = dn->get_linkage(); - if (dnl->is_primary() && dnl->get_inode()) - dout(7) << " inode " << *dnl->get_inode() << dendl; - } - } - } -} - -int MDCache::cache_status(Formatter *f) -{ - f->open_object_section("cache"); - - f->open_object_section("pool"); - mempool::get_pool(mempool::mds_co::id).dump(f); - f->close_section(); - - f->close_section(); - return 0; -} - -int MDCache::dump_cache(std::string const &file_name) -{ - return dump_cache(file_name.c_str(), NULL); -} - -int MDCache::dump_cache(Formatter *f) -{ - return dump_cache(NULL, f); -} - -int MDCache::dump_cache(const string& dump_root, int depth, Formatter *f) -{ - return dump_cache(NULL, f, dump_root, depth); -} - -/** - * Dump the metadata cache, either to a Formatter, if - * provided, else to a plain text file. - */ -int MDCache::dump_cache(const char *fn, Formatter *f, - const string& dump_root, int depth) -{ - int r = 0; - int fd = -1; - - if (f) { - f->open_array_section("inodes"); - } else { - char deffn[200]; - if (!fn) { - snprintf(deffn, sizeof(deffn), "cachedump.%d.mds%d", (int)mds->mdsmap->get_epoch(), int(mds->get_nodeid())); - fn = deffn; - } - - dout(1) << "dump_cache to " << fn << dendl; - - fd = ::open(fn, O_WRONLY|O_CREAT|O_EXCL, 0600); - if (fd < 0) { - derr << "failed to open " << fn << ": " << cpp_strerror(errno) << dendl; - return errno; - } - } - - for (ceph::unordered_map::iterator it = inode_map.begin(); - it != inode_map.end(); - ++it) { - CInode *in = it->second; - - if (!dump_root.empty()) { - string ipath; - if (in->is_root()) - ipath = "/"; - else - in->make_path_string(ipath); - - if (dump_root.length() > ipath.length() || - !equal(dump_root.begin(), dump_root.end(), ipath.begin())) - continue; - - if (depth >= 0 && - count(ipath.begin() + dump_root.length(), ipath.end(), '/') > depth) - continue; - } - - if (f) { - f->open_object_section("inode"); - in->dump(f); - } else { - ostringstream ss; - ss << *in << std::endl; - std::string s = ss.str(); - r = safe_write(fd, s.c_str(), s.length()); - if (r < 0) { - goto out; - } - } - - list dfs; - in->get_dirfrags(dfs); - if (f) { - f->open_array_section("dirfrags"); - } - for (list::iterator p = dfs.begin(); p != dfs.end(); ++p) { - CDir *dir = *p; - if (f) { - f->open_object_section("dir"); - dir->dump(f); - } else { - ostringstream tt; - tt << " " << *dir << std::endl; - string t = tt.str(); - r = safe_write(fd, t.c_str(), t.length()); - if (r < 0) { - goto out; - } - } - - if (f) { - f->open_array_section("dentries"); - } - for (CDir::map_t::iterator q = dir->items.begin(); - q != dir->items.end(); - ++q) { - CDentry *dn = q->second; - if (f) { - f->open_object_section("dentry"); - dn->dump(f); - f->close_section(); - } else { - ostringstream uu; - uu << " " << *dn << std::endl; - string u = uu.str(); - r = safe_write(fd, u.c_str(), u.length()); - if (r < 0) { - goto out; - } - } - } - if (f) { - f->close_section(); //dentries - } - dir->check_rstats(); - if (f) { - f->close_section(); //dir - } - } - if (f) { - f->close_section(); // dirfrags - } - - if (f) { - f->close_section(); // inode - } - } - - out: - if (f) { - f->close_section(); // inodes - } else { - ::close(fd); - } - return r; -} - - - 
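-
-// A minimal usage sketch for the Formatter-based dump above, assuming a
-// JSONFormatter (common/Formatter.h) and an MDCache *mdcache in hand;
-// the variable names are illustrative only:
-//
-// JSONFormatter f(true); // pretty-printed output
-// int r = mdcache->dump_cache(&f); // fills the "inodes" array
-// if (r == 0)
-// f.flush(std::cout); // emit the JSON to the caller
-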
-C_MDS_RetryRequest::C_MDS_RetryRequest(MDCache *c, MDRequestRef& r) - : MDSInternalContext(c->mds), cache(c), mdr(r) -{} - -void C_MDS_RetryRequest::finish(int r) -{ - mdr->retry++; - cache->dispatch_request(mdr); -} - - -class C_MDS_EnqueueScrub : public Context -{ - Formatter *formatter; - Context *on_finish; -public: - ScrubHeaderRef header; - C_MDS_EnqueueScrub(Formatter *f, Context *fin) : - formatter(f), on_finish(fin), header(nullptr) {} - - Context *take_finisher() { - Context *fin = on_finish; - on_finish = NULL; - return fin; - } - - void finish(int r) override { - if (r < 0) { // we failed the lookup or something; dump ourselves - formatter->open_object_section("results"); - formatter->dump_int("return_code", r); - formatter->close_section(); // results - } - if (on_finish) - on_finish->complete(r); - } -}; - -void MDCache::enqueue_scrub( - const string& path, - const std::string &tag, - bool force, bool recursive, bool repair, - Formatter *f, Context *fin) -{ - dout(10) << __func__ << path << dendl; - MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_ENQUEUE_SCRUB); - filepath fp(path.c_str()); - mdr->set_filepath(fp); - - C_MDS_EnqueueScrub *cs = new C_MDS_EnqueueScrub(f, fin); - cs->header = std::make_shared( - tag, force, recursive, repair, f); - - mdr->internal_op_finish = cs; - enqueue_scrub_work(mdr); -} - -void MDCache::enqueue_scrub_work(MDRequestRef& mdr) -{ - set rdlocks, wrlocks, xlocks; - CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true); - if (NULL == in) - return; - - // TODO: Remove this restriction - assert(in->is_auth()); - - bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks); - if (!locked) - return; - - C_MDS_EnqueueScrub *cs = static_cast(mdr->internal_op_finish); - ScrubHeaderRef &header = cs->header; - - // Cannot scrub same dentry twice at same time - if (in->scrub_infop && in->scrub_infop->scrub_in_progress) { - mds->server->respond_to_request(mdr, -EBUSY); - return; - } else { - in->scrub_info(); - } - - header->set_origin(in); - - // only set completion context for non-recursive scrub, because we don't - // want to block asok caller on long running scrub - if (!header->get_recursive()) { - Context *fin = cs->take_finisher(); - mds->scrubstack->enqueue_inode_top(in, header, - new MDSInternalContextWrapper(mds, fin)); - } else - mds->scrubstack->enqueue_inode_bottom(in, header, NULL); - - mds->server->respond_to_request(mdr, 0); - return; -} - -struct C_MDC_RepairDirfragStats : public MDCacheLogContext { - MDRequestRef mdr; - C_MDC_RepairDirfragStats(MDCache *c, MDRequestRef& m) : - MDCacheLogContext(c), mdr(m) {} - void finish(int r) override { - mdr->apply(); - get_mds()->server->respond_to_request(mdr, r); - } -}; - -void MDCache::repair_dirfrag_stats(CDir *dir) -{ - MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_FRAGSTATS); - mdr->pin(dir); - mdr->internal_op_private = dir; - mdr->internal_op_finish = new C_MDSInternalNoop; - repair_dirfrag_stats_work(mdr); -} - -void MDCache::repair_dirfrag_stats_work(MDRequestRef& mdr) -{ - CDir *dir = static_cast(mdr->internal_op_private); - dout(10) << __func__ << " " << *dir << dendl; - - if (!dir->is_auth()) { - mds->server->respond_to_request(mdr, -ESTALE); - return; - } - - if (!mdr->is_auth_pinned(dir) && !dir->can_auth_pin()) { - dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(this, mdr)); - - mds->locker->drop_locks(mdr.get()); - mdr->drop_local_auth_pins(); - if (!mdr->remote_auth_pins.empty()) - mds->locker->notify_freeze_waiter(dir); 
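- // we failed to auth_pin a freezing/frozen dir: a retry is already
- // queued on WAIT_UNFREEZE, and local locks/pins are dropped so the
- // freeze can make progress; notify_freeze_waiter() pokes the freezer
- // in case our remote auth_pins are what it is blocked on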
- return;
- }
-
- mdr->auth_pin(dir);
-
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- CInode *diri = dir->inode;
- rdlocks.insert(&diri->dirfragtreelock);
- wrlocks.insert(&diri->nestlock);
- wrlocks.insert(&diri->filelock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
- return;
-
- if (!dir->is_complete()) {
- dir->fetch(new C_MDS_RetryRequest(this, mdr));
- return;
- }
-
- frag_info_t frag_info;
- nest_info_t nest_info;
- for (CDir::map_t::iterator it = dir->begin(); it != dir->end(); ++it) {
- CDentry *dn = it->second;
- if (dn->last != CEPH_NOSNAP)
- continue;
- CDentry::linkage_t *dnl = dn->get_projected_linkage();
- if (dnl->is_primary()) {
- CInode *in = dnl->get_inode();
- nest_info.add(in->get_projected_inode()->accounted_rstat);
- if (in->is_dir())
- frag_info.nsubdirs++;
- else
- frag_info.nfiles++;
- } else if (dnl->is_remote())
- frag_info.nfiles++;
- }
-
- fnode_t *pf = dir->get_projected_fnode();
- bool good_fragstat = frag_info.same_sums(pf->fragstat);
- bool good_rstat = nest_info.same_sums(pf->rstat);
- if (good_fragstat && good_rstat) {
- dout(10) << __func__ << " no corruption found" << dendl;
- mds->server->respond_to_request(mdr, 0);
- return;
- }
-
- pf = dir->project_fnode();
- pf->version = dir->pre_dirty();
- mdr->add_projected_fnode(dir);
-
- mdr->ls = mds->mdlog->get_current_segment();
- EUpdate *le = new EUpdate(mds->mdlog, "repair_dirfrag");
- mds->mdlog->start_entry(le);
-
- if (!good_fragstat) {
- if (pf->fragstat.mtime > frag_info.mtime)
- frag_info.mtime = pf->fragstat.mtime;
- if (pf->fragstat.change_attr > frag_info.change_attr)
- frag_info.change_attr = pf->fragstat.change_attr;
- pf->fragstat = frag_info;
- mds->locker->mark_updated_scatterlock(&diri->filelock);
- mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
- mdr->add_updated_lock(&diri->filelock);
- }
-
- if (!good_rstat) {
- if (pf->rstat.rctime > nest_info.rctime)
- nest_info.rctime = pf->rstat.rctime;
- pf->rstat = nest_info;
- mds->locker->mark_updated_scatterlock(&diri->nestlock);
- mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
- mdr->add_updated_lock(&diri->nestlock);
- }
-
- le->metablob.add_dir_context(dir);
- le->metablob.add_dir(dir, true);
-
- mds->mdlog->submit_entry(le, new C_MDC_RepairDirfragStats(this, mdr));
-}
-
-void MDCache::repair_inode_stats(CInode *diri)
-{
- MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_REPAIR_INODESTATS);
- mdr->pin(diri);
- mdr->internal_op_private = diri;
- mdr->internal_op_finish = new C_MDSInternalNoop;
- repair_inode_stats_work(mdr);
-}
-
-void MDCache::repair_inode_stats_work(MDRequestRef& mdr)
-{
- CInode *diri = static_cast<CInode*>(mdr->internal_op_private);
- dout(10) << __func__ << " " << *diri << dendl;
-
- if (!diri->is_auth()) {
- mds->server->respond_to_request(mdr, -ESTALE);
- return;
- }
- if (!diri->is_dir()) {
- mds->server->respond_to_request(mdr, -ENOTDIR);
- return;
- }
-
- set<SimpleLock*> rdlocks, wrlocks, xlocks;
- std::list<frag_t> frags;
-
- if (mdr->ls) // already marked filelock/nestlock dirty ?
- goto do_rdlocks;
-
- rdlocks.insert(&diri->dirfragtreelock);
- wrlocks.insert(&diri->nestlock);
- wrlocks.insert(&diri->filelock);
- if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
- return;
-
- // Fetch all dirfrags and mark filelock/nestlock dirty. This will trigger
- // the scatter-gather process, which will fix any fragstat/rstat errors.
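- // The repair is two-phased: first every leaf dirfrag is opened (and
- // fetched if needed) while both scatterlocks are marked dirty in the
- // current log segment; then, at do_rdlocks below, re-taking the same
- // locks as rdlocks forces the gather half, folding each frag's
- // accounted_fragstat/accounted_rstat back into the inode's
- // dirstat/rstat before the final comparison.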
-  diri->dirfragtree.get_leaves(frags);
-  for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
-    CDir *dir = diri->get_dirfrag(*p);
-    if (!dir) {
-      assert(mdr->is_auth_pinned(diri));
-      dir = diri->get_or_open_dirfrag(this, *p);
-    }
-    if (dir->get_version() == 0) {
-      assert(dir->is_auth());
-      dir->fetch(new C_MDS_RetryRequest(this, mdr));
-      return;
-    }
-  }
-
-  diri->state_set(CInode::STATE_REPAIRSTATS);
-  mdr->ls = mds->mdlog->get_current_segment();
-  mds->locker->mark_updated_scatterlock(&diri->filelock);
-  mdr->ls->dirty_dirfrag_dir.push_back(&diri->item_dirty_dirfrag_dir);
-  mds->locker->mark_updated_scatterlock(&diri->nestlock);
-  mdr->ls->dirty_dirfrag_nest.push_back(&diri->item_dirty_dirfrag_nest);
-
-  mds->locker->drop_locks(mdr.get());
-
-do_rdlocks:
-  // force the scatter-gather process
-  rdlocks.insert(&diri->dirfragtreelock);
-  rdlocks.insert(&diri->nestlock);
-  rdlocks.insert(&diri->filelock);
-  wrlocks.clear();
-  if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
-    return;
-
-  diri->state_clear(CInode::STATE_REPAIRSTATS);
-
-  frag_info_t dir_info;
-  nest_info_t nest_info;
-  nest_info.rsubdirs++; // it gets one to account for self
-
-  diri->dirfragtree.get_leaves(frags);
-  for (list<frag_t>::iterator p = frags.begin(); p != frags.end(); ++p) {
-    CDir *dir = diri->get_dirfrag(*p);
-    assert(dir);
-    assert(dir->get_version() > 0);
-    dir_info.add(dir->fnode.accounted_fragstat);
-    nest_info.add(dir->fnode.accounted_rstat);
-  }
-
-  if (!dir_info.same_sums(diri->inode.dirstat) ||
-      !nest_info.same_sums(diri->inode.rstat)) {
-    dout(10) << __func__ << " failed to fix fragstat/rstat on "
-             << *diri << dendl;
-  }
-
-  mds->server->respond_to_request(mdr, 0);
-}
-
-void MDCache::flush_dentry(const string& path, Context *fin)
-{
-  if (is_readonly()) {
-    dout(10) << __func__ << ": read-only FS" << dendl;
-    fin->complete(-EROFS);
-    return;
-  }
-  dout(10) << "flush_dentry " << path << dendl;
-  MDRequestRef mdr = request_start_internal(CEPH_MDS_OP_FLUSH);
-  filepath fp(path.c_str());
-  mdr->set_filepath(fp);
-  mdr->internal_op_finish = fin;
-  flush_dentry_work(mdr);
-}
-
-class C_FinishIOMDR : public MDSInternalContextBase {
-protected:
-  MDSRank *mds;
-  MDRequestRef mdr;
-  MDSRank *get_mds() override { return mds; }
-public:
-  C_FinishIOMDR(MDSRank *mds_, MDRequestRef& mdr_) : mds(mds_), mdr(mdr_) {}
-  void finish(int r) override { mds->server->respond_to_request(mdr, r); }
-};
-
-void MDCache::flush_dentry_work(MDRequestRef& mdr)
-{
-  set<SimpleLock*> rdlocks, wrlocks, xlocks;
-  CInode *in = mds->server->rdlock_path_pin_ref(mdr, 0, rdlocks, true);
-  if (NULL == in)
-    return;
-
-  // TODO: Is this necessary? Fix it if so
-  assert(in->is_auth());
-  bool locked = mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks);
-  if (!locked)
-    return;
-  in->flush(new C_FinishIOMDR(mds, mdr));
-}
-
-
-/**
- * Initialize performance counters with global perfcounter
- * collection.
- */
-void MDCache::register_perfcounters()
-{
-  PerfCountersBuilder pcb(g_ceph_context,
-                          "mds_cache", l_mdc_first, l_mdc_last);
-
-  /* Stray/purge statistics */
-  pcb.add_u64(l_mdc_num_strays, "num_strays",
-              "Stray dentries", "stry", PerfCountersBuilder::PRIO_INTERESTING);
-  pcb.add_u64(l_mdc_num_strays_delayed, "num_strays_delayed", "Stray dentries delayed");
-  pcb.add_u64(l_mdc_num_strays_enqueuing, "num_strays_enqueuing", "Stray dentries enqueuing for purge");
-
-  pcb.add_u64_counter(l_mdc_strays_created, "strays_created", "Stray dentries created");
-  pcb.add_u64_counter(l_mdc_strays_enqueued, "strays_enqueued",
-                      "Stray dentries enqueued for purge");
-  pcb.add_u64_counter(l_mdc_strays_reintegrated, "strays_reintegrated", "Stray dentries reintegrated");
-  pcb.add_u64_counter(l_mdc_strays_migrated, "strays_migrated", "Stray dentries migrated");
-
-
-  /* Recovery queue statistics */
-  pcb.add_u64(l_mdc_num_recovering_processing, "num_recovering_processing", "Files currently being recovered");
-  pcb.add_u64(l_mdc_num_recovering_enqueued, "num_recovering_enqueued",
-              "Files waiting for recovery", "recy", PerfCountersBuilder::PRIO_INTERESTING);
-  pcb.add_u64(l_mdc_num_recovering_prioritized, "num_recovering_prioritized", "Files waiting for recovery with elevated priority");
-  pcb.add_u64_counter(l_mdc_recovery_started, "recovery_started", "File recoveries started");
-  pcb.add_u64_counter(l_mdc_recovery_completed, "recovery_completed",
-                      "File recoveries completed", "recd", PerfCountersBuilder::PRIO_INTERESTING);
-
-  pcb.add_u64_counter(l_mdss_ireq_enqueue_scrub, "ireq_enqueue_scrub",
-                      "Internal Request type enqueue scrub");
-  pcb.add_u64_counter(l_mdss_ireq_exportdir, "ireq_exportdir",
-                      "Internal Request type export dir");
-  pcb.add_u64_counter(l_mdss_ireq_flush, "ireq_flush",
-                      "Internal Request type flush");
-  pcb.add_u64_counter(l_mdss_ireq_fragmentdir, "ireq_fragmentdir",
-                      "Internal Request type fragmentdir");
-  pcb.add_u64_counter(l_mdss_ireq_fragstats, "ireq_fragstats",
-                      "Internal Request type frag stats");
-  pcb.add_u64_counter(l_mdss_ireq_inodestats, "ireq_inodestats",
-                      "Internal Request type inode stats");
-
-  logger.reset(pcb.create_perf_counters());
-  g_ceph_context->get_perfcounters_collection()->add(logger.get());
-  recovery_queue.set_logger(logger.get());
-  stray_manager.set_logger(logger.get());
-}
-
-void MDCache::activate_stray_manager()
-{
-  if (open) {
-    stray_manager.activate();
-  } else {
-    wait_for_open(
-      new MDSInternalContextWrapper(mds,
-        new FunctionContext([this](int r){
-          stray_manager.activate();
-        })
-      )
-    );
-  }
-}
-
-/**
- * Call this when putting references to an inode/dentry or
- * when attempting to trim it.
- *
- * If this inode is no longer linked by anyone, and this MDS
- * rank holds the primary dentry, and that dentry is in a stray
- * directory, then give up the dentry to the StrayManager, never
- * to be seen again by MDCache.
- *
- * @param delay if true, then purgeable inodes are stashed until
- *              the next trim(), rather than being purged right
- *              away.
- */
-void MDCache::maybe_eval_stray(CInode *in, bool delay) {
-  if (in->inode.nlink > 0 || in->is_base() || is_readonly() ||
-      mds->get_state() <= MDSMap::STATE_REJOIN)
-    return;
-
-  CDentry *dn = in->get_projected_parent_dn();
-
-  if (dn->state_test(CDentry::STATE_PURGING)) {
-    /* We have already entered the purging process; no need
-     * to re-evaluate it.
-     */
-    return;
-  }
-
-  if (dn->get_projected_linkage()->is_primary() &&
-      dn->get_dir()->get_inode()->is_stray()) {
-    stray_manager.eval_stray(dn, delay);
-  }
-}
-
-void MDCache::clear_dirty_bits_for_stray(CInode* diri) {
-  dout(10) << __func__ << " " << *diri << dendl;
-  assert(diri->get_projected_parent_dir()->inode->is_stray());
-  list<CDir*> ls;
-  diri->get_dirfrags(ls);
-  for (auto p : ls) {
-    if (p->is_auth() && !(p->is_frozen() || p->is_freezing()))
-      p->try_remove_dentries_for_stray();
-  }
-  if (!diri->snaprealm) {
-    if (diri->is_auth())
-      diri->clear_dirty_rstat();
-    diri->clear_scatter_dirty();
-  }
-}
-
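All of the entry points above share one internal-op pattern: build an
MDRequest with request_start_internal(), stash the caller's completion in
internal_op_finish, and let the corresponding *_work() function re-dispatch
itself through C_MDS_RetryRequest until locks, auth pins and dirfrag fetches
are all satisfied. A minimal sketch of that shape follows; MY_OP, start_op
and do_op_work are illustrative names, not part of MDCache:

  // Sketch only: the retry-until-ready pattern used by enqueue_scrub,
  // repair_dirfrag_stats, repair_inode_stats and flush_dentry above.
  void start_op(Context *fin)
  {
    MDRequestRef mdr = request_start_internal(MY_OP);  // hypothetical opcode
    mdr->internal_op_finish = fin;  // completed by respond_to_request()
    do_op_work(mdr);
  }

  void do_op_work(MDRequestRef& mdr)
  {
    set<SimpleLock*> rdlocks, wrlocks, xlocks;
    // acquire_locks() queues a retry and returns false if any lock is
    // unavailable; the request re-enters this function when it is.
    if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks))
      return;
    // ... operation-specific work, possibly with more retry points ...
    mds->server->respond_to_request(mdr, 0);  // fires internal_op_finish
  }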