1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
24 #include "MDBalancer.h"
29 #include "include/filepath.h"
31 #include "events/EExport.h"
32 #include "events/EImportStart.h"
33 #include "events/EImportFinish.h"
34 #include "events/ESessions.h"
36 #include "msg/Messenger.h"
38 #include "messages/MClientCaps.h"
40 #include "messages/MExportDirDiscover.h"
41 #include "messages/MExportDirDiscoverAck.h"
42 #include "messages/MExportDirCancel.h"
43 #include "messages/MExportDirPrep.h"
44 #include "messages/MExportDirPrepAck.h"
45 #include "messages/MExportDir.h"
46 #include "messages/MExportDirAck.h"
47 #include "messages/MExportDirNotify.h"
48 #include "messages/MExportDirNotifyAck.h"
49 #include "messages/MExportDirFinish.h"
51 #include "messages/MExportCaps.h"
52 #include "messages/MExportCapsAck.h"
53 #include "messages/MGatherCaps.h"
57 * this is what the dir->dir_auth values look like
62 * me, me me - still me, but preparing for export
63 * me, them me - send MExportDir (peer is preparing)
64 * them, me me - journaled EExport
69 * me, them me - journaled EImportStart
73 * - auth bit is set if i am listed as first _or_ second dir_auth.
76 #include "common/config.h"
79 #define dout_context g_ceph_context
80 #define dout_subsys ceph_subsys_mds
82 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".migrator "
// Base context for Migrator callbacks: holds the owning Migrator so finish()
// bodies can call back into it; get_mds() routes through the Migrator.
// NOTE(review): class body is elided in this listing (member list / braces missing).
85 class MigratorContext : public MDSInternalContextBase {
88 MDSRank *get_mds() override {
92 explicit MigratorContext(Migrator *mig_) : mig(mig_) {
// Same as MigratorContext, but for callbacks completed by MDLog flushes
// (derives from MDSLogContextBase instead of MDSInternalContextBase).
// NOTE(review): class body is elided in this listing.
97 class MigratorLogContext : public MDSLogContextBase {
100 MDSRank *get_mds() override {
104 explicit MigratorLogContext(Migrator *mig_) : mig(mig_) {
// Message entry point: fans incoming MDS-to-MDS migration messages out to the
// per-message-type handlers. The handlers (not this function) own releasing m.
// NOTE(review): the `break;` after each case and the closing braces appear to
// be elided in this listing — confirm against the full file.
109 /* This function DOES put the passed message before returning*/
110 void Migrator::dispatch(Message *m)
112 switch (m->get_type()) {
// importer-side messages (we are the target of an export)
114 case MSG_MDS_EXPORTDIRDISCOVER:
115 handle_export_discover(static_cast<MExportDirDiscover*>(m));
117 case MSG_MDS_EXPORTDIRPREP:
118 handle_export_prep(static_cast<MExportDirPrep*>(m));
120 case MSG_MDS_EXPORTDIR:
121 handle_export_dir(static_cast<MExportDir*>(m));
123 case MSG_MDS_EXPORTDIRFINISH:
124 handle_export_finish(static_cast<MExportDirFinish*>(m));
126 case MSG_MDS_EXPORTDIRCANCEL:
127 handle_export_cancel(static_cast<MExportDirCancel*>(m));
// exporter-side acks (we initiated the export)
131 case MSG_MDS_EXPORTDIRDISCOVERACK:
132 handle_export_discover_ack(static_cast<MExportDirDiscoverAck*>(m));
134 case MSG_MDS_EXPORTDIRPREPACK:
135 handle_export_prep_ack(static_cast<MExportDirPrepAck*>(m));
137 case MSG_MDS_EXPORTDIRACK:
138 handle_export_ack(static_cast<MExportDirAck*>(m));
140 case MSG_MDS_EXPORTDIRNOTIFYACK:
141 handle_export_notify_ack(static_cast<MExportDirNotifyAck*>(m));
144 // export 3rd party (dir_auth adjustments)
145 case MSG_MDS_EXPORTDIRNOTIFY:
146 handle_export_notify(static_cast<MExportDirNotify*>(m));
// caps-only migration (no subtree auth change)
150 case MSG_MDS_EXPORTCAPS:
151 handle_export_caps(static_cast<MExportCaps*>(m));
153 case MSG_MDS_GATHERCAPS:
154 handle_gather_caps(static_cast<MGatherCaps*>(m));
// unknown type is a programming error: log and abort
158 derr << "migrator unknown message " << m->get_type() << dendl;
159 assert(0 == "migrator unknown message");
// Deferred callback: re-attempt exporting an (expected-empty) imported dir
// back to its parent's auth once whatever we were waiting on completes.
164 class C_MDC_EmptyImport : public MigratorContext {
167 C_MDC_EmptyImport(Migrator *m, CDir *d) : MigratorContext(m), dir(d) {}
168 void finish(int r) override {
169 mig->export_empty_import(dir);
// If a subtree we hold is empty and its inode is owned by another rank, hand
// the dirfrag back to that rank. Each guard below bails out (early return,
// elided in this listing) when the dir is not a candidate.
174 void Migrator::export_empty_import(CDir *dir)
176 dout(7) << "export_empty_import " << *dir << dendl;
177 assert(dir->is_subtree_root());
// guard: if we are auth for the inode there is no one to return it to
179 if (dir->inode->is_auth()) {
180 dout(7) << " inode is auth" << dendl;
183 if (!dir->is_auth()) {
184 dout(7) << " not auth" << dendl;
// guard: can't start an export while a freeze is pending/held
187 if (dir->is_freezing() || dir->is_frozen()) {
188 dout(7) << " freezing or frozen" << dendl;
191 if (dir->get_num_head_items() > 0) {
192 dout(7) << " not actually empty" << dendl;
195 if (dir->inode->is_root()) {
196 dout(7) << " root" << dendl;
// destination = whoever is auth for the containing inode
200 mds_rank_t dest = dir->inode->authority().first;
201 //if (mds->is_shutting_down()) dest = 0; // this is more efficient.
203 dout(7) << " really empty, exporting to " << dest << dendl;
204 assert (dest != mds->get_nodeid());
206 dout(7) << "exporting to mds." << dest
207 << " empty import " << *dir << dendl;
208 export_dir( dir, dest );
// Periodic scan: cancel exports stuck in DISCOVERING/FREEZING whose auth-pin
// count has not changed for mds_freeze_tree_timeout and that have remote
// waiters (or a freezing parent) — breaking the cross-MDS freeze/authpin
// deadlocks described in the comment below.
211 void Migrator::find_stale_export_freeze()
213 utime_t now = ceph_clock_now();
214 utime_t cutoff = now;
215 cutoff -= g_conf->mds_freeze_tree_timeout;
219 * We could have situations like:
221 * - mds.0 authpins an item in subtree A
222 * - mds.0 sends request to mds.1 to authpin an item in subtree B
223 * - mds.0 freezes subtree A
224 * - mds.1 authpins an item in subtree B
225 * - mds.1 sends request to mds.0 to authpin an item in subtree A
226 * - mds.1 freezes subtree B
227 * - mds.1 receives the remote authpin request from mds.0
228 * (wait because subtree B is freezing)
229 * - mds.0 receives the remote authpin request from mds.1
230 * (wait because subtree A is freezing)
233 * - client request authpins items in subtree B
235 * - import subtree A which is parent of subtree B
236 * (authpins parent inode of subtree B, see CDir::set_dir_auth())
238 * - client request tries authpinning items in subtree A
239 * (wait because subtree A is freezing)
241 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
242 p != export_state.end(); ) {
243 CDir* dir = p->first;
244 export_state_t& stat = p->second;
// only the pre-frozen states can deadlock on auth pins
246 if (stat.state != EXPORT_DISCOVERING && stat.state != EXPORT_FREEZING)
// pin count changed since last scan: still making progress, reset the clock
248 if (stat.last_cum_auth_pins != dir->get_cum_auth_pins()) {
249 stat.last_cum_auth_pins = dir->get_cum_auth_pins();
250 stat.last_cum_auth_pins_change = now;
// not stale yet
253 if (stat.last_cum_auth_pins_change >= cutoff)
// stale AND someone is blocked on us: give up on this export
255 if (stat.num_remote_waiters > 0 ||
256 (!dir->inode->is_root() && dir->get_parent_dir()->is_freezing())) {
257 export_try_cancel(dir);
// Abort an in-progress export, undoing exactly what the current export state
// has done so far (auth pins, freeze, bound pins, peer notification). States
// at/after LOGGINGFINISH are past the point of no return and are left alone.
// notify_peer controls whether MExportDirCancel is sent to the importer.
// NOTE(review): this is a switch with deliberate fall-throughs (WARNING falls
// into PREPPING); the `switch(state)`, `break;`s and braces are elided here.
262 void Migrator::export_try_cancel(CDir *dir, bool notify_peer)
264 dout(10) << "export_try_cancel " << *dir << dendl;
266 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
267 assert(it != export_state.end());
269 int state = it->second.state;
// LOCKING: only an auth_pin was taken so far
272 dout(10) << "export state=locking : dropping locks and removing auth_pin" << dendl;
273 it->second.state = EXPORT_CANCELLED;
274 dir->auth_unpin(this);
276 case EXPORT_DISCOVERING:
277 dout(10) << "export state=discovering : canceling freeze and removing auth_pin" << dendl;
278 it->second.state = EXPORT_CANCELLED;
279 dir->unfreeze_tree(); // cancel the freeze
280 dir->auth_unpin(this);
// tell the peer only if it is reachable in its current state
282 (!mds->is_cluster_degraded() ||
283 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
284 mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
287 case EXPORT_FREEZING:
288 dout(10) << "export state=freezing : canceling freeze" << dendl;
289 it->second.state = EXPORT_CANCELLED;
290 dir->unfreeze_tree(); // cancel the freeze
291 if (dir->is_subtree_root())
292 cache->try_subtree_merge(dir);
294 (!mds->is_cluster_degraded() ||
295 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
296 mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
299 // NOTE: state order reversal, warning comes after prepping
301 dout(10) << "export state=warning : unpinning bounds, unfreezing, notifying" << dendl;
// WARNING must linger as CANCELLING until bystander notify-acks come back
302 it->second.state = EXPORT_CANCELLING;
305 case EXPORT_PREPPING:
306 if (state != EXPORT_WARNING) {
307 dout(10) << "export state=prepping : unpinning bounds, unfreezing" << dendl;
308 it->second.state = EXPORT_CANCELLED;
// drop the EXPORTBOUND pin/state taken on each bound in export_frozen()
314 cache->get_subtree_bounds(dir, bounds);
315 for (set<CDir*>::iterator q = bounds.begin();
319 bd->put(CDir::PIN_EXPORTBOUND);
320 bd->state_clear(CDir::STATE_EXPORTBOUND);
322 if (state == EXPORT_WARNING) {
// tell bystanders the auth change is off, then flush delayed cache expires
324 export_notify_abort(dir, bounds);
325 // process delayed expires
326 cache->process_delayed_expire(dir);
329 dir->unfreeze_tree();
330 cache->try_subtree_merge(dir);
332 (!mds->is_cluster_degraded() ||
333 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer))) // tell them.
334 mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
337 case EXPORT_EXPORTING:
338 dout(10) << "export state=exporting : reversing, and unfreezing" << dendl;
339 it->second.state = EXPORT_CANCELLING;
// past the journal commit: the export effectively succeeded, don't undo it
343 case EXPORT_LOGGINGFINISH:
344 case EXPORT_NOTIFYING:
345 dout(10) << "export state=loggingfinish|notifying : ignoring dest failure, we were successful." << dendl;
346 // leave export_state, don't clean up now.
348 case EXPORT_CANCELLING:
// common tail: release the mutation's locks, and fully retire the entry
// only once it has reached CANCELLED
356 if (it->second.state == EXPORT_CANCELLING ||
357 it->second.state == EXPORT_CANCELLED) {
359 mut.swap(it->second.mut);
361 if (it->second.state == EXPORT_CANCELLED) {
362 export_state.erase(it);
363 dir->state_clear(CDir::STATE_EXPORTING);
364 // send pending import_maps?
365 cache->maybe_send_pending_resolves();
// early states used an MDRequest; finish it unless a slave is still pending
369 if (state == EXPORT_LOCKING || state == EXPORT_DISCOVERING) {
370 MDRequestRef mdr = static_cast<MDRequestImpl*>(mut.get());
372 if (mdr->more()->waiting_on_slave.empty())
373 mds->mdcache->request_finish(mdr);
375 mds->locker->drop_locks(mut.get());
379 cache->show_subtrees();
// a queue slot just freed up; maybe kick off the next queued export
381 maybe_do_queued_export();
// Final cleanup after a CANCELLING export has collected all its notify acks:
// clear the EXPORTING flag, drop the pin taken by export_notify_abort(), and
// let any deferred resolves go out.
385 void Migrator::export_cancel_finish(CDir *dir)
387 assert(dir->state_test(CDir::STATE_EXPORTING));
388 dir->state_clear(CDir::STATE_EXPORTING);
390 // pinned by Migrator::export_notify_abort()
391 dir->auth_unpin(this);
392 // send pending import_maps? (these need to go out when all exports have finished.)
393 cache->maybe_send_pending_resolves();
396 // ==========================================================
397 // mds failure handling
// React to another rank failing or stopping: cancel exports headed to it (or
// that have not committed yet), unwind imports sourced from it according to
// how far they progressed, and fake acks we will never receive from it.
399 void Migrator::handle_mds_failure_or_stop(mds_rank_t who)
401 dout(5) << "handle_mds_failure_or_stop mds." << who << dendl;
405 // first add an extra auth_pin on any freezes, so that canceling a
406 // nested freeze doesn't complete one further up the hierarchy and
407 // confuse the shit out of us. we'll remove it after canceling the
408 // freeze. this way no freeze completions run before we want them
410 list<CDir*> pinned_dirs;
411 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
412 p != export_state.end();
414 if (p->second.state == EXPORT_FREEZING) {
415 CDir *dir = p->first;
416 dout(10) << "adding temp auth_pin on freezing " << *dir << dendl;
418 pinned_dirs.push_back(dir);
// pass 1: exports. iterate with a saved `next` since cancel may erase p.
422 map<CDir*,export_state_t>::iterator p = export_state.begin();
423 while (p != export_state.end()) {
424 map<CDir*,export_state_t>::iterator next = p;
426 CDir *dir = p->first;
429 // - that are going to the failed node
430 // - that aren't frozen yet (to avoid auth_pin deadlock)
431 // - they haven't prepped yet (they may need to discover bounds to do that)
432 if ((p->second.peer == who &&
433 p->second.state != EXPORT_CANCELLING) ||
434 p->second.state == EXPORT_LOCKING ||
435 p->second.state == EXPORT_DISCOVERING ||
436 p->second.state == EXPORT_FREEZING ||
437 p->second.state == EXPORT_PREPPING) {
438 // the guy i'm exporting to failed, or we're just freezing.
439 dout(10) << "cleaning up export state (" << p->second.state << ")"
440 << get_export_statename(p->second.state) << " of " << *dir << dendl;
441 export_try_cancel(dir);
442 } else if (p->second.peer != who) {
// peer is fine, but `who` was a bystander: pretend its acks arrived
444 if (p->second.warning_ack_waiting.erase(who)) {
445 if (p->second.state == EXPORT_WARNING) {
446 p->second.notify_ack_waiting.erase(who); // they won't get a notify either.
447 // exporter waiting for warning acks, let's fake theirs.
448 dout(10) << "faking export_warning_ack from mds." << who
449 << " on " << *dir << " to mds." << p->second.peer
451 if (p->second.warning_ack_waiting.empty())
455 if (p->second.notify_ack_waiting.erase(who)) {
456 // exporter is waiting for notify acks, fake it
457 dout(10) << "faking export_notify_ack from mds." << who
458 << " on " << *dir << " to mds." << p->second.peer
460 if (p->second.state == EXPORT_NOTIFYING) {
461 if (p->second.notify_ack_waiting.empty())
463 } else if (p->second.state == EXPORT_CANCELLING) {
464 if (p->second.notify_ack_waiting.empty()) {
465 export_state.erase(p);
466 export_cancel_finish(dir);
// pass 2: imports. unwind each import from `who` per its current state.
478 map<dirfrag_t,import_state_t>::iterator q = import_state.begin();
479 while (q != import_state.end()) {
480 map<dirfrag_t,import_state_t>::iterator next = q;
482 dirfrag_t df = q->first;
483 CInode *diri = mds->mdcache->get_inode(df.ino);
484 CDir *dir = mds->mdcache->get_dirfrag(df);
486 if (q->second.peer == who) {
488 dout(10) << "cleaning up import state (" << q->second.state << ")"
489 << get_import_statename(q->second.state) << " of " << *dir << dendl;
491 dout(10) << "cleaning up import state (" << q->second.state << ")"
492 << get_import_statename(q->second.state) << " of " << df << dendl;
494 switch (q->second.state) {
495 case IMPORT_DISCOVERING:
496 dout(10) << "import state=discovering : clearing state" << dendl;
497 import_reverse_discovering(df);
500 case IMPORT_DISCOVERED:
502 dout(10) << "import state=discovered : unpinning inode " << *diri << dendl;
503 import_reverse_discovered(df, diri);
506 case IMPORT_PREPPING:
508 dout(10) << "import state=prepping : unpinning base+bounds " << *dir << dendl;
509 import_reverse_prepping(dir);
// PREPPED: we already took auth; give it back and wait for bystander acks
514 dout(10) << "import state=prepped : unpinning base+bounds, unfreezing " << *dir << dendl;
517 cache->get_subtree_bounds(dir, bounds);
518 import_remove_pins(dir, bounds);
520 // adjust auth back to the exporter
521 cache->adjust_subtree_auth(dir, q->second.peer);
523 // notify bystanders ; wait in aborting state
524 import_state[df].state = IMPORT_ABORTING;
525 import_notify_abort(dir, bounds);
526 assert(g_conf->mds_kill_import_at != 10);
530 case IMPORT_LOGGINGSTART:
532 dout(10) << "import state=loggingstart : reversing import on " << *dir << dendl;
538 // hrm. make this an ambiguous import, and wait for exporter recovery to disambiguate
539 dout(10) << "import state=acking : noting ambiguous import " << *dir << dendl;
542 cache->get_subtree_bounds(dir, bounds);
543 cache->add_ambiguous_import(dir, bounds);
547 case IMPORT_FINISHING:
549 dout(10) << "import state=finishing : finishing import on " << *dir << dendl;
550 import_finish(dir, true);
553 case IMPORT_ABORTING:
555 dout(10) << "import state=aborting : ignoring repeat failure " << *dir << dendl;
// `who` was a bystander of this import: fake its notify ack too
559 auto bystanders_entry = q->second.bystanders.find(who);
560 if (bystanders_entry != q->second.bystanders.end()) {
561 q->second.bystanders.erase(bystanders_entry);
562 if (q->second.state == IMPORT_ABORTING) {
564 dout(10) << "faking export_notify_ack from mds." << who
565 << " on aborting import " << *dir << " from mds." << q->second.peer
567 if (q->second.bystanders.empty())
568 import_reverse_unfreeze(dir);
// finally drop the temporary auth pins taken at the top
577 while (!pinned_dirs.empty()) {
578 CDir *dir = pinned_dirs.front();
579 dout(10) << "removing temp auth_pin on " << *dir << dendl;
580 dir->auth_unpin(this);
581 pinned_dirs.pop_front();
// Debug dump of all in-flight imports; prints the dirfrag alone when the
// CDir is not (or no longer) in cache.
587 void Migrator::show_importing()
589 dout(10) << "show_importing" << dendl;
590 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
591 p != import_state.end();
593 CDir *dir = mds->mdcache->get_dirfrag(p->first);
595 dout(10) << " importing from " << p->second.peer
596 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
597 << " " << p->first << " " << *dir << dendl;
599 dout(10) << " importing from " << p->second.peer
600 << ": (" << p->second.state << ") " << get_import_statename(p->second.state)
601 << " " << p->first << dendl;
// Debug dump of all in-flight exports (peer, state, dirfrag, dir).
606 void Migrator::show_exporting()
608 dout(10) << "show_exporting" << dendl;
609 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
610 p != export_state.end();
612 dout(10) << " exporting to " << p->second.peer
613 << ": (" << p->second.state << ") " << get_export_statename(p->second.state)
614 << " " << p->first->dirfrag() << " " << *p->first << dendl;
// Debug-only consistency check of migration state vs. subtree auth values;
// skipped entirely unless mds debug gathering is enabled at level >= 5.
619 void Migrator::audit()
621 if (!g_conf->subsys.should_gather(ceph_subsys_mds, 5))
// check imports: auth must match the import's progress
626 for (map<dirfrag_t,import_state_t>::iterator p = import_state.begin();
627 p != import_state.end();
629 if (p->second.state == IMPORT_DISCOVERING)
631 if (p->second.state == IMPORT_DISCOVERED) {
632 CInode *in = cache->get_inode(p->first.ino);
636 CDir *dir = cache->get_dirfrag(p->first);
638 if (p->second.state == IMPORT_PREPPING)
640 if (p->second.state == IMPORT_ABORTING) {
// aborting: auth should already have been handed back to the exporter
641 assert(!dir->is_ambiguous_dir_auth());
642 assert(dir->get_dir_auth().first != mds->get_nodeid());
645 assert(dir->is_ambiguous_dir_auth());
646 assert(dir->authority().first == mds->get_nodeid() ||
647 dir->authority().second == mds->get_nodeid());
// check exports: past the early states, auth must be ambiguous and include us
652 for (map<CDir*,export_state_t>::iterator p = export_state.begin();
653 p != export_state.end();
655 CDir *dir = p->first;
656 if (p->second.state == EXPORT_LOCKING ||
657 p->second.state == EXPORT_DISCOVERING ||
658 p->second.state == EXPORT_FREEZING ||
659 p->second.state == EXPORT_CANCELLING)
661 assert(dir->is_ambiguous_dir_auth());
662 assert(dir->authority().first == mds->get_nodeid() ||
663 dir->authority().second == mds->get_nodeid());
666 // ambiguous+me subtrees should be importing|exporting
675 // ==========================================================
// Queue an export instead of starting it immediately; maybe_do_queued_export()
// drains the queue subject to the concurrent-export cap.
678 void Migrator::export_dir_nicely(CDir *dir, mds_rank_t dest)
681 dout(7) << "export_dir_nicely " << *dir << " to " << dest << dendl;
682 export_queue.push_back(pair<dirfrag_t,mds_rank_t>(dir->dirfrag(), dest));
684 maybe_do_queued_export();
// Drain export_queue while fewer than the cap (4 + queued) exports are in
// flight; silently drops entries whose dirfrag vanished or is no longer auth.
687 void Migrator::maybe_do_queued_export()
693 while (!export_queue.empty() &&
694 export_state.size() <= 4) {
695 dirfrag_t df = export_queue.front().first;
696 mds_rank_t dest = export_queue.front().second;
697 export_queue.pop_front();
699 CDir *dir = mds->mdcache->get_dirfrag(df);
// dirfrag may have been merged/trimmed, or auth may have moved, since queuing
701 if (!dir->is_auth()) continue;
703 dout(0) << "nicely exporting to mds." << dest << " " << *dir << dendl;
705 export_dir(dir, dest);
// Fired when the subtree freeze completes; resumes the export via
// export_frozen(), matched to the right attempt by tid.
713 class C_MDC_ExportFreeze : public MigratorContext {
714 CDir *ex; // dir i'm exporting
717 C_MDC_ExportFreeze(Migrator *m, CDir *e, uint64_t t) :
718 MigratorContext(m), ex(e), tid(t) {
721 void finish(int r) override {
723 mig->export_frozen(ex, tid);
// Collect the locks needed to export `dir`: the dentry locks along its path
// to the root, its inode's dirfragtreelock, and the dirfragtreelock of every
// would-be bound's inode (rationale in the comment below).
728 void Migrator::get_export_lock_set(CDir *dir, set<SimpleLock*>& locks)
731 vector<CDentry*> trace;
732 cache->make_trace(trace, dir->inode);
733 for (vector<CDentry*>::iterator it = trace.begin();
736 locks.insert(&(*it)->lock);
738 // prevent scatter gather race
739 locks.insert(&dir->get_inode()->dirfragtreelock);
742 // NOTE: We need to take an rdlock on bounding dirfrags during
743 // migration for a rather irritating reason: when we export the
744 // bound inode, we need to send scatterlock state for the dirfrags
745 // as well, so that the new auth also gets the correct info. If we
746 // race with a refragment, this info is useless, as we can't
747 // redivvy it up. And it's needed for the scatterlocks to work
748 // properly: when the auth is in a sync/lock state it keeps each
749 // dirfrag's portion in the local (auth OR replica) dirfrag.
750 set<CDir*> wouldbe_bounds;
751 cache->get_wouldbe_subtree_bounds(dir, wouldbe_bounds);
752 for (set<CDir*>::iterator p = wouldbe_bounds.begin(); p != wouldbe_bounds.end(); ++p)
753 locks.insert(&(*p)->get_inode()->dirfragtreelock);
// Retry context for dispatch_export_dir(): re-dispatches the same MDRequest,
// carrying `count` so the retry loop can give up after a few MDSMap epochs.
757 class C_M_ExportDirWait : public MigratorContext {
761 C_M_ExportDirWait(Migrator *m, MDRequestRef mdr, int count)
762 : MigratorContext(m), mdr(mdr), count(count) {}
763 void finish(int r) override {
764 mig->dispatch_export_dir(mdr, count);
// Entry point for starting an export. Runs a long series of eligibility
// guards (each with an elided early return in this listing), then records an
// EXPORT_LOCKING export_state entry and kicks off an internal MDRequest whose
// dispatch acquires the needed locks.
769 /** export_dir(dir, dest)
770 * public method to initiate an export.
771 * will fail if the directory is freezing, frozen, unpinnable, or root.
773 void Migrator::export_dir(CDir *dir, mds_rank_t dest)
775 dout(7) << "export_dir " << *dir << " to " << dest << dendl;
776 assert(dir->is_auth());
777 assert(dest != mds->get_nodeid());
// guards: only an active/stopping, writable MDS in a healthy cluster exports
779 if (!(mds->is_active() || mds->is_stopping())) {
780 dout(7) << "i'm not active, no exports for now" << dendl;
783 if (mds->mdcache->is_readonly()) {
784 dout(7) << "read-only FS, no exports for now" << dendl;
787 if (!mds->mdsmap->is_active(dest)) {
788 dout(7) << "dest not active, no exports for now" << dendl;
791 if (mds->is_cluster_degraded()) {
792 dout(7) << "cluster degraded, no exports for now" << dendl;
795 if (dir->inode->is_system()) {
796 dout(7) << "i won't export system dirs (root, mdsdirs, stray, /.ceph, etc.)" << dendl;
// stray subdirs only migrate toward the destination's own mdsdir
801 if (!dir->inode->is_base() && dir->inode->get_projected_parent_dir()->inode->is_stray() &&
802 dir->inode->get_projected_parent_dir()->get_parent_dir()->ino() != MDS_INO_MDSDIR(dest)) {
803 dout(7) << "i won't export anything in stray" << dendl;
807 if (dir->is_frozen() ||
808 dir->is_freezing()) {
809 dout(7) << " can't export, freezing|frozen. wait for other exports to finish first." << dendl;
812 if (dir->state_test(CDir::STATE_EXPORTING)) {
813 dout(7) << "already exporting" << dendl;
817 if (!mds->is_stopping() && !dir->inode->is_exportable(dest)) {
818 dout(7) << "dir is export pinned" << dendl;
822 if (dest == mds->get_nodeid() || !mds->mdsmap->is_up(dest)) {
823 dout(7) << "cannot export: dest " << dest << " is me or is not active" << dendl;
// test-only: mds_thrash_exports randomly pins a nested dirfrag as an aux
// subtree bound so it stays behind, exercising bound handling
827 if (g_conf->mds_thrash_exports) {
828 // create random subtree bound (which will not be exported)
830 for (auto p = dir->begin(); p != dir->end(); ++p) {
832 CDentry::linkage_t *dnl= dn->get_linkage();
833 if (dnl->is_primary()) {
834 CInode *in = dnl->get_inode();
836 in->get_nested_dirfrags(ls);
840 int n = rand() % ls.size();
844 if (!(bd->is_frozen() || bd->is_freezing())) {
845 assert(bd->is_auth());
846 dir->state_set(CDir::STATE_AUXSUBTREE);
847 mds->mdcache->adjust_subtree_auth(dir, mds->get_nodeid());
848 dout(0) << "export_dir: create aux subtree " << *bd << " under " << *dir << dendl;
// count the export attempt against the target for load-balancer stats
853 mds->hit_export_target(ceph_clock_now(), dest, -1);
856 dir->state_set(CDir::STATE_EXPORTING);
// use an internal MDRequest to drive lock acquisition asynchronously
858 MDRequestRef mdr = mds->mdcache->request_start_internal(CEPH_MDS_OP_EXPORTDIR);
859 mdr->more()->export_dir = dir;
861 assert(export_state.count(dir) == 0);
862 export_state_t& stat = export_state[dir];
863 stat.state = EXPORT_LOCKING;
// tid ties all later acks/callbacks back to this particular attempt
865 stat.tid = mdr->reqid.tid;
868 return mds->mdcache->dispatch_request(mdr);
// MDRequest dispatch for an EXPORT_LOCKING export: verify the export is still
// live, wait (bounded by `count` retries) for the dest to become an export
// target, take the lock set, then send MExportDirDiscover and start freezing.
871 void Migrator::dispatch_export_dir(MDRequestRef& mdr, int count)
873 dout(7) << "dispatch_export_dir " << *mdr << dendl;
875 CDir *dir = mdr->more()->export_dir;
876 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
877 if (it == export_state.end() || it->second.tid != mdr->reqid.tid) {
878 // export must have aborted.
879 dout(7) << "export must have aborted " << *mdr << dendl;
880 mds->mdcache->request_finish(mdr);
883 assert(it->second.state == EXPORT_LOCKING);
885 mds_rank_t dest = it->second.peer;
887 if (!mds->is_export_target(dest)) {
888 dout(7) << "dest is not yet an export target" << dendl;
// NOTE(review): the count-based give-up branch is elided here; per the log
// text it cancels after three MDSMap epochs without dest becoming a target
890 dout(5) << "dest has not been added as export target after three MDSMap epochs, canceling export" << dendl;
891 export_try_cancel(dir);
// drop locks/pins before sleeping on the next MDSMap epoch, then retry
895 mds->locker->drop_locks(mdr.get());
896 mdr->drop_local_auth_pins();
898 mds->wait_for_mdsmap(mds->mdsmap->get_epoch(), new C_M_ExportDirWait(this, mdr, count+1));
902 if (!dir->inode->get_parent_dn()) {
903 dout(7) << "waiting for dir to become stable before export: " << *dir << dendl;
904 dir->add_waiter(CDir::WAIT_CREATED, new C_M_ExportDirWait(this, mdr, 1));
908 if (mdr->aborted || dir->is_frozen() || dir->is_freezing()) {
909 dout(7) << "wouldblock|freezing|frozen, canceling export" << dendl;
910 export_try_cancel(dir);
// build the lock set: path rdlocks plus filelock/nestlock wrlocks on the base
915 set<SimpleLock*> rdlocks;
916 set<SimpleLock*> xlocks;
917 set<SimpleLock*> wrlocks;
918 get_export_lock_set(dir, rdlocks);
919 // If auth MDS of the subtree root inode is neither the exporter MDS
920 // nor the importer MDS and it gathers subtree root's fragstat/neststat
921 // while the subtree is exporting. It's possible that the exporter MDS
922 // and the importer MDS both are auth MDS of the subtree root or both
923 // are not auth MDS of the subtree root at the time they receive the
924 // lock messages. So the auth MDS of the subtree root inode may get no
925 // or duplicated fragstat/neststat for the subtree root dirfrag.
926 wrlocks.insert(&dir->get_inode()->filelock);
927 wrlocks.insert(&dir->get_inode()->nestlock);
928 if (dir->get_inode()->is_auth()) {
929 dir->get_inode()->filelock.set_scatter_wanted();
930 dir->get_inode()->nestlock.set_scatter_wanted();
933 if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks, NULL, NULL, true)) {
935 export_try_cancel(dir);
939 assert(g_conf->mds_kill_export_at != 1);
940 it->second.state = EXPORT_DISCOVERING;
942 // send ExportDirDiscover (ask target)
944 dir->inode->make_path(path);
945 MExportDirDiscover *discover = new MExportDirDiscover(dir->dirfrag(), path,
948 mds->send_message_mds(discover, dest);
949 assert(g_conf->mds_kill_export_at != 2);
951 it->second.last_cum_auth_pins_change = ceph_clock_now();
953 // start the freeze, but hold it up with an auth_pin.
955 assert(dir->is_freezing_tree());
956 dir->add_waiter(CDir::WAIT_FROZEN, new C_MDC_ExportFreeze(this, dir, it->second.tid));
// Exporter side: the importer either acked the discover (it now has the base
// inode pinned) — so finish the MDRequest, release locks, and let the freeze
// proceed by dropping our holding auth_pin — or refused, in which case the
// export is cancelled without re-notifying the peer.
960 * called on receipt of MExportDirDiscoverAck
961 * the importer now has the directory's _inode_ in memory, and pinned.
963 * This function DOES put the passed message before returning
965 void Migrator::handle_export_discover_ack(MExportDirDiscoverAck *m)
967 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
968 mds_rank_t dest(m->get_source().num());
969 utime_t now = ceph_clock_now();
972 dout(7) << "export_discover_ack from " << m->get_source()
973 << " on " << *dir << dendl;
975 mds->hit_export_target(now, dest, -1);
// stale ack (export aborted / restarted): tid or peer no longer match
977 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
978 if (it == export_state.end() ||
979 it->second.tid != m->get_tid() ||
980 it->second.peer != dest) {
981 dout(7) << "must have aborted" << dendl;
983 assert(it->second.state == EXPORT_DISCOVERING);
985 if (m->is_success()) {
986 // release locks to avoid deadlock
987 MDRequestRef mdr = static_cast<MDRequestImpl*>(it->second.mut.get());
989 mds->mdcache->request_finish(mdr);
990 it->second.mut.reset();
991 // freeze the subtree
992 it->second.state = EXPORT_FREEZING;
// this is the auth_pin taken to hold the freeze back; releasing it lets
// the WAIT_FROZEN callback (export_frozen) fire once pins drain
993 dir->auth_unpin(this);
994 assert(g_conf->mds_kill_export_at != 3);
// peer refused: cancel, but don't send MExportDirCancel (notify_peer=false)
997 dout(7) << "peer failed to discover (not active?), canceling" << dendl;
998 export_try_cancel(dir, false);
// Gather finisher: fires once all client sessions with caps in the exported
// subtree have been flushed; forwards to export_sessions_flushed().
1005 class C_M_ExportSessionsFlushed : public MigratorContext {
1009 C_M_ExportSessionsFlushed(Migrator *m, CDir *d, uint64_t t)
1010 : MigratorContext(m), dir(d), tid(t) {
1011 assert(dir != NULL);
1013 void finish(int r) override {
1014 mig->export_sessions_flushed(dir, tid);
// Clears the MDS_RANK_NONE sentinel (planted in export_frozen() while session
// flush was pending) from warning_ack_waiting; if we are already in WARNING
// and that was the last outstanding ack, the export can proceed.
1018 void Migrator::export_sessions_flushed(CDir *dir, uint64_t tid)
1020 dout(7) << "export_sessions_flushed " << *dir << dendl;
// stale callback: export gone, cancelling, or a different attempt (tid)
1022 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1023 if (it == export_state.end() ||
1024 it->second.state == EXPORT_CANCELLING ||
1025 it->second.tid != tid) {
1026 // export must have aborted.
1027 dout(7) << "export must have aborted on " << dir << dendl;
1031 assert(it->second.state == EXPORT_PREPPING || it->second.state == EXPORT_WARNING);
1032 assert(it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0);
1033 it->second.warning_ack_waiting.erase(MDS_RANK_NONE);
1034 if (it->second.state == EXPORT_WARNING && it->second.warning_ack_waiting.empty())
1035 export_go(dir); // start export.
// Runs when the subtree freeze completes. Re-checks that all needed locks are
// still takeable (aborting the export if not), force-takes them under a fresh
// MutationImpl, pins the subtree bounds, and builds + sends MExportDirPrep
// containing the replicated base dirfrag, bound list, and spanning-tree
// traces. Also starts flushing client sessions that hold caps in the subtree.
1038 void Migrator::export_frozen(CDir *dir, uint64_t tid)
1040 dout(7) << "export_frozen on " << *dir << dendl;
1042 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1043 if (it == export_state.end() || it->second.tid != tid) {
1044 dout(7) << "export must have aborted" << dendl;
1048 assert(it->second.state == EXPORT_FREEZING);
1049 assert(dir->is_frozen_tree_root());
1050 assert(dir->get_cum_auth_pins() == 0);
1052 CInode *diri = dir->get_inode();
1054 // ok, try to grab all my locks.
1055 set<SimpleLock*> rdlocks;
1056 get_export_lock_set(dir, rdlocks);
// locks were dropped after discover-ack; if we can't retake them now, the
// export is abandoned: unfreeze, tell the peer, and erase the state entry
1057 if ((diri->is_auth() && diri->is_frozen()) ||
1058 !mds->locker->can_rdlock_set(rdlocks) ||
1059 !diri->filelock.can_wrlock(-1) ||
1060 !diri->nestlock.can_wrlock(-1)) {
1061 dout(7) << "export_dir couldn't acquire all needed locks, failing. "
1064 dir->unfreeze_tree();
1065 cache->try_subtree_merge(dir);
1067 mds->send_message_mds(new MExportDirCancel(dir->dirfrag(), it->second.tid), it->second.peer);
1068 export_state.erase(it);
1070 dir->state_clear(CDir::STATE_EXPORTING);
1071 cache->maybe_send_pending_resolves();
// take the locks by force (subtree is frozen, nobody can contend now)
1075 it->second.mut = new MutationImpl();
1076 if (diri->is_auth())
1077 it->second.mut->auth_pin(diri);
1078 mds->locker->rdlock_take_set(rdlocks, it->second.mut);
1079 mds->locker->wrlock_force(&diri->filelock, it->second.mut);
1080 mds->locker->wrlock_force(&diri->nestlock, it->second.mut);
1082 cache->show_subtrees();
1084 // CDir::_freeze_tree() should have forced it into subtree.
1085 assert(dir->get_dir_auth() == mds_authority_t(mds->get_nodeid(), mds->get_nodeid()));
1088 cache->get_subtree_bounds(dir, bounds);
1090 // generate prep message, log entry.
1091 MExportDirPrep *prep = new MExportDirPrep(dir->dirfrag(), it->second.tid);
1093 // include list of bystanders
1094 for (const auto &p : dir->get_replicas()) {
1095 if (p.first != it->second.peer) {
1096 dout(10) << "bystander mds." << p.first << dendl;
1097 prep->add_bystander(p.first);
1101 // include base dirfrag
1102 cache->replicate_dir(dir, it->second.peer, prep->basedir);
1105 * include spanning tree for all nested exports.
1106 * these need to be on the destination _before_ the final export so that
1107 * dir_auth updates on any nested exports are properly absorbed.
1108 * this includes inodes and dirfrags included in the subtree, but
1109 * only the inodes at the bounds.
1111 * each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
1113 set<inodeno_t> inodes_added;
1114 set<dirfrag_t> dirfrags_added;
// per bound: pin it, mark EXPORTBOUND, and build its ancestry trace
1117 for (set<CDir*>::iterator p = bounds.begin();
1123 bound->get(CDir::PIN_EXPORTBOUND);
1124 bound->state_set(CDir::STATE_EXPORTBOUND);
1126 dout(7) << " export bound " << *bound << dendl;
1127 prep->add_bound( bound->dirfrag() );
// walk from the bound up toward the base, prepending dentry+inode (and
// dirfrag) replicas; stop when hitting something already included
1135 // don't repeat inodes
1136 if (inodes_added.count(cur->inode->ino()))
1138 inodes_added.insert(cur->inode->ino());
1140 // prepend dentry + inode
1141 assert(cur->inode->is_auth());
1143 cache->replicate_dentry(cur->inode->parent, it->second.peer, bl);
1144 dout(7) << " added " << *cur->inode->parent << dendl;
1145 cache->replicate_inode(cur->inode, it->second.peer, bl,
1146 mds->mdsmap->get_up_features());
1147 dout(7) << " added " << *cur->inode << dendl;
1148 bl.claim_append(tracebl);
1151 cur = cur->get_parent_dir();
1153 // don't repeat dirfrags
1154 if (dirfrags_added.count(cur->dirfrag()) ||
1156 start = 'd'; // start with dentry
1159 dirfrags_added.insert(cur->dirfrag());
1162 cache->replicate_dir(cur, it->second.peer, bl);
1163 dout(7) << " added " << *cur << dendl;
1164 bl.claim_append(tracebl);
1167 start = 'f'; // start with dirfrag
// finalize this trace: dirfrag id + start marker + accumulated replicas
1169 bufferlist final_bl;
1170 dirfrag_t df = cur->dirfrag();
1171 ::encode(df, final_bl);
1172 ::encode(start, final_bl);
1173 final_bl.claim_append(tracebl);
1174 prep->add_trace(final_bl);
1178 it->second.state = EXPORT_PREPPING;
1179 mds->send_message_mds(prep, it->second.peer);
1180 assert (g_conf->mds_kill_export_at != 4);
1182 // make sure any new instantiations of caps are flushed out
1183 assert(it->second.warning_ack_waiting.empty());
1185 set<client_t> export_client_set;
1186 get_export_client_set(dir, export_client_set);
1188 MDSGatherBuilder gather(g_ceph_context);
1189 mds->server->flush_client_sessions(export_client_set, gather);
1190 if (gather.has_subs()) {
// MDS_RANK_NONE sentinel = "session flush outstanding"; cleared by
// export_sessions_flushed() when the gather completes
1191 it->second.warning_ack_waiting.insert(MDS_RANK_NONE);
1192 gather.set_finisher(new C_M_ExportSessionsFlushed(this, dir, it->second.tid));
// Collect every client holding caps on any inode inside the subtree rooted at
// `dir`, via an explicit worklist (dfs) that recurses into nested dirfrags
// but stops at EXPORTBOUND dirfrags (they belong to a different subtree).
1197 void Migrator::get_export_client_set(CDir *dir, set<client_t>& client_set)
1201 while (!dfs.empty()) {
1202 CDir *dir = dfs.front();
1204 for (CDir::map_t::iterator p = dir->begin(); p != dir->end(); ++p) {
1205 CDentry *dn = p->second;
// only primary links carry an inode (with caps) of this subtree
1206 if (!dn->get_linkage()->is_primary())
1208 CInode *in = dn->get_linkage()->get_inode();
1212 in->get_dirfrags(ls);
1213 for (list<CDir*>::iterator q = ls.begin(); q != ls.end(); ++q) {
1214 if (!(*q)->state_test(CDir::STATE_EXPORTBOUND)) {
1215 // include nested dirfrag
1216 assert((*q)->get_dir_auth().first == CDIR_AUTH_PARENT);
1217 dfs.push_back(*q); // it's ours, recurse (later)
1221 for (map<client_t, Capability*>::iterator q = in->client_caps.begin();
1222 q != in->client_caps.end();
1224 client_set.insert(q->first);
// Single-inode overload: add to 'client_set' every client that holds a
// capability on 'in'.
1229 void Migrator::get_export_client_set(CInode *in, set<client_t>& client_set)
1231 for (map<client_t, Capability*>::iterator q = in->client_caps.begin();
1232 q != in->client_caps.end();
1234 client_set.insert(q->first);
1237 /* This function DOES put the passed message before returning*/
// Exporter side: the importer replied to our MExportDirPrep.  On success,
// warn every other replica (bystander) that authority for this subtree is
// becoming ambiguous, then wait for their notify-acks before shipping data
// (EXPORT_PREPPING -> EXPORT_WARNING).  On failure, cancel the export.
1238 void Migrator::handle_export_prep_ack(MExportDirPrepAck *m)
1240 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1241 mds_rank_t dest(m->get_source().num());
1242 utime_t now = ceph_clock_now();
1245 dout(7) << "export_prep_ack " << *dir << dendl;
1247 mds->hit_export_target(now, dest, -1);
// validate that this ack matches a live export attempt (same tid + peer)
1249 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1250 if (it == export_state.end() ||
1251 it->second.tid != m->get_tid() ||
1252 it->second.peer != mds_rank_t(m->get_source().num())) {
1253 // export must have aborted.
1254 dout(7) << "export must have aborted" << dendl;
1258 assert(it->second.state == EXPORT_PREPPING);
1260 if (!m->is_success()) {
1261 dout(7) << "peer couldn't acquire all needed locks or wasn't active, canceling" << dendl;
1262 export_try_cancel(dir, false);
1267 assert (g_conf->mds_kill_export_at != 5);
1270 cache->get_subtree_bounds(dir, bounds);
// only the session-flush placeholder (MDS_RANK_NONE) may still be pending
1272 assert(it->second.warning_ack_waiting.empty() ||
1273 (it->second.warning_ack_waiting.size() == 1 &&
1274 it->second.warning_ack_waiting.count(MDS_RANK_NONE) > 0));
1275 assert(it->second.notify_ack_waiting.empty());
// send a warning MExportDirNotify to each active bystander replica
1277 for (const auto &p : dir->get_replicas()) {
1278 if (p.first == it->second.peer) continue;
1279 if (mds->is_cluster_degraded() &&
1280 !mds->mdsmap->is_clientreplay_or_active_or_stopping(p.first))
1281 continue; // only if active
1282 it->second.warning_ack_waiting.insert(p.first);
1283 it->second.notify_ack_waiting.insert(p.first); // we'll eventually get a notifyack, too!
1285 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), it->second.tid, true,
1286 mds_authority_t(mds->get_nodeid(),CDIR_AUTH_UNKNOWN),
1287 mds_authority_t(mds->get_nodeid(),it->second.peer));
1288 for (set<CDir*>::iterator q = bounds.begin(); q != bounds.end(); ++q)
1289 notify->get_bounds().push_back((*q)->dirfrag());
1290 mds->send_message_mds(notify, p.first);
1294 it->second.state = EXPORT_WARNING;
1296 assert(g_conf->mds_kill_export_at != 6);
// no bystanders to warn -> proceed straight to the data transfer
1298 if (it->second.warning_ack_waiting.empty())
1299 export_go(dir); // start export.
// Completion context queued on the journal by export_go(); once the log is
// safe it resumes the export via export_go_synced(dir, tid).
1306 class C_M_ExportGo : public MigratorContext {
1310 C_M_ExportGo(Migrator *m, CDir *d, uint64_t t) :
1311 MigratorContext(m), dir(d), tid(t) {
1312 assert(dir != NULL);
1314 void finish(int r) override {
1315 mig->export_go_synced(dir, tid);
// Kick off the data-transfer phase: force a journal flush first so that any
// in-flight updates (e.g. cap imports) are durable before we hand the
// subtree over; C_M_ExportGo then calls export_go_synced().
1319 void Migrator::export_go(CDir *dir)
1321 assert(export_state.count(dir));
1322 dout(7) << "export_go " << *dir << " to " << export_state[dir].peer << dendl;
1324 // first sync log to flush out e.g. any cap imports
1325 mds->mdlog->wait_for_safe(new C_M_ExportGo(this, dir, export_state[dir].tid));
1326 mds->mdlog->flush();
// Journal is synced: package up the frozen subtree and send the actual
// MExportDir payload to the target (EXPORT_WARNING -> EXPORT_EXPORTING).
// Bails out quietly if the export was cancelled while we waited on the log.
1329 void Migrator::export_go_synced(CDir *dir, uint64_t tid)
1331 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1332 if (it == export_state.end() ||
1333 it->second.state == EXPORT_CANCELLING ||
1334 it->second.tid != tid) {
1335 // export must have aborted.
1336 dout(7) << "export must have aborted on " << dir << dendl;
1339 assert(it->second.state == EXPORT_WARNING);
1340 mds_rank_t dest = it->second.peer;
1342 dout(7) << "export_go_synced " << *dir << " to " << dest << dendl;
1344 cache->show_subtrees();
1346 it->second.state = EXPORT_EXPORTING;
1347 assert(g_conf->mds_kill_export_at != 7);
// subtree must be fully frozen and free of auth pins before it can move
1349 assert(dir->is_frozen_tree_root());
1350 assert(dir->get_cum_auth_pins() == 0);
1352 // set ambiguous auth
1353 cache->adjust_subtree_auth(dir, mds->get_nodeid(), dest);
1355 // take away the popularity we're sending.
1356 utime_t now = ceph_clock_now();
1357 mds->balancer->subtract_export(dir, now);
1359 // fill export message with cache data
1360 MExportDir *req = new MExportDir(dir->dirfrag(), it->second.tid);
1361 map<client_t,entity_inst_t> exported_client_map;
1362 uint64_t num_exported_inodes = encode_export_dir(req->export_data,
1363 dir, // recur start point
1364 exported_client_map,
1366 ::encode(exported_client_map, req->client_map,
1367 mds->mdsmap->get_up_features());
1369 // add bounds to message
1371 cache->get_subtree_bounds(dir, bounds);
1372 for (set<CDir*>::iterator p = bounds.begin();
1375 req->add_export((*p)->dirfrag());
1378 mds->send_message_mds(req, dest);
1379 assert(g_conf->mds_kill_export_at != 8);
// +1 accounts for the subtree root itself
1381 mds->hit_export_target(now, dest, num_exported_inodes+1);
1384 if (mds->logger) mds->logger->inc(l_mds_exported);
1385 if (mds->logger) mds->logger->inc(l_mds_exported_inodes, num_exported_inodes);
1387 cache->show_subtrees();
1391 /** encode_export_inode
1392 * update our local state for this inode to export.
1393 * encode relevant state to be sent over the wire.
1394 * used by: encode_export_dir, file_rename (if foreign)
1396 * FIXME: the separation between CInode.encode_export and these methods
1397 * is pretty arbitrary and dumb.
1399 void Migrator::encode_export_inode(CInode *in, bufferlist& enc_state,
1400 map<client_t,entity_inst_t>& exported_client_map)
1402 dout(7) << "encode_export_inode " << *in << dendl;
// we must still be the sole/auth copy of this inode
1403 assert(!in->is_replica(mds->get_nodeid()));
// if no replicas exist yet, relax lock states first so the receiving
// replica starts from the standard replicated-lock states
1406 if (!in->is_replicated()) {
1407 in->replicate_relax_locks();
1408 dout(20) << " did replicate_relax_locks, now " << *in << dendl;
1411 ::encode(in->inode.ino, enc_state);
1412 ::encode(in->last, enc_state);
1413 in->encode_export(enc_state);
// caps travel with the inode; 'true' marks this as the auth cap transfer
1416 encode_export_inode_caps(in, true, enc_state, exported_client_map);
// Serialize the inode's client capabilities into 'bl', mark the inode as
// EXPORTINGCAPS (pinned until finish_export_inode_caps), and record the
// entity_inst_t of every cap-holding client in 'exported_client_map' so the
// importer can reconstruct the sessions.
1419 void Migrator::encode_export_inode_caps(CInode *in, bool auth_cap, bufferlist& bl,
1420 map<client_t,entity_inst_t>& exported_client_map)
1422 dout(20) << "encode_export_inode_caps " << *in << dendl;
1425 map<client_t,Capability::Export> cap_map;
1426 in->export_client_caps(cap_map);
1427 ::encode(cap_map, bl);
1429 ::encode(in->get_mds_caps_wanted(), bl);
// pin + flag until the export of these caps completes (or is reversed)
1431 in->state_set(CInode::STATE_EXPORTINGCAPS);
1432 in->get(CInode::PIN_EXPORTINGCAPS);
1435 // make note of clients named by exported capabilities
1436 for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
1437 it != in->client_caps.end();
1439 exported_client_map[it->first] = mds->sessionmap.get_inst(entity_name_t::CLIENT(it->first.v));
// Cap export completed: drop the EXPORTINGCAPS pin/flag, tell each client
// (via CEPH_CAP_OP_EXPORT) where its cap now lives on the peer MDS (using
// the importer-assigned ids in 'peer_imported'), then clear the local caps.
1442 void Migrator::finish_export_inode_caps(CInode *in, mds_rank_t peer,
1443 map<client_t,Capability::Import>& peer_imported)
1445 dout(20) << "finish_export_inode_caps " << *in << dendl;
1447 in->state_clear(CInode::STATE_EXPORTINGCAPS);
1448 in->put(CInode::PIN_EXPORTINGCAPS);
1450 // tell (all) clients about migrating caps..
1451 for (map<client_t, Capability*>::iterator it = in->client_caps.begin();
1452 it != in->client_caps.end();
1454 Capability *cap = it->second;
1455 dout(7) << "finish_export_inode_caps telling client." << it->first
1456 << " exported caps on " << *in << dendl;
1457 MClientCaps *m = new MClientCaps(CEPH_CAP_OP_EXPORT, in->ino(), 0,
1458 cap->get_cap_id(), cap->get_mseq(), mds->get_osd_epoch_barrier());
// the importer must have recorded an import entry for every exported cap
1460 map<client_t,Capability::Import>::iterator q = peer_imported.find(it->first);
1461 assert(q != peer_imported.end());
1462 m->set_cap_peer(q->second.cap_id, q->second.issue_seq, q->second.mseq, peer, 0);
1463 mds->send_message_client_counted(m, it->first);
1465 in->clear_client_caps_after_export();
1466 mds->locker->eval(in, CEPH_CAP_LOCKS);
// Demote one inode from auth to replica after a successful export: drop
// replica tracking, convert every lock to its replica state, clear dirty
// state we no longer own, wake waiters, and hand off the caps.
1469 void Migrator::finish_export_inode(CInode *in, utime_t now, mds_rank_t peer,
1470 map<client_t,Capability::Import>& peer_imported,
1471 list<MDSInternalContextBase*>& finished)
1473 dout(12) << "finish_export_inode " << *in << dendl;
1479 // clear/unpin cached_by (we're no longer the authority)
1480 in->clear_replica_map();
1482 // twiddle lock states for auth -> replica transition
1483 in->authlock.export_twiddle();
1484 in->linklock.export_twiddle();
1485 in->dirfragtreelock.export_twiddle();
1486 in->filelock.export_twiddle();
1487 in->nestlock.export_twiddle();
1488 in->xattrlock.export_twiddle();
1489 in->snaplock.export_twiddle();
1490 in->flocklock.export_twiddle();
1491 in->policylock.export_twiddle();
1494 assert(in->is_auth());
1495 in->state_clear(CInode::STATE_AUTH);
1496 in->replica_nonce = CInode::EXPORT_NONCE;
1498 in->clear_dirty_rstat();
1500 // no more auth subtree? clear scatter dirty
1501 if (!in->has_subtree_root_dirfrag(mds->get_nodeid()))
1502 in->clear_scatter_dirty();
1504 in->item_open_file.remove_myself();
1506 in->clear_dirty_parent();
1508 in->clear_file_locks();
// waiters queued on this inode get requeued by the caller
1511 in->take_waiting(CInode::WAIT_ANY_MASK, finished);
1513 in->finish_export(now);
1515 finish_export_inode_caps(in, peer, peer_imported);
// Recursively serialize one dirfrag and its dentries/inodes into 'exportbl'.
// Each dentry is tagged 'N' (null), 'L' (remote link: ino + d_type), or 'I'
// (primary inode, encoded via encode_export_inode).  Nested non-bound
// dirfrags are queued and encoded after this frag.  Returns the number of
// inodes exported.  NOTE(review): excerpted listing — some lines are elided.
1518 uint64_t Migrator::encode_export_dir(bufferlist& exportbl,
1520 map<client_t,entity_inst_t>& exported_client_map,
1523 uint64_t num_exported = 0;
1525 dout(7) << "encode_export_dir " << *dir << " " << dir->get_num_head_items() << " head items" << dendl;
// nothing may be mid-journal for this frag
1527 assert(dir->get_projected_version() == dir->get_version());
1529 #ifdef MDS_VERIFY_FRAGSTAT
1530 if (dir->is_complete())
1531 dir->verify_fragstat();
// header: which frag this is, frag-level state, then the dentry count
1535 dirfrag_t df = dir->dirfrag();
1536 ::encode(df, exportbl);
1537 dir->encode_export(exportbl);
1539 __u32 nden = dir->items.size();
1540 ::encode(nden, exportbl);
1543 list<CDir*> subdirs;
1544 CDir::map_t::iterator it;
1545 for (it = dir->begin(); it != dir->end(); ++it) {
1546 CDentry *dn = it->second;
1547 CInode *in = dn->get_linkage()->get_inode();
// relax the dentry lock for un-replicated dentries (same idea as inodes)
1549 if (!dn->is_replicated())
1550 dn->lock.replicate_relax();
1555 dout(7) << "encode_export_dir exporting " << *dn << dendl;
1558 ::encode(dn->name, exportbl);
1559 ::encode(dn->last, exportbl);
1562 dn->encode_export(exportbl);
1567 if (dn->get_linkage()->is_null()) {
1568 exportbl.append("N", 1); // null dentry
1572 if (dn->get_linkage()->is_remote()) {
1574 exportbl.append("L", 1); // remote link
1576 inodeno_t ino = dn->get_linkage()->get_remote_ino();
1577 unsigned char d_type = dn->get_linkage()->get_remote_d_type();
1578 ::encode(ino, exportbl);
1579 ::encode(d_type, exportbl);
1585 exportbl.append("I", 1); // inode dentry
1587 encode_export_inode(in, exportbl, exported_client_map); // encode, and (update state for) export
1591 in->get_dirfrags(dfs);
1592 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p) {
1594 if (!t->state_test(CDir::STATE_EXPORTBOUND)) {
1595 // include nested dirfrag
1596 assert(t->get_dir_auth().first == CDIR_AUTH_PARENT);
1597 subdirs.push_back(t); // it's ours, recurse (later)
1603 for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); ++it)
1604 num_exported += encode_export_dir(exportbl, *it, exported_client_map, now);
1606 return num_exported;
// Demote one dirfrag (and, recursively, its nested frags) from auth to
// replica after a successful export; counterpart of encode_export_dir.
// 'num_dentries' accumulates how many dentries were handed off so the
// caller can trim the cache accordingly.
1609 void Migrator::finish_export_dir(CDir *dir, utime_t now, mds_rank_t peer,
1610 map<inodeno_t,map<client_t,Capability::Import> >& peer_imported,
1611 list<MDSInternalContextBase*>& finished, int *num_dentries)
1613 dout(10) << "finish_export_dir " << *dir << dendl;
1616 dir->clear_replica_map();
1619 assert(dir->is_auth());
1620 dir->state_clear(CDir::STATE_AUTH);
1621 dir->remove_bloom();
1622 dir->replica_nonce = CDir::EXPORT_NONCE;
1624 if (dir->is_dirty())
1627 // suck up all waiters
1628 dir->take_waiting(CDir::WAIT_ANY_MASK, finished); // all dir waiters
1631 dir->finish_export(now);
1634 list<CDir*> subdirs;
1635 CDir::map_t::iterator it;
1636 for (it = dir->begin(); it != dir->end(); ++it) {
1637 CDentry *dn = it->second;
1638 CInode *in = dn->get_linkage()->get_inode();
1641 dn->finish_export();
1644 if (dn->get_linkage()->is_primary()) {
// hand off the inode, using the peer-assigned cap import ids for its ino
1645 finish_export_inode(in, now, peer, peer_imported[in->ino()], finished);
1648 in->get_nested_dirfrags(subdirs);
1651 cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
1656 for (list<CDir*>::iterator it = subdirs.begin(); it != subdirs.end(); ++it)
1657 finish_export_dir(*it, now, peer, peer_imported, finished, num_dentries);
// Log-completion context: once the EExport journal entry is durable,
// continue the export with export_logged_finish(dir).
1660 class C_MDS_ExportFinishLogged : public MigratorLogContext {
1663 C_MDS_ExportFinishLogged(Migrator *m, CDir *d) : MigratorLogContext(m), dir(d) {}
1664 void finish(int r) override {
1665 mig->export_logged_finish(dir);
1671 * i should get an export_ack from the export target.
1673 * This function DOES put the passed message before returning
// The importer acked MExportDir: it has received (and journaled) the subtree
// data.  Record the cap-import map it sent back, journal our own EExport
// (including the bounds), and flip subtree auth so the peer is listed first
// (EXPORT_EXPORTING -> EXPORT_LOGGINGFINISH).
1675 void Migrator::handle_export_ack(MExportDirAck *m)
1677 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1678 mds_rank_t dest(m->get_source().num());
1679 utime_t now = ceph_clock_now();
1681 assert(dir->is_frozen_tree_root()); // i'm exporting!
1684 dout(7) << "handle_export_ack " << *dir << dendl;
1686 mds->hit_export_target(now, dest, -1);
1688 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1689 assert(it != export_state.end());
1690 assert(it->second.state == EXPORT_EXPORTING);
1691 assert(it->second.tid == m->get_tid());
// importer tells us the cap ids it assigned; needed later to redirect clients
1693 bufferlist::iterator bp = m->imported_caps.begin();
1694 ::decode(it->second.peer_imported, bp);
1696 it->second.state = EXPORT_LOGGINGFINISH;
1697 assert (g_conf->mds_kill_export_at != 9);
1699 cache->get_subtree_bounds(dir, bounds);
1702 // include export bounds, to ensure they're in the journal.
// NOTE(review): stray double ';' below — harmless empty statement, worth cleaning up.
1703 EExport *le = new EExport(mds->mdlog, dir, it->second.peer);;
1704 mds->mdlog->start_entry(le);
1706 le->metablob.add_dir_context(dir, EMetaBlob::TO_ROOT);
1707 le->metablob.add_dir(dir, false);
1708 for (set<CDir*>::iterator p = bounds.begin();
1712 le->get_bounds().insert(bound->dirfrag());
1713 le->metablob.add_dir_context(bound);
1714 le->metablob.add_dir(bound, false);
1717 // list us second, them first.
1718 // this keeps authority().first in sync with subtree auth state in the journal.
1719 cache->adjust_subtree_auth(dir, it->second.peer, mds->get_nodeid());
1721 // log export completion, then finish (unfreeze, trigger finish context, etc.)
1722 mds->mdlog->submit_entry(le, new C_MDS_ExportFinishLogged(this, dir));
1723 mds->mdlog->flush();
1724 assert (g_conf->mds_kill_export_at != 10);
// Export is being cancelled: tell every bystander that authority reverts to
// us (new auth = (me, UNKNOWN)).  If no bystander acks are outstanding the
// cancel completes immediately; otherwise the dir stays auth-pinned until
// the notify-acks arrive.
1729 void Migrator::export_notify_abort(CDir *dir, set<CDir*>& bounds)
1731 dout(7) << "export_notify_abort " << *dir << dendl;
1733 export_state_t& stat = export_state[dir];
1734 assert(stat.state == EXPORT_CANCELLING);
1736 if (stat.notify_ack_waiting.empty()) {
1737 stat.state = EXPORT_CANCELLED;
1741 dir->auth_pin(this);
1743 for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
1744 p != stat.notify_ack_waiting.end();
1746 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(),stat.tid, true,
1747 pair<int,int>(mds->get_nodeid(),stat.peer),
1748 pair<int,int>(mds->get_nodeid(),CDIR_AUTH_UNKNOWN));
1749 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
1750 notify->get_bounds().push_back((*i)->dirfrag());
1751 mds->send_message_mds(notify, *p);
1756 * this happens if the dest fails after i send the export data but before it is acked
1757 * that is, we don't know they safely received and logged it, so we reverse our changes
// Roll back a partially-completed export (dest failed after we sent the data
// but before it was acked): undo per-dentry export state, drop bound pins,
// notify bystanders of the abort, restore unambiguous local auth, unfreeze,
// and re-evaluate any caps that went stale during the attempt.
1760 void Migrator::export_reverse(CDir *dir)
1762 dout(7) << "export_reverse " << *dir << dendl;
1764 set<CInode*> to_eval;
1767 cache->get_subtree_bounds(dir, bounds);
1769 // remove exporting pins
1772 while (!rq.empty()) {
1773 CDir *t = rq.front();
1776 for (CDir::map_t::iterator p = t->items.begin(); p != t->items.end(); ++p) {
1777 p->second->abort_export();
1778 if (!p->second->get_linkage()->is_primary())
1780 CInode *in = p->second->get_linkage()->get_inode();
// remember inodes whose caps went stale so we can re-issue/revoke below
1782 if (in->state_test(CInode::STATE_EVALSTALECAPS)) {
1783 in->state_clear(CInode::STATE_EVALSTALECAPS);
1787 in->get_nested_dirfrags(rq);
// unpin bounds
1792 for (const auto &bd : bounds) {
1793 bd->put(CDir::PIN_EXPORTBOUND);
1794 bd->state_clear(CDir::STATE_EXPORTBOUND);
1797 // notify bystanders
1798 export_notify_abort(dir, bounds);
1800 // unfreeze tree, with possible subtree merge.
1801 cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
1803 // process delayed expires
1804 cache->process_delayed_expire(dir);
1806 dir->unfreeze_tree();
1807 cache->try_subtree_merge(dir);
1809 // revoke/resume stale caps
1810 for (auto in : to_eval) {
1811 bool need_issue = false;
1812 for (auto& p : in->get_client_caps()) {
1813 Capability *cap = p.second;
1814 if (cap->is_stale()) {
1815 mds->locker->revoke_stale_caps(cap);
1821 (!in->is_auth() || !mds->locker->eval(in, CEPH_CAP_LOCKS)))
1822 mds->locker->issue_caps(in);
1825 cache->show_cache();
1830 * once i get the ack, and logged the EExportFinish(true),
1831 * send notifies (if any), otherwise go straight to finish.
// EExport is durable: notify all bystanders that the peer is now the sole
// auth (new auth = (peer, UNKNOWN)) and wait for their acks
// (-> EXPORT_NOTIFYING).  If there are no bystanders, finish immediately;
// otherwise send the peer a non-final MExportDirFinish so it can start
// telling clients to import their caps.
1834 void Migrator::export_logged_finish(CDir *dir)
1836 dout(7) << "export_logged_finish " << *dir << dendl;
1838 export_state_t& stat = export_state[dir];
1842 cache->get_subtree_bounds(dir, bounds);
1844 for (set<mds_rank_t>::iterator p = stat.notify_ack_waiting.begin();
1845 p != stat.notify_ack_waiting.end();
1847 MExportDirNotify *notify = new MExportDirNotify(dir->dirfrag(), stat.tid, true,
1848 pair<int,int>(mds->get_nodeid(), stat.peer),
1849 pair<int,int>(stat.peer, CDIR_AUTH_UNKNOWN));
1851 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
1852 notify->get_bounds().push_back((*i)->dirfrag());
1854 mds->send_message_mds(notify, *p);
1857 // wait for notifyacks
1858 stat.state = EXPORT_NOTIFYING;
1859 assert (g_conf->mds_kill_export_at != 11);
1861 // no notifies to wait for?
1862 if (stat.notify_ack_waiting.empty()) {
1863 export_finish(dir); // skip notify/notify_ack stage.
1865 // notify peer to send cap import messages to clients
1866 if (!mds->is_cluster_degraded() ||
1867 mds->mdsmap->is_clientreplay_or_active_or_stopping(stat.peer)) {
1868 mds->send_message_mds(new MExportDirFinish(dir->dirfrag(), false, stat.tid), stat.peer);
1870 dout(7) << "not sending MExportDirFinish, dest has failed" << dendl;
1877 * i'll get an ack from each bystander.
1878 * when i get them all, do the export.
1880 * i'll get an ack from each bystander.
1881 * when i get them all, unfreeze and send the finish.
1883 * This function DOES put the passed message before returning
// Notify-ack dispatcher, covering three roles for this dirfrag:
//  1) exporter in EXPORT_WARNING: a bystander acked the warning — once all
//     have, start the data transfer (export_go);
//  2) exporter in EXPORT_NOTIFYING: a bystander acked the final notify —
//     once all have, export_finish;
//  3) exporter in EXPORT_CANCELLING, or importer in IMPORT_ABORTING:
//     bystander acked the abort notify; finish the cancel/abort when the
//     last ack arrives.
1885 void Migrator::handle_export_notify_ack(MExportDirNotifyAck *m)
1887 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
1888 mds_rank_t dest(m->get_source().num());
1889 utime_t now = ceph_clock_now();
1891 mds_rank_t from = mds_rank_t(m->get_source().num());
1893 mds->hit_export_target(now, dest, -1);
1895 auto export_state_entry = export_state.find(dir);
1896 if (export_state_entry != export_state.end()) {
1897 export_state_t& stat = export_state_entry->second;
1898 if (stat.state == EXPORT_WARNING &&
1899 stat.warning_ack_waiting.erase(from)) {
1900 // exporting. process warning.
1901 dout(7) << "handle_export_notify_ack from " << m->get_source()
1902 << ": exporting, processing warning on " << *dir << dendl;
1903 if (stat.warning_ack_waiting.empty())
1904 export_go(dir); // start export.
1905 } else if (stat.state == EXPORT_NOTIFYING &&
1906 stat.notify_ack_waiting.erase(from)) {
1907 // exporting. process notify.
1908 dout(7) << "handle_export_notify_ack from " << m->get_source()
1909 << ": exporting, processing notify on " << *dir << dendl;
1910 if (stat.notify_ack_waiting.empty())
1912 } else if (stat.state == EXPORT_CANCELLING &&
1913 m->get_new_auth().second == CDIR_AUTH_UNKNOWN && // not warning ack
1914 stat.notify_ack_waiting.erase(from)) {
1915 dout(7) << "handle_export_notify_ack from " << m->get_source()
1916 << ": cancelling export, processing notify on " << *dir << dendl;
1917 if (stat.notify_ack_waiting.empty()) {
1918 export_state.erase(export_state_entry);
1919 export_cancel_finish(dir);
// importer side: bystander acked our abort notification
1924 auto import_state_entry = import_state.find(dir->dirfrag());
1925 if (import_state_entry != import_state.end()) {
1926 import_state_t& stat = import_state_entry->second;
1927 if (stat.state == IMPORT_ABORTING) {
1929 dout(7) << "handle_export_notify_ack from " << m->get_source()
1930 << ": aborting import on " << *dir << dendl;
1931 assert(stat.bystanders.count(from));
1932 stat.bystanders.erase(from);
1933 if (stat.bystanders.empty())
1934 import_reverse_unfreeze(dir);
// Final stage of a successful export: send the final MExportDirFinish to
// the new auth, demote all local state (finish_export_dir), drop bound
// pins, unfreeze with a possible subtree merge, wake waiters, release
// export_state, trim the now-replica dentries, and pick up any queued export.
1942 void Migrator::export_finish(CDir *dir)
1944 dout(5) << "export_finish " << *dir << dendl;
1946 assert (g_conf->mds_kill_export_at != 12);
1947 map<CDir*,export_state_t>::iterator it = export_state.find(dir);
1948 if (it == export_state.end()) {
1949 dout(7) << "target must have failed, not sending final commit message. export succeeded anyway." << dendl;
1953 // send finish/commit to new auth
1954 if (!mds->is_cluster_degraded() ||
1955 mds->mdsmap->is_clientreplay_or_active_or_stopping(it->second.peer)) {
1956 mds->send_message_mds(new MExportDirFinish(dir->dirfrag(), true, it->second.tid), it->second.peer);
1958 dout(7) << "not sending MExportDirFinish last, dest has failed" << dendl;
1960 assert(g_conf->mds_kill_export_at != 13);
1962 // finish export (adjust local cache state)
1963 int num_dentries = 0;
1964 list<MDSInternalContextBase*> finished;
1965 finish_export_dir(dir, ceph_clock_now(), it->second.peer,
1966 it->second.peer_imported, finished, &num_dentries);
// authority is now unambiguously the peer
1968 assert(!dir->is_auth());
1969 cache->adjust_subtree_auth(dir, it->second.peer);
1973 cache->get_subtree_bounds(dir, bounds);
1974 for (set<CDir*>::iterator p = bounds.begin();
1978 bd->put(CDir::PIN_EXPORTBOUND);
1979 bd->state_clear(CDir::STATE_EXPORTBOUND);
1982 if (dir->state_test(CDir::STATE_AUXSUBTREE))
1983 dir->state_clear(CDir::STATE_AUXSUBTREE);
1985 // discard delayed expires
1986 cache->discard_delayed_expire(dir);
1988 dout(7) << "export_finish unfreezing" << dendl;
1990 // unfreeze tree, with possible subtree merge.
1991 // (we do this _after_ removing EXPORTBOUND pins, to allow merges)
1992 dir->unfreeze_tree();
1993 cache->try_subtree_merge(dir);
1995 // no more auth subtree? clear scatter dirty
1996 if (!dir->get_inode()->is_auth() &&
1997 !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
1998 dir->get_inode()->clear_scatter_dirty();
1999 // wake up scatter_nudge waiters
2000 dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, finished);
2003 if (!finished.empty())
2004 mds->queue_waiters(finished);
2006 MutationRef mut = it->second.mut;
2007 // remove from exporting list, clean up state
2008 export_state.erase(it);
2009 dir->state_clear(CDir::STATE_EXPORTING);
2011 cache->show_subtrees();
2014 cache->trim(num_dentries); // try trimming exported dentries
2016 // send pending import_maps?
2017 mds->mdcache->maybe_send_pending_resolves();
2019 // drop locks, unpin path
2021 mds->locker->drop_locks(mut.get());
2025 maybe_do_queued_export();
2035 // ==========================================================
// Importer side, step 1: the exporter asks whether we can host this subtree.
// NACK if we aren't active; otherwise record IMPORT_DISCOVERING state, make
// sure the base inode is in our cache (discovering it via path_traverse if
// needed, retrying once the root is open), then pin it and ack
// (-> IMPORT_DISCOVERED).  NOTE(review): excerpted listing — the branch
// structure between the first-message and retry paths is partially elided.
2038 void Migrator::handle_export_discover(MExportDirDiscover *m)
2040 mds_rank_t from = m->get_source_mds();
2041 assert(from != mds->get_nodeid());
2043 dout(7) << "handle_export_discover on " << m->get_path() << dendl;
2045 // note import state
2046 dirfrag_t df = m->get_dirfrag();
2048 if (!mds->is_active()) {
2049 dout(7) << " not active, send NACK " << dendl;
2050 mds->send_message_mds(new MExportDirDiscoverAck(df, m->get_tid(), false), from);
2055 // only start discovering on this message once.
2056 map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
2058 assert(it == import_state.end());
2060 import_state[df].state = IMPORT_DISCOVERING;
2061 import_state[df].peer = from;
2062 import_state[df].tid = m->get_tid();
2064 // am i retrying after ancient path_traverse results?
2065 if (it == import_state.end() ||
2066 it->second.peer != from ||
2067 it->second.tid != m->get_tid()) {
2068 dout(7) << " dropping obsolete message" << dendl;
2072 assert(it->second.state == IMPORT_DISCOVERING);
// cache root must be open before we can traverse to the import point
2075 if (!mds->mdcache->is_open()) {
2076 dout(5) << " waiting for root" << dendl;
2077 mds->mdcache->wait_for_open(new C_MDS_RetryMessage(mds, m));
2081 assert (g_conf->mds_kill_import_at != 1);
2084 CInode *in = cache->get_inode(m->get_dirfrag().ino);
2086 // must discover it!
2087 filepath fpath(m->get_path());
2088 vector<CDentry*> trace;
2089 MDRequestRef null_ref;
2090 int r = cache->path_traverse(null_ref, m, NULL, fpath, &trace, NULL, MDS_TRAVERSE_DISCOVER);
2093 dout(7) << "handle_export_discover_2 failed to discover or not dir " << m->get_path() << ", NAK" << dendl;
2094 ceph_abort(); // this shouldn't happen if the auth pins its path properly!!!!
2097 ceph_abort(); // this shouldn't happen; the get_inode above would have succeeded.
2101 dout(7) << "handle_export_discover have " << df << " inode " << *in << dendl;
2103 import_state[df].state = IMPORT_DISCOVERED;
2105 // pin inode in the cache (for now)
2106 assert(in->is_dir());
2107 in->get(CInode::PIN_IMPORTING);
2110 dout(7) << " sending export_discover_ack on " << *in << dendl;
2111 mds->send_message_mds(new MExportDirDiscoverAck(df, m->get_tid()), import_state[df].peer);
2113 assert (g_conf->mds_kill_import_at != 2);
// Abort an import still in IMPORT_DISCOVERING: nothing is pinned yet, so
// just forget the state.
2116 void Migrator::import_reverse_discovering(dirfrag_t df)
2118 import_state.erase(df);
// Abort an import in IMPORT_DISCOVERED: drop the base-inode pin taken in
// handle_export_discover, then forget the state.
2121 void Migrator::import_reverse_discovered(dirfrag_t df, CInode *diri)
2124 diri->put(CInode::PIN_IMPORTING);
2125 import_state.erase(df);
// Abort an import in IMPORT_PREPPING: resolve the recorded bound dirfrags,
// drop the pins/sticky state taken during prep, and finish the reversal.
2128 void Migrator::import_reverse_prepping(CDir *dir)
2131 cache->map_dirfrag_set(import_state[dir->dirfrag()].bound_ls, bounds);
2132 import_remove_pins(dir, bounds);
2133 import_reverse_final(dir);
2136 /* This function DOES put the passed message before returning*/
// Importer side: the exporter cancelled.  Unwind whatever import progress we
// have made, dispatching on the current import state; any other state is a
// protocol violation (assert).
2137 void Migrator::handle_export_cancel(MExportDirCancel *m)
2139 dout(7) << "handle_export_cancel on " << m->get_dirfrag() << dendl;
2140 dirfrag_t df = m->get_dirfrag();
2141 map<dirfrag_t,import_state_t>::iterator it = import_state.find(df);
2142 if (it == import_state.end()) {
2143 assert(0 == "got export_cancel in weird state");
2144 } else if (it->second.state == IMPORT_DISCOVERING) {
2145 import_reverse_discovering(df);
2146 } else if (it->second.state == IMPORT_DISCOVERED) {
2147 CInode *in = cache->get_inode(df.ino);
2149 import_reverse_discovered(df, in);
2150 } else if (it->second.state == IMPORT_PREPPING) {
2151 CDir *dir = mds->mdcache->get_dirfrag(df);
2153 import_reverse_prepping(dir);
2154 } else if (it->second.state == IMPORT_PREPPED) {
2155 CDir *dir = mds->mdcache->get_dirfrag(df);
2158 cache->get_subtree_bounds(dir, bounds);
2159 import_remove_pins(dir, bounds);
2160 // adjust auth back to the exporter
2161 cache->adjust_subtree_auth(dir, it->second.peer);
2162 import_reverse_unfreeze(dir);
2164 assert(0 == "got export_cancel in weird state");
2169 /* This function DOES put the passed message before returning*/
// Importer side, step 2: assimilate the exporter's MExportDirPrep.  On the
// first pass, replicate the base dir plus the dentry/inode traces to every
// export bound and make the bound inodes sticky; on retries (after opening
// remote bound dirfrags), skip assimilation.  Once all bounds are open and
// we can wrlock the base inode's filelock/nestlock, note ambiguous auth
// (exporter first), freeze the region (-> IMPORT_PREPPED) and ack success;
// otherwise unwind and ack failure.
// NOTE(review): excerpted listing — some interior lines are elided.
2170 void Migrator::handle_export_prep(MExportDirPrep *m)
2172 mds_rank_t oldauth = mds_rank_t(m->get_source().num());
2173 assert(oldauth != mds->get_nodeid());
2177 list<MDSInternalContextBase*> finished;
2179 // assimilate root dir.
2180 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
2181 if (!m->did_assim()) {
2182 assert(it != import_state.end());
2183 assert(it->second.state == IMPORT_DISCOVERED);
2184 assert(it->second.peer == oldauth);
2185 diri = cache->get_inode(m->get_dirfrag().ino);
2187 bufferlist::iterator p = m->basedir.begin();
2188 dir = cache->add_replica_dir(p, diri, oldauth, finished);
2189 dout(7) << "handle_export_prep on " << *dir << " (first pass)" << dendl;
// retry path: ignore acks for exports we no longer track
2191 if (it == import_state.end() ||
2192 it->second.peer != oldauth ||
2193 it->second.tid != m->get_tid()) {
2194 dout(7) << "handle_export_prep obsolete message, dropping" << dendl;
2198 assert(it->second.state == IMPORT_PREPPING);
2199 assert(it->second.peer == oldauth);
2201 dir = cache->get_dirfrag(m->get_dirfrag());
2203 dout(7) << "handle_export_prep on " << *dir << " (subsequent pass)" << dendl;
2204 diri = dir->get_inode();
2206 assert(dir->is_auth() == false);
2208 cache->show_subtrees();
2210 // build import bound map
2211 map<inodeno_t, fragset_t> import_bound_fragset;
2212 for (list<dirfrag_t>::iterator p = m->get_bounds().begin();
2213 p != m->get_bounds().end();
2215 dout(10) << " bound " << *p << dendl;
2216 import_bound_fragset[p->ino].insert(p->frag);
2219 // assimilate contents?
2220 if (!m->did_assim()) {
2221 dout(7) << "doing assim on " << *dir << dendl;
2222 m->mark_assim(); // only do this the first time!
2224 // change import state
2225 it->second.state = IMPORT_PREPPING;
2226 it->second.bound_ls = m->get_bounds();
2227 it->second.bystanders = m->get_bystanders();
2228 assert(g_conf->mds_kill_import_at != 3);
2231 dout(7) << "bystanders are " << it->second.bystanders << dendl;
// move the pin from the base inode (discover phase) to the base dirfrag
2234 diri->put(CInode::PIN_IMPORTING);
2235 dir->get(CDir::PIN_IMPORTING);
2236 dir->state_set(CDir::STATE_IMPORTING);
2238 // assimilate traces to exports
2239 // each trace is: df ('-' | ('f' dir | 'd') dentry inode (dir dentry inode)*)
2240 for (list<bufferlist>::iterator p = m->traces.begin();
2241 p != m->traces.end();
2243 bufferlist::iterator q = p->begin();
2248 dout(10) << " trace from " << df << " start " << start << " len " << p->length() << dendl;
2252 cur = cache->get_dirfrag(df);
2254 dout(10) << " had " << *cur << dendl;
2255 } else if (start == 'f') {
2256 CInode *in = cache->get_inode(df.ino);
2258 dout(10) << " had " << *in << dendl;
2259 cur = cache->add_replica_dir(q, in, oldauth, finished);
2260 dout(10) << " added " << *cur << dendl;
2261 } else if (start == '-') {
2264 assert(0 == "unrecognized start char");
// replicate dentry/inode/dir triples until the trace terminator
2266 while (start != '-') {
2267 CDentry *dn = cache->add_replica_dentry(q, cur, finished);
2268 dout(10) << " added " << *dn << dendl;
2269 CInode *in = cache->add_replica_inode(q, dn, finished);
2270 dout(10) << " added " << *in << dendl;
2273 cur = cache->add_replica_dir(q, in, oldauth, finished);
2274 dout(10) << " added " << *cur << dendl;
2278 // make bound sticky
2279 for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
2280 p != import_bound_fragset.end();
2282 CInode *in = cache->get_inode(p->first);
2284 in->get_stickydirs();
2285 dout(7) << " set stickydirs on bound inode " << *in << dendl;
2289 dout(7) << " not doing assim on " << *dir << dendl;
2292 if (!finished.empty())
2293 mds->queue_waiters(finished);
2296 bool success = true;
2297 if (mds->is_active()) {
// open all bound dirfrags; if any is remote, re-dispatch this message
// once it has been opened
2299 set<CDir*> import_bounds;
2300 for (map<inodeno_t,fragset_t>::iterator p = import_bound_fragset.begin();
2301 p != import_bound_fragset.end();
2303 CInode *in = cache->get_inode(p->first);
2306 // map fragset into a frag_t list, based on the inode fragtree
2307 list<frag_t> fglist;
2308 for (set<frag_t>::iterator q = p->second.begin(); q != p->second.end(); ++q)
2309 in->dirfragtree.get_leaves_under(*q, fglist);
2310 dout(10) << " bound inode " << p->first << " fragset " << p->second << " maps to " << fglist << dendl;
2312 for (list<frag_t>::iterator q = fglist.begin();
2315 CDir *bound = cache->get_dirfrag(dirfrag_t(p->first, *q));
2317 dout(7) << " opening bounding dirfrag " << *q << " on " << *in << dendl;
2318 cache->open_remote_dirfrag(in, *q,
2319 new C_MDS_RetryMessage(mds, m));
2323 if (!bound->state_test(CDir::STATE_IMPORTBOUND)) {
2324 dout(7) << " pinning import bound " << *bound << dendl;
2325 bound->get(CDir::PIN_IMPORTBOUND);
2326 bound->state_set(CDir::STATE_IMPORTBOUND);
2328 dout(7) << " already pinned import bound " << *bound << dendl;
2330 import_bounds.insert(bound);
2334 dout(7) << " all ready, noting auth and freezing import region" << dendl;
2336 if (!mds->mdcache->is_readonly() &&
2337 dir->get_inode()->filelock.can_wrlock(-1) &&
2338 dir->get_inode()->nestlock.can_wrlock(-1)) {
2339 it->second.mut = new MutationImpl();
2340 // force some locks. hacky.
2341 mds->locker->wrlock_force(&dir->inode->filelock, it->second.mut);
2342 mds->locker->wrlock_force(&dir->inode->nestlock, it->second.mut);
2344 // note that i am an ambiguous auth for this subtree.
2345 // specify bounds, since the exporter explicitly defines the region.
2346 cache->adjust_bounded_subtree_auth(dir, import_bounds,
2347 pair<int,int>(oldauth, mds->get_nodeid()));
2348 cache->verify_subtree_bounds(dir, import_bounds);
2350 dir->_freeze_tree();
2352 it->second.state = IMPORT_PREPPED;
2354 dout(7) << " couldn't acquire all needed locks, failing. " << *dir << dendl;
2358 dout(7) << " not active, failing. " << *dir << dendl;
2363 import_reverse_prepping(dir);
2366 dout(7) << " sending export_prep_ack on " << *dir << dendl;
2367 mds->send_message(new MExportDirPrepAck(dir->dirfrag(), success, m->get_tid()), m->get_connection());
2369 assert(g_conf->mds_kill_import_at != 4);
// Journal-completion context for the importer's EImportStart entry.
// When the entry is safe, finish() hands control back to
// Migrator::import_logged_start() together with the client map and
// session seq map decoded from the MExportDir message.
2377 class C_MDS_ImportDirLoggedStart : public MigratorLogContext {
// Client sessions carried along with the export; filled in by
// handle_export_dir() before the entry is submitted.
2382 map<client_t,entity_inst_t> imported_client_map;
2383 map<client_t,uint64_t> sseqmap;
2385 C_MDS_ImportDirLoggedStart(Migrator *m, CDir *d, mds_rank_t f) :
2386 MigratorLogContext(m), df(d->dirfrag()), dir(d), from(f) {
2388 void finish(int r) override {
2389 mig->import_logged_start(df, dir, from, imported_client_map, sseqmap);
2393 /* This function DOES put the passed message before returning*/
// Importer-side handler for MExportDir: the bulk metadata transfer.
// Decodes the exported dirfrags/dentries/inodes (and their caps) into
// the cache, journals an EImportStart, and advances the import state
// to IMPORT_LOGGINGSTART.  The ack to the exporter is sent later, from
// import_logged_start(), once the journal entry is safe.
2394 void Migrator::handle_export_dir(MExportDir *m)
// mds_kill_import_at is a testing failpoint; 5 == die on receipt of MExportDir.
2396 assert (g_conf->mds_kill_import_at != 5);
2397 CDir *dir = cache->get_dirfrag(m->dirfrag);
2400 mds_rank_t oldauth = mds_rank_t(m->get_source().num());
2401 dout(7) << "handle_export_dir importing " << *dir << " from " << oldauth << dendl;
2403 assert(!dir->is_auth());
// We must already have a prepped import record matching this tid/peer.
2405 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->dirfrag);
2406 assert(it != import_state.end());
2407 assert(it->second.state == IMPORT_PREPPED);
2408 assert(it->second.tid == m->get_tid());
2409 assert(it->second.peer == oldauth);
2411 utime_t now = ceph_clock_now();
// The imported frag must be a leaf of the inode's fragtree; force it if not.
2413 if (!dir->get_inode()->dirfragtree.is_leaf(dir->get_frag()))
2414 dir->get_inode()->dirfragtree.force_to_leaf(g_ceph_context, dir->get_frag());
2416 cache->show_subtrees();
2418 C_MDS_ImportDirLoggedStart *onlogged = new C_MDS_ImportDirLoggedStart(this, dir, oldauth);
2420 // start the journal entry
2421 EImportStart *le = new EImportStart(mds->mdlog, dir->dirfrag(), m->bounds, oldauth);
2422 mds->mdlog->start_entry(le);
2424 le->metablob.add_dir_context(dir);
2426 // adjust auth (list us _first_)
2427 cache->adjust_subtree_auth(dir, mds->get_nodeid(), oldauth);
2429 // new client sessions, open these after we journal
2430 // include imported sessions in EImportStart
2431 bufferlist::iterator cmp = m->client_map.begin();
2432 ::decode(onlogged->imported_client_map, cmp);
2434 le->cmapv = mds->server->prepare_force_open_sessions(onlogged->imported_client_map, onlogged->sseqmap);
2435 le->client_map.claim(m->client_map);
// Decode the exported metadata, one dirfrag at a time, accumulating
// journal state and the per-inode cap exports into the import record.
2437 bufferlist::iterator blp = m->export_data.begin();
2438 int num_imported_inodes = 0;
2439 while (!blp.end()) {
2440 num_imported_inodes +=
2441 decode_import_dir(blp,
2445 mds->mdlog->get_current_segment(),
2446 it->second.peer_exports,
2447 it->second.updated_scatterlocks,
2450 dout(10) << " " << m->bounds.size() << " imported bounds" << dendl;
2452 // include bounds in EImportStart
2453 set<CDir*> import_bounds;
2454 for (vector<dirfrag_t>::iterator p = m->bounds.begin();
2455 p != m->bounds.end();
2457 CDir *bd = cache->get_dirfrag(*p);
2459 le->metablob.add_dir(bd, false); // note that parent metadata is already in the event
2460 import_bounds.insert(bd);
2462 cache->verify_subtree_bounds(dir, import_bounds);
2464 // adjust popularity
2465 mds->balancer->add_import(dir, now);
2467 dout(7) << "handle_export_dir did " << *dir << dendl;
2470 it->second.state = IMPORT_LOGGINGSTART;
2471 assert (g_conf->mds_kill_import_at != 6);
// Flush immediately so the exporter isn't left waiting on our journal.
2474 mds->mdlog->submit_entry(le, onlogged);
2475 mds->mdlog->flush();
2479 mds->logger->inc(l_mds_imported);
2480 mds->logger->inc(l_mds_imported_inodes, num_imported_inodes);
2488 * this is an import helper
2489 * called by import_finish, import_reverse, and friends.
// Drop the pins/state taken while an import was in flight: the
// PIN_IMPORTING ref on the import root, the sticky-dir refs on bound
// inodes, and the IMPORTBOUND pins on the bounding dirfrags.  Which
// bounds are pinned depends on how far the import progressed
// (PREPPING pinned only some; >= PREPPED pinned them all).
2491 void Migrator::import_remove_pins(CDir *dir, set<CDir*>& bounds)
2493 import_state_t& stat = import_state[dir->dirfrag()];
2495 dir->put(CDir::PIN_IMPORTING);
2496 dir->state_clear(CDir::STATE_IMPORTING);
// Release the stickydirs ref taken per bound inode during prep;
// 'did' (declared above, not visible here) de-dups by ino.
2500 for (list<dirfrag_t>::iterator p = stat.bound_ls.begin();
2501 p != stat.bound_ls.end();
2503 if (did.count(p->ino))
2506 CInode *in = cache->get_inode(p->ino);
2508 in->put_stickydirs();
// While still PREPPING only a subset of bounds may be pinned, so
// check the state bit before unpinning each one.
2511 if (stat.state == IMPORT_PREPPING) {
2512 for (auto bd : bounds) {
2513 if (bd->state_test(CDir::STATE_IMPORTBOUND)) {
2514 bd->put(CDir::PIN_IMPORTBOUND);
2515 bd->state_clear(CDir::STATE_IMPORTBOUND);
2518 } else if (stat.state >= IMPORT_PREPPED) {
2519 // bounding dirfrags
// Once prepped, every bound must carry the pin — assert, then drop it.
2520 for (auto bd : bounds) {
2521 assert(bd->state_test(CDir::STATE_IMPORTBOUND));
2522 bd->put(CDir::PIN_IMPORTBOUND);
2523 bd->state_clear(CDir::STATE_IMPORTBOUND);
2530 * note: this does the full work of reversing an import and cleaning up
2532 * called by both handle_mds_failure and by handle_resolve (if we are
2533 * a survivor coping with an exporter failure+recovery).
// Undo a partially-completed import: revert subtree authority to the
// exporter, strip the AUTH bits we set on every dir/dentry/inode in
// the region, drop imported caps/sessions (if we got that far),
// journal the failure, and notify bystander MDSs while we sit in
// IMPORT_ABORTING.
2535 void Migrator::import_reverse(CDir *dir)
2537 dout(7) << "import_reverse " << *dir << dendl;
2539 import_state_t& stat = import_state[dir->dirfrag()];
2540 stat.state = IMPORT_ABORTING;
2543 cache->get_subtree_bounds(dir, bounds);
2546 import_remove_pins(dir, bounds);
2548 // update auth, with possible subtree merge.
2549 assert(dir->is_subtree_root());
// In resolve we can simply drop the non-auth subtree; otherwise hand
// authority back to the exporting peer.
2550 if (mds->is_resolve())
2551 cache->trim_non_auth_subtree(dir);
2553 cache->adjust_subtree_auth(dir, stat.peer);
// Collect waiters (scatter_nudge etc.) to be kicked once the tree unfreezes.
2555 C_ContextsBase<MDSInternalContextBase, MDSInternalContextGather> *fin = new C_ContextsBase<MDSInternalContextBase, MDSInternalContextGather>(g_ceph_context);
2556 if (!dir->get_inode()->is_auth() &&
2557 !dir->get_inode()->has_subtree_root_dirfrag(mds->get_nodeid())) {
2558 dir->get_inode()->clear_scatter_dirty();
2559 // wake up scatter_nudge waiters
2560 dir->get_inode()->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
2563 int num_dentries = 0;
2564 // adjust auth bits.
// Breadth-first walk of the imported region ('q' is declared in an
// elided line above), reverting each dirfrag and its contents to
// replica state with the export nonce.
2567 while (!q.empty()) {
2568 CDir *cur = q.front();
2572 assert(cur->is_auth());
2573 cur->state_clear(CDir::STATE_AUTH);
2574 cur->remove_bloom();
2575 cur->clear_replica_map();
2576 cur->set_replica_nonce(CDir::EXPORT_NONCE);
2577 if (cur->is_dirty())
2580 CDir::map_t::iterator it;
2581 for (it = cur->begin(); it != cur->end(); ++it) {
2582 CDentry *dn = it->second;
2585 dn->state_clear(CDentry::STATE_AUTH);
2586 dn->clear_replica_map();
2587 dn->set_replica_nonce(CDentry::EXPORT_NONCE);
2592 if (dn->get_linkage()->is_primary()) {
2593 CInode *in = dn->get_linkage()->get_inode();
// NOTE(review): CDentry::STATE_AUTH is applied to a CInode here; this
// is only correct if the two STATE_AUTH bit values coincide — confirm
// whether CInode::STATE_AUTH was intended.
2594 in->state_clear(CDentry::STATE_AUTH);
2595 in->clear_replica_map();
2596 in->set_replica_nonce(CInode::EXPORT_NONCE);
2599 in->clear_dirty_rstat();
2600 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
2601 in->clear_scatter_dirty();
2602 in->take_waiting(CInode::WAIT_ANY_MASK, fin->contexts);
2605 in->clear_dirty_parent();
// Discard any lock gather state accumulated while we thought we were auth.
2607 in->authlock.clear_gather();
2608 in->linklock.clear_gather();
2609 in->dirfragtreelock.clear_gather();
2610 in->filelock.clear_gather();
2612 in->clear_file_locks();
2614 // non-bounding dir?
// Recurse into child dirfrags that are inside the region (bounds stop the walk).
2616 in->get_dirfrags(dfs);
2617 for (list<CDir*>::iterator p = dfs.begin(); p != dfs.end(); ++p)
2618 if (bounds.count(*p) == 0)
2622 cache->touch_dentry_bottom(dn); // move dentry to tail of LRU
2627 dir->add_waiter(CDir::WAIT_UNFREEZE, fin);
// If we had reached ACKING we already instantiated imported caps and
// bumped session importing counts; roll both back.
2629 if (stat.state == IMPORT_ACKING) {
2630 // remove imported caps
2631 for (map<CInode*,map<client_t,Capability::Export> >::iterator p = stat.peer_exports.begin();
2632 p != stat.peer_exports.end();
2634 CInode *in = p->first;
2635 for (map<client_t,Capability::Export>::iterator q = p->second.begin();
2636 q != p->second.end();
2638 Capability *cap = in->get_client_cap(q->first);
2640 if (cap->is_importing())
2641 in->remove_client_cap(q->first);
2643 in->put(CInode::PIN_IMPORTINGCAPS);
2645 for (map<client_t,entity_inst_t>::iterator p = stat.client_map.begin();
2646 p != stat.client_map.end();
2648 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
2650 session->dec_importing();
2655 mds->mdlog->start_submit_entry(new EImportFinish(dir, false)); // log failure
2657 cache->trim(num_dentries); // try trimming dentries
2659 // notify bystanders; wait in aborting state
2660 import_notify_abort(dir, bounds);
// Tell every bystander MDS that the import completed: authority for
// the subtree changes (peer,me) -> (me,unknown), i.e. we are now the
// sole auth.  ack=false: no MExportDirNotifyAck is requested.
2663 void Migrator::import_notify_finish(CDir *dir, set<CDir*>& bounds)
2665 dout(7) << "import_notify_finish " << *dir << dendl;
2667 import_state_t& stat = import_state[dir->dirfrag()];
2668 for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
2669 p != stat.bystanders.end();
2671 MExportDirNotify *notify =
2672 new MExportDirNotify(dir->dirfrag(), stat.tid, false,
2673 pair<int,int>(stat.peer, mds->get_nodeid()),
2674 pair<int,int>(mds->get_nodeid(), CDIR_AUTH_UNKNOWN));
// Include the bounds so bystanders adjust exactly the same region.
2675 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
2676 notify->get_bounds().push_back((*i)->dirfrag());
2677 mds->send_message_mds(notify, *p);
// Tell every bystander MDS that the import is being aborted: authority
// reverts (peer,me) -> (peer,unknown).  ack=true: we wait for their
// MExportDirNotifyAcks before unfreezing.  Bystanders that are down or
// not yet active are dropped from the set instead of being messaged.
2681 void Migrator::import_notify_abort(CDir *dir, set<CDir*>& bounds)
2683 dout(7) << "import_notify_abort " << *dir << dendl;
2685 import_state_t& stat = import_state[dir->dirfrag()];
2686 for (set<mds_rank_t>::iterator p = stat.bystanders.begin();
2687 p != stat.bystanders.end(); ) {
2688 if (mds->is_cluster_degraded() &&
2689 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)) {
2690 // this can happen if both exporter and bystander fail in the same mdsmap epoch
// erase(p++) keeps the iterator valid while removing the current element.
2691 stat.bystanders.erase(p++);
2694 MExportDirNotify *notify =
2695 new MExportDirNotify(dir->dirfrag(), stat.tid, true,
2696 mds_authority_t(stat.peer, mds->get_nodeid()),
2697 mds_authority_t(stat.peer, CDIR_AUTH_UNKNOWN));
2698 for (set<CDir*>::iterator i = bounds.begin(); i != bounds.end(); ++i)
2699 notify->get_bounds().push_back((*i)->dirfrag());
2700 mds->send_message_mds(notify, *p);
// With no bystanders left there are no acks to wait for; finish the
// reversal immediately.
2703 if (stat.bystanders.empty()) {
2704 dout(7) << "no bystanders, finishing reverse now" << dendl;
2705 import_reverse_unfreeze(dir);
// Failpoint 10: die while aborting with bystander acks outstanding.
2707 assert (g_conf->mds_kill_import_at != 10);
// Final unfreeze step of an aborted import: discard cache-expire
// messages that were delayed while frozen, unfreeze the tree, attempt
// a subtree merge, then clean up the import record.
2711 void Migrator::import_reverse_unfreeze(CDir *dir)
2713 dout(7) << "import_reverse_unfreeze " << *dir << dendl;
2714 assert(!dir->is_auth());
2715 cache->discard_delayed_expire(dir);
2716 dir->unfreeze_tree();
2717 if (dir->is_subtree_root())
2718 cache->try_subtree_merge(dir);
2719 import_reverse_final(dir);
// Last step of import reversal: erase the import record, release any
// locks held by its mutation, and let pending resolves proceed.
2722 void Migrator::import_reverse_final(CDir *dir)
2724 dout(7) << "import_reverse_final " << *dir << dendl;
2727 map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
2728 assert(it != import_state.end());
// Keep the mutation ref alive past the erase so we can drop its locks.
2730 MutationRef mut = it->second.mut;
2731 import_state.erase(it);
2733 // send pending import_maps?
2734 mds->mdcache->maybe_send_pending_resolves();
2737 mds->locker->drop_locks(mut.get());
2741 cache->show_subtrees();
2742 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
// Called once the EImportStart entry is safe (via
// C_MDS_ImportDirLoggedStart).  Opens the imported client sessions,
// finishes cap import bookkeeping, and sends the MExportDirAck
// (carrying the imported cap map) back to the exporter; state moves
// IMPORT_LOGGINGSTART -> IMPORT_ACKING.
2748 void Migrator::import_logged_start(dirfrag_t df, CDir *dir, mds_rank_t from,
2749 map<client_t,entity_inst_t>& imported_client_map,
2750 map<client_t,uint64_t>& sseqmap)
// The import may have been reversed while we were journaling; if so,
// just close out the force-opened sessions and bail.
2752 map<dirfrag_t, import_state_t>::iterator it = import_state.find(dir->dirfrag());
2753 if (it == import_state.end() ||
2754 it->second.state != IMPORT_LOGGINGSTART) {
2755 dout(7) << "import " << df << " must have aborted" << dendl;
2756 mds->server->finish_force_open_sessions(imported_client_map, sseqmap);
2760 dout(7) << "import_logged " << *dir << dendl;
2763 it->second.state = IMPORT_ACKING;
2765 assert (g_conf->mds_kill_import_at != 7);
2767 // force open client sessions and finish cap import
2768 mds->server->finish_force_open_sessions(imported_client_map, sseqmap, false);
2769 it->second.client_map.swap(imported_client_map);
// Build the per-inode cap-import map that goes back to the exporter.
2771 map<inodeno_t,map<client_t,Capability::Import> > imported_caps;
2772 for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
2773 p != it->second.peer_exports.end();
2775 // parameter 'peer' is NONE, delay sending cap import messages to client
2776 finish_import_inode_caps(p->first, MDS_RANK_NONE, true, p->second, imported_caps[p->first->ino()]);
2779 // send notify's etc.
2780 dout(7) << "sending ack for " << *dir << " to old auth mds." << from << dendl;
2782 // test surviving observer of a failed migration that did not complete
2783 //assert(dir->replica_map.size() < 2 || mds->get_nodeid() != 0);
2785 MExportDirAck *ack = new MExportDirAck(dir->dirfrag(), it->second.tid);
2786 ::encode(imported_caps, ack->imported_caps);
2788 mds->send_message_mds(ack, from);
2789 assert (g_conf->mds_kill_import_at != 8);
2791 cache->show_subtrees();
2794 /* This function DOES put the passed message before returning*/
// Importer-side handler for MExportDirFinish from the exporter:
// validates the tid against our import record, then completes the
// import (notify=false — the exporter notifies bystanders itself).
2795 void Migrator::handle_export_finish(MExportDirFinish *m)
2797 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
2799 dout(7) << "handle_export_finish on " << *dir << (m->is_last() ? " last" : "") << dendl;
2801 map<dirfrag_t,import_state_t>::iterator it = import_state.find(m->get_dirfrag());
2802 assert(it != import_state.end());
2803 assert(it->second.tid == m->get_tid());
2805 import_finish(dir, false, m->is_last());
// Complete a successful import.  On the first (ACKING) pass this
// finalizes subtree authority, merges/activates the imported client
// caps, and decrements session importing counts; the final (last)
// pass journals EImportFinish(true), unpins/unfreezes the region,
// and erases the import record.
2810 void Migrator::import_finish(CDir *dir, bool notify, bool last)
2812 dout(7) << "import_finish on " << *dir << dendl;
2814 map<dirfrag_t,import_state_t>::iterator it = import_state.find(dir->dirfrag());
2815 assert(it != import_state.end());
2816 assert(it->second.state == IMPORT_ACKING || it->second.state == IMPORT_FINISHING);
// We are now the unambiguous auth for the subtree.
2818 if (it->second.state == IMPORT_ACKING) {
2819 assert(dir->is_auth());
2820 cache->adjust_subtree_auth(dir, mds->get_nodeid(), mds->get_nodeid());
2824 assert(g_conf->mds_kill_import_at != 9);
2826 if (it->second.state == IMPORT_ACKING) {
// Merge the exporter's cap state into our (importing) caps and issue
// the cap-import messages to clients.
2827 for (map<CInode*, map<client_t,Capability::Export> >::iterator p = it->second.peer_exports.begin();
2828 p != it->second.peer_exports.end();
2830 CInode *in = p->first;
2831 assert(in->is_auth());
2832 for (map<client_t,Capability::Export>::iterator q = p->second.begin();
2833 q != p->second.end();
2835 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(q->first.v));
2837 Capability *cap = in->get_client_cap(q->first);
2839 cap->merge(q->second, true);
2840 cap->clear_importing();
2841 mds->mdcache->do_cap_import(session, in, cap, q->second.cap_id, q->second.seq,
2842 q->second.mseq - 1, it->second.peer, CEPH_CAP_FLAG_AUTH);
2845 in->replica_caps_wanted = 0;
2847 for (map<client_t,entity_inst_t>::iterator p = it->second.client_map.begin();
2848 p != it->second.client_map.end();
2850 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->first.v));
2852 session->dec_importing();
// Not the last message yet: stay in FINISHING and wait for it.
2857 assert(it->second.state == IMPORT_ACKING);
2858 it->second.state = IMPORT_FINISHING;
2864 cache->get_subtree_bounds(dir, bounds);
// 'notify' path (guard elided in this view): bystanders are told the
// import completed when the exporter isn't doing it for us.
2867 import_notify_finish(dir, bounds);
2869 import_remove_pins(dir, bounds);
// Keep the peer_exports around past the erase so we can re-eval caps below.
2871 map<CInode*, map<client_t,Capability::Export> > peer_exports;
2872 it->second.peer_exports.swap(peer_exports);
2874 // clear import state (we're done!)
2875 MutationRef mut = it->second.mut;
2876 import_state.erase(it);
2878 mds->mdlog->start_submit_entry(new EImportFinish(dir, true));
2880 // process delayed expires
2881 cache->process_delayed_expire(dir);
2883 // unfreeze tree, with possible subtree merge.
2884 dir->unfreeze_tree();
2885 cache->try_subtree_merge(dir);
2887 cache->show_subtrees();
2888 //audit(); // this fails, bc we munge up the subtree map during handle_import_map (resolve phase)
2891 mds->locker->drop_locks(mut.get());
2895 // re-eval imported caps
2896 for (map<CInode*, map<client_t,Capability::Export> >::iterator p = peer_exports.begin();
2897 p != peer_exports.end();
2899 if (p->first->is_auth())
2900 mds->locker->eval(p->first, CEPH_CAP_LOCKS, true);
2901 p->first->put(CInode::PIN_IMPORTINGCAPS);
2904 // send pending import_maps?
2905 mds->mdcache->maybe_send_pending_resolves();
2907 // did i just import mydir?
2908 if (dir->ino() == MDS_INO_MDSDIR(mds->get_nodeid()))
2909 cache->populate_mydir();
// An empty dirfrag under a non-auth parent can be exported right back.
2912 if (dir->get_num_head_items() == 0 &&
2913 !dir->inode->is_auth()) {
2915 export_empty_import(dir);
// Decode one exported inode under dentry 'dn': fetch-or-create the
// CInode, decode its state and cap exports, link it as the dentry's
// primary inode if needed, and re-mark dirty rstat/scatterlocks so
// they are journaled once the import actually finishes.
2920 void Migrator::decode_import_inode(CDentry *dn, bufferlist::iterator& blp,
2921 mds_rank_t oldauth, LogSegment *ls,
2922 map<CInode*, map<client_t,Capability::Export> >& peer_exports,
2923 list<ScatterLock*>& updated_scatterlocks)
2925 dout(15) << "decode_import_inode on " << *dn << dendl;
2930 ::decode(last, blp);
// Reuse an existing cache entry for (ino,last) if we have one.
2933 CInode *in = cache->get_inode(ino, last);
2935 in = new CInode(mds->mdcache, true, 1, last);
2939 // state after link -- or not! -sage
2940 in->decode_import(blp, ls); // cap imports are noted for later action
2943 decode_import_inode_caps(in, true, blp, peer_exports);
2945 // link before state -- or not! -sage
2946 if (dn->get_linkage()->get_inode() != in) {
2947 assert(!dn->get_linkage()->get_inode());
2948 dn->dir->link_primary_inode(dn, in);
2953 cache->add_inode(in);
2954 dout(10) << "added " << *in << dendl;
2956 dout(10) << " had " << *in << dendl;
2959 if (in->inode.is_dirty_rstat())
2960 in->mark_dirty_rstat();
2962 // clear if dirtyscattered, since we're going to journal this
2963 // but not until we _actually_ finish the import...
2964 if (in->filelock.is_dirty()) {
2965 updated_scatterlocks.push_back(&in->filelock);
2966 mds->locker->mark_updated_scatterlock(&in->filelock);
2969 if (in->dirfragtreelock.is_dirty()) {
2970 updated_scatterlocks.push_back(&in->dirfragtreelock);
2971 mds->locker->mark_updated_scatterlock(&in->dirfragtreelock);
2974 // adjust replica list
2975 //assert(!in->is_replica(oldauth)); // not true on failed export
// The old auth keeps a replica (with export nonce); we stop being one.
2976 in->add_replica(oldauth, CInode::EXPORT_NONCE);
2977 if (in->is_replica(mds->get_nodeid()))
2978 in->remove_replica(mds->get_nodeid());
// Decode the per-client cap exports (and, for the auth cap,
// mds_caps_wanted) for one inode.  If anything was exported the inode
// is pinned with PIN_IMPORTINGCAPS and recorded in peer_exports for
// finish_import_inode_caps() to process later.
2981 void Migrator::decode_import_inode_caps(CInode *in, bool auth_cap,
2982 bufferlist::iterator &blp,
2983 map<CInode*, map<client_t,Capability::Export> >& peer_exports)
2985 map<client_t,Capability::Export> cap_map;
2986 ::decode(cap_map, blp);
2988 ::decode(in->get_mds_caps_wanted(), blp);
2989 if (!cap_map.empty() ||
2990 (auth_cap && !in->get_mds_caps_wanted().empty())) {
2991 peer_exports[in].swap(cap_map);
2992 in->get(CInode::PIN_IMPORTINGCAPS);
// Turn the decoded cap exports for one inode into live Capability
// objects: create/merge the per-client caps, record the resulting
// Import entries in import_map, and (when 'peer' is a real rank) send
// the cap-import messages to clients via do_cap_import().  Finally
// drops the PIN_IMPORTINGCAPS ref taken in decode_import_inode_caps().
2996 void Migrator::finish_import_inode_caps(CInode *in, mds_rank_t peer, bool auth_cap,
2997 map<client_t,Capability::Export> &export_map,
2998 map<client_t,Capability::Import> &import_map)
3000 for (map<client_t,Capability::Export>::iterator it = export_map.begin();
3001 it != export_map.end();
3003 dout(10) << "finish_import_inode_caps for client." << it->first << " on " << *in << dendl;
3004 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(it->first.v));
// Create the cap if the client didn't already have one on this inode.
3007 Capability *cap = in->get_client_cap(it->first);
3009 cap = in->add_client_cap(it->first, session);
3011 cap->mark_importing();
// Record what we imported so the exporter/client can be told.
3014 Capability::Import& im = import_map[it->first];
3015 im.cap_id = cap->get_cap_id();
3016 im.mseq = auth_cap ? it->second.mseq : cap->get_mseq();
3017 im.issue_seq = cap->get_last_seq() + 1;
3020 cap->merge(it->second, auth_cap);
3021 mds->mdcache->do_cap_import(session, in, cap, it->second.cap_id,
3022 it->second.seq, it->second.mseq - 1, peer,
3023 auth_cap ? CEPH_CAP_FLAG_AUTH : CEPH_CAP_FLAG_RELEASE);
3028 in->replica_caps_wanted = 0;
3029 in->put(CInode::PIN_IMPORTINGCAPS);
// Decode one exported dirfrag from the bulk MExportDir payload:
// decode the CDir itself, fix up its replica list, add it to the
// EImportStart metablob, move its waiters onto the import root, then
// decode each dentry and its linkage (N = null, L = remote link,
// I = primary inode).  Returns the number of items imported.
3033 int Migrator::decode_import_dir(bufferlist::iterator& blp,
3038 map<CInode*,map<client_t,Capability::Export> >& peer_exports,
3039 list<ScatterLock*>& updated_scatterlocks, utime_t now)
// 'df' (the dirfrag id) is decoded in an elided line above.
3045 CInode *diri = cache->get_inode(df.ino);
3047 CDir *dir = diri->get_or_open_dirfrag(mds->mdcache, df.frag);
3050 dout(7) << "decode_import_dir " << *dir << dendl;
3053 dir->decode_import(blp, now, ls);
3055 // adjust replica list
3056 //assert(!dir->is_replica(oldauth)); // not true on failed export
3057 dir->add_replica(oldauth, CDir::EXPORT_NONCE);
3058 if (dir->is_replica(mds->get_nodeid()))
3059 dir->remove_replica(mds->get_nodeid());
3061 // add to journal entry
3063 le->metablob.add_import_dir(dir);
3065 int num_imported = 0;
3067 // take all waiters on this dir
3068 // NOTE: a pass of imported data is guaranteed to get all of my waiters because
3069 // a replica's presence in my cache implies/forces its presence in authority's.
3070 list<MDSInternalContextBase*> waiters;
3072 dir->take_waiting(CDir::WAIT_ANY_MASK, waiters);
3073 for (list<MDSInternalContextBase*>::iterator it = waiters.begin();
3074 it != waiters.end();
3076 import_root->add_waiter(CDir::WAIT_UNFREEZE, *it); // UNFREEZE will get kicked both on success or failure
3078 dout(15) << "doing contents" << dendl;
// nden = number of dentries serialized for this dirfrag.
3082 ::decode(nden, blp);
3084 for (; nden>0; nden--) {
3090 ::decode(dname, blp);
3091 ::decode(last, blp);
3093 CDentry *dn = dir->lookup_exact_snap(dname, last);
3095 dn = dir->add_null_dentry(dname, 1, last);
3097 dn->decode_import(blp, ls);
3099 dn->add_replica(oldauth, CDentry::EXPORT_NONCE);
3100 if (dn->is_replica(mds->get_nodeid()))
3101 dn->remove_replica(mds->get_nodeid());
3103 // dentry lock in unreadable state can block path traverse
3104 if (dn->lock.get_state() != LOCK_SYNC)
3105 mds->locker->try_eval(&dn->lock, NULL);
3107 dout(15) << "decode_import_dir got " << *dn << dendl;
// Linkage type code: 'N' null, 'L' remote hard link, 'I' primary inode.
3111 ::decode(icode, blp);
3115 assert(dn->get_linkage()->is_null());
3119 else if (icode == 'L') {
3122 unsigned char d_type;
3124 ::decode(d_type, blp);
3125 if (dn->get_linkage()->is_remote()) {
3126 assert(dn->get_linkage()->get_remote_ino() == ino);
3128 dir->link_remote_inode(dn, ino, d_type);
3131 else if (icode == 'I') {
3134 decode_import_inode(dn, blp, oldauth, ls,
3135 peer_exports, updated_scatterlocks);
3138 // add dentry to journal entry
3140 le->metablob.add_import_dentry(dn);
3143 #ifdef MDS_VERIFY_FRAGSTAT
3144 if (dir->is_complete())
3145 dir->verify_fragstat();
3148 dir->inode->maybe_export_pin();
3150 dout(7) << "decode_import_dir done " << *dir << dendl;
3151 return num_imported;
3158 // authority bystander
3160 /* This function DOES put the passed message before returning*/
// Bystander-side handler for MExportDirNotify: record the subtree's
// authority change in our cache (if we have the dirfrag) and, when the
// sender asked for it, reply with an MExportDirNotifyAck.
3161 void Migrator::handle_export_notify(MExportDirNotify *m)
// Ignore notifies until we are in a state where the cache is usable.
3163 if (!(mds->is_clientreplay() || mds->is_active() || mds->is_stopping())) {
3168 CDir *dir = cache->get_dirfrag(m->get_dirfrag());
3170 mds_rank_t from = mds_rank_t(m->get_source().num());
3171 mds_authority_t old_auth = m->get_old_auth();
3172 mds_authority_t new_auth = m->get_new_auth();
3175 dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
3176 << " on missing dir " << m->get_dirfrag() << dendl;
3177 } else if (dir->authority() != old_auth) {
3178 dout(7) << "handle_export_notify old_auth was " << dir->authority()
3179 << " != " << old_auth << " -> " << new_auth
3180 << " on " << *dir << dendl;
3182 dout(7) << "handle_export_notify " << old_auth << " -> " << new_auth
3183 << " on " << *dir << dendl;
// Apply the change over exactly the bounded region the sender named.
3186 cache->map_dirfrag_set(m->get_bounds(), have);
3187 cache->adjust_bounded_subtree_auth(dir, have, new_auth);
3190 cache->try_subtree_merge(dir);
3194 if (m->wants_ack()) {
3195 mds->send_message_mds(new MExportDirNotifyAck(m->get_dirfrag(), m->get_tid(), m->get_new_auth()), from);
3198 dout(7) << "handle_export_notify no ack requested" << dendl;
// Push all client caps on a non-auth inode to its authoritative MDS
// via an MExportCaps message (cap migration without a subtree export).
3205 void Migrator::export_caps(CInode *in)
3207 mds_rank_t dest = in->authority().first;
3208 dout(7) << "export_caps to mds." << dest << " " << *in << dendl;
// Only valid on a stable replica that has caps and isn't already exporting them.
3210 assert(in->is_any_caps());
3211 assert(!in->is_auth());
3212 assert(!in->is_ambiguous_auth());
3213 assert(!in->state_test(CInode::STATE_EXPORTINGCAPS));
3215 MExportCaps *ex = new MExportCaps;
3216 ex->ino = in->ino();
3218 encode_export_inode_caps(in, false, ex->cap_bl, ex->client_map);
3220 mds->send_message_mds(ex, dest);
// Handler for MGatherCaps: the auth MDS is asking us to hand over the
// caps we hold for m->ino.  If we have the inode and it qualifies
// (has caps, non-auth, unambiguous, not already exporting), export
// them (call elided in this view).
3223 void Migrator::handle_gather_caps(MGatherCaps *m)
3225 CInode *in = cache->get_inode(m->ino);
3230 dout(10) << "handle_gather_caps " << *m << " from " << m->get_source()
3233 if (in->is_any_caps() &&
3235 !in->is_ambiguous_auth() &&
3236 !in->state_test(CInode::STATE_EXPORTINGCAPS))
// Journal-completion context for the ESessions entry written by
// handle_export_caps(); once safe, finishes the cap import via
// Migrator::logged_import_caps().
3243 class C_M_LoggedImportCaps : public MigratorLogContext {
3247 map<CInode*, map<client_t,Capability::Export> > peer_exports;
3248 map<client_t,entity_inst_t> client_map;
3249 map<client_t,uint64_t> sseqmap;
3251 C_M_LoggedImportCaps(Migrator *m, CInode *i, mds_rank_t f) : MigratorLogContext(m), in(i), from(f) {}
3252 void finish(int r) override {
3253 mig->logged_import_caps(in, from, peer_exports, client_map, sseqmap);
3257 /* This function DOES put the passed message before returning*/
// Auth-side handler for MExportCaps: decode the caps a replica is
// handing us, force-open the owning client sessions, and journal an
// ESessions entry; C_M_LoggedImportCaps completes the import once the
// entry is safe.
3258 void Migrator::handle_export_caps(MExportCaps *ex)
3260 dout(10) << "handle_export_caps " << *ex << " from " << ex->get_source() << dendl;
3261 CInode *in = cache->get_inode(ex->ino);
3264 assert(in->is_auth());
// If the inode can't be auth-pinned right now, bail (handling elided here).
3267 if (!in->can_auth_pin())
3271 C_M_LoggedImportCaps *finish = new C_M_LoggedImportCaps(
3272 this, in, mds_rank_t(ex->get_source().num()));
3273 finish->client_map = ex->client_map;
3276 bufferlist::iterator blp = ex->cap_bl.begin();
3277 decode_import_inode_caps(in, false, blp, finish->peer_exports);
3278 assert(!finish->peer_exports.empty()); // thus, inode is pinned.
3280 // journal open client sessions
3281 version_t pv = mds->server->prepare_force_open_sessions(finish->client_map, finish->sseqmap);
3283 ESessions *le = new ESessions(pv, ex->client_map);
3284 mds->mdlog->start_submit_entry(le, finish);
3285 mds->mdlog->flush();
3291 void Migrator::logged_import_caps(CInode *in,
3293 map<CInode*, map<client_t,Capability::Export> >& peer_exports,
3294 map<client_t,entity_inst_t>& client_map,
3295 map<client_t,uint64_t>& sseqmap)
3297 dout(10) << "logged_import_caps on " << *in << dendl;
3298 // see export_go() vs export_go_synced()
3299 assert(in->is_auth());
3301 // force open client sessions and finish cap import
3302 mds->server->finish_force_open_sessions(client_map, sseqmap);
3304 map<client_t,Capability::Import> imported_caps;
3306 assert(peer_exports.count(in));
3307 // clients will release caps from the exporter when they receive the cap import message.
3308 finish_import_inode_caps(in, from, false, peer_exports[in], imported_caps);
3309 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
3310 in->auth_unpin(this);