1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2015 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 #include "common/perf_counters.h"
18 #include "mds/MDSRank.h"
19 #include "mds/MDCache.h"
20 #include "mds/MDLog.h"
22 #include "mds/CDentry.h"
23 #include "events/EUpdate.h"
24 #include "messages/MClientRequest.h"
26 #include "StrayManager.h"
28 #define dout_context g_ceph_context
29 #define dout_subsys ceph_subsys_mds
31 #define dout_prefix _prefix(_dout, mds)
// Log-prefix helper referenced by the dout_prefix macro in this file.
// Writes "mds.<nodeid>.cache.strays " (nodeid from mds->get_nodeid()) to
// *_dout and returns that same stream so further output can be chained.
32 static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
33 return *_dout << "mds." << mds->get_nodeid() << ".cache.strays ";
// Base class for StrayManager completions derived from MDSIOContextBase.
// The constructor stores the owning StrayManager in `sm`; get_mds() is
// overridden so the base class can obtain the MDSRank.
36 class StrayManagerIOContext : public virtual MDSIOContextBase {
39 MDSRank *get_mds() override
44 explicit StrayManagerIOContext(StrayManager *sm_) : sm(sm_) {}
// Base class for StrayManager completions derived from MDSLogContextBase
// (used below for the contexts passed to mdlog->submit_entry()).
// The constructor stores the owning StrayManager in `sm`; get_mds() is
// overridden so the base class can obtain the MDSRank.
47 class StrayManagerLogContext : public virtual MDSLogContextBase {
50 MDSRank *get_mds() override
55 explicit StrayManagerLogContext(StrayManager *sm_) : sm(sm_) {}
// Base class for StrayManager completions derived from
// MDSInternalContextBase (used below for snaprealm and waiter callbacks).
// The constructor stores the owning StrayManager in `sm`; get_mds() is
// overridden so the base class can obtain the MDSRank.
58 class StrayManagerContext : public virtual MDSInternalContextBase {
61 MDSRank *get_mds() override
66 explicit StrayManagerContext(StrayManager *sm_) : sm(sm_) {}
// Completion handed to purge_queue.push() by purge() and truncate().
// finish() asserts that the result is 0 or -ENOENT and then forwards to
// StrayManager::_purge_stray_purged(dn, only_head).
//   dn        - the stray dentry whose purge item was queued
//   only_head - forwarded unchanged to _purge_stray_purged()
71 * Context wrapper for _purge_stray_purged completion
73 class C_IO_PurgeStrayPurged : public StrayManagerIOContext {
77 C_IO_PurgeStrayPurged(StrayManager *sm_, CDentry *d, bool oh) :
78 StrayManagerIOContext(sm_), dn(d), only_head(oh) { }
79 void finish(int r) override {
80 assert(r == 0 || r == -ENOENT);
81 sm->_purge_stray_purged(dn, only_head);
// Queue the inode linked (via projected linkage) from stray dentry `dn` for
// purging.  Fills in a purge item (ino, action, fragtree or layout/old_pools
// as assigned below) and pushes it onto purge_queue with a
// C_IO_PurgeStrayPurged completion.
// Precondition (asserted): dn is not replicated.
86 void StrayManager::purge(CDentry *dn)
88 CDentry::linkage_t *dnl = dn->get_projected_linkage();
89 CInode *in = dnl->get_inode();
90 dout(10) << __func__ << " " << *dn << " " << *in << dendl;
91 assert(!dn->is_replicated());
93 // CHEAT. there's no real need to journal our intent to purge, since
94 // that is implicit in the dentry's presence and non-use in the stray
95 // dir. on recovery, we'll need to re-eval all strays anyway.
97 SnapContext nullsnapc;
100 item.ino = in->inode.ino;
102 item.action = PurgeItem::PURGE_DIR;
103 item.fragtree = in->dirfragtree;
105 item.action = PurgeItem::PURGE_FILE;
// Snap context comes from the inode's snaprealm when find_snaprealm()
// yields one; the log message below indicates a null context is used
// otherwise.
107 const SnapContext *snapc;
108 SnapRealm *realm = in->find_snaprealm();
110 dout(10) << " realm " << *realm << dendl;
111 snapc = &realm->get_snap_context();
113 dout(10) << " NO realm, using null context" << dendl;
115 assert(in->last == CEPH_NOSNAP);
// `to` becomes the largest of get_max_size(), size and max_size_ever.
120 to = in->inode.get_max_size();
121 to = MAX(in->inode.size, to);
122 // when truncating a file, the filer does not delete stripe objects that are
123 // truncated to zero. so we need to purge stripe objects up to the max size
124 // the file has ever been.
125 to = MAX(in->inode.max_size_ever, to);
// Layout and old_pools are read from the projected inode.
128 inode_t *pi = in->get_projected_inode();
131 item.layout = pi->layout;
132 item.old_pools = pi->old_pools;
136 purge_queue.push(item, new C_IO_PurgeStrayPurged(
// Journal completion for the "purge_stray" EUpdate submitted by
// _purge_stray_purged().  finish() forwards the stored dentry, projected
// dentry version and log segment to StrayManager::_purge_stray_logged().
// The result code `r` is not used by the visible code.
140 class C_PurgeStrayLogged : public StrayManagerLogContext {
145 C_PurgeStrayLogged(StrayManager *sm_, CDentry *d, version_t v, LogSegment *s) :
146 StrayManagerLogContext(sm_), dn(d), pdv(v), ls(s) { }
147 void finish(int r) override {
148 sm->_purge_stray_logged(dn, pdv, ls);
// Journal completion for the "purge_stray truncate" EUpdate submitted by
// _purge_stray_purged().  finish() forwards the stored dentry and log
// segment to StrayManager::_truncate_stray_logged().
// The result code `r` is not used by the visible code.
152 class C_TruncateStrayLogged : public StrayManagerLogContext {
156 C_TruncateStrayLogged(StrayManager *sm, CDentry *d, LogSegment *s) :
157 StrayManagerLogContext(sm), dn(d), ls(s) { }
158 void finish(int r) override {
159 sm->_truncate_stray_logged(dn, ls);
// Invoked from C_IO_PurgeStrayPurged::finish() once the purge-queue item for
// `dn` has completed.  Updates the enqueue counters and journals the result.
// Two journal paths appear below:
//   - "purge_stray truncate": projects the inode, resets size-related fields
//     and submits an EUpdate completed by C_TruncateStrayLogged.
//   - "purge_stray": projects a null linkage on dn, adjusts the dirfrag's
//     fragstat/rstat, records the inode as destroyed and submits an EUpdate
//     completed by C_PurgeStrayLogged.
// NOTE(review): presumably `only_head` selects the truncate path - confirm.
163 void StrayManager::_purge_stray_purged(
164 CDentry *dn, bool only_head)
166 CInode *in = dn->get_projected_linkage()->get_inode();
167 dout(10) << "_purge_stray_purged " << *dn << " " << *in << dendl;
// The item is no longer "enqueuing": count it as enqueued and publish the
// decremented in-flight counter.
169 logger->inc(l_mdc_strays_enqueued);
170 num_strays_enqueuing--;
171 logger->set(l_mdc_num_strays_enqueuing, num_strays_enqueuing);
174 /* This was a ::truncate */
175 EUpdate *le = new EUpdate(mds->mdlog, "purge_stray truncate");
176 mds->mdlog->start_entry(le);
// Reset size bookkeeping on the projected inode and give it a new version.
178 inode_t *pi = in->project_inode();
180 pi->max_size_ever = 0;
181 pi->client_ranges.clear();
182 pi->truncate_size = 0;
183 pi->truncate_from = 0;
184 pi->version = in->pre_dirty();
186 le->metablob.add_dir_context(dn->dir);
187 le->metablob.add_primary_dentry(dn, in, true);
189 mds->mdlog->submit_entry(le,
190 new C_TruncateStrayLogged(
191 this, dn, mds->mdlog->get_current_segment()));
// Sanity check: the inode may only hold its dirty ref, and the dentry only
// its dirty ref, one ref when the inode is itself referenced, and
// PIN_PURGING.  Anything else is treated as a fatal rogue reference.
193 if (in->get_num_ref() != (int)in->is_dirty() ||
194 dn->get_num_ref() != (int)dn->is_dirty() + !!in->get_num_ref() + 1/*PIN_PURGING*/) {
195 // Nobody should be taking new references to an inode when it
196 // is being purged (aside from it were
198 derr << "Rogue reference after purge to " << *dn << dendl;
199 assert(0 == "rogue reference to purging inode");
203 version_t pdv = dn->pre_dirty();
204 dn->push_projected_linkage(); // NULL
206 EUpdate *le = new EUpdate(mds->mdlog, "purge_stray");
207 mds->mdlog->start_entry(le);
209 // update dirfrag fragstat, rstat
210 CDir *dir = dn->get_dir();
211 fnode_t *pf = dir->project_fnode();
212 pf->version = dir->pre_dirty();
214 pf->fragstat.nsubdirs--;
216 pf->fragstat.nfiles--;
217 pf->rstat.sub(in->inode.accounted_rstat);
// Journal the dentry as null and the inode as destroyed.
219 le->metablob.add_dir_context(dn->dir);
220 EMetaBlob::dirlump& dl = le->metablob.add_dir(dn->dir, true);
221 le->metablob.add_null_dentry(dl, dn, true);
222 le->metablob.add_destroyed_inode(in->ino());
224 mds->mdlog->submit_entry(le, new C_PurgeStrayLogged(this, dn, pdv,
225 mds->mdlog->get_current_segment()));
227 logger->set(l_mdc_num_strays, num_strays);
// Invoked from C_PurgeStrayLogged::finish() after the "purge_stray" EUpdate
// has been journaled.  Applies the projected changes in memory: unlinks the
// inode from dn, pops the projected (null) linkage, marks dn dirty at
// version `pdv` in segment `ls`, pops and dirties the projected fnode,
// clears the orphan/purging state bits, drops PIN_PURGING and finally
// removes the inode from the cache.
//   dn  - the stray dentry being purged
//   pdv - dentry version passed to dn->mark_dirty()
//   ls  - log segment the dirty state is attached to
231 void StrayManager::_purge_stray_logged(CDentry *dn, version_t pdv, LogSegment *ls)
233 CInode *in = dn->get_linkage()->get_inode();
234 dout(10) << "_purge_stray_logged " << *dn << " " << *in << dendl;
236 assert(!in->state_test(CInode::STATE_RECOVERING));
// Captured before the dentry is modified; reused in the unlink call below.
238 bool new_dn = dn->is_new();
241 assert(dn->get_projected_linkage()->is_null());
242 dn->dir->unlink_inode(dn, !new_dn);
243 dn->pop_projected_linkage();
244 dn->mark_dirty(pdv, ls);
246 dn->dir->pop_and_dirty_projected_fnode(ls);
248 in->state_clear(CInode::STATE_ORPHAN);
249 dn->state_clear(CDentry::STATE_PURGING | CDentry::STATE_PURGINGPINNED);
250 dn->put(CDentry::PIN_PURGING);
254 dout(20) << " dn is new, removing" << dendl;
256 dn->dir->remove_dentry(dn);
262 in->mdcache->remove_inode(in);
// Begin purging (or truncating) the stray dentry `dn`.  Marks the dentry and
// its inode as PURGING, clears the inode's dirty-parent flag, takes
// PIN_PURGING on the dentry (once, guarded by STATE_PURGINGPINNED) and bumps
// the num_strays_enqueuing counter.
//   dn    - stray dentry to enqueue
//   trunc - the caller in _eval_stray passes true to "truncate head objects"
265 void StrayManager::enqueue(CDentry *dn, bool trunc)
267 CDentry::linkage_t *dnl = dn->get_projected_linkage();
269 CInode *in = dnl->get_inode();
272 /* We consider a stray to be purging as soon as it is enqueued, to avoid
273 * enqueuing it twice */
274 dn->state_set(CDentry::STATE_PURGING);
275 in->state_set(CInode::STATE_PURGING);
277 /* We must clear this as soon as enqueuing it, to prevent the journal
278 * expiry code from seeing a dirty parent and trying to write a backtrace */
280 if (in->is_dirty_parent()) {
281 in->clear_dirty_parent();
285 dout(20) << __func__ << ": purging dn: " << *dn << dendl;
// Take the purging pin only once; STATE_PURGINGPINNED records that it is held.
287 if (!dn->state_test(CDentry::STATE_PURGINGPINNED)) {
288 dn->get(CDentry::PIN_PURGING);
289 dn->state_set(CDentry::STATE_PURGINGPINNED);
292 ++num_strays_enqueuing;
293 logger->set(l_mdc_num_strays_enqueuing, num_strays_enqueuing);
295 // Resources are available, acquire them and execute the purge
298 dout(10) << __func__ << ": purging this dentry immediately: "
// Completion passed to SnapRealm::open_parents() by _enqueue().  finish()
// calls StrayManager::_enqueue(dn, trunc) again with the stored arguments.
// The result code `r` is not used by the visible code.
302 class C_OpenSnapParents : public StrayManagerContext {
306 C_OpenSnapParents(StrayManager *sm_, CDentry *dn_, bool t) :
307 StrayManagerContext(sm_), dn(dn_), trunc(t) { }
308 void finish(int r) override {
309 sm->_enqueue(dn, trunc);
// Follow-up stage of enqueue().  When the inode's snaprealm does not have its
// past parents open, open_parents() is given a C_OpenSnapParents completion,
// which calls _enqueue(dn, trunc) again when it finishes.
//   dn    - stray dentry being enqueued
//   trunc - carried through to the retry unchanged
313 void StrayManager::_enqueue(CDentry *dn, bool trunc)
317 CInode *in = dn->get_linkage()->get_inode();
319 !in->snaprealm->have_past_parents_open() &&
320 !in->snaprealm->open_parents(new C_OpenSnapParents(this, dn, trunc))) {
321 // this can happen if the dentry had been trimmed from cache.
// Re-evaluate the dentries on the delayed_eval_stray list.  Each visited
// dentry is taken off the list and num_strays_delayed is decremented; a
// dentry whose projected linkage has gone null is only logged, otherwise
// eval_stray() is run on it again.  The delayed counter is published to the
// perf logger at the end.
333 void StrayManager::advance_delayed()
338 for (elist<CDentry*>::iterator p = delayed_eval_stray.begin(); !p.end(); ) {
341 dn->item_stray.remove_myself();
342 num_strays_delayed--;
344 if (dn->get_projected_linkage()->is_null()) {
345 /* A special case: a stray dentry can go null if its inode is being
346 * re-linked into another MDS's stray dir during a shutdown migration. */
347 dout(4) << __func__ << ": delayed dentry is now null: " << *dn << dendl;
351 const bool purging = eval_stray(dn);
353 derr << "Dentry " << *dn << " was purgeable but no longer is!" << dendl;
355 * This can happen if a stray is purgeable, but has gained an extra
356 * reference by virtue of having its backtrace updated.
357 * FIXME perhaps we could simplify this further by
358 * avoiding writing the backtrace of purge-ready strays, so
359 * that this code could be more rigid?
363 logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
// Publishes num_strays to the l_mdc_num_strays perf counter.
// NOTE(review): presumably `num` is stored into num_strays before it is
// published - confirm.
366 void StrayManager::set_num_strays(uint64_t num)
370 logger->set(l_mdc_num_strays, num_strays);
// Notification that a stray has been created: publishes num_strays to the
// l_mdc_num_strays perf counter and increments l_mdc_strays_created.
// NOTE(review): presumably num_strays itself is incremented first - confirm.
373 void StrayManager::notify_stray_created()
376 logger->set(l_mdc_num_strays, num_strays);
377 logger->inc(l_mdc_strays_created);
// Notification that a stray has been removed: publishes num_strays to the
// l_mdc_num_strays perf counter.
// NOTE(review): presumably num_strays itself is decremented first - confirm.
380 void StrayManager::notify_stray_removed()
383 logger->set(l_mdc_num_strays, num_strays);
// StrayManagerContext that carries a single dentry `dn`.
// NOTE(review): going by the name, finish() presumably re-runs stray
// evaluation on dn - confirm against the finish() body.
386 struct C_EvalStray : public StrayManagerContext {
388 C_EvalStray(StrayManager *sm_, CDentry *d) : StrayManagerContext(sm_), dn(d) {}
389 void finish(int r) override {
// StrayManagerContext that carries a single dentry `dn`; _eval_stray() hands
// an instance to SnapRealm::open_parents().
// NOTE(review): finish() presumably re-runs stray evaluation on dn once the
// snaprealm parents are open - confirm against the finish() body.
394 struct C_MDC_EvalStray : public StrayManagerContext {
396 C_MDC_EvalStray(StrayManager *sm_, CDentry *d) : StrayManagerContext(sm_), dn(d) {}
397 void finish(int r) override {
// Decide what to do with the stray dentry `dn`.
// Preconditions (asserted): dn's projected linkage is primary, the inode is
// not STATE_REJOINUNDEF, dn lives in a stray directory, and dn is not
// already STATE_PURGING.
// For an inode with nlink == 0 the visible code opens and prunes snaprealm
// past parents, clears dirty bits, unlinks stale remote snap dentries, and
// backs off (returns false) while leases/caps, pending recovery or snapshot
// past parents remain.  It then puts the dentry on the delayed list,
// enqueues a head truncate for a snapshotted file, or proceeds to purge.
// When the inode still has links, _eval_stray_remote() is tried instead.
//   dn    - stray dentry to evaluate
//   delay - NOTE(review): presumably selects the delayed_eval_stray path
//           below - confirm.
// Return: false on the visible "wait" paths; advance_delayed() names the
// result `purging`.
402 bool StrayManager::_eval_stray(CDentry *dn, bool delay)
404 dout(10) << "eval_stray " << *dn << dendl;
405 CDentry::linkage_t *dnl = dn->get_projected_linkage();
406 assert(dnl->is_primary());
407 dout(10) << " inode is " << *dnl->get_inode() << dendl;
408 CInode *in = dnl->get_inode();
410 assert(!in->state_test(CInode::STATE_REJOINUNDEF));
412 // The only dentries eligible for purging are those
413 // in the stray directories
414 assert(dn->get_dir()->get_inode()->is_stray());
416 // Inode may not pass through this function if it
417 // was already identified for purging (i.e. cannot
418 // call eval_stray() after purge())
419 assert(!dn->state_test(CDentry::STATE_PURGING));
421 if (!dn->is_auth()) {
// A dentry being evaluated again is taken off the delayed list first.
428 if (dn->item_stray.is_on_list()) {
432 dn->item_stray.remove_myself();
433 num_strays_delayed--;
434 logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
438 if (in->inode.nlink == 0) {
439 // past snaprealm parents imply snapped dentry remote links.
440 // only important for directories. normal file data snaps are handled
441 // by the object store.
443 if (!in->snaprealm->have_past_parents_open() &&
444 !in->snaprealm->open_parents(new C_MDC_EvalStray(this, dn))) {
447 in->snaprealm->prune_past_parents();
448 in->purge_stale_snap_data(in->snaprealm->get_snaps());
451 if (in->snaprealm && in->snaprealm->has_past_parents()) {
452 dout(20) << " directory has past parents "
453 << in->snaprealm->srnode.past_parents << dendl;
// A previously failed dirfrag commit is reported to the cluster log and
// escalated as a write error.
454 if (in->state_test(CInode::STATE_MISSINGOBJS)) {
455 mds->clog->error() << "previous attempt at committing dirfrag of ino "
456 << in->ino() << " has failed, missing object";
457 mds->handle_write_error(-ENOENT);
459 return false; // not until some snaps are deleted.
462 in->mdcache->clear_dirty_bits_for_stray(in);
464 if (!in->remote_parents.empty()) {
465 // unlink any stale remote snap dentry.
466 for (compact_set<CDentry*>::iterator p = in->remote_parents.begin();
467 p != in->remote_parents.end(); ) {
468 CDentry *remote_dn = *p;
470 assert(remote_dn->last != CEPH_NOSNAP);
471 remote_dn->unlink_remote(remote_dn->get_linkage());
475 if (dn->is_replicated()) {
476 dout(20) << " replicated" << dendl;
479 if (dn->is_any_leases() || in->is_any_caps()) {
480 dout(20) << " caps | leases" << dendl;
481 return false; // wait
483 if (in->state_test(CInode::STATE_NEEDSRECOVER) ||
484 in->state_test(CInode::STATE_RECOVERING)) {
485 dout(20) << " pending recovery" << dendl;
486 return false; // don't mess with file size probing
// Reference checks: the inode may hold only its dirty and dirty-parent refs;
// the dentry only its dirty ref plus one when the inode is referenced.
488 if (in->get_num_ref() > (int)in->is_dirty() + (int)in->is_dirty_parent()) {
489 dout(20) << " too many inode refs" << dendl;
492 if (dn->get_num_ref() > (int)dn->is_dirty() + !!in->get_num_ref()) {
493 dout(20) << " too many dn refs" << dendl;
// Put the dentry on the delayed list (once) and publish the counter.
497 if (!dn->item_stray.is_on_list()) {
498 delayed_eval_stray.push_back(&dn->item_stray);
499 num_strays_delayed++;
500 logger->set(l_mdc_num_strays_delayed, num_strays_delayed);
502 // don't purge multiversion inode with snap data
503 } else if (in->snaprealm && in->snaprealm->has_past_parents() &&
504 !in->old_inodes.empty()) {
505 // A file with snapshots: we will truncate the HEAD revision
506 // but leave the metadata intact.
507 assert(!in->is_dir());
508 dout(20) << " file has past parents "
509 << in->snaprealm->srnode.past_parents << dendl;
510 if (in->is_file() && in->get_projected_inode()->size > 0) {
511 enqueue(dn, true); // truncate head objects
514 // A straightforward file, ready to be purged. Enqueue it.
516 in->close_dirfrags();
525 * Where a stray has some links, they should be remotes, check
526 * if we can do anything with them if we happen to have them in
529 _eval_stray_remote(dn, NULL);
// Logs the call and activates the underlying purge queue.
534 void StrayManager::activate()
536 dout(10) << __func__ << dendl;
538 purge_queue.activate();
// Public entry point for stray evaluation.  Wraps _eval_stray() with a
// re-entrancy guard: STATE_EVALUATINGSTRAY is set on the dentry for the
// duration of the call and tested on entry so a nested evaluation of the
// same dentry is detected.
//   dn    - stray dentry to evaluate
//   delay - forwarded to _eval_stray()
// Return: `ret` holds the result of _eval_stray().
541 bool StrayManager::eval_stray(CDentry *dn, bool delay)
543 // avoid nested eval_stray
544 if (dn->state_test(CDentry::STATE_EVALUATINGSTRAY))
547 dn->state_set(CDentry::STATE_EVALUATINGSTRAY);
548 bool ret = _eval_stray(dn, delay);
549 dn->state_clear(CDentry::STATE_EVALUATINGSTRAY);
// Evaluate a remote dentry: if the inode it refers to has its primary dentry
// in a stray directory, try to act on that stray via _eval_stray_remote().
// Precondition (asserted): remote_dn's projected linkage is remote.
// The visible log messages show that nothing is done when the inode is not
// available, when remote_dn is a snapshot dentry (last != CEPH_NOSNAP), or
// when the primary dentry is not a stray.
553 void StrayManager::eval_remote(CDentry *remote_dn)
555 dout(10) << __func__ << " " << *remote_dn << dendl;
557 CDentry::linkage_t *dnl = remote_dn->get_projected_linkage();
558 assert(dnl->is_remote());
559 CInode *in = dnl->get_inode();
562 dout(20) << __func__ << ": no inode, cannot evaluate" << dendl;
566 if (remote_dn->last != CEPH_NOSNAP) {
567 dout(20) << __func__ << ": snap dentry, cannot evaluate" << dendl;
// Find the inode's primary dentry; only a stray primary is of interest.
572 CDentry *primary_dn = in->get_projected_parent_dn();
573 assert(primary_dn != NULL);
574 if (primary_dn->get_dir()->get_inode()->is_stray()) {
575 _eval_stray_remote(primary_dn, remote_dn);
577 dout(20) << __func__ << ": inode's primary dn not stray" << dendl;
// Waiter context queued by _eval_stray_remote() on CDir::WAIT_UNFREEZE.
// The constructor takes PIN_PTRWAITER on the dentry and finish() drops it
// again.  In between, finish() checks whether the dentry's projected linkage
// is still remote.
// NOTE(review): the action taken on that check is presumably a retry of
// eval_remote(dn) - confirm.
581 class C_RetryEvalRemote : public StrayManagerContext {
584 C_RetryEvalRemote(StrayManager *sm_, CDentry *dn_) :
585 StrayManagerContext(sm_), dn(dn_) {
586 dn->get(CDentry::PIN_PTRWAITER);
588 void finish(int r) override {
589 if (dn->get_projected_linkage()->is_remote())
591 dn->put(CDentry::PIN_PTRWAITER);
// Try to move a still-linked stray inode back under one of its remote
// dentries.
// Preconditions (asserted): stray_dn is non-NULL, lives in a stray directory,
// has a primary projected linkage, and its inode has nlink >= 1 and
// last == CEPH_NOSNAP.
//   stray_dn  - primary dentry of the inode, located in a stray directory
//   remote_dn - hint for the remote dentry to use; when NULL, one is picked
//               from the inode's remote_parents (non-snapshot, not projected)
// Outcome, when remote_dn is not projected:
//   - remote_dn is auth and its dir can be auth-pinned: reintegrate_stray()
//   - remote_dn is auth but its dir cannot be auth-pinned: queue a
//     C_RetryEvalRemote on WAIT_UNFREEZE
//   - remote_dn is not auth but stray_dn is: migrate_stray() to remote_dn's
//     authority
// A projected remote_dn is left alone.
595 void StrayManager::_eval_stray_remote(CDentry *stray_dn, CDentry *remote_dn)
597 dout(20) << __func__ << " " << *stray_dn << dendl;
598 assert(stray_dn != NULL);
599 assert(stray_dn->get_dir()->get_inode()->is_stray());
600 CDentry::linkage_t *stray_dnl = stray_dn->get_projected_linkage();
601 assert(stray_dnl->is_primary());
602 CInode *stray_in = stray_dnl->get_inode();
603 assert(stray_in->inode.nlink >= 1);
604 assert(stray_in->last == CEPH_NOSNAP);
606 /* If no remote_dn hinted, pick one arbitrarily */
607 if (remote_dn == NULL) {
608 if (!stray_in->remote_parents.empty()) {
609 for (compact_set<CDentry*>::iterator p = stray_in->remote_parents.begin();
610 p != stray_in->remote_parents.end();
612 if ((*p)->last == CEPH_NOSNAP && !(*p)->is_projected()) {
613 if ((*p)->is_auth()) {
615 if (remote_dn->dir->can_auth_pin())
617 } else if (!remote_dn) {
623 dout(20) << __func__ << ": not reintegrating (no remote parents in cache)" << dendl;
627 assert(remote_dn->last == CEPH_NOSNAP);
628 // NOTE: we repeat this check in _rename(), since our submission path is racy.
629 if (!remote_dn->is_projected()) {
630 if (remote_dn->is_auth()) {
631 if (remote_dn->dir->can_auth_pin()) {
632 reintegrate_stray(stray_dn, remote_dn);
634 remote_dn->dir->add_waiter(CDir::WAIT_UNFREEZE, new C_RetryEvalRemote(this, remote_dn));
635 dout(20) << __func__ << ": not reintegrating (can't authpin remote parent)" << dendl;
638 } else if (!remote_dn->is_auth() && stray_dn->is_auth()) {
639 migrate_stray(stray_dn, remote_dn->authority().first);
641 dout(20) << __func__ << ": not reintegrating" << dendl;
644 // don't do anything if the remote parent is projected, or we may
645 // break user-visible semantics!
646 dout(20) << __func__ << ": not reintegrating (projected)" << dendl;
// Reintegrate stray dentry `straydn` into remote dentry `rdn` by issuing an
// internal rename.  Increments l_mdc_strays_reintegrated, builds a
// CEPH_MDS_OP_RENAME MClientRequest (filepath = dst, filepath2 = src, where
// src is straydn's path) with a freshly issued tid, and sends it to the MDS
// rank that is the authority for rdn.
650 void StrayManager::reintegrate_stray(CDentry *straydn, CDentry *rdn)
652 dout(10) << __func__ << " " << *straydn << " into " << *rdn << dendl;
654 logger->inc(l_mdc_strays_reintegrated);
656 // rename it to another mds.
658 straydn->make_path(src);
662 MClientRequest *req = new MClientRequest(CEPH_MDS_OP_RENAME);
663 req->set_filepath(dst);
664 req->set_filepath2(src);
665 req->set_tid(mds->issue_tid());
667 mds->send_message_mds(req, rdn->authority().first);
// Move stray dentry `dn` to MDS rank `to` by issuing an internal rename.
// Precondition (asserted): dn's directory inode is a stray directory.
// Increments l_mdc_strays_migrated.  The destination path is rooted at
// MDS_INO_MDSDIR(to) and reuses the two components of the source path (the
// source path depth is asserted to be 2).  A CEPH_MDS_OP_RENAME
// MClientRequest (filepath = dst, filepath2 = src) with a freshly issued tid
// is then sent to rank `to`.
670 void StrayManager::migrate_stray(CDentry *dn, mds_rank_t to)
672 CInode *in = dn->get_projected_linkage()->get_inode();
674 CInode *diri = dn->dir->get_inode();
675 assert(diri->is_stray());
676 dout(10) << "migrate_stray from mds." << MDS_INO_STRAY_OWNER(diri->inode.ino)
678 << " " << *dn << " " << *in << dendl;
680 logger->inc(l_mdc_strays_migrated);
682 // rename it to another mds.
685 assert(src.depth() == 2);
687 filepath dst(MDS_INO_MDSDIR(to));
688 dst.push_dentry(src[0]);
689 dst.push_dentry(src[1]);
691 MClientRequest *req = new MClientRequest(CEPH_MDS_OP_RENAME);
692 req->set_filepath(dst);
693 req->set_filepath2(src);
694 req->set_tid(mds->issue_tid());
696 mds->send_message_mds(req, to);
// Constructor.  Binds the manager to its MDSRank and to the PurgeQueue
// reference it pushes items onto.  The delayed_eval_stray list is keyed on
// CDentry::item_stray; logger starts as NULL, started as false, and the
// three stray counters start at zero.
699 StrayManager::StrayManager(MDSRank *mds, PurgeQueue &purge_queue_)
700 : delayed_eval_stray(member_offset(CDentry, item_stray)),
701 mds(mds), logger(NULL), started(false), num_strays(0),
702 num_strays_delayed(0), num_strays_enqueuing(0),
703 purge_queue(purge_queue_)
// Queue a purge item for the inode linked (via projected linkage) from stray
// dentry `dn`, using the inode's snaprealm snap context, and push it onto
// purge_queue with a C_IO_PurgeStrayPurged completion.  Unlike purge(), the
// dentry and inode are only read here (both are accessed through const
// pointers).
// Precondition (asserted): dn is not replicated.
708 void StrayManager::truncate(CDentry *dn)
710 const CDentry::linkage_t *dnl = dn->get_projected_linkage();
711 const CInode *in = dnl->get_inode();
713 dout(10) << __func__ << ": " << *dn << " " << *in << dendl;
714 assert(!dn->is_replicated());
716 const SnapRealm *realm = in->find_snaprealm();
718 dout(10) << " realm " << *realm << dendl;
719 const SnapContext *snapc = &realm->get_snap_context();
// `to` becomes the largest of get_max_size(), size and max_size_ever.
721 uint64_t to = in->inode.get_max_size();
722 to = MAX(in->inode.size, to);
723 // when truncating a file, the filer does not delete stripe objects that are
724 // truncated to zero. so we need to purge stripe objects up to the max size
725 // the file has ever been.
726 to = MAX(in->inode.max_size_ever, to);
731 item.ino = in->inode.ino;
732 item.layout = in->inode.layout;
736 purge_queue.push(item, new C_IO_PurgeStrayPurged(
// Invoked from C_TruncateStrayLogged::finish() after the
// "purge_stray truncate" EUpdate has been journaled.  Clears the dentry's
// PURGING / PURGINGPINNED state bits, drops PIN_PURGING, and pops and dirties
// the projected inode in log segment `ls`.
740 void StrayManager::_truncate_stray_logged(CDentry *dn, LogSegment *ls)
742 CInode *in = dn->get_projected_linkage()->get_inode();
744 dout(10) << __func__ << ": " << *dn << " " << *in << dendl;
746 dn->state_clear(CDentry::STATE_PURGING | CDentry::STATE_PURGINGPINNED);
747 dn->put(CDentry::PIN_PURGING);
749 in->pop_and_dirty_projected_inode(ls);