--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2014 Red Hat
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include <iostream>
+
+#include "ScrubStack.h"
+#include "common/Finisher.h"
+#include "mds/MDSRank.h"
+#include "mds/MDCache.h"
+#include "mds/MDSContinuation.h"
+
+#define dout_context g_ceph_context
+#define dout_subsys ceph_subsys_mds
+#undef dout_prefix
+#define dout_prefix _prefix(_dout, scrubstack->mdcache->mds)
+// Writes this file's log prefix, "mds.<nodeid>.scrubstack ", to the
+// stream handed in by the dout machinery and returns that stream.
+static ostream& _prefix(std::ostream *_dout, MDSRank *mds) {
+  std::ostream& os = *_dout;
+  os << "mds." << mds->get_nodeid() << ".scrubstack ";
+  return os;
+}
+
+/**
+ * Put an inode at the front of the scrub stack.
+ *
+ * An inode whose item_scrub is not yet on a list takes a
+ * PIN_SCRUBQUEUE reference and bumps stack_size. One that is already
+ * linked is handed to push_front() without touching pin or counter.
+ */
+void ScrubStack::push_inode(CInode *in)
+{
+  dout(20) << "pushing " << *in << " on top of ScrubStack" << dendl;
+
+  const bool newly_queued = !in->item_scrub.is_on_list();
+  if (newly_queued) {
+    in->get(CInode::PIN_SCRUBQUEUE);
+    ++stack_size;
+  }
+
+  inode_stack.push_front(&in->item_scrub);
+}
+
+/**
+ * Put an inode at the back of the scrub stack.
+ *
+ * An inode whose item_scrub is not yet on a list takes a
+ * PIN_SCRUBQUEUE reference and bumps stack_size. One that is already
+ * linked is handed to push_back() without touching pin or counter.
+ */
+void ScrubStack::push_inode_bottom(CInode *in)
+{
+  dout(20) << "pushing " << *in << " on bottom of ScrubStack" << dendl;
+
+  const bool newly_queued = !in->item_scrub.is_on_list();
+  if (newly_queued) {
+    in->get(CInode::PIN_SCRUBQUEUE);
+    ++stack_size;
+  }
+
+  inode_stack.push_back(&in->item_scrub);
+}
+
+/**
+ * Take an inode off the scrub stack.
+ *
+ * The inode must currently be linked through item_scrub (asserted).
+ * Its PIN_SCRUBQUEUE reference is dropped, the list item unlinks
+ * itself, and stack_size is decremented.
+ */
+void ScrubStack::pop_inode(CInode *in)
+{
+  dout(20) << "popping " << *in << " off of ScrubStack" << dendl;
+
+  // popping something that was never queued is a caller bug
+  assert(in->item_scrub.is_on_list());
+
+  in->put(CInode::PIN_SCRUBQUEUE);
+  in->item_scrub.remove_myself();
+  --stack_size;
+}
+
+/**
+ * Initialise scrub state on an inode and queue it on the stack.
+ *
+ * Asserts that the calling thread holds mds_lock.
+ *
+ * @param in        inode to scrub
+ * @param parent    dentry forwarded to CInode::scrub_initialize()
+ * @param header    scrub header forwarded to CInode::scrub_initialize()
+ * @param on_finish completion forwarded to CInode::scrub_initialize()
+ * @param top       true to queue via push_inode(), false to queue via
+ *                  push_inode_bottom()
+ */
+void ScrubStack::_enqueue_inode(CInode *in, CDentry *parent,
+                                const ScrubHeaderRefConst& header,
+                                MDSInternalContextBase *on_finish, bool top)
+{
+  dout(10) << __func__ << " with {" << *in << "}"
+           << ", on_finish=" << on_finish << ", top=" << top << dendl;
+  assert(mdcache->mds->mds_lock.is_locked_by_me());
+
+  in->scrub_initialize(parent, header, on_finish);
+
+  if (!top) {
+    push_inode_bottom(in);
+    return;
+  }
+  push_inode(in);
+}
+
+/**
+ * Queue an inode for scrubbing with no parent dentry, then immediately
+ * try to start scrub work via kick_off_scrubs().
+ *
+ * @param in        inode to scrub
+ * @param header    scrub header for this scrub
+ * @param on_finish completion handed to _enqueue_inode()
+ * @param top       true to queue at the top of the stack, false for
+ *                  the bottom
+ */
+void ScrubStack::enqueue_inode(CInode *in, const ScrubHeaderRefConst& header,
+                               MDSInternalContextBase *on_finish, bool top)
+{
+  CDentry *const no_parent = nullptr;
+  _enqueue_inode(in, no_parent, header, on_finish, top);
+  kick_off_scrubs();
+}
+
+/**
+ * Walk the inode stack from the front and start scrub work.
+ *
+ * The walk continues while g_conf->mds_max_scrub_ops_in_progress is
+ * greater than scrubs_in_progress, the previously examined entry
+ * allowed continuing (can_continue), and the iterator has not reached
+ * the end of inode_stack.
+ *
+ * Non-directory inodes are popped and passed to scrub_file_inode().
+ * Directory inodes are advanced through scrub_dir_inode() and only
+ * popped once that reports completion.
+ */
+void ScrubStack::kick_off_scrubs()
+{
+ dout(20) << __func__ << " entering with " << scrubs_in_progress << " in "
+ "progress and " << stack_size << " in the stack" << dendl;
+ bool can_continue = true;
+ elist<CInode*>::iterator i = inode_stack.begin();
+ while (g_conf->mds_max_scrub_ops_in_progress > scrubs_in_progress &&
+ can_continue && !i.end()) {
+ CInode *curi = *i;
+ ++i; // we have our reference, push iterator forward
+
+ dout(20) << __func__ << " examining " << *curi << dendl;
+
+ if (!curi->is_dir()) {
+ // it's a regular file, symlink, or hard link
+ pop_inode(curi); // we only touch it this once, so remove from stack
+
+ // No completion registered on this inode: count it as an op in
+ // progress and install scrub_kick as its finisher.
+ // NOTE(review): scrub_kick presumably re-enters kick_off_scrubs()
+ // and drops the counter again -- confirm in C_KickOffScrubs::finish.
+ if (!curi->scrub_info()->on_finish) {
+ scrubs_in_progress++;
+ curi->scrub_set_finisher(&scrub_kick);
+ }
+ scrub_file_inode(curi);
+ can_continue = true;
+ } else {
+ // All three flags are out-parameters written by scrub_dir_inode()
+ bool completed; // it's done, so pop it off the stack
+ bool terminal; // not done, but we can start ops on other directories
+ bool progress; // it added new dentries to the top of the stack
+ scrub_dir_inode(curi, &progress, &terminal, &completed);
+ if (completed) {
+ dout(20) << __func__ << " dir completed" << dendl;
+ pop_inode(curi);
+ } else if (progress) {
+ dout(20) << __func__ << " dir progressed" << dendl;
+ // we added new stuff to top of stack, so reset ourselves there
+ i = inode_stack.begin();
+ } else {
+ dout(20) << __func__ << " dir no-op" << dendl;
+ }
+
+ // Stop walking the stack when this directory neither progressed,
+ // completed, nor reported itself terminal.
+ can_continue = progress || terminal || completed;
+ }
+ }
+}
+
+/**
+ * Advance the scrub of a directory inode by one step.
+ *
+ * For a recursive scrub this visits the inode's dirfrags: first those
+ * reported by scrub_dirfrags_scrubbing(), then further ones obtained
+ * from get_next_cdir(). scrub_dirfrag() is called on each one for as
+ * long as scrubs_in_progress stays below
+ * g_conf->mds_max_scrub_ops_in_progress. For a non-recursive scrub no
+ * dirfrag is touched. When every visited frag reported done,
+ * scrub_dir_inode_final() is invoked on the inode.
+ *
+ * @param in             directory inode being scrubbed
+ * @param added_children [out] true if any dirfrag enqueued child inodes
+ * @param terminal       [out] true if no visited dirfrag reported
+ *                       itself non-terminal and no frag load is pending
+ * @param done           [out] true if every visited dirfrag reported
+ *                       done and no frag load is pending
+ */
+void ScrubStack::scrub_dir_inode(CInode *in,
+                                 bool *added_children,
+                                 bool *terminal,
+                                 bool *done)
+{
+  // keep the function name and the inode description apart in the log
+  dout(10) << __func__ << " " << *in << dendl;
+
+  *added_children = false;
+  bool all_frags_terminal = true;
+  bool all_frags_done = true;
+
+  const ScrubHeaderRefConst& header = in->scrub_info()->header;
+
+  if (header->get_recursive()) {
+    list<frag_t> scrubbing_frags;
+    list<CDir*> scrubbing_cdirs;
+    in->scrub_dirfrags_scrubbing(&scrubbing_frags);
+    dout(20) << __func__ << " iterating over " << scrubbing_frags.size()
+             << " scrubbing frags" << dendl;
+    for (list<frag_t>::iterator i = scrubbing_frags.begin();
+         i != scrubbing_frags.end();
+         ++i) {
+      // turn frags into CDir *
+      CDir *dir = in->get_dirfrag(*i);
+      if (dir) {
+        scrubbing_cdirs.push_back(dir);
+        dout(25) << __func__ << " got CDir " << *dir << " presently scrubbing" << dendl;
+      } else {
+        // no CDir available for this frag: mark it finished and move on
+        in->scrub_dirfrag_finished(*i);
+        dout(25) << __func__ << " missing dirfrag " << *i << " skip scrubbing" << dendl;
+      }
+    }
+
+    dout(20) << __func__ << " consuming from " << scrubbing_cdirs.size()
+             << " scrubbing cdirs" << dendl;
+
+    list<CDir*>::iterator i = scrubbing_cdirs.begin();
+    while (g_conf->mds_max_scrub_ops_in_progress > scrubs_in_progress) {
+      // select next CDir: in-flight ones first, then fresh ones
+      CDir *cur_dir = NULL;
+      if (i != scrubbing_cdirs.end()) {
+        cur_dir = *i;
+        ++i;
+        dout(20) << __func__ << " got cur_dir = " << *cur_dir << dendl;
+      } else {
+        bool ready = get_next_cdir(in, &cur_dir);
+        dout(20) << __func__ << " get_next_cdir ready=" << ready << dendl;
+
+        if (ready && cur_dir) {
+          scrubbing_cdirs.push_back(cur_dir);
+        } else if (!ready) {
+          // We are waiting for load of a frag
+          all_frags_done = false;
+          all_frags_terminal = false;
+          break;
+        } else {
+          // Finished with all frags
+          break;
+        }
+      }
+      // scrub that CDir
+      bool frag_added_children = false;
+      bool frag_terminal = true;
+      bool frag_done = false;
+      scrub_dirfrag(cur_dir, header,
+                    &frag_added_children, &frag_terminal, &frag_done);
+      if (frag_done) {
+        cur_dir->inode->scrub_dirfrag_finished(cur_dir->frag);
+      }
+      *added_children |= frag_added_children;
+      all_frags_terminal = all_frags_terminal && frag_terminal;
+      all_frags_done = all_frags_done && frag_done;
+    }
+
+    dout(20) << "finished looping; all_frags_terminal=" << all_frags_terminal
+             << ", all_frags_done=" << all_frags_done << dendl;
+  } else {
+    dout(20) << "!scrub_recursive" << dendl;
+  }
+
+  if (all_frags_done) {
+    assert (!*added_children); // can't do this if children are still pending
+
+    // Every frag is done: hand over to scrub_dir_inode_final(), which
+    // starts validation of the directory inode itself unless its
+    // scrub_info already has children_scrubbed set.
+    scrub_dir_inode_final(in);
+  }
+
+  *terminal = all_frags_terminal;
+  *done = all_frags_done;
+  dout(10) << __func__ << " is exiting " << *terminal << " " << *done << dendl;
+}
+
+/**
+ * Pick the next dirfrag of a directory inode that still needs scrubbing.
+ *
+ * @param in      directory inode whose dirfrags are being iterated
+ * @param new_dir [out] set to the dirfrag to scrub, or to NULL when
+ *                scrub_dirfrag_next() returns ENOENT; not written when
+ *                false is returned
+ * @return true if *new_dir was written; false if the next dirfrag was
+ *         not complete, in which case scrubs_in_progress is incremented
+ *         and a fetch is issued with scrub_kick as its completion
+ */
+bool ScrubStack::get_next_cdir(CInode *in, CDir **new_dir)
+{
+  dout(20) << __func__ << " on " << *in << dendl;
+  frag_t next_frag;
+  int r = in->scrub_dirfrag_next(&next_frag);
+  assert (r >= 0);
+
+  if (r == 0) {
+    // we got a frag to scrub, otherwise it would be ENOENT
+    dout(25) << "looking up new frag " << next_frag << dendl;
+    CDir *next_dir = in->get_or_open_dirfrag(mdcache, next_frag);
+    if (!next_dir->is_complete()) {
+      scrubs_in_progress++;
+      next_dir->fetch(&scrub_kick);
+      dout(25) << "fetching frag from RADOS" << dendl;
+      return false;
+    }
+    *new_dir = next_dir;
+    // Log the dirfrag itself. *new_dir is a CDir*, so streaming it
+    // would record a pointer value rather than the dirfrag description.
+    dout(25) << "returning dir " << *next_dir << dendl;
+    return true;
+  }
+  assert(r == ENOENT);
+  // there are no dirfrags left
+  *new_dir = NULL;
+  return true;
+}
+
+/**
+ * Completion context handed to CInode::validate_disk_state().
+ *
+ * It carries the validated_data object that callers pass to
+ * validate_disk_state() by pointer. On completion it forwards the
+ * target inode, the return code and that result to
+ * ScrubStack::_validate_inode_done().
+ */
+class C_InodeValidated : public MDSInternalContext
+{
+public:
+  ScrubStack *stack;              // scrub stack to report back to
+  CInode::validated_data result;  // storage for the validation outcome
+  CInode *target;                 // inode under validation
+
+  C_InodeValidated(MDSRank *mds, ScrubStack *stack_, CInode *target_)
+    : MDSInternalContext(mds),
+      stack(stack_),
+      target(target_)
+  {
+  }
+
+  void finish(int r) override
+  {
+    stack->_validate_inode_done(target, r, result);
+  }
+};
+
+
+/**
+ * Start validation of a directory inode once its dirfrags are done.
+ *
+ * Does nothing when scrub_info()->children_scrubbed is already set.
+ * Otherwise, an inode with no on_finish context is counted in
+ * scrubs_in_progress and gets scrub_kick installed as its finisher.
+ * The inode is then told its children are finished, and
+ * validate_disk_state() is started with a C_InodeValidated completion.
+ */
+void ScrubStack::scrub_dir_inode_final(CInode *in)
+{
+  // keep the function name and the inode description apart in the log
+  dout(20) << __func__ << " " << *in << dendl;
+
+  // Only a call that finds children_scrubbed unset triggers inode
+  // validation.
+  // FIXME: kind of overloading scrub_in_progress here, using it while
+  // dentry is still on stack to indicate that we have finished
+  // doing our validate_disk_state on the inode
+  // FIXME: the magic-constructing scrub_info() is going to leave
+  // an unneeded scrub_infop lying around here
+  if (!in->scrub_info()->children_scrubbed) {
+    if (!in->scrub_info()->on_finish) {
+      scrubs_in_progress++;
+      in->scrub_set_finisher(&scrub_kick);
+    }
+
+    in->scrub_children_finished();
+    C_InodeValidated *fin = new C_InodeValidated(mdcache->mds, this, in);
+    in->validate_disk_state(&fin->result, fin);
+  }
+}
+
+/**
+ * Advance the scrub of a single dirfrag.
+ *
+ * Pulls dentries from dir->scrub_dentry_next() and pushes each one's
+ * projected inode onto the top of the scrub stack until that call
+ * returns something other than 0.
+ *
+ * @param dir            dirfrag to scrub; must not be NULL (asserted)
+ * @param header         scrub header, passed to dir->scrub_initialize()
+ *                       and to every enqueued child inode
+ * @param added_children [out] true if at least one child inode was
+ *                       enqueued by this call
+ * @param is_terminal    [out] set true only together with *done
+ * @param done           [out] true when scrub_dentry_next() returned
+ *                       ENOENT and no dentries are still scrubbing
+ */
+void ScrubStack::scrub_dirfrag(CDir *dir,
+ const ScrubHeaderRefConst& header,
+ bool *added_children, bool *is_terminal,
+ bool *done)
+{
+ assert(dir != NULL);
+
+ dout(20) << __func__ << " on " << *dir << dendl;
+ *added_children = false;
+ *is_terminal = false;
+ *done = false;
+
+
+ if (!dir->scrub_info()->directory_scrubbing) {
+ // Get the frag complete before calling
+ // scrub initialize, so that it can populate its lists
+ // of dentries.
+ if (!dir->is_complete()) {
+ // the fetch counts as an op in progress; scrub_kick is its completion
+ scrubs_in_progress++;
+ dir->fetch(&scrub_kick);
+ return;
+ }
+
+ dir->scrub_initialize(header);
+ }
+
+ int r = 0;
+ while(r == 0) {
+ CDentry *dn = NULL;
+ // Count an op up front in case scrub_dentry_next() returns EAGAIN
+ // after being handed &scrub_kick; any other result undoes the
+ // increment right below.
+ scrubs_in_progress++;
+ r = dir->scrub_dentry_next(&scrub_kick, &dn);
+ if (r != EAGAIN) {
+ scrubs_in_progress--;
+ }
+
+ if (r == EAGAIN) {
+ // Drop out, CDir fetcher will call back our kicker context
+ dout(20) << __func__ << " waiting for fetch on " << *dir << dendl;
+ return;
+ }
+
+ if (r == ENOENT) {
+ // Nothing left to scrub, are we done?
+ std::list<CDentry*> scrubbing;
+ dir->scrub_dentries_scrubbing(&scrubbing);
+ if (scrubbing.empty()) {
+ dout(20) << __func__ << " dirfrag done: " << *dir << dendl;
+ // FIXME: greg: What's the diff meant to be between done and terminal
+ dir->scrub_finished();
+ *done = true;
+ *is_terminal = true;
+ } else {
+ // dentries handed out earlier are still being scrubbed, so both
+ // *done and *is_terminal stay false
+ dout(20) << __func__ << " " << scrubbing.size() << " dentries still "
+ "scrubbing in " << *dir << dendl;
+ }
+ return;
+ }
+
+ // scrub_dentry_next defined to only give EAGAIN, ENOENT, 0 -- we should
+ // never get random IO errors here.
+ assert(r == 0);
+
+ // queue the child on top of the stack, with dn as its parent dentry
+ // and no on_finish context of its own
+ _enqueue_inode(dn->get_projected_inode(), dn, header, NULL, true);
+
+ *added_children = true;
+ }
+}
+
+/**
+ * Start disk-state validation of an inode.
+ *
+ * A C_InodeValidated context is allocated to hold the result and is
+ * passed to validate_disk_state() as the completion, so the outcome
+ * reaches _validate_inode_done().
+ */
+void ScrubStack::scrub_file_inode(CInode *in)
+{
+  // At this stage the DN is already past scrub_initialize, so
+  // it's in the cache, it has PIN_SCRUBQUEUE and it is authpinned
+  C_InodeValidated *on_validated =
+    new C_InodeValidated(mdcache->mds, this, in);
+  CInode::validated_data *outcome = &on_validated->result;
+  in->validate_disk_state(outcome, on_validated);
+}
+
+/**
+ * Process the outcome of validating one inode.
+ *
+ * Records damage in the MDS damage table, reports failures to the
+ * cluster log and the MDS log, finishes the inode's scrub, and for a
+ * non-recursive scrub of the origin inode dumps the result into the
+ * header's formatter. Any context returned by scrub_finished() is
+ * queued on the finisher.
+ *
+ * @param in     inode that was validated
+ * @param r      return code delivered to the validation completion
+ * @param result validation details filled in by validate_disk_state()
+ */
+void ScrubStack::_validate_inode_done(CInode *in, int r,
+                                      const CInode::validated_data &result)
+{
+  LogChannelRef cluster_log = mdcache->mds->clog;
+  // taken by value, before in->scrub_finished() is called further down
+  const ScrubHeaderRefConst header = in->scrub_info()->header;
+
+  const bool failed = !result.passed_validation;
+
+  // The path is only used in error reporting, so build it on failure only
+  std::string path;
+  if (failed) {
+    in->make_path_string(path, true);
+  }
+
+  const bool backtrace_bad =
+    result.backtrace.checked && !result.backtrace.passed;
+  const bool inode_bad =
+    result.inode.checked && !result.inode.passed;
+
+  if (backtrace_bad) {
+    // Record backtrace fails as remote linkage damage, as
+    // we may not be able to resolve hard links to this inode
+    mdcache->mds->damage_table.notify_remote_damaged(in->inode.ino, path);
+  } else if (inode_bad) {
+    // Record damaged inode structures as damaged dentries as
+    // that is where they are stored
+    auto parent_dn = in->get_projected_parent_dn();
+    if (parent_dn) {
+      auto parent_dir = parent_dn->get_dir();
+      mdcache->mds->damage_table.notify_dentry(
+        parent_dir->inode->ino(), parent_dir->frag,
+        parent_dn->last, parent_dn->name, path);
+    }
+  }
+
+  if (failed) {
+    // Inform the cluster log that we found an error
+    cluster_log->warn() << "Scrub error on inode " << in->ino()
+                        << " (" << path << ") see " << g_conf->name
+                        << " log and `damage ls` output for details";
+
+    // Put the verbose JSON output into the MDS log for later inspection
+    JSONFormatter json;
+    result.dump(&json);
+    std::ostringstream rendered;
+    json.flush(rendered);
+    derr << __func__ << " scrub error on inode " << *in << ": " << rendered.str()
+         << dendl;
+  } else {
+    dout(10) << __func__ << " scrub passed on inode " << *in << dendl;
+  }
+
+  MDSInternalContextBase *on_finish = NULL;
+  in->scrub_finished(&on_finish);
+
+  const bool flat_scrub_of_origin =
+    !header->get_recursive() && in == header->get_origin();
+  if (flat_scrub_of_origin) {
+    if (r < 0) {
+      // we failed the lookup or something; dump ourselves
+      header->get_formatter().open_object_section("results");
+      header->get_formatter().dump_int("return_code", r);
+      header->get_formatter().close_section(); // results
+    } else {
+      // we got into the scrubbing, dump it
+      result.dump(&(header->get_formatter()));
+    }
+  }
+
+  if (on_finish) {
+    finisher->queue(new MDSIOContextWrapper(mdcache->mds, on_finish), 0);
+  }
+}
+
+// Builds the kick context on the MDS rank that owns mdcache and
+// remembers the scrub stack it belongs to.
+ScrubStack::C_KickOffScrubs::C_KickOffScrubs(MDCache *mdcache, ScrubStack *s)
+  : MDSInternalContext(mdcache->mds),
+    stack(s)
+{
+}