X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=src%2Fceph%2Fsrc%2Fmds%2FFSMap.cc;fp=src%2Fceph%2Fsrc%2Fmds%2FFSMap.cc;h=b224e11190d25c9e9edd17e8c36c1a95bfd7fb0a;hb=812ff6ca9fcd3e629e49d4328905f33eee8ca3f5;hp=0000000000000000000000000000000000000000;hpb=15280273faafb77777eab341909a3f495cf248d9;p=stor4nfv.git diff --git a/src/ceph/src/mds/FSMap.cc b/src/ceph/src/mds/FSMap.cc new file mode 100644 index 0000000..b224e11 --- /dev/null +++ b/src/ceph/src/mds/FSMap.cc @@ -0,0 +1,1046 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#include "FSMap.h" + +#include +using std::stringstream; + +#include "mon/health_check.h" + + +void Filesystem::dump(Formatter *f) const +{ + f->open_object_section("mdsmap"); + mds_map.dump(f); + f->close_section(); + f->dump_int("id", fscid); +} + +void FSMap::dump(Formatter *f) const +{ + f->dump_int("epoch", epoch); + + f->open_object_section("compat"); + compat.dump(f); + f->close_section(); + + f->open_object_section("feature_flags"); + f->dump_bool("enable_multiple", enable_multiple); + f->dump_bool("ever_enabled_multiple", ever_enabled_multiple); + f->close_section(); + + f->open_array_section("standbys"); + for (const auto &i : standby_daemons) { + f->open_object_section("info"); + i.second.dump(f); + f->dump_int("epoch", standby_epochs.at(i.first)); + f->close_section(); + } + f->close_section(); + + f->open_array_section("filesystems"); + for (const auto &fs : filesystems) { + f->open_object_section("filesystem"); + fs.second->dump(f); + f->close_section(); + } + f->close_section(); +} + +void FSMap::generate_test_instances(list& ls) +{ + FSMap *m = new FSMap(); + + std::list mds_map_instances; + MDSMap::generate_test_instances(mds_map_instances); + + int k = 20; + for (auto i : mds_map_instances) { + auto fs = std::make_shared(); + fs->fscid = k++; + fs->mds_map = *i; + delete i; + m->filesystems[fs->fscid] = fs; + } + mds_map_instances.clear(); + + ls.push_back(m); +} + +void FSMap::print(ostream& out) const +{ + out << "e" << epoch << std::endl; + out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << "," + << ever_enabled_multiple << std::endl; + out << "compat: " << compat << std::endl; + out << "legacy client fscid: " << legacy_client_fscid << std::endl; + out << " " << std::endl; + + if (filesystems.empty()) { + out << "No filesystems configured" << std::endl; + return; + } + + for (const auto &fs : filesystems) { + fs.second->print(out); + out << " " << std::endl << " " << std::endl; // Space out a bit + } + + if (!standby_daemons.empty()) { + out << "Standby daemons:" << std::endl << " " << std::endl; + } + + for (const auto &p : standby_daemons) { + p.second.print_summary(out); + out << std::endl; + } +} + + + +void FSMap::print_summary(Formatter *f, ostream *out) const +{ + map by_rank; + map by_state; + + if (f) { + f->dump_unsigned("epoch", get_epoch()); + for (auto i : filesystems) { + auto fs = i.second; + f->dump_unsigned("id", fs->fscid); + f->dump_unsigned("up", fs->mds_map.up.size()); + f->dump_unsigned("in", fs->mds_map.in.size()); + f->dump_unsigned("max", fs->mds_map.max_mds); + } + } else { + for (auto i : filesystems) { + auto fs = i.second; + *out << fs->mds_map.fs_name << "-" << fs->mds_map.up.size() << "/" + << fs->mds_map.in.size() << "/" << fs->mds_map.max_mds << " up "; + } + } + + if (f) { + f->open_array_section("by_rank"); + } + + const auto all_info = get_mds_info(); + for (const auto &p : all_info) { + const auto &info = p.second; + string s = ceph_mds_state_name(info.state); + if (info.laggy()) { + s += "(laggy or crashed)"; + } + + const fs_cluster_id_t fscid = mds_roles.at(info.global_id); + + if (info.rank != MDS_RANK_NONE && + info.state != MDSMap::STATE_STANDBY_REPLAY) { + if (f) { + f->open_object_section("mds"); + f->dump_unsigned("filesystem_id", fscid); + f->dump_unsigned("rank", info.rank); + f->dump_string("name", info.name); + f->dump_string("status", s); + f->close_section(); + } else { + by_rank[mds_role_t(fscid, info.rank)] = info.name + "=" + s; + } + } else { + by_state[s]++; + } + } + + if (f) { + f->close_section(); + } else { + if (!by_rank.empty()) { + if (filesystems.size() > 1) { + // Disambiguate filesystems + std::map pretty; + for (auto i : by_rank) { + const auto &fs_name = filesystems.at(i.first.fscid)->mds_map.fs_name; + std::ostringstream o; + o << "[" << fs_name << ":" << i.first.rank << "]"; + pretty[o.str()] = i.second; + } + *out << " " << pretty; + } else { + // Omit FSCID in output when only one filesystem exists + std::map shortened; + for (auto i : by_rank) { + shortened[i.first.rank] = i.second; + } + *out << " " << shortened; + } + } + } + + for (map::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) { + if (f) { + f->dump_unsigned(p->first.c_str(), p->second); + } else { + *out << ", " << p->second << " " << p->first; + } + } + + size_t failed = 0; + size_t damaged = 0; + for (auto i : filesystems) { + auto fs = i.second; + failed += fs->mds_map.failed.size(); + damaged += fs->mds_map.damaged.size(); + } + + if (failed > 0) { + if (f) { + f->dump_unsigned("failed", failed); + } else { + *out << ", " << failed << " failed"; + } + } + + if (damaged > 0) { + if (f) { + f->dump_unsigned("damaged", damaged); + } else { + *out << ", " << damaged << " damaged"; + } + } + //if (stopped.size()) + //out << ", " << stopped.size() << " stopped"; +} + + +void FSMap::create_filesystem(const std::string &name, + int64_t metadata_pool, int64_t data_pool, + uint64_t features) +{ + auto fs = std::make_shared(); + fs->mds_map.fs_name = name; + fs->mds_map.max_mds = 1; + fs->mds_map.data_pools.push_back(data_pool); + fs->mds_map.metadata_pool = metadata_pool; + fs->mds_map.cas_pool = -1; + fs->mds_map.max_file_size = g_conf->mds_max_file_size; + fs->mds_map.compat = compat; + fs->mds_map.created = ceph_clock_now(); + fs->mds_map.modified = ceph_clock_now(); + fs->mds_map.session_timeout = g_conf->mds_session_timeout; + fs->mds_map.session_autoclose = g_conf->mds_session_autoclose; + fs->mds_map.enabled = true; + if (features & CEPH_FEATURE_SERVER_JEWEL) { + fs->fscid = next_filesystem_id++; + // ANONYMOUS is only for upgrades from legacy mdsmaps, we should + // have initialized next_filesystem_id such that it's never used here. + assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS); + } else { + // Use anon fscid because this will get thrown away when encoding + // as legacy MDSMap for legacy mons. + assert(filesystems.empty()); + fs->fscid = FS_CLUSTER_ID_ANONYMOUS; + } + filesystems[fs->fscid] = fs; + + // Created first filesystem? Set it as the one + // for legacy clients to use + if (filesystems.size() == 1) { + legacy_client_fscid = fs->fscid; + } +} + +void FSMap::reset_filesystem(fs_cluster_id_t fscid) +{ + auto fs = get_filesystem(fscid); + auto new_fs = std::make_shared(); + + // Populate rank 0 as existing (so don't go into CREATING) + // but failed (so that next available MDS is assigned the rank) + new_fs->mds_map.in.insert(mds_rank_t(0)); + new_fs->mds_map.failed.insert(mds_rank_t(0)); + + // Carry forward what makes sense + new_fs->fscid = fs->fscid; + new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled; + new_fs->mds_map.max_mds = 1; + new_fs->mds_map.data_pools = fs->mds_map.data_pools; + new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool; + new_fs->mds_map.cas_pool = fs->mds_map.cas_pool; + new_fs->mds_map.fs_name = fs->mds_map.fs_name; + new_fs->mds_map.max_file_size = g_conf->mds_max_file_size; + new_fs->mds_map.compat = compat; + new_fs->mds_map.created = ceph_clock_now(); + new_fs->mds_map.modified = ceph_clock_now(); + new_fs->mds_map.session_timeout = g_conf->mds_session_timeout; + new_fs->mds_map.session_autoclose = g_conf->mds_session_autoclose; + new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted; + new_fs->mds_map.enabled = true; + + // Remember mds ranks that have ever started. (They should load old inotable + // instead of creating new one if they start again.) + new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end()); + new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end()); + new_fs->mds_map.stopped.erase(mds_rank_t(0)); + + // Persist the new FSMap + filesystems[new_fs->fscid] = new_fs; +} + +void FSMap::get_health(list >& summary, + list > *detail) const +{ + mds_rank_t standby_count_wanted = 0; + for (const auto &i : filesystems) { + const auto &fs = i.second; + + // TODO: move get_health up into here so that we can qualify + // all the messages with what filesystem they're talking about + fs->mds_map.get_health(summary, detail); + + standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size())); + } + + if (standby_count_wanted) { + std::ostringstream oss; + oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more"; + summary.push_back(make_pair(HEALTH_WARN, oss.str())); + } +} + +bool FSMap::check_health(void) +{ + bool changed = false; + for (auto &i : filesystems) { + changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size()); + } + return changed; +} + +void FSMap::get_health_checks(health_check_map_t *checks) const +{ + mds_rank_t standby_count_wanted = 0; + for (const auto &i : filesystems) { + const auto &fs = i.second; + health_check_map_t fschecks; + + fs->mds_map.get_health_checks(&fschecks); + + // Some of the failed ranks might be transient (i.e. there are standbys + // ready to replace them). We will report only on "stuck" failed, i.e. + // ranks which are failed and have no standby replacement available. + std::set stuck_failed; + + for (const auto &rank : fs->mds_map.failed) { + const mds_gid_t replacement = find_replacement_for( + {fs->fscid, rank}, {}, g_conf->mon_force_standby_active); + if (replacement == MDS_GID_NONE) { + stuck_failed.insert(rank); + } + } + + // FS_WITH_FAILED_MDS + if (!stuck_failed.empty()) { + health_check_t& fscheck = checks->get_or_add( + "FS_WITH_FAILED_MDS", HEALTH_WARN, + "%num% filesystem%plurals% %hasorhave% a failed mds daemon"); + ostringstream ss; + ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size() + << " failed mds" << (stuck_failed.size() > 1 ? "s" : ""); + fscheck.detail.push_back(ss.str()); } + + checks->merge(fschecks); + standby_count_wanted = std::max( + standby_count_wanted, + fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size())); + } + + // MDS_INSUFFICIENT_STANDBY + if (standby_count_wanted) { + std::ostringstream oss, dss; + oss << "insufficient standby MDS daemons available"; + auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str()); + dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted + << " more"; + d.detail.push_back(dss.str()); + } +} + +void FSMap::encode(bufferlist& bl, uint64_t features) const +{ + if (features & CEPH_FEATURE_SERVER_JEWEL) { + ENCODE_START(7, 6, bl); + ::encode(epoch, bl); + ::encode(next_filesystem_id, bl); + ::encode(legacy_client_fscid, bl); + ::encode(compat, bl); + ::encode(enable_multiple, bl); + std::vector fs_list; + for (auto i : filesystems) { + fs_list.push_back(*(i.second)); + } + ::encode(fs_list, bl, features); + ::encode(mds_roles, bl); + ::encode(standby_daemons, bl, features); + ::encode(standby_epochs, bl); + ::encode(ever_enabled_multiple, bl); + ENCODE_FINISH(bl); + } else { + if (filesystems.empty()) { + MDSMap disabled_map; + disabled_map.epoch = epoch; + disabled_map.encode(bl, features); + } else { + // MDSMonitor should never have created multiple filesystems + // until the quorum features indicated Jewel + assert(filesystems.size() == 1); + auto fs = filesystems.begin()->second; + + // Take the MDSMap for the enabled filesystem, and populated its + // mds_info with the standbys to get a pre-jewel-style mon MDSMap. + MDSMap full_mdsmap = fs->mds_map; + full_mdsmap.epoch = epoch; + for (const auto &p : standby_daemons) { + full_mdsmap.mds_info[p.first] = p.second; + } + + // Old MDSMaps don't set rank on standby replay daemons + for (auto &i : full_mdsmap.mds_info) { + auto &info = i.second; + if (info.state == MDSMap::STATE_STANDBY_REPLAY) { + info.rank = MDS_RANK_NONE; + } + } + + full_mdsmap.encode(bl, features); + } + } +} + +void FSMap::decode(bufferlist::iterator& p) +{ + // The highest MDSMap encoding version before we changed the + // MDSMonitor to store an FSMap instead of an MDSMap was + // 5, so anything older than 6 is decoded as an MDSMap, + // and anything newer is decoded as an FSMap. + DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p); + if (struct_v < 6) { + // Because the mon used to store an MDSMap where we now + // store an FSMap, FSMap knows how to decode the legacy + // MDSMap format (it never needs to encode it though). + MDSMap legacy_mds_map; + + // Decoding an MDSMap (upgrade) + ::decode(epoch, p); + ::decode(legacy_mds_map.flags, p); + ::decode(legacy_mds_map.last_failure, p); + ::decode(legacy_mds_map.root, p); + ::decode(legacy_mds_map.session_timeout, p); + ::decode(legacy_mds_map.session_autoclose, p); + ::decode(legacy_mds_map.max_file_size, p); + ::decode(legacy_mds_map.max_mds, p); + ::decode(legacy_mds_map.mds_info, p); + if (struct_v < 3) { + __u32 n; + ::decode(n, p); + while (n--) { + __u32 m; + ::decode(m, p); + legacy_mds_map.data_pools.push_back(m); + } + __s32 s; + ::decode(s, p); + legacy_mds_map.cas_pool = s; + } else { + ::decode(legacy_mds_map.data_pools, p); + ::decode(legacy_mds_map.cas_pool, p); + } + + // kclient ignores everything from here + __u16 ev = 1; + if (struct_v >= 2) + ::decode(ev, p); + if (ev >= 3) + ::decode(legacy_mds_map.compat, p); + else + legacy_mds_map.compat = get_mdsmap_compat_set_base(); + if (ev < 5) { + __u32 n; + ::decode(n, p); + legacy_mds_map.metadata_pool = n; + } else { + ::decode(legacy_mds_map.metadata_pool, p); + } + ::decode(legacy_mds_map.created, p); + ::decode(legacy_mds_map.modified, p); + ::decode(legacy_mds_map.tableserver, p); + ::decode(legacy_mds_map.in, p); + std::map inc; // Legacy field, parse and drop + ::decode(inc, p); + ::decode(legacy_mds_map.up, p); + ::decode(legacy_mds_map.failed, p); + ::decode(legacy_mds_map.stopped, p); + if (ev >= 4) + ::decode(legacy_mds_map.last_failure_osd_epoch, p); + if (ev >= 6) { + if (ev < 10) { + // previously this was a bool about snaps, not a flag map + bool flag; + ::decode(flag, p); + legacy_mds_map.ever_allowed_features = flag ? + CEPH_MDSMAP_ALLOW_SNAPS : 0; + ::decode(flag, p); + legacy_mds_map.explicitly_allowed_features = flag ? + CEPH_MDSMAP_ALLOW_SNAPS : 0; + if (legacy_mds_map.max_mds > 1) { + legacy_mds_map.set_multimds_allowed(); + } + } else { + ::decode(legacy_mds_map.ever_allowed_features, p); + ::decode(legacy_mds_map.explicitly_allowed_features, p); + } + } else { + legacy_mds_map.ever_allowed_features = CEPH_MDSMAP_ALLOW_CLASSICS; + legacy_mds_map.explicitly_allowed_features = 0; + if (legacy_mds_map.max_mds > 1) { + legacy_mds_map.set_multimds_allowed(); + } + } + if (ev >= 7) + ::decode(legacy_mds_map.inline_data_enabled, p); + + if (ev >= 8) { + assert(struct_v >= 5); + ::decode(legacy_mds_map.enabled, p); + ::decode(legacy_mds_map.fs_name, p); + } else { + legacy_mds_map.fs_name = "default"; + if (epoch > 1) { + // If an MDS has ever been started, epoch will be greater than 1, + // assume filesystem is enabled. + legacy_mds_map.enabled = true; + } else { + // Upgrading from a cluster that never used an MDS, switch off + // filesystem until it's explicitly enabled. + legacy_mds_map.enabled = false; + } + } + + if (ev >= 9) { + ::decode(legacy_mds_map.damaged, p); + } + + // We're upgrading, populate filesystems from the legacy fields + filesystems.clear(); + standby_daemons.clear(); + standby_epochs.clear(); + mds_roles.clear(); + compat = legacy_mds_map.compat; + enable_multiple = false; + + // Synthesise a Filesystem from legacy_mds_map, if enabled + if (legacy_mds_map.enabled) { + // Construct a Filesystem from the legacy MDSMap + auto migrate_fs = std::make_shared(); + migrate_fs->fscid = FS_CLUSTER_ID_ANONYMOUS; + migrate_fs->mds_map = legacy_mds_map; + migrate_fs->mds_map.epoch = epoch; + filesystems[migrate_fs->fscid] = migrate_fs; + + // List of GIDs that had invalid states + std::set drop_gids; + + // Construct mds_roles, standby_daemons, and remove + // standbys from the MDSMap in the Filesystem. + for (auto &p : migrate_fs->mds_map.mds_info) { + if (p.second.state == MDSMap::STATE_STANDBY_REPLAY) { + // In legacy MDSMap, standby replay daemons don't have + // rank set, but since FSMap they do. + p.second.rank = p.second.standby_for_rank; + } + if (p.second.rank == MDS_RANK_NONE) { + if (p.second.state != MDSMap::STATE_STANDBY) { + // Old MDSMaps can have down:dne here, which + // is invalid in an FSMap (#17837) + drop_gids.insert(p.first); + } else { + insert(p.second); // into standby_daemons + } + } else { + mds_roles[p.first] = migrate_fs->fscid; + } + } + for (const auto &p : standby_daemons) { + // Erase from this Filesystem's MDSMap, because it has + // been copied into FSMap::Standby_daemons above + migrate_fs->mds_map.mds_info.erase(p.first); + } + for (const auto &gid : drop_gids) { + // Throw away all info for this MDS because it was identified + // as having invalid state above. + migrate_fs->mds_map.mds_info.erase(gid); + } + + legacy_client_fscid = migrate_fs->fscid; + } else { + legacy_client_fscid = FS_CLUSTER_ID_NONE; + } + } else { + ::decode(epoch, p); + ::decode(next_filesystem_id, p); + ::decode(legacy_client_fscid, p); + ::decode(compat, p); + ::decode(enable_multiple, p); + std::vector fs_list; + ::decode(fs_list, p); + filesystems.clear(); + for (std::vector::const_iterator fs = fs_list.begin(); fs != fs_list.end(); ++fs) { + filesystems[fs->fscid] = std::make_shared(*fs); + } + + ::decode(mds_roles, p); + ::decode(standby_daemons, p); + ::decode(standby_epochs, p); + if (struct_v >= 7) { + ::decode(ever_enabled_multiple, p); + } + } + + DECODE_FINISH(p); +} + +void FSMap::sanitize(std::function pool_exists) +{ + for (auto &fs : filesystems) { + fs.second->mds_map.sanitize(pool_exists); + } +} + +void Filesystem::encode(bufferlist& bl, uint64_t features) const +{ + ENCODE_START(1, 1, bl); + ::encode(fscid, bl); + bufferlist mdsmap_bl; + mds_map.encode(mdsmap_bl, features); + ::encode(mdsmap_bl, bl); + ENCODE_FINISH(bl); +} + +void Filesystem::decode(bufferlist::iterator& p) +{ + DECODE_START(1, p); + ::decode(fscid, p); + bufferlist mdsmap_bl; + ::decode(mdsmap_bl, p); + bufferlist::iterator mdsmap_bl_iter = mdsmap_bl.begin(); + mds_map.decode(mdsmap_bl_iter); + DECODE_FINISH(p); +} + +int FSMap::parse_filesystem( + std::string const &ns_str, + std::shared_ptr *result + ) const +{ + std::string ns_err; + fs_cluster_id_t fscid = strict_strtol(ns_str.c_str(), 10, &ns_err); + if (!ns_err.empty() || filesystems.count(fscid) == 0) { + for (auto &fs : filesystems) { + if (fs.second->mds_map.fs_name == ns_str) { + *result = std::const_pointer_cast(fs.second); + return 0; + } + } + return -ENOENT; + } else { + *result = get_filesystem(fscid); + return 0; + } +} + +void Filesystem::print(std::ostream &out) const +{ + out << "Filesystem '" << mds_map.fs_name + << "' (" << fscid << ")" << std::endl; + mds_map.print(out); +} + +mds_gid_t FSMap::find_standby_for(mds_role_t role, const std::string& name) const +{ + mds_gid_t result = MDS_GID_NONE; + + // First see if we have a STANDBY_REPLAY + auto fs = get_filesystem(role.fscid); + for (const auto &i : fs->mds_map.mds_info) { + const auto &info = i.second; + if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) { + return info.global_id; + } + } + + // See if there are any STANDBY daemons available + for (const auto &i : standby_daemons) { + const auto &gid = i.first; + const auto &info = i.second; + assert(info.state == MDSMap::STATE_STANDBY); + assert(info.rank == MDS_RANK_NONE); + + if (info.laggy()) { + continue; + } + + // The mds_info_t may or may not tell us exactly which filesystem + // the standby_for_rank refers to: lookup via legacy_client_fscid + mds_role_t target_role = { + info.standby_for_fscid == FS_CLUSTER_ID_NONE ? + legacy_client_fscid : info.standby_for_fscid, + info.standby_for_rank}; + + if ((target_role.rank == role.rank && target_role.fscid == role.fscid) + || (name.length() && info.standby_for_name == name)) { + // It's a named standby for *me*, use it. + return gid; + } else if ( + info.standby_for_rank < 0 && info.standby_for_name.length() == 0 && + (info.standby_for_fscid == FS_CLUSTER_ID_NONE || + info.standby_for_fscid == role.fscid)) { + // It's not a named standby for anyone, use it if we don't find + // a named standby for me later, unless it targets another FSCID. + result = gid; + } + } + + return result; +} + +mds_gid_t FSMap::find_unused_for(mds_role_t role, + bool force_standby_active) const { + for (const auto &i : standby_daemons) { + const auto &gid = i.first; + const auto &info = i.second; + assert(info.state == MDSMap::STATE_STANDBY); + + if (info.laggy() || info.rank >= 0) + continue; + + if (info.standby_for_fscid != FS_CLUSTER_ID_NONE && + info.standby_for_fscid != role.fscid) + continue; + if (info.standby_for_rank != MDS_RANK_NONE && + info.standby_for_rank != role.rank) + continue; + + // To be considered 'unused' a daemon must either not + // be selected for standby-replay or the force_standby_active + // setting must be enabled to use replay daemons anyway. + if (!info.standby_replay || force_standby_active) { + return gid; + } + } + return MDS_GID_NONE; +} + +mds_gid_t FSMap::find_replacement_for(mds_role_t role, const std::string& name, + bool force_standby_active) const { + const mds_gid_t standby = find_standby_for(role, name); + if (standby) + return standby; + else + return find_unused_for(role, force_standby_active); +} + +void FSMap::sanity() const +{ + if (legacy_client_fscid != FS_CLUSTER_ID_NONE) { + assert(filesystems.count(legacy_client_fscid) == 1); + } + + for (const auto &i : filesystems) { + auto fs = i.second; + assert(fs->mds_map.compat.compare(compat) == 0); + assert(fs->fscid == i.first); + for (const auto &j : fs->mds_map.mds_info) { + assert(j.second.rank != MDS_RANK_NONE); + assert(mds_roles.count(j.first) == 1); + assert(standby_daemons.count(j.first) == 0); + assert(standby_epochs.count(j.first) == 0); + assert(mds_roles.at(j.first) == i.first); + if (j.second.state != MDSMap::STATE_STANDBY_REPLAY) { + assert(fs->mds_map.up.at(j.second.rank) == j.first); + assert(fs->mds_map.failed.count(j.second.rank) == 0); + assert(fs->mds_map.damaged.count(j.second.rank) == 0); + } + } + + for (const auto &j : fs->mds_map.up) { + mds_rank_t rank = j.first; + assert(fs->mds_map.in.count(rank) == 1); + mds_gid_t gid = j.second; + assert(fs->mds_map.mds_info.count(gid) == 1); + } + } + + for (const auto &i : standby_daemons) { + assert(i.second.state == MDSMap::STATE_STANDBY); + assert(i.second.rank == MDS_RANK_NONE); + assert(i.second.global_id == i.first); + assert(standby_epochs.count(i.first) == 1); + assert(mds_roles.count(i.first) == 1); + assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE); + } + + for (const auto &i : standby_epochs) { + assert(standby_daemons.count(i.first) == 1); + } + + for (const auto &i : mds_roles) { + if (i.second == FS_CLUSTER_ID_NONE) { + assert(standby_daemons.count(i.first) == 1); + } else { + assert(filesystems.count(i.second) == 1); + assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1); + } + } +} + +void FSMap::promote( + mds_gid_t standby_gid, + const std::shared_ptr &filesystem, + mds_rank_t assigned_rank) +{ + assert(gid_exists(standby_gid)); + bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE; + if (!is_standby_replay) { + assert(standby_daemons.count(standby_gid)); + assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY); + } + + MDSMap &mds_map = filesystem->mds_map; + + // Insert daemon state to Filesystem + if (!is_standby_replay) { + mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid); + } else { + assert(mds_map.mds_info.count(standby_gid)); + assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY); + assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank); + } + MDSMap::mds_info_t &info = mds_map.mds_info[standby_gid]; + + if (mds_map.stopped.erase(assigned_rank)) { + // The cluster is being expanded with a stopped rank + info.state = MDSMap::STATE_STARTING; + } else if (!mds_map.is_in(assigned_rank)) { + // The cluster is being expanded with a new rank + info.state = MDSMap::STATE_CREATING; + } else { + // An existing rank is being assigned to a replacement + info.state = MDSMap::STATE_REPLAY; + mds_map.failed.erase(assigned_rank); + } + info.rank = assigned_rank; + info.inc = epoch; + mds_roles[standby_gid] = filesystem->fscid; + + // Update the rank state in Filesystem + mds_map.in.insert(assigned_rank); + mds_map.up[assigned_rank] = standby_gid; + + // Remove from the list of standbys + if (!is_standby_replay) { + standby_daemons.erase(standby_gid); + standby_epochs.erase(standby_gid); + } + + // Indicate that Filesystem has been modified + mds_map.epoch = epoch; +} + +void FSMap::assign_standby_replay( + const mds_gid_t standby_gid, + const fs_cluster_id_t leader_ns, + const mds_rank_t leader_rank) +{ + assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE); + assert(gid_exists(standby_gid)); + assert(!gid_has_rank(standby_gid)); + assert(standby_daemons.count(standby_gid)); + + // Insert to the filesystem + auto fs = filesystems.at(leader_ns); + fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid); + fs->mds_map.mds_info[standby_gid].rank = leader_rank; + fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY; + mds_roles[standby_gid] = leader_ns; + + // Remove from the list of standbys + standby_daemons.erase(standby_gid); + standby_epochs.erase(standby_gid); + + // Indicate that Filesystem has been modified + fs->mds_map.epoch = epoch; +} + +void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch) +{ + if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) { + standby_daemons.erase(who); + standby_epochs.erase(who); + } else { + auto &fs = filesystems.at(mds_roles.at(who)); + const auto &info = fs->mds_map.mds_info.at(who); + if (info.state != MDSMap::STATE_STANDBY_REPLAY) { + if (info.state == MDSMap::STATE_CREATING) { + // If this gid didn't make it past CREATING, then forget + // the rank ever existed so that next time it's handed out + // to a gid it'll go back into CREATING. + fs->mds_map.in.erase(info.rank); + } else { + // Put this rank into the failed list so that the next available + // STANDBY will pick it up. + fs->mds_map.failed.insert(info.rank); + } + assert(fs->mds_map.up.at(info.rank) == info.global_id); + fs->mds_map.up.erase(info.rank); + } + fs->mds_map.mds_info.erase(who); + fs->mds_map.last_failure_osd_epoch = blacklist_epoch; + fs->mds_map.epoch = epoch; + } + + mds_roles.erase(who); +} + +void FSMap::damaged(mds_gid_t who, epoch_t blacklist_epoch) +{ + assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE); + auto fs = filesystems.at(mds_roles.at(who)); + mds_rank_t rank = fs->mds_map.mds_info[who].rank; + + erase(who, blacklist_epoch); + fs->mds_map.failed.erase(rank); + fs->mds_map.damaged.insert(rank); + + assert(fs->mds_map.epoch == epoch); +} + +/** + * Update to indicate that the rank `rank` is to be removed + * from the damaged list of the filesystem `fscid` + */ +bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank) +{ + auto fs = filesystems.at(fscid); + + if (fs->mds_map.damaged.erase(rank)) { + fs->mds_map.failed.insert(rank); + fs->mds_map.epoch = epoch; + return true; + } else { + return false; + } +} + +void FSMap::insert(const MDSMap::mds_info_t &new_info) +{ + assert(new_info.state == MDSMap::STATE_STANDBY); + assert(new_info.rank == MDS_RANK_NONE); + mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE; + standby_daemons[new_info.global_id] = new_info; + standby_epochs[new_info.global_id] = epoch; +} + +std::list FSMap::stop(mds_gid_t who) +{ + assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE); + auto fs = filesystems.at(mds_roles.at(who)); + const auto &info = fs->mds_map.mds_info.at(who); + fs->mds_map.up.erase(info.rank); + fs->mds_map.in.erase(info.rank); + fs->mds_map.stopped.insert(info.rank); + + // Also drop any standby replays that were following this rank + std::list standbys; + for (const auto &i : fs->mds_map.mds_info) { + const auto &other_gid = i.first; + const auto &other_info = i.second; + if (other_info.rank == info.rank + && other_info.state == MDSMap::STATE_STANDBY_REPLAY) { + standbys.push_back(other_gid); + erase(other_gid, 0); + } + } + + fs->mds_map.mds_info.erase(who); + mds_roles.erase(who); + + fs->mds_map.epoch = epoch; + + return standbys; +} + + +/** + * Given one of the following forms: + * : + * : + * + * + * Parse into a mds_role_t. The rank-only form is only valid + * if legacy_client_ns is set. + */ +int FSMap::parse_role( + const std::string &role_str, + mds_role_t *role, + std::ostream &ss) const +{ + size_t colon_pos = role_str.find(":"); + size_t rank_pos; + std::shared_ptr fs; + if (colon_pos == std::string::npos) { + if (legacy_client_fscid == FS_CLUSTER_ID_NONE) { + ss << "No filesystem selected"; + return -ENOENT; + } + fs = get_filesystem(legacy_client_fscid); + rank_pos = 0; + } else { + if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) { + ss << "Invalid filesystem"; + return -ENOENT; + } + rank_pos = colon_pos+1; + } + + mds_rank_t rank; + std::string err; + std::string rank_str = role_str.substr(rank_pos); + long rank_i = strict_strtol(rank_str.c_str(), 10, &err); + if (rank_i < 0 || !err.empty()) { + ss << "Invalid rank '" << rank_str << "'"; + return -EINVAL; + } else { + rank = rank_i; + } + + if (fs->mds_map.in.count(rank) == 0) { + ss << "Rank '" << rank << "' not found"; + return -ENOENT; + } + + *role = {fs->fscid, rank}; + + return 0; +}