1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
19 using std::stringstream;
21 #include "mon/health_check.h"
// Emit this filesystem into the Formatter: opens an object section named
// "mdsmap" and writes the filesystem cluster id (fscid) under the key "id".
24 void Filesystem::dump(Formatter *f) const
26 f->open_object_section("mdsmap");
29 f->dump_int("id", fscid);
// Emit the FSMap into the Formatter. The visible output consists of the
// map epoch, a "compat" object section, a "feature_flags" object section
// holding the two multi-filesystem booleans, a "standbys" array with one
// "info" object per standby daemon, and a "filesystems" array with one
// "filesystem" object per entry in `filesystems`.
32 void FSMap::dump(Formatter *f) const
34 f->dump_int("epoch", epoch);
36 f->open_object_section("compat");
40 f->open_object_section("feature_flags");
41 f->dump_bool("enable_multiple", enable_multiple);
42 f->dump_bool("ever_enabled_multiple", ever_enabled_multiple);
45 f->open_array_section("standbys");
46 for (const auto &i : standby_daemons) {
47 f->open_object_section("info");
// Epoch recorded for this standby in standby_epochs, looked up with the same
// key as the standby_daemons entry. sanity() asserts that every standby has
// such an entry.
49 f->dump_int("epoch", standby_epochs.at(i.first));
54 f->open_array_section("filesystems");
55 for (const auto &fs : filesystems) {
56 f->open_object_section("filesystem");
// Test-instance generator. Allocates an FSMap and, while walking the MDSMap
// instances produced by MDSMap::generate_test_instances(), creates a
// Filesystem per iteration and registers it in the new map's `filesystems`
// under that Filesystem's fscid.
63 void FSMap::generate_test_instances(list<FSMap*>& ls)
// NOTE(review): raw owning pointer; presumably handed over to `ls` -- confirm.
65 FSMap *m = new FSMap();
67 std::list<MDSMap*> mds_map_instances;
68 MDSMap::generate_test_instances(mds_map_instances);
71 for (auto i : mds_map_instances) {
72 auto fs = std::make_shared<Filesystem>();
// Index the new Filesystem by its own cluster id.
76 m->filesystems[fs->fscid] = fs;
// clear() only empties the list of pointers; it does not delete the
// pointed-to MDSMap objects.
78 mds_map_instances.clear();
// Write a human-readable, multi-line description of the FSMap to `out`:
// a header (epoch, multi-filesystem flags, compat set, legacy client fscid),
// the filesystems, and the standby daemons.
83 void FSMap::print(ostream& out) const
85 out << "e" << epoch << std::endl;
86 out << "enable_multiple, ever_enabled_multiple: " << enable_multiple << ","
87 << ever_enabled_multiple << std::endl;
88 out << "compat: " << compat << std::endl;
89 out << "legacy client fscid: " << legacy_client_fscid << std::endl;
90 out << " " << std::endl;
// Explicit message when the map holds no filesystems.
92 if (filesystems.empty()) {
93 out << "No filesystems configured" << std::endl;
// Each filesystem prints itself through the pointer stored in the map.
97 for (const auto &fs : filesystems) {
98 fs.second->print(out);
99 out << " " << std::endl << " " << std::endl; // Space out a bit
// The heading is only written when at least one standby daemon exists.
102 if (!standby_daemons.empty()) {
103 out << "Standby daemons:" << std::endl << " " << std::endl;
// One summary per standby, produced by the daemon info's print_summary().
106 for (const auto &p : standby_daemons) {
107 p.second.print_summary(out);
// Produce a short status summary of the filesystems and MDS daemons.
// The signature offers two sinks: a Formatter `f` for structured output and
// an ostream `out` for plain text; the visible statements write the same
// facts to each.
// NOTE(review): the conditions that choose between `f` and `out` are not
// visible here -- presumably `f` is used when non-null; confirm.
114 void FSMap::print_summary(Formatter *f, ostream *out) const
// by_rank: role -> "name=state" text, filled in the daemon loop below.
116 map<mds_role_t,string> by_rank;
// by_state: string -> int table, reported in reverse key order further down.
117 map<string,int> by_state;
// Structured form: map epoch, then id/up/in/max for each filesystem.
120 f->dump_unsigned("epoch", get_epoch());
// NOTE(review): `auto i` copies each map entry per iteration; a const
// reference would avoid the copy.
121 for (auto i : filesystems) {
123 f->dump_unsigned("id", fs->fscid);
124 f->dump_unsigned("up", fs->mds_map.up.size());
125 f->dump_unsigned("in", fs->mds_map.in.size());
126 f->dump_unsigned("max", fs->mds_map.max_mds);
// Plain-text form of the same data: "<fs_name>-<up>/<in>/<max> up ".
129 for (auto i : filesystems) {
131 *out << fs->mds_map.fs_name << "-" << fs->mds_map.up.size() << "/"
132 << fs->mds_map.in.size() << "/" << fs->mds_map.max_mds << " up ";
137 f->open_array_section("by_rank");
// Visit every daemon returned by get_mds_info().
140 const auto all_info = get_mds_info();
141 for (const auto &p : all_info) {
142 const auto &info = p.second;
// Start from the state name; a "(laggy or crashed)" suffix may be appended.
143 string s = ceph_mds_state_name(info.state);
145 s += "(laggy or crashed)";
// Filesystem id recorded for this daemon in mds_roles.
148 const fs_cluster_id_t fscid = mds_roles.at(info.global_id);
// This guard selects daemons that hold a rank and are not in standby-replay.
150 if (info.rank != MDS_RANK_NONE &&
151 info.state != MDSMap::STATE_STANDBY_REPLAY) {
153 f->open_object_section("mds");
154 f->dump_unsigned("filesystem_id", fscid);
155 f->dump_unsigned("rank", info.rank);
156 f->dump_string("name", info.name);
157 f->dump_string("status", s);
// Text form keeps the entry for later printing, keyed by (fscid, rank).
160 by_rank[mds_role_t(fscid, info.rank)] = info.name + "=" + s;
// Text form of the per-rank table. With more than one filesystem the keys
// are rendered as "[fs_name:rank]"; otherwise only the rank is used.
170 if (!by_rank.empty()) {
171 if (filesystems.size() > 1) {
172 // Disambiguate filesystems
173 std::map<std::string, std::string> pretty;
174 for (auto i : by_rank) {
175 const auto &fs_name = filesystems.at(i.first.fscid)->mds_map.fs_name;
176 std::ostringstream o;
177 o << "[" << fs_name << ":" << i.first.rank << "]";
178 pretty[o.str()] = i.second;
180 *out << " " << pretty;
182 // Omit FSCID in output when only one filesystem exists
183 std::map<mds_rank_t, std::string> shortened;
184 for (auto i : by_rank) {
185 shortened[i.first.rank] = i.second;
187 *out << " " << shortened;
// by_state is walked from the last key to the first.
192 for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) {
194 f->dump_unsigned(p->first.c_str(), p->second);
196 *out << ", " << p->second << " " << p->first;
// Totals of failed and damaged ranks, summed over all filesystems.
202 for (auto i : filesystems) {
204 failed += fs->mds_map.failed.size();
205 damaged += fs->mds_map.damaged.size();
210 f->dump_unsigned("failed", failed);
212 *out << ", " << failed << " failed";
218 f->dump_unsigned("damaged", damaged);
220 *out << ", " << damaged << " damaged";
223 //if (stopped.size())
224 //out << ", " << stopped.size() << " stopped";
// Create a filesystem named `name` that uses the given metadata and data
// pools, register it in `filesystems` under its fscid, and make it the
// legacy-client filesystem when it is the only filesystem in the map.
228 void FSMap::create_filesystem(const std::string &name,
229 int64_t metadata_pool, int64_t data_pool,
232 auto fs = std::make_shared<Filesystem>();
// Initial MDSMap settings: max_mds of 1, the supplied pools, cas_pool set to
// -1, size limit and session timeouts taken from g_conf, this FSMap's compat
// set, and created/modified stamped with the current clock.
233 fs->mds_map.fs_name = name;
234 fs->mds_map.max_mds = 1;
235 fs->mds_map.data_pools.push_back(data_pool);
236 fs->mds_map.metadata_pool = metadata_pool;
237 fs->mds_map.cas_pool = -1;
238 fs->mds_map.max_file_size = g_conf->mds_max_file_size;
239 fs->mds_map.compat = compat;
240 fs->mds_map.created = ceph_clock_now();
241 fs->mds_map.modified = ceph_clock_now();
242 fs->mds_map.session_timeout = g_conf->mds_session_timeout;
243 fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
244 fs->mds_map.enabled = true;
// With the Jewel feature bit the id comes from the next_filesystem_id
// counter, which is advanced by the post-increment.
245 if (features & CEPH_FEATURE_SERVER_JEWEL) {
246 fs->fscid = next_filesystem_id++;
247 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
248 // have initialized next_filesystem_id such that it's never used here.
249 assert(fs->fscid != FS_CLUSTER_ID_ANONYMOUS);
251 // Use anon fscid because this will get thrown away when encoding
252 // as legacy MDSMap for legacy mons.
253 assert(filesystems.empty());
254 fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
256 filesystems[fs->fscid] = fs;
258 // Created first filesystem? Set it as the one
259 // for legacy clients to use
260 if (filesystems.size() == 1) {
261 legacy_client_fscid = fs->fscid;
// Replace the filesystem `fscid` with a freshly initialised Filesystem.
// Identity and configuration (fscid, name, pools, inline-data flag,
// standby_count_wanted) are carried over; rank state is rebuilt so that
// rank 0 is both "in" and "failed", and every other rank that was previously
// in or stopped is recorded as stopped.
265 void FSMap::reset_filesystem(fs_cluster_id_t fscid)
267 auto fs = get_filesystem(fscid);
268 auto new_fs = std::make_shared<Filesystem>();
270 // Populate rank 0 as existing (so don't go into CREATING)
271 // but failed (so that next available MDS is assigned the rank)
272 new_fs->mds_map.in.insert(mds_rank_t(0));
273 new_fs->mds_map.failed.insert(mds_rank_t(0));
275 // Carry forward what makes sense
276 new_fs->fscid = fs->fscid;
277 new_fs->mds_map.inline_data_enabled = fs->mds_map.inline_data_enabled;
278 new_fs->mds_map.max_mds = 1;
279 new_fs->mds_map.data_pools = fs->mds_map.data_pools;
280 new_fs->mds_map.metadata_pool = fs->mds_map.metadata_pool;
281 new_fs->mds_map.cas_pool = fs->mds_map.cas_pool;
282 new_fs->mds_map.fs_name = fs->mds_map.fs_name;
// The following values are re-read from g_conf / this FSMap / the clock
// rather than copied from the old filesystem.
283 new_fs->mds_map.max_file_size = g_conf->mds_max_file_size;
284 new_fs->mds_map.compat = compat;
285 new_fs->mds_map.created = ceph_clock_now();
286 new_fs->mds_map.modified = ceph_clock_now();
287 new_fs->mds_map.session_timeout = g_conf->mds_session_timeout;
288 new_fs->mds_map.session_autoclose = g_conf->mds_session_autoclose;
289 new_fs->mds_map.standby_count_wanted = fs->mds_map.standby_count_wanted;
290 new_fs->mds_map.enabled = true;
292 // Remember mds ranks that have ever started. (They should load old inotable
293 // instead of creating new one if they start again.)
294 new_fs->mds_map.stopped.insert(fs->mds_map.in.begin(), fs->mds_map.in.end());
295 new_fs->mds_map.stopped.insert(fs->mds_map.stopped.begin(), fs->mds_map.stopped.end());
// Rank 0 was marked in+failed above, so it must not also appear as stopped.
296 new_fs->mds_map.stopped.erase(mds_rank_t(0));
298 // Install the new Filesystem under the same fscid, replacing the old one
299 filesystems[new_fs->fscid] = new_fs;
// Collect health messages. Every filesystem's MDSMap appends its own entries
// to `summary` / `detail`; in addition a HEALTH_WARN line is pushed onto
// `summary` when standby_count_wanted ends up non-zero.
302 void FSMap::get_health(list<pair<health_status_t,string> >& summary,
303 list<pair<health_status_t,string> > *detail) const
// Largest value returned by any filesystem's get_standby_count_wanted().
305 mds_rank_t standby_count_wanted = 0;
306 for (const auto &i : filesystems) {
307 const auto &fs = i.second;
309 // TODO: move get_health up into here so that we can qualify
310 // all the messages with what filesystem they're talking about
311 fs->mds_map.get_health(summary, detail);
// Each MDSMap is given the current number of standby daemons.
313 standby_count_wanted = std::max(standby_count_wanted, fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
// The warning text reports the current standby count and the maximum
// computed above ("want N more").
316 if (standby_count_wanted) {
317 std::ostringstream oss;
318 oss << "insufficient standby daemons available: have " << standby_daemons.size() << "; want " << standby_count_wanted << " more";
319 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
// Run MDSMap::check_health() on every filesystem, passing the current number
// of standby daemons. `changed` ORs together the per-filesystem results.
323 bool FSMap::check_health(void)
325 bool changed = false;
326 for (auto &i : filesystems) {
327 changed |= i.second->mds_map.check_health((mds_rank_t)standby_daemons.size());
// Fill `checks` with structured health checks. For every filesystem the
// MDSMap's own checks are gathered into a temporary map and merged in, an
// FS_WITH_FAILED_MDS entry is added when failed ranks have no replacement,
// and finally MDS_INSUFFICIENT_STANDBY is added when more standbys are wanted.
332 void FSMap::get_health_checks(health_check_map_t *checks) const
// Largest value returned by any filesystem's get_standby_count_wanted().
334 mds_rank_t standby_count_wanted = 0;
335 for (const auto &i : filesystems) {
336 const auto &fs = i.second;
337 health_check_map_t fschecks;
339 fs->mds_map.get_health_checks(&fschecks);
341 // Some of the failed ranks might be transient (i.e. there are standbys
342 // ready to replace them). We will report only on "stuck" failed, i.e.
343 // ranks which are failed and have no standby replacement available.
344 std::set<mds_rank_t> stuck_failed;
// A rank is "stuck" when find_replacement_for() (called with an empty name)
// returns MDS_GID_NONE for it.
346 for (const auto &rank : fs->mds_map.failed) {
347 const mds_gid_t replacement = find_replacement_for(
348 {fs->fscid, rank}, {}, g_conf->mon_force_standby_active);
349 if (replacement == MDS_GID_NONE) {
350 stuck_failed.insert(rank);
354 // FS_WITH_FAILED_MDS
355 if (!stuck_failed.empty()) {
// get_or_add() is called with the same code for every affected filesystem;
// each one contributes a detail line.
356 health_check_t& fscheck = checks->get_or_add(
357 "FS_WITH_FAILED_MDS", HEALTH_WARN,
358 "%num% filesystem%plurals% %hasorhave% a failed mds daemon");
360 ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
361 << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
362 fscheck.detail.push_back(ss.str()); }
364 checks->merge(fschecks);
365 standby_count_wanted = std::max(
366 standby_count_wanted,
367 fs->mds_map.get_standby_count_wanted((mds_rank_t)standby_daemons.size()));
370 // MDS_INSUFFICIENT_STANDBY
371 if (standby_count_wanted) {
// oss holds the summary text, dss the single detail line.
372 std::ostringstream oss, dss;
373 oss << "insufficient standby MDS daemons available";
374 auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str());
375 dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted
377 d.detail.push_back(dss.str());
// Encode the map into `bl`. When `features` includes
// CEPH_FEATURE_SERVER_JEWEL the FSMap fields are written directly
// (ENCODE_START with version 7, compat 6). The remaining statements build
// and encode a single MDSMap in the pre-Jewel style instead.
381 void FSMap::encode(bufferlist& bl, uint64_t features) const
383 if (features & CEPH_FEATURE_SERVER_JEWEL) {
384 ENCODE_START(7, 6, bl);
386 ::encode(next_filesystem_id, bl);
387 ::encode(legacy_client_fscid, bl);
388 ::encode(compat, bl);
389 ::encode(enable_multiple, bl);
// Filesystems are stored behind pointers in the map; they are copied by
// value into a vector so the vector can be encoded as a whole.
390 std::vector<Filesystem> fs_list;
391 for (auto i : filesystems) {
392 fs_list.push_back(*(i.second));
394 ::encode(fs_list, bl, features);
395 ::encode(mds_roles, bl);
396 ::encode(standby_daemons, bl, features);
397 ::encode(standby_epochs, bl);
398 ::encode(ever_enabled_multiple, bl);
// Legacy path with no filesystem at all: `disabled_map` is stamped with the
// FSMap epoch and encoded.
401 if (filesystems.empty()) {
403 disabled_map.epoch = epoch;
404 disabled_map.encode(bl, features);
406 // MDSMonitor should never have created multiple filesystems
407 // until the quorum features indicated Jewel
408 assert(filesystems.size() == 1);
409 auto fs = filesystems.begin()->second;
411 // Take the MDSMap for the enabled filesystem, and populate its
412 // mds_info with the standbys to get a pre-jewel-style mon MDSMap.
413 MDSMap full_mdsmap = fs->mds_map;
414 full_mdsmap.epoch = epoch;
415 for (const auto &p : standby_daemons) {
416 full_mdsmap.mds_info[p.first] = p.second;
419 // Old MDSMaps don't set rank on standby replay daemons
420 for (auto &i : full_mdsmap.mds_info) {
421 auto &info = i.second;
422 if (info.state == MDSMap::STATE_STANDBY_REPLAY) {
423 info.rank = MDS_RANK_NONE;
427 full_mdsmap.encode(bl, features);
// Decode an FSMap from `p`. Per the comments below, the stream holds either
// a legacy MDSMap (struct version below 6), which is converted into an FSMap
// with at most one filesystem using the anonymous fscid, or an FSMap.
// NOTE(review): the version checks that separate the individual branches are
// not visible in this view.
432 void FSMap::decode(bufferlist::iterator& p)
434 // The highest MDSMap encoding version before we changed the
435 // MDSMonitor to store an FSMap instead of an MDSMap was
436 // 5, so anything older than 6 is decoded as an MDSMap,
437 // and anything newer is decoded as an FSMap.
438 DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p);
440 // Because the mon used to store an MDSMap where we now
441 // store an FSMap, FSMap knows how to decode the legacy
442 // MDSMap format (it never needs to encode it though).
443 MDSMap legacy_mds_map;
445 // Decoding an MDSMap (upgrade)
447 ::decode(legacy_mds_map.flags, p);
448 ::decode(legacy_mds_map.last_failure, p);
449 ::decode(legacy_mds_map.root, p);
450 ::decode(legacy_mds_map.session_timeout, p);
451 ::decode(legacy_mds_map.session_autoclose, p);
452 ::decode(legacy_mds_map.max_file_size, p);
453 ::decode(legacy_mds_map.max_mds, p);
454 ::decode(legacy_mds_map.mds_info, p);
// NOTE(review): `m` and `s` are declared outside the visible lines;
// presumably they hold older encodings of the pool ids -- confirm.
461 legacy_mds_map.data_pools.push_back(m);
465 legacy_mds_map.cas_pool = s;
467 ::decode(legacy_mds_map.data_pools, p);
468 ::decode(legacy_mds_map.cas_pool, p);
471 // kclient ignores everything from here
476 ::decode(legacy_mds_map.compat, p);
// Alternative to decoding: start from the base compat set.
478 legacy_mds_map.compat = get_mdsmap_compat_set_base();
482 legacy_mds_map.metadata_pool = n;
484 ::decode(legacy_mds_map.metadata_pool, p);
486 ::decode(legacy_mds_map.created, p);
487 ::decode(legacy_mds_map.modified, p);
488 ::decode(legacy_mds_map.tableserver, p);
489 ::decode(legacy_mds_map.in, p);
490 std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
492 ::decode(legacy_mds_map.up, p);
493 ::decode(legacy_mds_map.failed, p);
494 ::decode(legacy_mds_map.stopped, p);
496 ::decode(legacy_mds_map.last_failure_osd_epoch, p);
499 // previously this was a bool about snaps, not a flag map
// The boolean `flag` is translated into the CEPH_MDSMAP_ALLOW_SNAPS bit of
// both feature masks.
502 legacy_mds_map.ever_allowed_features = flag ?
503 CEPH_MDSMAP_ALLOW_SNAPS : 0;
505 legacy_mds_map.explicitly_allowed_features = flag ?
506 CEPH_MDSMAP_ALLOW_SNAPS : 0;
// A map that already has more than one rank is marked as allowing multi-MDS.
507 if (legacy_mds_map.max_mds > 1) {
508 legacy_mds_map.set_multimds_allowed();
511 ::decode(legacy_mds_map.ever_allowed_features, p);
512 ::decode(legacy_mds_map.explicitly_allowed_features, p);
// Defaults used when the feature masks are not decoded from the stream.
515 legacy_mds_map.ever_allowed_features = CEPH_MDSMAP_ALLOW_CLASSICS;
516 legacy_mds_map.explicitly_allowed_features = 0;
517 if (legacy_mds_map.max_mds > 1) {
518 legacy_mds_map.set_multimds_allowed();
522 ::decode(legacy_mds_map.inline_data_enabled, p);
525 assert(struct_v >= 5);
526 ::decode(legacy_mds_map.enabled, p);
527 ::decode(legacy_mds_map.fs_name, p);
// Alternative to decoding: the name defaults to "default" and `enabled` is
// derived as described by the two comments below.
529 legacy_mds_map.fs_name = "default";
531 // If an MDS has ever been started, epoch will be greater than 1,
532 // assume filesystem is enabled.
533 legacy_mds_map.enabled = true;
535 // Upgrading from a cluster that never used an MDS, switch off
536 // filesystem until it's explicitly enabled.
537 legacy_mds_map.enabled = false;
542 ::decode(legacy_mds_map.damaged, p);
545 // We're upgrading, populate filesystems from the legacy fields
547 standby_daemons.clear();
548 standby_epochs.clear();
550 compat = legacy_mds_map.compat;
551 enable_multiple = false;
553 // Synthesise a Filesystem from legacy_mds_map, if enabled
554 if (legacy_mds_map.enabled) {
555 // Construct a Filesystem from the legacy MDSMap
556 auto migrate_fs = std::make_shared<Filesystem>();
557 migrate_fs->fscid = FS_CLUSTER_ID_ANONYMOUS;
558 migrate_fs->mds_map = legacy_mds_map;
559 migrate_fs->mds_map.epoch = epoch;
560 filesystems[migrate_fs->fscid] = migrate_fs;
562 // List of GIDs that had invalid states
563 std::set<mds_gid_t> drop_gids;
565 // Construct mds_roles, standby_daemons, and remove
566 // standbys from the MDSMap in the Filesystem.
567 for (auto &p : migrate_fs->mds_map.mds_info) {
568 if (p.second.state == MDSMap::STATE_STANDBY_REPLAY) {
569 // In legacy MDSMap, standby replay daemons don't have
570 // rank set, but since FSMap they do.
571 p.second.rank = p.second.standby_for_rank;
// Rank-less daemons are either standbys (moved to standby_daemons through
// insert()) or have an invalid state and are queued for removal.
573 if (p.second.rank == MDS_RANK_NONE) {
574 if (p.second.state != MDSMap::STATE_STANDBY) {
575 // Old MDSMaps can have down:dne here, which
576 // is invalid in an FSMap (#17837)
577 drop_gids.insert(p.first);
579 insert(p.second); // into standby_daemons
582 mds_roles[p.first] = migrate_fs->fscid;
// Removal is done in separate passes after the loop over mds_info above.
585 for (const auto &p : standby_daemons) {
586 // Erase from this Filesystem's MDSMap, because it has
587 // been copied into FSMap::standby_daemons above
588 migrate_fs->mds_map.mds_info.erase(p.first);
590 for (const auto &gid : drop_gids) {
591 // Throw away all info for this MDS because it was identified
592 // as having invalid state above.
593 migrate_fs->mds_map.mds_info.erase(gid);
596 legacy_client_fscid = migrate_fs->fscid;
598 legacy_client_fscid = FS_CLUSTER_ID_NONE;
// FSMap fields, read in the same order in which encode() writes them.
602 ::decode(next_filesystem_id, p);
603 ::decode(legacy_client_fscid, p);
605 ::decode(enable_multiple, p);
606 std::vector<Filesystem> fs_list;
607 ::decode(fs_list, p);
// Rebuild the fscid -> Filesystem map; each decoded Filesystem is copied
// into its own shared_ptr.
609 for (std::vector<Filesystem>::const_iterator fs = fs_list.begin(); fs != fs_list.end(); ++fs) {
610 filesystems[fs->fscid] = std::make_shared<Filesystem>(*fs);
613 ::decode(mds_roles, p);
614 ::decode(standby_daemons, p);
615 ::decode(standby_epochs, p);
617 ::decode(ever_enabled_multiple, p);
// Forward the `pool_exists` predicate to MDSMap::sanitize() of every
// filesystem in the map.
624 void FSMap::sanitize(std::function<bool(int64_t pool)> pool_exists)
626 for (auto &fs : filesystems) {
627 fs.second->mds_map.sanitize(pool_exists);
// Encode this Filesystem into `bl` (ENCODE_START version 1, compat 1).
// The MDSMap is first encoded into its own bufferlist, and that bufferlist
// is then encoded into `bl` as a nested buffer.
631 void Filesystem::encode(bufferlist& bl, uint64_t features) const
633 ENCODE_START(1, 1, bl);
635 bufferlist mdsmap_bl;
636 mds_map.encode(mdsmap_bl, features);
637 ::encode(mdsmap_bl, bl);
// Decode this Filesystem from `p`. Mirrors Filesystem::encode(): the nested
// bufferlist is extracted first and the MDSMap is then decoded from an
// iterator positioned at its start.
641 void Filesystem::decode(bufferlist::iterator& p)
645 bufferlist mdsmap_bl;
646 ::decode(mdsmap_bl, p);
647 bufferlist::iterator mdsmap_bl_iter = mdsmap_bl.begin();
648 mds_map.decode(mdsmap_bl_iter);
// Resolve a filesystem from `ns_str`, which may be a numeric fscid or a
// filesystem name, and hand it back through `*result`.
// NOTE(review): the return statements are not visible here; parse_role()
// treats a negative return value as failure.
652 int FSMap::parse_filesystem(
653 std::string const &ns_str,
654 std::shared_ptr<const Filesystem> *result
// Try the string as a base-10 integer id first.
658 fs_cluster_id_t fscid = strict_strtol(ns_str.c_str(), 10, &ns_err);
// Not a number, or no filesystem with that id: look it up by fs_name.
659 if (!ns_err.empty() || filesystems.count(fscid) == 0) {
660 for (auto &fs : filesystems) {
661 if (fs.second->mds_map.fs_name == ns_str) {
662 *result = std::const_pointer_cast<const Filesystem>(fs.second);
// A valid id that exists in the map.
668 *result = get_filesystem(fscid);
// Write a heading of the form "Filesystem '<fs_name>' (<fscid>)" followed by
// a line break to `out`.
673 void Filesystem::print(std::ostream &out) const
675 out << "Filesystem '" << mds_map.fs_name
676 << "' (" << fscid << ")" << std::endl;
// Find a daemon that can stand in for `role`. A standby-replay daemon already
// following the rank is returned immediately; otherwise the standby daemons
// are examined for one targeting this role (or `name`), with untargeted
// standbys considered as a fallback per the comments below.
// `result` starts as MDS_GID_NONE.
680 mds_gid_t FSMap::find_standby_for(mds_role_t role, const std::string& name) const
682 mds_gid_t result = MDS_GID_NONE;
684 // First see if we have a STANDBY_REPLAY
685 auto fs = get_filesystem(role.fscid);
686 for (const auto &i : fs->mds_map.mds_info) {
687 const auto &info = i.second;
688 if (info.rank == role.rank && info.state == MDSMap::STATE_STANDBY_REPLAY) {
689 return info.global_id;
693 // See if there are any STANDBY daemons available
694 for (const auto &i : standby_daemons) {
695 const auto &gid = i.first;
696 const auto &info = i.second;
// Entries of standby_daemons are expected to be rank-less and in the STANDBY
// state.
697 assert(info.state == MDSMap::STATE_STANDBY);
698 assert(info.rank == MDS_RANK_NONE);
704 // The mds_info_t may or may not tell us exactly which filesystem
705 // the standby_for_rank refers to: lookup via legacy_client_fscid
706 mds_role_t target_role = {
707 info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
708 legacy_client_fscid : info.standby_for_fscid,
709 info.standby_for_rank};
// Exact match on (fscid, rank), or on a non-empty daemon name.
711 if ((target_role.rank == role.rank && target_role.fscid == role.fscid)
712 || (name.length() && info.standby_for_name == name)) {
713 // It's a named standby for *me*, use it.
// Untargeted standby: no rank, no name, and either no fscid or this fscid.
716 info.standby_for_rank < 0 && info.standby_for_name.length() == 0 &&
717 (info.standby_for_fscid == FS_CLUSTER_ID_NONE ||
718 info.standby_for_fscid == role.fscid)) {
719 // It's not a named standby for anyone, use it if we don't find
720 // a named standby for me later, unless it targets another FSCID.
// Scan the standby daemons for one that is usable for `role`.
// NOTE(review): the statements that follow each `if` are not visible here;
// presumably the first three guards skip the daemon and the last one returns
// its gid -- confirm.
728 mds_gid_t FSMap::find_unused_for(mds_role_t role,
729 bool force_standby_active) const {
730 for (const auto &i : standby_daemons) {
731 const auto &gid = i.first;
732 const auto &info = i.second;
733 assert(info.state == MDSMap::STATE_STANDBY);
// Guard on laggy daemons and on daemons that already have a rank.
735 if (info.laggy() || info.rank >= 0)
// Guard on standbys that target a different filesystem.
738 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
739 info.standby_for_fscid != role.fscid)
// Guard on standbys that target a different rank.
741 if (info.standby_for_rank != MDS_RANK_NONE &&
742 info.standby_for_rank != role.rank)
745 // To be considered 'unused' a daemon must either not
746 // be selected for standby-replay or the force_standby_active
747 // setting must be enabled to use replay daemons anyway.
748 if (!info.standby_replay || force_standby_active) {
// Find a daemon to take over `role`: find_standby_for() is consulted first,
// and find_unused_for() supplies the other visible return value.
// NOTE(review): the check on `standby` between the two calls is not visible;
// presumably a real gid is returned directly -- confirm.
755 mds_gid_t FSMap::find_replacement_for(mds_role_t role, const std::string& name,
756 bool force_standby_active) const {
757 const mds_gid_t standby = find_standby_for(role, name);
761 return find_unused_for(role, force_standby_active);
// Assert the internal consistency of the FSMap: the filesystems, the
// standby tables and mds_roles must all agree with one another. Every check
// is an assert, so a violation aborts in builds where asserts are active.
764 void FSMap::sanity() const
// A selected legacy-client filesystem must exist.
766 if (legacy_client_fscid != FS_CLUSTER_ID_NONE) {
767 assert(filesystems.count(legacy_client_fscid) == 1);
770 for (const auto &i : filesystems) {
// Each filesystem shares the FSMap's compat set and is stored under its own
// fscid.
772 assert(fs->mds_map.compat.compare(compat) == 0);
773 assert(fs->fscid == i.first);
// Daemons listed in a filesystem hold a rank, are mapped to this filesystem
// in mds_roles, and do not appear in the standby tables.
774 for (const auto &j : fs->mds_map.mds_info) {
775 assert(j.second.rank != MDS_RANK_NONE);
776 assert(mds_roles.count(j.first) == 1);
777 assert(standby_daemons.count(j.first) == 0);
778 assert(standby_epochs.count(j.first) == 0);
779 assert(mds_roles.at(j.first) == i.first);
// Except for standby-replay daemons, the daemon must be the one recorded in
// `up` for its rank, and that rank must be neither failed nor damaged.
780 if (j.second.state != MDSMap::STATE_STANDBY_REPLAY) {
781 assert(fs->mds_map.up.at(j.second.rank) == j.first);
782 assert(fs->mds_map.failed.count(j.second.rank) == 0);
783 assert(fs->mds_map.damaged.count(j.second.rank) == 0);
// Every up rank is also in, and its gid has an mds_info entry.
787 for (const auto &j : fs->mds_map.up) {
788 mds_rank_t rank = j.first;
789 assert(fs->mds_map.in.count(rank) == 1);
790 mds_gid_t gid = j.second;
791 assert(fs->mds_map.mds_info.count(gid) == 1);
// Standbys: STANDBY state, no rank, keyed by their own gid, with an epoch
// entry and an mds_roles entry equal to FS_CLUSTER_ID_NONE.
795 for (const auto &i : standby_daemons) {
796 assert(i.second.state == MDSMap::STATE_STANDBY);
797 assert(i.second.rank == MDS_RANK_NONE);
798 assert(i.second.global_id == i.first);
799 assert(standby_epochs.count(i.first) == 1);
800 assert(mds_roles.count(i.first) == 1);
801 assert(mds_roles.at(i.first) == FS_CLUSTER_ID_NONE);
// No epoch entry without a matching standby.
804 for (const auto &i : standby_epochs) {
805 assert(standby_daemons.count(i.first) == 1);
// Every mds_roles entry points at either a standby or a daemon that exists in
// the referenced filesystem.
808 for (const auto &i : mds_roles) {
809 if (i.second == FS_CLUSTER_ID_NONE) {
810 assert(standby_daemons.count(i.first) == 1);
812 assert(filesystems.count(i.second) == 1);
813 assert(filesystems.at(i.second)->mds_map.mds_info.count(i.first) == 1);
819 mds_gid_t standby_gid,
820 const std::shared_ptr<Filesystem> &filesystem,
821 mds_rank_t assigned_rank)
// Give `assigned_rank` of `filesystem` to the daemon `standby_gid`, which is
// either a plain standby or a standby-replay daemon already attached to a
// filesystem.
// NOTE(review): the line carrying this routine's name is not visible in this
// view.
823 assert(gid_exists(standby_gid));
// A daemon whose mds_roles entry is not FS_CLUSTER_ID_NONE is treated as a
// standby-replay daemon; anything else must be a STANDBY in standby_daemons.
824 bool is_standby_replay = mds_roles.at(standby_gid) != FS_CLUSTER_ID_NONE;
825 if (!is_standby_replay) {
826 assert(standby_daemons.count(standby_gid));
827 assert(standby_daemons.at(standby_gid).state == MDSMap::STATE_STANDBY);
830 MDSMap &mds_map = filesystem->mds_map;
832 // Insert daemon state to Filesystem
833 if (!is_standby_replay) {
834 mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
// A standby-replay daemon must already be present in this MDSMap and be
// following the rank it is being given.
836 assert(mds_map.mds_info.count(standby_gid));
837 assert(mds_map.mds_info.at(standby_gid).state == MDSMap::STATE_STANDBY_REPLAY);
838 assert(mds_map.mds_info.at(standby_gid).rank == assigned_rank);
840 MDSMap::mds_info_t &info = mds_map.mds_info[standby_gid];
// The new state depends on the history of the rank; erase() on `stopped`
// both tests for and removes the rank.
842 if (mds_map.stopped.erase(assigned_rank)) {
843 // The cluster is being expanded with a stopped rank
844 info.state = MDSMap::STATE_STARTING;
845 } else if (!mds_map.is_in(assigned_rank)) {
846 // The cluster is being expanded with a new rank
847 info.state = MDSMap::STATE_CREATING;
849 // An existing rank is being assigned to a replacement
850 info.state = MDSMap::STATE_REPLAY;
851 mds_map.failed.erase(assigned_rank);
853 info.rank = assigned_rank;
855 mds_roles[standby_gid] = filesystem->fscid;
857 // Update the rank state in Filesystem
858 mds_map.in.insert(assigned_rank);
859 mds_map.up[assigned_rank] = standby_gid;
861 // Remove from the list of standbys
862 if (!is_standby_replay) {
863 standby_daemons.erase(standby_gid);
864 standby_epochs.erase(standby_gid);
867 // Indicate that Filesystem has been modified
868 mds_map.epoch = epoch;
// Attach the standby daemon `standby_gid` to filesystem `leader_ns` as a
// standby-replay follower of `leader_rank`: its info is copied into the
// filesystem's MDSMap with the new rank and state, mds_roles is updated, and
// the daemon is removed from the standby tables.
871 void FSMap::assign_standby_replay(
872 const mds_gid_t standby_gid,
873 const fs_cluster_id_t leader_ns,
874 const mds_rank_t leader_rank)
// NOTE(review): mds_roles.at() is evaluated before the gid_exists() assert;
// if at() throws for a missing key, an unknown gid never reaches that assert
// -- confirm the intended order.
876 assert(mds_roles.at(standby_gid) == FS_CLUSTER_ID_NONE);
877 assert(gid_exists(standby_gid));
878 assert(!gid_has_rank(standby_gid));
879 assert(standby_daemons.count(standby_gid));
881 // Insert to the filesystem
882 auto fs = filesystems.at(leader_ns);
883 fs->mds_map.mds_info[standby_gid] = standby_daemons.at(standby_gid);
884 fs->mds_map.mds_info[standby_gid].rank = leader_rank;
885 fs->mds_map.mds_info[standby_gid].state = MDSMap::STATE_STANDBY_REPLAY;
886 mds_roles[standby_gid] = leader_ns;
888 // Remove from the list of standbys
889 standby_daemons.erase(standby_gid);
890 standby_epochs.erase(standby_gid);
892 // Indicate that Filesystem has been modified
893 fs->mds_map.epoch = epoch;
// Remove the daemon `who` from the map. A standby (mds_roles entry equal to
// FS_CLUSTER_ID_NONE) is dropped from the standby tables. A daemon attached
// to a filesystem is removed from that filesystem's MDSMap, which also
// records `blacklist_epoch` as last_failure_osd_epoch and takes the current
// FSMap epoch. The mds_roles entry is erased as well.
896 void FSMap::erase(mds_gid_t who, epoch_t blacklist_epoch)
898 if (mds_roles.at(who) == FS_CLUSTER_ID_NONE) {
899 standby_daemons.erase(who);
900 standby_epochs.erase(who);
902 auto &fs = filesystems.at(mds_roles.at(who));
// `info` refers to the entry that is erased further down; all uses of it
// come before that erase.
903 const auto &info = fs->mds_map.mds_info.at(who);
// Rank bookkeeping applies only to daemons that are not standby-replay.
904 if (info.state != MDSMap::STATE_STANDBY_REPLAY) {
905 if (info.state == MDSMap::STATE_CREATING) {
906 // If this gid didn't make it past CREATING, then forget
907 // the rank ever existed so that next time it's handed out
908 // to a gid it'll go back into CREATING.
909 fs->mds_map.in.erase(info.rank);
911 // Put this rank into the failed list so that the next available
912 // STANDBY will pick it up.
913 fs->mds_map.failed.insert(info.rank);
// The daemon must be the one recorded as up for its rank.
915 assert(fs->mds_map.up.at(info.rank) == info.global_id);
916 fs->mds_map.up.erase(info.rank);
918 fs->mds_map.mds_info.erase(who);
919 fs->mds_map.last_failure_osd_epoch = blacklist_epoch;
920 fs->mds_map.epoch = epoch;
923 mds_roles.erase(who);
// Mark the rank held by daemon `who` as damaged: the daemon is removed with
// erase(), and its rank is then taken out of `failed` and put into
// `damaged`. `who` must be attached to a filesystem.
926 void FSMap::damaged(mds_gid_t who, epoch_t blacklist_epoch)
928 assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
929 auto fs = filesystems.at(mds_roles.at(who));
// The rank is read first because erase() removes who's mds_info entry.
// NOTE(review): operator[] is used here where erase() uses at(); presumably
// the entry always exists at this point -- confirm.
930 mds_rank_t rank = fs->mds_map.mds_info[who].rank;
932 erase(who, blacklist_epoch);
// erase() may have put the rank into `failed`; it belongs in `damaged`.
933 fs->mds_map.failed.erase(rank);
934 fs->mds_map.damaged.insert(rank);
// erase() is expected to have stamped the filesystem with the current epoch.
936 assert(fs->mds_map.epoch == epoch);
940 * Update to indicate that the rank `rank` is to be removed
941 * from the damaged list of the filesystem `fscid`
943 bool FSMap::undamaged(const fs_cluster_id_t fscid, const mds_rank_t rank)
945 auto fs = filesystems.at(fscid);
// erase() reports whether the rank was in the damaged set; only then is the
// rank moved to `failed` and the filesystem stamped with the current epoch.
947 if (fs->mds_map.damaged.erase(rank)) {
948 fs->mds_map.failed.insert(rank);
949 fs->mds_map.epoch = epoch;
// Register a new standby daemon. `new_info` must be in STATE_STANDBY with no
// rank. The daemon is recorded in mds_roles with FS_CLUSTER_ID_NONE, stored
// in standby_daemons, and stamped with the current epoch in standby_epochs,
// all keyed by its global_id.
956 void FSMap::insert(const MDSMap::mds_info_t &new_info)
958 assert(new_info.state == MDSMap::STATE_STANDBY);
959 assert(new_info.rank == MDS_RANK_NONE);
960 mds_roles[new_info.global_id] = FS_CLUSTER_ID_NONE;
961 standby_daemons[new_info.global_id] = new_info;
962 standby_epochs[new_info.global_id] = epoch;
// Stop the rank held by daemon `who`: the rank leaves `up` and `in` and is
// added to `stopped`, the daemon is removed from the filesystem and from
// mds_roles, and the filesystem takes the current epoch. The gids of
// standby-replay daemons following the same rank are collected in `standbys`.
// `who` must be attached to a filesystem.
965 std::list<mds_gid_t> FSMap::stop(mds_gid_t who)
967 assert(mds_roles.at(who) != FS_CLUSTER_ID_NONE);
968 auto fs = filesystems.at(mds_roles.at(who));
// `info` refers to the entry that is erased further down; all uses of it
// come before that erase.
969 const auto &info = fs->mds_map.mds_info.at(who);
970 fs->mds_map.up.erase(info.rank);
971 fs->mds_map.in.erase(info.rank);
972 fs->mds_map.stopped.insert(info.rank);
974 // Also drop any standby replays that were following this rank
975 std::list<mds_gid_t> standbys;
976 for (const auto &i : fs->mds_map.mds_info) {
977 const auto &other_gid = i.first;
978 const auto &other_info = i.second;
979 if (other_info.rank == info.rank
980 && other_info.state == MDSMap::STATE_STANDBY_REPLAY) {
981 standbys.push_back(other_gid);
986 fs->mds_map.mds_info.erase(who);
987 mds_roles.erase(who);
989 fs->mds_map.epoch = epoch;
996 * Given one of the following forms:
1001 * Parse into a mds_role_t. The rank-only form is only valid
1002 * if legacy_client_fscid is set.
1004 int FSMap::parse_role(
1005 const std::string &role_str,
1007 std::ostream &ss) const
1009 size_t colon_pos = role_str.find(":");
1011 std::shared_ptr<const Filesystem> fs;
1012 if (colon_pos == std::string::npos) {
1013 if (legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1014 ss << "No filesystem selected";
1017 fs = get_filesystem(legacy_client_fscid);
1020 if (parse_filesystem(role_str.substr(0, colon_pos), &fs) < 0) {
1021 ss << "Invalid filesystem";
1024 rank_pos = colon_pos+1;
1029 std::string rank_str = role_str.substr(rank_pos);
1030 long rank_i = strict_strtol(rank_str.c_str(), 10, &err);
1031 if (rank_i < 0 || !err.empty()) {
1032 ss << "Invalid rank '" << rank_str << "'";
1038 if (fs->mds_map.in.count(rank) == 0) {
1039 ss << "Rank '" << rank << "' not found";
1043 *role = {fs->fscid, rank};