1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
21 #include "include/types.h"
22 #include "common/Clock.h"
23 #include "msg/Message.h"
24 #include "include/health.h"
31 #include "common/config.h"
33 #include "include/CompatSet.h"
34 #include "include/ceph_features.h"
35 #include "common/Formatter.h"
36 #include "mds/mdstypes.h"
40 boot --> standby, creating, or starting.
43 dne ----> creating -----> active*
49 stopped <---- stopping* <-/ / |
51 ----- starting* ----/ |
55 \--> replay* --> reconnect* --> rejoin*
62 class health_check_map_t;
64 extern CompatSet get_mdsmap_compat_set_all();
65 extern CompatSet get_mdsmap_compat_set_default();
66 extern CompatSet get_mdsmap_compat_set_base(); // pre v0.20
/*
 * Incompat features recorded in the MDSMap CompatSet.
 *
 * Every feature needs its own numeric id: two entries sharing an id cannot
 * be told apart by that id.  FILE_LAYOUT_V2 previously reused id 8, which
 * already belongs to NOANCHOR; it now has the next free id (9).
 * NOTE(review): feature ids are presumably persisted/exchanged between
 * daemons -- confirm the upgrade path before shipping the renumbering.
 */
#define MDS_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "base v0.20")
#define MDS_FEATURE_INCOMPAT_CLIENTRANGES CompatSet::Feature(2, "client writeable ranges")
#define MDS_FEATURE_INCOMPAT_FILELAYOUT CompatSet::Feature(3, "default file layouts on dirs")
#define MDS_FEATURE_INCOMPAT_DIRINODE CompatSet::Feature(4, "dir inode in separate object")
#define MDS_FEATURE_INCOMPAT_ENCODING CompatSet::Feature(5, "mds uses versioned encoding")
#define MDS_FEATURE_INCOMPAT_OMAPDIRFRAG CompatSet::Feature(6, "dirfrag is stored in omap")
#define MDS_FEATURE_INCOMPAT_INLINE CompatSet::Feature(7, "mds uses inline data")
#define MDS_FEATURE_INCOMPAT_NOANCHOR CompatSet::Feature(8, "no anchor table")
#define MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2 CompatSet::Feature(9, "file layout v2")
78 #define MDS_FS_NAME_DEFAULT "cephfs"
82 /* These states are the union of the set of possible states of an MDS daemon,
83 * and the set of possible states of an MDS rank */
85 // States of an MDS daemon not currently holding a rank
86 // ====================================================
87 STATE_NULL = CEPH_MDS_STATE_NULL, // null value for fns returning this type.
88 STATE_BOOT = CEPH_MDS_STATE_BOOT, // up, boot announcement. destiny unknown.
89 STATE_STANDBY = CEPH_MDS_STATE_STANDBY, // up, idle. waiting for assignment by monitor.
90 STATE_STANDBY_REPLAY = CEPH_MDS_STATE_STANDBY_REPLAY, // up, replaying active node, ready to take over.
92 // States of an MDS rank, and of any MDS daemon holding that rank
93 // ==============================================================
94 STATE_STOPPED = CEPH_MDS_STATE_STOPPED, // down, once existed, but no subtrees. empty log. may not be held by a daemon.
96 STATE_CREATING = CEPH_MDS_STATE_CREATING, // up, creating MDS instance (new journal, idalloc..).
97 STATE_STARTING = CEPH_MDS_STATE_STARTING, // up, starting prior stopped MDS instance.
99 STATE_REPLAY = CEPH_MDS_STATE_REPLAY, // up, starting prior failed instance. scanning journal.
100 STATE_RESOLVE = CEPH_MDS_STATE_RESOLVE, // up, disambiguating distributed operations (import, rename, etc.)
101 STATE_RECONNECT = CEPH_MDS_STATE_RECONNECT, // up, reconnect to clients
102 STATE_REJOIN = CEPH_MDS_STATE_REJOIN, // up, replayed journal, rejoining distributed cache
103 STATE_CLIENTREPLAY = CEPH_MDS_STATE_CLIENTREPLAY, // up, replaying client requests
104 STATE_ACTIVE = CEPH_MDS_STATE_ACTIVE, // up, active
105 STATE_STOPPING = CEPH_MDS_STATE_STOPPING, // up, exporting metadata (-> standby or out)
106 STATE_DNE = CEPH_MDS_STATE_DNE, // down, rank does not exist
108 // State which a daemon may send to MDSMonitor in its beacon
109 // to indicate that offline repair is required. Daemon must stop
110 // immediately after indicating this state.
111 STATE_DAMAGED = CEPH_MDS_STATE_DAMAGED
114 * In addition to explicit states, an MDS rank is implicitly in state:
115 * - STOPPED if it is not currently associated with an MDS daemon gid but it
116 * is in MDSMap::stopped
117 * - FAILED if it is not currently associated with an MDS daemon gid but it
118 * is in MDSMap::failed
119 * - DNE if it is not currently associated with an MDS daemon gid and it is
120 * missing from both MDSMap::failed and MDSMap::stopped
129 MDSMap::DaemonState state;
133 mds_rank_t standby_for_rank;
134 std::string standby_for_name;
135 fs_cluster_id_t standby_for_fscid;
137 std::set<mds_rank_t> export_targets;
138 uint64_t mds_features = 0;
140 mds_info_t() : global_id(MDS_GID_NONE), rank(MDS_RANK_NONE), inc(0),
141 state(STATE_STANDBY), state_seq(0),
142 standby_for_rank(MDS_RANK_NONE),
143 standby_for_fscid(FS_CLUSTER_ID_NONE),
144 standby_replay(false)
147 bool laggy() const { return !(laggy_since == utime_t()); }
148 void clear_laggy() { laggy_since = utime_t(); }
150 entity_inst_t get_inst() const { return entity_inst_t(entity_name_t::MDS(rank), addr); }
152 void encode(bufferlist& bl, uint64_t features) const {
153 if ((features & CEPH_FEATURE_MDSENC) == 0 ) encode_unversioned(bl);
154 else encode_versioned(bl, features);
156 void decode(bufferlist::iterator& p);
157 void dump(Formatter *f) const;
158 void print_summary(ostream &out) const;
160 // The long form name for use in cluster log messages
161 std::string human_name() const;
163 static void generate_test_instances(list<mds_info_t*>& ls);
165 void encode_versioned(bufferlist& bl, uint64_t features) const;
166 void encode_unversioned(bufferlist& bl) const;
175 uint32_t flags; // flags
176 epoch_t last_failure; // mds epoch of last failure
177 epoch_t last_failure_osd_epoch; // osd epoch of last failure; any mds entering replay needs
178 // at least this osdmap to ensure the blacklist propagates.
179 utime_t created, modified;
181 mds_rank_t tableserver; // which MDS has snaptable
182 mds_rank_t root; // which MDS has root directory
184 __u32 session_timeout;
185 __u32 session_autoclose;
186 uint64_t max_file_size;
188 std::vector<int64_t> data_pools; // file data pools available to clients (via an ioctl). first is the default.
189 int64_t cas_pool; // where CAS objects go
190 int64_t metadata_pool; // where fs metadata objects go
193 * in: the set of logical mds #'s that define the cluster. this is the set
194 * of mds's the metadata may be distributed over.
195 * up: map from logical mds #'s to the addrs filling those roles.
196 * failed: subset of @in that are failed.
197 * stopped: set of nodes that have been initialized, but are not active.
199 * @up + @failed = @in. @in * @stopped = {}.
202 mds_rank_t max_mds; /* The maximum number of active MDSes. Also, the maximum rank. */
203 mds_rank_t standby_count_wanted;
204 string balancer; /* The name/version of the mantle balancer (i.e. the rados obj name) */
206 std::set<mds_rank_t> in; // currently defined cluster
208 // which ranks are failed, stopped, damaged (i.e. not held by a daemon)
209 std::set<mds_rank_t> failed, stopped, damaged;
210 std::map<mds_rank_t, mds_gid_t> up; // who is in those roles
211 std::map<mds_gid_t, mds_info_t> mds_info;
213 uint8_t ever_allowed_features; //< bitmap of features the cluster has allowed
214 uint8_t explicitly_allowed_features; //< bitmap of features explicitly enabled
216 bool inline_data_enabled;
218 uint64_t cached_up_features;
223 friend class MDSMonitor;
224 friend class Filesystem;
229 : epoch(0), enabled(false), fs_name(MDS_FS_NAME_DEFAULT),
230 flags(CEPH_MDSMAP_DEFAULTS), last_failure(0),
231 last_failure_osd_epoch(0),
232 tableserver(0), root(0),
234 session_autoclose(0),
239 standby_count_wanted(-1),
240 ever_allowed_features(0),
241 explicitly_allowed_features(0),
242 inline_data_enabled(false),
243 cached_up_features(0)
246 bool get_inline_data_enabled() const { return inline_data_enabled; }
247 void set_inline_data_enabled(bool enabled) { inline_data_enabled = enabled; }
249 utime_t get_session_timeout() const {
250 return utime_t(session_timeout,0);
252 uint64_t get_max_filesize() const { return max_file_size; }
253 void set_max_filesize(uint64_t m) { max_file_size = m; }
255 int get_flags() const { return flags; }
256 bool test_flag(int f) const { return flags & f; }
257 void set_flag(int f) { flags |= f; }
258 void clear_flag(int f) { flags &= ~f; }
260 const std::string &get_fs_name() const {return fs_name;}
262 void set_snaps_allowed() {
263 set_flag(CEPH_MDSMAP_ALLOW_SNAPS);
264 ever_allowed_features |= CEPH_MDSMAP_ALLOW_SNAPS;
265 explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_SNAPS;
267 void clear_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
268 bool allows_snaps() const { return test_flag(CEPH_MDSMAP_ALLOW_SNAPS); }
270 void set_multimds_allowed() {
271 set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS);
272 ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS;
273 explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS;
275 void clear_multimds_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_MULTIMDS); }
276 bool allows_multimds() const { return test_flag(CEPH_MDSMAP_ALLOW_MULTIMDS); }
278 void set_dirfrags_allowed() {
279 set_flag(CEPH_MDSMAP_ALLOW_DIRFRAGS);
280 ever_allowed_features |= CEPH_MDSMAP_ALLOW_DIRFRAGS;
281 explicitly_allowed_features |= CEPH_MDSMAP_ALLOW_DIRFRAGS;
283 void clear_dirfrags_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_DIRFRAGS); }
284 bool allows_dirfrags() const { return test_flag(CEPH_MDSMAP_ALLOW_DIRFRAGS); }
286 epoch_t get_epoch() const { return epoch; }
287 void inc_epoch() { epoch++; }
289 bool get_enabled() const { return enabled; }
291 const utime_t& get_created() const { return created; }
292 void set_created(utime_t ct) { modified = created = ct; }
293 const utime_t& get_modified() const { return modified; }
294 void set_modified(utime_t mt) { modified = mt; }
296 epoch_t get_last_failure() const { return last_failure; }
297 epoch_t get_last_failure_osd_epoch() const { return last_failure_osd_epoch; }
299 mds_rank_t get_max_mds() const { return max_mds; }
300 void set_max_mds(mds_rank_t m) { max_mds = m; }
302 mds_rank_t get_standby_count_wanted(mds_rank_t standby_daemon_count) const {
303 assert(standby_daemon_count >= 0);
304 std::set<mds_rank_t> s;
305 get_standby_replay_mds_set(s);
306 mds_rank_t standbys_avail = (mds_rank_t)s.size()+standby_daemon_count;
307 mds_rank_t wanted = std::max(0, standby_count_wanted);
308 return wanted > standbys_avail ? wanted - standbys_avail : 0;
310 void set_standby_count_wanted(mds_rank_t n) { standby_count_wanted = n; }
311 bool check_health(mds_rank_t standby_daemon_count);
313 const std::string get_balancer() const { return balancer; }
314 void set_balancer(std::string val) { balancer.assign(val); }
316 mds_rank_t get_tableserver() const { return tableserver; }
317 mds_rank_t get_root() const { return root; }
319 const std::vector<int64_t> &get_data_pools() const { return data_pools; }
320 int64_t get_first_data_pool() const { return *data_pools.begin(); }
321 int64_t get_metadata_pool() const { return metadata_pool; }
322 bool is_data_pool(int64_t poolid) const {
323 auto p = std::find(data_pools.begin(), data_pools.end(), poolid);
324 if (p == data_pools.end())
329 bool pool_in_use(int64_t poolid) const {
330 return get_enabled() && (is_data_pool(poolid) || metadata_pool == poolid);
333 const std::map<mds_gid_t,mds_info_t>& get_mds_info() const { return mds_info; }
334 const mds_info_t& get_mds_info_gid(mds_gid_t gid) const {
335 return mds_info.at(gid);
337 const mds_info_t& get_mds_info(mds_rank_t m) const {
338 assert(up.count(m) && mds_info.count(up.at(m)));
339 return mds_info.at(up.at(m));
341 mds_gid_t find_mds_gid_by_name(const std::string& s) const {
342 for (std::map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin();
345 if (p->second.name == s) {
353 unsigned get_num_in_mds() const {
356 unsigned get_num_up_mds() const {
359 mds_rank_t get_last_in_mds() const {
360 auto p = in.rbegin();
361 return p == in.rend() ? MDS_RANK_NONE : *p;
363 int get_num_failed_mds() const {
364 return failed.size();
366 unsigned get_num_mds(int state) const {
368 for (std::map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin();
371 if (p->second.state == state) ++n;
376 void add_data_pool(int64_t poolid) {
377 data_pools.push_back(poolid);
379 int remove_data_pool(int64_t poolid) {
380 std::vector<int64_t>::iterator p = std::find(data_pools.begin(), data_pools.end(), poolid);
381 if (p == data_pools.end())
388 void get_mds_set(std::set<mds_rank_t>& s) const {
391 void get_up_mds_set(std::set<mds_rank_t>& s) const {
392 for (std::map<mds_rank_t, mds_gid_t>::const_iterator p = up.begin();
397 void get_active_mds_set(std::set<mds_rank_t>& s) const {
398 get_mds_set(s, MDSMap::STATE_ACTIVE);
400 void get_standby_replay_mds_set(std::set<mds_rank_t>& s) const {
401 get_mds_set(s, MDSMap::STATE_STANDBY_REPLAY);
403 void get_failed_mds_set(std::set<mds_rank_t>& s) const {
408 uint64_t get_up_features() {
409 if (!cached_up_features) {
411 for (std::map<mds_rank_t, mds_gid_t>::const_iterator p = up.begin();
414 std::map<mds_gid_t, mds_info_t>::const_iterator q =
415 mds_info.find(p->second);
416 assert(q != mds_info.end());
418 cached_up_features = q->second.mds_features;
421 cached_up_features &= q->second.mds_features;
425 return cached_up_features;
429 * Get MDS ranks which are in but not up.
431 void get_down_mds_set(std::set<mds_rank_t> *s) const
434 s->insert(failed.begin(), failed.end());
435 s->insert(damaged.begin(), damaged.end());
438 int get_failed() const {
439 if (!failed.empty()) return *failed.begin();
442 void get_stopped_mds_set(std::set<mds_rank_t>& s) const {
445 void get_recovery_mds_set(std::set<mds_rank_t>& s) const {
447 for (const auto& p : damaged)
449 for (const auto& p : mds_info)
450 if (p.second.state >= STATE_REPLAY && p.second.state <= STATE_STOPPING)
451 s.insert(p.second.rank);
455 get_clientreplay_or_active_or_stopping_mds_set(std::set<mds_rank_t>& s) const {
456 for (std::map<mds_gid_t, mds_info_t>::const_iterator p = mds_info.begin();
459 if (p->second.state >= STATE_CLIENTREPLAY && p->second.state <= STATE_STOPPING)
460 s.insert(p->second.rank);
462 void get_mds_set(std::set<mds_rank_t>& s, DaemonState state) const {
463 for (std::map<mds_gid_t, mds_info_t>::const_iterator p = mds_info.begin();
466 if (p->second.state == state)
467 s.insert(p->second.rank);
470 void get_health(list<pair<health_status_t,std::string> >& summary,
471 list<pair<health_status_t,std::string> > *detail) const;
473 void get_health_checks(health_check_map_t *checks) const;
478 TRANSIENT_UNAVAILABLE = 1,
479 STUCK_UNAVAILABLE = 2
484 * Return indication of whether cluster is available. This is a
485 * heuristic for clients to see if they should bother waiting to talk to
486 * MDSs, or whether they should error out at startup/mount.
488 * A TRANSIENT_UNAVAILABLE result indicates that the cluster is in a
489 * transition state like replaying, or is potentially about to fail over.
490 * Clients should wait for an updated map before making a final decision
491 * about whether the filesystem is mountable.
493 * A STUCK_UNAVAILABLE result indicates that we can't see a way that
494 * the cluster is about to recover on its own, so it'll probably require
495 * administrator intervention: clients should probably not bother trying
498 availability_t is_cluster_available() const;
501 bool is_down(mds_rank_t m) const { return up.count(m) == 0; }
502 bool is_up(mds_rank_t m) const { return up.count(m); }
503 bool is_in(mds_rank_t m) const { return up.count(m) || failed.count(m); }
504 bool is_out(mds_rank_t m) const { return !is_in(m); }
506 bool is_failed(mds_rank_t m) const { return failed.count(m); }
507 bool is_stopped(mds_rank_t m) const { return stopped.count(m); }
509 bool is_dne(mds_rank_t m) const { return in.count(m) == 0; }
510 bool is_dne_gid(mds_gid_t gid) const { return mds_info.count(gid) == 0; }
513 * Get MDS rank state if the rank is up, else STATE_NULL
515 DaemonState get_state(mds_rank_t m) const {
516 std::map<mds_rank_t, mds_gid_t>::const_iterator u = up.find(m);
519 return get_state_gid(u->second);
523 * Get MDS daemon status by GID
525 DaemonState get_state_gid(mds_gid_t gid) const {
526 std::map<mds_gid_t,mds_info_t>::const_iterator i = mds_info.find(gid);
527 if (i == mds_info.end())
529 return i->second.state;
532 const mds_info_t& get_info(const mds_rank_t m) const {
533 return mds_info.at(up.at(m));
535 const mds_info_t& get_info_gid(const mds_gid_t gid) const {
536 return mds_info.at(gid);
539 bool is_boot(mds_rank_t m) const { return get_state(m) == STATE_BOOT; }
540 bool is_creating(mds_rank_t m) const { return get_state(m) == STATE_CREATING; }
541 bool is_starting(mds_rank_t m) const { return get_state(m) == STATE_STARTING; }
542 bool is_replay(mds_rank_t m) const { return get_state(m) == STATE_REPLAY; }
543 bool is_resolve(mds_rank_t m) const { return get_state(m) == STATE_RESOLVE; }
544 bool is_reconnect(mds_rank_t m) const { return get_state(m) == STATE_RECONNECT; }
545 bool is_rejoin(mds_rank_t m) const { return get_state(m) == STATE_REJOIN; }
546 bool is_clientreplay(mds_rank_t m) const { return get_state(m) == STATE_CLIENTREPLAY; }
547 bool is_active(mds_rank_t m) const { return get_state(m) == STATE_ACTIVE; }
548 bool is_stopping(mds_rank_t m) const { return get_state(m) == STATE_STOPPING; }
549 bool is_active_or_stopping(mds_rank_t m) const {
550 return is_active(m) || is_stopping(m);
552 bool is_clientreplay_or_active_or_stopping(mds_rank_t m) const {
553 return is_clientreplay(m) || is_active(m) || is_stopping(m);
556 bool is_followable(mds_rank_t m) const {
557 return (is_resolve(m) ||
560 is_clientreplay(m) ||
565 bool is_laggy_gid(mds_gid_t gid) const {
566 if (!mds_info.count(gid))
568 std::map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.find(gid);
569 return p->second.laggy();
572 // degraded = some recovery in process. fixes active membership and
574 bool is_degraded() const {
575 if (!failed.empty() || !damaged.empty())
577 for (std::map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin();
580 if (p->second.state >= STATE_REPLAY && p->second.state <= STATE_CLIENTREPLAY)
584 bool is_any_failed() const {
585 return failed.size();
587 bool is_resolving() const {
589 get_num_mds(STATE_RESOLVE) > 0 &&
590 get_num_mds(STATE_REPLAY) == 0 &&
591 failed.empty() && damaged.empty();
593 bool is_rejoining() const {
594 // nodes are rejoining cache state
596 get_num_mds(STATE_REJOIN) > 0 &&
597 get_num_mds(STATE_REPLAY) == 0 &&
598 get_num_mds(STATE_RECONNECT) == 0 &&
599 get_num_mds(STATE_RESOLVE) == 0 &&
600 failed.empty() && damaged.empty();
602 bool is_stopped() const {
607 * Get whether a rank is 'up', i.e. has
608 * an MDS daemon's entity_inst_t associated
611 bool have_inst(mds_rank_t m) const {
616 * Get the MDS daemon entity_inst_t for a rank
619 const entity_inst_t get_inst(mds_rank_t m) {
621 return mds_info[up[m]].get_inst();
623 const entity_addr_t get_addr(mds_rank_t m) {
625 return mds_info[up[m]].addr;
629 * Get the MDS daemon entity_inst_t for a rank,
632 * @return true if the rank was up and the inst
633 * was populated, else false.
635 bool get_inst(mds_rank_t m, entity_inst_t& inst) {
643 mds_rank_t get_rank_gid(mds_gid_t gid) const {
644 if (mds_info.count(gid)) {
645 return mds_info.at(gid).rank;
647 return MDS_RANK_NONE;
651 int get_inc_gid(mds_gid_t gid) const {
652 auto mds_info_entry = mds_info.find(gid);
653 if (mds_info_entry != mds_info.end())
654 return mds_info_entry->second.inc;
657 void encode(bufferlist& bl, uint64_t features) const;
658 void decode(bufferlist::iterator& p);
659 void decode(bufferlist& bl) {
660 bufferlist::iterator p = bl.begin();
663 void sanitize(std::function<bool(int64_t pool)> pool_exists);
665 void print(ostream& out) const;
666 void print_summary(Formatter *f, ostream *out) const;
668 void dump(Formatter *f) const;
669 static void generate_test_instances(list<MDSMap*>& ls);
671 static bool state_transition_valid(DaemonState prev, DaemonState next);
673 WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t)
674 WRITE_CLASS_ENCODER_FEATURES(MDSMap)
676 inline ostream& operator<<(ostream &out, const MDSMap &m) {
677 m.print_summary(NULL, &out);