// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * * Copyright (C) 2015 Red Hat * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #ifndef MDS_RANK_H_ #define MDS_RANK_H_ #include "common/DecayCounter.h" #include "common/LogClient.h" #include "common/Timer.h" #include "common/TrackedOp.h" #include "messages/MCommand.h" #include "Beacon.h" #include "DamageTable.h" #include "MDSMap.h" #include "SessionMap.h" #include "MDCache.h" #include "Migrator.h" #include "MDLog.h" #include "PurgeQueue.h" #include "osdc/Journaler.h" // Full .h import instead of forward declaration for PerfCounter, for the // benefit of those including this header and using MDSRank::logger #include "common/perf_counters.h" enum { l_mds_first = 2000, l_mds_request, l_mds_reply, l_mds_reply_latency, l_mds_forward, l_mds_dir_fetch, l_mds_dir_commit, l_mds_dir_split, l_mds_dir_merge, l_mds_inode_max, l_mds_inodes, l_mds_inodes_top, l_mds_inodes_bottom, l_mds_inodes_pin_tail, l_mds_inodes_pinned, l_mds_inodes_expired, l_mds_inodes_with_caps, l_mds_caps, l_mds_subtrees, l_mds_traverse, l_mds_traverse_hit, l_mds_traverse_forward, l_mds_traverse_discover, l_mds_traverse_dir_fetch, l_mds_traverse_remote_ino, l_mds_traverse_lock, l_mds_load_cent, l_mds_dispatch_queue_len, l_mds_exported, l_mds_exported_inodes, l_mds_imported, l_mds_imported_inodes, l_mds_last, }; // memory utilization enum { l_mdm_first = 2500, l_mdm_ino, l_mdm_inoa, l_mdm_inos, l_mdm_dir, l_mdm_dira, l_mdm_dirs, l_mdm_dn, l_mdm_dna, l_mdm_dns, l_mdm_cap, l_mdm_capa, l_mdm_caps, l_mdm_rss, l_mdm_heap, l_mdm_buf, l_mdm_last, }; namespace ceph { struct heartbeat_handle_d; } class Server; class Locker; class MDCache; class MDLog; class MDBalancer; class InoTable; class SnapServer; class SnapClient; class MDSTableServer; class MDSTableClient; class Messenger; class Objecter; class MonClient; class Finisher; class MMDSMap; class ScrubStack; /** * The public part of this class's interface is what's exposed to all * the various subsystems (server, mdcache, etc), such as pointers * to the other subsystems, and message-sending calls. */ class MDSRank { protected: const mds_rank_t whoami; // Incarnation as seen in MDSMap at the point where a rank is // assigned. int incarnation; public: mds_rank_t get_nodeid() const { return whoami; } int64_t get_metadata_pool(); // Reference to global MDS::mds_lock, so that users of MDSRank don't // carry around references to the outer MDS, and we can substitute // a separate lock here in future potentially. Mutex &mds_lock; bool is_daemon_stopping() const; // Reference to global cluster log client, just to avoid initialising // a separate one here. LogChannelRef &clog; // Reference to global timer utility, because MDSRank and MDSDaemon // currently both use the same mds_lock, so it makes sense for them // to share a timer. SafeTimer &timer; MDSMap *&mdsmap; Objecter *objecter; // sub systems Server *server; MDCache *mdcache; Locker *locker; MDLog *mdlog; MDBalancer *balancer; ScrubStack *scrubstack; DamageTable damage_table; InoTable *inotable; SnapServer *snapserver; SnapClient *snapclient; MDSTableClient *get_table_client(int t); MDSTableServer *get_table_server(int t); SessionMap sessionmap; Session *get_session(client_t client) { return sessionmap.get_session(entity_name_t::CLIENT(client.v)); } PerfCounters *logger, *mlogger; OpTracker op_tracker; // The last different state I held before current MDSMap::DaemonState last_state; // The state assigned to me by the MDSMap MDSMap::DaemonState state; bool cluster_degraded; MDSMap::DaemonState get_state() const { return state; } MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); } bool is_creating() const { return state == MDSMap::STATE_CREATING; } bool is_starting() const { return state == MDSMap::STATE_STARTING; } bool is_standby() const { return state == MDSMap::STATE_STANDBY; } bool is_replay() const { return state == MDSMap::STATE_REPLAY; } bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; } bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; } bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; } bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; } bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; } bool is_active() const { return state == MDSMap::STATE_ACTIVE; } bool is_stopping() const { return state == MDSMap::STATE_STOPPING; } bool is_any_replay() const { return (is_replay() || is_standby_replay()); } bool is_stopped() const { return mdsmap->is_stopped(whoami); } bool is_cluster_degraded() const { return cluster_degraded; } void handle_write_error(int err); void handle_conf_change(const struct md_config_t *conf, const std::set &changed) { purge_queue.handle_conf_change(conf, changed, *mdsmap); } void update_mlogger(); protected: // Flag to indicate we entered shutdown: anyone seeing this to be true // after taking mds_lock must drop out. bool stopping; // PurgeQueue is only used by StrayManager, but it is owned by MDSRank // because its init/shutdown happens at the top level. PurgeQueue purge_queue; class ProgressThread : public Thread { MDSRank *mds; Cond cond; public: explicit ProgressThread(MDSRank *mds_) : mds(mds_) {} void * entry() override; void shutdown(); void signal() {cond.Signal();} } progress_thread; list waiting_for_nolaggy; list finished_queue; // Dispatch, retry, queues int dispatch_depth; void inc_dispatch_depth() { ++dispatch_depth; } void dec_dispatch_depth() { --dispatch_depth; } void retry_dispatch(Message *m); bool handle_deferrable_message(Message *m); void _advance_queues(); bool _dispatch(Message *m, bool new_msg); ceph::heartbeat_handle_d *hb; // Heartbeat for threads using mds_lock bool is_stale_message(Message *m) const; map peer_mdsmap_epoch; ceph_tid_t last_tid; // for mds-initiated requests (e.g. stray rename) list waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve; list replay_queue; map > waiting_for_active_peer; map > waiting_for_mdsmap; epoch_t osd_epoch_barrier; // Const reference to the beacon so that we can behave differently // when it's laggy. Beacon &beacon; /** * Emit clog warnings for any ops reported as warnings by optracker */ void check_ops_in_flight(); int mds_slow_req_count; /** * Share MDSMap with clients */ void bcast_mds_map(); // to mounted clients epoch_t last_client_mdsmap_bcast; map export_targets; /* targets this MDS is exporting to or wants/tries to */ void create_logger(); public: void queue_waiter(MDSInternalContextBase *c) { finished_queue.push_back(c); progress_thread.signal(); } void queue_waiters(list& ls) { finished_queue.splice( finished_queue.end(), ls ); progress_thread.signal(); } MDSRank( mds_rank_t whoami_, Mutex &mds_lock_, LogChannelRef &clog_, SafeTimer &timer_, Beacon &beacon_, MDSMap *& mdsmap_, Messenger *msgr, MonClient *monc_, Context *respawn_hook_, Context *suicide_hook_); protected: ~MDSRank(); public: // Daemon lifetime functions: these guys break the abstraction // and call up into the parent MDSDaemon instance. It's kind // of unavoidable: if we want any depth into our calls // to be able to e.g. tear down the whole process, we have to // have a reference going all the way down. // >>> void suicide(); void respawn(); // <<< /** * Call this periodically if inside a potentially long running piece * of code while holding the mds_lock */ void heartbeat_reset(); /** * Report state DAMAGED to the mon, and then pass on to respawn(). Call * this when an unrecoverable error is encountered while attempting * to load an MDS rank's data structures. This is *not* for use with * errors affecting normal dirfrag/inode objects -- they should be handled * through cleaner scrub/repair mechanisms. * * Callers must already hold mds_lock. */ void damaged(); /** * Wrapper around `damaged` for users who are not * already holding mds_lock. * * Callers must not already hold mds_lock. */ void damaged_unlocked(); utime_t get_laggy_until() const; void send_message_mds(Message *m, mds_rank_t mds); void forward_message_mds(Message *req, mds_rank_t mds); void send_message_client_counted(Message *m, client_t client); void send_message_client_counted(Message *m, Session *session); void send_message_client_counted(Message *m, Connection *connection); void send_message_client_counted(Message *m, const ConnectionRef& con) { send_message_client_counted(m, con.get()); } void send_message_client(Message *m, Session *session); void send_message(Message *m, Connection *c); void send_message(Message *m, const ConnectionRef& c) { send_message(m, c.get()); } void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) { waiting_for_active_peer[who].push_back(c); } void wait_for_cluster_recovered(MDSInternalContextBase *c) { assert(cluster_degraded); waiting_for_active_peer[MDS_RANK_NONE].push_back(c); } void wait_for_active(MDSInternalContextBase *c) { waiting_for_active.push_back(c); } void wait_for_replay(MDSInternalContextBase *c) { waiting_for_replay.push_back(c); } void wait_for_reconnect(MDSInternalContextBase *c) { waiting_for_reconnect.push_back(c); } void wait_for_resolve(MDSInternalContextBase *c) { waiting_for_resolve.push_back(c); } void wait_for_mdsmap(epoch_t e, MDSInternalContextBase *c) { waiting_for_mdsmap[e].push_back(c); } void enqueue_replay(MDSInternalContextBase *c) { replay_queue.push_back(c); } bool queue_one_replay(); void set_osd_epoch_barrier(epoch_t e); epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;} epoch_t get_osd_epoch() const; ceph_tid_t issue_tid() { return ++last_tid; } Finisher *finisher; MDSMap *get_mds_map() { return mdsmap; } int get_req_rate() const { return logger->get(l_mds_request); } int get_mds_slow_req_count() const { return mds_slow_req_count; } void dump_status(Formatter *f) const; void hit_export_target(utime_t now, mds_rank_t rank, double amount=-1.0); bool is_export_target(mds_rank_t rank) { const set& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets; return map_targets.count(rank); } bool evict_client(int64_t session_id, bool wait, bool blacklist, std::stringstream& ss, Context *on_killed=nullptr); protected: void dump_clientreplay_status(Formatter *f) const; void command_scrub_path(Formatter *f, const string& path, vector& scrubop_vec); void command_tag_path(Formatter *f, const string& path, const string &tag); void command_flush_path(Formatter *f, const string& path); void command_flush_journal(Formatter *f); void command_get_subtrees(Formatter *f); void command_export_dir(Formatter *f, const std::string &path, mds_rank_t dest); bool command_dirfrag_split( cmdmap_t cmdmap, std::ostream &ss); bool command_dirfrag_merge( cmdmap_t cmdmap, std::ostream &ss); bool command_dirfrag_ls( cmdmap_t cmdmap, std::ostream &ss, Formatter *f); int _command_export_dir(const std::string &path, mds_rank_t dest); int _command_flush_journal(std::stringstream *ss); CDir *_command_dirfrag_get( const cmdmap_t &cmdmap, std::ostream &ss); protected: Messenger *messenger; MonClient *monc; Context *respawn_hook; Context *suicide_hook; // Friended to access retry_dispatch friend class C_MDS_RetryMessage; // FIXME the state machine logic should be separable from the dispatch // logic that calls it. // >>> void calc_recovery_set(); void request_state(MDSMap::DaemonState s); bool standby_replaying; // true if current replay pass is in standby-replay mode typedef enum { // The MDSMap is available, configure default layouts and structures MDS_BOOT_INITIAL = 0, // We are ready to open some inodes MDS_BOOT_OPEN_ROOT, // We are ready to do a replay if needed MDS_BOOT_PREPARE_LOG, // Replay is complete MDS_BOOT_REPLAY_DONE } BootStep; friend class C_MDS_BootStart; friend class C_MDS_InternalBootStart; void boot_create(); // i am new mds. void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0); // starting|replay void replay_start(); void creating_done(); void starting_done(); void replay_done(); void standby_replay_restart(); void _standby_replay_restart_finish(int r, uint64_t old_read_pos); class C_MDS_StandbyReplayRestart; class C_MDS_StandbyReplayRestartFinish; void reopen_log(); void resolve_start(); void resolve_done(); void reconnect_start(); void reconnect_done(); void rejoin_joint_start(); void rejoin_start(); void rejoin_done(); void recovery_done(int oldstate); void clientreplay_start(); void clientreplay_done(); void active_start(); void stopping_start(); void stopping_done(); void validate_sessions(); // <<< // >>> void handle_mds_recovery(mds_rank_t who); void handle_mds_failure(mds_rank_t who); // <<< /* Update MDSMap export_targets for this rank. Called on ::tick(). */ void update_targets(utime_t now); }; /* This expects to be given a reference which it is responsible for. * The finish function calls functions which * will put the Message exactly once.*/ class C_MDS_RetryMessage : public MDSInternalContext { protected: Message *m; public: C_MDS_RetryMessage(MDSRank *mds, Message *m) : MDSInternalContext(mds) { assert(m); this->m = m; } void finish(int r) override { mds->retry_dispatch(m); } }; /** * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e. * the service/dispatcher stuff like init/shutdown that subsystems should * never touch. */ class MDSRankDispatcher : public MDSRank { public: void init(); void tick(); void shutdown(); bool handle_asok_command(std::string command, cmdmap_t& cmdmap, Formatter *f, std::ostream& ss); void handle_mds_map(MMDSMap *m, MDSMap *oldmap); void handle_osd_map(); void update_log_config(); bool handle_command( const cmdmap_t &cmdmap, MCommand *m, int *r, std::stringstream *ds, std::stringstream *ss, bool *need_reply); void dump_sessions(const SessionFilter &filter, Formatter *f) const; void evict_clients(const SessionFilter &filter, MCommand *m); // Call into me from MDS::ms_dispatch bool ms_dispatch(Message *m); MDSRankDispatcher( mds_rank_t whoami_, Mutex &mds_lock_, LogChannelRef &clog_, SafeTimer &timer_, Beacon &beacon_, MDSMap *& mdsmap_, Messenger *msgr, MonClient *monc_, Context *respawn_hook_, Context *suicide_hook_); }; // This utility for MDS and MDSRank dispatchers. #define ALLOW_MESSAGES_FROM(peers) \ do { \ if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \ dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \ << " allowing=" << #peers << " message=" << *m << dendl; \ m->put(); \ return true; \ } \ } while (0) #endif // MDS_RANK_H_