+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2015 Red Hat
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef MDS_RANK_H_
-#define MDS_RANK_H_
-
-#include "common/DecayCounter.h"
-#include "common/LogClient.h"
-#include "common/Timer.h"
-#include "common/TrackedOp.h"
-
-#include "messages/MCommand.h"
-
-#include "Beacon.h"
-#include "DamageTable.h"
-#include "MDSMap.h"
-#include "SessionMap.h"
-#include "MDCache.h"
-#include "Migrator.h"
-#include "MDLog.h"
-#include "PurgeQueue.h"
-#include "osdc/Journaler.h"
-
-// Full .h include instead of a forward declaration of PerfCounters, for the
-// benefit of those including this header and using MDSRank::logger
-#include "common/perf_counters.h"
-
-enum { // counter keys; e.g. l_mds_request is read through MDSRank::logger in get_req_rate()
- l_mds_first = 2000, // explicit base; each later enumerator is the previous one + 1, so declaration order fixes the values
- l_mds_request,
- l_mds_reply,
- l_mds_reply_latency,
- l_mds_forward,
- l_mds_dir_fetch,
- l_mds_dir_commit,
- l_mds_dir_split,
- l_mds_dir_merge,
- l_mds_inode_max,
- l_mds_inodes,
- l_mds_inodes_top,
- l_mds_inodes_bottom,
- l_mds_inodes_pin_tail,
- l_mds_inodes_pinned,
- l_mds_inodes_expired,
- l_mds_inodes_with_caps,
- l_mds_caps,
- l_mds_subtrees,
- l_mds_traverse,
- l_mds_traverse_hit,
- l_mds_traverse_forward,
- l_mds_traverse_discover,
- l_mds_traverse_dir_fetch,
- l_mds_traverse_remote_ino,
- l_mds_traverse_lock,
- l_mds_load_cent,
- l_mds_dispatch_queue_len,
- l_mds_exported,
- l_mds_exported_inodes,
- l_mds_imported,
- l_mds_imported_inodes,
- l_mds_last, // final enumerator; its value is l_mds_imported_inodes + 1
-};
-
-// Memory utilization counter keys (l_mdm_*); presumably used with MDSRank::mlogger -- confirm in update_mlogger().
-enum {
- l_mdm_first = 2500, // explicit base; each later enumerator is the previous one + 1, so declaration order fixes the values
- l_mdm_ino,
- l_mdm_inoa,
- l_mdm_inos,
- l_mdm_dir,
- l_mdm_dira,
- l_mdm_dirs,
- l_mdm_dn,
- l_mdm_dna,
- l_mdm_dns,
- l_mdm_cap,
- l_mdm_capa,
- l_mdm_caps,
- l_mdm_rss,
- l_mdm_heap,
- l_mdm_buf,
- l_mdm_last, // final enumerator; its value is l_mdm_buf + 1
-};
-
-namespace ceph {
- struct heartbeat_handle_d;
-}
-
-class Server;
-class Locker;
-class MDCache;
-class MDLog;
-class MDBalancer;
-class InoTable;
-class SnapServer;
-class SnapClient;
-class MDSTableServer;
-class MDSTableClient;
-class Messenger;
-class Objecter;
-class MonClient;
-class Finisher;
-class MMDSMap;
-class ScrubStack;
-
-/**
- * The public part of this class's interface is what's exposed to all
- * the various subsystems (server, mdcache, etc), such as pointers
- * to the other subsystems, and message-sending calls.
- *
- * mds_lock, clog, timer, beacon and mdsmap are held as reference
- * members, matching the reference parameters of the constructor.
- * The destructor is declared protected.
- */
-class MDSRank {
- protected:
- const mds_rank_t whoami; // this instance's rank; returned by get_nodeid()
-
- // Incarnation as seen in MDSMap at the point where a rank is
- // assigned.
- int incarnation;
-
- public:
- mds_rank_t get_nodeid() const { return whoami; }
- int64_t get_metadata_pool();
-
- // Reference to global MDS::mds_lock, so that users of MDSRank don't
- // carry around references to the outer MDS, and we can substitute
- // a separate lock here in future potentially.
- Mutex &mds_lock;
-
- bool is_daemon_stopping() const;
-
- // Reference to global cluster log client, just to avoid initialising
- // a separate one here.
- LogChannelRef &clog;
-
- // Reference to global timer utility, because MDSRank and MDSDaemon
- // currently both use the same mds_lock, so it makes sense for them
- // to share a timer.
- SafeTimer &timer;
-
- MDSMap *&mdsmap; // reference to a pointer held elsewhere (see the constructor's mdsmap_ parameter)
-
- Objecter *objecter;
-
- // subsystems
- Server *server;
- MDCache *mdcache;
- Locker *locker;
- MDLog *mdlog;
- MDBalancer *balancer;
- ScrubStack *scrubstack;
- DamageTable damage_table;
-
-
- InoTable *inotable;
-
- SnapServer *snapserver;
- SnapClient *snapclient;
-
- MDSTableClient *get_table_client(int t);
- MDSTableServer *get_table_server(int t);
-
- SessionMap sessionmap;
- Session *get_session(client_t client) {
- return sessionmap.get_session(entity_name_t::CLIENT(client.v)); // lookup is keyed by the client's entity name
- }
-
- PerfCounters *logger, *mlogger; // logger is read with l_mds_* keys (see get_req_rate())
- OpTracker op_tracker;
-
- // The last different state I held before current
- MDSMap::DaemonState last_state;
- // The state assigned to me by the MDSMap
- MDSMap::DaemonState state;
-
- bool cluster_degraded;
-
- MDSMap::DaemonState get_state() const { return state; }
- MDSMap::DaemonState get_want_state() const { return beacon.get_want_state(); }
-
- bool is_creating() const { return state == MDSMap::STATE_CREATING; }
- bool is_starting() const { return state == MDSMap::STATE_STARTING; }
- bool is_standby() const { return state == MDSMap::STATE_STANDBY; }
- bool is_replay() const { return state == MDSMap::STATE_REPLAY; }
- bool is_standby_replay() const { return state == MDSMap::STATE_STANDBY_REPLAY; }
- bool is_resolve() const { return state == MDSMap::STATE_RESOLVE; }
- bool is_reconnect() const { return state == MDSMap::STATE_RECONNECT; }
- bool is_rejoin() const { return state == MDSMap::STATE_REJOIN; }
- bool is_clientreplay() const { return state == MDSMap::STATE_CLIENTREPLAY; }
- bool is_active() const { return state == MDSMap::STATE_ACTIVE; }
- bool is_stopping() const { return state == MDSMap::STATE_STOPPING; }
- bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
- bool is_stopped() const { return mdsmap->is_stopped(whoami); } // asks the MDSMap instead of comparing the local `state`
- bool is_cluster_degraded() const { return cluster_degraded; }
-
- void handle_write_error(int err);
-
- void handle_conf_change(const struct md_config_t *conf,
- const std::set <std::string> &changed)
- { // the only recipient notified here is the purge queue
- purge_queue.handle_conf_change(conf, changed, *mdsmap);
- }
-
- void update_mlogger();
- protected:
- // Flag to indicate we entered shutdown: anyone seeing this to be true
- // after taking mds_lock must drop out.
- bool stopping;
-
- // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
- // because its init/shutdown happens at the top level.
- PurgeQueue purge_queue;
-
- class ProgressThread : public Thread {
- MDSRank *mds;
- Cond cond;
- public:
- explicit ProgressThread(MDSRank *mds_) : mds(mds_) {}
- void * entry() override;
- void shutdown();
- void signal() {cond.Signal();}
- } progress_thread;
-
- list<Message*> waiting_for_nolaggy;
- list<MDSInternalContextBase*> finished_queue; // appended to by queue_waiter()/queue_waiters(), which then signal progress_thread
- // Dispatch, retry, queues
- int dispatch_depth;
- void inc_dispatch_depth() { ++dispatch_depth; }
- void dec_dispatch_depth() { --dispatch_depth; }
- void retry_dispatch(Message *m);
- bool handle_deferrable_message(Message *m);
- void _advance_queues();
- bool _dispatch(Message *m, bool new_msg);
-
- ceph::heartbeat_handle_d *hb; // Heartbeat for threads using mds_lock
-
- bool is_stale_message(Message *m) const;
-
- map<mds_rank_t, version_t> peer_mdsmap_epoch;
-
- ceph_tid_t last_tid; // for mds-initiated requests (e.g. stray rename)
-
- list<MDSInternalContextBase*> waiting_for_active, waiting_for_replay, waiting_for_reconnect, waiting_for_resolve;
- list<MDSInternalContextBase*> replay_queue;
- map<mds_rank_t, list<MDSInternalContextBase*> > waiting_for_active_peer; // key MDS_RANK_NONE holds wait_for_cluster_recovered() waiters
- map<epoch_t, list<MDSInternalContextBase*> > waiting_for_mdsmap;
-
- epoch_t osd_epoch_barrier;
-
- // Reference to the beacon (declared without const) so that we can
- // behave differently when it's laggy.
- Beacon &beacon;
-
- /**
- * Emit clog warnings for any ops reported as warnings by optracker
- */
- void check_ops_in_flight();
-
- int mds_slow_req_count;
-
- /**
- * Share MDSMap with clients
- */
- void bcast_mds_map(); // to mounted clients
- epoch_t last_client_mdsmap_bcast;
-
- map<mds_rank_t,DecayCounter> export_targets; /* targets this MDS is exporting to or wants/tries to */
-
- void create_logger();
- public:
-
- void queue_waiter(MDSInternalContextBase *c) {
- finished_queue.push_back(c);
- progress_thread.signal();
- }
- void queue_waiters(list<MDSInternalContextBase*>& ls) {
- finished_queue.splice( finished_queue.end(), ls ); // splice transfers the elements, so ls is left empty
- progress_thread.signal();
- }
-
- MDSRank(
- mds_rank_t whoami_,
- Mutex &mds_lock_,
- LogChannelRef &clog_,
- SafeTimer &timer_,
- Beacon &beacon_,
- MDSMap *& mdsmap_,
- Messenger *msgr,
- MonClient *monc_,
- Context *respawn_hook_,
- Context *suicide_hook_);
-
- protected:
- ~MDSRank(); // protected and non-virtual as declared here
-
- public:
-
- // Daemon lifetime functions: these guys break the abstraction
- // and call up into the parent MDSDaemon instance. It's kind
- // of unavoidable: if we want any depth into our calls
- // to be able to e.g. tear down the whole process, we have to
- // have a reference going all the way down.
- // >>>
- void suicide();
- void respawn();
- // <<<
-
- /**
- * Call this periodically if inside a potentially long running piece
- * of code while holding the mds_lock
- */
- void heartbeat_reset();
-
- /**
- * Report state DAMAGED to the mon, and then pass on to respawn(). Call
- * this when an unrecoverable error is encountered while attempting
- * to load an MDS rank's data structures. This is *not* for use with
- * errors affecting normal dirfrag/inode objects -- they should be handled
- * through cleaner scrub/repair mechanisms.
- *
- * Callers must already hold mds_lock.
- */
- void damaged();
-
- /**
- * Wrapper around `damaged` for users who are not
- * already holding mds_lock.
- *
- * Callers must not already hold mds_lock.
- */
- void damaged_unlocked();
-
- utime_t get_laggy_until() const;
-
- void send_message_mds(Message *m, mds_rank_t mds);
- void forward_message_mds(Message *req, mds_rank_t mds);
-
- void send_message_client_counted(Message *m, client_t client);
- void send_message_client_counted(Message *m, Session *session);
- void send_message_client_counted(Message *m, Connection *connection);
- void send_message_client_counted(Message *m, const ConnectionRef& con) {
- send_message_client_counted(m, con.get()); // forwards to the Connection* overload
- }
- void send_message_client(Message *m, Session *session);
- void send_message(Message *m, Connection *c);
- void send_message(Message *m, const ConnectionRef& c) {
- send_message(m, c.get()); // forwards to the Connection* overload
- }
-
- void wait_for_active_peer(mds_rank_t who, MDSInternalContextBase *c) {
- waiting_for_active_peer[who].push_back(c);
- }
- void wait_for_cluster_recovered(MDSInternalContextBase *c) {
- assert(cluster_degraded); // only valid while the cluster_degraded flag is set
- waiting_for_active_peer[MDS_RANK_NONE].push_back(c); // parked under the MDS_RANK_NONE key
- }
-
- void wait_for_active(MDSInternalContextBase *c) {
- waiting_for_active.push_back(c);
- }
- void wait_for_replay(MDSInternalContextBase *c) {
- waiting_for_replay.push_back(c);
- }
- void wait_for_reconnect(MDSInternalContextBase *c) {
- waiting_for_reconnect.push_back(c);
- }
- void wait_for_resolve(MDSInternalContextBase *c) {
- waiting_for_resolve.push_back(c);
- }
- void wait_for_mdsmap(epoch_t e, MDSInternalContextBase *c) {
- waiting_for_mdsmap[e].push_back(c);
- }
- void enqueue_replay(MDSInternalContextBase *c) {
- replay_queue.push_back(c);
- }
-
- bool queue_one_replay();
-
- void set_osd_epoch_barrier(epoch_t e);
- epoch_t get_osd_epoch_barrier() const {return osd_epoch_barrier;}
- epoch_t get_osd_epoch() const;
-
- ceph_tid_t issue_tid() { return ++last_tid; } // pre-increment: returns the new value of last_tid
-
- Finisher *finisher;
-
- MDSMap *get_mds_map() { return mdsmap; }
-
- int get_req_rate() const { return logger->get(l_mds_request); } // dereferences logger without a null check
-
- int get_mds_slow_req_count() const { return mds_slow_req_count; }
-
- void dump_status(Formatter *f) const;
-
- void hit_export_target(utime_t now, mds_rank_t rank, double amount=-1.0);
- bool is_export_target(mds_rank_t rank) {
- const set<mds_rank_t>& map_targets = mdsmap->get_mds_info(get_nodeid()).export_targets; // targets recorded in the MDSMap for this rank
- return map_targets.count(rank); // count() converts to bool: true iff rank is in that set
- }
-
- bool evict_client(int64_t session_id, bool wait, bool blacklist,
- std::stringstream& ss, Context *on_killed=nullptr);
-
- protected:
- void dump_clientreplay_status(Formatter *f) const;
- void command_scrub_path(Formatter *f, const string& path, vector<string>& scrubop_vec);
- void command_tag_path(Formatter *f, const string& path,
- const string &tag);
- void command_flush_path(Formatter *f, const string& path);
- void command_flush_journal(Formatter *f);
- void command_get_subtrees(Formatter *f);
- void command_export_dir(Formatter *f,
- const std::string &path, mds_rank_t dest);
- bool command_dirfrag_split(
- cmdmap_t cmdmap,
- std::ostream &ss);
- bool command_dirfrag_merge(
- cmdmap_t cmdmap,
- std::ostream &ss);
- bool command_dirfrag_ls(
- cmdmap_t cmdmap,
- std::ostream &ss,
- Formatter *f);
- int _command_export_dir(const std::string &path, mds_rank_t dest);
- int _command_flush_journal(std::stringstream *ss);
- CDir *_command_dirfrag_get(
- const cmdmap_t &cmdmap,
- std::ostream &ss);
-
- protected:
- Messenger *messenger;
- MonClient *monc;
-
- Context *respawn_hook;
- Context *suicide_hook;
-
- // Friended to access retry_dispatch
- friend class C_MDS_RetryMessage;
-
- // FIXME the state machine logic should be separable from the dispatch
- // logic that calls it.
- // >>>
- void calc_recovery_set();
- void request_state(MDSMap::DaemonState s);
-
- bool standby_replaying; // true if current replay pass is in standby-replay mode
-
- typedef enum {
- // The MDSMap is available, configure default layouts and structures
- MDS_BOOT_INITIAL = 0,
- // We are ready to open some inodes
- MDS_BOOT_OPEN_ROOT,
- // We are ready to do a replay if needed
- MDS_BOOT_PREPARE_LOG,
- // Replay is complete
- MDS_BOOT_REPLAY_DONE
- } BootStep;
- friend class C_MDS_BootStart;
- friend class C_MDS_InternalBootStart;
- void boot_create(); // i am new mds.
- void boot_start(BootStep step=MDS_BOOT_INITIAL, int r=0); // starting|replay
-
- void replay_start();
- void creating_done();
- void starting_done();
- void replay_done();
- void standby_replay_restart();
- void _standby_replay_restart_finish(int r, uint64_t old_read_pos);
- class C_MDS_StandbyReplayRestart;
- class C_MDS_StandbyReplayRestartFinish;
-
- void reopen_log();
-
- void resolve_start();
- void resolve_done();
- void reconnect_start();
- void reconnect_done();
- void rejoin_joint_start();
- void rejoin_start();
- void rejoin_done();
- void recovery_done(int oldstate);
- void clientreplay_start();
- void clientreplay_done();
- void active_start();
- void stopping_start();
- void stopping_done();
-
- void validate_sessions();
- // <<<
-
- // >>>
- void handle_mds_recovery(mds_rank_t who);
- void handle_mds_failure(mds_rank_t who);
- // <<<
-
- /* Update MDSMap export_targets for this rank. Called on ::tick(). */
- void update_targets(utime_t now);
-};
-
-/* Completion context that re-dispatches a held Message through
- * MDSRank::retry_dispatch(). The Message reference passed in becomes
- * this context's responsibility; the functions reached from finish()
- * put the Message exactly once. */
-class C_MDS_RetryMessage : public MDSInternalContext {
-protected:
- Message *m;
-public:
- C_MDS_RetryMessage(MDSRank *mds, Message *m)
- : MDSInternalContext(mds), m(m)
- {
- // A null message is a caller bug.
- assert(this->m);
- }
- void finish(int r) override { mds->retry_dispatch(m); }
-};
-
-/**
- * MDSDaemon-facing view of MDSRank: the service/dispatcher entry points
- * (init/shutdown and the like) that are exposed to MDSDaemon but that
- * subsystems should never touch.
- */
-class MDSRankDispatcher : public MDSRank
-{
-public:
- MDSRankDispatcher(
- mds_rank_t whoami_, Mutex &mds_lock_, LogChannelRef &clog_,
- SafeTimer &timer_, Beacon &beacon_, MDSMap *& mdsmap_,
- Messenger *msgr, MonClient *monc_,
- Context *respawn_hook_, Context *suicide_hook_);
-
- // Service lifecycle
- void init();
- void tick();
- void shutdown();
-
- // Call into me from MDS::ms_dispatch
- bool ms_dispatch(Message *m);
-
- // Map and log-configuration updates
- void handle_mds_map(MMDSMap *m, MDSMap *oldmap);
- void handle_osd_map();
- void update_log_config();
-
- // Command entry points
- bool handle_asok_command(std::string command, cmdmap_t& cmdmap,
- Formatter *f, std::ostream& ss);
- bool handle_command(const cmdmap_t &cmdmap, MCommand *m, int *r,
- std::stringstream *ds, std::stringstream *ss,
- bool *need_reply);
-
- // Session inspection and eviction, selected by a SessionFilter
- void dump_sessions(const SessionFilter &filter, Formatter *f) const;
- void evict_clients(const SessionFilter &filter, MCommand *m);
-};
-
-// Utility for MDS and MDSRank dispatchers: if message `m` (taken from the caller's scope) has a connection whose peer type shares no bits with `peers`, log it, put the message and return true from the enclosing function.
-#define ALLOW_MESSAGES_FROM(peers) \
-do { \
- if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
- dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
- << " allowing=" << #peers << " message=" << *m << dendl; \
- m->put(); \
- return true; \
- } \
-} while (0)
-
-#endif // MDS_RANK_H_
-