initial code repo

[stor4nfv.git] / src / ceph / src / mon / Monitor.h
diff --git a/src/ceph/src/mon/Monitor.h b/src/ceph/src/mon/Monitor.h

new file mode 100644 (file)

index 0000000..fc97726
--- /dev/null
+++ b/src/ceph/src/mon/Monitor.h
@@ -0,0 +1,1020 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- 
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software 
+ * Foundation.  See file COPYING.
+ * 
+ */
+
+/* 
+ * This is the top level monitor. It runs on each machine in the Monitor   
+ * Cluster. The election of a leader for the paxos algorithm only happens 
+ * once per machine via the elector. There is a separate paxos instance (state) 
+ * kept for each of the system components: Object Store Device (OSD) Monitor, 
+ * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
+ */
+
+#ifndef CEPH_MONITOR_H
+#define CEPH_MONITOR_H
+
+#include <errno.h>
+#include <cmath>
+
+#include "include/types.h"
+#include "include/health.h"
+#include "msg/Messenger.h"
+
+#include "common/Timer.h"
+
+#include "health_check.h"
+#include "MonMap.h"
+#include "Elector.h"
+#include "Paxos.h"
+#include "Session.h"
+#include "PGStatService.h"
+#include "MonCommand.h"
+
+#include "common/LogClient.h"
+#include "auth/cephx/CephxKeyServer.h"
+#include "auth/AuthMethodList.h"
+#include "auth/KeyRing.h"
+#include "messages/MMonCommand.h"
+#include "mon/MonitorDBStore.h"
+#include "include/memory.h"
+#include "mgr/MgrClient.h"
+
+#include "mon/MonOpRequest.h"
+#include "common/WorkQueue.h"
+
+
+#define CEPH_MON_PROTOCOL     13 /* cluster internal */
+
+
+enum {
+  l_cluster_first = 555000,
+  l_cluster_num_mon,
+  l_cluster_num_mon_quorum,
+  l_cluster_num_osd,
+  l_cluster_num_osd_up,
+  l_cluster_num_osd_in,
+  l_cluster_osd_epoch,
+  l_cluster_osd_bytes,
+  l_cluster_osd_bytes_used,
+  l_cluster_osd_bytes_avail,
+  l_cluster_num_pool,
+  l_cluster_num_pg,
+  l_cluster_num_pg_active_clean,
+  l_cluster_num_pg_active,
+  l_cluster_num_pg_peering,
+  l_cluster_num_object,
+  l_cluster_num_object_degraded,
+  l_cluster_num_object_misplaced,
+  l_cluster_num_object_unfound,
+  l_cluster_num_bytes,
+  l_cluster_num_mds_up,
+  l_cluster_num_mds_in,
+  l_cluster_num_mds_failed,
+  l_cluster_mds_epoch,
+  l_cluster_last,
+};
+
+enum {
+  l_mon_first = 456000,
+  l_mon_num_sessions,
+  l_mon_session_add,
+  l_mon_session_rm,
+  l_mon_session_trim,
+  l_mon_num_elections,
+  l_mon_election_call,
+  l_mon_election_win,
+  l_mon_election_lose,
+  l_mon_last,
+};
+
+class QuorumService;
+class PaxosService;
+
+class PerfCounters;
+class AdminSocketHook;
+
+class MMonGetMap;
+class MMonGetVersion;
+class MMonMetadata;
+class MMonSync;
+class MMonScrub;
+class MMonProbe;
+struct MMonSubscribe;
+struct MRoute;
+struct MForward;
+struct MTimeCheck;
+struct MMonHealth;
+
+#define COMPAT_SET_LOC "feature_set"
+
+class C_MonContext final : public FunctionContext {
+  const Monitor *mon;
+public:
+  explicit C_MonContext(Monitor *m, boost::function<void(int)>&& callback)
+    : FunctionContext(std::move(callback)), mon(m) {}
+  void finish(int r) override;
+};
+
+class Monitor : public Dispatcher,
+                public md_config_obs_t {
+public:
+  // me
+  string name;
+  int rank;
+  Messenger *messenger;
+  ConnectionRef con_self;
+  Mutex lock;
+  SafeTimer timer;
+  Finisher finisher;
+  ThreadPool cpu_tp;  ///< threadpool for CPU intensive work
+  
+  /// true if we have ever joined a quorum.  if false, we are either a
+  /// new cluster, a newly joining monitor, or a just-upgraded
+  /// monitor.
+  bool has_ever_joined;
+
+  PerfCounters *logger, *cluster_logger;
+  bool cluster_logger_registered;
+
+  void register_cluster_logger();
+  void unregister_cluster_logger();
+
+  MonMap *monmap;
+  uuid_d fingerprint;
+
+  set<entity_addr_t> extra_probe_peers;
+
+  LogClient log_client;
+  LogChannelRef clog;
+  LogChannelRef audit_clog;
+  KeyRing keyring;
+  KeyServer key_server;
+
+  AuthMethodList auth_cluster_required;
+  AuthMethodList auth_service_required;
+
+  CompatSet features;
+
+  vector<MonCommand> leader_mon_commands; // quorum leader's commands
+  vector<MonCommand> local_mon_commands;  // commands i support
+  bufferlist local_mon_commands_bl;       // encoded version of above
+
+  // for upgrading mon cluster that still uses PGMonitor
+  vector<MonCommand> local_upgrading_mon_commands;  // mixed mon cluster commands
+  bufferlist local_upgrading_mon_commands_bl;       // encoded version of above
+
+  Messenger *mgr_messenger;
+  MgrClient mgr_client;
+  uint64_t mgr_proxy_bytes = 0;  // in-flight proxied mgr command message bytes
+
+  const MonPGStatService *pgservice;
+
+private:
+  void new_tick();
+
+  // -- local storage --
+public:
+  MonitorDBStore *store;
+  static const string MONITOR_NAME;
+  static const string MONITOR_STORE_PREFIX;
+
+  // -- monitor state --
+private:
+  enum {
+    STATE_PROBING = 1,
+    STATE_SYNCHRONIZING,
+    STATE_ELECTING,
+    STATE_LEADER,
+    STATE_PEON,
+    STATE_SHUTDOWN
+  };
+  int state;
+
+public:
+  static const char *get_state_name(int s) {
+    switch (s) {
+    case STATE_PROBING: return "probing";
+    case STATE_SYNCHRONIZING: return "synchronizing";
+    case STATE_ELECTING: return "electing";
+    case STATE_LEADER: return "leader";
+    case STATE_PEON: return "peon";
+    case STATE_SHUTDOWN: return "shutdown";
+    default: return "???";
+    }
+  }
+  const char *get_state_name() const {
+    return get_state_name(state);
+  }
+
+  bool is_shutdown() const { return state == STATE_SHUTDOWN; }
+  bool is_probing() const { return state == STATE_PROBING; }
+  bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }
+  bool is_electing() const { return state == STATE_ELECTING; }
+  bool is_leader() const { return state == STATE_LEADER; }
+  bool is_peon() const { return state == STATE_PEON; }
+
+  const utime_t &get_leader_since() const;
+
+  void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
+
+  // -- elector --
+private:
+  Paxos *paxos;
+  Elector elector;
+  friend class Elector;
+
+  /// features we require of peers (based on on-disk compatset)
+  uint64_t required_features;
+  
+  int leader;            // current leader (to best of knowledge)
+  set<int> quorum;       // current active set of monitors (if !starting)
+  utime_t leader_since;  // when this monitor became the leader, if it is the leader
+  utime_t exited_quorum; // time detected as not in quorum; 0 if in
+
+  // map of counts of connected clients, by type and features, for
+  // each quorum mon
+  map<int,FeatureMap> quorum_feature_map;
+
+  /**
+   * Intersection of quorum member's connection feature bits.
+   */
+  uint64_t quorum_con_features;
+  /**
+   * Intersection of quorum members mon-specific feature bits
+   */
+  mon_feature_t quorum_mon_features;
+
+  set<string> outside_quorum;
+
+  /**
+   * @defgroup Monitor_h_scrub
+   * @{
+   */
+  version_t scrub_version;            ///< paxos version we are scrubbing
+  map<int,ScrubResult> scrub_result;  ///< results so far
+
+  /**
+   * trigger a cross-mon scrub
+   *
+   * Verify all mons are storing identical content
+   */
+  int scrub_start();
+  int scrub();
+  void handle_scrub(MonOpRequestRef op);
+  bool _scrub(ScrubResult *r,
+              pair<string,string> *start,
+              int *num_keys);
+  void scrub_check_results();
+  void scrub_timeout();
+  void scrub_finish();
+  void scrub_reset();
+  void scrub_update_interval(int secs);
+
+  Context *scrub_event;       ///< periodic event to trigger scrub (leader)
+  Context *scrub_timeout_event;  ///< scrub round timeout (leader)
+  void scrub_event_start();
+  void scrub_event_cancel();
+  void scrub_reset_timeout();
+  void scrub_cancel_timeout();
+
+  struct ScrubState {
+    pair<string,string> last_key; ///< last scrubbed key
+    bool finished;
+
+    ScrubState() : finished(false) { }
+    virtual ~ScrubState() { }
+  };
+  ceph::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
+
+  /**
+   * @defgroup Monitor_h_sync Synchronization
+   * @{
+   */
+  /**
+   * @} // provider state
+   */
+  struct SyncProvider {
+    entity_inst_t entity;  ///< who
+    uint64_t cookie;       ///< unique cookie for this sync attempt
+    utime_t timeout;       ///< when we give up and expire this attempt
+    version_t last_committed; ///< last paxos version on peer
+    pair<string,string> last_key; ///< last key sent to (or on) peer
+    bool full;             ///< full scan?
+    MonitorDBStore::Synchronizer synchronizer;   ///< iterator
+
+    SyncProvider() : cookie(0), last_committed(0), full(false) {}
+
+    void reset_timeout(CephContext *cct, int grace) {
+      timeout = ceph_clock_now();
+      timeout += grace;
+    }
+  };
+
+  map<uint64_t, SyncProvider> sync_providers;  ///< cookie -> SyncProvider for those syncing from us
+  uint64_t sync_provider_count;   ///< counter for issued cookies to keep them unique
+
+  /**
+   * @} // requester state
+   */
+  entity_inst_t sync_provider;   ///< who we are syncing from
+  uint64_t sync_cookie;          ///< 0 if we are starting, non-zero otherwise
+  bool sync_full;                ///< true if we are a full sync, false for recent catch-up
+  version_t sync_start_version;  ///< last_committed at sync start
+  Context *sync_timeout_event;   ///< timeout event
+
+  /**
+   * floor for sync source
+   *
+   * When we sync we forget about our old last_committed value which
+   * can be dangerous.  For example, if we have a cluster of:
+   *
+   *   mon.a: lc 100
+   *   mon.b: lc 80
+   *   mon.c: lc 100 (us)
+   *
+   * If something forces us to sync (say, corruption, or manual
+   * intervention, or bug), we forget last_committed, and might abort.
+   * If mon.a happens to be down when we come back, we will see:
+   *
+   *   mon.b: lc 80
+   *   mon.c: lc 0 (us)
+   *
+   * and sync from mon.b, at which point a+b will both have lc 80 and
+   * come online with a majority holding out of date commits.
+   *
+   * Avoid this by preserving our old last_committed value prior to
+   * sync and never going backwards.
+   */
+  version_t sync_last_committed_floor;
+
+  /**
+   * Obtain the synchronization target prefixes in set form.
+   *
+   * We consider a target prefix all those that are relevant when
+   * synchronizing two stores. That is, all those that hold paxos service's
+   * versions, as well as paxos versions, or any control keys such as the
+   * first or last committed version.
+   *
+   * Given the current design, this function should return the name of all and
+   * any available paxos service, plus the paxos name.
+   *
+   * @returns a set of strings referring to the prefixes being synchronized
+   */
+  set<string> get_sync_targets_names();
+
+  /**
+   * Reset the monitor's sync-related data structures for syncing *from* a peer
+   */
+  void sync_reset_requester();
+
+  /**
+   * Reset sync state related to allowing others to sync from us
+   */
+  void sync_reset_provider();
+
+  /**
+   * Caled when a sync attempt times out (requester-side)
+   */
+  void sync_timeout();
+
+  /**
+   * Get the latest monmap for backup purposes during sync
+   */
+  void sync_obtain_latest_monmap(bufferlist &bl);
+
+  /**
+   * Start sync process
+   *
+   * Start pulling committed state from another monitor.
+   *
+   * @param entity where to pull committed state from
+   * @param full whether to do a full sync or just catch up on recent paxos
+   */
+  void sync_start(entity_inst_t &entity, bool full);
+
+public:
+  /**
+   * force a sync on next mon restart
+   */
+  void sync_force(Formatter *f, ostream& ss);
+
+private:
+  /**
+   * store critical state for safekeeping during sync
+   *
+   * We store a few things on the side that we don't want to get clobbered by sync.  This
+   * includes the latest monmap and a lower bound on last_committed.
+   */
+  void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
+
+  /**
+   * reset the sync timeout
+   *
+   * This is used on the client to restart if things aren't progressing
+   */
+  void sync_reset_timeout();
+
+  /**
+   * trim stale sync provider state
+   *
+   * If someone is syncing from us and hasn't talked to us recently, expire their state.
+   */
+  void sync_trim_providers();
+
+  /**
+   * Complete a sync
+   *
+   * Finish up a sync after we've gotten all of the chunks.
+   *
+   * @param last_committed final last_committed value from provider
+   */
+  void sync_finish(version_t last_committed);
+
+  /**
+   * request the next chunk from the provider
+   */
+  void sync_get_next_chunk();
+
+  /**
+   * handle sync message
+   *
+   * @param m Sync message with operation type MMonSync::OP_START_CHUNKS
+   */
+  void handle_sync(MonOpRequestRef op);
+
+  void _sync_reply_no_cookie(MonOpRequestRef op);
+
+  void handle_sync_get_cookie(MonOpRequestRef op);
+  void handle_sync_get_chunk(MonOpRequestRef op);
+  void handle_sync_finish(MonOpRequestRef op);
+
+  void handle_sync_cookie(MonOpRequestRef op);
+  void handle_sync_forward(MonOpRequestRef op);
+  void handle_sync_chunk(MonOpRequestRef op);
+  void handle_sync_no_cookie(MonOpRequestRef op);
+
+  /**
+   * @} // Synchronization
+   */
+
+  list<Context*> waitfor_quorum;
+  list<Context*> maybe_wait_for_quorum;
+
+  /**
+   * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
+   * @{
+   *
+   * We use time checks to keep track of any clock drifting going on in the
+   * cluster. This is accomplished by periodically ping each monitor in the
+   * quorum and register its response time on a map, assessing how much its
+   * clock has drifted. We also take this opportunity to assess the latency
+   * on response.
+   *
+   * This mechanism works as follows:
+   *
+   *  - Leader sends out a 'PING' message to each other monitor in the quorum.
+   *    The message is timestamped with the leader's current time. The leader's
+   *    current time is recorded in a map, associated with each peon's
+   *    instance.
+   *  - The peon replies to the leader with a timestamped 'PONG' message.
+   *  - The leader calculates a delta between the peon's timestamp and its
+   *    current time and stashes it.
+   *  - The leader also calculates the time it took to receive the 'PONG'
+   *    since the 'PING' was sent, and stashes an approximate latency estimate.
+   *  - Once all the quorum members have pong'ed, the leader will share the
+   *    clock skew and latency maps with all the monitors in the quorum.
+   */
+  map<entity_inst_t, utime_t> timecheck_waiting;
+  map<entity_inst_t, double> timecheck_skews;
+  map<entity_inst_t, double> timecheck_latencies;
+  // odd value means we are mid-round; even value means the round has
+  // finished.
+  version_t timecheck_round;
+  unsigned int timecheck_acks;
+  utime_t timecheck_round_start;
+  friend class HealthMonitor;
+  /* When we hit a skew we will start a new round based off of
+   * 'mon_timecheck_skew_interval'. Each new round will be backed off
+   * until we hit 'mon_timecheck_interval' -- which is the typical
+   * interval when not in the presence of a skew.
+   *
+   * This variable tracks the number of rounds with skews since last clean
+   * so that we can report to the user and properly adjust the backoff.
+   */
+  uint64_t timecheck_rounds_since_clean;
+  /**
+   * Time Check event.
+   */
+  Context *timecheck_event;
+
+  void timecheck_start();
+  void timecheck_finish();
+  void timecheck_start_round();
+  void timecheck_finish_round(bool success = true);
+  void timecheck_cancel_round();
+  void timecheck_cleanup();
+  void timecheck_reset_event();
+  void timecheck_check_skews();
+  void timecheck_report();
+  void timecheck();
+  health_status_t timecheck_status(ostringstream &ss,
+                                   const double skew_bound,
+                                   const double latency);
+  void handle_timecheck_leader(MonOpRequestRef op);
+  void handle_timecheck_peon(MonOpRequestRef op);
+  void handle_timecheck(MonOpRequestRef op);
+
+  /**
+   * Returns 'true' if this is considered to be a skew; 'false' otherwise.
+   */
+  bool timecheck_has_skew(const double skew_bound, double *abs) const {
+    double abs_skew = std::fabs(skew_bound);
+    if (abs)
+      *abs = abs_skew;
+    return (abs_skew > g_conf->mon_clock_drift_allowed);
+  }
+
+  /**
+   * @}
+   */
+  /**
+   * Handle ping messages from others.
+   */
+  void handle_ping(MonOpRequestRef op);
+
+  Context *probe_timeout_event = nullptr;  // for probing
+
+  void reset_probe_timeout();
+  void cancel_probe_timeout();
+  void probe_timeout(int r);
+
+  void _apply_compatset_features(CompatSet &new_features);
+
+public:
+  epoch_t get_epoch();
+  int get_leader() const { return leader; }
+  string get_leader_name() {
+    return quorum.empty() ? string() : monmap->get_name(*quorum.begin());
+  }
+  const set<int>& get_quorum() const { return quorum; }
+  list<string> get_quorum_names() {
+    list<string> q;
+    for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p)
+      q.push_back(monmap->get_name(*p));
+    return q;
+  }
+  uint64_t get_quorum_con_features() const {
+    return quorum_con_features;
+  }
+  mon_feature_t get_quorum_mon_features() const {
+    return quorum_mon_features;
+  }
+  uint64_t get_required_features() const {
+    return required_features;
+  }
+  mon_feature_t get_required_mon_features() const {
+    return monmap->get_required_features();
+  }
+  void apply_quorum_to_compatset_features();
+  void apply_monmap_to_compatset_features();
+  void calc_quorum_requirements();
+
+  void get_combined_feature_map(FeatureMap *fm);
+
+private:
+  void _reset();   ///< called from bootstrap, start_, or join_election
+  void wait_for_paxos_write();
+  void _finish_svc_election(); ///< called by {win,lose}_election
+public:
+  void bootstrap();
+  void join_election();
+  void start_election();
+  void win_standalone_election();
+  // end election (called by Elector)
+  void win_election(epoch_t epoch, set<int>& q,
+                   uint64_t features,
+                    const mon_feature_t& mon_features,
+                   const map<int,Metadata>& metadata);
+  void lose_election(epoch_t epoch, set<int>& q, int l,
+                    uint64_t features,
+                     const mon_feature_t& mon_features);
+  // end election (called by Elector)
+  void finish_election();
+
+  void update_logger();
+
+  /**
+   * Vector holding the Services serviced by this Monitor.
+   */
+  vector<PaxosService*> paxos_service;
+
+  class PGMonitor *pgmon() {
+    return (class PGMonitor *)paxos_service[PAXOS_PGMAP];
+  }
+
+  class MDSMonitor *mdsmon() {
+    return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP];
+  }
+
+  class MonmapMonitor *monmon() {
+    return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP];
+  }
+
+  class OSDMonitor *osdmon() {
+    return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP];
+  }
+
+  class AuthMonitor *authmon() {
+    return (class AuthMonitor *)paxos_service[PAXOS_AUTH];
+  }
+
+  class LogMonitor *logmon() {
+    return (class LogMonitor*) paxos_service[PAXOS_LOG];
+  }
+
+  class MgrMonitor *mgrmon() {
+    return (class MgrMonitor*) paxos_service[PAXOS_MGR];
+  }
+
+  class MgrStatMonitor *mgrstatmon() {
+    return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT];
+  }
+
+  class MgrStatMonitor *healthmon() {
+    return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT];
+  }
+
+  friend class Paxos;
+  friend class OSDMonitor;
+  friend class MDSMonitor;
+  friend class MonmapMonitor;
+  friend class PGMonitor;
+  friend class LogMonitor;
+  friend class ConfigKeyService;
+
+  QuorumService *health_monitor;
+  QuorumService *config_key_service;
+
+  // -- sessions --
+  MonSessionMap session_map;
+  Mutex session_map_lock{"Monitor::session_map_lock"};
+  AdminSocketHook *admin_hook;
+
+  template<typename Func, typename...Args>
+  void with_session_map(Func&& func) {
+    Mutex::Locker l(session_map_lock);
+    std::forward<Func>(func)(session_map);
+  }
+  void send_latest_monmap(Connection *con);
+
+  // messages
+  void handle_get_version(MonOpRequestRef op);
+  void handle_subscribe(MonOpRequestRef op);
+  void handle_mon_get_map(MonOpRequestRef op);
+
+  static void _generate_command_map(map<string,cmd_vartype>& cmdmap,
+                                    map<string,string> &param_str_map);
+  static const MonCommand *_get_moncommand(
+    const string &cmd_prefix,
+    const vector<MonCommand>& cmds);
+  bool _allowed_command(MonSession *s, string &module, string &prefix,
+                        const map<string,cmd_vartype>& cmdmap,
+                        const map<string,string>& param_str_map,
+                        const MonCommand *this_cmd);
+  void get_mon_status(Formatter *f, ostream& ss);
+  void _quorum_status(Formatter *f, ostream& ss);
+  bool _add_bootstrap_peer_hint(string cmd, cmdmap_t& cmdmap, ostream& ss);
+  void handle_command(MonOpRequestRef op);
+  void handle_route(MonOpRequestRef op);
+
+  void handle_mon_metadata(MonOpRequestRef op);
+  int get_mon_metadata(int mon, Formatter *f, ostream& err);
+  int print_nodes(Formatter *f, ostream& err);
+
+  // Accumulate metadata across calls to update_mon_metadata
+  map<int, Metadata> mon_metadata;
+  map<int, Metadata> pending_metadata;
+
+  /**
+   *
+   */
+  struct health_cache_t {
+    health_status_t overall;
+    string summary;
+
+    void reset() {
+      // health_status_t doesn't really have a NONE value and we're not
+      // okay with setting something else (say, HEALTH_ERR).  so just
+      // leave it be.
+      summary.clear();
+    }
+  } health_status_cache;
+
+  Context *health_tick_event = nullptr;
+  Context *health_interval_event = nullptr;
+
+  void health_tick_start();
+  void health_tick_stop();
+  utime_t health_interval_calc_next_update();
+  void health_interval_start();
+  void health_interval_stop();
+  void health_events_cleanup();
+
+  void health_to_clog_update_conf(const std::set<std::string> &changed);
+
+  void do_health_to_clog_interval();
+  void do_health_to_clog(bool force = false);
+
+  /**
+   * Generate health report
+   *
+   * @param status one-line status summary
+   * @param detailbl optional bufferlist* to fill with a detailed report
+   * @returns health status
+   */
+  health_status_t get_health(list<string>& status, bufferlist *detailbl,
+                             Formatter *f);
+
+  health_status_t get_health_status(
+    bool want_detail,
+    Formatter *f,
+    std::string *plain,
+    const char *sep1 = " ",
+    const char *sep2 = "; ");
+  void log_health(
+    const health_check_map_t& updated,
+    const health_check_map_t& previous,
+    MonitorDBStore::TransactionRef t);
+
+protected:
+
+  class HealthCheckLogStatus {
+    public:
+    health_status_t severity;
+    std::string last_message;
+    utime_t updated_at = 0;
+    HealthCheckLogStatus(health_status_t severity_,
+                         const std::string &last_message_,
+                         utime_t updated_at_)
+      : severity(severity_),
+        last_message(last_message_),
+        updated_at(updated_at_)
+    {}
+  };
+  std::map<std::string, HealthCheckLogStatus> health_check_log_times;
+
+public:
+
+  void get_cluster_status(stringstream &ss, Formatter *f);
+
+  void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
+  void reply_command(MonOpRequestRef op, int rc, const string &rs, bufferlist& rdata, version_t version);
+
+
+  void handle_probe(MonOpRequestRef op);
+  /**
+   * Handle a Probe Operation, replying with our name, quorum and known versions.
+   *
+   * We use the MMonProbe message class for anything and everything related with
+   * Monitor probing. One of the operations relates directly with the probing
+   * itself, in which we receive a probe request and to which we reply with
+   * our name, our quorum and the known versions for each Paxos service. Thus the
+   * redundant function name. This reply will obviously be sent to the one
+   * probing/requesting these infos.
+   *
+   * @todo Add @pre and @post
+   *
+   * @param m A Probe message, with an operation of type Probe.
+   */
+  void handle_probe_probe(MonOpRequestRef op);
+  void handle_probe_reply(MonOpRequestRef op);
+
+  // request routing
+  struct RoutedRequest {
+    uint64_t tid;
+    bufferlist request_bl;
+    MonSession *session;
+    ConnectionRef con;
+    uint64_t con_features;
+    entity_inst_t client_inst;
+    MonOpRequestRef op;
+
+    RoutedRequest() : tid(0), session(NULL), con_features(0) {}
+    ~RoutedRequest() {
+      if (session)
+       session->put();
+    }
+  };
+  uint64_t routed_request_tid;
+  map<uint64_t, RoutedRequest*> routed_requests;
+  
+  void forward_request_leader(MonOpRequestRef op);
+  void handle_forward(MonOpRequestRef op);
+  void try_send_message(Message *m, const entity_inst_t& to);
+  void send_reply(MonOpRequestRef op, Message *reply);
+  void no_reply(MonOpRequestRef op);
+  void resend_routed_requests();
+  void remove_session(MonSession *s);
+  void remove_all_sessions();
+  void waitlist_or_zap_client(MonOpRequestRef op);
+
+  void send_command(const entity_inst_t& inst,
+                   const vector<string>& com);
+
+public:
+  struct C_Command : public C_MonOp {
+    Monitor *mon;
+    int rc;
+    string rs;
+    bufferlist rdata;
+    version_t version;
+    C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, version_t v) :
+      C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){}
+    C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, bufferlist rd, version_t v) :
+      C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){}
+
+    void _finish(int r) override {
+      MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
+      if (r >= 0) {
+        ostringstream ss;
+        if (!op->get_req()->get_connection()) {
+          ss << "connection dropped for command ";
+        } else {
+          MonSession *s = op->get_session();
+
+          // if client drops we may not have a session to draw information from.
+          if (s) {
+            ss << "from='" << s->inst << "' "
+              << "entity='" << s->entity_name << "' ";
+          } else {
+            ss << "session dropped for command ";
+          }
+        }
+        ss << "cmd='" << m->cmd << "': finished";
+
+        mon->audit_clog->info() << ss.str();
+       mon->reply_command(op, rc, rs, rdata, version);
+      }
+      else if (r == -ECANCELED)
+        return;
+      else if (r == -EAGAIN)
+       mon->dispatch_op(op);
+      else
+       assert(0 == "bad C_Command return value");
+    }
+  };
+
+ private:
+  class C_RetryMessage : public C_MonOp {
+    Monitor *mon;
+  public:
+    C_RetryMessage(Monitor *m, MonOpRequestRef op) :
+      C_MonOp(op), mon(m) { }
+
+    void _finish(int r) override {
+      if (r == -EAGAIN || r >= 0)
+        mon->dispatch_op(op);
+      else if (r == -ECANCELED)
+        return;
+      else
+       assert(0 == "bad C_RetryMessage return value");
+    }
+  };
+
+  //ms_dispatch handles a lot of logic and we want to reuse it
+  //on forwarded messages, so we create a non-locking version for this class
+  void _ms_dispatch(Message *m);
+  bool ms_dispatch(Message *m) override {
+    lock.Lock();
+    _ms_dispatch(m);
+    lock.Unlock();
+    return true;
+  }
+  void dispatch_op(MonOpRequestRef op);
+  //mon_caps is used for un-connected messages from monitors
+  MonCap * mon_caps;
+  bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
+  bool ms_verify_authorizer(Connection *con, int peer_type,
+                           int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
+                           bool& isvalid, CryptoKey& session_key) override;
+  bool ms_handle_reset(Connection *con) override;
+  void ms_handle_remote_reset(Connection *con) override {}
+  bool ms_handle_refused(Connection *con) override;
+
+  int write_default_keyring(bufferlist& bl);
+  void extract_save_mon_key(KeyRing& keyring);
+
+  void collect_metadata(Metadata *m);
+  void update_mon_metadata(int from, Metadata&& m);
+  int load_metadata();
+  void count_metadata(const string& field, Formatter *f);
+  void count_metadata(const string& field, map<string,int> *out);
+
+  // features
+  static CompatSet get_initial_supported_features();
+  static CompatSet get_supported_features();
+  static CompatSet get_legacy_features();
+  /// read the ondisk features into the CompatSet pointed to by read_features
+  static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
+  void read_features();
+  void write_features(MonitorDBStore::TransactionRef t);
+
+  OpTracker op_tracker;
+
+ public:
+  Monitor(CephContext *cct_, string nm, MonitorDBStore *s,
+         Messenger *m, Messenger *mgr_m, MonMap *map);
+  ~Monitor() override;
+
+  static int check_features(MonitorDBStore *store);
+
+  // config observer
+  const char** get_tracked_conf_keys() const override;
+  void handle_conf_change(const struct md_config_t *conf,
+                          const std::set<std::string> &changed) override;
+
+  void update_log_clients();
+  int sanitize_options();
+  int preinit();
+  int init();
+  void init_paxos();
+  void refresh_from_paxos(bool *need_bootstrap);
+  void shutdown();
+  void tick();
+
+  void handle_signal(int sig);
+
+  int mkfs(bufferlist& osdmapbl);
+
+  /**
+   * check cluster_fsid file
+   *
+   * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
+   */
+  int check_fsid();
+
+  /**
+   * write cluster_fsid file
+   *
+   * @return 0 on success, or negative error code
+   */
+  int write_fsid();
+  int write_fsid(MonitorDBStore::TransactionRef t);
+
+  void do_admin_command(std::string command, cmdmap_t& cmdmap,
+                       std::string format, ostream& ss);
+
+private:
+  // don't allow copying
+  Monitor(const Monitor& rhs);
+  Monitor& operator=(const Monitor &rhs);
+
+public:
+  static void format_command_descriptions(const std::vector<MonCommand> &commands,
+                                         Formatter *f,
+                                         bufferlist *rdata,
+                                         bool hide_mgr_flag=false);
+
+  const std::vector<MonCommand> &get_local_commands(mon_feature_t f) {
+    if (f.contains_all(ceph::features::mon::FEATURE_LUMINOUS))
+      return local_mon_commands;
+    else
+      return local_upgrading_mon_commands;
+  }
+  const bufferlist& get_local_commands_bl(mon_feature_t f) {
+    if (f.contains_all(ceph::features::mon::FEATURE_LUMINOUS))
+      return local_mon_commands_bl;
+    else
+      return local_upgrading_mon_commands_bl;
+  }
+  void set_leader_commands(const std::vector<MonCommand>& cmds) {
+    leader_mon_commands = cmds;
+  }
+
+  static bool is_keyring_required();
+};
+
+#define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)")
+#define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature (2, "global version sequencing (v0.52)")
+#define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature (3, "single paxos with k/v store (v0.\?)")
+#define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools")
+#define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding")
+#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
+#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
+#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
+#define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS CompatSet::Feature(9, "luminous ondisk layout")
+// make sure you add your feature to Monitor::get_supported_features
+
+
+
+#endif