diff --git a/src/ceph/src/mon/PGMap.h b/src/ceph/src/mon/PGMap.h
new file mode 100644
index 0000000..257a9c7
--- /dev/null
+++ b/src/ceph/src/mon/PGMap.h
@@ -0,0 +1,686 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+/*
+ * Placement Group Map. Placement Groups are logical sets of objects
+ * that are replicated by the same set of devices. pgid=(r,hash(o)&m)
+ * where & is a bit-wise AND and m=2^k-1
+ */
+
+#ifndef CEPH_PGMAP_H
+#define CEPH_PGMAP_H
+
+#include "include/health.h"
+#include "common/debug.h"
+#include "common/TextTable.h"
+#include "osd/osd_types.h"
+#include "include/mempool.h"
+#include "mon/health_check.h"
+#include <sstream>
+#include "mon/PGStatService.h"
+
+// FIXME: don't like including this here to get OSDMap::Incremental; maybe
+// PGMapUpdater needs its own header.
+#include "osd/OSDMap.h"
+
+namespace ceph { class Formatter; }
+
+class PGMapDigest {
+public:
+  MEMPOOL_CLASS_HELPERS();
+  virtual ~PGMapDigest() {}
+
+  mempool::pgmap::vector<uint64_t> osd_last_seq;
+
+  mutable std::map<int64_t,int64_t> avail_space_by_rule;
+
+  // aggregate state, populated by PGMap child
+  int64_t num_pg = 0, num_osd = 0;
+  int64_t num_pg_active = 0;
+  int64_t num_pg_unknown = 0;
+  mempool::pgmap::unordered_map<int64_t,pool_stat_t> pg_pool_sum;
+  mempool::pgmap::map<int64_t,int64_t> num_pg_by_pool;
+  pool_stat_t pg_sum;
+  osd_stat_t osd_sum;
+  mempool::pgmap::unordered_map<int32_t,int32_t> num_pg_by_state;
+  struct pg_count {
+    int32_t acting = 0;
+    int32_t up = 0;
+    int32_t primary = 0;
+    void encode(bufferlist& bl) const {
+      ::encode(acting, bl);
+      ::encode(up, bl);
+      ::encode(primary, bl);
+    }
+    void decode(bufferlist::iterator& p) {
+      ::decode(acting, p);
+      ::decode(up, p);
+      ::decode(primary, p);
+    }
+  };
+  mempool::pgmap::unordered_map<int32_t,pg_count> num_pg_by_osd;
+
+  // recent deltas, and summation
+  /**
+   * keep track of last deltas for each pool, calculated using
+   * @p pg_pool_sum as baseline.
+   */
+  mempool::pgmap::unordered_map<uint64_t, mempool::pgmap::list<pair<pool_stat_t, utime_t> > > per_pool_sum_deltas;
+  /**
+   * keep track of per-pool timestamp deltas, according to the last update on
+   * each pool.
+   */
+  mempool::pgmap::unordered_map<uint64_t, utime_t> per_pool_sum_deltas_stamps;
+  /**
+   * keep track of sum deltas, per-pool, taking into account any previous
+   * deltas existing in @p per_pool_sum_deltas.  The utime_t as second member
+   * of the pair is the timestamp referring to the last update (i.e., the
+   * first member of the pair) for a given pool.
+   */
+  mempool::pgmap::unordered_map<uint64_t, pair<pool_stat_t,utime_t> > per_pool_sum_delta;
+
+  pool_stat_t pg_sum_delta;
+  utime_t stamp_delta;
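+
+  // Illustrative sketch, not part of the original header: given the delta
+  // tracking above, a per-pool client I/O rate is obtained by dividing the
+  // accumulated counters by the time window.  Field names assume
+  // object_stat_sum_t as defined in osd/osd_types.h:
+  //
+  //   const auto& d = per_pool_sum_delta[poolid];  // pair<pool_stat_t, utime_t>
+  //   double secs = per_pool_sum_deltas_stamps[poolid];
+  //   double rd_ops_per_sec = d.first.stats.sum.num_rd / secs;
+  //   double wr_ops_per_sec = d.first.stats.sum.num_wr / secs;
+  //
+  // This is roughly what the *_rate_summary() helpers below report.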
+
+
+  void print_summary(Formatter *f, ostream *out) const;
+  void print_oneline_summary(Formatter *f, ostream *out) const;
+
+  void recovery_summary(Formatter *f, list<string> *psl,
+                        const pool_stat_t& delta_sum) const;
+  void overall_recovery_summary(Formatter *f, list<string> *psl) const;
+  void pool_recovery_summary(Formatter *f, list<string> *psl,
+                             uint64_t poolid) const;
+  void recovery_rate_summary(Formatter *f, ostream *out,
+                             const pool_stat_t& delta_sum,
+                             utime_t delta_stamp) const;
+  void overall_recovery_rate_summary(Formatter *f, ostream *out) const;
+  void pool_recovery_rate_summary(Formatter *f, ostream *out,
+                                  uint64_t poolid) const;
+  /**
+   * Obtain a formatted/plain output for client I/O, sourced from the stats
+   * of a given @p delta_sum pool over a given @p delta_stamp period of time.
+   */
+  void client_io_rate_summary(Formatter *f, ostream *out,
+                              const pool_stat_t& delta_sum,
+                              utime_t delta_stamp) const;
+  /**
+   * Obtain a formatted/plain output for the overall client I/O, which is
+   * calculated from @p pg_sum_delta and @p stamp_delta.
+   */
+  void overall_client_io_rate_summary(Formatter *f, ostream *out) const;
+  /**
+   * Obtain a formatted/plain output for client I/O over a given pool
+   * with id @p pool_id.  We will then obtain pool-specific data
+   * from @p per_pool_sum_delta.
+   */
+  void pool_client_io_rate_summary(Formatter *f, ostream *out,
+                                   uint64_t poolid) const;
+  /**
+   * Obtain a formatted/plain output for cache tier IO, sourced from the
+   * stats of a given @p delta_sum pool over a given @p delta_stamp period
+   * of time.
+   */
+  void cache_io_rate_summary(Formatter *f, ostream *out,
+                             const pool_stat_t& delta_sum,
+                             utime_t delta_stamp) const;
+  /**
+   * Obtain a formatted/plain output for the overall cache tier IO, which is
+   * calculated from @p pg_sum_delta and @p stamp_delta.
+   */
+  void overall_cache_io_rate_summary(Formatter *f, ostream *out) const;
+  /**
+   * Obtain a formatted/plain output for cache tier IO over a given pool
+   * with id @p pool_id.  We will then obtain pool-specific data
+   * from @p per_pool_sum_delta.
+   */
+  void pool_cache_io_rate_summary(Formatter *f, ostream *out,
+                                  uint64_t poolid) const;
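+
+  // Illustrative sketch, not part of the original header: cache tier rates
+  // come out of the same delta pairs, assuming the tier counters in
+  // object_stat_sum_t (num_flush_kb, num_evict_kb, num_promote):
+  //
+  //   double flush_bytes_per_sec = (delta_sum.stats.sum.num_flush_kb << 10)
+  //     / (double)delta_stamp;
+  //   double evict_bytes_per_sec = (delta_sum.stats.sum.num_evict_kb << 10)
+  //     / (double)delta_stamp;
+  //   double promote_ops_per_sec = delta_sum.stats.sum.num_promote
+  //     / (double)delta_stamp;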
+
+  /**
+   * Return the number of additional bytes that can be stored in this
+   * pool before the first OSD fills up, accounting for PG overhead.
+   */
+  int64_t get_pool_free_space(const OSDMap &osd_map, int64_t poolid) const;
+
+  virtual void dump_pool_stats_full(const OSDMap &osd_map, stringstream *ss,
+                                    Formatter *f, bool verbose) const;
+  void dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const;
+  static void dump_object_stat_sum(TextTable &tbl, Formatter *f,
+                                   const object_stat_sum_t &sum,
+                                   uint64_t avail,
+                                   float raw_used_rate,
+                                   bool verbose, const pg_pool_t *pool);
+
+  size_t get_num_pg_by_osd(int osd) const {
+    auto p = num_pg_by_osd.find(osd);
+    if (p == num_pg_by_osd.end())
+      return 0;
+    else
+      return p->second.acting;
+  }
+  int get_num_primary_pg_by_osd(int osd) const {
+    auto p = num_pg_by_osd.find(osd);
+    if (p == num_pg_by_osd.end())
+      return 0;
+    else
+      return p->second.primary;
+  }
+
+  ceph_statfs get_statfs(OSDMap &osdmap,
+                         boost::optional<int64_t> data_pool) const;
+
+  int64_t get_rule_avail(int ruleno) const {
+    auto i = avail_space_by_rule.find(ruleno);
+    if (i != avail_space_by_rule.end())
+      return i->second;  // reuse the iterator; no second lookup
+    else
+      return 0;
+  }
+
+  // kill me post-mimic or -nautilus
+  bool definitely_converted_snapsets() const {
+    // false negative is okay; false positive is not!
+    return
+      num_pg &&
+      num_pg_unknown == 0 &&
+      pg_sum.stats.sum.num_legacy_snapsets == 0;
+  }
+
+  // kill me post-luminous:
+  virtual float get_fallback_full_ratio() const {
+    return .95;
+  }
+
+  uint64_t get_last_osd_stat_seq(int osd) {
+    if (osd < (int)osd_last_seq.size())
+      return osd_last_seq[osd];
+    return 0;
+  }
+
+  void encode(bufferlist& bl, uint64_t features) const;
+  void decode(bufferlist::iterator& p);
+  void dump(Formatter *f) const;
+  static void generate_test_instances(list<PGMapDigest*>& ls);
+};
+WRITE_CLASS_ENCODER(PGMapDigest::pg_count);
+WRITE_CLASS_ENCODER_FEATURES(PGMapDigest);
+
+class PGMap : public PGMapDigest {
+public:
+  MEMPOOL_CLASS_HELPERS();
+
+  // the map
+  version_t version;
+  epoch_t last_osdmap_epoch;   // last osdmap epoch I applied to the pgmap
+  epoch_t last_pg_scan;        // osdmap epoch
+  mempool::pgmap::unordered_map<int32_t,osd_stat_t> osd_stat;
+  mempool::pgmap::unordered_map<pg_t,pg_stat_t> pg_stat;
+  mempool::pgmap::set<int32_t> full_osds;     // for pre-luminous only
+  mempool::pgmap::set<int32_t> nearfull_osds; // for pre-luminous only
+  float full_ratio;
+  float nearfull_ratio;
+
+  // mapping of osd to most recently reported osdmap epoch
+  mempool::pgmap::unordered_map<int32_t,epoch_t> osd_epochs;
+
+  class Incremental {
+  public:
+    MEMPOOL_CLASS_HELPERS();
+    version_t version;
+    mempool::pgmap::map<pg_t,pg_stat_t> pg_stat_updates;
+    epoch_t osdmap_epoch;
+    epoch_t pg_scan;  // osdmap epoch
+    mempool::pgmap::set<pg_t> pg_remove;
+    float full_ratio;
+    float nearfull_ratio;
+    utime_t stamp;
+
+  private:
+    mempool::pgmap::map<int32_t,osd_stat_t> osd_stat_updates;
+    mempool::pgmap::set<int32_t> osd_stat_rm;
+
+    // mapping of osd to most recently reported osdmap epoch.
+    // 1:1 with osd_stat_updates.
+    mempool::pgmap::map<int32_t,epoch_t> osd_epochs;
+  public:
+
+    const mempool::pgmap::map<int32_t,osd_stat_t> &get_osd_stat_updates() const {
+      return osd_stat_updates;
+    }
+    const mempool::pgmap::set<int32_t> &get_osd_stat_rm() const {
+      return osd_stat_rm;
+    }
+    const mempool::pgmap::map<int32_t,epoch_t> &get_osd_epochs() const {
+      return osd_epochs;
+    }
+
+    template<typename OsdStat>
+    void update_stat(int32_t osd, epoch_t epoch, OsdStat&& stat) {
+      osd_stat_updates[osd] = std::forward<OsdStat>(stat);
+      osd_epochs[osd] = epoch;
+      assert(osd_epochs.size() == osd_stat_updates.size());
+    }
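+    // Illustrative usage, not part of the original header: folding one OSD
+    // stat report into a pending Incremental keeps osd_epochs 1:1 with
+    // osd_stat_updates (the invariant asserted above); osd_id, report_epoch
+    // and st are hypothetical names:
+    //
+    //   PGMap::Incremental inc;
+    //   osd_stat_t st;  // freshly decoded from an OSD report
+    //   inc.update_stat(osd_id, report_epoch, std::move(st));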
+    void stat_osd_out(int32_t osd, epoch_t epoch) {
+      // zero the stats for the osd
+      osd_stat_updates[osd] = osd_stat_t();
+      // only fill in the epoch if the osd didn't already report this
+      // epoch.  that way we zero the stat but still preserve a reported
+      // new epoch...
+      if (!osd_epochs.count(osd))
+        osd_epochs[osd] = epoch;
+      // ...and maintain our invariant.
+      assert(osd_epochs.size() == osd_stat_updates.size());
+    }
+    void stat_osd_down_up(int32_t osd, epoch_t epoch, const PGMap& pg_map) {
+      // zero the op_queue_age_hist for this osd
+      auto p = osd_stat_updates.find(osd);
+      if (p != osd_stat_updates.end()) {
+        p->second.op_queue_age_hist.clear();
+        return;
+      }
+      auto q = pg_map.osd_stat.find(osd);
+      if (q != pg_map.osd_stat.end()) {
+        osd_stat_t& t = osd_stat_updates[osd] = q->second;
+        t.op_queue_age_hist.clear();
+        osd_epochs[osd] = epoch;
+      }
+    }
+    void rm_stat(int32_t osd) {
+      osd_stat_rm.insert(osd);
+      osd_epochs.erase(osd);
+      osd_stat_updates.erase(osd);
+    }
+    void encode(bufferlist &bl, uint64_t features=-1) const;
+    void decode(bufferlist::iterator &bl);
+    void dump(Formatter *f) const;
+    static void generate_test_instances(list<Incremental*>& o);
+
+    Incremental() : version(0), osdmap_epoch(0), pg_scan(0),
+                    full_ratio(0), nearfull_ratio(0) {}
+  };
+
+
+  // aggregate stats (soft state), generated by calc_stats()
+  mutable epoch_t min_last_epoch_clean = 0;
+  mempool::pgmap::unordered_map<int,set<pg_t> > pg_by_osd;
+  mempool::pgmap::unordered_map<int,int> blocked_by_sum;
+  mempool::pgmap::list< pair<pool_stat_t, utime_t> > pg_sum_deltas;
+
+  utime_t stamp;
+
+  void update_global_delta(
+    CephContext *cct,
+    const utime_t ts, const pool_stat_t& pg_sum_old);
+  void update_pool_deltas(
+    CephContext *cct,
+    const utime_t ts,
+    const mempool::pgmap::unordered_map<uint64_t,pool_stat_t>& pg_pool_sum_old);
+  void clear_delta();
+
+  void deleted_pool(int64_t pool) {
+    pg_pool_sum.erase(pool);
+    num_pg_by_pool.erase(pool);
+    per_pool_sum_deltas.erase(pool);
+    per_pool_sum_deltas_stamps.erase(pool);
+    per_pool_sum_delta.erase(pool);
+  }
+
+ private:
+  void update_delta(
+    CephContext *cct,
+    const utime_t ts,
+    const pool_stat_t& old_pool_sum,
+    utime_t *last_ts,
+    const pool_stat_t& current_pool_sum,
+    pool_stat_t *result_pool_delta,
+    utime_t *result_ts_delta,
+    mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list);
+
+  void update_one_pool_delta(CephContext *cct,
+                             const utime_t ts,
+                             const uint64_t pool,
+                             const pool_stat_t& old_pool_sum);
+
+  epoch_t calc_min_last_epoch_clean() const;
+
+ public:
+
+  mempool::pgmap::set<pg_t> creating_pgs;
+  mempool::pgmap::map<int,map<epoch_t,set<pg_t> > > creating_pgs_by_osd_epoch;
+
+  // Bits that used to be enum StuckPG
+  static const int STUCK_INACTIVE = (1<<0);
+  static const int STUCK_UNCLEAN = (1<<1);
+  static const int STUCK_UNDERSIZED = (1<<2);
+  static const int STUCK_DEGRADED = (1<<3);
+  static const int STUCK_STALE = (1<<4);
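+
+  // Illustrative usage, not part of the original header: the STUCK_* bits
+  // combine into the 'types' mask taken by the dump_stuck*() and
+  // get_stuck_stats() methods declared below; pg_map, ss and cutoff are
+  // assumed to be in scope:
+  //
+  //   int types = PGMap::STUCK_INACTIVE | PGMap::STUCK_STALE;
+  //   pg_map.dump_stuck_plain(ss, types, cutoff);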
+
+  PGMap()
+    : version(0),
+      last_osdmap_epoch(0), last_pg_scan(0),
+      full_ratio(0), nearfull_ratio(0)
+  {}
+
+  void set_full_ratios(float full, float nearfull) {
+    if (full_ratio == full && nearfull_ratio == nearfull)
+      return;
+    full_ratio = full;
+    nearfull_ratio = nearfull;
+    redo_full_sets();
+  }
+
+  version_t get_version() const {
+    return version;
+  }
+  void set_version(version_t v) {
+    version = v;
+  }
+  epoch_t get_last_osdmap_epoch() const {
+    return last_osdmap_epoch;
+  }
+  void set_last_osdmap_epoch(epoch_t e) {
+    last_osdmap_epoch = e;
+  }
+  epoch_t get_last_pg_scan() const {
+    return last_pg_scan;
+  }
+  void set_last_pg_scan(epoch_t e) {
+    last_pg_scan = e;
+  }
+  utime_t get_stamp() const {
+    return stamp;
+  }
+  void set_stamp(utime_t s) {
+    stamp = s;
+  }
+
+  pool_stat_t get_pg_pool_sum_stat(int64_t pool) const {
+    auto p = pg_pool_sum.find(pool);
+    if (p != pg_pool_sum.end())
+      return p->second;
+    return pool_stat_t();
+  }
+
+
+  void update_pg(pg_t pgid, bufferlist& bl);
+  void remove_pg(pg_t pgid);
+  void update_osd(int osd, bufferlist& bl);
+  void remove_osd(int osd);
+
+  void apply_incremental(CephContext *cct, const Incremental& inc);
+  void redo_full_sets();
+  void register_nearfull_status(int osd, const osd_stat_t& s);
+  void calc_stats();
+  void stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
+                   bool sameosds=false);
+  void stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
+                   bool sameosds=false);
+  void stat_pg_update(const pg_t pgid, pg_stat_t &prev, bufferlist::iterator& blp);
+  void stat_osd_add(int osd, const osd_stat_t &s);
+  void stat_osd_sub(int osd, const osd_stat_t &s);
+
+  void encode(bufferlist &bl, uint64_t features=-1) const;
+  void decode(bufferlist::iterator &bl);
+
+  /// encode subset of our data to a PGMapDigest
+  void encode_digest(const OSDMap& osdmap,
+                     bufferlist& bl, uint64_t features) const;
+
+  void dirty_all(Incremental& inc);
+
+  int64_t get_rule_avail(const OSDMap& osdmap, int ruleno) const;
+  void get_rules_avail(const OSDMap& osdmap,
+                       std::map<int,int64_t> *avail_map) const;
+  void dump(Formatter *f) const;
+  void dump_basic(Formatter *f) const;
+  void dump_pg_stats(Formatter *f, bool brief) const;
+  void dump_pool_stats(Formatter *f) const;
+  void dump_osd_stats(Formatter *f) const;
+  void dump_delta(Formatter *f) const;
+  void dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const;
+  void dump_pool_stats_full(const OSDMap &osd_map, stringstream *ss,
+                            Formatter *f, bool verbose) const override {
+    get_rules_avail(osd_map, &avail_space_by_rule);
+    PGMapDigest::dump_pool_stats_full(osd_map, ss, f, verbose);
+  }
+
+  void dump_pg_stats_plain(
+    ostream& ss,
+    const mempool::pgmap::unordered_map<pg_t,pg_stat_t>& pg_stats,
+    bool brief) const;
+  void get_stuck_stats(
+    int types, const utime_t cutoff,
+    mempool::pgmap::unordered_map<pg_t,pg_stat_t>& stuck_pgs) const;
+  bool get_stuck_counts(const utime_t cutoff, map<string,int>& note) const;
+  void dump_stuck(Formatter *f, int types, utime_t cutoff) const;
+  void dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const;
+  int dump_stuck_pg_stats(stringstream &ds,
+                          Formatter *f,
+                          int threshold,
+                          vector<string>& args) const;
+  void dump(ostream& ss) const;
+  void dump_basic(ostream& ss) const;
+  void dump_pg_stats(ostream& ss, bool brief) const;
+  void dump_pg_sum_stats(ostream& ss, bool header) const;
+  void dump_pool_stats(ostream& ss, bool header) const;
+  void dump_osd_stats(ostream& ss) const;
+  void dump_osd_sum_stats(ostream& ss) const;
+  void dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const;
+
+  void dump_osd_perf_stats(Formatter *f) const;
+  void print_osd_perf_stats(std::ostream *ss) const;
+
+  void dump_osd_blocked_by_stats(Formatter *f) const;
+  void print_osd_blocked_by_stats(std::ostream *ss) const;
+
+  void get_filtered_pg_stats(uint32_t state, int64_t poolid, int64_t osdid,
+                             bool primary, set<pg_t>& pgs) const;
+
+  epoch_t get_min_last_epoch_clean() const {
+    if (!min_last_epoch_clean)
+      min_last_epoch_clean = calc_min_last_epoch_clean();
+    return min_last_epoch_clean;
+  }
+
+  float get_fallback_full_ratio() const override {
+    if (full_ratio > 0) {
+      return full_ratio;
+    }
+    return .95;
+  }
+
+  void get_health(CephContext *cct,
+                  const OSDMap& osdmap,
+                  list<pair<health_status_t,string> >& summary,
+                  list<pair<health_status_t,string> > *detail) const;
+
+  void get_health_checks(
+    CephContext *cct,
+    const OSDMap& osdmap,
+    health_check_map_t *checks) const;
+
+  static void generate_test_instances(list<PGMap*>& o);
+};
+WRITE_CLASS_ENCODER_FEATURES(PGMap::Incremental)
+WRITE_CLASS_ENCODER_FEATURES(PGMap)
+
+inline ostream& operator<<(ostream& out, const PGMapDigest& m) {
+  m.print_oneline_summary(NULL, &out);
+  return out;
+}
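+
+// Illustrative usage, not part of the original header: any PGMap (or its
+// digest) can be streamed directly through the operator above, which routes
+// to print_oneline_summary(); the output shown is only an example:
+//
+//   std::ostringstream os;
+//   os << pg_map;  // e.g. "128 pgs: 128 active+clean; 1024 MB data, ..."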
+
+int process_pg_map_command(
+  const string& prefix,
+  const map<string,cmd_vartype>& cmdmap,
+  const PGMap& pg_map,
+  const OSDMap& osdmap,
+  Formatter *f,
+  stringstream *ss,
+  bufferlist *odata);
+
+class PGMapUpdater
+{
+public:
+  static void check_osd_map(
+    const OSDMap::Incremental &osd_inc,
+    std::set<int> *need_check_down_pg_osds,
+    std::map<int,utime_t> *last_osd_report,
+    PGMap *pg_map,
+    PGMap::Incremental *pending_inc);
+
+  static void check_osd_map(
+    CephContext *cct,
+    const OSDMap &osdmap,
+    const PGMap& pg_map,
+    PGMap::Incremental *pending_inc);
+  /**
+   * check latest osdmap for new pgs to register
+   */
+  static void register_new_pgs(
+    const OSDMap &osd_map,
+    const PGMap &pg_map,
+    PGMap::Incremental *pending_inc);
+
+  /**
+   * recalculate creating pg mappings
+   */
+  static void update_creating_pgs(
+    const OSDMap &osd_map,
+    const PGMap &pg_map,
+    PGMap::Incremental *pending_inc);
+
+  static void register_pg(
+    const OSDMap &osd_map,
+    pg_t pgid, epoch_t epoch,
+    bool new_pool,
+    const PGMap &pg_map,
+    PGMap::Incremental *pending_inc);
+
+  // mark pg's state stale if its acting primary osd is down
+  static void check_down_pgs(
+    const OSDMap &osd_map,
+    const PGMap &pg_map,
+    bool check_all,
+    const set<int>& need_check_down_pg_osds,
+    PGMap::Incremental *pending_inc);
+};
+
+namespace reweight {
+/* Assign a lower weight to overloaded OSDs.
+ *
+ * The osds that will get a lower weight are those with a utilization
+ * percentage 'oload' percent greater than the average utilization.
+ */
+  int by_utilization(const OSDMap &osd_map,
+                     const PGMap &pg_map,
+                     int oload,
+                     double max_changef,
+                     int max_osds,
+                     bool by_pg, const set<int64_t> *pools,
+                     bool no_increasing,
+                     mempool::osdmap::map<int32_t,uint32_t>* new_weights,
+                     std::stringstream *ss,
+                     std::string *out_str,
+                     Formatter *f);
+}
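+
+// Illustrative note, not part of the original header: by_utilization()
+// treats an OSD as overloaded when its utilization exceeds oload percent
+// of the mean, so with a typical oload of 120 the cutoff is roughly
+// 20% above average:
+//
+//   double overload_util = average_util * oload / 100.0;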
+
+
+class PGMapStatService : virtual public PGStatService {
+protected:
+  const PGMap& pgmap;
+public:
+  PGMapStatService(const PGMap& o)
+    : pgmap(o) {}
+
+  bool is_readable() const override { return true; }
+
+  const pool_stat_t* get_pool_stat(int64_t poolid) const override {
+    auto i = pgmap.pg_pool_sum.find(poolid);
+    if (i != pgmap.pg_pool_sum.end()) {
+      return &i->second;
+    }
+    return nullptr;
+  }
+
+  const osd_stat_t& get_osd_sum() const override { return pgmap.osd_sum; }
+
+  const osd_stat_t *get_osd_stat(int osd) const override {
+    auto i = pgmap.osd_stat.find(osd);
+    if (i == pgmap.osd_stat.end()) {
+      return nullptr;
+    }
+    return &i->second;
+  }
+  const mempool::pgmap::unordered_map<int32_t,osd_stat_t>& get_osd_stat() const override {
+    return pgmap.osd_stat;
+  }
+  float get_full_ratio() const override { return pgmap.full_ratio; }
+  float get_nearfull_ratio() const override { return pgmap.nearfull_ratio; }
+
+  bool have_creating_pgs() const override {
+    return !pgmap.creating_pgs.empty();
+  }
+  bool is_creating_pg(pg_t pgid) const override {
+    return pgmap.creating_pgs.count(pgid);
+  }
+
+  epoch_t get_min_last_epoch_clean() const override {
+    return pgmap.get_min_last_epoch_clean();
+  }
+
+  bool have_full_osds() const override { return !pgmap.full_osds.empty(); }
+  bool have_nearfull_osds() const override {
+    return !pgmap.nearfull_osds.empty();
+  }
+
+  size_t get_num_pg_by_osd(int osd) const override {
+    return pgmap.get_num_pg_by_osd(osd);
+  }
+  ceph_statfs get_statfs(OSDMap& osd_map,
+                         boost::optional<int64_t> data_pool) const override {
+    ceph_statfs statfs;
+    statfs.kb = pgmap.osd_sum.kb;
+    statfs.kb_used = pgmap.osd_sum.kb_used;
+    statfs.kb_avail = pgmap.osd_sum.kb_avail;
+    statfs.num_objects = pgmap.pg_sum.stats.sum.num_objects;
+    return statfs;
+  }
+  void print_summary(Formatter *f, ostream *out) const override {
+    pgmap.print_summary(f, out);
+  }
+  void dump_info(Formatter *f) const override {
+    f->dump_object("pgmap", pgmap);
+  }
+  void dump_fs_stats(stringstream *ss,
+                     Formatter *f,
+                     bool verbose) const override {
+    pgmap.dump_fs_stats(ss, f, verbose);
+  }
+  void dump_pool_stats(const OSDMap& osdm, stringstream *ss, Formatter *f,
+                       bool verbose) const override {
+    pgmap.dump_pool_stats_full(osdm, ss, f, verbose);
+  }
+
+  int process_pg_command(const string& prefix,
+                         const map<string,cmd_vartype>& cmdmap,
+                         const OSDMap& osdmap,
+                         Formatter *f,
+                         stringstream *ss,
+                         bufferlist *odata) const override {
+    return process_pg_map_command(prefix, cmdmap, pgmap, osdmap, f, ss, odata);
+  }
+};
+
+
+#endif