1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 * Placement Group Map. Placement Groups are logical sets of objects
17 * that are replicated by the same set of devices. pgid=(r,hash(o)&m)
18 * where & is a bit-wise AND and m=2^k-1
24 #include "include/health.h"
25 #include "common/debug.h"
26 #include "common/TextTable.h"
27 #include "osd/osd_types.h"
28 #include "include/mempool.h"
29 #include "mon/health_check.h"
31 #include "mon/PGStatService.h"
33 // FIXME: don't like including this here to get OSDMap::Incremental, maybe
34 // PGMapUpdater needs its own header.
35 #include "osd/OSDMap.h"
37 namespace ceph { class Formatter; }
41 MEMPOOL_CLASS_HELPERS();
42 virtual ~PGMapDigest() {}
44 mempool::pgmap::vector<uint64_t> osd_last_seq;
46 mutable std::map<int, int64_t> avail_space_by_rule;
48 // aggregate state, populated by PGMap child
49 int64_t num_pg = 0, num_osd = 0;
50 int64_t num_pg_active = 0;
51 int64_t num_pg_unknown = 0;
52 mempool::pgmap::unordered_map<int32_t,pool_stat_t> pg_pool_sum;
53 mempool::pgmap::map<int64_t,int64_t> num_pg_by_pool;
56 mempool::pgmap::unordered_map<int32_t,int32_t> num_pg_by_state;
61 void encode(bufferlist& bl) const {
64 ::encode(primary, bl);
66 void decode(bufferlist::iterator& p) {
72 mempool::pgmap::unordered_map<int32_t,pg_count> num_pg_by_osd;
74 // recent deltas, and summation
76 * keep track of last deltas for each pool, calculated using
77 * @p pg_pool_sum as baseline.
79 mempool::pgmap::unordered_map<uint64_t, mempool::pgmap::list< pair<pool_stat_t, utime_t> > > per_pool_sum_deltas;
81 * keep track of per-pool timestamp deltas, according to last update on
84 mempool::pgmap::unordered_map<uint64_t, utime_t> per_pool_sum_deltas_stamps;
86 * keep track of sum deltas, per-pool, taking into account any previous
87 * deltas existing in @p per_pool_sum_deltas. The utime_t as second member
 * of the pair is the timestamp referring to the last update (i.e., the first
89 * member of the pair) for a given pool.
91 mempool::pgmap::unordered_map<uint64_t, pair<pool_stat_t,utime_t> > per_pool_sum_delta;
93 pool_stat_t pg_sum_delta;
97 void print_summary(Formatter *f, ostream *out) const;
98 void print_oneline_summary(Formatter *f, ostream *out) const;
100 void recovery_summary(Formatter *f, list<string> *psl,
101 const pool_stat_t& delta_sum) const;
102 void overall_recovery_summary(Formatter *f, list<string> *psl) const;
103 void pool_recovery_summary(Formatter *f, list<string> *psl,
104 uint64_t poolid) const;
105 void recovery_rate_summary(Formatter *f, ostream *out,
106 const pool_stat_t& delta_sum,
107 utime_t delta_stamp) const;
108 void overall_recovery_rate_summary(Formatter *f, ostream *out) const;
109 void pool_recovery_rate_summary(Formatter *f, ostream *out,
110 uint64_t poolid) const;
112 * Obtain a formatted/plain output for client I/O, source from stats for a
113 * given @p delta_sum pool over a given @p delta_stamp period of time.
115 void client_io_rate_summary(Formatter *f, ostream *out,
116 const pool_stat_t& delta_sum,
117 utime_t delta_stamp) const;
119 * Obtain a formatted/plain output for the overall client I/O, which is
120 * calculated resorting to @p pg_sum_delta and @p stamp_delta.
122 void overall_client_io_rate_summary(Formatter *f, ostream *out) const;
124 * Obtain a formatted/plain output for client I/O over a given pool
125 * with id @p pool_id. We will then obtain pool-specific data
126 * from @p per_pool_sum_delta.
128 void pool_client_io_rate_summary(Formatter *f, ostream *out,
129 uint64_t poolid) const;
131 * Obtain a formatted/plain output for cache tier IO, source from stats for a
132 * given @p delta_sum pool over a given @p delta_stamp period of time.
134 void cache_io_rate_summary(Formatter *f, ostream *out,
135 const pool_stat_t& delta_sum,
136 utime_t delta_stamp) const;
138 * Obtain a formatted/plain output for the overall cache tier IO, which is
139 * calculated resorting to @p pg_sum_delta and @p stamp_delta.
141 void overall_cache_io_rate_summary(Formatter *f, ostream *out) const;
143 * Obtain a formatted/plain output for cache tier IO over a given pool
144 * with id @p pool_id. We will then obtain pool-specific data
145 * from @p per_pool_sum_delta.
147 void pool_cache_io_rate_summary(Formatter *f, ostream *out,
148 uint64_t poolid) const;
151 * Return the number of additional bytes that can be stored in this
152 * pool before the first OSD fills up, accounting for PG overhead.
154 int64_t get_pool_free_space(const OSDMap &osd_map, int64_t poolid) const;
156 virtual void dump_pool_stats_full(const OSDMap &osd_map, stringstream *ss,
157 Formatter *f, bool verbose) const;
158 void dump_fs_stats(stringstream *ss, Formatter *f, bool verbose) const;
159 static void dump_object_stat_sum(TextTable &tbl, Formatter *f,
160 const object_stat_sum_t &sum,
163 bool verbose, const pg_pool_t *pool);
165 size_t get_num_pg_by_osd(int osd) const {
166 auto p = num_pg_by_osd.find(osd);
167 if (p == num_pg_by_osd.end())
170 return p->second.acting;
172 int get_num_primary_pg_by_osd(int osd) const {
173 auto p = num_pg_by_osd.find(osd);
174 if (p == num_pg_by_osd.end())
177 return p->second.primary;
180 ceph_statfs get_statfs(OSDMap &osdmap,
181 boost::optional<int64_t> data_pool) const;
183 int64_t get_rule_avail(int ruleno) const {
184 auto i = avail_space_by_rule.find(ruleno);
185 if (i != avail_space_by_rule.end())
186 return avail_space_by_rule[ruleno];
191 // kill me post-mimic or -nautilus
192 bool definitely_converted_snapsets() const {
193 // false negative is okay; false positive is not!
196 num_pg_unknown == 0 &&
197 pg_sum.stats.sum.num_legacy_snapsets == 0;
200 // kill me post-luminous:
201 virtual float get_fallback_full_ratio() const {
205 uint64_t get_last_osd_stat_seq(int osd) {
206 if (osd < (int)osd_last_seq.size())
207 return osd_last_seq[osd];
211 void encode(bufferlist& bl, uint64_t features) const;
212 void decode(bufferlist::iterator& p);
213 void dump(Formatter *f) const;
214 static void generate_test_instances(list<PGMapDigest*>& ls);
216 WRITE_CLASS_ENCODER(PGMapDigest::pg_count);
217 WRITE_CLASS_ENCODER_FEATURES(PGMapDigest);
219 class PGMap : public PGMapDigest {
221 MEMPOOL_CLASS_HELPERS();
225 epoch_t last_osdmap_epoch; // last osdmap epoch i applied to the pgmap
226 epoch_t last_pg_scan; // osdmap epoch
227 mempool::pgmap::unordered_map<int32_t,osd_stat_t> osd_stat;
228 mempool::pgmap::unordered_map<pg_t,pg_stat_t> pg_stat;
229 mempool::pgmap::set<int32_t> full_osds; // for pre-luminous only
230 mempool::pgmap::set<int32_t> nearfull_osds; // for pre-luminous only
232 float nearfull_ratio;
234 // mapping of osd to most recently reported osdmap epoch
235 mempool::pgmap::unordered_map<int32_t,epoch_t> osd_epochs;
239 MEMPOOL_CLASS_HELPERS();
241 mempool::pgmap::map<pg_t,pg_stat_t> pg_stat_updates;
242 epoch_t osdmap_epoch;
243 epoch_t pg_scan; // osdmap epoch
244 mempool::pgmap::set<pg_t> pg_remove;
246 float nearfull_ratio;
250 mempool::pgmap::map<int32_t,osd_stat_t> osd_stat_updates;
251 mempool::pgmap::set<int32_t> osd_stat_rm;
253 // mapping of osd to most recently reported osdmap epoch.
254 // 1:1 with osd_stat_updates.
255 mempool::pgmap::map<int32_t,epoch_t> osd_epochs;
258 const mempool::pgmap::map<int32_t, osd_stat_t> &get_osd_stat_updates() const {
259 return osd_stat_updates;
261 const mempool::pgmap::set<int32_t> &get_osd_stat_rm() const {
264 const mempool::pgmap::map<int32_t, epoch_t> &get_osd_epochs() const {
268 template<typename OsdStat>
269 void update_stat(int32_t osd, epoch_t epoch, OsdStat&& stat) {
270 osd_stat_updates[osd] = std::forward<OsdStat>(stat);
271 osd_epochs[osd] = epoch;
272 assert(osd_epochs.size() == osd_stat_updates.size());
274 void stat_osd_out(int32_t osd, epoch_t epoch) {
275 // 0 the stats for the osd
276 osd_stat_updates[osd] = osd_stat_t();
277 // only fill in the epoch if the osd didn't already report htis
278 // epoch. that way we zero the stat but still preserve a reported
280 if (!osd_epochs.count(osd))
281 osd_epochs[osd] = epoch;
282 // ...and maintain our invariant.
283 assert(osd_epochs.size() == osd_stat_updates.size());
285 void stat_osd_down_up(int32_t osd, epoch_t epoch, const PGMap& pg_map) {
286 // 0 the op_queue_age_hist for this osd
287 auto p = osd_stat_updates.find(osd);
288 if (p != osd_stat_updates.end()) {
289 p->second.op_queue_age_hist.clear();
292 auto q = pg_map.osd_stat.find(osd);
293 if (q != pg_map.osd_stat.end()) {
294 osd_stat_t& t = osd_stat_updates[osd] = q->second;
295 t.op_queue_age_hist.clear();
296 osd_epochs[osd] = epoch;
299 void rm_stat(int32_t osd) {
300 osd_stat_rm.insert(osd);
301 osd_epochs.erase(osd);
302 osd_stat_updates.erase(osd);
304 void encode(bufferlist &bl, uint64_t features=-1) const;
305 void decode(bufferlist::iterator &bl);
306 void dump(Formatter *f) const;
307 static void generate_test_instances(list<Incremental*>& o);
309 Incremental() : version(0), osdmap_epoch(0), pg_scan(0),
310 full_ratio(0), nearfull_ratio(0) {}
314 // aggregate stats (soft state), generated by calc_stats()
315 mutable epoch_t min_last_epoch_clean = 0;
316 mempool::pgmap::unordered_map<int,set<pg_t> > pg_by_osd;
317 mempool::pgmap::unordered_map<int,int> blocked_by_sum;
318 mempool::pgmap::list< pair<pool_stat_t, utime_t> > pg_sum_deltas;
322 void update_global_delta(
324 const utime_t ts, const pool_stat_t& pg_sum_old);
325 void update_pool_deltas(
328 const mempool::pgmap::unordered_map<uint64_t, pool_stat_t>& pg_pool_sum_old);
331 void deleted_pool(int64_t pool) {
332 pg_pool_sum.erase(pool);
333 num_pg_by_pool.erase(pool);
334 per_pool_sum_deltas.erase(pool);
335 per_pool_sum_deltas_stamps.erase(pool);
336 per_pool_sum_delta.erase(pool);
343 const pool_stat_t& old_pool_sum,
345 const pool_stat_t& current_pool_sum,
346 pool_stat_t *result_pool_delta,
347 utime_t *result_ts_delta,
348 mempool::pgmap::list<pair<pool_stat_t,utime_t> > *delta_avg_list);
350 void update_one_pool_delta(CephContext *cct,
353 const pool_stat_t& old_pool_sum);
355 epoch_t calc_min_last_epoch_clean() const;
359 mempool::pgmap::set<pg_t> creating_pgs;
360 mempool::pgmap::map<int,map<epoch_t,set<pg_t> > > creating_pgs_by_osd_epoch;
// Bits that used to be enum StuckPG
363 static const int STUCK_INACTIVE = (1<<0);
364 static const int STUCK_UNCLEAN = (1<<1);
365 static const int STUCK_UNDERSIZED = (1<<2);
366 static const int STUCK_DEGRADED = (1<<3);
367 static const int STUCK_STALE = (1<<4);
371 last_osdmap_epoch(0), last_pg_scan(0),
372 full_ratio(0), nearfull_ratio(0)
375 void set_full_ratios(float full, float nearfull) {
376 if (full_ratio == full && nearfull_ratio == nearfull)
379 nearfull_ratio = nearfull;
383 version_t get_version() const {
386 void set_version(version_t v) {
389 epoch_t get_last_osdmap_epoch() const {
390 return last_osdmap_epoch;
392 void set_last_osdmap_epoch(epoch_t e) {
393 last_osdmap_epoch = e;
395 epoch_t get_last_pg_scan() const {
398 void set_last_pg_scan(epoch_t e) {
401 utime_t get_stamp() const {
404 void set_stamp(utime_t s) {
408 pool_stat_t get_pg_pool_sum_stat(int64_t pool) const {
409 auto p = pg_pool_sum.find(pool);
410 if (p != pg_pool_sum.end())
412 return pool_stat_t();
416 void update_pg(pg_t pgid, bufferlist& bl);
417 void remove_pg(pg_t pgid);
418 void update_osd(int osd, bufferlist& bl);
419 void remove_osd(int osd);
421 void apply_incremental(CephContext *cct, const Incremental& inc);
422 void redo_full_sets();
423 void register_nearfull_status(int osd, const osd_stat_t& s);
425 void stat_pg_add(const pg_t &pgid, const pg_stat_t &s,
426 bool sameosds=false);
427 void stat_pg_sub(const pg_t &pgid, const pg_stat_t &s,
428 bool sameosds=false);
429 void stat_pg_update(const pg_t pgid, pg_stat_t &prev, bufferlist::iterator& blp);
430 void stat_osd_add(int osd, const osd_stat_t &s);
431 void stat_osd_sub(int osd, const osd_stat_t &s);
433 void encode(bufferlist &bl, uint64_t features=-1) const;
434 void decode(bufferlist::iterator &bl);
436 /// encode subset of our data to a PGMapDigest
437 void encode_digest(const OSDMap& osdmap,
438 bufferlist& bl, uint64_t features) const;
440 void dirty_all(Incremental& inc);
442 int64_t get_rule_avail(const OSDMap& osdmap, int ruleno) const;
443 void get_rules_avail(const OSDMap& osdmap,
444 std::map<int,int64_t> *avail_map) const;
445 void dump(Formatter *f) const;
446 void dump_basic(Formatter *f) const;
447 void dump_pg_stats(Formatter *f, bool brief) const;
448 void dump_pool_stats(Formatter *f) const;
449 void dump_osd_stats(Formatter *f) const;
450 void dump_delta(Formatter *f) const;
451 void dump_filtered_pg_stats(Formatter *f, set<pg_t>& pgs) const;
452 void dump_pool_stats_full(const OSDMap &osd_map, stringstream *ss,
453 Formatter *f, bool verbose) const override {
454 get_rules_avail(osd_map, &avail_space_by_rule);
455 PGMapDigest::dump_pool_stats_full(osd_map, ss, f, verbose);
458 void dump_pg_stats_plain(
460 const mempool::pgmap::unordered_map<pg_t, pg_stat_t>& pg_stats,
462 void get_stuck_stats(
463 int types, const utime_t cutoff,
464 mempool::pgmap::unordered_map<pg_t, pg_stat_t>& stuck_pgs) const;
465 bool get_stuck_counts(const utime_t cutoff, map<string, int>& note) const;
466 void dump_stuck(Formatter *f, int types, utime_t cutoff) const;
467 void dump_stuck_plain(ostream& ss, int types, utime_t cutoff) const;
468 int dump_stuck_pg_stats(stringstream &ds,
471 vector<string>& args) const;
472 void dump(ostream& ss) const;
473 void dump_basic(ostream& ss) const;
474 void dump_pg_stats(ostream& ss, bool brief) const;
475 void dump_pg_sum_stats(ostream& ss, bool header) const;
476 void dump_pool_stats(ostream& ss, bool header) const;
477 void dump_osd_stats(ostream& ss) const;
478 void dump_osd_sum_stats(ostream& ss) const;
479 void dump_filtered_pg_stats(ostream& ss, set<pg_t>& pgs) const;
481 void dump_osd_perf_stats(Formatter *f) const;
482 void print_osd_perf_stats(std::ostream *ss) const;
484 void dump_osd_blocked_by_stats(Formatter *f) const;
485 void print_osd_blocked_by_stats(std::ostream *ss) const;
487 void get_filtered_pg_stats(uint32_t state, int64_t poolid, int64_t osdid,
488 bool primary, set<pg_t>& pgs) const;
490 epoch_t get_min_last_epoch_clean() const {
491 if (!min_last_epoch_clean)
492 min_last_epoch_clean = calc_min_last_epoch_clean();
493 return min_last_epoch_clean;
496 float get_fallback_full_ratio() const override {
497 if (full_ratio > 0) {
503 void get_health(CephContext *cct,
504 const OSDMap& osdmap,
505 list<pair<health_status_t,string> >& summary,
506 list<pair<health_status_t,string> > *detail) const;
508 void get_health_checks(
510 const OSDMap& osdmap,
511 health_check_map_t *checks) const;
513 static void generate_test_instances(list<PGMap*>& o);
515 WRITE_CLASS_ENCODER_FEATURES(PGMap::Incremental)
516 WRITE_CLASS_ENCODER_FEATURES(PGMap)
518 inline ostream& operator<<(ostream& out, const PGMapDigest& m) {
519 m.print_oneline_summary(NULL, &out);
523 int process_pg_map_command(
524 const string& prefix,
525 const map<string,cmd_vartype>& cmdmap,
527 const OSDMap& osdmap,
535 static void check_osd_map(
536 const OSDMap::Incremental &osd_inc,
537 std::set<int> *need_check_down_pg_osds,
538 std::map<int,utime_t> *last_osd_report,
540 PGMap::Incremental *pending_inc);
542 static void check_osd_map(
544 const OSDMap &osdmap,
546 PGMap::Incremental *pending_inc);
548 * check latest osdmap for new pgs to register
550 static void register_new_pgs(
551 const OSDMap &osd_map,
553 PGMap::Incremental *pending_inc);
556 * recalculate creating pg mappings
558 static void update_creating_pgs(
559 const OSDMap &osd_map,
561 PGMap::Incremental *pending_inc);
563 static void register_pg(
564 const OSDMap &osd_map,
565 pg_t pgid, epoch_t epoch,
568 PGMap::Incremental *pending_inc);
570 // mark pg's state stale if its acting primary osd is down
571 static void check_down_pgs(
572 const OSDMap &osd_map,
575 const set<int>& need_check_down_pg_osds,
576 PGMap::Incremental *pending_inc);
580 /* Assign a lower weight to overloaded OSDs.
 * The osds that will get a lower weight are those with a utilization
583 * percentage 'oload' percent greater than the average utilization.
585 int by_utilization(const OSDMap &osd_map,
590 bool by_pg, const set<int64_t> *pools,
592 mempool::osdmap::map<int32_t, uint32_t>* new_weights,
593 std::stringstream *ss,
594 std::string *out_str,
599 class PGMapStatService : virtual public PGStatService {
603 PGMapStatService(const PGMap& o)
606 bool is_readable() const override { return true; }
608 const pool_stat_t* get_pool_stat(int64_t poolid) const override {
609 auto i = pgmap.pg_pool_sum.find(poolid);
610 if (i != pgmap.pg_pool_sum.end()) {
616 const osd_stat_t& get_osd_sum() const override { return pgmap.osd_sum; }
618 const osd_stat_t *get_osd_stat(int osd) const override {
619 auto i = pgmap.osd_stat.find(osd);
620 if (i == pgmap.osd_stat.end()) {
625 const mempool::pgmap::unordered_map<int32_t,osd_stat_t>& get_osd_stat() const override {
626 return pgmap.osd_stat;
628 float get_full_ratio() const override { return pgmap.full_ratio; }
629 float get_nearfull_ratio() const override { return pgmap.nearfull_ratio; }
631 bool have_creating_pgs() const override {
632 return !pgmap.creating_pgs.empty();
634 bool is_creating_pg(pg_t pgid) const override {
635 return pgmap.creating_pgs.count(pgid);
638 epoch_t get_min_last_epoch_clean() const override {
639 return pgmap.get_min_last_epoch_clean();
642 bool have_full_osds() const override { return !pgmap.full_osds.empty(); }
643 bool have_nearfull_osds() const override {
644 return !pgmap.nearfull_osds.empty();
647 size_t get_num_pg_by_osd(int osd) const override {
648 return pgmap.get_num_pg_by_osd(osd);
650 ceph_statfs get_statfs(OSDMap& osd_map,
651 boost::optional<int64_t> data_pool) const override {
653 statfs.kb = pgmap.osd_sum.kb;
654 statfs.kb_used = pgmap.osd_sum.kb_used;
655 statfs.kb_avail = pgmap.osd_sum.kb_avail;
656 statfs.num_objects = pgmap.pg_sum.stats.sum.num_objects;
659 void print_summary(Formatter *f, ostream *out) const override {
660 pgmap.print_summary(f, out);
662 virtual void dump_info(Formatter *f) const override {
663 f->dump_object("pgmap", pgmap);
665 void dump_fs_stats(stringstream *ss,
667 bool verbose) const override {
668 pgmap.dump_fs_stats(ss, f, verbose);
670 void dump_pool_stats(const OSDMap& osdm, stringstream *ss, Formatter *f,
671 bool verbose) const override {
672 pgmap.dump_pool_stats_full(osdm, ss, f, verbose);
675 int process_pg_command(const string& prefix,
676 const map<string,cmd_vartype>& cmdmap,
677 const OSDMap& osdmap,
680 bufferlist *odata) const override {
681 return process_pg_map_command(prefix, cmdmap, pgmap, osdmap, f, ss, odata);