X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=src%2Fceph%2Fsrc%2Fosd%2FOSDMap.h;fp=src%2Fceph%2Fsrc%2Fosd%2FOSDMap.h;h=6ba56511823d1390ad4aa5a4bfcd7b8cfcaabe40;hb=812ff6ca9fcd3e629e49d4328905f33eee8ca3f5;hp=0000000000000000000000000000000000000000;hpb=15280273faafb77777eab341909a3f495cf248d9;p=stor4nfv.git diff --git a/src/ceph/src/osd/OSDMap.h b/src/ceph/src/osd/OSDMap.h new file mode 100644 index 0000000..6ba5651 --- /dev/null +++ b/src/ceph/src/osd/OSDMap.h @@ -0,0 +1,1410 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * Copyright (C) 2013,2014 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + + +#ifndef CEPH_OSDMAP_H +#define CEPH_OSDMAP_H + +#include "include/cpp-btree/btree_map.h" + +/* + * describe properties of the OSD cluster. + * disks, disk groups, total # osds, + * + */ +#include "include/types.h" +#include "osd_types.h" + +//#include "include/ceph_features.h" +#include "crush/CrushWrapper.h" +#include +#include +#include +#include +#include "include/memory.h" +using namespace std; + +// forward declaration +class CephContext; +class CrushWrapper; +class health_check_map_t; + +// FIXME C++11 does not have std::equal for two differently-typed containers. +// use this until we move to c++14 +template +bool vectors_equal(A a, B b) +{ + return + a.size() == b.size() && + (a.empty() || + memcmp((char*)&a[0], (char*)&b[0], sizeof(a[0]) * a.size()) == 0); +} + + +/* + * we track up to two intervals during which the osd was alive and + * healthy. the most recent is [up_from,up_thru), where up_thru is + * the last epoch the osd is known to have _started_. i.e., a lower + * bound on the actual osd death. down_at (if it is > up_from) is an + * upper bound on the actual osd death. + * + * the second is the last_clean interval [first,last]. in that case, + * the last interval is the last epoch known to have been either + * _finished_, or during which the osd cleanly shut down. when + * possible, we push this forward to the epoch the osd was eventually + * marked down. + * + * the lost_at is used to allow build_prior to proceed without waiting + * for an osd to recover. In certain cases, progress may be blocked + * because an osd is down that may contain updates (i.e., a pg may have + * gone rw during an interval). If the osd can't be brought online, we + * can force things to proceed knowing that we _might_ be losing some + * acked writes. If the osd comes back to life later, that's fine to, + * but those writes will still be lost (the divergent objects will be + * thrown out). + */ +struct osd_info_t { + epoch_t last_clean_begin; // last interval that ended with a clean osd shutdown + epoch_t last_clean_end; + epoch_t up_from; // epoch osd marked up + epoch_t up_thru; // lower bound on actual osd death (if > up_from) + epoch_t down_at; // upper bound on actual osd death (if > up_from) + epoch_t lost_at; // last epoch we decided data was "lost" + + osd_info_t() : last_clean_begin(0), last_clean_end(0), + up_from(0), up_thru(0), down_at(0), lost_at(0) {} + + void dump(Formatter *f) const; + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(osd_info_t) + +ostream& operator<<(ostream& out, const osd_info_t& info); + +struct osd_xinfo_t { + utime_t down_stamp; ///< timestamp when we were last marked down + float laggy_probability; ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff definitely laggy + __u32 laggy_interval; ///< average interval between being marked laggy and recovering + uint64_t features; ///< features supported by this osd we should know about + __u32 old_weight; ///< weight prior to being auto marked out + + osd_xinfo_t() : laggy_probability(0), laggy_interval(0), + features(0), old_weight(0) {} + + void dump(Formatter *f) const; + void encode(bufferlist& bl) const; + void decode(bufferlist::iterator& bl); + static void generate_test_instances(list& o); +}; +WRITE_CLASS_ENCODER(osd_xinfo_t) + +ostream& operator<<(ostream& out, const osd_xinfo_t& xi); + + +struct PGTempMap { +#if 1 + bufferlist data; + typedef btree::btree_map map_t; + map_t map; + + void encode(bufferlist& bl) const { + uint32_t n = map.size(); + ::encode(n, bl); + for (auto &p : map) { + ::encode(p.first, bl); + bl.append((char*)p.second, (*p.second + 1) * sizeof(int32_t)); + } + } + void decode(bufferlist::iterator& p) { + data.clear(); + map.clear(); + uint32_t n; + ::decode(n, p); + if (!n) + return; + bufferlist::iterator pstart = p; + size_t start_off = pstart.get_off(); + vector> offsets; + offsets.resize(n); + for (unsigned i=0; i 1) { + data.rebuild(); + } + //map.reserve(n); + char *start = data.c_str(); + for (auto i : offsets) { + map.insert(map.end(), make_pair(i.first, (int32_t*)(start + i.second))); + } + } + void rebuild() { + bufferlist bl; + encode(bl); + auto p = bl.begin(); + decode(p); + } + friend bool operator==(const PGTempMap& l, const PGTempMap& r) { + return + l.map.size() == r.map.size() && + l.data.contents_equal(r.data); + } + + class iterator { + map_t::const_iterator it; + map_t::const_iterator end; + pair> current; + void init_current() { + if (it != end) { + current.first = it->first; + assert(it->second); + current.second.resize(*it->second); + int32_t *p = it->second + 1; + for (int n = 0; n < *it->second; ++n, ++p) { + current.second[n] = *p; + } + } + } + public: + iterator(map_t::const_iterator p, + map_t::const_iterator e) + : it(p), end(e) { + init_current(); + } + + const pair>& operator*() const { + return current; + } + const pair>* operator->() const { + return ¤t; + } + friend bool operator==(const iterator& l, const iterator& r) { + return l.it == r.it; + } + friend bool operator!=(const iterator& l, const iterator& r) { + return l.it != r.it; + } + iterator& operator++() { + ++it; + if (it != end) + init_current(); + return *this; + } + iterator operator++(int) { + iterator r = *this; + ++it; + if (it != end) + init_current(); + return r; + } + }; + iterator begin() const { + return iterator(map.begin(), map.end()); + } + iterator end() const { + return iterator(map.end(), map.end()); + } + iterator find(pg_t pgid) const { + return iterator(map.find(pgid), map.end()); + } + size_t size() const { + return map.size(); + } + size_t count(pg_t pgid) const { + return map.count(pgid); + } + void erase(pg_t pgid) { + map.erase(pgid); + } + void clear() { + map.clear(); + data.clear(); + } + void set(pg_t pgid, const mempool::osdmap::vector& v) { + size_t need = sizeof(int32_t) * (1 + v.size()); + if (need < data.get_append_buffer_unused_tail_length()) { + bufferptr z(data.get_append_buffer_unused_tail_length()); + z.zero(); + data.append(z.c_str(), z.length()); + } + ::encode(v, data); + map[pgid] = (int32_t*)(data.back().end_c_str()) - (1 + v.size()); + } + mempool::osdmap::vector get(pg_t pgid) { + mempool::osdmap::vector v; + int32_t *p = map[pgid]; + size_t n = *p++; + v.resize(n); + for (size_t i = 0; i < n; ++i, ++p) { + v[i] = *p; + } + return v; + } +#else + // trivial implementation + mempool::osdmap::map > pg_temp; + + void encode(bufferlist& bl) const { + ::encode(pg_temp, bl); + } + void decode(bufferlist::iterator& p) { + ::decode(pg_temp, p); + } + friend bool operator==(const PGTempMap& l, const PGTempMap& r) { + return + l.pg_temp.size() == r.pg_temp.size() && + l.pg_temp == r.pg_temp; + } + + class iterator { + mempool::osdmap::map >::const_iterator it; + public: + iterator(mempool::osdmap::map >::const_iterator p) + : it(p) {} + + pair&> operator*() const { + return *it; + } + const pair>* operator->() const { + return &*it; + } + friend bool operator==(const iterator& l, const iterator& r) { + return l.it == r.it; + } + friend bool operator!=(const iterator& l, const iterator& r) { + return l.it != r.it; + } + iterator& operator++() { + ++it; + return *this; + } + iterator operator++(int) { + iterator r = *this; + ++it; + return r; + } + }; + iterator begin() const { + return iterator(pg_temp.cbegin()); + } + iterator end() const { + return iterator(pg_temp.cend()); + } + iterator find(pg_t pgid) const { + return iterator(pg_temp.find(pgid)); + } + size_t size() const { + return pg_temp.size(); + } + size_t count(pg_t pgid) const { + return pg_temp.count(pgid); + } + void erase(pg_t pgid) { + pg_temp.erase(pgid); + } + void clear() { + pg_temp.clear(); + } + void set(pg_t pgid, const mempool::osdmap::vector& v) { + pg_temp[pgid] = v; + } + const mempool::osdmap::vector& get(pg_t pgid) { + return pg_temp.at(pgid); + } +#endif + void dump(Formatter *f) const { + for (const auto &pg : *this) { + f->open_object_section("osds"); + f->dump_stream("pgid") << pg.first; + f->open_array_section("osds"); + for (const auto osd : pg.second) + f->dump_int("osd", osd); + f->close_section(); + f->close_section(); + } + } +}; +WRITE_CLASS_ENCODER(PGTempMap) + +/** OSDMap + */ +class OSDMap { +public: + MEMPOOL_CLASS_HELPERS(); + + class Incremental { + public: + MEMPOOL_CLASS_HELPERS(); + + /// feature bits we were encoded with. the subsequent OSDMap + /// encoding should match. + uint64_t encode_features; + uuid_d fsid; + epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch + utime_t modified; + int64_t new_pool_max; //incremented by the OSDMonitor on each pool create + int32_t new_flags; + int8_t new_require_osd_release = -1; + + // full (rare) + bufferlist fullmap; // in lieu of below. + bufferlist crush; + + // incremental + int32_t new_max_osd; + mempool::osdmap::map new_pools; + mempool::osdmap::map new_pool_names; + mempool::osdmap::set old_pools; + mempool::osdmap::map > new_erasure_code_profiles; + mempool::osdmap::vector old_erasure_code_profiles; + mempool::osdmap::map new_up_client; + mempool::osdmap::map new_up_cluster; + mempool::osdmap::map new_state; // XORed onto previous state. + mempool::osdmap::map new_weight; + mempool::osdmap::map > new_pg_temp; // [] to remove + mempool::osdmap::map new_primary_temp; // [-1] to remove + mempool::osdmap::map new_primary_affinity; + mempool::osdmap::map new_up_thru; + mempool::osdmap::map > new_last_clean_interval; + mempool::osdmap::map new_lost; + mempool::osdmap::map new_uuid; + mempool::osdmap::map new_xinfo; + + mempool::osdmap::map new_blacklist; + mempool::osdmap::vector old_blacklist; + mempool::osdmap::map new_hb_back_up; + mempool::osdmap::map new_hb_front_up; + + mempool::osdmap::map> new_pg_upmap; + mempool::osdmap::map>> new_pg_upmap_items; + mempool::osdmap::set old_pg_upmap, old_pg_upmap_items; + + string cluster_snapshot; + + float new_nearfull_ratio = -1; + float new_backfillfull_ratio = -1; + float new_full_ratio = -1; + + int8_t new_require_min_compat_client = -1; + + mutable bool have_crc; ///< crc values are defined + uint32_t full_crc; ///< crc of the resulting OSDMap + mutable uint32_t inc_crc; ///< crc of this incremental + + int get_net_marked_out(const OSDMap *previous) const; + int get_net_marked_down(const OSDMap *previous) const; + int identify_osd(uuid_d u) const; + + void encode_client_old(bufferlist& bl) const; + void encode_classic(bufferlist& bl, uint64_t features) const; + void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const; + void decode_classic(bufferlist::iterator &p); + void decode(bufferlist::iterator &bl); + void dump(Formatter *f) const; + static void generate_test_instances(list& o); + + explicit Incremental(epoch_t e=0) : + encode_features(0), + epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1), + have_crc(false), full_crc(0), inc_crc(0) { + memset(&fsid, 0, sizeof(fsid)); + } + explicit Incremental(bufferlist &bl) { + bufferlist::iterator p = bl.begin(); + decode(p); + } + explicit Incremental(bufferlist::iterator &p) { + decode(p); + } + + pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) { + if (new_pools.count(pool) == 0) + new_pools[pool] = *orig; + return &new_pools[pool]; + } + bool has_erasure_code_profile(const string &name) const { + auto i = new_erasure_code_profiles.find(name); + return i != new_erasure_code_profiles.end(); + } + void set_erasure_code_profile(const string &name, + const map& profile) { + new_erasure_code_profiles[name] = profile; + } + + /// propage update pools' snap metadata to any of their tiers + int propagate_snaps_to_tiers(CephContext *cct, const OSDMap &base); + + /// filter out osds with any pending state changing + size_t get_pending_state_osds(vector *osds) { + assert(osds); + osds->clear(); + + for (auto &p : new_state) { + osds->push_back(p.first); + } + + return osds->size(); + } + + bool pending_osd_has_state(int osd, unsigned state) { + return new_state.count(osd) && (new_state[osd] & state) != 0; + } + + void pending_osd_state_set(int osd, unsigned state) { + new_state[osd] |= state; + } + + // cancel the specified pending osd state if there is any + // return ture on success, false otherwise. + bool pending_osd_state_clear(int osd, unsigned state) { + if (!pending_osd_has_state(osd, state)) { + // never has been set or already has been cancelled. + return false; + } + + new_state[osd] &= ~state; + return true; + } + + }; + +private: + uuid_d fsid; + epoch_t epoch; // what epoch of the osd cluster descriptor is this + utime_t created, modified; // epoch start time + int32_t pool_max; // the largest pool num, ever + + uint32_t flags; + + int num_osd; // not saved; see calc_num_osds + int num_up_osd; // not saved; see calc_num_osds + int num_in_osd; // not saved; see calc_num_osds + + int32_t max_osd; + vector osd_state; + + struct addrs_s { + mempool::osdmap::vector > client_addr; + mempool::osdmap::vector > cluster_addr; + mempool::osdmap::vector > hb_back_addr; + mempool::osdmap::vector > hb_front_addr; + entity_addr_t blank; + }; + ceph::shared_ptr osd_addrs; + + mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out" + mempool::osdmap::vector osd_info; + ceph::shared_ptr pg_temp; // temp pg mapping (e.g. while we rebuild) + ceph::shared_ptr< mempool::osdmap::map > primary_temp; // temp primary mapping (e.g. while we rebuild) + ceph::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline + + // remap (post-CRUSH, pre-up) + mempool::osdmap::map> pg_upmap; ///< remap pg + mempool::osdmap::map>> pg_upmap_items; ///< remap osds in up set + + mempool::osdmap::map pools; + mempool::osdmap::map pool_name; + mempool::osdmap::map > erasure_code_profiles; + mempool::osdmap::map name_pool; + + ceph::shared_ptr< mempool::osdmap::vector > osd_uuid; + mempool::osdmap::vector osd_xinfo; + + mempool::osdmap::unordered_map blacklist; + + epoch_t cluster_snapshot_epoch; + string cluster_snapshot; + bool new_blacklist_entries; + + float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0; + + /// min compat client we want to support + uint8_t require_min_compat_client = 0; // CEPH_RELEASE_* + +public: + /// require osds to run at least this release + uint8_t require_osd_release = 0; // CEPH_RELEASE_* + +private: + mutable uint64_t cached_up_osd_features; + + mutable bool crc_defined; + mutable uint32_t crc; + + void _calc_up_osd_features(); + + public: + bool have_crc() const { return crc_defined; } + uint32_t get_crc() const { return crc; } + + ceph::shared_ptr crush; // hierarchical map +private: + uint32_t crush_version = 1; + + friend class OSDMonitor; + + public: + OSDMap() : epoch(0), + pool_max(0), + flags(0), + num_osd(0), num_up_osd(0), num_in_osd(0), + max_osd(0), + osd_addrs(std::make_shared()), + pg_temp(std::make_shared()), + primary_temp(std::make_shared>()), + osd_uuid(std::make_shared>()), + cluster_snapshot_epoch(0), + new_blacklist_entries(false), + cached_up_osd_features(0), + crc_defined(false), crc(0), + crush(std::make_shared()) { + memset(&fsid, 0, sizeof(fsid)); + } + + // no copying +private: + OSDMap(const OSDMap& other) = default; + OSDMap& operator=(const OSDMap& other) = default; +public: + + void deepish_copy_from(const OSDMap& o) { + *this = o; + primary_temp.reset(new mempool::osdmap::map(*o.primary_temp)); + pg_temp.reset(new PGTempMap(*o.pg_temp)); + osd_uuid.reset(new mempool::osdmap::vector(*o.osd_uuid)); + + if (o.osd_primary_affinity) + osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity)); + + // NOTE: this still references shared entity_addr_t's. + osd_addrs.reset(new addrs_s(*o.osd_addrs)); + + // NOTE: we do not copy crush. note that apply_incremental will + // allocate a new CrushWrapper, though. + } + + // map info + const uuid_d& get_fsid() const { return fsid; } + void set_fsid(uuid_d& f) { fsid = f; } + + epoch_t get_epoch() const { return epoch; } + void inc_epoch() { epoch++; } + + void set_epoch(epoch_t e); + + uint32_t get_crush_version() const { + return crush_version; + } + + /* stamps etc */ + const utime_t& get_created() const { return created; } + const utime_t& get_modified() const { return modified; } + + bool is_blacklisted(const entity_addr_t& a) const; + void get_blacklist(list > *bl) const; + void get_blacklist(std::set *bl) const; + + string get_cluster_snapshot() const { + if (cluster_snapshot_epoch == epoch) + return cluster_snapshot; + return string(); + } + + float get_full_ratio() const { + return full_ratio; + } + float get_backfillfull_ratio() const { + return backfillfull_ratio; + } + float get_nearfull_ratio() const { + return nearfull_ratio; + } + void get_full_osd_util( + const mempool::pgmap::unordered_map &osd_stat, + map *full, + map *backfill, + map *nearfull) const; + void get_full_pools(CephContext *cct, + set *full, + set *backfillfull, + set *nearfull) const; + void get_full_osd_counts(set *full, set *backfill, + set *nearfull) const; + + + /***** cluster state *****/ + /* osds */ + int get_max_osd() const { return max_osd; } + void set_max_osd(int m); + + unsigned get_num_osds() const { + return num_osd; + } + unsigned get_num_up_osds() const { + return num_up_osd; + } + unsigned get_num_in_osds() const { + return num_in_osd; + } + /// recalculate cached values for get_num{,_up,_in}_osds + int calc_num_osds(); + + void get_all_osds(set& ls) const; + void get_up_osds(set& ls) const; + void get_out_osds(set& ls) const; + unsigned get_num_pg_temp() const { + return pg_temp->size(); + } + + int get_flags() const { return flags; } + bool test_flag(int f) const { return flags & f; } + void set_flag(int f) { flags |= f; } + void clear_flag(int f) { flags &= ~f; } + + static void calc_state_set(int state, set& st); + + int get_state(int o) const { + assert(o < max_osd); + return osd_state[o]; + } + int get_state(int o, set& st) const { + assert(o < max_osd); + unsigned t = osd_state[o]; + calc_state_set(t, st); + return osd_state[o]; + } + void set_state(int o, unsigned s) { + assert(o < max_osd); + osd_state[o] = s; + } + void set_weight(int o, unsigned w) { + assert(o < max_osd); + osd_weight[o] = w; + if (w) + osd_state[o] |= CEPH_OSD_EXISTS; + } + unsigned get_weight(int o) const { + assert(o < max_osd); + return osd_weight[o]; + } + float get_weightf(int o) const { + return (float)get_weight(o) / (float)CEPH_OSD_IN; + } + void adjust_osd_weights(const map& weights, Incremental& inc) const; + + void set_primary_affinity(int o, int w) { + assert(o < max_osd); + if (!osd_primary_affinity) + osd_primary_affinity.reset( + new mempool::osdmap::vector<__u32>( + max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY)); + (*osd_primary_affinity)[o] = w; + } + unsigned get_primary_affinity(int o) const { + assert(o < max_osd); + if (!osd_primary_affinity) + return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; + return (*osd_primary_affinity)[o]; + } + float get_primary_affinityf(int o) const { + return (float)get_primary_affinity(o) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY; + } + + bool has_erasure_code_profile(const string &name) const { + auto i = erasure_code_profiles.find(name); + return i != erasure_code_profiles.end(); + } + int get_erasure_code_profile_default(CephContext *cct, + map &profile_map, + ostream *ss); + void set_erasure_code_profile(const string &name, + const map& profile) { + erasure_code_profiles[name] = profile; + } + const map &get_erasure_code_profile( + const string &name) const { + static map empty; + auto i = erasure_code_profiles.find(name); + if (i == erasure_code_profiles.end()) + return empty; + else + return i->second; + } + const mempool::osdmap::map > &get_erasure_code_profiles() const { + return erasure_code_profiles; + } + + bool exists(int osd) const { + //assert(osd >= 0); + return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS); + } + + bool is_destroyed(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_DESTROYED); + } + + bool is_up(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_UP); + } + + bool has_been_up_since(int osd, epoch_t epoch) const { + return is_up(osd) && get_up_from(osd) <= epoch; + } + + bool is_down(int osd) const { + return !is_up(osd); + } + + bool is_out(int osd) const { + return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT; + } + + bool is_in(int osd) const { + return !is_out(osd); + } + + bool is_noup(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOUP); + } + + bool is_nodown(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN); + } + + bool is_noin(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOIN); + } + + bool is_noout(int osd) const { + return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT); + } + + void get_noup_osds(vector *osds) const { + assert(osds); + osds->clear(); + + for (int i = 0; i < max_osd; i++) { + if (is_noup(i)) { + osds->push_back(i); + } + } + } + + void get_nodown_osds(vector *osds) const { + assert(osds); + osds->clear(); + + for (int i = 0; i < max_osd; i++) { + if (is_nodown(i)) { + osds->push_back(i); + } + } + } + + void get_noin_osds(vector *osds) const { + assert(osds); + osds->clear(); + + for (int i = 0; i < max_osd; i++) { + if (is_noin(i)) { + osds->push_back(i); + } + } + } + + void get_noout_osds(vector *osds) const { + assert(osds); + osds->clear(); + + for (int i = 0; i < max_osd; i++) { + if (is_noout(i)) { + osds->push_back(i); + } + } + } + + /** + * check if an entire crush subtree is down + */ + bool subtree_is_down(int id, set *down_cache) const; + bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set *down_cache) const; + + bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, set *down_in_osds, set *up_in_osds, + set *subtree_up, unordered_map > *subtree_type_down) const; + + int identify_osd(const entity_addr_t& addr) const; + int identify_osd(const uuid_d& u) const; + int identify_osd_on_all_channels(const entity_addr_t& addr) const; + + bool have_addr(const entity_addr_t& addr) const { + return identify_osd(addr) >= 0; + } + int find_osd_on_ip(const entity_addr_t& ip) const; + const entity_addr_t &get_addr(int osd) const { + assert(exists(osd)); + return osd_addrs->client_addr[osd] ? *osd_addrs->client_addr[osd] : osd_addrs->blank; + } + const entity_addr_t &get_cluster_addr(int osd) const { + assert(exists(osd)); + if (!osd_addrs->cluster_addr[osd] || *osd_addrs->cluster_addr[osd] == entity_addr_t()) + return get_addr(osd); + return *osd_addrs->cluster_addr[osd]; + } + const entity_addr_t &get_hb_back_addr(int osd) const { + assert(exists(osd)); + return osd_addrs->hb_back_addr[osd] ? *osd_addrs->hb_back_addr[osd] : osd_addrs->blank; + } + const entity_addr_t &get_hb_front_addr(int osd) const { + assert(exists(osd)); + return osd_addrs->hb_front_addr[osd] ? *osd_addrs->hb_front_addr[osd] : osd_addrs->blank; + } + entity_inst_t get_most_recent_inst(int osd) const { + assert(exists(osd)); + return entity_inst_t(entity_name_t::OSD(osd), get_addr(osd)); + } + entity_inst_t get_inst(int osd) const { + assert(is_up(osd)); + return get_most_recent_inst(osd); + } + entity_inst_t get_cluster_inst(int osd) const { + assert(is_up(osd)); + return entity_inst_t(entity_name_t::OSD(osd), get_cluster_addr(osd)); + } + entity_inst_t get_hb_back_inst(int osd) const { + assert(is_up(osd)); + return entity_inst_t(entity_name_t::OSD(osd), get_hb_back_addr(osd)); + } + entity_inst_t get_hb_front_inst(int osd) const { + assert(is_up(osd)); + return entity_inst_t(entity_name_t::OSD(osd), get_hb_front_addr(osd)); + } + + const uuid_d& get_uuid(int osd) const { + assert(exists(osd)); + return (*osd_uuid)[osd]; + } + + const epoch_t& get_up_from(int osd) const { + assert(exists(osd)); + return osd_info[osd].up_from; + } + const epoch_t& get_up_thru(int osd) const { + assert(exists(osd)); + return osd_info[osd].up_thru; + } + const epoch_t& get_down_at(int osd) const { + assert(exists(osd)); + return osd_info[osd].down_at; + } + const osd_info_t& get_info(int osd) const { + assert(osd < max_osd); + return osd_info[osd]; + } + + const osd_xinfo_t& get_xinfo(int osd) const { + assert(osd < max_osd); + return osd_xinfo[osd]; + } + + int get_next_up_osd_after(int n) const { + if (get_max_osd() == 0) + return -1; + for (int i = n + 1; i != n; ++i) { + if (i >= get_max_osd()) + i = 0; + if (i == n) + break; + if (is_up(i)) + return i; + } + return -1; + } + + int get_previous_up_osd_before(int n) const { + if (get_max_osd() == 0) + return -1; + for (int i = n - 1; i != n; --i) { + if (i < 0) + i = get_max_osd() - 1; + if (i == n) + break; + if (is_up(i)) + return i; + } + return -1; + } + + /** + * get feature bits required by the current structure + * + * @param entity_type [in] what entity type we are asking about + * @param mask [out] set of all possible map-related features we could set + * @return feature bits used by this map + */ + uint64_t get_features(int entity_type, uint64_t *mask) const; + + /** + * get oldest *client* version (firefly, hammer, etc.) that can connect given + * the feature bits required (according to get_features()). + */ + uint8_t get_min_compat_client() const; + + /** + * get intersection of features supported by up osds + */ + uint64_t get_up_osd_features() const; + + int apply_incremental(const Incremental &inc); + + /// try to re-use/reference addrs in oldmap from newmap + static void dedup(const OSDMap *oldmap, OSDMap *newmap); + + static void clean_temps(CephContext *cct, const OSDMap& osdmap, + Incremental *pending_inc); + + // serialize, unserialize +private: + void encode_client_old(bufferlist& bl) const; + void encode_classic(bufferlist& bl, uint64_t features) const; + void decode_classic(bufferlist::iterator& p); + void post_decode(); +public: + void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const; + void decode(bufferlist& bl); + void decode(bufferlist::iterator& bl); + + + /**** mapping facilities ****/ + int map_to_pg( + int64_t pool, + const string& name, + const string& key, + const string& nspace, + pg_t *pg) const; + int object_locator_to_pg(const object_t& oid, const object_locator_t& loc, + pg_t &pg) const; + pg_t object_locator_to_pg(const object_t& oid, + const object_locator_t& loc) const { + pg_t pg; + int ret = object_locator_to_pg(oid, loc, pg); + assert(ret == 0); + return pg; + } + + + static object_locator_t file_to_object_locator(const file_layout_t& layout) { + return object_locator_t(layout.pool_id, layout.pool_ns); + } + + ceph_object_layout file_to_object_layout(object_t oid, + file_layout_t& layout) const { + return make_object_layout(oid, layout.pool_id, layout.pool_ns); + } + + ceph_object_layout make_object_layout(object_t oid, int pg_pool, + string nspace) const; + + int get_pg_num(int pg_pool) const + { + const pg_pool_t *pool = get_pg_pool(pg_pool); + assert(NULL != pool); + return pool->get_pg_num(); + } + + bool pg_exists(pg_t pgid) const { + const pg_pool_t *p = get_pg_pool(pgid.pool()); + return p && pgid.ps() < p->get_pg_num(); + } + + int get_pg_pool_min_size(pg_t pgid) const { + if (!pg_exists(pgid)) { + return -ENOENT; + } + const pg_pool_t *p = get_pg_pool(pgid.pool()); + assert(p); + return p->get_min_size(); + } + + int get_pg_pool_size(pg_t pgid) const { + if (!pg_exists(pgid)) { + return -ENOENT; + } + const pg_pool_t *p = get_pg_pool(pgid.pool()); + assert(p); + return p->get_size(); + } + +private: + /// pg -> (raw osd list) + void _pg_to_raw_osds( + const pg_pool_t& pool, pg_t pg, + vector *osds, + ps_t *ppps) const; + int _pick_primary(const vector& osds) const; + void _remove_nonexistent_osds(const pg_pool_t& pool, vector& osds) const; + + void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool, + vector *osds, int *primary) const; + + /// apply pg_upmap[_items] mappings + void _apply_upmap(const pg_pool_t& pi, pg_t pg, vector *raw) const; + + /// pg -> (up osd list) + void _raw_to_up_osds(const pg_pool_t& pool, const vector& raw, + vector *up) const; + + + /** + * Get the pg and primary temp, if they are specified. + * @param temp_pg [out] Will be empty or contain the temp PG mapping on return + * @param temp_primary [out] Will be the value in primary_temp, or a value derived + * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary. + */ + void _get_temp_osds(const pg_pool_t& pool, pg_t pg, + vector *temp_pg, int *temp_primary) const; + + /** + * map to up and acting. Fills in whatever fields are non-NULL. + */ + void _pg_to_up_acting_osds(const pg_t& pg, vector *up, int *up_primary, + vector *acting, int *acting_primary, + bool raw_pg_to_pg = true) const; + +public: + /*** + * This is suitable only for looking at raw CRUSH outputs. It skips + * applying the temp and up checks and should not be used + * by anybody for data mapping purposes. + * raw and primary must be non-NULL + */ + void pg_to_raw_osds(pg_t pg, vector *raw, int *primary) const; + /// map a pg to its acting set. @return acting set size + void pg_to_acting_osds(const pg_t& pg, vector *acting, + int *acting_primary) const { + _pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary); + } + void pg_to_acting_osds(pg_t pg, vector& acting) const { + return pg_to_acting_osds(pg, &acting, NULL); + } + /** + * This does not apply temp overrides and should not be used + * by anybody for data mapping purposes. Specify both pointers. + */ + void pg_to_raw_up(pg_t pg, vector *up, int *primary) const; + /** + * map a pg to its acting set as well as its up set. You must use + * the acting set for data mapping purposes, but some users will + * also find the up set useful for things like deciding what to + * set as pg_temp. + * Each of these pointers must be non-NULL. + */ + void pg_to_up_acting_osds(pg_t pg, vector *up, int *up_primary, + vector *acting, int *acting_primary) const { + _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary); + } + void pg_to_up_acting_osds(pg_t pg, vector& up, vector& acting) const { + int up_primary, acting_primary; + pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary); + } + bool pg_is_ec(pg_t pg) const { + auto i = pools.find(pg.pool()); + assert(i != pools.end()); + return i->second.ec_pool(); + } + bool get_primary_shard(const pg_t& pgid, spg_t *out) const { + auto i = get_pools().find(pgid.pool()); + if (i == get_pools().end()) { + return false; + } + if (!i->second.ec_pool()) { + *out = spg_t(pgid); + return true; + } + int primary; + vector acting; + pg_to_acting_osds(pgid, &acting, &primary); + for (uint8_t i = 0; i < acting.size(); ++i) { + if (acting[i] == primary) { + *out = spg_t(pgid, shard_id_t(i)); + return true; + } + } + return false; + } + + int64_t lookup_pg_pool_name(const string& name) const { + auto p = name_pool.find(name); + if (p == name_pool.end()) + return -ENOENT; + return p->second; + } + + int64_t get_pool_max() const { + return pool_max; + } + const mempool::osdmap::map& get_pools() const { + return pools; + } + mempool::osdmap::map& get_pools() { + return pools; + } + void get_pool_ids_by_rule(int rule_id, set *pool_ids) const { + assert(pool_ids); + for (auto &p: pools) { + if ((int)p.second.get_crush_rule() == rule_id) { + pool_ids->insert(p.first); + } + } + } + void get_pool_ids_by_osd(CephContext *cct, + int osd, + set *pool_ids) const; + const string& get_pool_name(int64_t p) const { + auto i = pool_name.find(p); + assert(i != pool_name.end()); + return i->second; + } + const mempool::osdmap::map& get_pool_names() const { + return pool_name; + } + bool have_pg_pool(int64_t p) const { + return pools.count(p); + } + const pg_pool_t* get_pg_pool(int64_t p) const { + auto i = pools.find(p); + if (i != pools.end()) + return &i->second; + return NULL; + } + unsigned get_pg_size(pg_t pg) const { + auto p = pools.find(pg.pool()); + assert(p != pools.end()); + return p->second.get_size(); + } + int get_pg_type(pg_t pg) const { + auto p = pools.find(pg.pool()); + assert(p != pools.end()); + return p->second.get_type(); + } + + + pg_t raw_pg_to_pg(pg_t pg) const { + auto p = pools.find(pg.pool()); + assert(p != pools.end()); + return p->second.raw_pg_to_pg(pg); + } + + // pg -> acting primary osd + int get_pg_acting_primary(pg_t pg) const { + int primary = -1; + _pg_to_up_acting_osds(pg, nullptr, nullptr, nullptr, &primary); + return primary; + } + + /* + * check whether an spg_t maps to a particular osd + */ + bool is_up_acting_osd_shard(spg_t pg, int osd) const { + vector up, acting; + _pg_to_up_acting_osds(pg.pgid, &up, NULL, &acting, NULL, false); + if (pg.shard == shard_id_t::NO_SHARD) { + if (calc_pg_role(osd, acting, acting.size()) >= 0 || + calc_pg_role(osd, up, up.size()) >= 0) + return true; + } else { + if (pg.shard < (int)acting.size() && acting[pg.shard] == osd) + return true; + if (pg.shard < (int)up.size() && up[pg.shard] == osd) + return true; + } + return false; + } + + + /* what replica # is a given osd? 0 primary, -1 for none. */ + static int calc_pg_rank(int osd, const vector& acting, int nrep=0); + static int calc_pg_role(int osd, const vector& acting, int nrep=0); + static bool primary_changed( + int oldprimary, + const vector &oldacting, + int newprimary, + const vector &newacting); + + /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */ + int get_pg_acting_rank(pg_t pg, int osd) const { + vector group; + pg_to_acting_osds(pg, group); + return calc_pg_rank(osd, group, group.size()); + } + /* role is -1 (stray), 0 (primary), 1 (replica) */ + int get_pg_acting_role(const pg_t& pg, int osd) const { + vector group; + pg_to_acting_osds(pg, group); + return calc_pg_role(osd, group, group.size()); + } + + bool osd_is_valid_op_target(pg_t pg, int osd) const { + int primary; + vector group; + pg_to_acting_osds(pg, &group, &primary); + if (osd == primary) + return true; + if (pg_is_ec(pg)) + return false; + + return calc_pg_role(osd, group, group.size()) >= 0; + } + + int clean_pg_upmaps( + CephContext *cct, + Incremental *pending_inc); + + bool try_pg_upmap( + CephContext *cct, + pg_t pg, ///< pg to potentially remap + const set& overfull, ///< osds we'd want to evacuate + const vector& underfull, ///< osds to move to, in order of preference + vector *orig, + vector *out); ///< resulting alternative mapping + + int calc_pg_upmaps( + CephContext *cct, + float max_deviation, ///< max deviation from target (value < 1.0) + int max_iterations, ///< max iterations to run + const set& pools, ///< [optional] restrict to pool + Incremental *pending_inc + ); + + int get_osds_by_bucket_name(const string &name, set *osds) const; + + /* + * handy helpers to build simple maps... + */ + /** + * Build an OSD map suitable for basic usage. If **num_osd** is >= 0 + * it will be initialized with the specified number of OSDs in a + * single host. If **num_osd** is < 0 the layout of the OSD map will + * be built by reading the content of the configuration file. + * + * @param cct [in] in core ceph context + * @param e [in] initial epoch + * @param fsid [in] id of the cluster + * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0 + * @return **0** on success, negative errno on error. + */ +private: + int build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid, + int num_osd, int pg_bits, int pgp_bits, + bool default_pool); +public: + int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid, + int num_osd) { + return build_simple_optioned(cct, e, fsid, num_osd, 0, 0, false); + } + int build_simple_with_pool(CephContext *cct, epoch_t e, uuid_d &fsid, + int num_osd, int pg_bits, int pgp_bits) { + return build_simple_optioned(cct, e, fsid, num_osd, + pg_bits, pgp_bits, true); + } + static int _build_crush_types(CrushWrapper& crush); + static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush, + int num_osd, ostream *ss); + static int build_simple_crush_map_from_conf(CephContext *cct, + CrushWrapper& crush, + ostream *ss); + static int build_simple_crush_rules( + CephContext *cct, CrushWrapper& crush, + const string& root, + ostream *ss); + + bool crush_rule_in_use(int rule_id) const; + + int validate_crush_rules(CrushWrapper *crush, ostream *ss) const; + + void clear_temp() { + pg_temp->clear(); + primary_temp->clear(); + } + +private: + void print_osd_line(int cur, ostream *out, Formatter *f) const; +public: + void print(ostream& out) const; + void print_pools(ostream& out) const; + void print_summary(Formatter *f, ostream& out, const string& prefix) const; + void print_oneline_summary(ostream& out) const; + + enum { + DUMP_IN = 1, // only 'in' osds + DUMP_OUT = 2, // only 'out' osds + DUMP_UP = 4, // only 'up' osds + DUMP_DOWN = 8, // only 'down' osds + DUMP_DESTROYED = 16, // only 'destroyed' osds + }; + void print_tree(Formatter *f, ostream *out, unsigned dump_flags=0) const; + + int summarize_mapping_stats( + OSDMap *newmap, + const set *pools, + std::string *out, + Formatter *f) const; + + string get_flag_string() const; + static string get_flag_string(unsigned flags); + static void dump_erasure_code_profiles( + const mempool::osdmap::map > &profiles, + Formatter *f); + void dump(Formatter *f) const; + static void generate_test_instances(list& o); + bool check_new_blacklist_entries() const { return new_blacklist_entries; } + + void check_health(health_check_map_t *checks) const; + + int parse_osd_id_list(const vector& ls, + set *out, + ostream *ss) const; +}; +WRITE_CLASS_ENCODER_FEATURES(OSDMap) +WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental) + +typedef ceph::shared_ptr OSDMapRef; + +inline ostream& operator<<(ostream& out, const OSDMap& m) { + m.print_oneline_summary(out); + return out; +} + +class PGStatService; + +void print_osd_utilization(const OSDMap& osdmap, + const PGStatService *pgstat, + ostream& out, + Formatter *f, + bool tree); + +#endif