X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=src%2Fceph%2Fsrc%2Fosd%2FOSDMap.h;fp=src%2Fceph%2Fsrc%2Fosd%2FOSDMap.h;h=0000000000000000000000000000000000000000;hb=7da45d65be36d36b880cc55c5036e96c24b53f00;hp=6ba56511823d1390ad4aa5a4bfcd7b8cfcaabe40;hpb=691462d09d0987b47e112d6ee8740375df3c51b2;p=stor4nfv.git diff --git a/src/ceph/src/osd/OSDMap.h b/src/ceph/src/osd/OSDMap.h deleted file mode 100644 index 6ba5651..0000000 --- a/src/ceph/src/osd/OSDMap.h +++ /dev/null @@ -1,1410 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * Copyright (C) 2013,2014 Cloudwatt - * - * Author: Loic Dachary - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - -#ifndef CEPH_OSDMAP_H -#define CEPH_OSDMAP_H - -#include "include/cpp-btree/btree_map.h" - -/* - * describe properties of the OSD cluster. - * disks, disk groups, total # osds, - * - */ -#include "include/types.h" -#include "osd_types.h" - -//#include "include/ceph_features.h" -#include "crush/CrushWrapper.h" -#include -#include -#include -#include -#include "include/memory.h" -using namespace std; - -// forward declaration -class CephContext; -class CrushWrapper; -class health_check_map_t; - -// FIXME C++11 does not have std::equal for two differently-typed containers. -// use this until we move to c++14 -template -bool vectors_equal(A a, B b) -{ - return - a.size() == b.size() && - (a.empty() || - memcmp((char*)&a[0], (char*)&b[0], sizeof(a[0]) * a.size()) == 0); -} - - -/* - * we track up to two intervals during which the osd was alive and - * healthy. the most recent is [up_from,up_thru), where up_thru is - * the last epoch the osd is known to have _started_. i.e., a lower - * bound on the actual osd death. down_at (if it is > up_from) is an - * upper bound on the actual osd death. - * - * the second is the last_clean interval [first,last]. in that case, - * the last interval is the last epoch known to have been either - * _finished_, or during which the osd cleanly shut down. when - * possible, we push this forward to the epoch the osd was eventually - * marked down. - * - * the lost_at is used to allow build_prior to proceed without waiting - * for an osd to recover. In certain cases, progress may be blocked - * because an osd is down that may contain updates (i.e., a pg may have - * gone rw during an interval). If the osd can't be brought online, we - * can force things to proceed knowing that we _might_ be losing some - * acked writes. If the osd comes back to life later, that's fine to, - * but those writes will still be lost (the divergent objects will be - * thrown out). - */ -struct osd_info_t { - epoch_t last_clean_begin; // last interval that ended with a clean osd shutdown - epoch_t last_clean_end; - epoch_t up_from; // epoch osd marked up - epoch_t up_thru; // lower bound on actual osd death (if > up_from) - epoch_t down_at; // upper bound on actual osd death (if > up_from) - epoch_t lost_at; // last epoch we decided data was "lost" - - osd_info_t() : last_clean_begin(0), last_clean_end(0), - up_from(0), up_thru(0), down_at(0), lost_at(0) {} - - void dump(Formatter *f) const; - void encode(bufferlist& bl) const; - void decode(bufferlist::iterator& bl); - static void generate_test_instances(list& o); -}; -WRITE_CLASS_ENCODER(osd_info_t) - -ostream& operator<<(ostream& out, const osd_info_t& info); - -struct osd_xinfo_t { - utime_t down_stamp; ///< timestamp when we were last marked down - float laggy_probability; ///< encoded as __u32: 0 = definitely not laggy, 0xffffffff definitely laggy - __u32 laggy_interval; ///< average interval between being marked laggy and recovering - uint64_t features; ///< features supported by this osd we should know about - __u32 old_weight; ///< weight prior to being auto marked out - - osd_xinfo_t() : laggy_probability(0), laggy_interval(0), - features(0), old_weight(0) {} - - void dump(Formatter *f) const; - void encode(bufferlist& bl) const; - void decode(bufferlist::iterator& bl); - static void generate_test_instances(list& o); -}; -WRITE_CLASS_ENCODER(osd_xinfo_t) - -ostream& operator<<(ostream& out, const osd_xinfo_t& xi); - - -struct PGTempMap { -#if 1 - bufferlist data; - typedef btree::btree_map map_t; - map_t map; - - void encode(bufferlist& bl) const { - uint32_t n = map.size(); - ::encode(n, bl); - for (auto &p : map) { - ::encode(p.first, bl); - bl.append((char*)p.second, (*p.second + 1) * sizeof(int32_t)); - } - } - void decode(bufferlist::iterator& p) { - data.clear(); - map.clear(); - uint32_t n; - ::decode(n, p); - if (!n) - return; - bufferlist::iterator pstart = p; - size_t start_off = pstart.get_off(); - vector> offsets; - offsets.resize(n); - for (unsigned i=0; i 1) { - data.rebuild(); - } - //map.reserve(n); - char *start = data.c_str(); - for (auto i : offsets) { - map.insert(map.end(), make_pair(i.first, (int32_t*)(start + i.second))); - } - } - void rebuild() { - bufferlist bl; - encode(bl); - auto p = bl.begin(); - decode(p); - } - friend bool operator==(const PGTempMap& l, const PGTempMap& r) { - return - l.map.size() == r.map.size() && - l.data.contents_equal(r.data); - } - - class iterator { - map_t::const_iterator it; - map_t::const_iterator end; - pair> current; - void init_current() { - if (it != end) { - current.first = it->first; - assert(it->second); - current.second.resize(*it->second); - int32_t *p = it->second + 1; - for (int n = 0; n < *it->second; ++n, ++p) { - current.second[n] = *p; - } - } - } - public: - iterator(map_t::const_iterator p, - map_t::const_iterator e) - : it(p), end(e) { - init_current(); - } - - const pair>& operator*() const { - return current; - } - const pair>* operator->() const { - return ¤t; - } - friend bool operator==(const iterator& l, const iterator& r) { - return l.it == r.it; - } - friend bool operator!=(const iterator& l, const iterator& r) { - return l.it != r.it; - } - iterator& operator++() { - ++it; - if (it != end) - init_current(); - return *this; - } - iterator operator++(int) { - iterator r = *this; - ++it; - if (it != end) - init_current(); - return r; - } - }; - iterator begin() const { - return iterator(map.begin(), map.end()); - } - iterator end() const { - return iterator(map.end(), map.end()); - } - iterator find(pg_t pgid) const { - return iterator(map.find(pgid), map.end()); - } - size_t size() const { - return map.size(); - } - size_t count(pg_t pgid) const { - return map.count(pgid); - } - void erase(pg_t pgid) { - map.erase(pgid); - } - void clear() { - map.clear(); - data.clear(); - } - void set(pg_t pgid, const mempool::osdmap::vector& v) { - size_t need = sizeof(int32_t) * (1 + v.size()); - if (need < data.get_append_buffer_unused_tail_length()) { - bufferptr z(data.get_append_buffer_unused_tail_length()); - z.zero(); - data.append(z.c_str(), z.length()); - } - ::encode(v, data); - map[pgid] = (int32_t*)(data.back().end_c_str()) - (1 + v.size()); - } - mempool::osdmap::vector get(pg_t pgid) { - mempool::osdmap::vector v; - int32_t *p = map[pgid]; - size_t n = *p++; - v.resize(n); - for (size_t i = 0; i < n; ++i, ++p) { - v[i] = *p; - } - return v; - } -#else - // trivial implementation - mempool::osdmap::map > pg_temp; - - void encode(bufferlist& bl) const { - ::encode(pg_temp, bl); - } - void decode(bufferlist::iterator& p) { - ::decode(pg_temp, p); - } - friend bool operator==(const PGTempMap& l, const PGTempMap& r) { - return - l.pg_temp.size() == r.pg_temp.size() && - l.pg_temp == r.pg_temp; - } - - class iterator { - mempool::osdmap::map >::const_iterator it; - public: - iterator(mempool::osdmap::map >::const_iterator p) - : it(p) {} - - pair&> operator*() const { - return *it; - } - const pair>* operator->() const { - return &*it; - } - friend bool operator==(const iterator& l, const iterator& r) { - return l.it == r.it; - } - friend bool operator!=(const iterator& l, const iterator& r) { - return l.it != r.it; - } - iterator& operator++() { - ++it; - return *this; - } - iterator operator++(int) { - iterator r = *this; - ++it; - return r; - } - }; - iterator begin() const { - return iterator(pg_temp.cbegin()); - } - iterator end() const { - return iterator(pg_temp.cend()); - } - iterator find(pg_t pgid) const { - return iterator(pg_temp.find(pgid)); - } - size_t size() const { - return pg_temp.size(); - } - size_t count(pg_t pgid) const { - return pg_temp.count(pgid); - } - void erase(pg_t pgid) { - pg_temp.erase(pgid); - } - void clear() { - pg_temp.clear(); - } - void set(pg_t pgid, const mempool::osdmap::vector& v) { - pg_temp[pgid] = v; - } - const mempool::osdmap::vector& get(pg_t pgid) { - return pg_temp.at(pgid); - } -#endif - void dump(Formatter *f) const { - for (const auto &pg : *this) { - f->open_object_section("osds"); - f->dump_stream("pgid") << pg.first; - f->open_array_section("osds"); - for (const auto osd : pg.second) - f->dump_int("osd", osd); - f->close_section(); - f->close_section(); - } - } -}; -WRITE_CLASS_ENCODER(PGTempMap) - -/** OSDMap - */ -class OSDMap { -public: - MEMPOOL_CLASS_HELPERS(); - - class Incremental { - public: - MEMPOOL_CLASS_HELPERS(); - - /// feature bits we were encoded with. the subsequent OSDMap - /// encoding should match. - uint64_t encode_features; - uuid_d fsid; - epoch_t epoch; // new epoch; we are a diff from epoch-1 to epoch - utime_t modified; - int64_t new_pool_max; //incremented by the OSDMonitor on each pool create - int32_t new_flags; - int8_t new_require_osd_release = -1; - - // full (rare) - bufferlist fullmap; // in lieu of below. - bufferlist crush; - - // incremental - int32_t new_max_osd; - mempool::osdmap::map new_pools; - mempool::osdmap::map new_pool_names; - mempool::osdmap::set old_pools; - mempool::osdmap::map > new_erasure_code_profiles; - mempool::osdmap::vector old_erasure_code_profiles; - mempool::osdmap::map new_up_client; - mempool::osdmap::map new_up_cluster; - mempool::osdmap::map new_state; // XORed onto previous state. - mempool::osdmap::map new_weight; - mempool::osdmap::map > new_pg_temp; // [] to remove - mempool::osdmap::map new_primary_temp; // [-1] to remove - mempool::osdmap::map new_primary_affinity; - mempool::osdmap::map new_up_thru; - mempool::osdmap::map > new_last_clean_interval; - mempool::osdmap::map new_lost; - mempool::osdmap::map new_uuid; - mempool::osdmap::map new_xinfo; - - mempool::osdmap::map new_blacklist; - mempool::osdmap::vector old_blacklist; - mempool::osdmap::map new_hb_back_up; - mempool::osdmap::map new_hb_front_up; - - mempool::osdmap::map> new_pg_upmap; - mempool::osdmap::map>> new_pg_upmap_items; - mempool::osdmap::set old_pg_upmap, old_pg_upmap_items; - - string cluster_snapshot; - - float new_nearfull_ratio = -1; - float new_backfillfull_ratio = -1; - float new_full_ratio = -1; - - int8_t new_require_min_compat_client = -1; - - mutable bool have_crc; ///< crc values are defined - uint32_t full_crc; ///< crc of the resulting OSDMap - mutable uint32_t inc_crc; ///< crc of this incremental - - int get_net_marked_out(const OSDMap *previous) const; - int get_net_marked_down(const OSDMap *previous) const; - int identify_osd(uuid_d u) const; - - void encode_client_old(bufferlist& bl) const; - void encode_classic(bufferlist& bl, uint64_t features) const; - void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const; - void decode_classic(bufferlist::iterator &p); - void decode(bufferlist::iterator &bl); - void dump(Formatter *f) const; - static void generate_test_instances(list& o); - - explicit Incremental(epoch_t e=0) : - encode_features(0), - epoch(e), new_pool_max(-1), new_flags(-1), new_max_osd(-1), - have_crc(false), full_crc(0), inc_crc(0) { - memset(&fsid, 0, sizeof(fsid)); - } - explicit Incremental(bufferlist &bl) { - bufferlist::iterator p = bl.begin(); - decode(p); - } - explicit Incremental(bufferlist::iterator &p) { - decode(p); - } - - pg_pool_t *get_new_pool(int64_t pool, const pg_pool_t *orig) { - if (new_pools.count(pool) == 0) - new_pools[pool] = *orig; - return &new_pools[pool]; - } - bool has_erasure_code_profile(const string &name) const { - auto i = new_erasure_code_profiles.find(name); - return i != new_erasure_code_profiles.end(); - } - void set_erasure_code_profile(const string &name, - const map& profile) { - new_erasure_code_profiles[name] = profile; - } - - /// propage update pools' snap metadata to any of their tiers - int propagate_snaps_to_tiers(CephContext *cct, const OSDMap &base); - - /// filter out osds with any pending state changing - size_t get_pending_state_osds(vector *osds) { - assert(osds); - osds->clear(); - - for (auto &p : new_state) { - osds->push_back(p.first); - } - - return osds->size(); - } - - bool pending_osd_has_state(int osd, unsigned state) { - return new_state.count(osd) && (new_state[osd] & state) != 0; - } - - void pending_osd_state_set(int osd, unsigned state) { - new_state[osd] |= state; - } - - // cancel the specified pending osd state if there is any - // return ture on success, false otherwise. - bool pending_osd_state_clear(int osd, unsigned state) { - if (!pending_osd_has_state(osd, state)) { - // never has been set or already has been cancelled. - return false; - } - - new_state[osd] &= ~state; - return true; - } - - }; - -private: - uuid_d fsid; - epoch_t epoch; // what epoch of the osd cluster descriptor is this - utime_t created, modified; // epoch start time - int32_t pool_max; // the largest pool num, ever - - uint32_t flags; - - int num_osd; // not saved; see calc_num_osds - int num_up_osd; // not saved; see calc_num_osds - int num_in_osd; // not saved; see calc_num_osds - - int32_t max_osd; - vector osd_state; - - struct addrs_s { - mempool::osdmap::vector > client_addr; - mempool::osdmap::vector > cluster_addr; - mempool::osdmap::vector > hb_back_addr; - mempool::osdmap::vector > hb_front_addr; - entity_addr_t blank; - }; - ceph::shared_ptr osd_addrs; - - mempool::osdmap::vector<__u32> osd_weight; // 16.16 fixed point, 0x10000 = "in", 0 = "out" - mempool::osdmap::vector osd_info; - ceph::shared_ptr pg_temp; // temp pg mapping (e.g. while we rebuild) - ceph::shared_ptr< mempool::osdmap::map > primary_temp; // temp primary mapping (e.g. while we rebuild) - ceph::shared_ptr< mempool::osdmap::vector<__u32> > osd_primary_affinity; ///< 16.16 fixed point, 0x10000 = baseline - - // remap (post-CRUSH, pre-up) - mempool::osdmap::map> pg_upmap; ///< remap pg - mempool::osdmap::map>> pg_upmap_items; ///< remap osds in up set - - mempool::osdmap::map pools; - mempool::osdmap::map pool_name; - mempool::osdmap::map > erasure_code_profiles; - mempool::osdmap::map name_pool; - - ceph::shared_ptr< mempool::osdmap::vector > osd_uuid; - mempool::osdmap::vector osd_xinfo; - - mempool::osdmap::unordered_map blacklist; - - epoch_t cluster_snapshot_epoch; - string cluster_snapshot; - bool new_blacklist_entries; - - float full_ratio = 0, backfillfull_ratio = 0, nearfull_ratio = 0; - - /// min compat client we want to support - uint8_t require_min_compat_client = 0; // CEPH_RELEASE_* - -public: - /// require osds to run at least this release - uint8_t require_osd_release = 0; // CEPH_RELEASE_* - -private: - mutable uint64_t cached_up_osd_features; - - mutable bool crc_defined; - mutable uint32_t crc; - - void _calc_up_osd_features(); - - public: - bool have_crc() const { return crc_defined; } - uint32_t get_crc() const { return crc; } - - ceph::shared_ptr crush; // hierarchical map -private: - uint32_t crush_version = 1; - - friend class OSDMonitor; - - public: - OSDMap() : epoch(0), - pool_max(0), - flags(0), - num_osd(0), num_up_osd(0), num_in_osd(0), - max_osd(0), - osd_addrs(std::make_shared()), - pg_temp(std::make_shared()), - primary_temp(std::make_shared>()), - osd_uuid(std::make_shared>()), - cluster_snapshot_epoch(0), - new_blacklist_entries(false), - cached_up_osd_features(0), - crc_defined(false), crc(0), - crush(std::make_shared()) { - memset(&fsid, 0, sizeof(fsid)); - } - - // no copying -private: - OSDMap(const OSDMap& other) = default; - OSDMap& operator=(const OSDMap& other) = default; -public: - - void deepish_copy_from(const OSDMap& o) { - *this = o; - primary_temp.reset(new mempool::osdmap::map(*o.primary_temp)); - pg_temp.reset(new PGTempMap(*o.pg_temp)); - osd_uuid.reset(new mempool::osdmap::vector(*o.osd_uuid)); - - if (o.osd_primary_affinity) - osd_primary_affinity.reset(new mempool::osdmap::vector<__u32>(*o.osd_primary_affinity)); - - // NOTE: this still references shared entity_addr_t's. - osd_addrs.reset(new addrs_s(*o.osd_addrs)); - - // NOTE: we do not copy crush. note that apply_incremental will - // allocate a new CrushWrapper, though. - } - - // map info - const uuid_d& get_fsid() const { return fsid; } - void set_fsid(uuid_d& f) { fsid = f; } - - epoch_t get_epoch() const { return epoch; } - void inc_epoch() { epoch++; } - - void set_epoch(epoch_t e); - - uint32_t get_crush_version() const { - return crush_version; - } - - /* stamps etc */ - const utime_t& get_created() const { return created; } - const utime_t& get_modified() const { return modified; } - - bool is_blacklisted(const entity_addr_t& a) const; - void get_blacklist(list > *bl) const; - void get_blacklist(std::set *bl) const; - - string get_cluster_snapshot() const { - if (cluster_snapshot_epoch == epoch) - return cluster_snapshot; - return string(); - } - - float get_full_ratio() const { - return full_ratio; - } - float get_backfillfull_ratio() const { - return backfillfull_ratio; - } - float get_nearfull_ratio() const { - return nearfull_ratio; - } - void get_full_osd_util( - const mempool::pgmap::unordered_map &osd_stat, - map *full, - map *backfill, - map *nearfull) const; - void get_full_pools(CephContext *cct, - set *full, - set *backfillfull, - set *nearfull) const; - void get_full_osd_counts(set *full, set *backfill, - set *nearfull) const; - - - /***** cluster state *****/ - /* osds */ - int get_max_osd() const { return max_osd; } - void set_max_osd(int m); - - unsigned get_num_osds() const { - return num_osd; - } - unsigned get_num_up_osds() const { - return num_up_osd; - } - unsigned get_num_in_osds() const { - return num_in_osd; - } - /// recalculate cached values for get_num{,_up,_in}_osds - int calc_num_osds(); - - void get_all_osds(set& ls) const; - void get_up_osds(set& ls) const; - void get_out_osds(set& ls) const; - unsigned get_num_pg_temp() const { - return pg_temp->size(); - } - - int get_flags() const { return flags; } - bool test_flag(int f) const { return flags & f; } - void set_flag(int f) { flags |= f; } - void clear_flag(int f) { flags &= ~f; } - - static void calc_state_set(int state, set& st); - - int get_state(int o) const { - assert(o < max_osd); - return osd_state[o]; - } - int get_state(int o, set& st) const { - assert(o < max_osd); - unsigned t = osd_state[o]; - calc_state_set(t, st); - return osd_state[o]; - } - void set_state(int o, unsigned s) { - assert(o < max_osd); - osd_state[o] = s; - } - void set_weight(int o, unsigned w) { - assert(o < max_osd); - osd_weight[o] = w; - if (w) - osd_state[o] |= CEPH_OSD_EXISTS; - } - unsigned get_weight(int o) const { - assert(o < max_osd); - return osd_weight[o]; - } - float get_weightf(int o) const { - return (float)get_weight(o) / (float)CEPH_OSD_IN; - } - void adjust_osd_weights(const map& weights, Incremental& inc) const; - - void set_primary_affinity(int o, int w) { - assert(o < max_osd); - if (!osd_primary_affinity) - osd_primary_affinity.reset( - new mempool::osdmap::vector<__u32>( - max_osd, CEPH_OSD_DEFAULT_PRIMARY_AFFINITY)); - (*osd_primary_affinity)[o] = w; - } - unsigned get_primary_affinity(int o) const { - assert(o < max_osd); - if (!osd_primary_affinity) - return CEPH_OSD_DEFAULT_PRIMARY_AFFINITY; - return (*osd_primary_affinity)[o]; - } - float get_primary_affinityf(int o) const { - return (float)get_primary_affinity(o) / (float)CEPH_OSD_MAX_PRIMARY_AFFINITY; - } - - bool has_erasure_code_profile(const string &name) const { - auto i = erasure_code_profiles.find(name); - return i != erasure_code_profiles.end(); - } - int get_erasure_code_profile_default(CephContext *cct, - map &profile_map, - ostream *ss); - void set_erasure_code_profile(const string &name, - const map& profile) { - erasure_code_profiles[name] = profile; - } - const map &get_erasure_code_profile( - const string &name) const { - static map empty; - auto i = erasure_code_profiles.find(name); - if (i == erasure_code_profiles.end()) - return empty; - else - return i->second; - } - const mempool::osdmap::map > &get_erasure_code_profiles() const { - return erasure_code_profiles; - } - - bool exists(int osd) const { - //assert(osd >= 0); - return osd >= 0 && osd < max_osd && (osd_state[osd] & CEPH_OSD_EXISTS); - } - - bool is_destroyed(int osd) const { - return exists(osd) && (osd_state[osd] & CEPH_OSD_DESTROYED); - } - - bool is_up(int osd) const { - return exists(osd) && (osd_state[osd] & CEPH_OSD_UP); - } - - bool has_been_up_since(int osd, epoch_t epoch) const { - return is_up(osd) && get_up_from(osd) <= epoch; - } - - bool is_down(int osd) const { - return !is_up(osd); - } - - bool is_out(int osd) const { - return !exists(osd) || get_weight(osd) == CEPH_OSD_OUT; - } - - bool is_in(int osd) const { - return !is_out(osd); - } - - bool is_noup(int osd) const { - return exists(osd) && (osd_state[osd] & CEPH_OSD_NOUP); - } - - bool is_nodown(int osd) const { - return exists(osd) && (osd_state[osd] & CEPH_OSD_NODOWN); - } - - bool is_noin(int osd) const { - return exists(osd) && (osd_state[osd] & CEPH_OSD_NOIN); - } - - bool is_noout(int osd) const { - return exists(osd) && (osd_state[osd] & CEPH_OSD_NOOUT); - } - - void get_noup_osds(vector *osds) const { - assert(osds); - osds->clear(); - - for (int i = 0; i < max_osd; i++) { - if (is_noup(i)) { - osds->push_back(i); - } - } - } - - void get_nodown_osds(vector *osds) const { - assert(osds); - osds->clear(); - - for (int i = 0; i < max_osd; i++) { - if (is_nodown(i)) { - osds->push_back(i); - } - } - } - - void get_noin_osds(vector *osds) const { - assert(osds); - osds->clear(); - - for (int i = 0; i < max_osd; i++) { - if (is_noin(i)) { - osds->push_back(i); - } - } - } - - void get_noout_osds(vector *osds) const { - assert(osds); - osds->clear(); - - for (int i = 0; i < max_osd; i++) { - if (is_noout(i)) { - osds->push_back(i); - } - } - } - - /** - * check if an entire crush subtree is down - */ - bool subtree_is_down(int id, set *down_cache) const; - bool containing_subtree_is_down(CephContext *cct, int osd, int subtree_type, set *down_cache) const; - - bool subtree_type_is_down(CephContext *cct, int id, int subtree_type, set *down_in_osds, set *up_in_osds, - set *subtree_up, unordered_map > *subtree_type_down) const; - - int identify_osd(const entity_addr_t& addr) const; - int identify_osd(const uuid_d& u) const; - int identify_osd_on_all_channels(const entity_addr_t& addr) const; - - bool have_addr(const entity_addr_t& addr) const { - return identify_osd(addr) >= 0; - } - int find_osd_on_ip(const entity_addr_t& ip) const; - const entity_addr_t &get_addr(int osd) const { - assert(exists(osd)); - return osd_addrs->client_addr[osd] ? *osd_addrs->client_addr[osd] : osd_addrs->blank; - } - const entity_addr_t &get_cluster_addr(int osd) const { - assert(exists(osd)); - if (!osd_addrs->cluster_addr[osd] || *osd_addrs->cluster_addr[osd] == entity_addr_t()) - return get_addr(osd); - return *osd_addrs->cluster_addr[osd]; - } - const entity_addr_t &get_hb_back_addr(int osd) const { - assert(exists(osd)); - return osd_addrs->hb_back_addr[osd] ? *osd_addrs->hb_back_addr[osd] : osd_addrs->blank; - } - const entity_addr_t &get_hb_front_addr(int osd) const { - assert(exists(osd)); - return osd_addrs->hb_front_addr[osd] ? *osd_addrs->hb_front_addr[osd] : osd_addrs->blank; - } - entity_inst_t get_most_recent_inst(int osd) const { - assert(exists(osd)); - return entity_inst_t(entity_name_t::OSD(osd), get_addr(osd)); - } - entity_inst_t get_inst(int osd) const { - assert(is_up(osd)); - return get_most_recent_inst(osd); - } - entity_inst_t get_cluster_inst(int osd) const { - assert(is_up(osd)); - return entity_inst_t(entity_name_t::OSD(osd), get_cluster_addr(osd)); - } - entity_inst_t get_hb_back_inst(int osd) const { - assert(is_up(osd)); - return entity_inst_t(entity_name_t::OSD(osd), get_hb_back_addr(osd)); - } - entity_inst_t get_hb_front_inst(int osd) const { - assert(is_up(osd)); - return entity_inst_t(entity_name_t::OSD(osd), get_hb_front_addr(osd)); - } - - const uuid_d& get_uuid(int osd) const { - assert(exists(osd)); - return (*osd_uuid)[osd]; - } - - const epoch_t& get_up_from(int osd) const { - assert(exists(osd)); - return osd_info[osd].up_from; - } - const epoch_t& get_up_thru(int osd) const { - assert(exists(osd)); - return osd_info[osd].up_thru; - } - const epoch_t& get_down_at(int osd) const { - assert(exists(osd)); - return osd_info[osd].down_at; - } - const osd_info_t& get_info(int osd) const { - assert(osd < max_osd); - return osd_info[osd]; - } - - const osd_xinfo_t& get_xinfo(int osd) const { - assert(osd < max_osd); - return osd_xinfo[osd]; - } - - int get_next_up_osd_after(int n) const { - if (get_max_osd() == 0) - return -1; - for (int i = n + 1; i != n; ++i) { - if (i >= get_max_osd()) - i = 0; - if (i == n) - break; - if (is_up(i)) - return i; - } - return -1; - } - - int get_previous_up_osd_before(int n) const { - if (get_max_osd() == 0) - return -1; - for (int i = n - 1; i != n; --i) { - if (i < 0) - i = get_max_osd() - 1; - if (i == n) - break; - if (is_up(i)) - return i; - } - return -1; - } - - /** - * get feature bits required by the current structure - * - * @param entity_type [in] what entity type we are asking about - * @param mask [out] set of all possible map-related features we could set - * @return feature bits used by this map - */ - uint64_t get_features(int entity_type, uint64_t *mask) const; - - /** - * get oldest *client* version (firefly, hammer, etc.) that can connect given - * the feature bits required (according to get_features()). - */ - uint8_t get_min_compat_client() const; - - /** - * get intersection of features supported by up osds - */ - uint64_t get_up_osd_features() const; - - int apply_incremental(const Incremental &inc); - - /// try to re-use/reference addrs in oldmap from newmap - static void dedup(const OSDMap *oldmap, OSDMap *newmap); - - static void clean_temps(CephContext *cct, const OSDMap& osdmap, - Incremental *pending_inc); - - // serialize, unserialize -private: - void encode_client_old(bufferlist& bl) const; - void encode_classic(bufferlist& bl, uint64_t features) const; - void decode_classic(bufferlist::iterator& p); - void post_decode(); -public: - void encode(bufferlist& bl, uint64_t features=CEPH_FEATURES_ALL) const; - void decode(bufferlist& bl); - void decode(bufferlist::iterator& bl); - - - /**** mapping facilities ****/ - int map_to_pg( - int64_t pool, - const string& name, - const string& key, - const string& nspace, - pg_t *pg) const; - int object_locator_to_pg(const object_t& oid, const object_locator_t& loc, - pg_t &pg) const; - pg_t object_locator_to_pg(const object_t& oid, - const object_locator_t& loc) const { - pg_t pg; - int ret = object_locator_to_pg(oid, loc, pg); - assert(ret == 0); - return pg; - } - - - static object_locator_t file_to_object_locator(const file_layout_t& layout) { - return object_locator_t(layout.pool_id, layout.pool_ns); - } - - ceph_object_layout file_to_object_layout(object_t oid, - file_layout_t& layout) const { - return make_object_layout(oid, layout.pool_id, layout.pool_ns); - } - - ceph_object_layout make_object_layout(object_t oid, int pg_pool, - string nspace) const; - - int get_pg_num(int pg_pool) const - { - const pg_pool_t *pool = get_pg_pool(pg_pool); - assert(NULL != pool); - return pool->get_pg_num(); - } - - bool pg_exists(pg_t pgid) const { - const pg_pool_t *p = get_pg_pool(pgid.pool()); - return p && pgid.ps() < p->get_pg_num(); - } - - int get_pg_pool_min_size(pg_t pgid) const { - if (!pg_exists(pgid)) { - return -ENOENT; - } - const pg_pool_t *p = get_pg_pool(pgid.pool()); - assert(p); - return p->get_min_size(); - } - - int get_pg_pool_size(pg_t pgid) const { - if (!pg_exists(pgid)) { - return -ENOENT; - } - const pg_pool_t *p = get_pg_pool(pgid.pool()); - assert(p); - return p->get_size(); - } - -private: - /// pg -> (raw osd list) - void _pg_to_raw_osds( - const pg_pool_t& pool, pg_t pg, - vector *osds, - ps_t *ppps) const; - int _pick_primary(const vector& osds) const; - void _remove_nonexistent_osds(const pg_pool_t& pool, vector& osds) const; - - void _apply_primary_affinity(ps_t seed, const pg_pool_t& pool, - vector *osds, int *primary) const; - - /// apply pg_upmap[_items] mappings - void _apply_upmap(const pg_pool_t& pi, pg_t pg, vector *raw) const; - - /// pg -> (up osd list) - void _raw_to_up_osds(const pg_pool_t& pool, const vector& raw, - vector *up) const; - - - /** - * Get the pg and primary temp, if they are specified. - * @param temp_pg [out] Will be empty or contain the temp PG mapping on return - * @param temp_primary [out] Will be the value in primary_temp, or a value derived - * from the pg_temp (if specified), or -1 if you should use the calculated (up_)primary. - */ - void _get_temp_osds(const pg_pool_t& pool, pg_t pg, - vector *temp_pg, int *temp_primary) const; - - /** - * map to up and acting. Fills in whatever fields are non-NULL. - */ - void _pg_to_up_acting_osds(const pg_t& pg, vector *up, int *up_primary, - vector *acting, int *acting_primary, - bool raw_pg_to_pg = true) const; - -public: - /*** - * This is suitable only for looking at raw CRUSH outputs. It skips - * applying the temp and up checks and should not be used - * by anybody for data mapping purposes. - * raw and primary must be non-NULL - */ - void pg_to_raw_osds(pg_t pg, vector *raw, int *primary) const; - /// map a pg to its acting set. @return acting set size - void pg_to_acting_osds(const pg_t& pg, vector *acting, - int *acting_primary) const { - _pg_to_up_acting_osds(pg, NULL, NULL, acting, acting_primary); - } - void pg_to_acting_osds(pg_t pg, vector& acting) const { - return pg_to_acting_osds(pg, &acting, NULL); - } - /** - * This does not apply temp overrides and should not be used - * by anybody for data mapping purposes. Specify both pointers. - */ - void pg_to_raw_up(pg_t pg, vector *up, int *primary) const; - /** - * map a pg to its acting set as well as its up set. You must use - * the acting set for data mapping purposes, but some users will - * also find the up set useful for things like deciding what to - * set as pg_temp. - * Each of these pointers must be non-NULL. - */ - void pg_to_up_acting_osds(pg_t pg, vector *up, int *up_primary, - vector *acting, int *acting_primary) const { - _pg_to_up_acting_osds(pg, up, up_primary, acting, acting_primary); - } - void pg_to_up_acting_osds(pg_t pg, vector& up, vector& acting) const { - int up_primary, acting_primary; - pg_to_up_acting_osds(pg, &up, &up_primary, &acting, &acting_primary); - } - bool pg_is_ec(pg_t pg) const { - auto i = pools.find(pg.pool()); - assert(i != pools.end()); - return i->second.ec_pool(); - } - bool get_primary_shard(const pg_t& pgid, spg_t *out) const { - auto i = get_pools().find(pgid.pool()); - if (i == get_pools().end()) { - return false; - } - if (!i->second.ec_pool()) { - *out = spg_t(pgid); - return true; - } - int primary; - vector acting; - pg_to_acting_osds(pgid, &acting, &primary); - for (uint8_t i = 0; i < acting.size(); ++i) { - if (acting[i] == primary) { - *out = spg_t(pgid, shard_id_t(i)); - return true; - } - } - return false; - } - - int64_t lookup_pg_pool_name(const string& name) const { - auto p = name_pool.find(name); - if (p == name_pool.end()) - return -ENOENT; - return p->second; - } - - int64_t get_pool_max() const { - return pool_max; - } - const mempool::osdmap::map& get_pools() const { - return pools; - } - mempool::osdmap::map& get_pools() { - return pools; - } - void get_pool_ids_by_rule(int rule_id, set *pool_ids) const { - assert(pool_ids); - for (auto &p: pools) { - if ((int)p.second.get_crush_rule() == rule_id) { - pool_ids->insert(p.first); - } - } - } - void get_pool_ids_by_osd(CephContext *cct, - int osd, - set *pool_ids) const; - const string& get_pool_name(int64_t p) const { - auto i = pool_name.find(p); - assert(i != pool_name.end()); - return i->second; - } - const mempool::osdmap::map& get_pool_names() const { - return pool_name; - } - bool have_pg_pool(int64_t p) const { - return pools.count(p); - } - const pg_pool_t* get_pg_pool(int64_t p) const { - auto i = pools.find(p); - if (i != pools.end()) - return &i->second; - return NULL; - } - unsigned get_pg_size(pg_t pg) const { - auto p = pools.find(pg.pool()); - assert(p != pools.end()); - return p->second.get_size(); - } - int get_pg_type(pg_t pg) const { - auto p = pools.find(pg.pool()); - assert(p != pools.end()); - return p->second.get_type(); - } - - - pg_t raw_pg_to_pg(pg_t pg) const { - auto p = pools.find(pg.pool()); - assert(p != pools.end()); - return p->second.raw_pg_to_pg(pg); - } - - // pg -> acting primary osd - int get_pg_acting_primary(pg_t pg) const { - int primary = -1; - _pg_to_up_acting_osds(pg, nullptr, nullptr, nullptr, &primary); - return primary; - } - - /* - * check whether an spg_t maps to a particular osd - */ - bool is_up_acting_osd_shard(spg_t pg, int osd) const { - vector up, acting; - _pg_to_up_acting_osds(pg.pgid, &up, NULL, &acting, NULL, false); - if (pg.shard == shard_id_t::NO_SHARD) { - if (calc_pg_role(osd, acting, acting.size()) >= 0 || - calc_pg_role(osd, up, up.size()) >= 0) - return true; - } else { - if (pg.shard < (int)acting.size() && acting[pg.shard] == osd) - return true; - if (pg.shard < (int)up.size() && up[pg.shard] == osd) - return true; - } - return false; - } - - - /* what replica # is a given osd? 0 primary, -1 for none. */ - static int calc_pg_rank(int osd, const vector& acting, int nrep=0); - static int calc_pg_role(int osd, const vector& acting, int nrep=0); - static bool primary_changed( - int oldprimary, - const vector &oldacting, - int newprimary, - const vector &newacting); - - /* rank is -1 (stray), 0 (primary), 1,2,3,... (replica) */ - int get_pg_acting_rank(pg_t pg, int osd) const { - vector group; - pg_to_acting_osds(pg, group); - return calc_pg_rank(osd, group, group.size()); - } - /* role is -1 (stray), 0 (primary), 1 (replica) */ - int get_pg_acting_role(const pg_t& pg, int osd) const { - vector group; - pg_to_acting_osds(pg, group); - return calc_pg_role(osd, group, group.size()); - } - - bool osd_is_valid_op_target(pg_t pg, int osd) const { - int primary; - vector group; - pg_to_acting_osds(pg, &group, &primary); - if (osd == primary) - return true; - if (pg_is_ec(pg)) - return false; - - return calc_pg_role(osd, group, group.size()) >= 0; - } - - int clean_pg_upmaps( - CephContext *cct, - Incremental *pending_inc); - - bool try_pg_upmap( - CephContext *cct, - pg_t pg, ///< pg to potentially remap - const set& overfull, ///< osds we'd want to evacuate - const vector& underfull, ///< osds to move to, in order of preference - vector *orig, - vector *out); ///< resulting alternative mapping - - int calc_pg_upmaps( - CephContext *cct, - float max_deviation, ///< max deviation from target (value < 1.0) - int max_iterations, ///< max iterations to run - const set& pools, ///< [optional] restrict to pool - Incremental *pending_inc - ); - - int get_osds_by_bucket_name(const string &name, set *osds) const; - - /* - * handy helpers to build simple maps... - */ - /** - * Build an OSD map suitable for basic usage. If **num_osd** is >= 0 - * it will be initialized with the specified number of OSDs in a - * single host. If **num_osd** is < 0 the layout of the OSD map will - * be built by reading the content of the configuration file. - * - * @param cct [in] in core ceph context - * @param e [in] initial epoch - * @param fsid [in] id of the cluster - * @param num_osd [in] number of OSDs if >= 0 or read from conf if < 0 - * @return **0** on success, negative errno on error. - */ -private: - int build_simple_optioned(CephContext *cct, epoch_t e, uuid_d &fsid, - int num_osd, int pg_bits, int pgp_bits, - bool default_pool); -public: - int build_simple(CephContext *cct, epoch_t e, uuid_d &fsid, - int num_osd) { - return build_simple_optioned(cct, e, fsid, num_osd, 0, 0, false); - } - int build_simple_with_pool(CephContext *cct, epoch_t e, uuid_d &fsid, - int num_osd, int pg_bits, int pgp_bits) { - return build_simple_optioned(cct, e, fsid, num_osd, - pg_bits, pgp_bits, true); - } - static int _build_crush_types(CrushWrapper& crush); - static int build_simple_crush_map(CephContext *cct, CrushWrapper& crush, - int num_osd, ostream *ss); - static int build_simple_crush_map_from_conf(CephContext *cct, - CrushWrapper& crush, - ostream *ss); - static int build_simple_crush_rules( - CephContext *cct, CrushWrapper& crush, - const string& root, - ostream *ss); - - bool crush_rule_in_use(int rule_id) const; - - int validate_crush_rules(CrushWrapper *crush, ostream *ss) const; - - void clear_temp() { - pg_temp->clear(); - primary_temp->clear(); - } - -private: - void print_osd_line(int cur, ostream *out, Formatter *f) const; -public: - void print(ostream& out) const; - void print_pools(ostream& out) const; - void print_summary(Formatter *f, ostream& out, const string& prefix) const; - void print_oneline_summary(ostream& out) const; - - enum { - DUMP_IN = 1, // only 'in' osds - DUMP_OUT = 2, // only 'out' osds - DUMP_UP = 4, // only 'up' osds - DUMP_DOWN = 8, // only 'down' osds - DUMP_DESTROYED = 16, // only 'destroyed' osds - }; - void print_tree(Formatter *f, ostream *out, unsigned dump_flags=0) const; - - int summarize_mapping_stats( - OSDMap *newmap, - const set *pools, - std::string *out, - Formatter *f) const; - - string get_flag_string() const; - static string get_flag_string(unsigned flags); - static void dump_erasure_code_profiles( - const mempool::osdmap::map > &profiles, - Formatter *f); - void dump(Formatter *f) const; - static void generate_test_instances(list& o); - bool check_new_blacklist_entries() const { return new_blacklist_entries; } - - void check_health(health_check_map_t *checks) const; - - int parse_osd_id_list(const vector& ls, - set *out, - ostream *ss) const; -}; -WRITE_CLASS_ENCODER_FEATURES(OSDMap) -WRITE_CLASS_ENCODER_FEATURES(OSDMap::Incremental) - -typedef ceph::shared_ptr OSDMapRef; - -inline ostream& operator<<(ostream& out, const OSDMap& m) { - m.print_oneline_summary(out); - return out; -} - -class PGStatService; - -void print_osd_utilization(const OSDMap& osdmap, - const PGStatService *pgstat, - ostream& out, - Formatter *f, - bool tree); - -#endif