1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2011 New Dream Network
7 * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
9 * Author: Loic Dachary <loic@dachary.org>
11 * This is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License version 2.1, as published by the Free Software
14 * Foundation. See file COPYING.
18 #include <boost/assign/list_of.hpp>
20 #include "osd_types.h"
21 #include "include/ceph_features.h"
23 #include "crush/hash.h"
27 #include "PGBackend.h"
29 const char *ceph_osd_flag_name(unsigned flag)
32 case CEPH_OSD_FLAG_ACK: return "ack";
33 case CEPH_OSD_FLAG_ONNVRAM: return "onnvram";
34 case CEPH_OSD_FLAG_ONDISK: return "ondisk";
35 case CEPH_OSD_FLAG_RETRY: return "retry";
36 case CEPH_OSD_FLAG_READ: return "read";
37 case CEPH_OSD_FLAG_WRITE: return "write";
38 case CEPH_OSD_FLAG_ORDERSNAP: return "ordersnap";
39 case CEPH_OSD_FLAG_PEERSTAT_OLD: return "peerstat_old";
40 case CEPH_OSD_FLAG_BALANCE_READS: return "balance_reads";
41 case CEPH_OSD_FLAG_PARALLELEXEC: return "parallelexec";
42 case CEPH_OSD_FLAG_PGOP: return "pgop";
43 case CEPH_OSD_FLAG_EXEC: return "exec";
44 case CEPH_OSD_FLAG_EXEC_PUBLIC: return "exec_public";
45 case CEPH_OSD_FLAG_LOCALIZE_READS: return "localize_reads";
46 case CEPH_OSD_FLAG_RWORDERED: return "rwordered";
47 case CEPH_OSD_FLAG_IGNORE_CACHE: return "ignore_cache";
48 case CEPH_OSD_FLAG_SKIPRWLOCKS: return "skiprwlocks";
49 case CEPH_OSD_FLAG_IGNORE_OVERLAY: return "ignore_overlay";
50 case CEPH_OSD_FLAG_FLUSH: return "flush";
51 case CEPH_OSD_FLAG_MAP_SNAP_CLONE: return "map_snap_clone";
52 case CEPH_OSD_FLAG_ENFORCE_SNAPC: return "enforce_snapc";
53 case CEPH_OSD_FLAG_REDIRECTED: return "redirected";
54 case CEPH_OSD_FLAG_KNOWN_REDIR: return "known_if_redirected";
55 case CEPH_OSD_FLAG_FULL_TRY: return "full_try";
56 case CEPH_OSD_FLAG_FULL_FORCE: return "full_force";
57 case CEPH_OSD_FLAG_IGNORE_REDIRECT: return "ignore_redirect";
58 default: return "???";
62 string ceph_osd_flag_string(unsigned flags)
65 for (unsigned i=0; i<32; ++i) {
66 if (flags & (1u<<i)) {
69 s += ceph_osd_flag_name(1u << i);
// Return the short printable name for a single CEPH_OSD_OP_FLAG_* value.
// Each case below selects the string for exactly one flag;
// ceph_osd_op_flag_string() calls this once per set bit (1u << i).
77 const char * ceph_osd_op_flag_name(unsigned flag)
82 case CEPH_OSD_OP_FLAG_EXCL:
85 case CEPH_OSD_OP_FLAG_FAILOK:
88 case CEPH_OSD_OP_FLAG_FADVISE_RANDOM:
89 name = "fadvise_random";
91 case CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL:
92 name = "fadvise_sequential";
94 case CEPH_OSD_OP_FLAG_FADVISE_WILLNEED:
// Spelled "fadvise_willneed" so it matches the other fadvise_* names.
95 name = "fadvise_willneed";
97 case CEPH_OSD_OP_FLAG_FADVISE_DONTNEED:
98 name = "fadvise_dontneed";
100 case CEPH_OSD_OP_FLAG_FADVISE_NOCACHE:
101 name = "fadvise_nocache";
110 string ceph_osd_op_flag_string(unsigned flags)
113 for (unsigned i=0; i<32; ++i) {
114 if (flags & (1u<<i)) {
117 s += ceph_osd_op_flag_name(1u << i);
125 string ceph_osd_alloc_hint_flag_string(unsigned flags)
128 for (unsigned i=0; i<32; ++i) {
129 if (flags & (1u<<i)) {
132 s += ceph_osd_alloc_hint_flag_name(1u << i);
140 void pg_shard_t::encode(bufferlist &bl) const
142 ENCODE_START(1, 1, bl);
147 void pg_shard_t::decode(bufferlist::iterator &bl)
155 ostream &operator<<(ostream &lhs, const pg_shard_t &rhs)
157 if (rhs.is_undefined())
159 if (rhs.shard == shard_id_t::NO_SHARD)
160 return lhs << rhs.osd;
161 return lhs << rhs.osd << '(' << (unsigned)(rhs.shard) << ')';
165 void osd_reqid_t::dump(Formatter *f) const
167 f->dump_stream("name") << name;
168 f->dump_int("inc", inc);
169 f->dump_unsigned("tid", tid);
172 void osd_reqid_t::generate_test_instances(list<osd_reqid_t*>& o)
174 o.push_back(new osd_reqid_t);
175 o.push_back(new osd_reqid_t(entity_name_t::CLIENT(123), 1, 45678));
178 // -- object_locator_t --
180 void object_locator_t::encode(bufferlist& bl) const
182 // verify that nobody's corrupted the locator
183 assert(hash == -1 || key.empty());
184 __u8 encode_compat = 3;
185 ENCODE_START(6, encode_compat, bl);
187 int32_t preferred = -1; // tell old code there is no preferred osd (-1).
188 ::encode(preferred, bl);
190 ::encode(nspace, bl);
193 encode_compat = MAX(encode_compat, 6); // need to interpret the hash
194 ENCODE_FINISH_NEW_COMPAT(bl, encode_compat);
197 void object_locator_t::decode(bufferlist::iterator& p)
199 DECODE_START_LEGACY_COMPAT_LEN(6, 3, 3, p);
209 ::decode(preferred, p);
219 // verify that nobody's corrupted the locator
220 assert(hash == -1 || key.empty());
223 void object_locator_t::dump(Formatter *f) const
225 f->dump_int("pool", pool);
226 f->dump_string("key", key);
227 f->dump_string("namespace", nspace);
228 f->dump_int("hash", hash);
231 void object_locator_t::generate_test_instances(list<object_locator_t*>& o)
233 o.push_back(new object_locator_t);
234 o.push_back(new object_locator_t(123));
235 o.push_back(new object_locator_t(123, 876));
236 o.push_back(new object_locator_t(1, "n2"));
237 o.push_back(new object_locator_t(1234, "", "key"));
238 o.push_back(new object_locator_t(12, "n1", "key2"));
241 // -- request_redirect_t --
242 void request_redirect_t::encode(bufferlist& bl) const
244 ENCODE_START(1, 1, bl);
245 ::encode(redirect_locator, bl);
246 ::encode(redirect_object, bl);
247 ::encode(osd_instructions, bl);
251 void request_redirect_t::decode(bufferlist::iterator& bl)
254 ::decode(redirect_locator, bl);
255 ::decode(redirect_object, bl);
256 ::decode(osd_instructions, bl);
260 void request_redirect_t::dump(Formatter *f) const
262 f->dump_string("object", redirect_object);
263 f->open_object_section("locator");
264 redirect_locator.dump(f);
265 f->close_section(); // locator
268 void request_redirect_t::generate_test_instances(list<request_redirect_t*>& o)
270 object_locator_t loc(1, "redir_obj");
271 o.push_back(new request_redirect_t());
272 o.push_back(new request_redirect_t(loc, 0));
273 o.push_back(new request_redirect_t(loc, "redir_obj"));
274 o.push_back(new request_redirect_t(loc));
277 void objectstore_perf_stat_t::dump(Formatter *f) const
279 f->dump_unsigned("commit_latency_ms", os_commit_latency);
280 f->dump_unsigned("apply_latency_ms", os_apply_latency);
283 void objectstore_perf_stat_t::encode(bufferlist &bl) const
285 ENCODE_START(1, 1, bl);
286 ::encode(os_commit_latency, bl);
287 ::encode(os_apply_latency, bl);
291 void objectstore_perf_stat_t::decode(bufferlist::iterator &bl)
294 ::decode(os_commit_latency, bl);
295 ::decode(os_apply_latency, bl);
299 void objectstore_perf_stat_t::generate_test_instances(std::list<objectstore_perf_stat_t*>& o)
301 o.push_back(new objectstore_perf_stat_t());
302 o.push_back(new objectstore_perf_stat_t());
303 o.back()->os_commit_latency = 20;
304 o.back()->os_apply_latency = 30;
308 void osd_stat_t::dump(Formatter *f) const
310 f->dump_unsigned("up_from", up_from);
311 f->dump_unsigned("seq", seq);
312 f->dump_unsigned("num_pgs", num_pgs);
313 f->dump_unsigned("kb", kb);
314 f->dump_unsigned("kb_used", kb_used);
315 f->dump_unsigned("kb_avail", kb_avail);
316 f->open_array_section("hb_peers");
317 for (auto p : hb_peers)
318 f->dump_int("osd", p);
320 f->dump_int("snap_trim_queue_len", snap_trim_queue_len);
321 f->dump_int("num_snap_trimming", num_snap_trimming);
322 f->open_object_section("op_queue_age_hist");
323 op_queue_age_hist.dump(f);
325 f->open_object_section("perf_stat");
326 os_perf_stat.dump(f);
330 void osd_stat_t::encode(bufferlist &bl) const
332 ENCODE_START(7, 2, bl);
334 ::encode(kb_used, bl);
335 ::encode(kb_avail, bl);
336 ::encode(snap_trim_queue_len, bl);
337 ::encode(num_snap_trimming, bl);
338 ::encode(hb_peers, bl);
339 ::encode((uint32_t)0, bl);
340 ::encode(op_queue_age_hist, bl);
341 ::encode(os_perf_stat, bl);
342 ::encode(up_from, bl);
344 ::encode(num_pgs, bl);
348 void osd_stat_t::decode(bufferlist::iterator &bl)
350 DECODE_START_LEGACY_COMPAT_LEN(6, 2, 2, bl);
352 ::decode(kb_used, bl);
353 ::decode(kb_avail, bl);
354 ::decode(snap_trim_queue_len, bl);
355 ::decode(num_snap_trimming, bl);
356 ::decode(hb_peers, bl);
357 vector<int> num_hb_out;
358 ::decode(num_hb_out, bl);
360 ::decode(op_queue_age_hist, bl);
362 ::decode(os_perf_stat, bl);
364 ::decode(up_from, bl);
368 ::decode(num_pgs, bl);
373 void osd_stat_t::generate_test_instances(std::list<osd_stat_t*>& o)
375 o.push_back(new osd_stat_t);
377 o.push_back(new osd_stat_t);
379 o.back()->kb_used = 2;
380 o.back()->kb_avail = 3;
381 o.back()->hb_peers.push_back(7);
382 o.back()->snap_trim_queue_len = 8;
383 o.back()->num_snap_trimming = 99;
388 int pg_t::print(char *o, int maxlen) const
390 if (preferred() >= 0)
391 return snprintf(o, maxlen, "%llu.%xp%d", (unsigned long long)pool(), ps(), preferred());
393 return snprintf(o, maxlen, "%llu.%x", (unsigned long long)pool(), ps());
396 bool pg_t::parse(const char *s)
401 int r = sscanf(s, "%llu.%xp%d", (long long unsigned *)&ppool, &pseed, &pref);
413 bool spg_t::parse(const char *s)
415 pgid.set_preferred(-1);
416 shard = shard_id_t::NO_SHARD;
421 int r = sscanf(s, "%llu.%x", (long long unsigned *)&ppool, &pseed);
424 pgid.set_pool(ppool);
427 const char *p = strchr(s, 'p');
429 r = sscanf(p, "p%d", &pref);
431 pgid.set_preferred(pref);
439 r = sscanf(p, "s%d", &pshard);
441 shard = shard_id_t(pshard);
449 char *spg_t::calc_name(char *buf, const char *suffix_backwords) const
451 while (*suffix_backwords)
452 *--buf = *suffix_backwords++;
454 if (!is_no_shard()) {
455 buf = ritoa<uint8_t, 10>((uint8_t)shard.id, buf);
459 return pgid.calc_name(buf, "");
462 ostream& operator<<(ostream& out, const spg_t &pg)
464 char buf[spg_t::calc_name_buf_size];
465 buf[spg_t::calc_name_buf_size - 1] = '\0';
466 out << pg.calc_name(buf + spg_t::calc_name_buf_size - 1, "");
470 pg_t pg_t::get_ancestor(unsigned old_pg_num) const
472 int old_bits = cbits(old_pg_num);
473 int old_mask = (1 << old_bits) - 1;
475 ret.m_seed = ceph_stable_mod(m_seed, old_pg_num, old_mask);
479 bool pg_t::is_split(unsigned old_pg_num, unsigned new_pg_num, set<pg_t> *children) const
481 assert(m_seed < old_pg_num);
482 if (new_pg_num <= old_pg_num)
487 unsigned old_bits = cbits(old_pg_num);
488 unsigned old_mask = (1 << old_bits) - 1;
489 for (unsigned n = 1; ; n++) {
490 unsigned next_bit = (n << (old_bits-1));
491 unsigned s = next_bit | m_seed;
493 if (s < old_pg_num || s == m_seed)
497 if ((unsigned)ceph_stable_mod(s, old_pg_num, old_mask) == m_seed) {
500 children->insert(pg_t(s, m_pool, m_preferred));
506 int old_bits = cbits(old_pg_num);
507 int old_mask = (1 << old_bits) - 1;
508 for (unsigned x = old_pg_num; x < new_pg_num; ++x) {
509 unsigned o = ceph_stable_mod(x, old_pg_num, old_mask);
512 children->insert(pg_t(x, m_pool, m_preferred));
519 unsigned pg_t::get_split_bits(unsigned pg_num) const {
524 // Find unique p such that pg_num \in [2^(p-1), 2^p)
525 unsigned p = cbits(pg_num);
526 assert(p); // silence coverity #751330
528 if ((m_seed % (1<<(p-1))) < (pg_num % (1<<(p-1))))
// Derive the parent pg by masking m_seed down to its low (bits - 1) bits,
// where bits = cbits(m_seed).
// NOTE(review): bits == 0 (m_seed == 0) would make the shift count wrap;
// confirm that case is rejected before the mask is applied.
534 pg_t pg_t::get_parent() const
536 unsigned bits = cbits(m_seed);
// Build the mask from an unsigned all-ones value: left-shifting the signed
// value ~0 (i.e. -1) is undefined behaviour before C++20.
539 retval.m_seed &= ~((~0u)<<(bits - 1));
543 hobject_t pg_t::get_hobj_start() const
545 return hobject_t(object_t(), string(), CEPH_NOSNAP, m_seed, m_pool,
549 hobject_t pg_t::get_hobj_end(unsigned pg_num) const
551 // note: this assumes a bitwise sort; with the legacy nibblewise
552 // sort a PG did not always cover a single contiguous range of the
553 // (bit-reversed) hash range.
554 unsigned bits = get_split_bits(pg_num);
555 uint64_t rev_start = hobject_t::_reverse_bits(m_seed);
556 uint64_t rev_end = (rev_start | (0xffffffff >> bits)) + 1;
557 if (rev_end >= 0x100000000) {
558 assert(rev_end == 0x100000000);
559 return hobject_t::get_max();
561 return hobject_t(object_t(), string(), CEPH_NOSNAP,
562 hobject_t::_reverse_bits(rev_end), m_pool,
567 void pg_t::dump(Formatter *f) const
569 f->dump_unsigned("pool", m_pool);
570 f->dump_unsigned("seed", m_seed);
571 f->dump_int("preferred_osd", m_preferred);
574 void pg_t::generate_test_instances(list<pg_t*>& o)
576 o.push_back(new pg_t);
577 o.push_back(new pg_t(1, 2, -1));
578 o.push_back(new pg_t(13123, 3, -1));
579 o.push_back(new pg_t(131223, 4, 23));
582 char *pg_t::calc_name(char *buf, const char *suffix_backwords) const
584 while (*suffix_backwords)
585 *--buf = *suffix_backwords++;
587 if (m_preferred >= 0)
590 buf = ritoa<uint32_t, 16>(m_seed, buf);
594 return ritoa<uint64_t, 10>(m_pool, buf);
597 ostream& operator<<(ostream& out, const pg_t &pg)
599 char buf[pg_t::calc_name_buf_size];
600 buf[pg_t::calc_name_buf_size - 1] = '\0';
601 out << pg.calc_name(buf + pg_t::calc_name_buf_size - 1, "");
608 void coll_t::calc_str()
612 strcpy(_str_buff, "meta");
616 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
617 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "daeh_");
620 _str_buff[spg_t::calc_name_buf_size - 1] = '\0';
621 _str = pgid.calc_name(_str_buff + spg_t::calc_name_buf_size - 1, "PMET_");
624 assert(0 == "unknown collection type");
// Parse a collection name. A name ending in "_head" or "_TEMP" is accepted
// when the text before that 5-character suffix parses as a pgid.
628 bool coll_t::parse(const std::string& s)
// Require at least 5 characters before comparing against length() - 5.
// Without the guard the subtraction wraps for short strings; for a
// 4-character name it equals std::string::npos, so a failed find() compared
// equal and the whole string was handed to pgid.parse().
638 if (s.length() >= 5 && s.find("_head") == s.length() - 5 &&
639 pgid.parse(s.substr(0, s.length() - 5))) {
// Same length guard for the "_TEMP" suffix.
646 if (s.length() >= 5 && s.find("_TEMP") == s.length() - 5 &&
647 pgid.parse(s.substr(0, s.length() - 5))) {
657 void coll_t::encode(bufferlist& bl) const
659 // when changing this, remember to update encoded_size() too.
661 // can't express this as v2...
663 ::encode(struct_v, bl);
664 ::encode(to_str(), bl);
667 ::encode(struct_v, bl);
668 ::encode((__u8)type, bl);
670 snapid_t snap = CEPH_NOSNAP;
675 size_t coll_t::encoded_size() const
677 size_t r = sizeof(__u8);
690 r += sizeof(ceph_le32) + 2 * sizeof(__u8);
692 r += sizeof(__u8) + sizeof(uint64_t) + 2 * sizeof(uint32_t);
696 r += sizeof(uint64_t);
702 void coll_t::decode(bufferlist::iterator& bl)
705 ::decode(struct_v, bl);
714 if (pgid == spg_t() && snap == 0) {
730 type = (type_t)_type;
739 bool ok = parse(str);
741 throw std::domain_error(std::string("unable to parse pg ") + str);
748 oss << "coll_t::decode(): don't know how to decode version "
750 throw std::domain_error(oss.str());
755 void coll_t::dump(Formatter *f) const
757 f->dump_unsigned("type_id", (unsigned)type);
758 if (type != TYPE_META)
759 f->dump_stream("pgid") << pgid;
760 f->dump_string("name", to_str());
763 void coll_t::generate_test_instances(list<coll_t*>& o)
765 o.push_back(new coll_t());
766 o.push_back(new coll_t(spg_t(pg_t(1, 0), shard_id_t::NO_SHARD)));
767 o.push_back(new coll_t(o.back()->get_temp()));
768 o.push_back(new coll_t(spg_t(pg_t(3, 2), shard_id_t(12))));
769 o.push_back(new coll_t(o.back()->get_temp()));
770 o.push_back(new coll_t());
775 std::string pg_vector_string(const vector<int32_t> &a)
779 for (vector<int32_t>::const_iterator i = a.begin(); i != a.end(); ++i) {
782 if (*i != CRUSH_ITEM_NONE)
791 std::string pg_state_string(int state)
794 if (state & PG_STATE_STALE)
796 if (state & PG_STATE_CREATING)
798 if (state & PG_STATE_ACTIVE)
800 if (state & PG_STATE_ACTIVATING)
801 oss << "activating+";
802 if (state & PG_STATE_CLEAN)
804 if (state & PG_STATE_RECOVERY_WAIT)
805 oss << "recovery_wait+";
806 if (state & PG_STATE_RECOVERY_TOOFULL)
807 oss << "recovery_toofull+";
808 if (state & PG_STATE_RECOVERING)
809 oss << "recovering+";
810 if (state & PG_STATE_FORCED_RECOVERY)
811 oss << "forced_recovery+";
812 if (state & PG_STATE_DOWN)
814 if (state & PG_STATE_UNDERSIZED)
815 oss << "undersized+";
816 if (state & PG_STATE_DEGRADED)
818 if (state & PG_STATE_REMAPPED)
820 if (state & PG_STATE_SCRUBBING)
822 if (state & PG_STATE_DEEP_SCRUB)
824 if (state & PG_STATE_INCONSISTENT)
825 oss << "inconsistent+";
826 if (state & PG_STATE_PEERING)
828 if (state & PG_STATE_REPAIR)
830 if (state & PG_STATE_BACKFILL_WAIT)
831 oss << "backfill_wait+";
832 if (state & PG_STATE_BACKFILLING)
833 oss << "backfilling+";
834 if (state & PG_STATE_FORCED_BACKFILL)
835 oss << "forced_backfill+";
836 if (state & PG_STATE_BACKFILL_TOOFULL)
837 oss << "backfill_toofull+";
838 if (state & PG_STATE_INCOMPLETE)
839 oss << "incomplete+";
840 if (state & PG_STATE_PEERED)
842 if (state & PG_STATE_SNAPTRIM)
844 if (state & PG_STATE_SNAPTRIM_WAIT)
845 oss << "snaptrim_wait+";
846 if (state & PG_STATE_SNAPTRIM_ERROR)
847 oss << "snaptrim_error+";
848 string ret(oss.str());
849 if (ret.length() > 0)
850 ret.resize(ret.length() - 1);
856 boost::optional<uint64_t> pg_string_state(const std::string& state)
858 boost::optional<uint64_t> type;
859 if (state == "active")
860 type = PG_STATE_ACTIVE;
861 else if (state == "clean")
862 type = PG_STATE_CLEAN;
863 else if (state == "down")
864 type = PG_STATE_DOWN;
865 else if (state == "scrubbing")
866 type = PG_STATE_SCRUBBING;
867 else if (state == "degraded")
868 type = PG_STATE_DEGRADED;
869 else if (state == "inconsistent")
870 type = PG_STATE_INCONSISTENT;
871 else if (state == "peering")
872 type = PG_STATE_PEERING;
873 else if (state == "repair")
874 type = PG_STATE_REPAIR;
875 else if (state == "recovering")
876 type = PG_STATE_RECOVERING;
877 else if (state == "forced_recovery")
878 type = PG_STATE_FORCED_RECOVERY;
879 else if (state == "backfill_wait")
880 type = PG_STATE_BACKFILL_WAIT;
881 else if (state == "incomplete")
882 type = PG_STATE_INCOMPLETE;
883 else if (state == "stale")
884 type = PG_STATE_STALE;
885 else if (state == "remapped")
886 type = PG_STATE_REMAPPED;
887 else if (state == "deep_scrub")
888 type = PG_STATE_DEEP_SCRUB;
889 else if (state == "backfilling")
890 type = PG_STATE_BACKFILLING;
891 else if (state == "forced_backfill")
892 type = PG_STATE_FORCED_BACKFILL;
893 else if (state == "backfill_toofull")
894 type = PG_STATE_BACKFILL_TOOFULL;
895 else if (state == "recovery_wait")
896 type = PG_STATE_RECOVERY_WAIT;
897 else if (state == "recovery_toofull")
898 type = PG_STATE_RECOVERY_TOOFULL;
899 else if (state == "undersized")
900 type = PG_STATE_UNDERSIZED;
901 else if (state == "activating")
902 type = PG_STATE_ACTIVATING;
903 else if (state == "peered")
904 type = PG_STATE_PEERED;
905 else if (state == "snaptrim")
906 type = PG_STATE_SNAPTRIM;
907 else if (state == "snaptrim_wait")
908 type = PG_STATE_SNAPTRIM_WAIT;
909 else if (state == "snaptrim_error")
910 type = PG_STATE_SNAPTRIM_ERROR;
917 string eversion_t::get_key_name() const
920 // Below is equivalent of sprintf("%010u.%020llu");
922 ritoa<uint64_t, 10, 20>(version, key + 31);
924 ritoa<uint32_t, 10, 10>(epoch, key + 10);
929 // -- pool_snap_info_t --
930 void pool_snap_info_t::dump(Formatter *f) const
932 f->dump_unsigned("snapid", snapid);
933 f->dump_stream("stamp") << stamp;
934 f->dump_string("name", name);
937 void pool_snap_info_t::encode(bufferlist& bl, uint64_t features) const
939 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
941 ::encode(struct_v, bl);
942 ::encode(snapid, bl);
947 ENCODE_START(2, 2, bl);
948 ::encode(snapid, bl);
954 void pool_snap_info_t::decode(bufferlist::iterator& bl)
956 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
957 ::decode(snapid, bl);
963 void pool_snap_info_t::generate_test_instances(list<pool_snap_info_t*>& o)
965 o.push_back(new pool_snap_info_t);
966 o.push_back(new pool_snap_info_t);
967 o.back()->snapid = 1;
968 o.back()->stamp = utime_t(1, 2);
969 o.back()->name = "foo";
// Lookup table from a pool option's textual name to its descriptor
// (option key plus value type: DOUBLE, INT or STR).
// It is consulted by pool_opts_t::is_opt_name() and get_opt_desc(), and
// iterated by pool_opts_t::dump(Formatter*) and operator<<(ostream&,
// const pool_opts_t&) to print every option that is set.
974 typedef std::map<std::string, pool_opts_t::opt_desc_t> opt_mapping_t;
// File-local (static) and non-const: the users above walk it with
// opt_mapping_t::iterator.
975 static opt_mapping_t opt_mapping = boost::assign::map_list_of
// Scrub scheduling intervals.
976 ("scrub_min_interval", pool_opts_t::opt_desc_t(
977 pool_opts_t::SCRUB_MIN_INTERVAL, pool_opts_t::DOUBLE))
978 ("scrub_max_interval", pool_opts_t::opt_desc_t(
979 pool_opts_t::SCRUB_MAX_INTERVAL, pool_opts_t::DOUBLE))
980 ("deep_scrub_interval", pool_opts_t::opt_desc_t(
981 pool_opts_t::DEEP_SCRUB_INTERVAL, pool_opts_t::DOUBLE))
// Recovery and scrub priorities.
982 ("recovery_priority", pool_opts_t::opt_desc_t(
983 pool_opts_t::RECOVERY_PRIORITY, pool_opts_t::INT))
984 ("recovery_op_priority", pool_opts_t::opt_desc_t(
985 pool_opts_t::RECOVERY_OP_PRIORITY, pool_opts_t::INT))
986 ("scrub_priority", pool_opts_t::opt_desc_t(
987 pool_opts_t::SCRUB_PRIORITY, pool_opts_t::INT))
// Compression settings.
988 ("compression_mode", pool_opts_t::opt_desc_t(
989 pool_opts_t::COMPRESSION_MODE, pool_opts_t::STR))
990 ("compression_algorithm", pool_opts_t::opt_desc_t(
991 pool_opts_t::COMPRESSION_ALGORITHM, pool_opts_t::STR))
992 ("compression_required_ratio", pool_opts_t::opt_desc_t(
993 pool_opts_t::COMPRESSION_REQUIRED_RATIO, pool_opts_t::DOUBLE))
994 ("compression_max_blob_size", pool_opts_t::opt_desc_t(
995 pool_opts_t::COMPRESSION_MAX_BLOB_SIZE, pool_opts_t::INT))
996 ("compression_min_blob_size", pool_opts_t::opt_desc_t(
997 pool_opts_t::COMPRESSION_MIN_BLOB_SIZE, pool_opts_t::INT))
// Checksum settings.
998 ("csum_type", pool_opts_t::opt_desc_t(
999 pool_opts_t::CSUM_TYPE, pool_opts_t::INT))
1000 ("csum_max_block", pool_opts_t::opt_desc_t(
1001 pool_opts_t::CSUM_MAX_BLOCK, pool_opts_t::INT))
1002 ("csum_min_block", pool_opts_t::opt_desc_t(
1003 pool_opts_t::CSUM_MIN_BLOCK, pool_opts_t::INT));
1005 bool pool_opts_t::is_opt_name(const std::string& name) {
1006 return opt_mapping.count(name);
1009 pool_opts_t::opt_desc_t pool_opts_t::get_opt_desc(const std::string& name) {
1010 opt_mapping_t::iterator i = opt_mapping.find(name);
1011 assert(i != opt_mapping.end());
1015 bool pool_opts_t::is_set(pool_opts_t::key_t key) const {
1016 return opts.count(key);
1019 const pool_opts_t::value_t& pool_opts_t::get(pool_opts_t::key_t key) const {
1020 opts_t::const_iterator i = opts.find(key);
1021 assert(i != opts.end());
1025 bool pool_opts_t::unset(pool_opts_t::key_t key) {
1026 return opts.erase(key) > 0;
1029 class pool_opts_dumper_t : public boost::static_visitor<>
1032 pool_opts_dumper_t(const std::string& name_, Formatter* f_) :
1033 name(name_.c_str()), f(f_) {}
1035 void operator()(std::string s) const {
1036 f->dump_string(name, s);
1038 void operator()(int i) const {
1039 f->dump_int(name, i);
1041 void operator()(double d) const {
1042 f->dump_float(name, d);
1050 void pool_opts_t::dump(const std::string& name, Formatter* f) const
1052 const opt_desc_t& desc = get_opt_desc(name);
1053 opts_t::const_iterator i = opts.find(desc.key);
1054 if (i == opts.end()) {
1057 boost::apply_visitor(pool_opts_dumper_t(name, f), i->second);
1060 void pool_opts_t::dump(Formatter* f) const
1062 for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end();
1064 const std::string& name = i->first;
1065 const opt_desc_t& desc = i->second;
1066 opts_t::const_iterator j = opts.find(desc.key);
1067 if (j == opts.end()) {
1070 boost::apply_visitor(pool_opts_dumper_t(name, f), j->second);
1074 class pool_opts_encoder_t : public boost::static_visitor<>
1077 explicit pool_opts_encoder_t(bufferlist& bl_) : bl(bl_) {}
1079 void operator()(std::string s) const {
1080 ::encode(static_cast<int32_t>(pool_opts_t::STR), bl);
1083 void operator()(int i) const {
1084 ::encode(static_cast<int32_t>(pool_opts_t::INT), bl);
1087 void operator()(double d) const {
1088 ::encode(static_cast<int32_t>(pool_opts_t::DOUBLE), bl);
1096 void pool_opts_t::encode(bufferlist& bl) const {
1097 ENCODE_START(1, 1, bl);
1098 uint32_t n = static_cast<uint32_t>(opts.size());
1100 for (opts_t::const_iterator i = opts.begin(); i != opts.end(); ++i) {
1101 ::encode(static_cast<int32_t>(i->first), bl);
1102 boost::apply_visitor(pool_opts_encoder_t(bl), i->second);
1107 void pool_opts_t::decode(bufferlist::iterator& bl) {
1108 DECODE_START(1, bl);
1119 opts[static_cast<key_t>(k)] = s;
1120 } else if (t == INT) {
1123 opts[static_cast<key_t>(k)] = i;
1124 } else if (t == DOUBLE) {
1127 opts[static_cast<key_t>(k)] = d;
1129 assert(!"invalid type");
1135 ostream& operator<<(ostream& out, const pool_opts_t& opts)
1137 for (opt_mapping_t::iterator i = opt_mapping.begin(); i != opt_mapping.end();
1139 const std::string& name = i->first;
1140 const pool_opts_t::opt_desc_t& desc = i->second;
1141 pool_opts_t::opts_t::const_iterator j = opts.opts.find(desc.key);
1142 if (j == opts.opts.end()) {
1145 out << " " << name << " " << j->second;
1152 const char *pg_pool_t::APPLICATION_NAME_CEPHFS("cephfs");
1153 const char *pg_pool_t::APPLICATION_NAME_RBD("rbd");
1154 const char *pg_pool_t::APPLICATION_NAME_RGW("rgw");
1156 void pg_pool_t::dump(Formatter *f) const
1158 f->dump_unsigned("flags", get_flags());
1159 f->dump_string("flags_names", get_flags_string());
1160 f->dump_int("type", get_type());
1161 f->dump_int("size", get_size());
1162 f->dump_int("min_size", get_min_size());
1163 f->dump_int("crush_rule", get_crush_rule());
1164 f->dump_int("object_hash", get_object_hash());
1165 f->dump_unsigned("pg_num", get_pg_num());
1166 f->dump_unsigned("pg_placement_num", get_pgp_num());
1167 f->dump_unsigned("crash_replay_interval", get_crash_replay_interval());
1168 f->dump_stream("last_change") << get_last_change();
1169 f->dump_stream("last_force_op_resend") << get_last_force_op_resend();
1170 f->dump_stream("last_force_op_resend_preluminous")
1171 << get_last_force_op_resend_preluminous();
1172 f->dump_unsigned("auid", get_auid());
1173 f->dump_string("snap_mode", is_pool_snaps_mode() ? "pool" : "selfmanaged");
1174 f->dump_unsigned("snap_seq", get_snap_seq());
1175 f->dump_unsigned("snap_epoch", get_snap_epoch());
1176 f->open_array_section("pool_snaps");
1177 for (map<snapid_t, pool_snap_info_t>::const_iterator p = snaps.begin(); p != snaps.end(); ++p) {
1178 f->open_object_section("pool_snap_info");
1183 f->dump_stream("removed_snaps") << removed_snaps;
1184 f->dump_unsigned("quota_max_bytes", quota_max_bytes);
1185 f->dump_unsigned("quota_max_objects", quota_max_objects);
1186 f->open_array_section("tiers");
1187 for (set<uint64_t>::const_iterator p = tiers.begin(); p != tiers.end(); ++p)
1188 f->dump_unsigned("pool_id", *p);
1190 f->dump_int("tier_of", tier_of);
1191 f->dump_int("read_tier", read_tier);
1192 f->dump_int("write_tier", write_tier);
1193 f->dump_string("cache_mode", get_cache_mode_name());
1194 f->dump_unsigned("target_max_bytes", target_max_bytes);
1195 f->dump_unsigned("target_max_objects", target_max_objects);
1196 f->dump_unsigned("cache_target_dirty_ratio_micro",
1197 cache_target_dirty_ratio_micro);
1198 f->dump_unsigned("cache_target_dirty_high_ratio_micro",
1199 cache_target_dirty_high_ratio_micro);
1200 f->dump_unsigned("cache_target_full_ratio_micro",
1201 cache_target_full_ratio_micro);
1202 f->dump_unsigned("cache_min_flush_age", cache_min_flush_age);
1203 f->dump_unsigned("cache_min_evict_age", cache_min_evict_age);
1204 f->dump_string("erasure_code_profile", erasure_code_profile);
1205 f->open_object_section("hit_set_params");
1206 hit_set_params.dump(f);
1207 f->close_section(); // hit_set_params
1208 f->dump_unsigned("hit_set_period", hit_set_period);
1209 f->dump_unsigned("hit_set_count", hit_set_count);
1210 f->dump_bool("use_gmt_hitset", use_gmt_hitset);
1211 f->dump_unsigned("min_read_recency_for_promote", min_read_recency_for_promote);
1212 f->dump_unsigned("min_write_recency_for_promote", min_write_recency_for_promote);
1213 f->dump_unsigned("hit_set_grade_decay_rate", hit_set_grade_decay_rate);
1214 f->dump_unsigned("hit_set_search_last_n", hit_set_search_last_n);
1215 f->open_array_section("grade_table");
1216 for (unsigned i = 0; i < hit_set_count; ++i)
1217 f->dump_unsigned("value", get_grade(i));
1219 f->dump_unsigned("stripe_width", get_stripe_width());
1220 f->dump_unsigned("expected_num_objects", expected_num_objects);
1221 f->dump_bool("fast_read", fast_read);
1222 f->open_object_section("options");
1224 f->close_section(); // options
1225 f->open_object_section("application_metadata");
1226 for (auto &app_pair : application_metadata) {
1227 f->open_object_section(app_pair.first.c_str());
1228 for (auto &kv_pair : app_pair.second) {
1229 f->dump_string(kv_pair.first.c_str(), kv_pair.second);
1231 f->close_section(); // application
1233 f->close_section(); // application_metadata
1236 void pg_pool_t::convert_to_pg_shards(const vector<int> &from, set<pg_shard_t>* to) const {
1237 for (size_t i = 0; i < from.size(); ++i) {
1238 if (from[i] != CRUSH_ITEM_NONE) {
1242 ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1247 void pg_pool_t::calc_pg_masks()
1249 pg_num_mask = (1 << cbits(pg_num-1)) - 1;
1250 pgp_num_mask = (1 << cbits(pgp_num-1)) - 1;
1253 unsigned pg_pool_t::get_pg_num_divisor(pg_t pgid) const
1255 if (pg_num == pg_num_mask + 1)
1256 return pg_num; // power-of-2 split
1257 unsigned mask = pg_num_mask >> 1;
1258 if ((pgid.ps() & mask) < (pg_num & mask))
1259 return pg_num_mask + 1; // smaller bin size (already split)
1261 return (pg_num_mask + 1) >> 1; // bigger bin (not yet split)
1265 * we have two snap modes:
1266 * - pool global snaps
1267 * - snap existence/non-existence defined by snaps[] and snap_seq
1268 * - user managed snaps
1269 * - removal governed by removed_snaps
1271 * we know which mode we're using based on whether removed_snaps is empty.
1273 bool pg_pool_t::is_pool_snaps_mode() const
1275 return removed_snaps.empty() && get_snap_seq() > 0;
1278 bool pg_pool_t::is_unmanaged_snaps_mode() const
1280 return removed_snaps.size() && get_snap_seq() > 0;
1283 bool pg_pool_t::is_removed_snap(snapid_t s) const
1285 if (is_pool_snaps_mode())
1286 return s <= get_snap_seq() && snaps.count(s) == 0;
1288 return removed_snaps.contains(s);
1292 * build set of known-removed snaps from either pool snaps or
1293 * explicit removed_snaps set.
1295 void pg_pool_t::build_removed_snaps(interval_set<snapid_t>& rs) const
1297 if (is_pool_snaps_mode()) {
1299 for (snapid_t s = 1; s <= get_snap_seq(); s = s + 1)
1300 if (snaps.count(s) == 0)
1307 snapid_t pg_pool_t::snap_exists(const char *s) const
1309 for (map<snapid_t,pool_snap_info_t>::const_iterator p = snaps.begin();
1312 if (p->second.name == s)
1313 return p->second.snapid;
1317 void pg_pool_t::add_snap(const char *n, utime_t stamp)
1319 assert(!is_unmanaged_snaps_mode());
1320 snapid_t s = get_snap_seq() + 1;
1322 snaps[s].snapid = s;
1324 snaps[s].stamp = stamp;
1327 void pg_pool_t::add_unmanaged_snap(uint64_t& snapid)
1329 if (removed_snaps.empty()) {
1330 assert(!is_pool_snaps_mode());
1331 removed_snaps.insert(snapid_t(1));
1334 snapid = snap_seq = snap_seq + 1;
1337 void pg_pool_t::remove_snap(snapid_t s)
1339 assert(snaps.count(s));
1341 snap_seq = snap_seq + 1;
1344 void pg_pool_t::remove_unmanaged_snap(snapid_t s)
1346 assert(is_unmanaged_snaps_mode());
1347 removed_snaps.insert(s);
1348 snap_seq = snap_seq + 1;
1349 removed_snaps.insert(get_snap_seq());
1352 SnapContext pg_pool_t::get_snap_context() const
1354 vector<snapid_t> s(snaps.size());
1356 for (map<snapid_t, pool_snap_info_t>::const_reverse_iterator p = snaps.rbegin();
1360 return SnapContext(get_snap_seq(), s);
1363 uint32_t pg_pool_t::hash_key(const string& key, const string& ns) const
1366 return ceph_str_hash(object_hash, key.data(), key.length());
1367 int nsl = ns.length();
1368 int len = key.length() + nsl + 1;
1370 memcpy(&buf[0], ns.data(), nsl);
1372 memcpy(&buf[nsl+1], key.data(), key.length());
1373 return ceph_str_hash(object_hash, &buf[0], len);
1376 uint32_t pg_pool_t::raw_hash_to_pg(uint32_t v) const
1378 return ceph_stable_mod(v, pg_num, pg_num_mask);
1382 * map a raw pg (with full precision ps) into an actual pg, for storage
1384 pg_t pg_pool_t::raw_pg_to_pg(pg_t pg) const
1386 pg.set_ps(ceph_stable_mod(pg.ps(), pg_num, pg_num_mask));
1391 * map raw pg (full precision ps) into a placement seed. include
1392 * pool id in that value so that different pools don't use the same
1395 ps_t pg_pool_t::raw_pg_to_pps(pg_t pg) const
1397 if (flags & FLAG_HASHPSPOOL) {
1398 // Hash the pool id so that pool PGs do not overlap.
1400 crush_hash32_2(CRUSH_HASH_RJENKINS1,
1401 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask),
1404 // Legacy behavior; add ps and pool together. This is not a great
1405 // idea because the PGs from each pool will essentially overlap on
1406 // top of each other: 0.5 == 1.4 == 2.3 == ...
1408 ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask) +
// Derive a deterministic pseudo-random value `r` from `seed`
// (crush_hash32_2 with rjenkins1 and the constant 123) and turn it into a
// position related to `pg`.
//
// Two cases are distinguished: pg_num == pg_num_mask + 1, and everything
// else.  In the latter case the low bits of pg.ps() under
// (pg_num_mask >> 1) are compared with the same low bits of pg_num.
// NOTE(review): how `r` is masked/combined in each case is not visible
// here; presumably the first case means pg_num is a power of two --
// confirm.
1413 uint32_t pg_pool_t::get_random_pg_position(pg_t pg, uint32_t seed) const
1415 uint32_t r = crush_hash32_2(CRUSH_HASH_RJENKINS1, seed, 123);
1416 if (pg_num == pg_num_mask + 1) {
// mask with one bit fewer than pg_num_mask
1419 unsigned smaller_mask = pg_num_mask >> 1;
1420 if ((pg.ps() & smaller_mask) < (pg_num & smaller_mask)) {
// Serialize this pool into `bl`, choosing the wire layout from the peer's
// `features` mask.  The feature gates visible here are tested in this order:
//   - CEPH_FEATURE_PGPOOL3 missing: layout matching the old struct
//     ceph_pg_pool; snaps / removed_snaps bodies use encode_nohead.
//   - CEPH_FEATURE_OSDENC missing: flat layout whose last visible fields are
//     flags and crash_replay_interval.
//   - CEPH_FEATURE_OSD_POOLRESEND missing: ENCODE_START(14, 5, bl) layout
//     whose last visible field is erasure_code_profile.
//   - otherwise: ENCODE_START(v, 5, bl), with `v` depending on the
//     NEW_OSDOP_ENCODING and SERVER_LUMINOUS checks further down.
// Every layout writes lpg_num = lpgp_num = 0 so that old decoders see no
// localized pgs.  Keep the field order in sync with pg_pool_t::decode().
1430 void pg_pool_t::encode(bufferlist& bl, uint64_t features) const
1432 if ((features & CEPH_FEATURE_PGPOOL3) == 0) {
1433 // this encoding matches the old struct ceph_pg_pool
1435 ::encode(struct_v, bl);
1438 ::encode(crush_rule, bl);
1439 ::encode(object_hash, bl);
1440 ::encode(pg_num, bl);
1441 ::encode(pgp_num, bl);
1442 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1443 ::encode(lpg_num, bl);
1444 ::encode(lpgp_num, bl);
1445 ::encode(last_change, bl);
1446 ::encode(snap_seq, bl);
1447 ::encode(snap_epoch, bl);
// `n` is reused for two element counts: snaps first, then the number of
// removed_snaps intervals.  The container bodies follow without their own
// length headers (encode_nohead).
1449 __u32 n = snaps.size();
1451 n = removed_snaps.num_intervals();
1456 ::encode_nohead(snaps, bl, features);
1457 ::encode_nohead(removed_snaps, bl);
1461 if ((features & CEPH_FEATURE_OSDENC) == 0) {
1463 ::encode(struct_v, bl);
1466 ::encode(crush_rule, bl);
1467 ::encode(object_hash, bl);
1468 ::encode(pg_num, bl);
1469 ::encode(pgp_num, bl);
1470 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1471 ::encode(lpg_num, bl);
1472 ::encode(lpgp_num, bl);
1473 ::encode(last_change, bl);
1474 ::encode(snap_seq, bl);
1475 ::encode(snap_epoch, bl);
1476 ::encode(snaps, bl, features);
1477 ::encode(removed_snaps, bl);
1479 ::encode(flags, bl);
1480 ::encode(crash_replay_interval, bl);
1484 if ((features & CEPH_FEATURE_OSD_POOLRESEND) == 0) {
1485 // we simply added last_force_op_resend here, which is a fully
1486 // backward compatible change. however, encoding the same map
1487 // differently between monitors triggers scrub noise (even though
1488 // they are decodable without the feature), so let's be pedantic
1490 ENCODE_START(14, 5, bl);
1493 ::encode(crush_rule, bl);
1494 ::encode(object_hash, bl);
1495 ::encode(pg_num, bl);
1496 ::encode(pgp_num, bl);
1497 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1498 ::encode(lpg_num, bl);
1499 ::encode(lpgp_num, bl);
1500 ::encode(last_change, bl);
1501 ::encode(snap_seq, bl);
1502 ::encode(snap_epoch, bl);
1503 ::encode(snaps, bl, features);
1504 ::encode(removed_snaps, bl);
1506 ::encode(flags, bl);
1507 ::encode(crash_replay_interval, bl);
1508 ::encode(min_size, bl);
1509 ::encode(quota_max_bytes, bl);
1510 ::encode(quota_max_objects, bl);
1511 ::encode(tiers, bl);
1512 ::encode(tier_of, bl);
// cache_mode is narrowed to a single byte for the wire
1513 __u8 c = cache_mode;
1515 ::encode(read_tier, bl);
1516 ::encode(write_tier, bl);
1517 ::encode(properties, bl);
1518 ::encode(hit_set_params, bl);
1519 ::encode(hit_set_period, bl);
1520 ::encode(hit_set_count, bl);
1521 ::encode(stripe_width, bl);
1522 ::encode(target_max_bytes, bl);
1523 ::encode(target_max_objects, bl);
1524 ::encode(cache_target_dirty_ratio_micro, bl);
1525 ::encode(cache_target_full_ratio_micro, bl);
1526 ::encode(cache_min_flush_age, bl);
1527 ::encode(cache_min_evict_age, bl);
1528 ::encode(erasure_code_profile, bl);
// Current layout.  The struct version `v` passed to ENCODE_START below is
// chosen from the peer's features.
// NOTE(review): the assignments to `v` are not visible here; presumably
// each missing feature lowers it -- confirm.
1534 if (!(features & CEPH_FEATURE_NEW_OSDOP_ENCODING)) {
1535 // this was the first post-hammer thing we added; if it's missing, encode
1539 if (!HAVE_FEATURE(features, SERVER_LUMINOUS)) {
1543 ENCODE_START(v, 5, bl);
1546 ::encode(crush_rule, bl);
1547 ::encode(object_hash, bl);
1548 ::encode(pg_num, bl);
1549 ::encode(pgp_num, bl);
1550 __u32 lpg_num = 0, lpgp_num = 0; // tell old code that there are no localized pgs.
1551 ::encode(lpg_num, bl);
1552 ::encode(lpgp_num, bl);
1553 ::encode(last_change, bl);
1554 ::encode(snap_seq, bl);
1555 ::encode(snap_epoch, bl);
1556 ::encode(snaps, bl, features);
1557 ::encode(removed_snaps, bl);
1559 ::encode(flags, bl);
1560 ::encode(crash_replay_interval, bl);
1561 ::encode(min_size, bl);
1562 ::encode(quota_max_bytes, bl);
1563 ::encode(quota_max_objects, bl);
1564 ::encode(tiers, bl);
1565 ::encode(tier_of, bl);
// cache_mode is narrowed to a single byte for the wire
1566 __u8 c = cache_mode;
1568 ::encode(read_tier, bl);
1569 ::encode(write_tier, bl);
1570 ::encode(properties, bl);
1571 ::encode(hit_set_params, bl);
1572 ::encode(hit_set_period, bl);
1573 ::encode(hit_set_count, bl);
1574 ::encode(stripe_width, bl);
1575 ::encode(target_max_bytes, bl);
1576 ::encode(target_max_objects, bl);
1577 ::encode(cache_target_dirty_ratio_micro, bl);
1578 ::encode(cache_target_full_ratio_micro, bl);
1579 ::encode(cache_min_flush_age, bl);
1580 ::encode(cache_min_evict_age, bl);
1581 ::encode(erasure_code_profile, bl);
// the pre-luminous resend epoch is written here; the current
// last_force_op_resend is written separately further down
1582 ::encode(last_force_op_resend_preluminous, bl);
1583 ::encode(min_read_recency_for_promote, bl);
1584 ::encode(expected_num_objects, bl);
1586 ::encode(cache_target_dirty_high_ratio_micro, bl);
1589 ::encode(min_write_recency_for_promote, bl);
1592 ::encode(use_gmt_hitset, bl);
1595 ::encode(fast_read, bl);
1598 ::encode(hit_set_grade_decay_rate, bl);
1599 ::encode(hit_set_search_last_n, bl);
1605 ::encode(last_force_op_resend, bl);
1608 ::encode(application_metadata, bl);
// Deserialize a pool from `bl`.  Understands struct versions up to 26
// (DECODE_START_LEGACY_COMPAT_LEN(26, 5, 5, bl)).
//
// Fields are read in the order pg_pool_t::encode() writes them.  Each newer
// field is gated on struct_v; the plain assignments that follow a gate
// supply the value used when decoding an older encoding that lacks the
// field (e.g. min_size = size - size/2, min_read_recency_for_promote = 1).
1613 void pg_pool_t::decode(bufferlist::iterator& bl)
1615 DECODE_START_LEGACY_COMPAT_LEN(26, 5, 5, bl);
1618 ::decode(crush_rule, bl);
1619 ::decode(object_hash, bl);
1620 ::decode(pg_num, bl);
1621 ::decode(pgp_num, bl);
// localized-pg counts are read into locals and not stored
1623 __u32 lpg_num, lpgp_num;
1624 ::decode(lpg_num, bl);
1625 ::decode(lpgp_num, bl);
1627 ::decode(last_change, bl);
1628 ::decode(snap_seq, bl);
1629 ::decode(snap_epoch, bl);
1631 if (struct_v >= 3) {
1632 ::decode(snaps, bl);
1633 ::decode(removed_snaps, bl);
// older form: element counts n / m are supplied separately and the
// containers are decoded without their own length headers
1640 ::decode_nohead(n, snaps, bl);
1641 ::decode_nohead(m, removed_snaps, bl);
1644 if (struct_v >= 4) {
1645 ::decode(flags, bl);
1646 ::decode(crash_replay_interval, bl);
1650 // if this looks like the 'data' pool, set the
1651 // crash_replay_interval appropriately. unfortunately, we can't
1652 // be precise here. this should be good enough to preserve replay
1653 // on the data pool for the majority of cluster upgrades, though.
1654 if (crush_rule == 0 && auid == 0)
1655 crash_replay_interval = 60;
1657 crash_replay_interval = 0;
1659 if (struct_v >= 7) {
1660 ::decode(min_size, bl);
// value used when min_size is not present in the encoding
1662 min_size = size - size/2;
1664 if (struct_v >= 8) {
1665 ::decode(quota_max_bytes, bl);
1666 ::decode(quota_max_objects, bl);
1668 if (struct_v >= 9) {
1669 ::decode(tiers, bl);
1670 ::decode(tier_of, bl);
// cache_mode travels as a small integer `v` and is cast back to the enum
1673 cache_mode = (cache_mode_t)v;
1674 ::decode(read_tier, bl);
1675 ::decode(write_tier, bl);
1677 if (struct_v >= 10) {
1678 ::decode(properties, bl);
1680 if (struct_v >= 11) {
1681 ::decode(hit_set_params, bl);
1682 ::decode(hit_set_period, bl);
1683 ::decode(hit_set_count, bl);
// hit-set period/count are taken from `def` when not encoded
// NOTE(review): presumably `def` is a default-constructed pg_pool_t -- confirm
1686 hit_set_period = def.hit_set_period;
1687 hit_set_count = def.hit_set_count;
1689 if (struct_v >= 12) {
1690 ::decode(stripe_width, bl);
1692 set_stripe_width(0);
1694 if (struct_v >= 13) {
1695 ::decode(target_max_bytes, bl);
1696 ::decode(target_max_objects, bl);
1697 ::decode(cache_target_dirty_ratio_micro, bl);
1698 ::decode(cache_target_full_ratio_micro, bl);
1699 ::decode(cache_min_flush_age, bl);
1700 ::decode(cache_min_evict_age, bl);
1702 target_max_bytes = 0;
1703 target_max_objects = 0;
1704 cache_target_dirty_ratio_micro = 0;
1705 cache_target_full_ratio_micro = 0;
1706 cache_min_flush_age = 0;
1707 cache_min_evict_age = 0;
1709 if (struct_v >= 14) {
1710 ::decode(erasure_code_profile, bl);
1712 if (struct_v >= 15) {
1713 ::decode(last_force_op_resend_preluminous, bl);
1715 last_force_op_resend_preluminous = 0;
1717 if (struct_v >= 16) {
1718 ::decode(min_read_recency_for_promote, bl);
1720 min_read_recency_for_promote = 1;
1722 if (struct_v >= 17) {
1723 ::decode(expected_num_objects, bl);
1725 expected_num_objects = 0;
1727 if (struct_v >= 19) {
1728 ::decode(cache_target_dirty_high_ratio_micro, bl);
// without a separate "high" ratio, reuse the plain dirty ratio
1730 cache_target_dirty_high_ratio_micro = cache_target_dirty_ratio_micro;
1732 if (struct_v >= 20) {
1733 ::decode(min_write_recency_for_promote, bl);
1735 min_write_recency_for_promote = 1;
1737 if (struct_v >= 21) {
1738 ::decode(use_gmt_hitset, bl);
1740 use_gmt_hitset = false;
1742 if (struct_v >= 22) {
1743 ::decode(fast_read, bl);
1747 if (struct_v >= 23) {
1748 ::decode(hit_set_grade_decay_rate, bl);
1749 ::decode(hit_set_search_last_n, bl);
1751 hit_set_grade_decay_rate = 0;
1752 hit_set_search_last_n = 1;
1754 if (struct_v >= 24) {
1757 if (struct_v >= 25) {
1758 ::decode(last_force_op_resend, bl);
// older encodings carry only the pre-luminous value; mirror it
1760 last_force_op_resend = last_force_op_resend_preluminous;
1762 if (struct_v >= 26) {
1763 ::decode(application_metadata, bl);
// Append sample pg_pool_t objects to `o`.  A single local `a` is filled in
// step by step and a heap-allocated copy is pushed at several stages, so
// later instances carry everything set for the earlier ones plus more
// (snaps, quotas, cache/hit-set settings, application metadata, ...).
// The objects are created with `new` and are not freed here.
1770 void pg_pool_t::generate_test_instances(list<pg_pool_t*>& o)
1773 o.push_back(new pg_pool_t(a));
1775 a.type = TYPE_REPLICATED;
// deliberately different values so the two resend fields can be told apart
1782 a.last_force_op_resend = 123823;
1783 a.last_force_op_resend_preluminous = 123824;
1787 a.crash_replay_interval = 13;
1788 a.quota_max_bytes = 473;
1789 a.quota_max_objects = 474;
1790 o.push_back(new pg_pool_t(a));
1792 a.snaps[3].name = "asdf";
1793 a.snaps[3].snapid = 3;
1794 a.snaps[3].stamp = utime_t(123, 4);
1795 a.snaps[6].name = "qwer";
1796 a.snaps[6].snapid = 6;
1797 a.snaps[6].stamp = utime_t(23423, 4);
1798 o.push_back(new pg_pool_t(a));
1800 a.removed_snaps.insert(2); // not quite valid to combine with snaps!
1801 a.quota_max_bytes = 2473;
1802 a.quota_max_objects = 4374;
1806 a.cache_mode = CACHEMODE_WRITEBACK;
1809 a.hit_set_params = HitSet::Params(new BloomHitSet::Params);
1810 a.hit_set_period = 3600;
1811 a.hit_set_count = 8;
1812 a.min_read_recency_for_promote = 1;
1813 a.min_write_recency_for_promote = 1;
1814 a.hit_set_grade_decay_rate = 50;
1815 a.hit_set_search_last_n = 1;
// called after the hit-set decay settings above have been assigned
1816 a.calc_grade_table();
1817 a.set_stripe_width(12345);
1818 a.target_max_bytes = 1238132132;
1819 a.target_max_objects = 1232132;
1820 a.cache_target_dirty_ratio_micro = 187232;
1821 a.cache_target_dirty_high_ratio_micro = 309856;
1822 a.cache_target_full_ratio_micro = 987222;
1823 a.cache_min_flush_age = 231;
1824 a.cache_min_evict_age = 2321;
1825 a.erasure_code_profile = "profile in osdmap";
1826 a.expected_num_objects = 123456;
1827 a.fast_read = false;
1828 a.application_metadata = {{"rbd", {{"key", "value"}}}};
1829 o.push_back(new pg_pool_t(a));
// Stream a human-readable summary of pool `p` to `out`.
//
// Type name, size, min_size, crush_rule, object_hash, pg_num, pgp_num and
// last_change are printed first.  Most of the remaining fields are guarded
// and only appear when set (non-zero, non-empty, or reported present by the
// has_*() helpers).  "lfor" prints last_force_op_resend and its pre-luminous
// counterpart as "<current>/<preluminous>" when either is non-zero.
1832 ostream& operator<<(ostream& out, const pg_pool_t& p)
1834 out << p.get_type_name()
1835 << " size " << p.get_size()
1836 << " min_size " << p.get_min_size()
1837 << " crush_rule " << p.get_crush_rule()
1838 << " object_hash " << p.get_object_hash_name()
1839 << " pg_num " << p.get_pg_num()
1840 << " pgp_num " << p.get_pgp_num()
1841 << " last_change " << p.get_last_change();
1842 if (p.get_last_force_op_resend() ||
1843 p.get_last_force_op_resend_preluminous())
1844 out << " lfor " << p.get_last_force_op_resend() << "/"
1845 << p.get_last_force_op_resend_preluminous();
1847 out << " owner " << p.get_auid();
1849 out << " flags " << p.get_flags_string();
1850 if (p.crash_replay_interval)
1851 out << " crash_replay_interval " << p.crash_replay_interval;
1852 if (p.quota_max_bytes)
1853 out << " max_bytes " << p.quota_max_bytes;
1854 if (p.quota_max_objects)
1855 out << " max_objects " << p.quota_max_objects;
1856 if (!p.tiers.empty())
1857 out << " tiers " << p.tiers;
1859 out << " tier_of " << p.tier_of;
1860 if (p.has_read_tier())
1861 out << " read_tier " << p.read_tier;
1862 if (p.has_write_tier())
1863 out << " write_tier " << p.write_tier;
1865 out << " cache_mode " << p.get_cache_mode_name();
1866 if (p.target_max_bytes)
1867 out << " target_bytes " << p.target_max_bytes;
1868 if (p.target_max_objects)
1869 out << " target_objects " << p.target_max_objects;
// hit-set details are printed only when a hit-set type is configured
1870 if (p.hit_set_params.get_type() != HitSet::TYPE_NONE) {
1871 out << " hit_set " << p.hit_set_params
1872 << " " << p.hit_set_period << "s"
1873 << " x" << p.hit_set_count << " decay_rate "
1874 << p.hit_set_grade_decay_rate
1875 << " search_last_n " << p.hit_set_search_last_n;
1877 if (p.min_read_recency_for_promote)
1878 out << " min_read_recency_for_promote " << p.min_read_recency_for_promote;
1879 if (p.min_write_recency_for_promote)
1880 out << " min_write_recency_for_promote " << p.min_write_recency_for_promote;
1881 out << " stripe_width " << p.get_stripe_width();
1882 if (p.expected_num_objects)
1883 out << " expected_num_objects " << p.expected_num_objects;
1885 out << " fast_read " << p.fast_read;
1887 if (!p.application_metadata.empty()) {
1888 out << " application ";
// the begin() check singles out every entry except the first
// NOTE(review): presumably to emit a separator between entries -- confirm
1889 for (auto it = p.application_metadata.begin();
1890 it != p.application_metadata.end(); ++it) {
1891 if (it != p.application_metadata.begin())
1900 // -- object_stat_sum_t --
// Emit each counter of this stat sum to Formatter `f` as an integer.
// Output keys mostly equal the member names; the exceptions visible here are
// num_rd / num_rd_kb / num_wr / num_wr_kb, which are dumped as
// "num_read" / "num_read_kb" / "num_write" / "num_write_kb".
1902 void object_stat_sum_t::dump(Formatter *f) const
1904 f->dump_int("num_bytes", num_bytes);
1905 f->dump_int("num_objects", num_objects);
1906 f->dump_int("num_object_clones", num_object_clones);
1907 f->dump_int("num_object_copies", num_object_copies);
1908 f->dump_int("num_objects_missing_on_primary", num_objects_missing_on_primary);
1909 f->dump_int("num_objects_missing", num_objects_missing);
1910 f->dump_int("num_objects_degraded", num_objects_degraded);
1911 f->dump_int("num_objects_misplaced", num_objects_misplaced);
1912 f->dump_int("num_objects_unfound", num_objects_unfound);
1913 f->dump_int("num_objects_dirty", num_objects_dirty);
1914 f->dump_int("num_whiteouts", num_whiteouts);
1915 f->dump_int("num_read", num_rd);
1916 f->dump_int("num_read_kb", num_rd_kb);
1917 f->dump_int("num_write", num_wr);
1918 f->dump_int("num_write_kb", num_wr_kb);
1919 f->dump_int("num_scrub_errors", num_scrub_errors);
1920 f->dump_int("num_shallow_scrub_errors", num_shallow_scrub_errors);
1921 f->dump_int("num_deep_scrub_errors", num_deep_scrub_errors);
1922 f->dump_int("num_objects_recovered", num_objects_recovered);
1923 f->dump_int("num_bytes_recovered", num_bytes_recovered);
1924 f->dump_int("num_keys_recovered", num_keys_recovered);
1925 f->dump_int("num_objects_omap", num_objects_omap);
1926 f->dump_int("num_objects_hit_set_archive", num_objects_hit_set_archive);
1927 f->dump_int("num_bytes_hit_set_archive", num_bytes_hit_set_archive);
1928 f->dump_int("num_flush", num_flush);
1929 f->dump_int("num_flush_kb", num_flush_kb);
1930 f->dump_int("num_evict", num_evict);
1931 f->dump_int("num_evict_kb", num_evict_kb);
1932 f->dump_int("num_promote", num_promote);
1933 f->dump_int("num_flush_mode_high", num_flush_mode_high);
1934 f->dump_int("num_flush_mode_low", num_flush_mode_low);
1935 f->dump_int("num_evict_mode_some", num_evict_mode_some);
1936 f->dump_int("num_evict_mode_full", num_evict_mode_full);
1937 f->dump_int("num_objects_pinned", num_objects_pinned);
1938 f->dump_int("num_legacy_snapsets", num_legacy_snapsets);
// Serialize the counters under ENCODE_START(16, 14, bl).
//
// When CEPH_LITTLE_ENDIAN is defined, sizeof(object_stat_sum_t) bytes
// starting at &num_bytes are appended in a single raw copy.  The
// field-by-field sequence below belongs to the other side of the #if.
// NOTE(review): the raw copy relies on num_bytes being the first member and
// on the in-memory layout matching the field order written below, with no
// padding -- confirm against the struct definition before adding or
// reordering members.
1941 void object_stat_sum_t::encode(bufferlist& bl) const
1943 ENCODE_START(16, 14, bl);
1944 #if defined(CEPH_LITTLE_ENDIAN)
1945 bl.append((char *)(&num_bytes), sizeof(object_stat_sum_t));
1947 ::encode(num_bytes, bl);
1948 ::encode(num_objects, bl);
1949 ::encode(num_object_clones, bl);
1950 ::encode(num_object_copies, bl);
1951 ::encode(num_objects_missing_on_primary, bl);
1952 ::encode(num_objects_degraded, bl);
1953 ::encode(num_objects_unfound, bl);
1954 ::encode(num_rd, bl);
1955 ::encode(num_rd_kb, bl);
1956 ::encode(num_wr, bl);
1957 ::encode(num_wr_kb, bl);
1958 ::encode(num_scrub_errors, bl);
1959 ::encode(num_objects_recovered, bl);
1960 ::encode(num_bytes_recovered, bl);
1961 ::encode(num_keys_recovered, bl);
1962 ::encode(num_shallow_scrub_errors, bl);
1963 ::encode(num_deep_scrub_errors, bl);
1964 ::encode(num_objects_dirty, bl);
1965 ::encode(num_whiteouts, bl);
1966 ::encode(num_objects_omap, bl);
1967 ::encode(num_objects_hit_set_archive, bl);
1968 ::encode(num_objects_misplaced, bl);
1969 ::encode(num_bytes_hit_set_archive, bl);
1970 ::encode(num_flush, bl);
1971 ::encode(num_flush_kb, bl);
1972 ::encode(num_evict, bl);
1973 ::encode(num_evict_kb, bl);
1974 ::encode(num_promote, bl);
1975 ::encode(num_flush_mode_high, bl);
1976 ::encode(num_flush_mode_low, bl);
1977 ::encode(num_evict_mode_some, bl);
1978 ::encode(num_evict_mode_full, bl);
1979 ::encode(num_objects_pinned, bl);
1980 ::encode(num_objects_missing, bl);
1981 ::encode(num_legacy_snapsets, bl);
// Deserialize the counters (DECODE_START(16, bl)).
//
// Fast path: when CEPH_LITTLE_ENDIAN is defined and struct_v >= 16, the
// whole struct is filled by one raw copy of sizeof(object_stat_sum_t) bytes
// into the memory starting at &num_bytes, and decode_finish is set.
// Otherwise the counters are decoded one by one, in the same order as the
// field-by-field branch of encode().  num_legacy_snapsets is only present
// from v16 on; for older encodings it is set to num_object_clones.
1986 void object_stat_sum_t::decode(bufferlist::iterator& bl)
// set once the raw-copy fast path has consumed the payload
1988 bool decode_finish = false;
1989 DECODE_START(16, bl);
1990 #if defined(CEPH_LITTLE_ENDIAN)
1991 if (struct_v >= 16) {
1992 bl.copy(sizeof(object_stat_sum_t), (char*)(&num_bytes));
1993 decode_finish = true;
1996 if (!decode_finish) {
1997 ::decode(num_bytes, bl);
1998 ::decode(num_objects, bl);
1999 ::decode(num_object_clones, bl);
2000 ::decode(num_object_copies, bl);
2001 ::decode(num_objects_missing_on_primary, bl);
2002 ::decode(num_objects_degraded, bl);
2003 ::decode(num_objects_unfound, bl);
2004 ::decode(num_rd, bl);
2005 ::decode(num_rd_kb, bl);
2006 ::decode(num_wr, bl);
2007 ::decode(num_wr_kb, bl);
2008 ::decode(num_scrub_errors, bl);
2009 ::decode(num_objects_recovered, bl);
2010 ::decode(num_bytes_recovered, bl);
2011 ::decode(num_keys_recovered, bl);
2012 ::decode(num_shallow_scrub_errors, bl);
2013 ::decode(num_deep_scrub_errors, bl);
2014 ::decode(num_objects_dirty, bl);
2015 ::decode(num_whiteouts, bl);
2016 ::decode(num_objects_omap, bl);
2017 ::decode(num_objects_hit_set_archive, bl);
2018 ::decode(num_objects_misplaced, bl);
2019 ::decode(num_bytes_hit_set_archive, bl);
2020 ::decode(num_flush, bl);
2021 ::decode(num_flush_kb, bl);
2022 ::decode(num_evict, bl);
2023 ::decode(num_evict_kb, bl);
2024 ::decode(num_promote, bl);
2025 ::decode(num_flush_mode_high, bl);
2026 ::decode(num_flush_mode_low, bl);
2027 ::decode(num_evict_mode_some, bl);
2028 ::decode(num_evict_mode_full, bl);
2029 ::decode(num_objects_pinned, bl);
2030 ::decode(num_objects_missing, bl);
2031 if (struct_v >= 16) {
2032 ::decode(num_legacy_snapsets, bl);
2034 num_legacy_snapsets = num_object_clones; // upper bound
// Append a sample object_stat_sum_t to `o`.  The local `a` is given
// distinct values for most counters, then a heap-allocated copy is pushed.
// The copy is created with `new` and is not freed here.
2040 void object_stat_sum_t::generate_test_instances(list<object_stat_sum_t*>& o)
2042 object_stat_sum_t a;
2046 a.num_object_clones = 4;
2047 a.num_object_copies = 5;
2048 a.num_objects_missing_on_primary = 6;
2049 a.num_objects_missing = 123;
2050 a.num_objects_degraded = 7;
2051 a.num_objects_unfound = 8;
2052 a.num_rd = 9; a.num_rd_kb = 10;
2053 a.num_wr = 11; a.num_wr_kb = 12;
2054 a.num_objects_recovered = 14;
2055 a.num_bytes_recovered = 15;
2056 a.num_keys_recovered = 16;
2057 a.num_deep_scrub_errors = 17;
2058 a.num_shallow_scrub_errors = 18;
// keep the total consistent with its two components
2059 a.num_scrub_errors = a.num_deep_scrub_errors + a.num_shallow_scrub_errors;
2060 a.num_objects_dirty = 21;
2061 a.num_whiteouts = 22;
2062 a.num_objects_misplaced = 1232;
2063 a.num_objects_hit_set_archive = 2;
2064 a.num_bytes_hit_set_archive = 27;
2070 a.num_flush_mode_high = 0;
2071 a.num_flush_mode_low = 1;
2072 a.num_evict_mode_some = 1;
2073 a.num_evict_mode_full = 0;
2074 a.num_objects_pinned = 20;
2075 o.push_back(new object_stat_sum_t(a));
// Accumulate `o` into this object: each counter of `o` is added to the
// member of the same name.  Mirror image of sub(); keep both member lists in
// step when a counter is added.
2078 void object_stat_sum_t::add(const object_stat_sum_t& o)
2080 num_bytes += o.num_bytes;
2081 num_objects += o.num_objects;
2082 num_object_clones += o.num_object_clones;
2083 num_object_copies += o.num_object_copies;
2084 num_objects_missing_on_primary += o.num_objects_missing_on_primary;
2085 num_objects_missing += o.num_objects_missing;
2086 num_objects_degraded += o.num_objects_degraded;
2087 num_objects_misplaced += o.num_objects_misplaced;
2089 num_rd_kb += o.num_rd_kb;
2091 num_wr_kb += o.num_wr_kb;
2092 num_objects_unfound += o.num_objects_unfound;
2093 num_scrub_errors += o.num_scrub_errors;
2094 num_shallow_scrub_errors += o.num_shallow_scrub_errors;
2095 num_deep_scrub_errors += o.num_deep_scrub_errors;
2096 num_objects_recovered += o.num_objects_recovered;
2097 num_bytes_recovered += o.num_bytes_recovered;
2098 num_keys_recovered += o.num_keys_recovered;
2099 num_objects_dirty += o.num_objects_dirty;
2100 num_whiteouts += o.num_whiteouts;
2101 num_objects_omap += o.num_objects_omap;
2102 num_objects_hit_set_archive += o.num_objects_hit_set_archive;
2103 num_bytes_hit_set_archive += o.num_bytes_hit_set_archive;
2104 num_flush += o.num_flush;
2105 num_flush_kb += o.num_flush_kb;
2106 num_evict += o.num_evict;
2107 num_evict_kb += o.num_evict_kb;
2108 num_promote += o.num_promote;
2109 num_flush_mode_high += o.num_flush_mode_high;
2110 num_flush_mode_low += o.num_flush_mode_low;
2111 num_evict_mode_some += o.num_evict_mode_some;
2112 num_evict_mode_full += o.num_evict_mode_full;
2113 num_objects_pinned += o.num_objects_pinned;
2114 num_legacy_snapsets += o.num_legacy_snapsets;
// Subtract `o` from this object: each counter of `o` is subtracted from the
// member of the same name.  Mirror image of add(); keep both member lists in
// step when a counter is added.
2117 void object_stat_sum_t::sub(const object_stat_sum_t& o)
2119 num_bytes -= o.num_bytes;
2120 num_objects -= o.num_objects;
2121 num_object_clones -= o.num_object_clones;
2122 num_object_copies -= o.num_object_copies;
2123 num_objects_missing_on_primary -= o.num_objects_missing_on_primary;
2124 num_objects_missing -= o.num_objects_missing;
2125 num_objects_degraded -= o.num_objects_degraded;
2126 num_objects_misplaced -= o.num_objects_misplaced;
2128 num_rd_kb -= o.num_rd_kb;
2130 num_wr_kb -= o.num_wr_kb;
2131 num_objects_unfound -= o.num_objects_unfound;
2132 num_scrub_errors -= o.num_scrub_errors;
2133 num_shallow_scrub_errors -= o.num_shallow_scrub_errors;
2134 num_deep_scrub_errors -= o.num_deep_scrub_errors;
2135 num_objects_recovered -= o.num_objects_recovered;
2136 num_bytes_recovered -= o.num_bytes_recovered;
2137 num_keys_recovered -= o.num_keys_recovered;
2138 num_objects_dirty -= o.num_objects_dirty;
2139 num_whiteouts -= o.num_whiteouts;
2140 num_objects_omap -= o.num_objects_omap;
2141 num_objects_hit_set_archive -= o.num_objects_hit_set_archive;
2142 num_bytes_hit_set_archive -= o.num_bytes_hit_set_archive;
2143 num_flush -= o.num_flush;
2144 num_flush_kb -= o.num_flush_kb;
2145 num_evict -= o.num_evict;
2146 num_evict_kb -= o.num_evict_kb;
2147 num_promote -= o.num_promote;
2148 num_flush_mode_high -= o.num_flush_mode_high;
2149 num_flush_mode_low -= o.num_flush_mode_low;
2150 num_evict_mode_some -= o.num_evict_mode_some;
2151 num_evict_mode_full -= o.num_evict_mode_full;
2152 num_objects_pinned -= o.num_objects_pinned;
2153 num_legacy_snapsets -= o.num_legacy_snapsets;
// Member-wise equality for object_stat_sum_t: the result is the conjunction
// of the comparisons listed below.  A counter that is missing from this list
// is ignored by the comparison, so extend it whenever a member is added.
2156 bool operator==(const object_stat_sum_t& l, const object_stat_sum_t& r)
2159 l.num_bytes == r.num_bytes &&
2160 l.num_objects == r.num_objects &&
2161 l.num_object_clones == r.num_object_clones &&
2162 l.num_object_copies == r.num_object_copies &&
2163 l.num_objects_missing_on_primary == r.num_objects_missing_on_primary &&
2164 l.num_objects_missing == r.num_objects_missing &&
2165 l.num_objects_degraded == r.num_objects_degraded &&
2166 l.num_objects_misplaced == r.num_objects_misplaced &&
2167 l.num_objects_unfound == r.num_objects_unfound &&
2168 l.num_rd == r.num_rd &&
2169 l.num_rd_kb == r.num_rd_kb &&
2170 l.num_wr == r.num_wr &&
2171 l.num_wr_kb == r.num_wr_kb &&
2172 l.num_scrub_errors == r.num_scrub_errors &&
2173 l.num_shallow_scrub_errors == r.num_shallow_scrub_errors &&
2174 l.num_deep_scrub_errors == r.num_deep_scrub_errors &&
2175 l.num_objects_recovered == r.num_objects_recovered &&
2176 l.num_bytes_recovered == r.num_bytes_recovered &&
2177 l.num_keys_recovered == r.num_keys_recovered &&
2178 l.num_objects_dirty == r.num_objects_dirty &&
2179 l.num_whiteouts == r.num_whiteouts &&
2180 l.num_objects_omap == r.num_objects_omap &&
2181 l.num_objects_hit_set_archive == r.num_objects_hit_set_archive &&
2182 l.num_bytes_hit_set_archive == r.num_bytes_hit_set_archive &&
2183 l.num_flush == r.num_flush &&
2184 l.num_flush_kb == r.num_flush_kb &&
2185 l.num_evict == r.num_evict &&
2186 l.num_evict_kb == r.num_evict_kb &&
2187 l.num_promote == r.num_promote &&
2188 l.num_flush_mode_high == r.num_flush_mode_high &&
2189 l.num_flush_mode_low == r.num_flush_mode_low &&
2190 l.num_evict_mode_some == r.num_evict_mode_some &&
2191 l.num_evict_mode_full == r.num_evict_mode_full &&
2192 l.num_objects_pinned == r.num_objects_pinned &&
2193 l.num_legacy_snapsets == r.num_legacy_snapsets;
2196 // -- object_stat_collection_t --
// Dump this collection to Formatter `f`; its content is placed inside an
// object section named "stat_sum".
2198 void object_stat_collection_t::dump(Formatter *f) const
2200 f->open_object_section("stat_sum");
// Serialize this collection under ENCODE_START(2, 2, bl).  A literal
// (__u32)0 is written as part of the payload.
// NOTE(review): presumably this zero is the element count of an empty map,
// matching the map<string,object_stat_sum_t> that decode() still reads --
// confirm.
2205 void object_stat_collection_t::encode(bufferlist& bl) const
2207 ENCODE_START(2, 2, bl);
2209 ::encode((__u32)0, bl);
// Deserialize this collection (DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl)).
// A map<string,object_stat_sum_t> is decoded into the local `cat_sum`; no
// use of that local is visible here, so its content appears to be read only
// to consume it from the stream.
2213 void object_stat_collection_t::decode(bufferlist::iterator& bl)
2215 DECODE_START_LEGACY_COMPAT_LEN(2, 2, 2, bl);
2218 map<string,object_stat_sum_t> cat_sum;
2219 ::decode(cat_sum, bl);
// Append sample collections to `o`: first a copy of a default-constructed
// `a`, then one further copy of `a` per instance produced by
// object_stat_sum_t::generate_test_instances().
// NOTE(review): presumably `a` is updated from each *p inside the loop
// before it is copied -- confirm.  The objects are created with `new` and
// are not freed here; nothing visible frees the entries of `l` either.
2224 void object_stat_collection_t::generate_test_instances(list<object_stat_collection_t*>& o)
2226 object_stat_collection_t a;
2227 o.push_back(new object_stat_collection_t(a));
2228 list<object_stat_sum_t*> l;
2229 object_stat_sum_t::generate_test_instances(l);
2230 for (list<object_stat_sum_t*>::iterator p = l.begin(); p != l.end(); ++p) {
2232 o.push_back(new object_stat_collection_t(a));
// Test whether `osd` belongs to this PG's acting set.
//   primary == true : only a match against acting_primary is considered.
//   primary == false: the `acting` vector is scanned.
// NOTE(review): the return statements are not visible here; presumably true
// on a match and false otherwise -- confirm.
2239 bool pg_stat_t::is_acting_osd(int32_t osd, bool primary) const
2241 if (primary && osd == acting_primary) {
2243 } else if (!primary) {
2244 for(vector<int32_t>::const_iterator it = acting.begin();
2245 it != acting.end(); ++it)
// Dump this PG stat record to Formatter `f`.
// Versions and timestamps go through dump_stream, epochs and split bits
// through dump_unsigned, log sizes through dump_int, and the *_invalid flags
// through dump_bool.  The up, acting and blocked_by OSD lists are emitted as
// array sections whose elements are all named "osd".
2254 void pg_stat_t::dump(Formatter *f) const
2256 f->dump_stream("version") << version;
2257 f->dump_stream("reported_seq") << reported_seq;
2258 f->dump_stream("reported_epoch") << reported_epoch;
// state is dumped as its string form, not as the raw value
2259 f->dump_string("state", pg_state_string(state));
2260 f->dump_stream("last_fresh") << last_fresh;
2261 f->dump_stream("last_change") << last_change;
2262 f->dump_stream("last_active") << last_active;
2263 f->dump_stream("last_peered") << last_peered;
2264 f->dump_stream("last_clean") << last_clean;
2265 f->dump_stream("last_became_active") << last_became_active;
2266 f->dump_stream("last_became_peered") << last_became_peered;
2267 f->dump_stream("last_unstale") << last_unstale;
2268 f->dump_stream("last_undegraded") << last_undegraded;
2269 f->dump_stream("last_fullsized") << last_fullsized;
2270 f->dump_unsigned("mapping_epoch", mapping_epoch);
2271 f->dump_stream("log_start") << log_start;
2272 f->dump_stream("ondisk_log_start") << ondisk_log_start;
2273 f->dump_unsigned("created", created);
2274 f->dump_unsigned("last_epoch_clean", last_epoch_clean);
2275 f->dump_stream("parent") << parent;
2276 f->dump_unsigned("parent_split_bits", parent_split_bits);
2277 f->dump_stream("last_scrub") << last_scrub;
2278 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2279 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2280 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2281 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
2282 f->dump_int("log_size", log_size);
2283 f->dump_int("ondisk_log_size", ondisk_log_size);
2284 f->dump_bool("stats_invalid", stats_invalid);
2285 f->dump_bool("dirty_stats_invalid", dirty_stats_invalid);
2286 f->dump_bool("omap_stats_invalid", omap_stats_invalid);
2287 f->dump_bool("hitset_stats_invalid", hitset_stats_invalid);
2288 f->dump_bool("hitset_bytes_stats_invalid", hitset_bytes_stats_invalid);
2289 f->dump_bool("pin_stats_invalid", pin_stats_invalid);
2291 f->open_array_section("up");
2292 for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
2293 f->dump_int("osd", *p);
2295 f->open_array_section("acting");
2296 for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2297 f->dump_int("osd", *p);
2299 f->open_array_section("blocked_by");
2300 for (vector<int32_t>::const_iterator p = blocked_by.begin();
2301 p != blocked_by.end(); ++p)
2302 f->dump_int("osd", *p);
2304 f->dump_int("up_primary", up_primary);
2305 f->dump_int("acting_primary", acting_primary);
// Abbreviated form of dump(): emits only the state string, the up and acting
// OSD arrays (elements named "osd"), and the two primaries.
2308 void pg_stat_t::dump_brief(Formatter *f) const
2310 f->dump_string("state", pg_state_string(state));
2311 f->open_array_section("up");
2312 for (vector<int32_t>::const_iterator p = up.begin(); p != up.end(); ++p)
2313 f->dump_int("osd", *p);
2315 f->open_array_section("acting");
2316 for (vector<int32_t>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2317 f->dump_int("osd", *p);
2319 f->dump_int("up_primary", up_primary);
2320 f->dump_int("acting_primary", acting_primary);
// Serialize this PG stat record under ENCODE_START(22, 22, bl).
// The field order is wire format and is not the declaration or dump order
// (e.g. last_peered / last_became_peered come near the end); decode() reads
// the fields back in exactly this sequence.
2323 void pg_stat_t::encode(bufferlist &bl) const
2325 ENCODE_START(22, 22, bl);
2326 ::encode(version, bl);
2327 ::encode(reported_seq, bl);
2328 ::encode(reported_epoch, bl);
2329 ::encode(state, bl);
2330 ::encode(log_start, bl);
2331 ::encode(ondisk_log_start, bl);
2332 ::encode(created, bl);
2333 ::encode(last_epoch_clean, bl);
2334 ::encode(parent, bl);
2335 ::encode(parent_split_bits, bl);
2336 ::encode(last_scrub, bl);
2337 ::encode(last_scrub_stamp, bl);
2338 ::encode(stats, bl);
2339 ::encode(log_size, bl);
2340 ::encode(ondisk_log_size, bl);
2342 ::encode(acting, bl);
2343 ::encode(last_fresh, bl);
2344 ::encode(last_change, bl);
2345 ::encode(last_active, bl);
2346 ::encode(last_clean, bl);
2347 ::encode(last_unstale, bl);
2348 ::encode(mapping_epoch, bl);
2349 ::encode(last_deep_scrub, bl);
2350 ::encode(last_deep_scrub_stamp, bl);
2351 ::encode(stats_invalid, bl);
2352 ::encode(last_clean_scrub_stamp, bl);
2353 ::encode(last_became_active, bl);
2354 ::encode(dirty_stats_invalid, bl);
2355 ::encode(up_primary, bl);
2356 ::encode(acting_primary, bl);
2357 ::encode(omap_stats_invalid, bl);
2358 ::encode(hitset_stats_invalid, bl);
2359 ::encode(blocked_by, bl);
2360 ::encode(last_undegraded, bl);
2361 ::encode(last_fullsized, bl);
2362 ::encode(hitset_bytes_stats_invalid, bl);
2363 ::encode(last_peered, bl);
2364 ::encode(last_became_peered, bl);
2365 ::encode(pin_stats_invalid, bl);
// Deserialize this PG stat record (DECODE_START(22, bl)), reading the fields
// in the order encode() writes them.
// The *_invalid flags are not decoded in place: each one is assigned from a
// temporary `tmp`.
// NOTE(review): the declaration of `tmp` and the decode calls that fill it
// are not visible here -- confirm its type matches what encode() writes for
// those flags.
2369 void pg_stat_t::decode(bufferlist::iterator &bl)
2372 DECODE_START(22, bl);
2373 ::decode(version, bl);
2374 ::decode(reported_seq, bl);
2375 ::decode(reported_epoch, bl);
2376 ::decode(state, bl);
2377 ::decode(log_start, bl);
2378 ::decode(ondisk_log_start, bl);
2379 ::decode(created, bl);
2380 ::decode(last_epoch_clean, bl);
2381 ::decode(parent, bl);
2382 ::decode(parent_split_bits, bl);
2383 ::decode(last_scrub, bl);
2384 ::decode(last_scrub_stamp, bl);
2385 ::decode(stats, bl);
2386 ::decode(log_size, bl);
2387 ::decode(ondisk_log_size, bl);
2389 ::decode(acting, bl);
2390 ::decode(last_fresh, bl);
2391 ::decode(last_change, bl);
2392 ::decode(last_active, bl);
2393 ::decode(last_clean, bl);
2394 ::decode(last_unstale, bl);
2395 ::decode(mapping_epoch, bl);
2396 ::decode(last_deep_scrub, bl);
2397 ::decode(last_deep_scrub_stamp, bl);
2399 stats_invalid = tmp;
2400 ::decode(last_clean_scrub_stamp, bl);
2401 ::decode(last_became_active, bl);
2403 dirty_stats_invalid = tmp;
2404 ::decode(up_primary, bl);
2405 ::decode(acting_primary, bl);
2407 omap_stats_invalid = tmp;
2409 hitset_stats_invalid = tmp;
2410 ::decode(blocked_by, bl);
2411 ::decode(last_undegraded, bl);
2412 ::decode(last_fullsized, bl);
2414 hitset_bytes_stats_invalid = tmp;
2415 ::decode(last_peered, bl);
2416 ::decode(last_became_peered, bl);
2418 pin_stats_invalid = tmp;
// Append sample pg_stat_t objects to `o`.  A single local `a` is filled in
// progressively and a heap-allocated copy is pushed at three points, so each
// later instance extends the previous one (the last adds a second up/acting
// OSD and two blocked_by entries).  The objects are created with `new` and
// are not freed here.
2422 void pg_stat_t::generate_test_instances(list<pg_stat_t*>& o)
2425 o.push_back(new pg_stat_t(a));
2427 a.version = eversion_t(1, 3);
2428 a.reported_epoch = 1;
2431 a.mapping_epoch = 998;
2432 a.last_fresh = utime_t(1002, 1);
2433 a.last_change = utime_t(1002, 2);
2434 a.last_active = utime_t(1002, 3);
2435 a.last_clean = utime_t(1002, 4);
2436 a.last_unstale = utime_t(1002, 5);
2437 a.last_undegraded = utime_t(1002, 7);
2438 a.last_fullsized = utime_t(1002, 8);
2439 a.log_start = eversion_t(1, 4);
2440 a.ondisk_log_start = eversion_t(1, 5);
2442 a.last_epoch_clean = 7;
2443 a.parent = pg_t(1, 2, 3);
2444 a.parent_split_bits = 12;
2445 a.last_scrub = eversion_t(9, 10);
2446 a.last_scrub_stamp = utime_t(11, 12);
2447 a.last_deep_scrub = eversion_t(13, 14);
2448 a.last_deep_scrub_stamp = utime_t(15, 16);
2449 a.last_clean_scrub_stamp = utime_t(17, 18);
// reuse the last sample collection as this record's stats; nothing visible
// here frees the entries of `l`
2450 list<object_stat_collection_t*> l;
2451 object_stat_collection_t::generate_test_instances(l);
2452 a.stats = *l.back();
2454 a.ondisk_log_size = 88;
2455 a.up.push_back(123);
2457 a.acting.push_back(456);
2458 a.acting_primary = 456;
2459 o.push_back(new pg_stat_t(a));
2461 a.up.push_back(124);
2463 a.acting.push_back(124);
2464 a.acting_primary = 124;
2465 a.blocked_by.push_back(155);
2466 a.blocked_by.push_back(156);
2467 o.push_back(new pg_stat_t(a));
// Member-wise equality for pg_stat_t: the result is the conjunction of the
// comparisons listed below.  A member that is missing from this list is
// ignored by the comparison, so extend it whenever a field is added.
2470 bool operator==(const pg_stat_t& l, const pg_stat_t& r)
2473 l.version == r.version &&
2474 l.reported_seq == r.reported_seq &&
2475 l.reported_epoch == r.reported_epoch &&
2476 l.state == r.state &&
2477 l.last_fresh == r.last_fresh &&
2478 l.last_change == r.last_change &&
2479 l.last_active == r.last_active &&
2480 l.last_peered == r.last_peered &&
2481 l.last_clean == r.last_clean &&
2482 l.last_unstale == r.last_unstale &&
2483 l.last_undegraded == r.last_undegraded &&
2484 l.last_fullsized == r.last_fullsized &&
2485 l.log_start == r.log_start &&
2486 l.ondisk_log_start == r.ondisk_log_start &&
2487 l.created == r.created &&
2488 l.last_epoch_clean == r.last_epoch_clean &&
2489 l.parent == r.parent &&
2490 l.parent_split_bits == r.parent_split_bits &&
2491 l.last_scrub == r.last_scrub &&
2492 l.last_deep_scrub == r.last_deep_scrub &&
2493 l.last_scrub_stamp == r.last_scrub_stamp &&
2494 l.last_deep_scrub_stamp == r.last_deep_scrub_stamp &&
2495 l.last_clean_scrub_stamp == r.last_clean_scrub_stamp &&
2496 l.stats == r.stats &&
2497 l.stats_invalid == r.stats_invalid &&
2498 l.log_size == r.log_size &&
2499 l.ondisk_log_size == r.ondisk_log_size &&
2501 l.acting == r.acting &&
2502 l.mapping_epoch == r.mapping_epoch &&
2503 l.blocked_by == r.blocked_by &&
2504 l.last_became_active == r.last_became_active &&
2505 l.last_became_peered == r.last_became_peered &&
2506 l.dirty_stats_invalid == r.dirty_stats_invalid &&
2507 l.omap_stats_invalid == r.omap_stats_invalid &&
2508 l.hitset_stats_invalid == r.hitset_stats_invalid &&
2509 l.hitset_bytes_stats_invalid == r.hitset_bytes_stats_invalid &&
2510 l.up_primary == r.up_primary &&
2511 l.acting_primary == r.acting_primary &&
2512 l.pin_stats_invalid == r.pin_stats_invalid;
2515 // -- pool_stat_t --
// Writes this pool_stat_t to the Formatter `f`; log_size, ondisk_log_size,
// up and acting are emitted as integer fields under those names.
2517 void pool_stat_t::dump(Formatter *f) const
2520 f->dump_int("log_size", log_size);
2521 f->dump_int("ondisk_log_size", ondisk_log_size);
2522 f->dump_int("up", up);
2523 f->dump_int("acting", acting);
// Serializes the pool stats into `bl`, choosing the layout from `features`.
2526 void pool_stat_t::encode(bufferlist &bl, uint64_t features) const
// Peer without CEPH_FEATURE_OSDENC: stats, log_size and ondisk_log_size are
// written in this branch (presumably an older wire layout -- TODO confirm).
2528 if ((features & CEPH_FEATURE_OSDENC) == 0) {
2531 ::encode(stats, bl);
2532 ::encode(log_size, bl);
2533 ::encode(ondisk_log_size, bl);
// Versioned layout: struct version 6, compat version 5.
2537 ENCODE_START(6, 5, bl);
2538 ::encode(stats, bl);
2539 ::encode(log_size, bl);
2540 ::encode(ondisk_log_size, bl);
2542 ::encode(acting, bl);
// Deserializes a pool_stat_t from `bl`.  For struct_v >= 4 the aggregate
// `stats`, log_size and ondisk_log_size are read directly, and `acting` is
// read when struct_v >= 6.  The remaining statements read individual
// stats.sum counters one by one, with the rd/wr counters gated on
// struct_v >= 2 and num_objects_unfound on struct_v >= 3.
2546 void pool_stat_t::decode(bufferlist::iterator &bl)
2548 DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
2549 if (struct_v >= 4) {
2550 ::decode(stats, bl);
2551 ::decode(log_size, bl);
2552 ::decode(ondisk_log_size, bl);
2553 if (struct_v >= 6) {
2555 ::decode(acting, bl);
2561 ::decode(stats.sum.num_bytes, bl);
// num_kb is decoded into a separate variable rather than into stats.sum.
2563 ::decode(num_kb, bl);
2564 ::decode(stats.sum.num_objects, bl);
2565 ::decode(stats.sum.num_object_clones, bl);
2566 ::decode(stats.sum.num_object_copies, bl);
2567 ::decode(stats.sum.num_objects_missing_on_primary, bl);
2568 ::decode(stats.sum.num_objects_degraded, bl);
2569 ::decode(log_size, bl);
2570 ::decode(ondisk_log_size, bl);
2571 if (struct_v >= 2) {
2572 ::decode(stats.sum.num_rd, bl);
2573 ::decode(stats.sum.num_rd_kb, bl);
2574 ::decode(stats.sum.num_wr, bl);
2575 ::decode(stats.sum.num_wr_kb, bl);
2577 if (struct_v >= 3) {
2578 ::decode(stats.sum.num_objects_unfound, bl);
// Appends heap-allocated pool_stat_t samples to `o`; each push copies the
// local `a` in its state at that point.
2584 void pool_stat_t::generate_test_instances(list<pool_stat_t*>& o)
2587 o.push_back(new pool_stat_t(a));
// Use the last object_stat_collection_t test instance as the stats payload.
2589 list<object_stat_collection_t*> l;
2590 object_stat_collection_t::generate_test_instances(l);
2591 a.stats = *l.back();
2593 a.ondisk_log_size = 456;
2596 o.push_back(new pool_stat_t(a));
2600 // -- pg_history_t --
// Serializes the history into `bl` as struct version 9 (compat version 4).
// Fields are written in the order listed; decode() reads them in the same
// order.
2602 void pg_history_t::encode(bufferlist &bl) const
2604 ENCODE_START(9, 4, bl);
2605 ::encode(epoch_created, bl);
2606 ::encode(last_epoch_started, bl);
2607 ::encode(last_epoch_clean, bl);
2608 ::encode(last_epoch_split, bl);
2609 ::encode(same_interval_since, bl);
2610 ::encode(same_up_since, bl);
2611 ::encode(same_primary_since, bl);
2612 ::encode(last_scrub, bl);
2613 ::encode(last_scrub_stamp, bl);
2614 ::encode(last_deep_scrub, bl);
2615 ::encode(last_deep_scrub_stamp, bl);
2616 ::encode(last_clean_scrub_stamp, bl);
2617 ::encode(last_epoch_marked_full, bl);
2618 ::encode(last_interval_started, bl);
2619 ::encode(last_interval_clean, bl);
2620 ::encode(epoch_pool_created, bl);
// Deserializes a pg_history_t from `bl`.  Newer fields are gated on
// struct_v; where a field is absent from the encoding, a value is derived
// from fields that were decoded.
2624 void pg_history_t::decode(bufferlist::iterator &bl)
2626 DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl);
2627 ::decode(epoch_created, bl);
2628 ::decode(last_epoch_started, bl);
2630 ::decode(last_epoch_clean, bl);
2632 last_epoch_clean = last_epoch_started; // careful, it's a lie!
2633 ::decode(last_epoch_split, bl);
2634 ::decode(same_interval_since, bl);
2635 ::decode(same_up_since, bl);
2636 ::decode(same_primary_since, bl);
2637 if (struct_v >= 2) {
2638 ::decode(last_scrub, bl);
2639 ::decode(last_scrub_stamp, bl);
2641 if (struct_v >= 5) {
2642 ::decode(last_deep_scrub, bl);
2643 ::decode(last_deep_scrub_stamp, bl);
2645 if (struct_v >= 6) {
2646 ::decode(last_clean_scrub_stamp, bl);
2648 if (struct_v >= 7) {
2649 ::decode(last_epoch_marked_full, bl);
2651 if (struct_v >= 8) {
2652 ::decode(last_interval_started, bl);
2653 ::decode(last_interval_clean, bl);
// Derived values for last_interval_started / last_interval_clean: take
// same_interval_since when the corresponding epoch is at or past it, else
// the epoch itself (presumably the struct_v < 8 path -- TODO confirm).
2655 if (last_epoch_started >= same_interval_since) {
2656 last_interval_started = same_interval_since;
2658 last_interval_started = last_epoch_started; // best guess
2660 if (last_epoch_clean >= same_interval_since) {
2661 last_interval_clean = same_interval_since;
2663 last_interval_clean = last_epoch_clean; // best guess
2666 if (struct_v >= 9) {
2667 ::decode(epoch_pool_created, bl);
// epoch_pool_created may also be taken from epoch_created.
2669 epoch_pool_created = epoch_created;
// Writes the history to the Formatter `f`: epoch-valued members as integer
// fields, scrub versions and timestamps via dump_stream (operator<<).
2674 void pg_history_t::dump(Formatter *f) const
2676 f->dump_int("epoch_created", epoch_created);
2677 f->dump_int("epoch_pool_created", epoch_pool_created);
2678 f->dump_int("last_epoch_started", last_epoch_started);
2679 f->dump_int("last_interval_started", last_interval_started);
2680 f->dump_int("last_epoch_clean", last_epoch_clean);
2681 f->dump_int("last_interval_clean", last_interval_clean);
2682 f->dump_int("last_epoch_split", last_epoch_split);
2683 f->dump_int("last_epoch_marked_full", last_epoch_marked_full);
2684 f->dump_int("same_up_since", same_up_since);
2685 f->dump_int("same_interval_since", same_interval_since);
2686 f->dump_int("same_primary_since", same_primary_since);
2687 f->dump_stream("last_scrub") << last_scrub;
2688 f->dump_stream("last_scrub_stamp") << last_scrub_stamp;
2689 f->dump_stream("last_deep_scrub") << last_deep_scrub;
2690 f->dump_stream("last_deep_scrub_stamp") << last_deep_scrub_stamp;
2691 f->dump_stream("last_clean_scrub_stamp") << last_clean_scrub_stamp;
// Appends two heap-allocated samples to `o`: a default-constructed
// pg_history_t, then a second one whose fields are set to the distinct
// values below.
2694 void pg_history_t::generate_test_instances(list<pg_history_t*>& o)
2696 o.push_back(new pg_history_t);
2697 o.push_back(new pg_history_t);
2698 o.back()->epoch_created = 1;
2699 o.back()->epoch_pool_created = 1;
2700 o.back()->last_epoch_started = 2;
2701 o.back()->last_interval_started = 2;
2702 o.back()->last_epoch_clean = 3;
2703 o.back()->last_interval_clean = 2;
2704 o.back()->last_epoch_split = 4;
2705 o.back()->same_up_since = 5;
2706 o.back()->same_interval_since = 6;
2707 o.back()->same_primary_since = 7;
2708 o.back()->last_scrub = eversion_t(8, 9);
2709 o.back()->last_scrub_stamp = utime_t(10, 11);
2710 o.back()->last_deep_scrub = eversion_t(12, 13);
2711 o.back()->last_deep_scrub_stamp = utime_t(14, 15);
2712 o.back()->last_clean_scrub_stamp = utime_t(16, 17);
2713 o.back()->last_epoch_marked_full = 18;
// Serializes the pg info into `bl` as struct version 32 (compat version 26).
2719 void pg_info_t::encode(bufferlist &bl) const
2721 ENCODE_START(32, 26, bl);
2722 ::encode(pgid.pgid, bl);
2723 ::encode(last_update, bl);
2724 ::encode(last_complete, bl);
2725 ::encode(log_tail, bl);
// Early last_backfill slot: a default hobject_t() is written when the value
// is bitwise-sorted and not max.  The actual last_backfill and the bitwise
// flag are encoded again further down, after pgid.shard.
2726 if (last_backfill_bitwise && !last_backfill.is_max()) {
2727 ::encode(hobject_t(), bl);
2729 ::encode(last_backfill, bl);
2731 ::encode(stats, bl);
2733 ::encode(purged_snaps, bl);
2734 ::encode(last_epoch_started, bl);
2735 ::encode(last_user_version, bl);
2736 ::encode(hit_set, bl);
2737 ::encode(pgid.shard, bl);
2738 ::encode(last_backfill, bl);
2739 ::encode(last_backfill_bitwise, bl);
2740 ::encode(last_interval_started, bl);
// Deserializes a pg_info_t from `bl`, reading fields in the order encode()
// writes them.
2744 void pg_info_t::decode(bufferlist::iterator &bl)
2746 DECODE_START(32, bl);
2747 ::decode(pgid.pgid, bl);
2748 ::decode(last_update, bl);
2749 ::decode(last_complete, bl);
2750 ::decode(log_tail, bl);
// The early last_backfill slot is read into a local; the member
// last_backfill is decoded later, after pgid.shard.
2752 hobject_t old_last_backfill;
2753 ::decode(old_last_backfill, bl);
2755 ::decode(stats, bl);
2757 ::decode(purged_snaps, bl);
2758 ::decode(last_epoch_started, bl);
2759 ::decode(last_user_version, bl);
2760 ::decode(hit_set, bl);
2761 ::decode(pgid.shard, bl);
2762 ::decode(last_backfill, bl);
2763 ::decode(last_backfill_bitwise, bl);
// last_interval_started is present from struct version 32 on; it may also be
// taken from last_epoch_started (presumably for older encodings).
2764 if (struct_v >= 32) {
2765 ::decode(last_interval_started, bl);
2767 last_interval_started = last_epoch_started;
// Writes the pg info to the Formatter `f`: identity and log bounds as
// streams, purged_snaps as an array of {start, length} objects, then the
// "history", "stats" and "hit_set_history" object sections along with a few
// derived flags.
2774 void pg_info_t::dump(Formatter *f) const
2776 f->dump_stream("pgid") << pgid;
2777 f->dump_stream("last_update") << last_update;
2778 f->dump_stream("last_complete") << last_complete;
2779 f->dump_stream("log_tail") << log_tail;
2780 f->dump_int("last_user_version", last_user_version);
2781 f->dump_stream("last_backfill") << last_backfill;
2782 f->dump_int("last_backfill_bitwise", (int)last_backfill_bitwise);
2783 f->open_array_section("purged_snaps");
2784 for (interval_set<snapid_t>::const_iterator i=purged_snaps.begin();
2785 i != purged_snaps.end();
2787 f->open_object_section("purged_snap_interval");
2788 f->dump_stream("start") << i.get_start();
2789 f->dump_stream("length") << i.get_len();
2793 f->open_object_section("history");
2796 f->open_object_section("stats");
// Results of the is_empty(), dne() and is_incomplete() predicates, as ints.
2800 f->dump_int("empty", is_empty());
2801 f->dump_int("dne", dne());
2802 f->dump_int("incomplete", is_incomplete());
2803 f->dump_int("last_epoch_started", last_epoch_started);
2805 f->open_object_section("hit_set_history");
// Appends two heap-allocated samples to `o`: a default-constructed
// pg_info_t, then one populated from the last test instances of
// pg_history_t, pg_stat_t and pg_hit_set_history_t plus the literals below.
2810 void pg_info_t::generate_test_instances(list<pg_info_t*>& o)
2812 o.push_back(new pg_info_t);
2813 o.push_back(new pg_info_t);
2814 list<pg_history_t*> h;
2815 pg_history_t::generate_test_instances(h);
2816 o.back()->history = *h.back();
2817 o.back()->pgid = spg_t(pg_t(1, 2, -1), shard_id_t::NO_SHARD);
2818 o.back()->last_update = eversion_t(3, 4);
2819 o.back()->last_complete = eversion_t(5, 6);
2820 o.back()->last_user_version = 2;
2821 o.back()->log_tail = eversion_t(7, 8);
2822 o.back()->last_backfill = hobject_t(object_t("objname"), "key", 123, 456, -1, "");
2823 o.back()->last_backfill_bitwise = true;
2826 pg_stat_t::generate_test_instances(s);
2827 o.back()->stats = *s.back();
2830 list<pg_hit_set_history_t*> s;
2831 pg_hit_set_history_t::generate_test_instances(s);
2832 o.back()->hit_set = *s.back();
2836 // -- pg_notify_t --
// Serializes the notify into `bl` as struct version 2 (compat version 2);
// query_epoch is written first, then epoch_sent.
2837 void pg_notify_t::encode(bufferlist &bl) const
2839 ENCODE_START(2, 2, bl);
2840 ::encode(query_epoch, bl);
2841 ::encode(epoch_sent, bl);
// Deserializes a pg_notify_t from `bl`; query_epoch is read first, then
// epoch_sent, matching encode().
2848 void pg_notify_t::decode(bufferlist::iterator &bl)
2850 DECODE_START(2, bl);
2851 ::decode(query_epoch, bl);
2852 ::decode(epoch_sent, bl);
// Writes the notify to the Formatter `f`: from/to as signed integers, the
// two epochs as unsigned integers, followed by an "info" object section.
2859 void pg_notify_t::dump(Formatter *f) const
2861 f->dump_int("from", from);
2862 f->dump_int("to", to);
2863 f->dump_unsigned("query_epoch", query_epoch);
2864 f->dump_unsigned("epoch_sent", epoch_sent);
2866 f->open_object_section("info");
// Appends two heap-allocated samples to `o`, both carrying a
// default-constructed pg_info_t: one whose second shard argument is
// NO_SHARD, and one with both shard arguments set to shard 0.
2872 void pg_notify_t::generate_test_instances(list<pg_notify_t*>& o)
2874 o.push_back(new pg_notify_t(shard_id_t(3), shard_id_t::NO_SHARD, 1, 1, pg_info_t()));
2875 o.push_back(new pg_notify_t(shard_id_t(0), shard_id_t(0), 3, 10, pg_info_t()));
// Streams a compact description of `notify` to `lhs`: the query and sent
// epochs, then the pg info.  The "from->to" shard pair is appended only when
// at least one of the two is not NO_SHARD.
2878 ostream &operator<<(ostream &lhs, const pg_notify_t &notify)
2880 lhs << "(query:" << notify.query_epoch
2881 << " sent:" << notify.epoch_sent
2882 << " " << notify.info;
2883 if (notify.from != shard_id_t::NO_SHARD ||
2884 notify.to != shard_id_t::NO_SHARD)
2885 lhs << " " << (unsigned)notify.from
2886 << "->" << (unsigned)notify.to;
2890 // -- pg_interval_t --
// Serializes the interval into `bl` as struct version 4 (compat version 2).
2892 void PastIntervals::pg_interval_t::encode(bufferlist& bl) const
2894 ENCODE_START(4, 2, bl);
2895 ::encode(first, bl);
2898 ::encode(acting, bl);
2899 ::encode(maybe_went_rw, bl);
2900 ::encode(primary, bl);
2901 ::encode(up_primary, bl);
// Deserializes a pg_interval_t from `bl`.  `primary` is part of the encoding
// from struct version 3 and `up_primary` from struct version 4.
2905 void PastIntervals::pg_interval_t::decode(bufferlist::iterator& bl)
2907 DECODE_START_LEGACY_COMPAT_LEN(4, 2, 2, bl);
2908 ::decode(first, bl);
2911 ::decode(acting, bl);
2912 ::decode(maybe_went_rw, bl);
2913 if (struct_v >= 3) {
2914 ::decode(primary, bl);
// Alternative source for primary: the first acting OSD.
// NOTE(review): acting[0] needs a non-empty acting vector -- confirm the guard.
2917 primary = acting[0];
2919 if (struct_v >= 4) {
2920 ::decode(up_primary, bl);
// Writes the interval to the Formatter `f`: the first/last epochs,
// maybe_went_rw as 0/1, the up and acting sets as arrays of "osd" integers,
// and the two primaries.
2928 void PastIntervals::pg_interval_t::dump(Formatter *f) const
2930 f->dump_unsigned("first", first);
2931 f->dump_unsigned("last", last);
2932 f->dump_int("maybe_went_rw", maybe_went_rw ? 1 : 0);
2933 f->open_array_section("up");
2934 for (vector<int>::const_iterator p = up.begin(); p != up.end(); ++p)
2935 f->dump_int("osd", *p);
2937 f->open_array_section("acting");
2938 for (vector<int>::const_iterator p = acting.begin(); p != acting.end(); ++p)
2939 f->dump_int("osd", *p);
2941 f->dump_int("primary", primary);
2942 f->dump_int("up_primary", up_primary);
// Appends two heap-allocated samples to `o`: a default-constructed
// pg_interval_t, then one with one up OSD, two acting OSDs, first = 4 and
// maybe_went_rw set.
2945 void PastIntervals::pg_interval_t::generate_test_instances(list<pg_interval_t*>& o)
2947 o.push_back(new pg_interval_t);
2948 o.push_back(new pg_interval_t);
2949 o.back()->up.push_back(1);
2950 o.back()->acting.push_back(2);
2951 o.back()->acting.push_back(3);
2952 o.back()->first = 4;
2954 o.back()->maybe_went_rw = true;
2957 WRITE_CLASS_ENCODER(PastIntervals::pg_interval_t)
// interval_rep implementation that keeps every pg_interval_t it is given,
// in a map keyed by the interval's first epoch.
2959 class pi_simple_rep : public PastIntervals::interval_rep {
// Stored intervals; add_interval() files each one under interval.first.
2960 map<epoch_t, PastIntervals::pg_interval_t> interval_map;
// Bulk construction: add each supplied interval in list order.
2964 std::list<PastIntervals::pg_interval_t> &&intervals) {
2965 for (auto &&i: intervals)
2966 add_interval(ec_pool, i);
2970 pi_simple_rep() = default;
2971 pi_simple_rep(const pi_simple_rep &) = default;
2972 pi_simple_rep(pi_simple_rep &&) = default;
2973 pi_simple_rep &operator=(pi_simple_rep &&) = default;
2974 pi_simple_rep &operator=(const pi_simple_rep &) = default;
2976 size_t size() const override { return interval_map.size(); }
2977 bool empty() const override { return interval_map.empty(); }
2978 void clear() override { interval_map.clear(); }
// Epoch bounds of the stored intervals: (0, 0) when the map is empty,
// otherwise a pair whose upper bound is the newest interval's last + 1.
2979 pair<epoch_t, epoch_t> get_bounds() const override {
2980 auto iter = interval_map.begin();
2981 if (iter != interval_map.end()) {
2982 auto riter = interval_map.rbegin();
2985 riter->second.last + 1);
2987 return make_pair(0, 0);
// Walks the intervals from highest key to lowest and collects acting OSDs
// that are not CRUSH_ITEM_NONE as pg_shard_t values; the shard id is the
// position in the acting vector for EC pools and NO_SHARD otherwise.
2990 set<pg_shard_t> get_all_participants(
2991 bool ec_pool) const override {
2992 set<pg_shard_t> all_participants;
2994 // We need to decide who might have unfound objects that we need
2995 auto p = interval_map.rbegin();
2996 auto end = interval_map.rend();
2997 for (; p != end; ++p) {
2998 const PastIntervals::pg_interval_t &interval(p->second);
2999 // If nothing changed, we don't care about this interval.
3000 if (!interval.maybe_went_rw)
3004 std::vector<int>::const_iterator a = interval.acting.begin();
3005 std::vector<int>::const_iterator a_end = interval.acting.end();
3006 for (; a != a_end; ++a, ++i) {
3007 pg_shard_t shard(*a, ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD);
3008 if (*a != CRUSH_ITEM_NONE)
3009 all_participants.insert(shard);
3012 return all_participants;
// Stores (or overwrites) the interval under its first epoch.
3016 const PastIntervals::pg_interval_t &interval) override {
3017 interval_map[interval.first] = interval;
// Returns a heap-allocated copy of this representation.
3019 unique_ptr<PastIntervals::interval_rep> clone() const override {
3020 return unique_ptr<PastIntervals::interval_rep>(new pi_simple_rep(*this));
3022 ostream &print(ostream &out) const override {
3023 return out << interval_map;
// The wire form is the bare interval_map.
3025 void encode(bufferlist &bl) const override {
3026 ::encode(interval_map, bl);
3028 void decode(bufferlist::iterator &bl) override {
3029 ::decode(interval_map, bl);
3031 void dump(Formatter *f) const override {
3032 f->open_array_section("PastIntervals::compat_rep");
3033 for (auto &&i: interval_map) {
3034 f->open_object_section("pg_interval_t");
3035 f->dump_int("epoch", i.first);
3036 f->open_object_section("interval");
3043 bool is_classic() const override {
// Builds sample instances from literal interval lists.
3046 static void generate_test_instances(list<pi_simple_rep*> &o) {
3047 using ival = PastIntervals::pg_interval_t;
3048 using ivallst = std::list<ival>;
3052 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3053 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3054 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3055 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3060 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
// NOTE(review): this interval starts at 20 while the previous one ends at
// 20; the other lists use 21 here -- confirm the overlap is intended.
3061 , ival{{ 1, 2}, { 1, 2}, 20, 30, true, 1, 1}
3062 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3063 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3068 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3069 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3070 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3071 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
// Visits intervals from highest key to lowest and calls `f` with the
// interval's first epoch and its acting set converted to pg_shard_t; the
// loop tests maybe_went_rw and compares `last` against `les` on the way.
3075 void iterate_mayberw_back_to(
3078 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3079 for (auto i = interval_map.rbegin(); i != interval_map.rend(); ++i) {
3080 if (!i->second.maybe_went_rw)
3082 if (i->second.last < les)
3084 set<pg_shard_t> actingset;
3085 for (unsigned j = 0; j < i->second.acting.size(); ++j) {
3086 if (i->second.acting[j] == CRUSH_ITEM_NONE)
3090 i->second.acting[j],
3091 ec_pool ? shard_id_t(j) : shard_id_t::NO_SHARD));
3093 f(i->second.first, actingset);
// This representation retains complete pg_interval_t objects.
3097 bool has_full_intervals() const override { return true; }
3098 void iterate_all_intervals(
3099 std::function<void(const PastIntervals::pg_interval_t &)> &&f
3101 for (auto &&i: interval_map) {
3105 virtual ~pi_simple_rep() override {}
3111 * PastIntervals only needs to be able to answer two questions:
3112 * 1) Where should the primary look for unfound objects?
3113 * 2) List a set of subsets of the OSDs such that contacting at least
3114 * one from each subset guarantees we speak to at least one witness
3115 * of any completed write.
3117 * Crucially, 2) does not require keeping *all* past intervals. Certainly,
3118 * we don't need to keep any where maybe_went_rw would be false. We also
3119 * needn't keep two intervals where the actingset in one is a subset
3120 * of the other (only need to keep the smaller of the two sets). In order
3121 * to accurately trim the set of intervals as last_epoch_started changes
3122 * without rebuilding the set from scratch, we'll retain the larger set
3123 * if it is in an older interval.
// Reduced form of a past interval: the epoch bounds `first`/`last` and the
// acting set as pg_shard_t values.
3125 struct compact_interval_t {
3128 set<pg_shard_t> acting;
// Checks each member of this->acting for membership in other.acting.
3129 bool supersedes(const compact_interval_t &other) {
3130 for (auto &&i: acting) {
3131 if (!other.acting.count(i))
3136 void dump(Formatter *f) const {
3137 f->open_object_section("compact_interval_t");
3138 f->dump_stream("first") << first;
3139 f->dump_stream("last") << last;
3140 f->dump_stream("acting") << acting;
// Versioned wire format: struct version 1, compat version 1.
3143 void encode(bufferlist &bl) const {
3144 ENCODE_START(1, 1, bl);
3145 ::encode(first, bl);
3147 ::encode(acting, bl);
3150 void decode(bufferlist::iterator &bl) {
3151 DECODE_START(1, bl);
3152 ::decode(first, bl);
3154 ::decode(acting, bl);
3157 static void generate_test_instances(list<compact_interval_t*> & o) {
3158 /* Not going to be used, we'll generate pi_compact_rep directly */
// Prints a compact_interval_t as "([first,last] acting <set>)".
3161 ostream &operator<<(ostream &o, const compact_interval_t &rhs)
3163 return o << "([" << rhs.first << "," << rhs.last
3164 << "] acting " << rhs.acting << ")";
3166 WRITE_CLASS_ENCODER(compact_interval_t)
// interval_rep implementation that keeps overall epoch bounds, the set of
// all participants seen, and a list of compact_interval_t entries instead of
// full pg_interval_t objects.
3168 class pi_compact_rep : public PastIntervals::interval_rep {
3170 epoch_t last = 0; // inclusive
3171 set<pg_shard_t> all_participants;
3172 list<compact_interval_t> intervals;
// Bulk construction: add each supplied interval in list order.
3175 std::list<PastIntervals::pg_interval_t> &&intervals) {
3176 for (auto &&i: intervals)
3177 add_interval(ec_pool, i);
3180 pi_compact_rep() = default;
3181 pi_compact_rep(const pi_compact_rep &) = default;
3182 pi_compact_rep(pi_compact_rep &&) = default;
3183 pi_compact_rep &operator=(const pi_compact_rep &) = default;
3184 pi_compact_rep &operator=(pi_compact_rep &&) = default;
// Number of retained compact intervals (not the number of intervals added).
3186 size_t size() const override { return intervals.size(); }
// Empty when the bounds are inverted or both are still 0.
3187 bool empty() const override {
3188 return first > last || (first == 0 && last == 0);
// Resets to the default-constructed state.
3190 void clear() override {
3191 *this = pi_compact_rep();
3193 pair<epoch_t, epoch_t> get_bounds() const override {
3194 return make_pair(first, last + 1);
// Returns the participant set accumulated by add_interval(); `ec_pool` is
// not consulted here.
3196 set<pg_shard_t> get_all_participants(
3197 bool ec_pool) const override {
3198 return all_participants;
// Advances `last` (asserting the new interval ends later), records the
// interval's non-NONE acting OSDs in all_participants, appends a
// compact_interval_t for it, and erases earlier list entries that the new
// entry supersedes.  maybe_went_rw is tested before the append.
3201 bool ec_pool, const PastIntervals::pg_interval_t &interval) override {
3203 first = interval.first;
3204 assert(interval.last > last);
3205 last = interval.last;
3206 set<pg_shard_t> acting;
3207 for (unsigned i = 0; i < interval.acting.size(); ++i) {
3208 if (interval.acting[i] == CRUSH_ITEM_NONE)
3213 ec_pool ? shard_id_t(i) : shard_id_t::NO_SHARD));
3215 all_participants.insert(acting.begin(), acting.end());
3216 if (!interval.maybe_went_rw)
3218 intervals.push_back(
3219 compact_interval_t{interval.first, interval.last, acting});
3220 auto plast = intervals.end();
3222 for (auto cur = intervals.begin(); cur != plast; ) {
3223 if (plast->supersedes(*cur)) {
3224 intervals.erase(cur++);
// Returns a heap-allocated copy of this representation.
3230 unique_ptr<PastIntervals::interval_rep> clone() const override {
3231 return unique_ptr<PastIntervals::interval_rep>(new pi_compact_rep(*this));
3233 ostream &print(ostream &out) const override {
3234 return out << "([" << first << "," << last
3235 << "] intervals=" << intervals << ")";
// Versioned wire format: struct version 1, compat version 1.
3237 void encode(bufferlist &bl) const override {
3238 ENCODE_START(1, 1, bl);
3239 ::encode(first, bl);
3241 ::encode(all_participants, bl);
3242 ::encode(intervals, bl);
3245 void decode(bufferlist::iterator &bl) override {
3246 DECODE_START(1, bl);
3247 ::decode(first, bl);
3249 ::decode(all_participants, bl);
3250 ::decode(intervals, bl);
3253 void dump(Formatter *f) const override {
3254 f->open_object_section("PastIntervals::compact_rep");
3255 f->dump_stream("first") << first;
3256 f->dump_stream("last") << last;
3257 f->open_array_section("all_participants");
3258 for (auto& i : all_participants) {
3259 f->dump_object("pg_shard", i);
3262 f->open_array_section("intervals");
3263 for (auto &&i: intervals) {
3269 bool is_classic() const override {
// Builds sample instances from literal interval lists.
3272 static void generate_test_instances(list<pi_compact_rep*> &o) {
3273 using ival = PastIntervals::pg_interval_t;
3274 using ivallst = std::list<ival>;
3278 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3279 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3280 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3281 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3286 { ival{{0, 1, 2}, {0, 1, 2}, 10, 20, true, 0, 0}
3287 , ival{{ 1, 2}, { 1, 2}, 21, 30, true, 1, 1}
3288 , ival{{ 2}, { 2}, 31, 35, false, 2, 2}
3289 , ival{{0, 2}, {0, 2}, 36, 50, true, 0, 0}
3294 { ival{{2, 1, 0}, {2, 1, 0}, 10, 20, true, 1, 1}
3295 , ival{{ 0, 2}, { 0, 2}, 21, 30, true, 0, 0}
3296 , ival{{ 0, 2}, {2, 0}, 31, 35, true, 2, 2}
3297 , ival{{ 0, 2}, { 0, 2}, 36, 50, true, 0, 0}
// Visits the retained intervals from the back of the list to the front and
// calls `f` with each entry's first epoch and acting set.
3300 void iterate_mayberw_back_to(
3303 std::function<void(epoch_t, const set<pg_shard_t> &)> &&f) const override {
3304 for (auto i = intervals.rbegin(); i != intervals.rend(); ++i) {
3307 f(i->first, i->acting);
3310 virtual ~pi_compact_rep() override {}
3312 WRITE_CLASS_ENCODER(pi_compact_rep)
// Copy constructor: when rhs holds a representation, the new object gets its
// own copy via clone() rather than sharing rhs's pointer.
3314 PastIntervals::PastIntervals(const PastIntervals &rhs)
3315 : past_intervals(rhs.past_intervals ?
3316 rhs.past_intervals->clone() :
// Copy assignment; starts by copy-constructing a temporary from rhs.
3319 PastIntervals &PastIntervals::operator=(const PastIntervals &rhs)
3321 PastIntervals other(rhs);
// Prints the held representation via its print() method, or "(empty)" when
// no representation is set.
3326 ostream& operator<<(ostream& out, const PastIntervals &i)
3328 if (i.past_intervals) {
3329 return i.past_intervals->print(out);
3331 return out << "(empty)";
// Prints a PriorSet as "PriorSet(ec_pool: ..., probe: ..., down: ...,
// blocked_by: ..., pg_down: ...".
3335 ostream& operator<<(ostream& out, const PastIntervals::PriorSet &i)
3337 return out << "PriorSet("
3338 << "ec_pool: " << i.ec_pool
3339 << ", probe: " << i.probe
3340 << ", down: " << i.down
3341 << ", blocked_by: " << i.blocked_by
3342 << ", pg_down: " << i.pg_down
// Deserializes a PastIntervals from `bl`: a fresh pi_simple_rep or
// pi_compact_rep is installed and then asked to decode its own payload.
3346 void PastIntervals::decode(bufferlist::iterator &bl)
3348 DECODE_START(1, bl);
3355 past_intervals.reset(new pi_simple_rep);
3356 past_intervals->decode(bl);
3359 past_intervals.reset(new pi_compact_rep);
3360 past_intervals->decode(bl);
// Installs a fresh pi_simple_rep and decodes it straight from `bl`; unlike
// decode(), no DECODE_START header is read here.
3366 void PastIntervals::decode_classic(bufferlist::iterator &bl)
3368 past_intervals.reset(new pi_simple_rep);
3369 past_intervals->decode(bl);
// Appends PastIntervals samples to `o`, one wrapping each pi_simple_rep test
// instance and one wrapping each pi_compact_rep test instance.
3372 void PastIntervals::generate_test_instances(list<PastIntervals*> &o)
3375 list<pi_simple_rep *> simple;
3376 pi_simple_rep::generate_test_instances(simple);
3377 for (auto &&i: simple) {
3378 // takes ownership of contents
3379 o.push_back(new PastIntervals(i));
3383 list<pi_compact_rep *> compact;
3384 pi_compact_rep::generate_test_instances(compact);
3385 for (auto &&i: compact) {
3386 // takes ownership of contents
3387 o.push_back(new PastIntervals(i));
// Ensures a representation exists.  A missing one is created as either
// pi_simple_rep or pi_compact_rep.  An existing classic representation can
// be converted to compact by replaying all of its intervals into a fresh
// pi_compact_rep; the reverse conversion is never performed.
3393 void PastIntervals::update_type(bool ec_pool, bool compact)
3396 if (!past_intervals) {
3397 past_intervals.reset(new pi_simple_rep);
3399 // we never convert from compact back to classic
3400 assert(is_classic());
3403 if (!past_intervals) {
3404 past_intervals.reset(new pi_compact_rep);
3405 } else if (is_classic()) {
// Keep the old representation alive while the new one is populated from it.
3406 auto old = std::move(past_intervals);
3407 past_intervals.reset(new pi_compact_rep);
3408 assert(old->has_full_intervals());
3409 old->iterate_all_intervals([&](const pg_interval_t &i) {
3410 past_intervals->add_interval(ec_pool, i);
// Calls update_type(), requesting the compact representation when the map's
// require_osd_release is at least CEPH_RELEASE_LUMINOUS.
3416 void PastIntervals::update_type_from_map(bool ec_pool, const OSDMap &osdmap)
3418 update_type(ec_pool, osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS);
// Returns true when any compared old/new property differs (acting primary,
// acting set, up primary, min_size, size, sort_bitwise, recovery_deletes)
// or when pgid.is_split() reports a split between old_pg_num and new_pg_num.
3421 bool PastIntervals::is_new_interval(
3422 int old_acting_primary,
3423 int new_acting_primary,
3424 const vector<int> &old_acting,
3425 const vector<int> &new_acting,
3428 const vector<int> &old_up,
3429 const vector<int> &new_up,
3434 unsigned old_pg_num,
3435 unsigned new_pg_num,
3436 bool old_sort_bitwise,
3437 bool new_sort_bitwise,
3438 bool old_recovery_deletes,
3439 bool new_recovery_deletes,
3441 return old_acting_primary != new_acting_primary ||
3442 new_acting != old_acting ||
3443 old_up_primary != new_up_primary ||
3445 old_min_size != new_min_size ||
3446 old_size != new_size ||
3447 pgid.is_split(old_pg_num, new_pg_num, 0) ||
3448 old_sort_bitwise != new_sort_bitwise ||
3449 old_recovery_deletes != new_recovery_deletes;
// OSDMap-based overload.  Returns true outright when the pool is absent from
// `lastmap`; otherwise it forwards to the explicit-value overload with pool
// size, min_size, pg_num and the SORTBITWISE / RECOVERY_DELETES flags taken
// from `lastmap` (old) and `osdmap` (new).
3452 bool PastIntervals::is_new_interval(
3453 int old_acting_primary,
3454 int new_acting_primary,
3455 const vector<int> &old_acting,
3456 const vector<int> &new_acting,
3459 const vector<int> &old_up,
3460 const vector<int> &new_up,
3464 return !(lastmap->get_pools().count(pgid.pool())) ||
3465 is_new_interval(old_acting_primary,
3473 lastmap->get_pools().find(pgid.pool())->second.size,
3474 osdmap->get_pools().find(pgid.pool())->second.size,
3475 lastmap->get_pools().find(pgid.pool())->second.min_size,
3476 osdmap->get_pools().find(pgid.pool())->second.min_size,
3477 lastmap->get_pg_num(pgid.pool()),
3478 osdmap->get_pg_num(pgid.pool()),
3479 lastmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3480 osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE),
3481 lastmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
3482 osdmap->test_flag(CEPH_OSDMAP_RECOVERY_DELETES),
// When is_new_interval() reports a change, this records the interval that
// just ended: it builds a pg_interval_t `i` covering
// [same_interval_since, osdmap epoch - 1] with the old acting set and
// primaries, decides whether the PG may have gone read-write during it
// (i.maybe_went_rw), and appends it to *past_intervals.  Explanatory lines
// are written to `out` along the way.
3486 bool PastIntervals::check_new_interval(
3487 int old_acting_primary,
3488 int new_acting_primary,
3489 const vector<int> &old_acting,
3490 const vector<int> &new_acting,
3493 const vector<int> &old_up,
3494 const vector<int> &new_up,
3495 epoch_t same_interval_since,
3496 epoch_t last_epoch_clean,
3500 IsPGRecoverablePredicate *could_have_gone_active,
3501 PastIntervals *past_intervals,
3505 * We have to be careful to gracefully deal with situations like
3506 * so. Say we have a power outage or something that takes out both
3507 * OSDs, but the monitor doesn't mark them down in the same epoch.
3508 * The history may look like
3512 * 3: let's say B dies for good, too (say, from the power spike)
3515 * which makes it look like B may have applied updates to the PG
3516 * that we need in order to proceed. This sucks...
3518 * To minimize the risk of this happening, we CANNOT go active if
3519 * _any_ OSDs in the prior set are down until we send an MOSDAlive
3520 * to the monitor such that the OSDMap sets osd_up_thru to an epoch.
3521 * Then, we have something like
3528 * -> we can ignore B, bc it couldn't have gone active (up_thru still 0).
3538 * -> we must wait for B, bc it was alive through 2, and could have
3539 * written to the pg.
3541 * If B is really dead, then an administrator will need to manually
3542 * intervene by marking the OSD as "lost."
3545 // remember past interval
3546 // NOTE: a change in the up set primary triggers an interval
3547 // change, even though the interval members in the pg_interval_t
3549 assert(past_intervals);
3550 assert(past_intervals->past_intervals);
3551 if (is_new_interval(
3564 i.first = same_interval_since;
3565 i.last = osdmap->get_epoch() - 1;
3566 assert(i.first <= i.last);
3567 i.acting = old_acting;
3569 i.primary = old_acting_primary;
3570 i.up_primary = old_up_primary;
// Walk the old acting set, testing each entry against CRUSH_ITEM_NONE.
3572 unsigned num_acting = 0;
3573 for (vector<int>::const_iterator p = i.acting.begin(); p != i.acting.end();
3575 if (*p != CRUSH_ITEM_NONE)
3578 assert(lastmap->get_pools().count(pgid.pool()));
3579 const pg_pool_t& old_pg_pool = lastmap->get_pools().find(pgid.pool())->second;
3580 set<pg_shard_t> old_acting_shards;
3581 old_pg_pool.convert_to_pg_shards(old_acting, &old_acting_shards);
// The interval can only have gone rw if enough OSDs were acting for the
// old pool's min_size and the caller's predicate accepts the old shards.
3585 num_acting >= old_pg_pool.min_size &&
3586 (*could_have_gone_active)(old_acting_shards)) {
3588 *out << __func__ << " " << i
3590 << " up_thru " << lastmap->get_up_thru(i.primary)
3591 << " up_from " << lastmap->get_up_from(i.primary)
3592 << " last_epoch_clean " << last_epoch_clean
// Primary was up from no later than i.first and its up_thru reached i.first.
3594 if (lastmap->get_up_thru(i.primary) >= i.first &&
3595 lastmap->get_up_from(i.primary) <= i.first) {
3596 i.maybe_went_rw = true;
3598 *out << __func__ << " " << i
3599 << " : primary up " << lastmap->get_up_from(i.primary)
3600 << "-" << lastmap->get_up_thru(i.primary)
3601 << " includes interval"
3603 } else if (last_epoch_clean >= i.first &&
3604 last_epoch_clean <= i.last) {
3605 // If the last_epoch_clean is included in this interval, then
3606 // the pg must have been rw (for recovery to have completed).
3607 // This is important because we won't know the _real_
3608 // first_epoch because we stop at last_epoch_clean, and we
3609 // don't want the oldest interval to randomly have
3610 // maybe_went_rw false depending on the relative up_thru vs
3611 // last_epoch_clean timing.
3612 i.maybe_went_rw = true;
3614 *out << __func__ << " " << i
3615 << " : includes last_epoch_clean " << last_epoch_clean
3616 << " and presumed to have been rw"
3619 i.maybe_went_rw = false;
3621 *out << __func__ << " " << i
3622 << " : primary up " << lastmap->get_up_from(i.primary)
3623 << "-" << lastmap->get_up_thru(i.primary)
3624 << " does not include interval"
3628 i.maybe_went_rw = false;
3630 *out << __func__ << " " << i << " : acting set is too small" << std::endl;
// Record the finished interval, using the old pool's EC-ness for shard ids.
3632 past_intervals->past_intervals->add_interval(old_pg_pool.ec_pool(), i);
// Checks `osdmap` against the prior set's probe, down and blocked_by
// members, logging the reason at debug level 10 through `dpp` whenever a
// relevant change is found.
3640 // true if the given map affects the prior set
3641 bool PastIntervals::PriorSet::affected_by_map(
3642 const OSDMap &osdmap,
3643 const DoutPrefixProvider *dpp) const
3645 for (set<pg_shard_t>::iterator p = probe.begin();
3650 // did someone in the prior set go down?
3651 if (osdmap.is_down(o) && down.count(o) == 0) {
3652 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now down" << dendl;
3656 // did a down osd in cur get (re)marked as lost?
3657 map<int, epoch_t>::const_iterator r = blocked_by.find(o);
3658 if (r != blocked_by.end()) {
3659 if (!osdmap.exists(o)) {
3660 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
// blocked_by stores the lost_at epoch seen earlier; a different value in the
// new map means the OSD was (re)marked lost since then.
3663 if (osdmap.get_info(o).lost_at != r->second) {
3664 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
3670 // did someone in the prior down set go up?
3671 for (set<int>::const_iterator p = down.begin();
3676 if (osdmap.is_up(o)) {
3677 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " now up" << dendl;
3681 // did someone in the prior set get lost or destroyed?
3682 if (!osdmap.exists(o)) {
3683 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " no longer exists" << dendl;
3686 // did a down osd in down get (re)marked as lost?
3687 map<int, epoch_t>::const_iterator r = blocked_by.find(o);
3688 if (r != blocked_by.end()) {
3689 if (osdmap.get_info(o).lost_at != r->second) {
3690 ldpp_dout(dpp, 10) << "affected_by_map osd." << o << " (re)marked as lost" << dendl;
// Streams a one-line summary of a past interval: the first-last range, the up
// set followed by up_primary in parentheses, the acting set followed by primary
// in parentheses, and a trailing " maybe_went_rw" marker when that flag is set.
3699 ostream& operator<<(ostream& out, const PastIntervals::pg_interval_t& i)
3701 out << "interval(" << i.first << "-" << i.last
3702 << " up " << i.up << "(" << i.up_primary << ")"
3703 << " acting " << i.acting << "(" << i.primary << ")";
3704 if (i.maybe_went_rw)
3705 out << " maybe_went_rw";
// Serializes this query into `bl` inside an ENCODE_START(3, 3, bl) block.
// `since` and `epoch_sent` are among the fields written; `features` is not
// referenced by any line visible in this excerpt.
3714 void pg_query_t::encode(bufferlist &bl, uint64_t features) const {
3715 ENCODE_START(3, 3, bl);
3717 ::encode(since, bl);
3719 ::encode(epoch_sent, bl);
// Deserializes a query from `bl` inside a DECODE_START(3, bl) block, reading
// `since` before `epoch_sent` (the same relative order encode() writes them).
3725 void pg_query_t::decode(bufferlist::iterator &bl) {
3726 DECODE_START(3, bl);
3728 ::decode(since, bl);
3730 ::decode(epoch_sent, bl);
// Writes the query to Formatter `f`: "from" and "to" as ints, "type" as the
// name returned by get_type_name(), "since" and "epoch_sent" as streamed
// values, then opens a nested "history" object section.
3736 void pg_query_t::dump(Formatter *f) const
3738 f->dump_int("from", from);
3739 f->dump_int("to", to);
3740 f->dump_string("type", get_type_name());
3741 f->dump_stream("since") << since;
3742 f->dump_stream("epoch_sent") << epoch_sent;
3743 f->open_object_section("history");
// Appends heap-allocated sample queries to `o`: a default-constructed one,
// then INFO, MISSING, LOG and FULLLOG queries that all reuse the last
// pg_history_t produced by pg_history_t::generate_test_instances().
// NOTE(review): no visible line frees the pg_history_t objects collected in
// `h` -- confirm whether this test helper intentionally leaks them.
3747 void pg_query_t::generate_test_instances(list<pg_query_t*>& o)
3749 o.push_back(new pg_query_t());
3750 list<pg_history_t*> h;
3751 pg_history_t::generate_test_instances(h);
3752 o.push_back(new pg_query_t(pg_query_t::INFO, shard_id_t(1), shard_id_t(2), *h.back(), 4));
3753 o.push_back(new pg_query_t(pg_query_t::MISSING, shard_id_t(2), shard_id_t(3), *h.back(), 4));
3754 o.push_back(new pg_query_t(pg_query_t::LOG, shard_id_t(0), shard_id_t(0),
3755 eversion_t(4, 5), *h.back(), 4));
3756 o.push_back(new pg_query_t(pg_query_t::FULLLOG,
3757 shard_id_t::NO_SHARD, shard_id_t::NO_SHARD,
3761 // -- ObjectModDesc --
// Replays the operations encoded in this object's `bl` against `visitor`.
// Each operation's payload is decoded from the iterator `bp` and handed to the
// matching Visitor callback (append, setattrs, rmobject, update_snaps,
// try_rmobject, rollback_extents). An unrecognized op code trips the
// "Invalid rollback code" assert.
// NOTE(review): most case labels and the enclosing loop/try are not visible
// here; the "Invalid encoding" assert is presumably the handler for a decode
// failure -- confirm.
3762 void ObjectModDesc::visit(Visitor *visitor) const
3764 bufferlist::iterator bp = bl.begin();
3767 DECODE_START(max_required_version, bp);
3774 visitor->append(size);
3778 map<string, boost::optional<bufferlist> > attrs;
3779 ::decode(attrs, bp);
3780 visitor->setattrs(attrs);
3784 version_t old_version;
3785 ::decode(old_version, bp);
3786 visitor->rmobject(old_version);
3793 case UPDATE_SNAPS: {
3794 set<snapid_t> snaps;
3795 ::decode(snaps, bp);
3796 visitor->update_snaps(snaps);
3800 version_t old_version;
3801 ::decode(old_version, bp);
3802 visitor->try_rmobject(old_version);
3805 case ROLLBACK_EXTENTS: {
3806 vector<pair<uint64_t, uint64_t> > extents;
3809 ::decode(extents, bp);
3810 visitor->rollback_extents(gen,extents);
3814 assert(0 == "Invalid rollback code");
3819 assert(0 == "Invalid encoding");
// ObjectModDesc::Visitor implementation that renders each visited operation to
// the Formatter passed to the constructor. Every callback opens an "op" object
// section and writes a "code" string naming the operation, followed by the
// operation-specific fields.
3823 struct DumpVisitor : public ObjectModDesc::Visitor {
3825 explicit DumpVisitor(Formatter *f) : f(f) {}
// APPEND: records the size passed in as "old_size".
3826 void append(uint64_t old_size) override {
3827 f->open_object_section("op");
3828 f->dump_string("code", "APPEND");
3829 f->dump_unsigned("old_size", old_size);
// SETATTRS: lists each attribute key as "attr_name" inside an "attrs" array;
// no visible line dumps the attribute values.
3832 void setattrs(map<string, boost::optional<bufferlist> > &attrs) override {
3833 f->open_object_section("op");
3834 f->dump_string("code", "SETATTRS");
3835 f->open_array_section("attrs");
3836 for (map<string, boost::optional<bufferlist> >::iterator i = attrs.begin();
3839 f->dump_string("attr_name", i->first);
// RMOBJECT: records the version passed in as "old_version".
3844 void rmobject(version_t old_version) override {
3845 f->open_object_section("op");
3846 f->dump_string("code", "RMOBJECT");
3847 f->dump_unsigned("old_version", old_version);
// TRY_RMOBJECT: same fields as RMOBJECT under a different code string.
3850 void try_rmobject(version_t old_version) override {
3851 f->open_object_section("op");
3852 f->dump_string("code", "TRY_RMOBJECT");
3853 f->dump_unsigned("old_version", old_version);
// CREATE: carries no fields beyond the code string in the visible lines.
3856 void create() override {
3857 f->open_object_section("op");
3858 f->dump_string("code", "CREATE");
// UPDATE_SNAPS: streams the snap set under "snaps".
3861 void update_snaps(const set<snapid_t> &snaps) override {
3862 f->open_object_section("op");
3863 f->dump_string("code", "UPDATE_SNAPS");
3864 f->dump_stream("snaps") << snaps;
// ROLLBACK_EXTENTS: records "gen" and streams the extent list.
3867 void rollback_extents(
3869 const vector<pair<uint64_t, uint64_t> > &extents) override {
3870 f->open_object_section("op");
3871 f->dump_string("code", "ROLLBACK_EXTENTS");
3872 f->dump_unsigned("gen", gen);
// NOTE(review): the extents are emitted under the key "snaps", which looks
// like a copy/paste from update_snaps; renaming it would change the dump
// output, so confirm with consumers before touching it.
3873 f->dump_stream("snaps") << extents;
// Dumps this descriptor to `f` as an "object_mod_desc" object containing the
// two rollback flags and an "ops" array.
// NOTE(review): the code that fills "ops" is not visible in this excerpt;
// presumably it runs visit() with a DumpVisitor -- confirm.
3878 void ObjectModDesc::dump(Formatter *f) const
3880 f->open_object_section("object_mod_desc");
3881 f->dump_bool("can_local_rollback", can_local_rollback);
3882 f->dump_bool("rollback_info_completed", rollback_info_completed);
3884 f->open_array_section("ops");
// Appends four heap-allocated sample descriptors to `o`, exercising append,
// setattrs, rmobject and mark_unrollbackable in different combinations. The
// last instance calls append(1000) after mark_unrollbackable().
// NOTE(review): the lines that populate `attrs` are not visible here.
3892 void ObjectModDesc::generate_test_instances(list<ObjectModDesc*>& o)
3894 map<string, boost::optional<bufferlist> > attrs;
3898 o.push_back(new ObjectModDesc());
3899 o.back()->append(100);
3900 o.back()->setattrs(attrs);
3901 o.push_back(new ObjectModDesc());
3902 o.back()->rmobject(1001);
3903 o.push_back(new ObjectModDesc());
3905 o.back()->setattrs(attrs);
3906 o.push_back(new ObjectModDesc());
3908 o.back()->setattrs(attrs);
3909 o.back()->mark_unrollbackable();
3910 o.back()->append(1000);
// Serializes the descriptor into `_bl`. max_required_version is passed as both
// arguments preceding `_bl` in ENCODE_START, and the two rollback flags are
// written first.
3913 void ObjectModDesc::encode(bufferlist &_bl) const
3915 ENCODE_START(max_required_version, max_required_version, _bl);
3916 ::encode(can_local_rollback, _bl);
3917 ::encode(rollback_info_completed, _bl);
// Deserializes the descriptor from `_bl` (DECODE_START(2, ...)). The decoded
// struct_v is remembered in max_required_version, the two rollback flags are
// read, and the member buffer `bl` is reassigned to the osd_pglog mempool.
3921 void ObjectModDesc::decode(bufferlist::iterator &_bl)
3923 DECODE_START(2, _bl);
3924 max_required_version = struct_v;
3925 ::decode(can_local_rollback, _bl);
3926 ::decode(rollback_info_completed, _bl);
3928 // ensure bl does not pin a larger buffer in memory
3930 bl.reassign_to_mempool(mempool::mempool_osd_pglog);
3934 // -- pg_log_entry_t --
// Returns the key name for this log entry, which is exactly the key name of
// its `version` member.
3936 string pg_log_entry_t::get_key_name() const
3938 return version.get_key_name();
// Encodes the entry together with a crc32c checksum. A temporary bufferlist
// `ebl` is constructed with sizeof(*this)*2 as its argument, and the checksum
// is computed over `ebl` with seed 0.
// NOTE(review): the lines that fill `ebl` and append it plus the crc to `bl`
// are not visible in this excerpt.
3941 void pg_log_entry_t::encode_with_checksum(bufferlist& bl) const
3943 bufferlist ebl(sizeof(*this)*2);
3945 __u32 crc = ebl.crc32c(0);
// Decodes an entry written with a checksum. Throws buffer::malformed_input
// when the stored `crc` does not match crc32c(0) of the payload `bl`;
// otherwise the entry is read through an iterator over that payload.
// NOTE(review): the declarations and decoding of `bl` and `crc` from `p` are
// not visible in this excerpt.
3950 void pg_log_entry_t::decode_with_checksum(bufferlist::iterator& p)
3956 if (crc != bl.crc32c(0))
3957 throw buffer::malformed_input("bad checksum on pg_log_entry_t");
3958 bufferlist::iterator q = bl.begin();
// Serializes the log entry into `bl` inside an ENCODE_START(11, 4, bl) block.
// For LOST_REVERT entries, reverting_to is written before reqid/mtime and
// prior_version is written again after mtime; the remaining fields (snaps,
// user_version, mod_desc, extra_reqids, return_code) follow.
// NOTE(review): the conditions guarding the first prior_version write and the
// return_code write are not visible in this excerpt.
3962 void pg_log_entry_t::encode(bufferlist &bl) const
3964 ENCODE_START(11, 4, bl);
3967 ::encode(version, bl);
3970 * Added with reverting_to:
3971 * Previous code used prior_version to encode
3972 * what we now call reverting_to. This will
3973 * allow older code to decode reverting_to
3974 * into prior_version as expected.
3976 if (op == LOST_REVERT)
3977 ::encode(reverting_to, bl);
3979 ::encode(prior_version, bl);
3981 ::encode(reqid, bl);
3982 ::encode(mtime, bl);
3983 if (op == LOST_REVERT)
3984 ::encode(prior_version, bl);
3985 ::encode(snaps, bl);
3986 ::encode(user_version, bl);
3987 ::encode(mod_desc, bl);
3988 ::encode(extra_reqids, bl);
3990 ::encode(return_code, bl);
// Deserializes a log entry from `bl`, accepting older encodings
// (DECODE_START_LEGACY_COMPAT_LEN(11, 4, 4, bl)). Fields that an older
// struct_v did not carry are synthesized: reverting_to falls back to
// prior_version, user_version falls back to version.version, and mod_desc is
// marked unrollbackable.
// NOTE(review): several of the struct_v guards and else branches selecting
// these fallbacks are not visible in this excerpt.
3994 void pg_log_entry_t::decode(bufferlist::iterator &bl)
3996 DECODE_START_LEGACY_COMPAT_LEN(11, 4, 4, bl);
// Legacy object id path: only oid and snap are copied from old_soid, and the
// hash is flagged as invalid.
4000 ::decode(old_soid, bl);
4001 soid.oid = old_soid.oid;
4002 soid.snap = old_soid.snap;
4003 invalid_hash = true;
4008 invalid_hash = true;
4009 ::decode(version, bl);
// From struct_v 6 on, a LOST_REVERT entry stores reverting_to in this slot.
4011 if (struct_v >= 6 && op == LOST_REVERT)
4012 ::decode(reverting_to, bl);
4014 ::decode(prior_version, bl);
4016 ::decode(reqid, bl);
4018 ::decode(mtime, bl);
4020 invalid_pool = true;
4022 if (op == LOST_REVERT) {
4023 if (struct_v >= 6) {
4024 ::decode(prior_version, bl);
4026 reverting_to = prior_version;
4029 if (struct_v >= 7 || // for v >= 7, this is for all ops.
4030 op == CLONE) { // for v < 7, it's only present for CLONE.
4031 ::decode(snaps, bl);
4032 // ensure snaps does not pin a larger buffer in memory
4034 snaps.reassign_to_mempool(mempool::mempool_osd_pglog);
4038 ::decode(user_version, bl);
4040 user_version = version.version;
4043 ::decode(mod_desc, bl);
4045 mod_desc.mark_unrollbackable();
4047 ::decode(extra_reqids, bl);
// return_code is only read for ERROR entries encoded at struct_v >= 11.
4048 if (struct_v >= 11 && op == ERROR)
4049 ::decode(return_code, bl);
// Dumps the log entry to `f`: op name, object, version, prior_version, reqid,
// an "extra_reqids" array of {reqid, user_version} objects, mtime,
// return_code, the snaps when the encoded snaps buffer is non-empty, and a
// nested "mod_desc" section.
4053 void pg_log_entry_t::dump(Formatter *f) const
4055 f->dump_string("op", get_op_name());
4056 f->dump_stream("object") << soid;
4057 f->dump_stream("version") << version;
4058 f->dump_stream("prior_version") << prior_version;
4059 f->dump_stream("reqid") << reqid;
4060 f->open_array_section("extra_reqids");
4061 for (auto p = extra_reqids.begin();
4062 p != extra_reqids.end();
4064 f->open_object_section("extra_reqid");
4065 f->dump_stream("reqid") << p->first;
4066 f->dump_stream("user_version") << p->second;
4070 f->dump_stream("mtime") << mtime;
4071 f->dump_int("return_code", return_code);
4072 if (snaps.length() > 0) {
// Work on a copy of the encoded snaps buffer; presumably it is decoded into
// the vector `v` used below (the decode call is not visible here).
4074 bufferlist c = snaps;
4075 bufferlist::iterator p = c.begin();
// NOTE(review): "snaps" is opened as an object section but then receives
// repeated "snap" keys, which looks like it was meant to be an array section;
// changing it would alter the dump output -- confirm before fixing.
4081 f->open_object_section("snaps");
4082 for (vector<snapid_t>::iterator p = v.begin(); p != v.end(); ++p)
4083 f->dump_unsigned("snap", *p);
4087 f->open_object_section("mod_desc");
// Appends heap-allocated sample entries to `o`: a default-constructed entry,
// a MODIFY entry and an ERROR entry (return code -ENOENT), both for the same
// sample object `oid` and the same client request id.
4093 void pg_log_entry_t::generate_test_instances(list<pg_log_entry_t*>& o)
4095 o.push_back(new pg_log_entry_t());
4096 hobject_t oid(object_t("objname"), "key", 123, 456, 0, "");
4097 o.push_back(new pg_log_entry_t(MODIFY, oid, eversion_t(1,2), eversion_t(3,4),
4098 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4100 o.push_back(new pg_log_entry_t(ERROR, oid, eversion_t(1,2), eversion_t(3,4),
4101 1, osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4102 utime_t(8,9), -ENOENT));
// Streams a one-line description of a log entry: version, prior_version in
// parentheses, the op name left-aligned in an 8-character field, the object,
// the request id, mtime and return_code, plus " snaps ..." when the entry
// carries an encoded snaps buffer.
4105 ostream& operator<<(ostream& out, const pg_log_entry_t& e)
4107 out << e.version << " (" << e.prior_version << ") "
4108 << std::left << std::setw(8) << e.get_op_name() << ' '
4109 << e.soid << " by " << e.reqid << " " << e.mtime
4110 << " " << e.return_code;
4111 if (e.snaps.length()) {
// The encoded buffer is copied so a non-const iterator can be taken from it;
// presumably it is then decoded into `snaps` (decode call not visible here).
4112 vector<snapid_t> snaps;
4113 bufferlist c = e.snaps;
4114 bufferlist::iterator p = c.begin();
4120 out << " snaps " << snaps;
4125 // -- pg_log_dup_t --
// Returns the key name for this dup entry: the version's key name prefixed
// with "dup_".
4127 string pg_log_dup_t::get_key_name() const
4129 return "dup_" + version.get_key_name();
// Serializes the dup entry into `bl` (ENCODE_START(1, 1, bl)) in the order
// reqid, version, user_version, return_code.
4132 void pg_log_dup_t::encode(bufferlist &bl) const
4134 ENCODE_START(1, 1, bl);
4135 ::encode(reqid, bl);
4136 ::encode(version, bl);
4137 ::encode(user_version, bl);
4138 ::encode(return_code, bl);
// Deserializes the dup entry from `bl` (DECODE_START(1, bl)), reading the
// fields in the same order encode() writes them.
4142 void pg_log_dup_t::decode(bufferlist::iterator &bl)
4144 DECODE_START(1, bl);
4145 ::decode(reqid, bl);
4146 ::decode(version, bl);
4147 ::decode(user_version, bl);
4148 ::decode(return_code, bl);
// Dumps the four fields of the dup entry to `f`; each one, including the
// numeric return_code, is emitted through dump_stream.
4152 void pg_log_dup_t::dump(Formatter *f) const
4154 f->dump_stream("reqid") << reqid;
4155 f->dump_stream("version") << version;
4156 f->dump_stream("user_version") << user_version;
4157 f->dump_stream("return_code") << return_code;
// Appends heap-allocated sample dup entries to `o`: a default-constructed one
// and two built from eversion_t(1,2) and the same client request id.
// NOTE(review): the remaining constructor arguments are not visible in this
// excerpt.
4160 void pg_log_dup_t::generate_test_instances(list<pg_log_dup_t*>& o)
4162 o.push_back(new pg_log_dup_t());
4163 o.push_back(new pg_log_dup_t(eversion_t(1,2),
4165 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
4167 o.push_back(new pg_log_dup_t(eversion_t(1,2),
4169 osd_reqid_t(entity_name_t::CLIENT(777), 8, 999),
// Streams a dup entry as "log_dup(reqid=... v=... uv=... rc=...)".
4174 std::ostream& operator<<(std::ostream& out, const pg_log_dup_t& e) {
4175 return out << "log_dup(reqid=" << e.reqid <<
4176 " v=" << e.version << " uv=" << e.user_version <<
4177 " rc=" << e.return_code << ")";
4183 // out: pg_log_t that only has entries that apply to import_pgid using curmap
4184 // reject: Entries rejected from "in" are in the reject.log. Other fields not set.
// Splits the entries of `in` between `out` and `reject`:
//  - entries for temporary objects always go to `reject`;
//  - entries outside `hit_set_namespace` are mapped to a PG through `curmap`
//    and kept only when that PG equals import_pgid.pgid;
//  - entries inside `hit_set_namespace` are kept without the mapping check.
4185 void pg_log_t::filter_log(spg_t import_pgid, const OSDMap &curmap,
4186 const string &hit_set_namespace, const pg_log_t &in,
4187 pg_log_t &out, pg_log_t &reject)
4193 for (list<pg_log_entry_t>::const_iterator i = in.log.begin();
4194 i != in.log.end(); ++i) {
4196 // Reject pg log entries for temporary objects
4197 if (i->soid.is_temp()) {
4198 reject.log.push_back(*i);
4202 if (i->soid.nspace != hit_set_namespace) {
// Recompute which PG the object maps to under the current map.
4203 object_t oid = i->soid.oid;
4204 object_locator_t loc(i->soid);
4205 pg_t raw_pgid = curmap.object_locator_to_pg(oid, loc);
4206 pg_t pgid = curmap.raw_pg_to_pg(raw_pgid);
4208 if (import_pgid.pgid == pgid) {
4209 out.log.push_back(*i);
4211 reject.log.push_back(*i);
4214 out.log.push_back(*i);
// Serializes the log into `bl` inside an ENCODE_START(7, 3, bl) block;
// can_rollback_to and rollback_info_trimmed_to are among the fields written.
// NOTE(review): the other field writes are not visible in this excerpt.
4219 void pg_log_t::encode(bufferlist& bl) const
4221 ENCODE_START(7, 3, bl);
4225 ::encode(can_rollback_to, bl);
4226 ::encode(rollback_info_trimmed_to, bl);
// Deserializes the log from `bl`, accepting older encodings
// (DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl)). rollback_info_trimmed_to
// falls back to `tail` when it is not read from the stream, and entries whose
// soid has pool == -1 (and is not the max object) are assigned `pool`.
// NOTE(review): the struct_v guards around these steps are not visible here.
4231 void pg_log_t::decode(bufferlist::iterator &bl, int64_t pool)
4233 DECODE_START_LEGACY_COMPAT_LEN(7, 3, 3, bl);
4238 ::decode(backlog, bl);
4242 ::decode(can_rollback_to, bl);
4245 ::decode(rollback_info_trimmed_to, bl);
4247 rollback_info_trimmed_to = tail;
4254 // handle hobject_t format change
4256 for (list<pg_log_entry_t>::iterator i = log.begin();
4259 if (!i->soid.is_max() && i->soid.pool == -1)
4260 i->soid.pool = pool;
// Dumps the log to `f`: "head" and "tail", a "log" array with one "entry"
// object per log entry, and a "dups" array with one "entry" object per dup.
4265 void pg_log_t::dump(Formatter *f) const
4267 f->dump_stream("head") << head;
4268 f->dump_stream("tail") << tail;
4269 f->open_array_section("log");
4270 for (list<pg_log_entry_t>::const_iterator p = log.begin(); p != log.end(); ++p) {
4271 f->open_object_section("entry");
4276 f->open_array_section("dups");
4277 for (const auto& entry : dups) {
4278 f->open_object_section("entry");
// Appends two heap-allocated sample logs to `o`: an empty one, and one with
// head (1,2), tail (3,4) and copies of every sample pg_log_entry_t. As the
// inline comment says, the second instance is not a meaningful log.
// NOTE(review): no visible line frees the entries collected in `e`.
4285 void pg_log_t::generate_test_instances(list<pg_log_t*>& o)
4287 o.push_back(new pg_log_t);
4289 // this is nonsensical:
4290 o.push_back(new pg_log_t);
4291 o.back()->head = eversion_t(1,2);
4292 o.back()->tail = eversion_t(3,4);
4293 list<pg_log_entry_t*> e;
4294 pg_log_entry_t::generate_test_instances(e);
4295 for (list<pg_log_entry_t*>::iterator p = e.begin(); p != e.end(); ++p)
4296 o.back()->log.push_back(**p);
// Copies state from `other`, walking its log from newest to oldest.
// can_rollback_to is taken from `other`; every visited entry is asserted to
// be newer than other.tail, and the walk reaches its stopping condition at
// the first entry whose version is <= v.
// NOTE(review): the statements that copy entries and set head/tail are not
// visible in this excerpt.
4299 void pg_log_t::copy_after(const pg_log_t &other, eversion_t v)
4301 can_rollback_to = other.can_rollback_to;
4304 for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4305 i != other.log.rend();
4307 assert(i->version > other.tail);
4308 if (i->version <= v) {
4309 // make tail accurate.
// Copies state from `other` for the version range bounded by `from` and `to`.
// Asserts that other.log is non-empty and that an entry with version exactly
// `to` exists; starting from that entry the log is walked towards older
// entries until one with version <= from is reached.
// NOTE(review): the statements that copy entries and set head/tail are not
// visible in this excerpt.
4317 void pg_log_t::copy_range(const pg_log_t &other, eversion_t from, eversion_t to)
4319 can_rollback_to = other.can_rollback_to;
4320 list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4321 assert(i != other.log.rend());
// Skip entries newer than `to`; running off the end is a caller error.
4322 while (i->version > to) {
4324 assert(i != other.log.rend());
4326 assert(i->version == to);
4328 for ( ; i != other.log.rend(); ++i) {
4329 if (i->version <= from) {
// Copies state from `other`, walking its log from newest to oldest;
// can_rollback_to is taken from `other`.
// NOTE(review): the loop body is not visible in this excerpt; presumably
// `max` caps the number of entries copied -- confirm.
4337 void pg_log_t::copy_up_to(const pg_log_t &other, int max)
4339 can_rollback_to = other.can_rollback_to;
4343 for (list<pg_log_entry_t>::const_reverse_iterator i = other.log.rbegin();
4344 i != other.log.rend();
// Writes a multi-line rendering of the log to `out`: the summary from
// operator<< for *this, then one line per log entry, then one line per dup
// entry prefixed with " dup entry: ". Every line ends with std::endl.
4354 ostream& pg_log_t::print(ostream& out) const
4356 out << *this << std::endl;
4357 for (list<pg_log_entry_t>::const_iterator p = log.begin();
4360 out << *p << std::endl;
4361 for (const auto& entry : dups) {
4362 out << " dup entry: " << entry << std::endl;
4367 // -- pg_missing_t --
// Streams a missing item. The `have` version is printed in parentheses only
// when it differs from a default-constructed eversion_t, and the item's flag
// string follows " flags = ".
// NOTE(review): the line printing the leading field is not visible here.
4369 ostream& operator<<(ostream& out, const pg_missing_item& i)
4372 if (i.have != eversion_t())
4373 out << "(" << i.have << ")";
4374 out << " flags = " << i.flag_str();
4378 // -- object_copy_cursor_t --
// Serializes the cursor into `bl` (ENCODE_START(1, 1, bl)) in the order
// attr_complete, data_offset, data_complete, omap_offset, omap_complete.
4380 void object_copy_cursor_t::encode(bufferlist& bl) const
4382 ENCODE_START(1, 1, bl);
4383 ::encode(attr_complete, bl);
4384 ::encode(data_offset, bl);
4385 ::encode(data_complete, bl);
4386 ::encode(omap_offset, bl);
4387 ::encode(omap_complete, bl);
// Deserializes the cursor from `bl` (DECODE_START(1, bl)), reading the fields
// in the same order encode() writes them.
4391 void object_copy_cursor_t::decode(bufferlist::iterator &bl)
4393 DECODE_START(1, bl);
4394 ::decode(attr_complete, bl);
4395 ::decode(data_offset, bl);
4396 ::decode(data_complete, bl);
4397 ::decode(omap_offset, bl);
4398 ::decode(omap_complete, bl);
// Dumps the cursor to `f`. The three completion flags are cast to int and
// emitted with dump_unsigned; omap_offset is emitted as a string.
4402 void object_copy_cursor_t::dump(Formatter *f) const
4404 f->dump_unsigned("attr_complete", (int)attr_complete);
4405 f->dump_unsigned("data_offset", data_offset);
4406 f->dump_unsigned("data_complete", (int)data_complete);
4407 f->dump_string("omap_offset", omap_offset);
4408 f->dump_unsigned("omap_complete", (int)omap_complete);
// Appends four heap-allocated sample cursors to `o`, stepping through a copy:
// default, attrs done with data_offset 123, attrs and data done with
// omap_offset "foo", and all three completion flags set.
4411 void object_copy_cursor_t::generate_test_instances(list<object_copy_cursor_t*>& o)
4413 o.push_back(new object_copy_cursor_t);
4414 o.push_back(new object_copy_cursor_t);
4415 o.back()->attr_complete = true;
4416 o.back()->data_offset = 123;
4417 o.push_back(new object_copy_cursor_t);
4418 o.back()->attr_complete = true;
4419 o.back()->data_complete = true;
4420 o.back()->omap_offset = "foo";
4421 o.push_back(new object_copy_cursor_t);
4422 o.back()->attr_complete = true;
4423 o.back()->data_complete = true;
4424 o.back()->omap_complete = true;
4427 // -- object_copy_data_t --
// Serializes the copy payload into `bl` inside an ENCODE_START(7, 5, bl)
// block, writing the fields in the order listed below. `features` is not
// referenced by any line visible in this excerpt.
4429 void object_copy_data_t::encode(bufferlist& bl, uint64_t features) const
4431 ENCODE_START(7, 5, bl);
4433 ::encode(mtime, bl);
4434 ::encode(attrs, bl);
4436 ::encode(omap_data, bl);
4437 ::encode(cursor, bl);
4438 ::encode(omap_header, bl);
4439 ::encode(snaps, bl);
4440 ::encode(snap_seq, bl);
4441 ::encode(flags, bl);
4442 ::encode(data_digest, bl);
4443 ::encode(omap_digest, bl);
4444 ::encode(reqids, bl);
4445 ::encode(truncate_seq, bl);
4446 ::encode(truncate_size, bl);
// Deserializes the copy payload from `bl` (DECODE_START(7, bl)). Two field
// layouts are handled: an older one that still carries `category` and a
// decoded omap map, and the current one matching encode(). Later additions
// are read only when struct_v is high enough (flags/digests at 4, reqids at
// 6, truncate_seq/truncate_size at 7).
// NOTE(review): the condition that selects between the two layouts is not
// visible in this excerpt.
4450 void object_copy_data_t::decode(bufferlist::iterator& bl)
4452 DECODE_START(7, bl);
4456 ::decode(mtime, bl);
4459 ::decode(category, bl); // no longer used
4461 ::decode(attrs, bl);
// The old layout's omap map is re-encoded into omap_data, so this encode
// call inside decode() is deliberate.
4464 map<string,bufferlist> omap;
4468 ::encode(omap, omap_data);
4470 ::decode(cursor, bl);
4472 ::decode(omap_header, bl);
4473 if (struct_v >= 3) {
4474 ::decode(snaps, bl);
4475 ::decode(snap_seq, bl);
4480 if (struct_v >= 4) {
4481 ::decode(flags, bl);
4482 ::decode(data_digest, bl);
4483 ::decode(omap_digest, bl);
// Current layout: same field order as encode().
4488 ::decode(mtime, bl);
4489 ::decode(attrs, bl);
4491 ::decode(omap_data, bl);
4492 ::decode(cursor, bl);
4493 ::decode(omap_header, bl);
4494 ::decode(snaps, bl);
4495 ::decode(snap_seq, bl);
4496 if (struct_v >= 4) {
4497 ::decode(flags, bl);
4498 ::decode(data_digest, bl);
4499 ::decode(omap_digest, bl);
4501 if (struct_v >= 6) {
4502 ::decode(reqids, bl);
4504 if (struct_v >= 7) {
4505 ::decode(truncate_seq, bl);
4506 ::decode(truncate_size, bl);
// Appends three heap-allocated samples to `o`. The first two only take a
// cursor from object_copy_cursor_t::generate_test_instances(); the third is
// populated with size, mtime, an attr, encoded omap data, a data buffer, an
// omap header, one snap and one default reqid.
// NOTE(review): no visible line frees the cursors collected in `cursors`; the
// lines that fill `bl` and `omap` are not visible in this excerpt.
4512 void object_copy_data_t::generate_test_instances(list<object_copy_data_t*>& o)
4514 o.push_back(new object_copy_data_t());
4516 list<object_copy_cursor_t*> cursors;
4517 object_copy_cursor_t::generate_test_instances(cursors);
4518 list<object_copy_cursor_t*>::iterator ci = cursors.begin();
4519 o.back()->cursor = **(ci++);
4521 o.push_back(new object_copy_data_t());
4522 o.back()->cursor = **(ci++);
4524 o.push_back(new object_copy_data_t());
4525 o.back()->size = 1234;
4526 o.back()->mtime.set_from_double(1234);
4527 bufferptr bp("there", 5);
4530 o.back()->attrs["hello"] = bl;
4531 bufferptr bp2("not", 3);
4534 map<string,bufferlist> omap;
4536 ::encode(omap, o.back()->omap_data);
4537 bufferptr databp("iamsomedatatocontain", 20);
4538 o.back()->data.push_back(databp);
4539 o.back()->omap_header.append("this is an omap header");
4540 o.back()->snaps.push_back(123);
4541 o.back()->reqids.push_back(make_pair(osd_reqid_t(), version_t()));
// Dumps a summary of the copy payload to `f`: the cursor section, size,
// mtime, flags, digests, a "snaps" array and a "reqids" array. Attrs, omap
// data, omap header and object data are reported by count or length only,
// not by content.
4544 void object_copy_data_t::dump(Formatter *f) const
4546 f->open_object_section("cursor");
4548 f->close_section(); // cursor
4549 f->dump_int("size", size);
4550 f->dump_stream("mtime") << mtime;
4551 /* we should really print out the attrs here, but bufferlist
4552 const-correctness prevents that */
4553 f->dump_int("attrs_size", attrs.size());
4554 f->dump_int("flags", flags);
4555 f->dump_unsigned("data_digest", data_digest);
4556 f->dump_unsigned("omap_digest", omap_digest);
4557 f->dump_int("omap_data_length", omap_data.length());
4558 f->dump_int("omap_header_length", omap_header.length());
4559 f->dump_int("data_length", data.length());
4560 f->open_array_section("snaps");
4561 for (vector<snapid_t>::const_iterator p = snaps.begin();
4562 p != snaps.end(); ++p)
4563 f->dump_unsigned("snap", *p);
4565 f->open_array_section("reqids");
4566 for (auto p = reqids.begin();
4569 f->open_object_section("extra_reqid");
4570 f->dump_stream("reqid") << p->first;
4571 f->dump_stream("user_version") << p->second;
4577 // -- pg_create_t --
// Serializes the create record into `bl` (ENCODE_START(1, 1, bl)) in the
// order created, parent, split_bits.
4579 void pg_create_t::encode(bufferlist &bl) const
4581 ENCODE_START(1, 1, bl);
4582 ::encode(created, bl);
4583 ::encode(parent, bl);
4584 ::encode(split_bits, bl);
// Deserializes the create record from `bl` (DECODE_START(1, bl)), reading the
// fields in the same order encode() writes them.
4588 void pg_create_t::decode(bufferlist::iterator &bl)
4590 DECODE_START(1, bl);
4591 ::decode(created, bl);
4592 ::decode(parent, bl);
4593 ::decode(split_bits, bl);
// Dumps the create record to `f`: "created" (unsigned), "parent" (streamed)
// and "split_bits" (int).
4597 void pg_create_t::dump(Formatter *f) const
4599 f->dump_unsigned("created", created);
4600 f->dump_stream("parent") << parent;
4601 f->dump_int("split_bits", split_bits);
// Appends two heap-allocated samples to `o`: a default-constructed record and
// one built from (1, pg_t(3, 4, -1), 2).
4604 void pg_create_t::generate_test_instances(list<pg_create_t*>& o)
4606 o.push_back(new pg_create_t);
4607 o.push_back(new pg_create_t(1, pg_t(3, 4, -1), 2));
4611 // -- pg_hit_set_info_t --
// Serializes the hit-set info into `bl` inside an ENCODE_START(2, 1, bl)
// block; begin, version and using_gmt are among the fields written.
// NOTE(review): at least one write between begin and version is not visible
// in this excerpt.
4613 void pg_hit_set_info_t::encode(bufferlist& bl) const
4615 ENCODE_START(2, 1, bl);
4616 ::encode(begin, bl);
4618 ::encode(version, bl);
4619 ::encode(using_gmt, bl);
// Deserializes the hit-set info from `p`. using_gmt is only read when the
// encoded struct_v is at least 2.
// NOTE(review): the DECODE_START line, the earlier field reads and the
// fallback for struct_v < 2 are not visible in this excerpt.
4623 void pg_hit_set_info_t::decode(bufferlist::iterator& p)
4628 ::decode(version, p);
4629 if (struct_v >= 2) {
4630 ::decode(using_gmt, p);
// Dumps begin, end, version and using_gmt to `f`; all four are emitted
// through dump_stream.
4637 void pg_hit_set_info_t::dump(Formatter *f) const
4639 f->dump_stream("begin") << begin;
4640 f->dump_stream("end") << end;
4641 f->dump_stream("version") << version;
4642 f->dump_stream("using_gmt") << using_gmt;
// Appends two heap-allocated samples to `ls`: a default-constructed one and
// one with begin = utime_t(1, 2) and end = utime_t(3, 4).
4645 void pg_hit_set_info_t::generate_test_instances(list<pg_hit_set_info_t*>& ls)
4647 ls.push_back(new pg_hit_set_info_t);
4648 ls.push_back(new pg_hit_set_info_t);
4649 ls.back()->begin = utime_t(1, 2);
4650 ls.back()->end = utime_t(3, 4);
4654 // -- pg_hit_set_history_t --
// Serializes the hit-set history into `bl` (ENCODE_START(1, 1, bl)). A
// default-constructed utime_t and pg_hit_set_info_t are written between
// current_last_update and history.
// NOTE(review): these dummies presumably fill slots of fields that no longer
// exist, to keep the encoding layout stable -- confirm.
4656 void pg_hit_set_history_t::encode(bufferlist& bl) const
4658 ENCODE_START(1, 1, bl);
4659 ::encode(current_last_update, bl);
4661 utime_t dummy_stamp;
4662 ::encode(dummy_stamp, bl);
4665 pg_hit_set_info_t dummy_info;
4666 ::encode(dummy_info, bl);
4668 ::encode(history, bl);
// Deserializes the hit-set history from `p`. The utime_t and
// pg_hit_set_info_t decoded into local dummies are never used afterwards, so
// only current_last_update and history are kept.
// NOTE(review): the DECODE_START line is not visible in this excerpt.
4672 void pg_hit_set_history_t::decode(bufferlist::iterator& p)
4675 ::decode(current_last_update, p);
4677 utime_t dummy_stamp;
4678 ::decode(dummy_stamp, p);
4681 pg_hit_set_info_t dummy_info;
4682 ::decode(dummy_info, p);
4684 ::decode(history, p);
// Dumps current_last_update and a "history" array to `f`, with one "info"
// object opened per history entry.
4688 void pg_hit_set_history_t::dump(Formatter *f) const
4690 f->dump_stream("current_last_update") << current_last_update;
4691 f->open_array_section("history");
4692 for (list<pg_hit_set_info_t>::const_iterator p = history.begin();
4693 p != history.end(); ++p) {
4694 f->open_object_section("info");
// Appends two heap-allocated samples to `ls`: a default-constructed history
// and one with current_last_update (1, 2) holding a single default
// pg_hit_set_info_t.
4701 void pg_hit_set_history_t::generate_test_instances(list<pg_hit_set_history_t*>& ls)
4703 ls.push_back(new pg_hit_set_history_t);
4704 ls.push_back(new pg_hit_set_history_t);
4705 ls.back()->current_last_update = eversion_t(1, 2);
4706 ls.back()->history.push_back(pg_hit_set_info_t());
4709 // -- osd_peer_stat_t --
// Serializes the peer stat into `bl` (ENCODE_START(1, 1, bl)), writing stamp.
4711 void osd_peer_stat_t::encode(bufferlist& bl) const
4713 ENCODE_START(1, 1, bl);
4714 ::encode(stamp, bl);
// Deserializes the peer stat from `bl` (DECODE_START(1, bl)), reading stamp.
4718 void osd_peer_stat_t::decode(bufferlist::iterator& bl)
4720 DECODE_START(1, bl);
4721 ::decode(stamp, bl);
// Dumps the "stamp" field to `f`.
4725 void osd_peer_stat_t::dump(Formatter *f) const
4727 f->dump_stream("stamp") << stamp;
// Appends two heap-allocated samples to `o`: a default-constructed one and
// one with stamp = utime_t(1, 2).
4730 void osd_peer_stat_t::generate_test_instances(list<osd_peer_stat_t*>& o)
4732 o.push_back(new osd_peer_stat_t);
4733 o.push_back(new osd_peer_stat_t);
4734 o.back()->stamp = utime_t(1, 2);
// Streams a peer stat as "stat(<stamp>)".
4737 ostream& operator<<(ostream& out, const osd_peer_stat_t &stat)
4739 return out << "stat(" << stat.stamp << ")";
4743 // -- OSDSuperblock --
// Serializes the superblock into `bl` inside an ENCODE_START(8, 5, bl) block.
// After the live fields, two zero placeholders are written where, per the
// inline comments, last_epoch_marked_full and the
// pool_last_epoch_marked_full map used to be encoded.
4745 void OSDSuperblock::encode(bufferlist &bl) const
4747 ENCODE_START(8, 5, bl);
4748 ::encode(cluster_fsid, bl);
4749 ::encode(whoami, bl);
4750 ::encode(current_epoch, bl);
4751 ::encode(oldest_map, bl);
4752 ::encode(newest_map, bl);
4753 ::encode(weight, bl);
4754 compat_features.encode(bl);
4755 ::encode(clean_thru, bl);
4756 ::encode(mounted, bl);
4757 ::encode(osd_fsid, bl);
4758 ::encode((epoch_t)0, bl); // epoch_t last_epoch_marked_full
4759 ::encode((uint32_t)0, bl); // map<int64_t,epoch_t> pool_last_epoch_marked_full
// Deserializes the superblock from `bl`, accepting older encodings
// (DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl)). When struct_v < 2 the compat
// feature set is not read; CEPH_OSD_FEATURE_INCOMPAT_BASE is inserted
// instead. The two values read for struct_v >= 6 and >= 7 go into locals that
// are never used afterwards.
// NOTE(review): the guards around the magic and osd_fsid reads are not
// visible in this excerpt.
4763 void OSDSuperblock::decode(bufferlist::iterator &bl)
4765 DECODE_START_LEGACY_COMPAT_LEN(8, 5, 5, bl);
4768 ::decode(magic, bl);
4770 ::decode(cluster_fsid, bl);
4771 ::decode(whoami, bl);
4772 ::decode(current_epoch, bl);
4773 ::decode(oldest_map, bl);
4774 ::decode(newest_map, bl);
4775 ::decode(weight, bl);
4776 if (struct_v >= 2) {
4777 compat_features.decode(bl);
4778 } else { //upgrade it!
4779 compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
4781 ::decode(clean_thru, bl);
4782 ::decode(mounted, bl);
4784 ::decode(osd_fsid, bl);
// Consume the retired fields so the iterator advances past them.
4785 if (struct_v >= 6) {
4786 epoch_t last_map_marked_full;
4787 ::decode(last_map_marked_full, bl);
4789 if (struct_v >= 7) {
4790 map<int64_t,epoch_t> pool_last_map_marked_full;
4791 ::decode(pool_last_map_marked_full, bl);
// Dumps the superblock to `f`: both fsids, whoami, the epoch fields, weight,
// the compat feature set inside a "compat" section, clean_thru, and `mounted`
// under the key "last_epoch_mounted".
4796 void OSDSuperblock::dump(Formatter *f) const
4798 f->dump_stream("cluster_fsid") << cluster_fsid;
4799 f->dump_stream("osd_fsid") << osd_fsid;
4800 f->dump_int("whoami", whoami);
4801 f->dump_int("current_epoch", current_epoch);
4802 f->dump_int("oldest_map", oldest_map);
4803 f->dump_int("newest_map", newest_map);
4804 f->dump_float("weight", weight);
4805 f->open_object_section("compat");
4806 compat_features.dump(f);
4808 f->dump_int("clean_thru", clean_thru);
4809 f->dump_int("last_epoch_mounted", mounted);
// Appends heap-allocated copies of a local superblock `z` to `o`: one before
// and two after `z` is modified. The fsids are filled byte-wise with memset
// (cluster_fsid with 1, osd_fsid with 2) and current_epoch is set to 4.
// NOTE(review): the declaration of `z` and several field assignments are not
// visible in this excerpt.
4812 void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
4815 o.push_back(new OSDSuperblock(z));
4816 memset(&z.cluster_fsid, 1, sizeof(z.cluster_fsid));
4817 memset(&z.osd_fsid, 2, sizeof(z.osd_fsid));
4819 z.current_epoch = 4;
4824 o.push_back(new OSDSuperblock(z));
4825 o.push_back(new OSDSuperblock(z));
// Serializes the snap set into `bl` inside an ENCODE_START(3, 2, bl) block,
// writing head_exists, snaps, clones, clone_overlap, clone_size and
// clone_snaps in that order.
// NOTE(review): a write preceding head_exists is not visible in this excerpt.
4830 void SnapSet::encode(bufferlist& bl) const
4832 ENCODE_START(3, 2, bl);
4834 ::encode(head_exists, bl);
4835 ::encode(snaps, bl);
4836 ::encode(clones, bl);
4837 ::encode(clone_overlap, bl);
4838 ::encode(clone_size, bl);
4839 ::encode(clone_snaps, bl);
// Deserializes the snap set from `bl`, accepting older encodings
// (DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl)). clone_snaps is read only
// when struct_v >= 3 and is cleared otherwise.
4843 void SnapSet::decode(bufferlist::iterator& bl)
4845 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
4847 ::decode(head_exists, bl);
4848 ::decode(snaps, bl);
4849 ::decode(clones, bl);
4850 ::decode(clone_overlap, bl);
4851 ::decode(clone_size, bl);
4852 if (struct_v >= 3) {
4853 ::decode(clone_snaps, bl);
4855 clone_snaps.clear();
// Dumps the snap set to `f`: a "snap_context" section built from (seq,
// snaps), head_exists, and a "clones" array. Each clone object carries its
// snap id, size and overlap, plus a nested "snaps" array when clone_snaps has
// an entry for that clone.
4860 void SnapSet::dump(Formatter *f) const
4862 SnapContext sc(seq, snaps);
4863 f->open_object_section("snap_context");
4866 f->dump_int("head_exists", head_exists);
4867 f->open_array_section("clones");
4868 for (vector<snapid_t>::const_iterator p = clones.begin(); p != clones.end(); ++p) {
4869 f->open_object_section("clone");
4870 f->dump_unsigned("snap", *p);
// NOTE(review): both find() results are dereferenced without an end() check,
// so a clone missing from clone_size or clone_overlap would be undefined
// behaviour -- confirm that callers guarantee every clone has both entries.
4871 f->dump_unsigned("size", clone_size.find(*p)->second);
4872 f->dump_stream("overlap") << clone_overlap.find(*p)->second;
4873 auto q = clone_snaps.find(*p);
4874 if (q != clone_snaps.end()) {
4875 f->open_array_section("snaps");
4876 for (auto s : q->second) {
4877 f->dump_unsigned("snap", s);
// Appends three heap-allocated samples to `o`: an empty set, a set with a
// head, seq 123 and snaps {123, 12}, and the same set plus one clone (id 12)
// that has a size, an empty overlap entry and clone_snaps {12, 10, 8}.
4886 void SnapSet::generate_test_instances(list<SnapSet*>& o)
4888 o.push_back(new SnapSet);
4889 o.push_back(new SnapSet);
4890 o.back()->head_exists = true;
4891 o.back()->seq = 123;
4892 o.back()->snaps.push_back(123);
4893 o.back()->snaps.push_back(12);
4894 o.push_back(new SnapSet);
4895 o.back()->head_exists = true;
4896 o.back()->seq = 123;
4897 o.back()->snaps.push_back(123);
4898 o.back()->snaps.push_back(12);
4899 o.back()->clones.push_back(12);
4900 o.back()->clone_size[12] = 12345;
// operator[] is used only for its side effect of creating an empty entry.
4901 o.back()->clone_overlap[12];
4902 o.back()->clone_snaps[12] = {12, 10, 8};
// Streams a snap set starting with "<seq>=<snaps>:". For a legacy set
// (is_legacy()) "+head" is appended when head_exists is true, and any
// clone_snaps present are reported as "+stray_clone_snaps=...".
// NOTE(review): the clone-related parts of both output paths are not visible
// in this excerpt.
4905 ostream& operator<<(ostream& out, const SnapSet& cs)
4907 if (cs.is_legacy()) {
4908 out << cs.seq << "=" << cs.snaps << ":"
4910 << (cs.head_exists ? "+head":"");
4911 if (!cs.clone_snaps.empty()) {
4912 out << "+stray_clone_snaps=" << cs.clone_snaps;
4916 return out << cs.seq << "=" << cs.snaps << ":"
// Rebuilds this SnapSet from a librados snap_set_t. Clone ids and snap ids
// are first collected into ordered sets, per-clone size and overlap are
// recorded, and finally `clones` is filled in ascending order and `snaps` in
// descending order.
// NOTE(review): the use of `legacy`, the handling of the SNAP_HEAD entry and
// the body of the clone_snaps loop are not visible in this excerpt.
4921 void SnapSet::from_snap_set(const librados::snap_set_t& ss, bool legacy)
4923 // NOTE: our reconstruction of snaps (and the snapc) is not strictly
4924 // correct: it will not include snaps that still logically exist
4925 // but for which there was no clone that is defined. For all
4926 // practical purposes this doesn't matter, since we only use that
4927 // information to clone on the OSD, and we have already moved
4928 // forward past that part of the object history.
4931 set<snapid_t> _snaps;
4932 set<snapid_t> _clones;
4933 head_exists = false;
4934 for (vector<librados::clone_info_t>::const_iterator p = ss.clones.begin();
4935 p != ss.clones.end();
4937 if (p->cloneid == librados::SNAP_HEAD) {
4940 _clones.insert(p->cloneid);
4941 _snaps.insert(p->snaps.begin(), p->snaps.end());
4942 clone_size[p->cloneid] = p->size;
4943 clone_overlap[p->cloneid]; // the entry must exist, even if it's empty.
4944 for (vector<pair<uint64_t, uint64_t> >::const_iterator q =
4945 p->overlap.begin(); q != p->overlap.end(); ++q)
4946 clone_overlap[p->cloneid].insert(q->first, q->second);
4948 // p->snaps is ascending; clone_snaps is descending
4949 vector<snapid_t>& v = clone_snaps[p->cloneid];
4950 for (auto q = p->snaps.rbegin(); q != p->snaps.rend(); ++q) {
// std::set iterates in ascending order, so clones come out ascending.
4959 clones.reserve(_clones.size());
4960 for (set<snapid_t>::iterator p = _clones.begin(); p != _clones.end(); ++p)
4961 clones.push_back(*p);
// Reverse iteration over the set yields snaps in descending order.
4965 snaps.reserve(_snaps.size());
4966 for (set<snapid_t>::reverse_iterator p = _snaps.rbegin();
4967 p != _snaps.rend(); ++p)
4968 snaps.push_back(*p);
// Computes a byte count for `clone`: its recorded size minus the length of
// every interval in its overlap set. Asserts that the clone has both a
// clone_size and a clone_overlap entry, and that the running size never goes
// below an interval length.
// NOTE(review): the return statement is not visible in this excerpt;
// presumably the reduced `size` is returned -- confirm.
4971 uint64_t SnapSet::get_clone_bytes(snapid_t clone) const
4973 assert(clone_size.count(clone));
4974 uint64_t size = clone_size.find(clone)->second;
4975 assert(clone_overlap.count(clone));
4976 const interval_set<uint64_t> &overlap = clone_overlap.find(clone)->second;
4977 for (interval_set<uint64_t>::const_iterator i = overlap.begin();
4980 assert(size >= i.get_len());
4981 size -= i.get_len();
// Removes from `snaps`, in place, every snap id that `pinfo` reports as
// removed. The current contents are swapped into a temporary vector and the
// surviving ids are pushed back in their original order.
4986 void SnapSet::filter(const pg_pool_t &pinfo)
4988 vector<snapid_t> oldsnaps;
4989 oldsnaps.swap(snaps);
4990 for (vector<snapid_t>::const_iterator i = oldsnaps.begin();
4991 i != oldsnaps.end();
4993 if (!pinfo.is_removed_snap(*i))
4994 snaps.push_back(*i);
// Const member that returns a SnapSet by value for the given pool info.
// NOTE(review): the body is not visible in this excerpt; presumably it copies
// *this and applies filter(pinfo) to the copy -- confirm.
4998 SnapSet SnapSet::get_filtered(const pg_pool_t &pinfo) const
5005 // -- watch_info_t --
// Serializes the watch info into `bl` inside an ENCODE_START(4, 3, bl) block:
// cookie, timeout_seconds, then addr. Only the addr encoding receives
// `features`.
5007 void watch_info_t::encode(bufferlist& bl, uint64_t features) const
5009 ENCODE_START(4, 3, bl);
5010 ::encode(cookie, bl);
5011 ::encode(timeout_seconds, bl);
5012 ::encode(addr, bl, features);
// Deserializes the watch info from `bl`, accepting older encodings
// (DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl)). cookie and timeout_seconds
// are always read; a further part is read only when struct_v >= 4.
// NOTE(review): the lines between the cookie and timeout_seconds reads and
// the body of the struct_v >= 4 branch (presumably the addr decode) are not
// visible in this excerpt.
5016 void watch_info_t::decode(bufferlist::iterator& bl)
5018 DECODE_START_LEGACY_COMPAT_LEN(4, 3, 3, bl);
5019 ::decode(cookie, bl);
5024 ::decode(timeout_seconds, bl);
5025 if (struct_v >= 4) {
// Dumps cookie and timeout_seconds to `f` and opens a nested "addr" section.
5031 void watch_info_t::dump(Formatter *f) const
5033 f->dump_unsigned("cookie", cookie);
5034 f->dump_unsigned("timeout_seconds", timeout_seconds);
5035 f->open_object_section("addr");
// Appends two heap-allocated samples to `o`: a default-constructed one and
// one with cookie 123, timeout 99 and a legacy-type AF_INET address whose
// four quads are set to 127, 0, 1, 2.
// NOTE(review): the declaration of `ea` and any other address setup are not
// visible in this excerpt.
5040 void watch_info_t::generate_test_instances(list<watch_info_t*>& o)
5042 o.push_back(new watch_info_t);
5043 o.push_back(new watch_info_t);
5044 o.back()->cookie = 123;
5045 o.back()->timeout_seconds = 99;
5047 ea.set_type(entity_addr_t::TYPE_LEGACY);
5049 ea.set_family(AF_INET);
5050 ea.set_in4_quad(0, 127);
5051 ea.set_in4_quad(1, 0);
5052 ea.set_in4_quad(2, 1);
5053 ea.set_in4_quad(3, 2);
5055 o.back()->addr = ea;
5058 // -- object_manifest_t --
// Serializes the manifest into `bl` (ENCODE_START(1, 1, bl)). TYPE_NONE has
// no payload; redirect_target is written in another case.
// NOTE(review): the write of `type`, the switch statement and the case label
// guarding the redirect_target write are not visible in this excerpt.
5060 void object_manifest_t::encode(bufferlist& bl) const
5062 ENCODE_START(1, 1, bl);
5065 case TYPE_NONE: break;
5067 ::encode(redirect_target, bl);
// Deserializes the manifest from `bl` (DECODE_START(1, bl)). TYPE_NONE has no
// payload; redirect_target is read in another case.
// NOTE(review): the read of `type`, the switch statement and the case label
// guarding the redirect_target read are not visible in this excerpt.
5075 void object_manifest_t::decode(bufferlist::iterator& bl)
5077 DECODE_START(1, bl);
5080 case TYPE_NONE: break;
5082 ::decode(redirect_target, bl);
// Dumps the manifest to `f`: the numeric "type" and a "redirect_target"
// section. No visible line makes the redirect_target dump conditional on
// the type.
5090 void object_manifest_t::dump(Formatter *f) const
5092 f->dump_unsigned("type", type);
5093 f->open_object_section("redirect_target");
5094 redirect_target.dump(f);
// Appends one heap-allocated sample to `o`: a default-constructed manifest
// whose type is then set to TYPE_REDIRECT.
5098 void object_manifest_t::generate_test_instances(list<object_manifest_t*>& o)
5100 o.push_back(new object_manifest_t());
5101 o.back()->type = TYPE_REDIRECT;
// Streams a manifest as "type:<type> redirect_target:<target>".
5104 ostream& operator<<(ostream& out, const object_manifest_t& om)
5106 return out << "type:" << om.type << " redirect_target:" << om.redirect_target;
5109 // -- object_info_t --
// Copies from `other` the fields that, per the inline comment, are carried
// over from a head object to its clone: mtime, local_mtime, last_reqid,
// truncate_seq, truncate_size, flags, user_version, data_digest and
// omap_digest.
// NOTE(review): one assignment before mtime is not visible in this excerpt.
5111 void object_info_t::copy_user_bits(const object_info_t& other)
5113 // these bits are copied from head->clone.
5115 mtime = other.mtime;
5116 local_mtime = other.local_mtime;
5117 last_reqid = other.last_reqid;
5118 truncate_seq = other.truncate_seq;
5119 truncate_size = other.truncate_size;
5120 flags = other.flags;
5121 user_version = other.user_version;
5122 data_digest = other.data_digest;
5123 omap_digest = other.omap_digest;
// Computes a placement seed `ps` by hashing with CEPH_STR_HASH_RJENKINS: the
// locator key is hashed when it is non-empty, and oid.name is hashed in the
// other assignment. As the inline comment says, the hash type is hard-coded
// because no OSD map is available here.
// NOTE(review): the length arguments, the else keyword and the return
// statement are not visible in this excerpt.
5126 ps_t object_info_t::legacy_object_locator_to_ps(const object_t &oid,
5127 const object_locator_t &loc) {
5129 if (loc.key.length())
5130 // Hack, we don't have the osd map, so we don't really know the hash...
5131 ps = ceph_str_hash(CEPH_STR_HASH_RJENKINS, loc.key.c_str(),
5134 ps = ceph_str_hash(CEPH_STR_HASH_RJENKINS, oid.name.c_str(),
// Serializes the object info into `bl` inside an ENCODE_START(17, 8, bl)
// block. Several writes exist only for compatibility with older layouts:
//  - a locator built from soid and a zero in the former category slot;
//  - a default osd_reqid_t in the former wrlock_by slot for head objects
//    (soid.snap == CEPH_NOSNAP);
//  - watchers re-keyed by entity name alone (old_watchers);
//  - user_version wrapped in an eversion_t with epoch 0.
// The manifest is written only when has_manifest() is true.
// NOTE(review): the condition guarding the legacy_snaps write is not visible
// in this excerpt.
5139 void object_info_t::encode(bufferlist& bl, uint64_t features) const
5141 object_locator_t myoloc(soid);
// Build the legacy watcher map keyed by the entity name, i.e. the second
// half of the current pair key.
5142 map<entity_name_t, watch_info_t> old_watchers;
5143 for (map<pair<uint64_t, entity_name_t>, watch_info_t>::const_iterator i =
5145 i != watchers.end();
5147 old_watchers.insert(make_pair(i->first.second, i->second));
5149 ENCODE_START(17, 8, bl);
5151 ::encode(myoloc, bl); //Retained for compatibility
5152 ::encode((__u32)0, bl); // was category, no longer used
5153 ::encode(version, bl);
5154 ::encode(prior_version, bl);
5155 ::encode(last_reqid, bl);
5157 ::encode(mtime, bl);
5158 if (soid.snap == CEPH_NOSNAP)
5159 ::encode(osd_reqid_t(), bl); // used to be wrlock_by
5161 ::encode(legacy_snaps, bl);
5162 ::encode(truncate_seq, bl);
5163 ::encode(truncate_size, bl);
5164 ::encode(is_lost(), bl);
5165 ::encode(old_watchers, bl, features);
5166 /* shenanigans to avoid breaking backwards compatibility in the disk format.
5167 * When we can, switch this out for simply putting the version_t on disk. */
5168 eversion_t user_eversion(0, user_version);
5169 ::encode(user_eversion, bl);
5170 ::encode(test_flag(FLAG_USES_TMAP), bl);
5171 ::encode(watchers, bl, features);
// flags is written through a __u32 temporary, fixing the encoded width.
5172 __u32 _flags = flags;
5173 ::encode(_flags, bl);
5174 ::encode(local_mtime, bl);
5175 ::encode(data_digest, bl);
5176 ::encode(omap_digest, bl);
5177 ::encode(expected_object_size, bl);
5178 ::encode(expected_write_size, bl);
5179 ::encode(alloc_hint_flags, bl);
5180 if (has_manifest()) {
5181 ::encode(manifest, bl);
// Deserializes an object_info_t from bl, opening with
// DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl). Fields are read in the
// order encode() writes them; groups added later are gated on struct_v
// (9, 11, 13, 14, 15, 16, 17 in the visible checks).
// NOTE(review): many numbered lines (else branches, closing braces, some
// conditions and reads) are absent from this listing. Assignments that
// follow a gated group look like the defaults for older encodings, but the
// else lines themselves are not visible - confirm against the full file.
5186 void object_info_t::decode(bufferlist::iterator& bl)
5188 object_locator_t myoloc;
5189 DECODE_START_LEGACY_COMPAT_LEN(17, 8, 8, bl);
5190 map<entity_name_t, watch_info_t> old_watchers;
5192 ::decode(myoloc, bl);
5195 ::decode(category, bl); // no longer used
5197 ::decode(version, bl);
5198 ::decode(prior_version, bl);
5199 ::decode(last_reqid, bl);
5201 ::decode(mtime, bl);
// Counterpart of the placeholder osd_reqid_t that encode() writes for
// non-snapshot objects; the decoded value is discarded.
5202 if (soid.snap == CEPH_NOSNAP) {
5203 osd_reqid_t wrlock_by;
5204 ::decode(wrlock_by, bl);
5206 ::decode(legacy_snaps, bl);
5208 ::decode(truncate_seq, bl);
5209 ::decode(truncate_size, bl);
5211 // if this is struct_v >= 13, we will overwrite this
5212 // below since this field is just here for backwards
5218 ::decode(old_watchers, bl);
// user_version is stored wrapped in an eversion_t; only its version
// member is kept.
5219 eversion_t user_eversion;
5220 ::decode(user_eversion, bl);
5221 user_version = user_eversion.version;
5223 if (struct_v >= 9) {
5224 bool uses_tmap = false;
5225 ::decode(uses_tmap, bl);
5227 set_flag(FLAG_USES_TMAP);
5229 set_flag(FLAG_USES_TMAP);
// The pool comes from the separately decoded locator.
5232 soid.pool = myoloc.pool;
5233 if (struct_v >= 11) {
5234 ::decode(watchers, bl);
// Rebuild watchers from the legacy map, keying each entry by
// (watch_info_t::cookie, entity_name_t).
5236 for (map<entity_name_t, watch_info_t>::iterator i = old_watchers.begin();
5237 i != old_watchers.end();
5241 make_pair(i->second.cookie, i->first), i->second));
5244 if (struct_v >= 13) {
5246 ::decode(_flags, bl);
5247 flags = (flag_t)_flags;
5249 if (struct_v >= 14) {
5250 ::decode(local_mtime, bl);
5252 local_mtime = utime_t();
5254 if (struct_v >= 15) {
5255 ::decode(data_digest, bl);
5256 ::decode(omap_digest, bl);
5258 data_digest = omap_digest = -1;
5259 clear_flag(FLAG_DATA_DIGEST);
5260 clear_flag(FLAG_OMAP_DIGEST);
5262 if (struct_v >= 16) {
5263 ::decode(expected_object_size, bl);
5264 ::decode(expected_write_size, bl);
5265 ::decode(alloc_hint_flags, bl);
5267 expected_object_size = 0;
5268 expected_write_size = 0;
5269 alloc_hint_flags = 0;
// has_manifest() is evaluated after flags has been decoded above.
5271 if (struct_v >= 17) {
5272 if (has_manifest()) {
5273 ::decode(manifest, bl);
// Writes this object_info_t to Formatter f: an "oid" section, the version
// and request-id fields as streams, numeric fields as unsigned values, a
// "legacy_snaps" array with one "snap" entry per element, the manifest, and
// a "watchers" section with one sub-section per watcher named after the
// streamed entity_name_t.
// NOTE(review): the section bodies/closers and the declaration of `ss` are
// not visible in this listing.
5279 void object_info_t::dump(Formatter *f) const
5281 f->open_object_section("oid");
5284 f->dump_stream("version") << version;
5285 f->dump_stream("prior_version") << prior_version;
5286 f->dump_stream("last_reqid") << last_reqid;
5287 f->dump_unsigned("user_version", user_version);
5288 f->dump_unsigned("size", size);
5289 f->dump_stream("mtime") << mtime;
5290 f->dump_stream("local_mtime") << local_mtime;
5291 f->dump_unsigned("lost", (int)is_lost());
5292 f->dump_unsigned("flags", (int)flags);
5293 f->open_array_section("legacy_snaps");
5294 for (auto s : legacy_snaps) {
5295 f->dump_unsigned("snap", s);
5298 f->dump_unsigned("truncate_seq", truncate_seq);
5299 f->dump_unsigned("truncate_size", truncate_size);
5300 f->dump_unsigned("data_digest", data_digest);
5301 f->dump_unsigned("omap_digest", omap_digest);
5302 f->dump_unsigned("expected_object_size", expected_object_size);
5303 f->dump_unsigned("expected_write_size", expected_write_size);
5304 f->dump_unsigned("alloc_hint_flags", alloc_hint_flags);
5305 f->dump_object("manifest", manifest);
5306 f->open_object_section("watchers");
5307 for (map<pair<uint64_t, entity_name_t>,watch_info_t>::const_iterator p =
5308 watchers.begin(); p != watchers.end(); ++p) {
5310 ss << p->first.second;
5311 f->open_object_section(ss.str().c_str());
// Appends a heap-allocated, default-constructed object_info_t to o.
// NOTE(review): further numbered lines of this definition are absent from
// this listing.
5318 void object_info_t::generate_test_instances(list<object_info_t*>& o)
5320 o.push_back(new object_info_t());
// Streams a one-line summary of an object_info_t: soid, version,
// last_reqid, legacy_snaps (only for non-NOSNAP objects with a non-empty
// list), the flag string, size ("s"), user_version ("uv"), the data/omap
// digests in hex when their flags are set, the alloc hints, and the
// manifest when present.
// NOTE(review): the closing output and return are not visible in this
// listing.
5326 ostream& operator<<(ostream& out, const object_info_t& oi)
5328 out << oi.soid << "(" << oi.version
5329 << " " << oi.last_reqid;
5330 if (oi.soid.snap != CEPH_NOSNAP && !oi.legacy_snaps.empty())
5331 out << " " << oi.legacy_snaps;
5333 out << " " << oi.get_flag_string();
5334 out << " s " << oi.size;
5335 out << " uv " << oi.user_version;
// std::dec restores decimal formatting after each hex digest.
5336 if (oi.is_data_digest())
5337 out << " dd " << std::hex << oi.data_digest << std::dec;
5338 if (oi.is_omap_digest())
5339 out << " od " << std::hex << oi.omap_digest << std::dec;
5340 out << " alloc_hint [" << oi.expected_object_size
5341 << " " << oi.expected_write_size
5342 << " " << oi.alloc_hint_flags << "]";
5343 if (oi.has_manifest())
5344 out << " " << oi.manifest;
5350 // -- ObjectRecovery --
// Serializes recovery progress into bl, opening with ENCODE_START(1, 1, bl).
// Order written: first, data_complete, data_recovered_to,
// omap_recovered_to, omap_complete. The `error` member that print() shows
// is not among the visible encoded fields.
5351 void ObjectRecoveryProgress::encode(bufferlist &bl) const
5353 ENCODE_START(1, 1, bl);
5354 ::encode(first, bl);
5355 ::encode(data_complete, bl);
5356 ::encode(data_recovered_to, bl);
5357 ::encode(omap_recovered_to, bl);
5358 ::encode(omap_complete, bl);
// Deserializes recovery progress from bl, opening with DECODE_START(1, bl),
// reading the fields in the same order encode() writes them.
5362 void ObjectRecoveryProgress::decode(bufferlist::iterator &bl)
5364 DECODE_START(1, bl);
5365 ::decode(first, bl);
5366 ::decode(data_complete, bl);
5367 ::decode(data_recovered_to, bl);
5368 ::decode(omap_recovered_to, bl);
5369 ::decode(omap_complete, bl);
// Stream insertion for ObjectRecoveryProgress; delegates to prog.print(out).
5373 ostream &operator<<(ostream &out, const ObjectRecoveryProgress &prog)
5375 return prog.print(out);
// Appends two heap-allocated samples to o: one with first=false, both
// completion flags true and data_recovered_to=100; one with first=true,
// both completion flags false and data_recovered_to=0.
5378 void ObjectRecoveryProgress::generate_test_instances(
5379 list<ObjectRecoveryProgress*>& o)
5381 o.push_back(new ObjectRecoveryProgress);
5382 o.back()->first = false;
5383 o.back()->data_complete = true;
5384 o.back()->omap_complete = true;
5385 o.back()->data_recovered_to = 100;
5387 o.push_back(new ObjectRecoveryProgress);
5388 o.back()->first = true;
5389 o.back()->data_complete = false;
5390 o.back()->omap_complete = false;
5391 o.back()->data_recovered_to = 0;
// Streams "ObjectRecoveryProgress(" followed by "first" (prefixed with "!"
// when first is false), data_recovered_to, data_complete,
// omap_recovered_to, omap_complete and error; booleans are spelled
// "true"/"false". Returns the stream.
// NOTE(review): the terminating part of the expression is not visible in
// this listing.
5394 ostream &ObjectRecoveryProgress::print(ostream &out) const
5396 return out << "ObjectRecoveryProgress("
5397 << ( first ? "" : "!" ) << "first, "
5398 << "data_recovered_to:" << data_recovered_to
5399 << ", data_complete:" << ( data_complete ? "true" : "false" )
5400 << ", omap_recovered_to:" << omap_recovered_to
5401 << ", omap_complete:" << ( omap_complete ? "true" : "false" )
5402 << ", error:" << ( error ? "true" : "false" )
// Writes recovery progress to Formatter f. The three boolean members are
// emitted as integers under keys ending in "?"; data_recovered_to is
// unsigned and omap_recovered_to is a string.
5406 void ObjectRecoveryProgress::dump(Formatter *f) const
5408 f->dump_int("first?", first);
5409 f->dump_int("data_complete?", data_complete);
5410 f->dump_unsigned("data_recovered_to", data_recovered_to);
5411 f->dump_int("omap_complete?", omap_complete);
5412 f->dump_string("omap_recovered_to", omap_recovered_to);
// Serializes recovery info into bl, opening with ENCODE_START(2, 1, bl).
// Visible fields: version, oi (the only encode given `features`),
// copy_subset and clone_subset.
// NOTE(review): other numbered lines between these encodes are absent from
// this listing, so more fields are probably written than shown.
5415 void ObjectRecoveryInfo::encode(bufferlist &bl, uint64_t features) const
5417 ENCODE_START(2, 1, bl);
5419 ::encode(version, bl);
5421 ::encode(oi, bl, features);
5423 ::encode(copy_subset, bl);
5424 ::encode(clone_subset, bl);
// Deserializes recovery info from bl, opening with DECODE_START(2, bl).
// After the reads, the visible code tests soid and every clone_subset key
// for "not max and pool == -1"; clone_subset is swapped into a temporary
// and rebuilt entry by entry, moving each interval_set with swap() rather
// than copying it.
// NOTE(review): the second parameter, the statements guarded by the two
// pool checks and the loop condition/increment are not visible in this
// listing; presumably they assign a pool to legacy objects - confirm.
5428 void ObjectRecoveryInfo::decode(bufferlist::iterator &bl,
5431 DECODE_START(2, bl);
5433 ::decode(version, bl);
5437 ::decode(copy_subset, bl);
5438 ::decode(clone_subset, bl);
5442 if (!soid.is_max() && soid.pool == -1)
5444 map<hobject_t, interval_set<uint64_t>> tmp;
5445 tmp.swap(clone_subset);
5446 for (map<hobject_t, interval_set<uint64_t>>::iterator i = tmp.begin();
5449 hobject_t first(i->first);
5450 if (!first.is_max() && first.pool == -1)
5452 clone_subset[first].swap(i->second);
// Appends one heap-allocated sample to o: soid built from
// sobject_t("key", CEPH_NOSNAP), version eversion_t(0,0), size 100.
5457 void ObjectRecoveryInfo::generate_test_instances(
5458 list<ObjectRecoveryInfo*>& o)
5460 o.push_back(new ObjectRecoveryInfo);
5461 o.back()->soid = hobject_t(sobject_t("key", CEPH_NOSNAP));
5462 o.back()->version = eversion_t(0,0);
5463 o.back()->size = 100;
// Writes recovery info to Formatter f: soid as "object", version as
// "at_version", size, "object_info" and "snapset" sections, and the two
// subsets as streams.
// NOTE(review): the contents and closers of the two object sections are
// not visible in this listing.
5467 void ObjectRecoveryInfo::dump(Formatter *f) const
5469 f->dump_stream("object") << soid;
5470 f->dump_stream("at_version") << version;
5471 f->dump_stream("size") << size;
5473 f->open_object_section("object_info");
5478 f->open_object_section("snapset");
5482 f->dump_stream("copy_subset") << copy_subset;
5483 f->dump_stream("clone_subset") << clone_subset;
// Stream insertion for ObjectRecoveryInfo; delegates to inf.print(out).
5486 ostream& operator<<(ostream& out, const ObjectRecoveryInfo &inf)
5488 return inf.print(out);
// Streams "ObjectRecoveryInfo(<soid>@<version>, size: ..., copy_subset:
// ..., clone_subset: ..., snapset: ..." and returns the stream.
// NOTE(review): the terminating part of the expression is not visible in
// this listing.
5491 ostream &ObjectRecoveryInfo::print(ostream &out) const
5493 return out << "ObjectRecoveryInfo("
5494 << soid << "@" << version
5495 << ", size: " << size
5496 << ", copy_subset: " << copy_subset
5497 << ", clone_subset: " << clone_subset
5498 << ", snapset: " << ss
5502 // -- PushReplyOp --
// Appends three heap-allocated samples to o: a default one, one whose soid
// is sobject_t("asdf", 2), and one whose soid is
// sobject_t("asdf", CEPH_NOSNAP).
5503 void PushReplyOp::generate_test_instances(list<PushReplyOp*> &o)
5505 o.push_back(new PushReplyOp);
5506 o.push_back(new PushReplyOp);
5507 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5508 o.push_back(new PushReplyOp);
5509 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
// Serializes this reply into bl, opening with ENCODE_START(1, 1, bl).
// NOTE(review): the field encodes that follow are not visible in this
// listing.
5512 void PushReplyOp::encode(bufferlist &bl) const
5514 ENCODE_START(1, 1, bl);
// Deserializes this reply from bl, opening with DECODE_START(1, bl).
// NOTE(review): the field decodes that follow are not visible in this
// listing.
5519 void PushReplyOp::decode(bufferlist::iterator &bl)
5521 DECODE_START(1, bl);
// Writes the reply's soid to Formatter f as the stream value "soid".
5526 void PushReplyOp::dump(Formatter *f) const
5528 f->dump_stream("soid") << soid;
// Streams "PushReplyOp(<soid>" to out.
// NOTE(review): the start and end of the expression (including the return)
// are not visible in this listing.
5531 ostream &PushReplyOp::print(ostream &out) const
5534 << "PushReplyOp(" << soid
// Stream insertion for PushReplyOp; delegates to op.print(out).
5538 ostream& operator<<(ostream& out, const PushReplyOp &op)
5540 return op.print(out);
// Returns the cost of this op as the sum of two configuration values read
// through cct: osd_push_per_object_cost + osd_recovery_max_chunk. The
// result does not depend on the op's own contents.
5543 uint64_t PushReplyOp::cost(CephContext *cct) const
5546 return cct->_conf->osd_push_per_object_cost +
5547 cct->_conf->osd_recovery_max_chunk;
// Appends three heap-allocated samples to o: a default one; one with soid
// sobject_t("asdf", 2) and recovery_info.version (3, 10); one with soid
// sobject_t("asdf", CEPH_NOSNAP) and recovery_info.version (0, 0).
5551 void PullOp::generate_test_instances(list<PullOp*> &o)
5553 o.push_back(new PullOp);
5554 o.push_back(new PullOp);
5555 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5556 o.back()->recovery_info.version = eversion_t(3, 10);
5557 o.push_back(new PullOp);
5558 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5559 o.back()->recovery_info.version = eversion_t(0, 0);
// Serializes this pull op into bl, opening with ENCODE_START(1, 1, bl).
// Visible fields: recovery_info (encoded with `features`) and
// recovery_progress.
// NOTE(review): one numbered line before recovery_info is absent from this
// listing.
5562 void PullOp::encode(bufferlist &bl, uint64_t features) const
5564 ENCODE_START(1, 1, bl);
5566 ::encode(recovery_info, bl, features);
5567 ::encode(recovery_progress, bl);
// Deserializes this pull op from bl, opening with DECODE_START(1, bl).
// Visible fields: recovery_info, then recovery_progress.
// NOTE(review): one numbered line before recovery_info is absent from this
// listing.
5571 void PullOp::decode(bufferlist::iterator &bl)
5573 DECODE_START(1, bl);
5575 ::decode(recovery_info, bl);
5576 ::decode(recovery_progress, bl);
// Writes this pull op to Formatter f: soid as a stream value, then
// "recovery_info" and "recovery_progress" object sections, each filled by
// the member's own dump(f).
5580 void PullOp::dump(Formatter *f) const
5582 f->dump_stream("soid") << soid;
5584 f->open_object_section("recovery_info");
5585 recovery_info.dump(f);
5589 f->open_object_section("recovery_progress");
5590 recovery_progress.dump(f);
// Streams "PullOp(<soid>, recovery_info: ..., recovery_progress: ..." to
// out.
// NOTE(review): the start and end of the expression (including the return)
// are not visible in this listing.
5595 ostream &PullOp::print(ostream &out) const
5598 << "PullOp(" << soid
5599 << ", recovery_info: " << recovery_info
5600 << ", recovery_progress: " << recovery_progress
// Stream insertion for PullOp; delegates to op.print(out).
5604 ostream& operator<<(ostream& out, const PullOp &op)
5606 return op.print(out);
// Returns the cost of this op as osd_push_per_object_cost +
// osd_recovery_max_chunk, both read from cct->_conf. The same expression
// is used by PushReplyOp::cost.
5609 uint64_t PullOp::cost(CephContext *cct) const
5611 return cct->_conf->osd_push_per_object_cost +
5612 cct->_conf->osd_recovery_max_chunk;
// Appends three heap-allocated samples to o: a default one; one with soid
// sobject_t("asdf", 2) and version (3, 10); one with soid
// sobject_t("asdf", CEPH_NOSNAP) and version (0, 0).
5616 void PushOp::generate_test_instances(list<PushOp*> &o)
5618 o.push_back(new PushOp);
5619 o.push_back(new PushOp);
5620 o.back()->soid = hobject_t(sobject_t("asdf", 2));
5621 o.back()->version = eversion_t(3, 10);
5622 o.push_back(new PushOp);
5623 o.back()->soid = hobject_t(sobject_t("asdf", CEPH_NOSNAP));
5624 o.back()->version = eversion_t(0, 0);
// Serializes this push op into bl, opening with ENCODE_START(1, 1, bl).
// Visible order: version, data_included, omap_header, omap_entries,
// attrset, recovery_info (the only encode given `features`),
// after_progress, before_progress.
// NOTE(review): two numbered lines among these encodes are absent from
// this listing, so additional fields are probably written.
5627 void PushOp::encode(bufferlist &bl, uint64_t features) const
5629 ENCODE_START(1, 1, bl);
5631 ::encode(version, bl);
5633 ::encode(data_included, bl);
5634 ::encode(omap_header, bl);
5635 ::encode(omap_entries, bl);
5636 ::encode(attrset, bl);
5637 ::encode(recovery_info, bl, features);
5638 ::encode(after_progress, bl);
5639 ::encode(before_progress, bl);
// Deserializes this push op from bl, opening with DECODE_START(1, bl),
// reading the visible fields in the same order encode() writes them.
// NOTE(review): two numbered lines among these decodes are absent from
// this listing.
5643 void PushOp::decode(bufferlist::iterator &bl)
5645 DECODE_START(1, bl);
5647 ::decode(version, bl);
5649 ::decode(data_included, bl);
5650 ::decode(omap_header, bl);
5651 ::decode(omap_entries, bl);
5652 ::decode(attrset, bl);
5653 ::decode(recovery_info, bl);
5654 ::decode(after_progress, bl);
5655 ::decode(before_progress, bl);
// Writes this push op to Formatter f. Payload containers are summarized by
// length/size only (data, omap_header, omap_entries, attrset), not dumped
// in full; recovery_info and the two progress members get their own
// object sections.
5659 void PushOp::dump(Formatter *f) const
5661 f->dump_stream("soid") << soid;
5662 f->dump_stream("version") << version;
5663 f->dump_int("data_len", data.length());
5664 f->dump_stream("data_included") << data_included;
5665 f->dump_int("omap_header_len", omap_header.length());
5666 f->dump_int("omap_entries_len", omap_entries.size());
5667 f->dump_int("attrset_len", attrset.size());
5669 f->open_object_section("recovery_info");
5670 recovery_info.dump(f);
5674 f->open_object_section("after_progress");
5675 after_progress.dump(f);
5679 f->open_object_section("before_progress");
5680 before_progress.dump(f);
// Streams a one-line summary starting "PushOp(<soid>": version,
// data_included, the sizes of data/omap_header/omap_entries/attrset, then
// recovery_info and both progress members.
// NOTE(review): the start and end of the expression (including the return)
// are not visible in this listing.
5685 ostream &PushOp::print(ostream &out) const
5688 << "PushOp(" << soid
5689 << ", version: " << version
5690 << ", data_included: " << data_included
5691 << ", data_size: " << data.length()
5692 << ", omap_header_size: " << omap_header.length()
5693 << ", omap_entries_size: " << omap_entries.size()
5694 << ", attrset_size: " << attrset.size()
5695 << ", recovery_info: " << recovery_info
5696 << ", after_progress: " << after_progress
5697 << ", before_progress: " << before_progress
// Stream insertion for PushOp; delegates to op.print(out).
5701 ostream& operator<<(ostream& out, const PushOp &op)
5703 return op.print(out);
// Computes a cost for this push: starts from data_included.size(), adds
// the length of every omap_entries value, then adds the configured
// osd_push_per_object_cost. Unlike the Pull/PushReply variants, the
// visible lines depend on the op's payload.
// NOTE(review): the loop increment and the return are not visible in this
// listing.
5706 uint64_t PushOp::cost(CephContext *cct) const
5708 uint64_t cost = data_included.size();
5709 for (map<string, bufferlist>::const_iterator i =
5710 omap_entries.begin();
5711 i != omap_entries.end();
5713 cost += i->second.length();
5715 cost += cct->_conf->osd_push_per_object_cost;
// Merges an incremental ScrubMap `l` into this one. Asserts that `l`
// starts where this map ends (valid_through == l.incr_since), adopts
// l.valid_through, then walks l.objects: entries flagged `negative` are
// looked up in this map, and the other visible branch stores the entry
// with objects[key] = value.
// NOTE(review): the action taken when a negative entry is found (likely
// an erase) and the else line are not visible in this listing.
5721 void ScrubMap::merge_incr(const ScrubMap &l)
5723 assert(valid_through == l.incr_since);
5724 valid_through = l.valid_through;
5726 for (map<hobject_t,object>::const_iterator p = l.objects.begin();
5727 p != l.objects.end();
5729 if (p->second.negative) {
5730 map<hobject_t,object>::iterator q = objects.find(p->first);
5731 if (q != objects.end()) {
5735 objects[p->first] = p->second;
// Serializes this ScrubMap into bl, opening with ENCODE_START(3, 2, bl).
// Two deprecated slots are still written as placeholders so the layout is
// unchanged: a zero __u32 where attrs used to be and an empty bufferlist
// where the old log buffer used to be.
5740 void ScrubMap::encode(bufferlist& bl) const
5742 ENCODE_START(3, 2, bl);
5743 ::encode(objects, bl);
5744 ::encode((__u32)0, bl); // used to be attrs; now deprecated
5745 bufferlist old_logbl; // not used
5746 ::encode(old_logbl, bl);
5747 ::encode(valid_through, bl);
5748 ::encode(incr_since, bl);
// Deserializes a ScrubMap from bl, opening with
// DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl). The deprecated attrs map and
// old log buffer are decoded into locals and discarded. Afterwards the
// object map is rebuilt through a temporary, testing each key for
// "not max and pool == -1".
// NOTE(review): how `pool` is applied, the guard around the upgrade step,
// the swap into tmp and the loop condition are not visible in this
// listing; presumably keys without a pool get `pool` assigned - confirm.
5752 void ScrubMap::decode(bufferlist::iterator& bl, int64_t pool)
5754 DECODE_START_LEGACY_COMPAT_LEN(3, 2, 2, bl);
5755 ::decode(objects, bl);
5757 map<string,string> attrs; // deprecated
5758 ::decode(attrs, bl);
5760 bufferlist old_logbl; // not used
5761 ::decode(old_logbl, bl);
5762 ::decode(valid_through, bl);
5763 ::decode(incr_since, bl);
5766 // handle hobject_t upgrade
5768 map<hobject_t, object> tmp;
5770 for (map<hobject_t, object>::iterator i = tmp.begin();
5773 hobject_t first(i->first);
5774 if (!first.is_max() && first.pool == -1)
5776 objects[first] = i->second;
// Writes this ScrubMap to Formatter f: valid_through and incr_since as
// streams, then an "objects" array with one "object" section per entry
// carrying the key's name, hash, key string and snapid.
// NOTE(review): the dump of each entry's value and the section closers are
// not visible in this listing.
5781 void ScrubMap::dump(Formatter *f) const
5783 f->dump_stream("valid_through") << valid_through;
5784 f->dump_stream("incremental_since") << incr_since;
5785 f->open_array_section("objects");
5786 for (map<hobject_t,object>::const_iterator p = objects.begin(); p != objects.end(); ++p) {
5787 f->open_object_section("object");
5788 f->dump_string("name", p->first.oid.name);
5789 f->dump_unsigned("hash", p->first.get_hash());
5790 f->dump_string("key", p->first.get_key());
5791 f->dump_int("snapid", p->first.snap);
// Appends two heap-allocated ScrubMaps to o: a default one, and one with
// valid_through (1, 2), incr_since (3, 4) and two object entries ("foo"
// with key "fookey", "bar" with an empty key) copied from the last element
// of a list produced by object::generate_test_instances.
// NOTE(review): the declaration of `obj` and one line between the two
// insertions are not visible in this listing.
5798 void ScrubMap::generate_test_instances(list<ScrubMap*>& o)
5800 o.push_back(new ScrubMap);
5801 o.push_back(new ScrubMap);
5802 o.back()->valid_through = eversion_t(1, 2);
5803 o.back()->incr_since = eversion_t(3, 4);
5805 object::generate_test_instances(obj);
5806 o.back()->objects[hobject_t(object_t("foo"), "fookey", 123, 456, 0, "")] = *obj.back();
5808 o.back()->objects[hobject_t(object_t("bar"), string(), 123, 456, 0, "")] = *obj.back();
5811 // -- ScrubMap::object --
// Serializes one scrub-map object entry into bl, opening with
// ENCODE_START(8, 7, bl). A combined compat_read_error (read_error OR
// ec_hash_mismatch OR ec_size_mismatch) is written first, followed by
// stat_error and then the three individual flags; decode() reads the
// individual flags only when struct_v >= 8. Two zero uint32_t values fill
// the obsolete nlinks and snapcolls slots.
// NOTE(review): one numbered line after ENCODE_START is absent from this
// listing.
5813 void ScrubMap::object::encode(bufferlist& bl) const
5815 bool compat_read_error = read_error || ec_hash_mismatch || ec_size_mismatch;
5816 ENCODE_START(8, 7, bl);
5818 ::encode(negative, bl);
5819 ::encode(attrs, bl);
5820 ::encode(digest, bl);
5821 ::encode(digest_present, bl);
5822 ::encode((uint32_t)0, bl); // obsolete nlinks
5823 ::encode((uint32_t)0, bl); // snapcolls
5824 ::encode(omap_digest, bl);
5825 ::encode(omap_digest_present, bl);
5826 ::encode(compat_read_error, bl);
5827 ::encode(stat_error, bl);
5828 ::encode(read_error, bl);
5829 ::encode(ec_hash_mismatch, bl);
5830 ::encode(ec_size_mismatch, bl);
// Deserializes one scrub-map object entry from bl, opening with
// DECODE_START(8, bl). Several boolean members are assigned from a shared
// local `tmp`; the obsolete nlinks and snapcolls slots are decoded into
// locals and discarded. The final check handles input whose combined
// compat_read_error is set while none of the individual flags are.
// NOTE(review): the ::decode calls that fill `tmp`, the declaration of
// nlinks, and the statement guarded by the final `if` are not visible in
// this listing.
5834 void ScrubMap::object::decode(bufferlist::iterator& bl)
5836 DECODE_START(8, bl);
5838 bool tmp, compat_read_error = false;
5841 ::decode(attrs, bl);
5842 ::decode(digest, bl);
5844 digest_present = tmp;
5847 ::decode(nlinks, bl);
5848 set<snapid_t> snapcolls;
5849 ::decode(snapcolls, bl);
5851 ::decode(omap_digest, bl);
5853 omap_digest_present = tmp;
5854 ::decode(compat_read_error, bl);
5857 if (struct_v >= 8) {
5861 ec_hash_mismatch = tmp;
5863 ec_size_mismatch = tmp;
5865 // If older encoder found a read_error, set read_error
5866 if (compat_read_error && !read_error && !ec_hash_mismatch && !ec_size_mismatch)
// Writes one scrub-map object entry to Formatter f: size and negative as
// integers, then an "attrs" array holding one "attr" section per attribute
// with its name and value length (the value bytes are not dumped).
5871 void ScrubMap::object::dump(Formatter *f) const
5873 f->dump_int("size", size);
5874 f->dump_int("negative", negative);
5875 f->open_array_section("attrs");
5876 for (map<string,bufferptr>::const_iterator p = attrs.begin(); p != attrs.end(); ++p) {
5877 f->open_object_section("attr");
5878 f->dump_string("name", p->first);
5879 f->dump_int("length", p->second.length());
// Appends three heap-allocated sample entries to o: a default one, one
// with negative=true, and one with size 123 and two attributes ("foo" ->
// 3 bytes "foo", "bar" -> 6 bytes "barval").
5885 void ScrubMap::object::generate_test_instances(list<object*>& o)
5887 o.push_back(new object);
5888 o.push_back(new object);
5889 o.back()->negative = true;
5890 o.push_back(new object);
5891 o.back()->size = 123;
5892 o.back()->attrs["foo"] = buffer::copy("foo", 3);
5893 o.back()->attrs["bar"] = buffer::copy("barval", 6);
// Streams a human-readable description of an OSDOp: the op name from
// ceph_osd_op_name(), followed by arguments chosen by the op's class
// (ceph_osd_op_type_data / _attr / _exec / _pg). Data ops print
// op-specific fields (offsets, lengths, cookies, versions, hints); attr
// ops print the xattr name taken from the start of indata plus the value
// length; exec ops print the class and method names taken from indata;
// pg ops print the start epoch or a timestamp.
// NOTE(review): the switch headers, break statements, separators between
// printed names and the return are not visible in this listing.
5898 ostream& operator<<(ostream& out, const OSDOp& op)
5900 out << ceph_osd_op_name(op.op.op);
5901 if (ceph_osd_op_type_data(op.op.op)) {
5904 case CEPH_OSD_OP_ASSERT_VER:
5905 out << " v" << op.op.assert_ver.ver;
5907 case CEPH_OSD_OP_TRUNCATE:
5908 out << " " << op.op.extent.offset;
5910 case CEPH_OSD_OP_MASKTRUNC:
5911 case CEPH_OSD_OP_TRIMTRUNC:
5912 out << " " << op.op.extent.truncate_seq << "@"
5913 << (int64_t)op.op.extent.truncate_size;
5915 case CEPH_OSD_OP_ROLLBACK:
5916 out << " " << snapid_t(op.op.snap.snapid);
5918 case CEPH_OSD_OP_WATCH:
5919 out << " " << ceph_osd_watch_op_name(op.op.watch.op)
5920 << " cookie " << op.op.watch.cookie;
5921 if (op.op.watch.gen)
5922 out << " gen " << op.op.watch.gen;
5924 case CEPH_OSD_OP_NOTIFY:
5925 case CEPH_OSD_OP_NOTIFY_ACK:
5926 out << " cookie " << op.op.notify.cookie;
5928 case CEPH_OSD_OP_COPY_GET:
5929 out << " max " << op.op.copy_get.max;
5931 case CEPH_OSD_OP_COPY_FROM:
5932 out << " ver " << op.op.copy_from.src_version;
5934 case CEPH_OSD_OP_SETALLOCHINT:
5935 out << " object_size " << op.op.alloc_hint.expected_object_size
5936 << " write_size " << op.op.alloc_hint.expected_write_size;
// Extent-style ops share one format: "offset~length", plus the truncate
// seq/size in brackets when truncate_seq is non-zero.
5938 case CEPH_OSD_OP_READ:
5939 case CEPH_OSD_OP_SPARSE_READ:
5940 case CEPH_OSD_OP_SYNC_READ:
5941 case CEPH_OSD_OP_WRITE:
5942 case CEPH_OSD_OP_WRITEFULL:
5943 case CEPH_OSD_OP_ZERO:
5944 case CEPH_OSD_OP_APPEND:
5945 case CEPH_OSD_OP_MAPEXT:
5946 out << " " << op.op.extent.offset << "~" << op.op.extent.length;
5947 if (op.op.extent.truncate_seq)
5948 out << " [" << op.op.extent.truncate_seq << "@"
5949 << (int64_t)op.op.extent.truncate_size << "]";
5951 out << " [" << ceph_osd_op_flag_string(op.op.flags) << "]";
5953 // don't show any arg info
5956 } else if (ceph_osd_op_type_attr(op.op.op)) {
// The xattr name is read from the first name_len bytes of indata, and
// only when indata is non-empty.
5958 if (op.op.xattr.name_len && op.indata.length()) {
5960 op.indata.write(0, op.op.xattr.name_len, out);
5962 if (op.op.xattr.value_len)
5963 out << " (" << op.op.xattr.value_len << ")";
5964 if (op.op.op == CEPH_OSD_OP_CMPXATTR)
5965 out << " op " << (int)op.op.xattr.cmp_op
5966 << " mode " << (int)op.op.xattr.cmp_mode;
5967 } else if (ceph_osd_op_type_exec(op.op.op)) {
// indata holds the class name followed immediately by the method name.
5969 if (op.op.cls.class_len && op.indata.length()) {
5971 op.indata.write(0, op.op.cls.class_len, out);
5973 op.indata.write(op.op.cls.class_len, op.op.cls.method_len, out);
5975 } else if (ceph_osd_op_type_pg(op.op.op)) {
5977 case CEPH_OSD_OP_PGLS:
5978 case CEPH_OSD_OP_PGLS_FILTER:
5979 case CEPH_OSD_OP_PGNLS:
5980 case CEPH_OSD_OP_PGNLS_FILTER:
5981 out << " start_epoch " << op.op.pgls.start_epoch;
5983 case CEPH_OSD_OP_PG_HITSET_LS:
5985 case CEPH_OSD_OP_PG_HITSET_GET:
5986 out << " " << utime_t(op.op.hit_set_get.stamp);
5988 case CEPH_OSD_OP_SCRUBLS:
// Distributes the combined input buffer `in` across ops: for every op with
// a non-zero payload_len, copies payload_len bytes from the shared iterator
// into that op's indata. Ops with payload_len == 0 are left untouched.
// NOTE(review): a single iterator is used for all ops, so each copy
// presumably continues where the previous one stopped - confirm against
// bufferlist::iterator::copy.
5996 void OSDOp::split_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& in)
5998 bufferlist::iterator datap = in.begin();
5999 for (unsigned i = 0; i < ops.size(); i++) {
6000 if (ops[i].op.payload_len) {
6001 datap.copy(ops[i].op.payload_len, ops[i].indata);
// Concatenates the ops' input data into `out`, in vector order. For every
// op with non-empty indata, payload_len is set to the indata length before
// the data is appended; ops with empty indata keep their existing
// payload_len.
6006 void OSDOp::merge_osd_op_vector_in_data(vector<OSDOp>& ops, bufferlist& out)
6008 for (unsigned i = 0; i < ops.size(); i++) {
6009 if (ops[i].indata.length()) {
6010 ops[i].op.payload_len = ops[i].indata.length();
6011 out.append(ops[i].indata);
// Distributes the combined output buffer `in` across ops: for every op
// with a non-zero payload_len, copies payload_len bytes from the shared
// iterator into that op's outdata. Same shape as the in_data variant, but
// targeting outdata.
// NOTE(review): each copy presumably continues where the previous one
// stopped - confirm against bufferlist::iterator::copy.
6016 void OSDOp::split_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& in)
6018 bufferlist::iterator datap = in.begin();
6019 for (unsigned i = 0; i < ops.size(); i++) {
6020 if (ops[i].op.payload_len) {
6021 datap.copy(ops[i].op.payload_len, ops[i].outdata);
// Concatenates the ops' output data into `out`, in vector order. For every
// op with non-empty outdata, payload_len is set to the outdata length
// before the data is appended.
6026 void OSDOp::merge_osd_op_vector_out_data(vector<OSDOp>& ops, bufferlist& out)
6028 for (unsigned i = 0; i < ops.size(); i++) {
6029 if (ops[i].outdata.length()) {
6030 ops[i].op.payload_len = ops[i].outdata.length();
6031 out.append(ops[i].outdata);
// Member-wise equality: true only when total, available, allocated,
// stored, compressed, compressed_allocated and compressed_original all
// match. These are the same seven members that dump() emits.
6036 bool store_statfs_t::operator==(const store_statfs_t& other) const
6038 return total == other.total
6039 && available == other.available
6040 && allocated == other.allocated
6041 && stored == other.stored
6042 && compressed == other.compressed
6043 && compressed_allocated == other.compressed_allocated
6044 && compressed_original == other.compressed_original;
// Writes the seven statfs counters to Formatter f as integers, each under
// a key equal to its member name.
6047 void store_statfs_t::dump(Formatter *f) const
6049 f->dump_int("total", total);
6050 f->dump_int("available", available);
6051 f->dump_int("allocated", allocated);
6052 f->dump_int("stored", stored);
6053 f->dump_int("compressed", compressed);
6054 f->dump_int("compressed_allocated", compressed_allocated);
6055 f->dump_int("compressed_original", compressed_original);
// Streams a summary starting "store_statfs(0x<available>", then
// "stored 0x<stored>/0x<allocated>" and
// "compress 0x<compressed>/0x<compressed_allocated>/0x<compressed_original>".
// NOTE(review): the literal "0x" prefixes suggest the stream is switched to
// hex, but the manipulator, one operand after `available`, the closing
// text and the return are not visible in this listing.
6058 ostream& operator<<(ostream& out, const store_statfs_t &s)
6061 << "store_statfs(0x" << s.available
6063 << ", stored 0x" << s.stored
6064 << "/0x" << s.allocated
6065 << ", compress 0x" << s.compressed
6066 << "/0x" << s.compressed_allocated
6067 << "/0x" << s.compressed_original
6073 void OSDOp::clear_data(vector<OSDOp>& ops)
6075 for (unsigned i = 0; i < ops.size(); i++) {
6078 if (ceph_osd_op_type_attr(op.op.op) &&
6079 op.op.xattr.name_len &&
6080 op.indata.length() >= op.op.xattr.name_len) {
6081 bufferptr bp(op.op.xattr.name_len);
6084 bl.copy_in(0, op.op.xattr.name_len, op.indata);
6085 op.indata.claim(bl);
6086 } else if (ceph_osd_op_type_exec(op.op.op) &&
6087 op.op.cls.class_len &&
6088 op.indata.length() >
6089 (op.op.cls.class_len + op.op.cls.method_len)) {
6090 __u8 len = op.op.cls.class_len + op.op.cls.method_len;
6094 bl.copy_in(0, len, op.indata);
6095 op.indata.claim(bl);