1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <boost/statechart/custom_reaction.hpp>
19 #include <boost/statechart/event.hpp>
20 #include <boost/statechart/simple_state.hpp>
21 #include <boost/statechart/state.hpp>
22 #include <boost/statechart/state_machine.hpp>
23 #include <boost/statechart/transition.hpp>
24 #include <boost/statechart/event_base.hpp>
25 #include <boost/scoped_ptr.hpp>
26 #include <boost/circular_buffer.hpp>
27 #include "include/memory.h"
28 #include "include/mempool.h"
30 // re-include our assert to clobber boost's
31 #include "include/assert.h"
33 #include "include/types.h"
34 #include "include/stringify.h"
35 #include "osd_types.h"
36 #include "include/xlist.h"
37 #include "SnapMapper.h"
39 #include "common/Timer.h"
43 #include "messages/MOSDPGLog.h"
44 #include "include/str_list.h"
45 #include "PGBackend.h"
55 // #include "include/unordered_map.h"
56 // #include "include/unordered_set.h"
58 //#define DEBUG_RECOVERY_OIDS // track set of recovering oids explicitly, to find counting bugs
69 typedef OpRequest::Ref OpRequestRef;
77 void intrusive_ptr_add_ref(PG *pg);
78 void intrusive_ptr_release(PG *pg);
80 using state_history_entry = std::tuple<utime_t, utime_t, const char*>;
81 using embedded_state = std::pair<utime_t, const char*>;
// Tracks the states a PG passes through: a stack of states that have been
// entered but not yet exited, plus a history of completed visits recorded as
// (enter time, exit time, state name).
83 struct PGStateInstance {
84 // Time spent in pg states
// Remember the epoch associated with this instance.
86 void setepoch(const epoch_t current_epoch) {
87 this_epoch = current_epoch;
// Push a newly entered state, stamped with its entry time, onto the stack of
// nested states.
90 void enter_state(const utime_t entime, const char* state) {
91 embedded_states.push(std::make_pair(entime, state));
// Take the innermost entered state off the stack and append it to
// state_history together with the supplied exit time.
94 void exit_state(const utime_t extime) {
95 embedded_state this_state = embedded_states.top();
96 state_history.push_back(state_history_entry{
97 this_state.first, extime, this_state.second});
98 embedded_states.pop();
// Completed state visits, appended in the order they were exited.
103 std::vector<state_history_entry> state_history;
// States entered but not yet exited; top() is the innermost one.
104 std::stack<embedded_state> embedded_states;
// Holds PGStateInstance records for one PG in a fixed-capacity circular
// buffer. enter(), exit() and dump() are only declared here.
107 class PGStateHistory {
108 // Member access protected with the PG lock
// The circular buffer is created with a capacity of 10 entries.
110 PGStateHistory() : buffer(10) {}
// Record that `pg` entered `state` at time `entime`.
112 void enter(PG* pg, const utime_t entime, const char* state);
// Record that `state` was exited.
114 void exit(const char* state);
// Flag that the owning PG is being destroyed.
// NOTE(review): how enter()/exit() react to this flag is decided in their
// out-of-line definitions -- confirm there.
120 void set_pg_in_destructor() { pg_in_destructor = true; }
// Write the recorded history to the given Formatter.
122 void dump(Formatter* f) const;
125 bool pg_in_destructor = false;
126 PG* thispg = nullptr;
// NOTE(review): tmppi owns a PGStateInstance while pi is a non-owning
// pointer; presumably pi refers to the instance currently being filled in
// before it is moved into `buffer` -- confirm against enter()/exit().
127 std::unique_ptr<PGStateInstance> tmppi;
128 PGStateInstance* pi = nullptr;
129 boost::circular_buffer<std::unique_ptr<PGStateInstance>> buffer;
134 #include "common/tracked_int_ptr.hpp"
135 uint64_t get_with_id(PG *pg);
136 void put_with_id(PG *pg, uint64_t id);
137 typedef TrackedIntPtr<PG> PGRef;
139 typedef boost::intrusive_ptr<PG> PGRef;
// Collects per-state counters and timings for PG recovery states. The
// methods shown here take `lock` before touching `info`.
142 class PGRecoveryStats {
// Counters and accumulated times for a single state name.
143 struct per_state_info {
144 uint64_t enter, exit; // enter/exit counts
146 utime_t event_time; // time spent processing events
147 utime_t total_time; // total time in state
148 utime_t min_time, max_time;
// Only the integer counters are initialised explicitly; the utime_t members
// are left to their own default construction.
150 // cppcheck-suppress unreachableCode
151 per_state_info() : enter(0), exit(0), events(0) {}
// Keyed by the state-name pointer. No comparator is supplied, so keys are
// ordered and matched by pointer value rather than by string contents.
153 map<const char *,per_state_info> info;
157 PGRecoveryStats() : lock("PGRecoverStats::lock") {}
160 Mutex::Locker l(lock);
// Write the statistics of every recorded state to `out`, fields separated by
// tab characters.
163 void dump(ostream& out) {
164 Mutex::Locker l(lock);
165 for (map<const char *,per_state_info>::iterator p = info.begin(); p != info.end(); ++p) {
166 per_state_info& i = p->second;
167 out << i.enter << "\t" << i.exit << "\t"
168 << i.events << "\t" << i.event_time << "\t"
169 << i.total_time << "\t"
170 << i.min_time << "\t" << i.max_time << "\t"
// Emit the same statistics through a Formatter: an array section named
// "pg_recovery_stats" holding one "recovery_state" object per state.
175 void dump_formatted(Formatter *f) {
176 Mutex::Locker l(lock);
177 f->open_array_section("pg_recovery_stats");
178 for (map<const char *,per_state_info>::iterator p = info.begin();
179 p != info.end(); ++p) {
180 per_state_info& i = p->second;
181 f->open_object_section("recovery_state");
182 f->dump_int("enter", i.enter);
183 f->dump_int("exit", i.exit);
184 f->dump_int("events", i.events);
185 f->dump_stream("event_time") << i.event_time;
186 f->dump_stream("total_time") << i.total_time;
187 f->dump_stream("min_time") << i.min_time;
188 f->dump_stream("max_time") << i.max_time;
// The state name is split on "/" and each component is written as one entry
// of the "nested_states" array.
189 vector<string> states;
190 get_str_vec(p->first, "/", states);
191 f->open_array_section("nested_states");
192 for (vector<string>::iterator st = states.begin();
193 st != states.end(); ++st) {
194 f->dump_string("state", *st);
// Note that state `s` was entered.
// NOTE(review): presumably increments the `enter` counter for `s` -- confirm.
202 void log_enter(const char *s) {
203 Mutex::Locker l(lock);
// Fold one completed visit to state `s` into its statistics. `dur` is
// compared against the recorded max_time and min_time, and `event_dur` is
// added to event_time. A default-constructed min_time is treated as
// "not set yet", so the first duration always qualifies as the minimum.
206 void log_exit(const char *s, utime_t dur, uint64_t events, utime_t event_dur) {
207 Mutex::Locker l(lock);
208 per_state_info &i = info[s];
211 if (dur > i.max_time)
213 if (dur < i.min_time || i.min_time == utime_t())
216 i.event_time += event_dur;
222 epoch_t cached_epoch;
228 SnapContext snapc; // the default pool snapc, ready to go.
230 interval_set<snapid_t> cached_removed_snaps; // current removed_snaps set
231 interval_set<snapid_t> newly_removed_snaps; // newly removed in the last epoch
// Capture pool metadata from the given OSDMap: the map epoch, the pool name
// and auid, the pool's snap context and its removed-snaps set.
// The initialisers look the pool up with the member `id`, not the parameter
// `i`, so `id` has to be initialised before them.
// NOTE(review): map->get_pg_pool(id) is dereferenced directly in the
// initialiser for auid; confirm the pool is guaranteed to exist in `map`.
233 PGPool(CephContext* cct, OSDMapRef map, int64_t i)
235 cached_epoch(map->get_epoch()),
237 name(map->get_pool_name(id)),
238 auid(map->get_pg_pool(id)->auid) {
239 const pg_pool_t *pi = map->get_pg_pool(id);
242 snapc = pi->get_snap_context();
243 pi->build_removed_snaps(cached_removed_snaps);
246 void update(OSDMapRef map);
249 /** PG - Replica Placement Group
253 class PG : public DoutPrefixProvider {
258 SnapMapper snap_mapper;
259 bool eio_errors_to_process = false;
261 virtual PGBackend *get_pgbackend() = 0;
263 std::string gen_prefix() const override;
264 CephContext *get_cct() const override { return cct; }
265 unsigned get_subsys() const override { return ceph_subsys_osd; }
268 void update_snap_mapper_bits(uint32_t bits) {
269 snap_mapper.update_bits(bits);
271 /// get_is_recoverable_predicate: caller owns returned pointer and must delete when done
272 IsPGRecoverablePredicate *get_is_recoverable_predicate() {
273 return get_pgbackend()->get_is_recoverable_predicate();
276 OSDMapRef osdmap_ref;
277 OSDMapRef last_persisted_osdmap_ref;
280 void requeue_map_waiters();
// Replace the cached OSDMap reference with `newmap`. The calling thread must
// hold _lock; this is asserted. `newmap` is taken by value and moved into
// place.
282 void update_osdmap_ref(OSDMapRef newmap) {
283 assert(_lock.is_locked_by_me());
284 osdmap_ref = std::move(newmap);
288 OSDMapRef get_osdmap() const {
295 /** locking and reference counting.
296 * I destroy myself when the reference count hits zero.
297 * lock() should be called before doing anything.
298 * get() should be called on pointer copy (to another thread, etc.).
299 * put() should be called on destruction of some previously copied pointer.
300 * unlock() when done with the current pointer (_most common_).
303 std::atomic_uint ref{0};
307 map<uint64_t, string> _live_ids;
308 map<string, uint64_t> _tag_counts;
313 bool deleting; // true while in removing or OSD is shutting down
315 ZTracer::Endpoint trace_endpoint;
317 void lock_suspend_timeout(ThreadPool::TPHandle &handle);
318 void lock(bool no_lockdep = false) const;
319 void unlock() const {
320 //generic_dout(0) << this << " " << info.pgid << " unlock" << dendl;
322 assert(!dirty_big_info);
326 bool is_locked() const {
327 return _lock.is_locked();
331 uint64_t get_with_id();
332 void put_with_id(uint64_t);
333 void dump_live_ids();
335 void get(const char* tag);
336 void put(const char* tag);
338 bool dirty_info, dirty_big_info;
341 bool is_ec_pg() const {
342 return pool.info.ec_pool();
345 pg_info_t info; ///< current pg info
346 pg_info_t last_written_info; ///< last written info
348 static const __u8 cur_struct_v = 10;
349 // v10 is the new past_intervals encoding
350 // v9 was fastinfo_key addition
351 // v8 was the move to a per-pg pgmeta object
352 // v7 was SnapMapper addition in 86658392516d5175b2756659ef7ffaaf95b0f8ad
353 // (first appeared in cuttlefish).
354 static const __u8 compat_struct_v = 7;
355 bool must_upgrade() {
356 return info_struct_v < cur_struct_v;
359 return info_struct_v >= compat_struct_v;
361 void upgrade(ObjectStore *store);
364 ObjectStore::CollectionHandle ch;
// Build the key "<pgid>_info" from the stringified pgid.
366 static string get_info_key(spg_t pgid) {
367 return stringify(pgid) + "_info";
// Build the key "<pgid>_biginfo" from the stringified pgid.
369 static string get_biginfo_key(spg_t pgid) {
370 return stringify(pgid) + "_biginfo";
// Build the key "<pgid>_epoch" from the stringified pgid.
372 static string get_epoch_key(spg_t pgid) {
373 return stringify(pgid) + "_epoch";
375 ghobject_t pgmeta_oid;
378 map<hobject_t, pg_missing_item> needs_recovery_map;
379 map<hobject_t, set<pg_shard_t> > missing_loc;
380 set<pg_shard_t> missing_loc_sources;
382 set<pg_shard_t> empty_set;
384 boost::scoped_ptr<IsPGReadablePredicate> is_readable;
385 boost::scoped_ptr<IsPGRecoverablePredicate> is_recoverable;
386 explicit MissingLoc(PG *pg)
388 void set_backend_predicates(
389 IsPGReadablePredicate *_is_readable,
390 IsPGRecoverablePredicate *_is_recoverable) {
391 is_readable.reset(_is_readable);
392 is_recoverable.reset(_is_recoverable);
394 string gen_prefix() const { return pg->gen_prefix(); }
396 const hobject_t &hoid,
397 eversion_t *v = 0) const {
398 map<hobject_t, pg_missing_item>::const_iterator i =
399 needs_recovery_map.find(hoid);
400 if (i == needs_recovery_map.end())
// Look `hoid` up in needs_recovery_map; when an entry exists, report whether
// that missing item is a delete.
406 bool is_deleted(const hobject_t &hoid) const {
407 auto i = needs_recovery_map.find(hoid);
408 if (i == needs_recovery_map.end())
410 return i->second.is_delete();
// An object is unfound when it needs recovery, is not a delete, and either
// has no entry in missing_loc or its recorded locations do not satisfy the
// is_recoverable predicate.
// The predicate pointer is dereferenced unconditionally, so
// set_backend_predicates() must have installed it before this is called.
412 bool is_unfound(const hobject_t &hoid) const {
413 return needs_recovery(hoid) && !is_deleted(hoid) && (
414 !missing_loc.count(hoid) ||
415 !(*is_recoverable)(missing_loc.find(hoid)->second));
417 bool readable_with_acting(
418 const hobject_t &hoid,
419 const set<pg_shard_t> &acting) const;
// Walk every entry of needs_recovery_map and test it with is_unfound().
// NOTE(review): presumably returns the number of entries for which the test
// holds -- confirm.
420 uint64_t num_unfound() const {
422 for (map<hobject_t, pg_missing_item>::const_iterator i =
423 needs_recovery_map.begin();
424 i != needs_recovery_map.end();
426 if (is_unfound(i->first))
// Walk needs_recovery_map testing each entry with is_unfound().
// NOTE(review): presumably returns true as soon as one entry is unfound and
// false otherwise -- confirm.
432 bool have_unfound() const {
433 for (map<hobject_t, pg_missing_item>::const_iterator i =
434 needs_recovery_map.begin();
435 i != needs_recovery_map.end();
437 if (is_unfound(i->first))
443 needs_recovery_map.clear();
445 missing_loc_sources.clear();
// Record `location` as a shard holding `hoid`. operator[] creates the
// location set for `hoid` if it does not exist yet.
448 void add_location(const hobject_t &hoid, pg_shard_t location) {
449 missing_loc[hoid].insert(location);
// Remove `location` from the shards recorded for `hoid`. Because operator[]
// is used, an empty entry for `hoid` is created when none existed, and an
// entry whose last location is removed stays in the map as an empty set.
451 void remove_location(const hobject_t &hoid, pg_shard_t location) {
452 missing_loc[hoid].erase(location);
// Merge the items of `missing` into needs_recovery_map. Objects not yet
// tracked are inserted. For an object that is already tracked, an
// "unexpected need" message is logged at level 0 and the two `need` versions
// are asserted to be equal.
454 void add_active_missing(const pg_missing_t &missing) {
455 for (map<hobject_t, pg_missing_item>::const_iterator i =
456 missing.get_items().begin();
457 i != missing.get_items().end();
459 map<hobject_t, pg_missing_item>::const_iterator j =
460 needs_recovery_map.find(i->first);
461 if (j == needs_recovery_map.end()) {
462 needs_recovery_map.insert(*i);
464 lgeneric_dout(pg->cct, 0) << this << " " << pg->info.pgid << " unexpected need for "
465 << i->first << " have " << j->second
466 << " tried to add " << i->second << dendl;
467 assert(i->second.need == j->second.need);
// Record that `hoid` needs recovery to version `need` while we hold `have`.
// Any existing entry for `hoid` is overwritten.
472 void add_missing(const hobject_t &hoid, eversion_t need, eversion_t have) {
473 needs_recovery_map[hoid] = pg_missing_item(need, have);
// Change the version needed for `hoid`. The object must already be tracked
// as needing recovery; this is asserted.
475 void revise_need(const hobject_t &hoid, eversion_t need) {
476 assert(needs_recovery(hoid));
477 needs_recovery_map[hoid].need = need;
480 /// Adds info about a possible recovery source
481 bool add_source_info(
482 pg_shard_t source, ///< [in] source
483 const pg_info_t &oinfo, ///< [in] info
484 const pg_missing_t &omissing, ///< [in] (optional) missing
485 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
486 ); ///< @return whether a new object location was discovered
488 /// Adds recovery sources in batch
489 void add_batch_sources_info(
490 const set<pg_shard_t> &sources, ///< [in] a set of resources which can be used for all objects
491 ThreadPool::TPHandle* handle ///< [in] ThreadPool handle
494 /// Uses osdmap to update structures for now down sources
495 void check_recovery_sources(const OSDMapRef& osdmap);
497 /// Call when hoid is no longer missing in acting set
// Drops `hoid` from both needs_recovery_map and missing_loc; erasing a key
// that is not present is a no-op.
498 void recovered(const hobject_t &hoid) {
499 needs_recovery_map.erase(hoid);
500 missing_loc.erase(hoid);
503 /// Call to update structures for hoid after a change
505 const hobject_t &hoid,
507 const set<pg_shard_t> to_recover,
508 const pg_info_t &info,
509 const pg_missing_t &missing,
510 const map<pg_shard_t, pg_missing_t> &pmissing,
511 const map<pg_shard_t, pg_info_t> &pinfo) {
513 boost::optional<pg_missing_item> item;
514 auto miter = missing.get_items().find(hoid);
515 if (miter != missing.get_items().end()) {
516 item = miter->second;
518 for (auto &&i: to_recover) {
521 auto pmiter = pmissing.find(i);
522 assert(pmiter != pmissing.end());
523 miter = pmiter->second.get_items().find(hoid);
524 if (miter != pmiter->second.get_items().end()) {
525 item = miter->second;
531 return; // recovered!
533 needs_recovery_map[hoid] = *item;
534 if (item->is_delete())
537 missing_loc.insert(make_pair(hoid, set<pg_shard_t>())).first;
538 assert(info.last_backfill.is_max());
539 assert(info.last_update >= item->need);
540 if (!missing.is_missing(hoid))
541 mliter->second.insert(self);
542 for (auto &&i: pmissing) {
543 auto pinfoiter = pinfo.find(i.first);
544 assert(pinfoiter != pinfo.end());
545 if (item->need <= pinfoiter->second.last_update &&
546 hoid <= pinfoiter->second.last_backfill &&
547 !i.second.is_missing(hoid))
548 mliter->second.insert(i.first);
// Return the shards recorded for `hoid`, or a reference to the shared
// empty_set member when nothing is recorded. The result is a reference into
// this object, so it is only valid while the entry is left unmodified.
552 const set<pg_shard_t> &get_locations(const hobject_t &hoid) const {
553 return missing_loc.count(hoid) ?
554 missing_loc.find(hoid)->second : empty_set;
556 const map<hobject_t, set<pg_shard_t>> &get_missing_locs() const {
559 const map<hobject_t, pg_missing_item> &get_needs_recovery() const {
560 return needs_recovery_map;
564 PastIntervals past_intervals;
566 interval_set<snapid_t> snap_trimq;
568 /* You should not use these items without taking their respective queue locks
569 * (if they have one) */
570 xlist<PG*>::item stat_queue_item;
572 bool recovery_queued;
574 int recovery_ops_active;
575 set<pg_shard_t> waiting_on_backfill;
576 #ifdef DEBUG_RECOVERY_OIDS
577 set<hobject_t> recovering_oids;
581 int role; // 0 = primary, 1 = replica, -1=none.
582 unsigned state; // PG_STATE_*
584 bool send_notify; ///< true if we are non-primary and should notify the primary
587 eversion_t last_update_ondisk; // last_update that has committed; ONLY DEFINED WHEN is_active()
588 eversion_t last_complete_ondisk; // last_complete that has committed.
589 eversion_t last_update_applied;
// Completion context that stores `v` into the PG's
// last_rollback_info_trimmed_to_applied when it fires, but only if the PG
// has not reset since epoch `e`.
592 struct C_UpdateLastRollbackInfoTrimmedToApplied : Context {
596 C_UpdateLastRollbackInfoTrimmedToApplied(PG *pg, epoch_t e, eversion_t v)
597 : pg(pg), e(e), v(v) {}
// The int completion code is ignored.
598 void finish(int) override {
600 if (!pg->pg_has_reset_since(e)) {
601 pg->last_rollback_info_trimmed_to_applied = v;
606 // entries <= last_rollback_info_trimmed_to_applied have been trimmed,
607 // and the transaction has applied
608 eversion_t last_rollback_info_trimmed_to_applied;
613 pg_shard_t pg_whoami;
614 pg_shard_t up_primary;
615 vector<int> up, acting, want_acting;
616 set<pg_shard_t> actingbackfill, actingset, upset;
617 map<pg_shard_t,eversion_t> peer_last_complete_ondisk;
618 eversion_t min_last_complete_ondisk; // up: min over last_complete_ondisk, peer_last_complete_ondisk
619 eversion_t pg_trim_to;
621 set<int> blocked_by; ///< osds we are blocked by (for pg stats)
623 // [primary only] content recovery state
626 struct BufferedRecoveryMessages {
627 map<int, map<spg_t, pg_query_t> > query_map;
628 map<int, vector<pair<pg_notify_t, PastIntervals> > > info_map;
629 map<int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
634 map<int, map<spg_t, pg_query_t> > *query_map;
635 map<int, vector<pair<pg_notify_t, PastIntervals> > > *info_map;
636 map<int, vector<pair<pg_notify_t, PastIntervals> > > *notify_list;
637 set<PGRef> created_pgs;
638 C_Contexts *on_applied;
640 ObjectStore::Transaction *transaction;
641 ThreadPool::TPHandle* handle;
642 RecoveryCtx(map<int, map<spg_t, pg_query_t> > *query_map,
644 vector<pair<pg_notify_t, PastIntervals> > > *info_map,
646 vector<pair<pg_notify_t, PastIntervals> > > *notify_list,
647 C_Contexts *on_applied,
649 ObjectStore::Transaction *transaction)
650 : query_map(query_map), info_map(info_map),
651 notify_list(notify_list),
652 on_applied(on_applied),
654 transaction(transaction),
657 RecoveryCtx(BufferedRecoveryMessages &buf, RecoveryCtx &rctx)
658 : query_map(&(buf.query_map)),
659 info_map(&(buf.info_map)),
660 notify_list(&(buf.notify_list)),
661 on_applied(rctx.on_applied),
662 on_safe(rctx.on_safe),
663 transaction(rctx.transaction),
664 handle(rctx.handle) {}
// Merge the messages buffered in `m` into the maps this context points at.
// NOTE(review): the int keys of the outer maps are presumably OSD ids --
// confirm against the callers.
666 void accept_buffered_messages(BufferedRecoveryMessages &m) {
// Queries: copied per (key, spg_t); a buffered query replaces any query
// already present for the same pg.
670 for (map<int, map<spg_t, pg_query_t> >::iterator i = m.query_map.begin();
671 i != m.query_map.end();
673 map<spg_t, pg_query_t> &omap = (*query_map)[i->first];
674 for (map<spg_t, pg_query_t>::iterator j = i->second.begin();
675 j != i->second.end();
677 omap[j->first] = j->second;
// Infos: the buffered entries are appended to the destination vector, which
// is grown once up front with reserve().
680 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
681 = m.info_map.begin();
682 i != m.info_map.end();
684 vector<pair<pg_notify_t, PastIntervals> > &ovec =
685 (*info_map)[i->first];
686 ovec.reserve(ovec.size() + i->second.size());
687 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
// Notifies: appended in the same way as the infos above.
689 for (map<int, vector<pair<pg_notify_t, PastIntervals> > >::iterator i
690 = m.notify_list.begin();
691 i != m.notify_list.end();
693 vector<pair<pg_notify_t, PastIntervals> > &ovec =
694 (*notify_list)[i->first];
695 ovec.reserve(ovec.size() + i->second.size());
696 ovec.insert(ovec.end(), i->second.begin(), i->second.end());
702 PGStateHistory pgstate_history;
// Name of the state this object represents. Only the pointer is stored, so
// the string must outlive the object.
705 const char *state_name;
708 const char *get_state_name() { return state_name; }
// Construction stamps the entry time with ceph_clock_now() and reports the
// entry to the PG's pgstate_history.
709 NamedState(PG *pg_, const char *state_name_)
710 : state_name(state_name_), enter_time(ceph_clock_now()), pg(pg_) {
711 pg->pgstate_history.enter(pg, enter_time, state_name);
// Destruction reports the matching exit to pgstate_history, so `pg` must
// still be valid when this object is destroyed.
713 virtual ~NamedState() { pg->pgstate_history.exit(state_name); }
721 * peer_info -- projected (updates _before_ replicas ack)
722 * peer_missing -- committed (updates _after_ replicas ack)
726 set<pg_shard_t> stray_set; // non-acting osds that have PG data.
727 eversion_t oldest_update; // acting: lowest (valid) last_update in active set
728 map<pg_shard_t, pg_info_t> peer_info; // info from peers (stray or prior)
729 set<pg_shard_t> peer_purged; // peers purged
730 map<pg_shard_t, pg_missing_t> peer_missing;
731 set<pg_shard_t> peer_log_requested; // logs i've requested (and start stamps)
732 set<pg_shard_t> peer_missing_requested;
734 // i deleted these strays; ignore racing PGInfo from them
735 set<pg_shard_t> peer_activated;
737 // primary-only, recovery-only state
738 set<pg_shard_t> might_have_unfound; // These osds might have objects on them
739 // which are unfound on the primary
740 epoch_t last_peering_reset;
743 /* heartbeat peers */
744 void set_probe_targets(const set<pg_shard_t> &probe_set);
745 void clear_probe_targets();
747 Mutex heartbeat_peer_lock;
748 set<int> heartbeat_peers;
749 set<int> probe_targets;
754 * Represents the objects in a range [begin, end)
757 * 1) begin == end == hobject_t() indicates the interval is unpopulated
758 * 2) Else, objects contains all objects in [begin, end)
// A set of objects with their versions, bounded by `begin` and `end`, as
// observed by one scan.
760 struct BackfillInterval {
761 // info about a backfill interval on a peer
762 eversion_t version; /// version at which the scan occurred
// Object -> version for each object in the interval, ordered by hobject_t.
763 map<hobject_t,eversion_t> objects;
// Resetting is done by assigning a default-constructed interval.
769 *this = BackfillInterval();
772 /// clear objects list only
773 void clear_objects() {
777 /// reinstantiate with a new start+end position and sort order
// NOTE(review): only a start position is passed in; the "sort order" wording
// above may be stale -- confirm.
778 void reset(hobject_t start) {
783 /// true if there are no objects in this interval
785 return objects.empty();
788 /// true if interval extends to the end of the range
789 bool extends_to_end() const {
793 /// removes items <= soid and adjusts begin to the first object
794 void trim_to(const hobject_t &soid) {
// `objects` is ordered, so items <= soid are examined from the front.
796 while (!objects.empty() &&
797 objects.begin()->first <= soid) {
802 /// Adjusts begin to the first object
804 if (!objects.empty())
805 begin = objects.begin()->first;
810 /// drop first entry, and adjust @begin accordingly
// Must not be called on an empty interval; this is asserted.
812 assert(!objects.empty());
813 objects.erase(objects.begin());
// Dump "begin" and "end", followed by an "objects" array containing one
// "object" section (fields "object" and "version") per entry.
818 void dump(Formatter *f) const {
819 f->dump_stream("begin") << begin;
820 f->dump_stream("end") << end;
821 f->open_array_section("objects");
822 for (map<hobject_t, eversion_t>::const_iterator i =
826 f->open_object_section("object");
827 f->dump_stream("object") << i->first;
828 f->dump_stream("version") << i->second;
836 BackfillInterval backfill_info;
837 map<pg_shard_t, BackfillInterval> peer_backfill_info;
838 bool backfill_reserved;
839 bool backfill_reserving;
844 set<pg_shard_t> backfill_targets;
// True when `osd` is a member of backfill_targets (the set's count() result
// is converted to bool).
846 bool is_backfill_targets(pg_shard_t osd) {
847 return backfill_targets.count(osd);
853 * blocked request wait hierarchy
855 * In order to preserve request ordering we need to be careful about the
856 * order in which blocked requests get requeued. Generally speaking, we
857 * push the requests back up to the op_wq in reverse order (most recent
858 * request first) so that they come back out again in the original order.
859 * However, because there are multiple wait queues, we need to requeue
860 * waitlists in order. Generally speaking, we requeue the wait lists
861 * that are checked first.
863 * Here are the various wait lists, in the order they are used during
864 * request processing, with notes:
867 * - may start or stop blocking at any time (depending on client epoch)
868 * - waiting_for_peered
869 * - !is_peered() or flushes_in_progress
870 * - only starts blocking on interval change; never restarts
871 * - waiting_for_active
873 * - only starts blocking on interval change; never restarts
874 * - waiting_for_scrub
875 * - starts and stops blocking for varying intervals during scrub
876 * - waiting_for_unreadable_object
877 * - never restarts once object is readable (* except for EIO?)
878 * - waiting_for_degraded_object
879 * - never restarts once object is writeable (* except for EIO?)
880 * - waiting_for_blocked_object
881 * - starts and stops based on proxied op activity
883 * - starts and stops based on read/write activity
887 * 1. During an interval change, we requeue *everything* in the above order.
889 * 2. When an obc rwlock is released, we check for a scrub block and requeue
890 * the op there if it applies. We ignore the unreadable/degraded/blocked
891 * queues because we assume they cannot apply at that time (this is
892 * probably mostly true).
894 * 3. The requeue_ops helper will push ops onto the waiting_for_map list if
897 * These three behaviors are generally sufficient to maintain ordering, with
898 * the possible exception of cases where we make an object degraded or
899 * unreadable that was previously okay, e.g. when scrub or op processing
900 * encounter an unexpected error. FIXME.
904 unsigned flushes_in_progress;
906 // ops with newer maps than ours (or blocked behind them)
907 // track these by client, since inter-request ordering doesn't otherwise
909 unordered_map<entity_name_t,list<OpRequestRef>> waiting_for_map;
911 // ops waiting on peered
912 list<OpRequestRef> waiting_for_peered;
914 // ops waiting on active (require peered as well)
915 list<OpRequestRef> waiting_for_active;
916 list<OpRequestRef> waiting_for_scrub;
918 list<OpRequestRef> waiting_for_cache_not_full;
919 list<OpRequestRef> waiting_for_clean_to_primary_repair;
920 map<hobject_t, list<OpRequestRef>> waiting_for_unreadable_object,
921 waiting_for_degraded_object,
922 waiting_for_blocked_object;
924 set<hobject_t> objects_blocked_on_cache_full;
925 map<hobject_t,snapid_t> objects_blocked_on_degraded_snap;
926 map<hobject_t,ObjectContextRef> objects_blocked_on_snap_promotion;
928 // Callbacks should assume pg (and nothing else) is locked
929 map<hobject_t, list<Context*>> callbacks_for_degraded_object;
932 list<pair<OpRequestRef, version_t> > > waiting_for_ondisk;
934 void requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m);
935 void requeue_op(OpRequestRef op);
936 void requeue_ops(list<OpRequestRef> &l);
938 // stats that persist lazily
939 object_stat_collection_t unstable_stats;
942 Mutex pg_stats_publish_lock;
943 bool pg_stats_publish_valid;
944 pg_stat_t pg_stats_publish;
946 // for ordering writes
947 ceph::shared_ptr<ObjectStore::Sequencer> osr;
949 void _update_calc_stats();
950 void _update_blocked_by();
951 void publish_stats_to_osd();
952 void clear_publish_stats();
955 void clear_primary_state();
// True when `osd` is a member of the actingbackfill set.
957 bool is_actingbackfill(pg_shard_t osd) const {
958 return actingbackfill.count(osd);
// True when `osd` is part of the acting vector; the test is delegated to
// has_shard() using the pool's ec_pool() flag.
960 bool is_acting(pg_shard_t osd) const {
961 return has_shard(pool.info.ec_pool(), acting, osd);
// True when `osd` is part of the up vector; the test is delegated to
// has_shard() using the pool's ec_pool() flag.
963 bool is_up(pg_shard_t osd) const {
964 return has_shard(pool.info.ec_pool(), up, osd);
// Test whether `osd` appears in the osd vector `v`. Two checks are used:
// - positional: v must have a slot at index osd.shard and that slot must
//   hold osd.osd;
// - membership: osd.osd appears anywhere in v.
// NOTE(review): presumably `ec` selects the positional check and non-EC pools
// use the membership check -- confirm.
966 static bool has_shard(bool ec, const vector<int>& v, pg_shard_t osd) {
968 return v.size() > (unsigned)osd.shard && v[osd.shard] == osd.osd;
970 return std::find(v.begin(), v.end(), osd.osd) != v.end();
974 bool needs_recovery() const;
975 bool needs_backfill() const;
977 /// clip calculated priority to reasonable range
978 inline int clamp_recovery_priority(int priority);
979 /// get log recovery reservation priority
980 unsigned get_recovery_priority();
981 /// get backfill reservation priority
982 unsigned get_backfill_priority();
984 void mark_clean(); ///< mark an active pg clean
985 void _change_recovery_force_mode(int new_mode, bool clear);
987 /// return [start,end) bounds for required past_intervals
988 static pair<epoch_t, epoch_t> get_required_past_interval_bounds(
989 const pg_info_t &info,
990 epoch_t oldest_map) {
992 info.history.last_epoch_clean ? info.history.last_epoch_clean :
993 info.history.epoch_pool_created,
996 info.history.same_interval_since,
997 info.history.epoch_pool_created);
998 return make_pair(start, end);
1000 void check_past_interval_bounds() const;
1001 PastIntervals::PriorSet build_prior();
1003 void remove_down_peer_info(const OSDMapRef osdmap);
1005 bool adjust_need_up_thru(const OSDMapRef osdmap);
1007 bool all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const;
1008 virtual void dump_recovery_info(Formatter *f) const = 0;
// Recompute min_last_complete_ondisk, starting from our own
// last_complete_ondisk and consulting peer_last_complete_ondisk for every
// member of actingbackfill other than the primary. actingbackfill must not
// be empty; this is asserted.
// Returns false as soon as a peer has no recorded value.
// NOTE(review): the remaining return values presumably tell the caller
// whether the stored minimum changed -- confirm.
1010 bool calc_min_last_complete_ondisk() {
1011 eversion_t min = last_complete_ondisk;
1012 assert(!actingbackfill.empty());
1013 for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1014 i != actingbackfill.end();
1016 if (*i == get_primary()) continue;
1017 if (peer_last_complete_ondisk.count(*i) == 0)
1018 return false; // we don't have complete info
1019 eversion_t a = peer_last_complete_ondisk[*i];
1023 if (min == min_last_complete_ondisk)
1025 min_last_complete_ondisk = min;
1029 virtual void calc_trim_to() = 0;
1031 void proc_replica_log(pg_info_t &oinfo, const pg_log_t &olog,
1032 pg_missing_t& omissing, pg_shard_t from);
1033 void proc_master_log(ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog,
1034 pg_missing_t& omissing, pg_shard_t from);
1035 bool proc_replica_info(
1036 pg_shard_t from, const pg_info_t &info, epoch_t send_epoch);
// PGLog::LogEntryHandler implementation that forwards every callback to the
// PG's backend (get_pgbackend()), passing along the transaction supplied at
// construction.
1038 struct PGLogEntryHandler : public PGLog::LogEntryHandler {
// Transaction handed to every backend call. Stored as a raw pointer, so it
// must outlive this handler.
1040 ObjectStore::Transaction *t;
1041 PGLogEntryHandler(PG *pg, ObjectStore::Transaction *t) : pg(pg), t(t) {}
// Forward removal of `hoid` to the backend.
1044 void remove(const hobject_t &hoid) override {
1045 pg->get_pgbackend()->remove(hoid, t);
// Forward a try_stash of `hoid` at version `v` to the backend.
1047 void try_stash(const hobject_t &hoid, version_t v) override {
1048 pg->get_pgbackend()->try_stash(hoid, v, t);
// Forward a rollback of `entry`; the entry must report can_rollback(), which
// is asserted.
1050 void rollback(const pg_log_entry_t &entry) override {
1051 assert(entry.can_rollback());
1052 pg->get_pgbackend()->rollback(entry, t);
// Forward a rollforward of `entry` to the backend.
1054 void rollforward(const pg_log_entry_t &entry) override {
1055 pg->get_pgbackend()->rollforward(entry, t);
// Forward a trim of `entry` to the backend.
1057 void trim(const pg_log_entry_t &entry) override {
1058 pg->get_pgbackend()->trim(entry, t);
1062 void update_object_snap_mapping(
1063 ObjectStore::Transaction *t, const hobject_t &soid,
1064 const set<snapid_t> &snaps);
1065 void clear_object_snap_mapping(
1066 ObjectStore::Transaction *t, const hobject_t &soid);
1067 void remove_snap_mapped_object(
1068 ObjectStore::Transaction& t, const hobject_t& soid);
1070 ObjectStore::Transaction& t, pg_info_t &oinfo,
1071 pg_log_t &olog, pg_shard_t from);
1072 void rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead);
1073 bool search_for_missing(
1074 const pg_info_t &oinfo, const pg_missing_t &omissing,
1078 void check_for_lost_objects();
1079 void forget_lost_objects();
1081 void discover_all_missing(std::map<int, map<spg_t,pg_query_t> > &query_map);
1083 void trim_write_ahead();
1085 map<pg_shard_t, pg_info_t>::const_iterator find_best_info(
1086 const map<pg_shard_t, pg_info_t> &infos,
1087 bool restrict_to_up_acting,
1088 bool *history_les_bound) const;
1089 static void calc_ec_acting(
1090 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1092 const vector<int> &acting,
1093 pg_shard_t acting_primary,
1094 const vector<int> &up,
1095 pg_shard_t up_primary,
1096 const map<pg_shard_t, pg_info_t> &all_info,
1097 bool restrict_to_up_acting,
1099 set<pg_shard_t> *backfill,
1100 set<pg_shard_t> *acting_backfill,
1101 pg_shard_t *want_primary,
1103 static void calc_replicated_acting(
1104 map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1106 const vector<int> &acting,
1107 pg_shard_t acting_primary,
1108 const vector<int> &up,
1109 pg_shard_t up_primary,
1110 const map<pg_shard_t, pg_info_t> &all_info,
1111 bool restrict_to_up_acting,
1113 set<pg_shard_t> *backfill,
1114 set<pg_shard_t> *acting_backfill,
1115 pg_shard_t *want_primary,
1117 bool choose_acting(pg_shard_t &auth_log_shard,
1118 bool restrict_to_up_acting,
1119 bool *history_les_bound);
1120 void build_might_have_unfound();
1122 ObjectStore::Transaction& t,
1123 epoch_t activation_epoch,
1124 list<Context*>& tfin,
1125 map<int, map<spg_t,pg_query_t> >& query_map,
1127 vector<pair<pg_notify_t, PastIntervals> > > *activator_map,
1129 void _activate_committed(epoch_t epoch, epoch_t activation_epoch);
1130 void all_activated_and_committed();
1132 void proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &info);
1134 bool have_unfound() const {
1135 return missing_loc.have_unfound();
1137 uint64_t get_num_unfound() const {
1138 return missing_loc.num_unfound();
1141 virtual void check_local() = 0;
1144 * @param ops_begun returns how many recovery ops the function started
1145 * @returns true if any useful work was accomplished; false otherwise
1147 virtual bool start_recovery_ops(
1149 ThreadPool::TPHandle &handle,
1150 uint64_t *ops_begun) = 0;
1152 void purge_strays();
1154 void update_heartbeat_peers();
1156 Context *finish_sync_event;
1158 void finish_recovery(list<Context*>& tfin);
1159 void _finish_recovery(Context *c);
1160 void cancel_recovery();
1161 void clear_recovery_state();
1162 virtual void _clear_recovery_state() = 0;
1163 virtual void check_recovery_sources(const OSDMapRef& newmap) = 0;
1164 void start_recovery_op(const hobject_t& soid);
1165 void finish_recovery_op(const hobject_t& soid, bool dequeue=false);
1167 void split_into(pg_t child_pgid, PG *child, unsigned split_bits);
1168 virtual void _split_into(pg_t child_pgid, PG *child, unsigned split_bits) = 0;
1170 friend class C_OSD_RepModify_Commit;
1173 Mutex backoff_lock; // orders inside Backoff::lock
1174 map<hobject_t,set<BackoffRef>> backoffs;
1176 void add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end);
1177 void release_backoffs(const hobject_t& begin, const hobject_t& end);
// Convenience overload for a single object: calls the two-argument overload
// with `o` as both begin and end.
1178 void release_backoffs(const hobject_t& o) {
1179 release_backoffs(o, o);
1181 void clear_backoffs();
// Add a backoff for session `s` covering this PG's whole hobject range, from
// get_hobj_start() to get_hobj_end() computed with the pool's pg_num.
1183 void add_pg_backoff(SessionRef s) {
1184 hobject_t begin = info.pgid.pgid.get_hobj_start();
1185 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1186 add_backoff(s, begin, end);
// Release backoffs over this PG's whole hobject range, computed the same way
// as in add_pg_backoff().
1188 void release_pg_backoffs() {
1189 hobject_t begin = info.pgid.pgid.get_hobj_start();
1190 hobject_t end = info.pgid.pgid.get_hobj_end(pool.info.get_pg_num());
1191 release_backoffs(begin, end);
1194 void rm_backoff(BackoffRef b);
1202 set<pg_shard_t> reserved_peers;
1203 bool reserved, reserve_failed;
1204 epoch_t epoch_start;
1206 // common to both scrubs
1209 set<pg_shard_t> waiting_on_whom;
1213 ScrubMap primary_scrubmap;
1214 map<pg_shard_t, ScrubMap> received_maps;
1215 OpRequestRef active_rep_scrub;
1216 utime_t scrub_reg_stamp; // stamp we registered for
1219 bool sleeping = false;
1220 bool needs_sleep = true;
1221 utime_t sleep_start;
1223 // flags to indicate explicitly requested scrubs (by admin)
1224 bool must_scrub, must_deep_scrub, must_repair;
1226 // Priority to use for scrub scheduling
1229 // this flag indicates whether we would like to do auto-repair of the PG or not
1232 // Maps from objects with errors to missing/inconsistent peers
1233 map<hobject_t, set<pg_shard_t>> missing;
1234 map<hobject_t, set<pg_shard_t>> inconsistent;
1236 // Map from object with errors to good peers
1237 map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >> authoritative;
1239 // Cleaned map pending snap metadata scrub
1240 ScrubMap cleaned_meta_map;
1242 // digest updates which we are waiting on
1243 int num_digest_updates_pending;
1246 hobject_t start, end;
1247 eversion_t subset_last_update;
1249 // chunky scrub state
1258 WAIT_DIGEST_UPDATES,
1262 std::unique_ptr<Scrub::Store> store;
1267 list<Context*> callbacks;
1268 void add_callback(Context *context) {
1269 callbacks.push_back(context);
1271 void run_callbacks() {
1272 list<Context*> to_run;
1273 to_run.swap(callbacks);
1274 for (list<Context*>::iterator i = to_run.begin();
1281 static const char *state_string(const PG::Scrubber::State& state) {
1282 const char *ret = NULL;
1285 case INACTIVE: ret = "INACTIVE"; break;
1286 case NEW_CHUNK: ret = "NEW_CHUNK"; break;
1287 case WAIT_PUSHES: ret = "WAIT_PUSHES"; break;
1288 case WAIT_LAST_UPDATE: ret = "WAIT_LAST_UPDATE"; break;
1289 case BUILD_MAP: ret = "BUILD_MAP"; break;
1290 case WAIT_REPLICAS: ret = "WAIT_REPLICAS"; break;
1291 case COMPARE_MAPS: ret = "COMPARE_MAPS"; break;
1292 case WAIT_DIGEST_UPDATES: ret = "WAIT_DIGEST_UPDATES"; break;
1293 case FINISH: ret = "FINISH"; break;
1298 bool is_chunky_scrub_active() const { return state != INACTIVE; }
1300 // classic (non chunk) scrubs block all writes
1301 // chunky scrubs only block writes to a range
1302 bool write_blocked_by_scrub(const hobject_t &soid) {
1303 return (soid >= start && soid < end);
1310 waiting_on_whom.clear();
1311 if (active_rep_scrub) {
1312 active_rep_scrub = OpRequestRef();
1314 received_maps.clear();
1317 must_deep_scrub = false;
1318 must_repair = false;
1319 auto_repair = false;
1321 state = PG::Scrubber::INACTIVE;
1322 start = hobject_t();
1324 subset_last_update = eversion_t();
1331 inconsistent.clear();
1333 authoritative.clear();
1334 num_digest_updates_pending = 0;
1335 cleaned_meta_map = ScrubMap();
1338 sleep_start = utime_t();
1341 void create_results(const hobject_t& obj);
1342 void cleanup_store(ObjectStore::Transaction *t);
1345 bool scrub_after_recovery;
1350 const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
1351 pg_shard_t bad_peer);
1353 void scrub(epoch_t queued, ThreadPool::TPHandle &handle);
1354 void chunky_scrub(ThreadPool::TPHandle &handle);
1355 void scrub_compare_maps();
1357 * return true if any inconsistency/missing is repaired, false otherwise
1359 bool scrub_process_inconsistent();
1360 bool ops_blocked_by_scrub() const;
1361 void scrub_finish();
1362 void scrub_clear_state();
1363 void _scan_snaps(ScrubMap &map);
1364 void _repair_oinfo_oid(ScrubMap &map);
1365 void _scan_rollback_obs(
1366 const vector<ghobject_t> &rollback_obs,
1367 ThreadPool::TPHandle &handle);
1368 void _request_scrub_map(pg_shard_t replica, eversion_t version,
1369 hobject_t start, hobject_t end, bool deep,
1371 int build_scrub_map_chunk(
1373 hobject_t start, hobject_t end, bool deep, uint32_t seed,
1374 ThreadPool::TPHandle &handle);
1376 * returns true if [begin, end) is good to scrub at this time
1377 * a false return value obliges the implementer to requeue scrub when the
1378 * condition preventing scrub clears
1380 virtual bool _range_available_for_scrub(
1381 const hobject_t &begin, const hobject_t &end) = 0;
1382 virtual void scrub_snapshot_metadata(
1384 const std::map<hobject_t, pair<uint32_t, uint32_t>> &missing_digest) { }
1385 virtual void _scrub_clear_state() { }
1386 virtual void _scrub_finish() { }
1387 virtual void split_colls(
1391 const pg_pool_t *pool,
1392 ObjectStore::Transaction *t) = 0;
1393 void clear_scrub_reserved();
1394 void scrub_reserve_replicas();
1395 void scrub_unreserve_replicas();
1396 bool scrub_all_replicas_reserved() const;
1398 void reg_next_scrub();
1399 void unreg_next_scrub();
1403 ThreadPool::TPHandle &handle);
1404 void do_replica_scrub_map(OpRequestRef op);
1405 void sub_op_scrub_map(OpRequestRef op);
1407 void handle_scrub_reserve_request(OpRequestRef op);
1408 void handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from);
1409 void handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from);
1410 void handle_scrub_reserve_release(OpRequestRef op);
1412 void reject_reservation();
1413 void schedule_backfill_retry(float retry);
1414 void schedule_recovery_retry(float retry);
1416 // -- recovery state --
1418 template <class EVT>
1419 struct QueuePeeringEvt : Context {
1423 QueuePeeringEvt(PG *pg, epoch_t epoch, EVT evt) :
1424 pg(pg), epoch(epoch), evt(evt) {}
1425 void finish(int r) override {
1427 pg->queue_peering_event(PG::CephPeeringEvtRef(
1428 new PG::CephPeeringEvt(
1436 class CephPeeringEvt {
1438 epoch_t epoch_requested;
1439 boost::intrusive_ptr< const boost::statechart::event_base > evt;
1442 MEMPOOL_CLASS_HELPERS();
1444 CephPeeringEvt(epoch_t epoch_sent,
1445 epoch_t epoch_requested,
1447 epoch_sent(epoch_sent), epoch_requested(epoch_requested),
1448 evt(evt_.intrusive_from_this()) {
1450 out << "epoch_sent: " << epoch_sent
1451 << " epoch_requested: " << epoch_requested << " ";
1455 epoch_t get_epoch_sent() { return epoch_sent; }
1456 epoch_t get_epoch_requested() { return epoch_requested; }
1457 const boost::statechart::event_base &get_event() { return *evt; }
1458 string get_desc() { return desc; }
1460 typedef ceph::shared_ptr<CephPeeringEvt> CephPeeringEvtRef;
1461 list<CephPeeringEvtRef> peering_queue; // op queue
1462 list<CephPeeringEvtRef> peering_waiters;
1464 struct QueryState : boost::statechart::event< QueryState > {
1466 explicit QueryState(Formatter *f) : f(f) {}
1467 void print(std::ostream *out) const {
1472 struct MInfoRec : boost::statechart::event< MInfoRec > {
1476 MInfoRec(pg_shard_t from, const pg_info_t &info, epoch_t msg_epoch) :
1477 from(from), info(info), msg_epoch(msg_epoch) {}
1478 void print(std::ostream *out) const {
1479 *out << "MInfoRec from " << from << " info: " << info;
1483 struct MLogRec : boost::statechart::event< MLogRec > {
1485 boost::intrusive_ptr<MOSDPGLog> msg;
1486 MLogRec(pg_shard_t from, MOSDPGLog *msg) :
1487 from(from), msg(msg) {}
1488 void print(std::ostream *out) const {
1489 *out << "MLogRec from " << from;
1493 struct MNotifyRec : boost::statechart::event< MNotifyRec > {
1497 MNotifyRec(pg_shard_t from, const pg_notify_t ¬ify, uint64_t f) :
1498 from(from), notify(notify), features(f) {}
1499 void print(std::ostream *out) const {
1500 *out << "MNotifyRec from " << from << " notify: " << notify
1501 << " features: 0x" << hex << features << dec;
1505 struct MQuery : boost::statechart::event< MQuery > {
1508 epoch_t query_epoch;
1509 MQuery(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch):
1510 from(from), query(query), query_epoch(query_epoch) {}
1511 void print(std::ostream *out) const {
1512 *out << "MQuery from " << from
1513 << " query_epoch " << query_epoch
1514 << " query: " << query;
1518 struct AdvMap : boost::statechart::event< AdvMap > {
1521 vector<int> newup, newacting;
1522 int up_primary, acting_primary;
1524 OSDMapRef osdmap, OSDMapRef lastmap,
1525 vector<int>& newup, int up_primary,
1526 vector<int>& newacting, int acting_primary):
1527 osdmap(osdmap), lastmap(lastmap),
1529 newacting(newacting),
1530 up_primary(up_primary),
1531 acting_primary(acting_primary) {}
1532 void print(std::ostream *out) const {
1537 struct ActMap : boost::statechart::event< ActMap > {
1538 ActMap() : boost::statechart::event< ActMap >() {}
1539 void print(std::ostream *out) const {
1543 struct Activate : boost::statechart::event< Activate > {
1544 epoch_t activation_epoch;
1545 explicit Activate(epoch_t q) : boost::statechart::event< Activate >(),
1546 activation_epoch(q) {}
1547 void print(std::ostream *out) const {
1548 *out << "Activate from " << activation_epoch;
1551 struct RequestBackfillPrio : boost::statechart::event< RequestBackfillPrio > {
1553 explicit RequestBackfillPrio(unsigned prio) :
1554 boost::statechart::event< RequestBackfillPrio >(),
1556 void print(std::ostream *out) const {
1557 *out << "RequestBackfillPrio: priority " << priority;
1560 #define TrivialEvent(T) struct T : boost::statechart::event< T > { \
1561 T() : boost::statechart::event< T >() {} \
1562 void print(std::ostream *out) const { \
1566 struct DeferBackfill : boost::statechart::event<DeferBackfill> {
1568 explicit DeferBackfill(float delay) : delay(delay) {}
1569 void print(std::ostream *out) const {
1570 *out << "DeferBackfill: delay " << delay;
1573 struct DeferRecovery : boost::statechart::event<DeferRecovery> {
1575 explicit DeferRecovery(float delay) : delay(delay) {}
1576 void print(std::ostream *out) const {
1577 *out << "DeferRecovery: delay " << delay;
1581 TrivialEvent(Initialize)
1583 TrivialEvent(GotInfo)
1584 TrivialEvent(NeedUpThru)
1585 TrivialEvent(NullEvt)
1586 TrivialEvent(FlushedEvt)
1587 TrivialEvent(Backfilled)
1588 TrivialEvent(LocalBackfillReserved)
1589 TrivialEvent(RemoteBackfillReserved)
1590 TrivialEvent(RejectRemoteReservation)
1591 TrivialEvent(RemoteReservationRejected)
1592 TrivialEvent(RemoteReservationCanceled)
1593 TrivialEvent(RequestBackfill)
1594 TrivialEvent(RequestRecovery)
1595 TrivialEvent(RecoveryDone)
1596 TrivialEvent(BackfillTooFull)
1597 TrivialEvent(RecoveryTooFull)
1599 TrivialEvent(MakePrimary)
1600 TrivialEvent(MakeStray)
1601 TrivialEvent(NeedActingChange)
1602 TrivialEvent(IsIncomplete)
1603 TrivialEvent(IsDown)
1605 TrivialEvent(AllReplicasRecovered)
1606 TrivialEvent(DoRecovery)
1607 TrivialEvent(LocalRecoveryReserved)
1608 TrivialEvent(RemoteRecoveryReserved)
1609 TrivialEvent(AllRemotesReserved)
1610 TrivialEvent(AllBackfillsReserved)
1611 TrivialEvent(GoClean)
1613 TrivialEvent(AllReplicasActivated)
1615 TrivialEvent(IntervalFlush)
1617 /* Encapsulates PG recovery process */
1618 class RecoveryState {
1619 void start_handle(RecoveryCtx *new_ctx);
1622 void begin_block_outgoing();
1623 void end_block_outgoing();
1624 void clear_blocked_outgoing();
1629 class RecoveryMachine : public boost::statechart::state_machine< RecoveryMachine, Initial > {
1630 RecoveryState *state;
1635 uint64_t event_count;
1637 void clear_event_counters() {
1638 event_time = utime_t();
1642 void log_enter(const char *state_name);
1643 void log_exit(const char *state_name, utime_t duration);
1645 RecoveryMachine(RecoveryState *state, PG *pg) : state(state), pg(pg), event_count(0) {}
1647 /* Accessor functions for state methods */
1648 ObjectStore::Transaction* get_cur_transaction() {
1649 assert(state->rctx);
1650 assert(state->rctx->transaction);
1651 return state->rctx->transaction;
1654 void send_query(pg_shard_t to, const pg_query_t &query) {
1655 assert(state->rctx);
1656 assert(state->rctx->query_map);
1657 (*state->rctx->query_map)[to.osd][spg_t(pg->info.pgid.pgid, to.shard)] =
1661 map<int, map<spg_t, pg_query_t> > *get_query_map() {
1662 assert(state->rctx);
1663 assert(state->rctx->query_map);
1664 return state->rctx->query_map;
1667 map<int, vector<pair<pg_notify_t, PastIntervals> > > *get_info_map() {
1668 assert(state->rctx);
1669 assert(state->rctx->info_map);
1670 return state->rctx->info_map;
1673 list< Context* > *get_on_safe_context_list() {
1674 assert(state->rctx);
1675 assert(state->rctx->on_safe);
1676 return &(state->rctx->on_safe->contexts);
1679 list< Context * > *get_on_applied_context_list() {
1680 assert(state->rctx);
1681 assert(state->rctx->on_applied);
1682 return &(state->rctx->on_applied->contexts);
1685 RecoveryCtx *get_recovery_ctx() { return &*(state->rctx); }
1687 void send_notify(pg_shard_t to,
1688 const pg_notify_t &info, const PastIntervals &pi) {
1689 assert(state->rctx);
1690 assert(state->rctx->notify_list);
1691 (*state->rctx->notify_list)[to.osd].push_back(make_pair(info, pi));
1694 friend class RecoveryMachine;
1698 struct Crashed : boost::statechart::state< Crashed, RecoveryMachine >, NamedState {
1699 explicit Crashed(my_context ctx);
1704 struct Initial : boost::statechart::state< Initial, RecoveryMachine >, NamedState {
1705 explicit Initial(my_context ctx);
1708 typedef boost::mpl::list <
1709 boost::statechart::transition< Initialize, Reset >,
1710 boost::statechart::custom_reaction< Load >,
1711 boost::statechart::custom_reaction< NullEvt >,
1712 boost::statechart::transition< boost::statechart::event_base, Crashed >
1715 boost::statechart::result react(const Load&);
1716 boost::statechart::result react(const MNotifyRec&);
1717 boost::statechart::result react(const MInfoRec&);
1718 boost::statechart::result react(const MLogRec&);
1719 boost::statechart::result react(const boost::statechart::event_base&) {
1720 return discard_event();
1724 struct Reset : boost::statechart::state< Reset, RecoveryMachine >, NamedState {
1725 explicit Reset(my_context ctx);
1728 typedef boost::mpl::list <
1729 boost::statechart::custom_reaction< QueryState >,
1730 boost::statechart::custom_reaction< AdvMap >,
1731 boost::statechart::custom_reaction< ActMap >,
1732 boost::statechart::custom_reaction< NullEvt >,
1733 boost::statechart::custom_reaction< FlushedEvt >,
1734 boost::statechart::custom_reaction< IntervalFlush >,
1735 boost::statechart::transition< boost::statechart::event_base, Crashed >
1737 boost::statechart::result react(const QueryState& q);
1738 boost::statechart::result react(const AdvMap&);
1739 boost::statechart::result react(const ActMap&);
1740 boost::statechart::result react(const FlushedEvt&);
1741 boost::statechart::result react(const IntervalFlush&);
1742 boost::statechart::result react(const boost::statechart::event_base&) {
1743 return discard_event();
1749 struct Started : boost::statechart::state< Started, RecoveryMachine, Start >, NamedState {
1750 explicit Started(my_context ctx);
1753 typedef boost::mpl::list <
1754 boost::statechart::custom_reaction< QueryState >,
1755 boost::statechart::custom_reaction< AdvMap >,
1756 boost::statechart::custom_reaction< NullEvt >,
1757 boost::statechart::custom_reaction< FlushedEvt >,
1758 boost::statechart::custom_reaction< IntervalFlush >,
1759 boost::statechart::transition< boost::statechart::event_base, Crashed >
1761 boost::statechart::result react(const QueryState& q);
1762 boost::statechart::result react(const AdvMap&);
1763 boost::statechart::result react(const FlushedEvt&);
1764 boost::statechart::result react(const IntervalFlush&);
1765 boost::statechart::result react(const boost::statechart::event_base&) {
1766 return discard_event();
1773 struct Start : boost::statechart::state< Start, Started >, NamedState {
1774 explicit Start(my_context ctx);
1777 typedef boost::mpl::list <
1778 boost::statechart::transition< MakePrimary, Primary >,
1779 boost::statechart::transition< MakeStray, Stray >
1784 struct WaitActingChange;
1788 struct Primary : boost::statechart::state< Primary, Started, Peering >, NamedState {
1789 explicit Primary(my_context ctx);
1792 typedef boost::mpl::list <
1793 boost::statechart::custom_reaction< ActMap >,
1794 boost::statechart::custom_reaction< MNotifyRec >,
1795 boost::statechart::transition< NeedActingChange, WaitActingChange >
1797 boost::statechart::result react(const ActMap&);
1798 boost::statechart::result react(const MNotifyRec&);
1801 struct WaitActingChange : boost::statechart::state< WaitActingChange, Primary>,
1803 typedef boost::mpl::list <
1804 boost::statechart::custom_reaction< QueryState >,
1805 boost::statechart::custom_reaction< AdvMap >,
1806 boost::statechart::custom_reaction< MLogRec >,
1807 boost::statechart::custom_reaction< MInfoRec >,
1808 boost::statechart::custom_reaction< MNotifyRec >
1810 explicit WaitActingChange(my_context ctx);
1811 boost::statechart::result react(const QueryState& q);
1812 boost::statechart::result react(const AdvMap&);
1813 boost::statechart::result react(const MLogRec&);
1814 boost::statechart::result react(const MInfoRec&);
1815 boost::statechart::result react(const MNotifyRec&);
1822 struct Peering : boost::statechart::state< Peering, Primary, GetInfo >, NamedState {
1823 PastIntervals::PriorSet prior_set;
1824 bool history_les_bound; //< need osd_find_best_info_ignore_history_les
1826 explicit Peering(my_context ctx);
1829 typedef boost::mpl::list <
1830 boost::statechart::custom_reaction< QueryState >,
1831 boost::statechart::transition< Activate, Active >,
1832 boost::statechart::custom_reaction< AdvMap >
1834 boost::statechart::result react(const QueryState& q);
1835 boost::statechart::result react(const AdvMap &advmap);
1838 struct WaitLocalRecoveryReserved;
1840 struct Active : boost::statechart::state< Active, Primary, Activating >, NamedState {
1841 explicit Active(my_context ctx);
1844 const set<pg_shard_t> remote_shards_to_reserve_recovery;
1845 const set<pg_shard_t> remote_shards_to_reserve_backfill;
1846 bool all_replicas_activated;
1848 typedef boost::mpl::list <
1849 boost::statechart::custom_reaction< QueryState >,
1850 boost::statechart::custom_reaction< ActMap >,
1851 boost::statechart::custom_reaction< AdvMap >,
1852 boost::statechart::custom_reaction< MInfoRec >,
1853 boost::statechart::custom_reaction< MNotifyRec >,
1854 boost::statechart::custom_reaction< MLogRec >,
1855 boost::statechart::custom_reaction< Backfilled >,
1856 boost::statechart::custom_reaction< AllReplicasActivated >,
1857 boost::statechart::custom_reaction< DeferRecovery >,
1858 boost::statechart::custom_reaction< DeferBackfill >
1860 boost::statechart::result react(const QueryState& q);
1861 boost::statechart::result react(const ActMap&);
1862 boost::statechart::result react(const AdvMap&);
1863 boost::statechart::result react(const MInfoRec& infoevt);
1864 boost::statechart::result react(const MNotifyRec& notevt);
1865 boost::statechart::result react(const MLogRec& logevt);
1866 boost::statechart::result react(const Backfilled&) {
1867 return discard_event();
1869 boost::statechart::result react(const AllReplicasActivated&);
1870 boost::statechart::result react(const DeferRecovery& evt) {
1871 return discard_event();
1873 boost::statechart::result react(const DeferBackfill& evt) {
1874 return discard_event();
1878 struct Clean : boost::statechart::state< Clean, Active >, NamedState {
1879 typedef boost::mpl::list<
1880 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >
1882 explicit Clean(my_context ctx);
1886 struct Recovered : boost::statechart::state< Recovered, Active >, NamedState {
1887 typedef boost::mpl::list<
1888 boost::statechart::transition< GoClean, Clean >,
1889 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
1890 boost::statechart::custom_reaction< AllReplicasActivated >
1892 explicit Recovered(my_context ctx);
1894 boost::statechart::result react(const AllReplicasActivated&) {
1895 post_event(GoClean());
1896 return forward_event();
1900 struct Backfilling : boost::statechart::state< Backfilling, Active >, NamedState {
1901 typedef boost::mpl::list<
1902 boost::statechart::transition< Backfilled, Recovered >,
1903 boost::statechart::custom_reaction< DeferBackfill >,
1904 boost::statechart::custom_reaction< RemoteReservationRejected >
1906 explicit Backfilling(my_context ctx);
1907 boost::statechart::result react(const RemoteReservationRejected& evt);
1908 boost::statechart::result react(const DeferBackfill& evt);
1912 struct WaitRemoteBackfillReserved : boost::statechart::state< WaitRemoteBackfillReserved, Active >, NamedState {
1913 typedef boost::mpl::list<
1914 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1915 boost::statechart::custom_reaction< RemoteReservationRejected >,
1916 boost::statechart::transition< AllBackfillsReserved, Backfilling >
1918 set<pg_shard_t>::const_iterator backfill_osd_it;
1919 explicit WaitRemoteBackfillReserved(my_context ctx);
1921 boost::statechart::result react(const RemoteBackfillReserved& evt);
1922 boost::statechart::result react(const RemoteReservationRejected& evt);
1925 struct WaitLocalBackfillReserved : boost::statechart::state< WaitLocalBackfillReserved, Active >, NamedState {
1926 typedef boost::mpl::list<
1927 boost::statechart::transition< LocalBackfillReserved, WaitRemoteBackfillReserved >
1929 explicit WaitLocalBackfillReserved(my_context ctx);
1933 struct NotBackfilling : boost::statechart::state< NotBackfilling, Active>, NamedState {
1934 typedef boost::mpl::list<
1935 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved>,
1936 boost::statechart::custom_reaction< RemoteBackfillReserved >,
1937 boost::statechart::custom_reaction< RemoteReservationRejected >
1939 explicit NotBackfilling(my_context ctx);
1941 boost::statechart::result react(const RemoteBackfillReserved& evt);
1942 boost::statechart::result react(const RemoteReservationRejected& evt);
1945 struct NotRecovering : boost::statechart::state< NotRecovering, Active>, NamedState {
1946 typedef boost::mpl::list<
1947 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
1948 boost::statechart::custom_reaction< DeferRecovery >
1950 explicit NotRecovering(my_context ctx);
1951 boost::statechart::result react(const DeferRecovery& evt) {
1953 return discard_event();
1958 struct RepNotRecovering;
1959 struct ReplicaActive : boost::statechart::state< ReplicaActive, Started, RepNotRecovering >, NamedState {
1960 explicit ReplicaActive(my_context ctx);
1963 typedef boost::mpl::list <
1964 boost::statechart::custom_reaction< QueryState >,
1965 boost::statechart::custom_reaction< ActMap >,
1966 boost::statechart::custom_reaction< MQuery >,
1967 boost::statechart::custom_reaction< MInfoRec >,
1968 boost::statechart::custom_reaction< MLogRec >,
1969 boost::statechart::custom_reaction< Activate >,
1970 boost::statechart::custom_reaction< DeferRecovery >,
1971 boost::statechart::custom_reaction< DeferBackfill >
1973 boost::statechart::result react(const QueryState& q);
1974 boost::statechart::result react(const MInfoRec& infoevt);
1975 boost::statechart::result react(const MLogRec& logevt);
1976 boost::statechart::result react(const ActMap&);
1977 boost::statechart::result react(const MQuery&);
1978 boost::statechart::result react(const Activate&);
1979 boost::statechart::result react(const DeferRecovery& evt) {
1980 return discard_event();
1982 boost::statechart::result react(const DeferBackfill& evt) {
1983 return discard_event();
1987 struct RepRecovering : boost::statechart::state< RepRecovering, ReplicaActive >, NamedState {
1988 typedef boost::mpl::list<
1989 boost::statechart::transition< RecoveryDone, RepNotRecovering >,
1990 // for compat with old peers
1991 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
1992 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
1993 boost::statechart::custom_reaction< BackfillTooFull >
1995 explicit RepRecovering(my_context ctx);
1996 boost::statechart::result react(const BackfillTooFull &evt);
2000 struct RepWaitBackfillReserved : boost::statechart::state< RepWaitBackfillReserved, ReplicaActive >, NamedState {
2001 typedef boost::mpl::list<
2002 boost::statechart::custom_reaction< RemoteBackfillReserved >,
2003 boost::statechart::custom_reaction< RejectRemoteReservation >,
2004 boost::statechart::custom_reaction< RemoteReservationRejected >,
2005 boost::statechart::custom_reaction< RemoteReservationCanceled >
2007 explicit RepWaitBackfillReserved(my_context ctx);
2009 boost::statechart::result react(const RemoteBackfillReserved &evt);
2010 boost::statechart::result react(const RejectRemoteReservation &evt);
2011 boost::statechart::result react(const RemoteReservationRejected &evt);
2012 boost::statechart::result react(const RemoteReservationCanceled &evt);
2015 struct RepWaitRecoveryReserved : boost::statechart::state< RepWaitRecoveryReserved, ReplicaActive >, NamedState {
2016 typedef boost::mpl::list<
2017 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2018 // for compat with old peers
2019 boost::statechart::custom_reaction< RemoteReservationRejected >,
2020 boost::statechart::custom_reaction< RemoteReservationCanceled >
2022 explicit RepWaitRecoveryReserved(my_context ctx);
2024 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2025 boost::statechart::result react(const RemoteReservationRejected &evt) {
2026 // for compat with old peers
2027 post_event(RemoteReservationCanceled());
2028 return discard_event();
2030 boost::statechart::result react(const RemoteReservationCanceled &evt);
2033 struct RepNotRecovering : boost::statechart::state< RepNotRecovering, ReplicaActive>, NamedState {
2034 typedef boost::mpl::list<
2035 boost::statechart::custom_reaction< RequestBackfillPrio >,
2036 boost::statechart::transition< RequestRecovery, RepWaitRecoveryReserved >,
2037 boost::statechart::custom_reaction< RejectRemoteReservation >,
2038 boost::statechart::transition< RemoteReservationRejected, RepNotRecovering >,
2039 boost::statechart::transition< RemoteReservationCanceled, RepNotRecovering >,
2040 boost::statechart::transition< RecoveryDone, RepNotRecovering > // for compat with pre-reservation peers
2042 explicit RepNotRecovering(my_context ctx);
2043 boost::statechart::result react(const RequestBackfillPrio &evt);
2044 boost::statechart::result react(const RejectRemoteReservation &evt);
2048 struct Recovering : boost::statechart::state< Recovering, Active >, NamedState {
2049 typedef boost::mpl::list <
2050 boost::statechart::custom_reaction< AllReplicasRecovered >,
2051 boost::statechart::custom_reaction< DeferRecovery >,
2052 boost::statechart::custom_reaction< RequestBackfill >
2054 explicit Recovering(my_context ctx);
2056 void release_reservations(bool cancel = false);
2057 boost::statechart::result react(const AllReplicasRecovered &evt);
2058 boost::statechart::result react(const DeferRecovery& evt);
2059 boost::statechart::result react(const RequestBackfill &evt);
2062 struct WaitRemoteRecoveryReserved : boost::statechart::state< WaitRemoteRecoveryReserved, Active >, NamedState {
2063 typedef boost::mpl::list <
2064 boost::statechart::custom_reaction< RemoteRecoveryReserved >,
2065 boost::statechart::transition< AllRemotesReserved, Recovering >
2067 set<pg_shard_t>::const_iterator remote_recovery_reservation_it;
2068 explicit WaitRemoteRecoveryReserved(my_context ctx);
2069 boost::statechart::result react(const RemoteRecoveryReserved &evt);
2073 struct WaitLocalRecoveryReserved : boost::statechart::state< WaitLocalRecoveryReserved, Active >, NamedState {
2074 typedef boost::mpl::list <
2075 boost::statechart::transition< LocalRecoveryReserved, WaitRemoteRecoveryReserved >,
2076 boost::statechart::custom_reaction< RecoveryTooFull >
2078 explicit WaitLocalRecoveryReserved(my_context ctx);
2080 boost::statechart::result react(const RecoveryTooFull &evt);
2083 struct Activating : boost::statechart::state< Activating, Active >, NamedState {
2084 typedef boost::mpl::list <
2085 boost::statechart::transition< AllReplicasRecovered, Recovered >,
2086 boost::statechart::transition< DoRecovery, WaitLocalRecoveryReserved >,
2087 boost::statechart::transition< RequestBackfill, WaitLocalBackfillReserved >
2089 explicit Activating(my_context ctx);
2093 struct Stray : boost::statechart::state< Stray, Started >, NamedState {
2094 map<int, pair<pg_query_t, epoch_t> > pending_queries;
2096 explicit Stray(my_context ctx);
2099 typedef boost::mpl::list <
2100 boost::statechart::custom_reaction< MQuery >,
2101 boost::statechart::custom_reaction< MLogRec >,
2102 boost::statechart::custom_reaction< MInfoRec >,
2103 boost::statechart::custom_reaction< ActMap >,
2104 boost::statechart::custom_reaction< RecoveryDone >
2106 boost::statechart::result react(const MQuery& query);
2107 boost::statechart::result react(const MLogRec& logevt);
2108 boost::statechart::result react(const MInfoRec& infoevt);
2109 boost::statechart::result react(const ActMap&);
2110 boost::statechart::result react(const RecoveryDone&) {
2111 return discard_event();
2117 struct GetInfo : boost::statechart::state< GetInfo, Peering >, NamedState {
2118 set<pg_shard_t> peer_info_requested;
2120 explicit GetInfo(my_context ctx);
2124 typedef boost::mpl::list <
2125 boost::statechart::custom_reaction< QueryState >,
2126 boost::statechart::transition< GotInfo, GetLog >,
2127 boost::statechart::custom_reaction< MNotifyRec >,
2128 boost::statechart::transition< IsDown, Down >
2130 boost::statechart::result react(const QueryState& q);
2131 boost::statechart::result react(const MNotifyRec& infoevt);
2134 struct GotLog : boost::statechart::event< GotLog > {
2135 GotLog() : boost::statechart::event< GotLog >() {}
2138 struct GetLog : boost::statechart::state< GetLog, Peering >, NamedState {
2139 pg_shard_t auth_log_shard;
2140 boost::intrusive_ptr<MOSDPGLog> msg;
2142 explicit GetLog(my_context ctx);
2145 typedef boost::mpl::list <
2146 boost::statechart::custom_reaction< QueryState >,
2147 boost::statechart::custom_reaction< MLogRec >,
2148 boost::statechart::custom_reaction< GotLog >,
2149 boost::statechart::custom_reaction< AdvMap >,
2150 boost::statechart::transition< IsIncomplete, Incomplete >
2152 boost::statechart::result react(const AdvMap&);
2153 boost::statechart::result react(const QueryState& q);
2154 boost::statechart::result react(const MLogRec& logevt);
2155 boost::statechart::result react(const GotLog&);
2160 struct GetMissing : boost::statechart::state< GetMissing, Peering >, NamedState {
2161 set<pg_shard_t> peer_missing_requested;
2163 explicit GetMissing(my_context ctx);
2166 typedef boost::mpl::list <
2167 boost::statechart::custom_reaction< QueryState >,
2168 boost::statechart::custom_reaction< MLogRec >,
2169 boost::statechart::transition< NeedUpThru, WaitUpThru >
2171 boost::statechart::result react(const QueryState& q);
2172 boost::statechart::result react(const MLogRec& logevt);
2175 struct WaitUpThru : boost::statechart::state< WaitUpThru, Peering >, NamedState {
2176 explicit WaitUpThru(my_context ctx);
2179 typedef boost::mpl::list <
2180 boost::statechart::custom_reaction< QueryState >,
2181 boost::statechart::custom_reaction< ActMap >,
2182 boost::statechart::custom_reaction< MLogRec >
2184 boost::statechart::result react(const QueryState& q);
2185 boost::statechart::result react(const ActMap& am);
2186 boost::statechart::result react(const MLogRec& logrec);
2189 struct Down : boost::statechart::state< Down, Peering>, NamedState {
2190 explicit Down(my_context ctx);
2191 typedef boost::mpl::list <
2192 boost::statechart::custom_reaction< QueryState >
2194 boost::statechart::result react(const QueryState& infoevt);
2198 struct Incomplete : boost::statechart::state< Incomplete, Peering>, NamedState {
2199 typedef boost::mpl::list <
2200 boost::statechart::custom_reaction< AdvMap >,
2201 boost::statechart::custom_reaction< MNotifyRec >,
2202 boost::statechart::custom_reaction< QueryState >
2204 explicit Incomplete(my_context ctx);
2205 boost::statechart::result react(const AdvMap &advmap);
2206 boost::statechart::result react(const MNotifyRec& infoevt);
2207 boost::statechart::result react(const QueryState& infoevt);
// The state machine instance; the RecoveryState constructor builds it with
// (this, pg).
2212 RecoveryMachine machine;
2215 /// context passed in by state machine caller
// Raw pointer; the RecoveryState constructor initialises it to 0.
2216 RecoveryCtx *orig_ctx;
2218 /// populated if we are buffering messages pending a flush
// Held in a boost::optional, so it is empty when nothing is being buffered.
2219 boost::optional<BufferedRecoveryMessages> messages_pending_flush;
2222 /* populated between start_handle() and end_handle(), points into
2223 * the message lists for messages_pending_flush while blocking messages
2224 * or into orig_ctx otherwise */
// Working RecoveryCtx stored by value inside a boost::optional, so it can be
// left unset.
2226 boost::optional<RecoveryCtx> rctx;
// Build the recovery state for one PG: the state machine is constructed with
// (this, pg), the pg pointer is stored, and orig_ctx starts out as 0.
// NOTE(review): the constructor body is not visible in this listing.
2229 explicit RecoveryState(PG *pg)
2230 : machine(this, pg), pg(pg), orig_ctx(0) {
// Feed a raw statechart event to the machine via process_event().
// NOTE(review): only the process_event() call is visible here; whatever is
// done with the rctx argument happens on lines missing from this listing.
2234 void handle_event(const boost::statechart::event_base &evt,
2235 RecoveryCtx *rctx) {
2237 machine.process_event(evt);
// Overload taking a CephPeeringEvtRef: the wrapped event is obtained with
// get_event() and then dispatched the same way.
2241 void handle_event(CephPeeringEvtRef evt,
2242 RecoveryCtx *rctx) {
2244 machine.process_event(evt->get_event());
// Constructor taking an OSDService pointer, an OSDMapRef, the PGPool
// description and the spg_t identifier (declared only).
2252 PG(OSDService *o, OSDMapRef curmap,
2253 const PGPool &pool, spg_t p);
// Copy constructor and copy assignment are declared without inline bodies.
// NOTE(review): this looks like the declare-but-never-define non-copyable
// idiom; if no definition exists, `= delete` would express it directly --
// confirm before changing.
2258 explicit PG(const PG& rhs);
2259 PG& operator=(const PG& rhs);
// 64-bit feature masks. peer_features is reset and narrowed by the helpers
// below; the code that writes acting_features / upacting_features is not
// visible in this listing.
2261 uint64_t peer_features;
2262 uint64_t acting_features;
2263 uint64_t upacting_features;
// Returns a const reference to pg_id.
2268 const spg_t& get_pgid() const { return pg_id; }
// Restore peer_features to CEPH_FEATURES_SUPPORTED_DEFAULT.
2270 void reset_min_peer_features() {
2271 peer_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
2273 uint64_t get_min_peer_features() const { return peer_features; }
// Bitwise-AND f into peer_features: only bits present in both survive.
2274 void apply_peer_features(uint64_t f) { peer_features &= f; }
2276 uint64_t get_min_acting_features() const { return acting_features; }
2277 uint64_t get_min_upacting_features() const { return upacting_features; }
// True when the current osdmap does NOT have CEPH_OSDMAP_RECOVERY_DELETES set.
2278 bool perform_deletes_during_peering() const {
2279 return !(get_osdmap()->test_flag(CEPH_OSDMAP_RECOVERY_DELETES));
// Recompute the up/acting bookkeeping and the two primaries from new vectors.
// NOTE(review): several lines of this body (one parameter, the container
// updates, the branch keywords and closing braces) are missing from this
// listing; the comments below describe only what is visible.
2282 void init_primary_up_acting(
2283 const vector<int> &newup,
2284 const vector<int> &newacting,
2286 int new_acting_primary) {
// Walk acting, skipping CRUSH_ITEM_NONE slots. For erasure-coded pools the
// slot index becomes the shard id, otherwise shard_id_t::NO_SHARD is used.
// NOTE(review): the uint8_t counter wraps at 255, so this loop would never end
// if acting.size() exceeded 255 -- presumably sizes are far smaller; confirm.
2289 for (uint8_t i = 0; i < acting.size(); ++i) {
2290 if (acting[i] != CRUSH_ITEM_NONE)
2294 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
// Same walk over up (same uint8_t caveat as above).
2298 for (uint8_t i = 0; i < up.size(); ++i) {
2299 if (up[i] != CRUSH_ITEM_NONE)
2303 pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
// Non-EC pools: both primaries carry shard_id_t::NO_SHARD.
2305 if (!pool.info.ec_pool()) {
2306 up_primary = pg_shard_t(new_up_primary, shard_id_t::NO_SHARD);
2307 primary = pg_shard_t(new_acting_primary, shard_id_t::NO_SHARD);
// Presumably the EC branch (the `else` line itself is not visible): start from
// default-constructed values, then take the shard id from the position at
// which each primary OSD is found in up / acting.
2310 up_primary = pg_shard_t();
2311 primary = pg_shard_t();
2312 for (uint8_t i = 0; i < up.size(); ++i) {
2313 if (up[i] == new_up_primary) {
2314 up_primary = pg_shard_t(up[i], shard_id_t(i));
2318 for (uint8_t i = 0; i < acting.size(); ++i) {
2319 if (acting[i] == new_acting_primary) {
2320 primary = pg_shard_t(acting[i], shard_id_t(i));
// Sanity checks: the computed primaries must name the requested OSDs.
2324 assert(up_primary.osd == new_up_primary);
2325 assert(primary.osd == new_acting_primary);
// Returns the primary shard by value.
2327 pg_shard_t get_primary() const { return primary; }
// Plain getter / setter for role.
2329 int get_role() const { return role; }
2330 void set_role(int r) { role = r; }
// Primary when this PG's own shard (pg_whoami) equals primary.
2332 bool is_primary() const { return pg_whoami == primary; }
// Replica when role is strictly positive.
2333 bool is_replica() const { return role > 0; }
2335 epoch_t get_last_peering_reset() const { return last_peering_reset; }
2337 //int get_state() const { return state; }
// Bit helpers on the state mask: test (any bit of m set), set, clear.
2338 bool state_test(int m) const { return (state & m) != 0; }
2339 void state_set(int m) { state |= m; }
2340 void state_clear(int m) { state &= ~m; }
// Complete when info.last_complete equals info.last_update.
2342 bool is_complete() const { return info.last_complete == info.last_update; }
2343 bool should_send_notify() const { return send_notify; }
// Raw state mask, followed by one predicate per PG_STATE_* bit.
2345 int get_state() const { return state; }
2346 bool is_active() const { return state_test(PG_STATE_ACTIVE); }
2347 bool is_activating() const { return state_test(PG_STATE_ACTIVATING); }
2348 bool is_peering() const { return state_test(PG_STATE_PEERING); }
2349 bool is_down() const { return state_test(PG_STATE_DOWN); }
2350 bool is_incomplete() const { return state_test(PG_STATE_INCOMPLETE); }
2351 bool is_clean() const { return state_test(PG_STATE_CLEAN); }
2352 bool is_degraded() const { return state_test(PG_STATE_DEGRADED); }
2353 bool is_undersized() const { return state_test(PG_STATE_UNDERSIZED); }
2355 bool is_scrubbing() const { return state_test(PG_STATE_SCRUBBING); }
// Peered when either the ACTIVE or the PEERED bit is set.
2356 bool is_peered() const {
2357 return state_test(PG_STATE_ACTIVE) || state_test(PG_STATE_PEERED);
// Empty when info.last_update equals eversion_t(0,0).
2360 bool is_empty() const { return info.last_update == eversion_t(0,0); }
// NOTE(review): the next five lines are a fragment of a parameter list; the
// line naming the function and some of its parameters are missing from this
// listing.
2364 const vector<int>& up,
2366 const vector<int>& acting,
2368 const pg_history_t& history,
2369 const PastIntervals& pim,
2371 ObjectStore::Transaction *t);
2374 void do_pending_flush();
// Static helpers taking the transaction by reference plus the pg id; _create
// also takes an int `bits`, _init a pointer to the pool description.
2376 static void _create(ObjectStore::Transaction& t, spg_t pgid, int bits);
2377 static void _init(ObjectStore::Transaction& t,
2378 spg_t pgid, const pg_pool_t *pool);
// Takes a pointer to a string -> bufferlist map; presumably the key/value
// pairs to persist are added to it -- TODO confirm against the definition.
2381 void prepare_write_info(map<string,bufferlist> *km);
2383 void update_store_with_options();
2384 void update_store_on_load();
// Static variant returning an int.
// NOTE(review): several parameters are missing from this listing. Of those
// visible, last_written_info and past_intervals are non-const references and
// logger defaults to nullptr.
2387 static int _prepare_write_info(
2389 map<string,bufferlist> *km,
2392 pg_info_t &last_written_info,
2393 PastIntervals &past_intervals,
2394 bool dirty_big_info,
2397 PerfCounters *logger = nullptr);
2398 void write_if_dirty(ObjectStore::Transaction& t);
2400 PGLog::IndexedLog projected_log;
// Const lookup by request id. version, user_version and return_code are
// pointers -- presumably out-parameters filled on a hit; TODO confirm.
2401 bool check_in_progress_op(
2402 const osd_reqid_t &r,
2403 eversion_t *version,
2404 version_t *user_version,
2405 int *return_code) const;
2406 eversion_t projected_last_update;
// Build the next version from the current osdmap epoch and
// projected_last_update.version + 1, then assert it is strictly newer than
// info.last_update, the pg_log head and projected_last_update.
// NOTE(review): the return statement is missing from this listing.
2407 eversion_t get_next_version() const {
2408 eversion_t at_version(
2409 get_osdmap()->get_epoch(),
2410 projected_last_update.version+1);
2411 assert(at_version > info.last_update);
2412 assert(at_version > pg_log.get_head());
2413 assert(at_version > projected_last_update);
2417 void add_log_entry(const pg_log_entry_t& e, bool applied);
// NOTE(review): the next four lines are a fragment of a declaration whose
// first line is missing from this listing; transaction_applied defaults to
// true.
2419 const vector<pg_log_entry_t>& logv,
2421 eversion_t roll_forward_to,
2422 ObjectStore::Transaction &t,
2423 bool transaction_applied = true);
2424 bool check_log_for_corruption(ObjectStore *store);
2427 std::string get_corrupt_pg_log_name() const;
// Static reader returning an int; bl, info and past_intervals are non-const
// references.
// NOTE(review): the end of this parameter list is missing from this listing.
2428 static int read_info(
2429 ObjectStore *store, spg_t pgid, const coll_t &coll,
2430 bufferlist &bl, pg_info_t &info, PastIntervals &past_intervals,
2432 void read_state(ObjectStore *store, bufferlist &bl);
2433 static bool _has_removal_flag(ObjectStore *store, spg_t pgid);
// Static; pepoch and bl are pointers -- presumably out-parameters; TODO confirm.
2434 static int peek_map_epoch(ObjectStore *store, spg_t pgid,
2435 epoch_t *pepoch, bufferlist *bl);
// Takes the log entries by const reference and a transaction by reference.
2436 void update_snap_map(
2437 const vector<pg_log_entry_t> &log_entries,
2438 ObjectStore::Transaction& t);
// Takes the snap id vector by non-const reference, so it may be edited in place.
2440 void filter_snapc(vector<snapid_t> &snaps);
2442 void log_weirdness();
// Pure virtual snap-trim hooks; implementations are not part of this listing.
2444 virtual void kick_snap_trim() = 0;
2445 virtual void snap_trimmer_scrub_complete() = 0;
// high_priority defaults to false.
2446 bool requeue_scrub(bool high_priority = false);
2447 void queue_recovery();
2449 unsigned get_scrub_priority();
2451 /// share pg info after a pg is active
2452 void share_pg_info();
// Takes a mempool-backed list of log entries and a transaction; returns bool.
// NOTE(review): the meaning of the return value is not visible here.
2455 bool append_log_entries_update_missing(
2456 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2457 ObjectStore::Transaction &t);
2460 /* Merge entries updating missing as necessary on all
2461 * actingbackfill logs and missings (also missing_loc) */
// Same parameter shape as append_log_entries_update_missing (entry list plus
// transaction), but returns nothing.
2463 void merge_new_log_entries(
2464 const mempool::osd_pglog::list<pg_log_entry_t> &entries,
2465 ObjectStore::Transaction &t);
2467 void reset_interval_flush();
// Takes the previous map, the new up/acting vectors with their primaries, and
// a transaction pointer.
2468 void start_peering_interval(
2469 const OSDMapRef lastmap,
2470 const vector<int>& newup, int up_primary,
2471 const vector<int>& newacting, int acting_primary,
2472 ObjectStore::Transaction *t);
// Non-virtual on_new_interval() sits next to a pure virtual _on_new_interval();
// presumably the former calls the latter -- TODO confirm in the definition.
2473 void on_new_interval();
2474 virtual void _on_new_interval() = 0;
// Takes a transaction plus two Context lists, named on_applied and on_safe.
2475 void start_flush(ObjectStore::Transaction *t,
2476 list<Context *> *on_applied,
2477 list<Context *> *on_safe);
2478 void set_last_peering_reset();
// True when `deleting` is set or when e is older than the last peering reset
// epoch. Asserts is_locked() first.
2479 bool pg_has_reset_since(epoch_t e) {
2480 assert(is_locked());
2481 return deleting || e < get_last_peering_reset();
// Takes a pg_history_t by const reference (declared only).
2484 void update_history(const pg_history_t& history);
// Declared only; the definition is not part of this listing.
// Parameters: `from` is a pg_shard_t, `query` a const pg_query_t reference and
// `notify_info` a mutable reference to a (pg_shard_t, pg_info_t) pair --
// presumably the slot the reply is written into; confirm against the definition.
2485 void fulfill_info(pg_shard_t from, const pg_query_t &query,
2486 pair<pg_shard_t, pg_info_t> &notify_info);
// Takes the requesting shard, the query and the epoch of the query.
2487 void fulfill_log(pg_shard_t from, const pg_query_t &query, epoch_t query_epoch);
// Takes the previous and the current map.
2489 void check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap);
// NOTE(review): several parameters and the closing of this declaration are
// missing from this listing.
2491 bool should_restart_peering(
2493 int newactingprimary,
2494 const vector<int>& newup,
2495 const vector<int>& newacting,
2499 // OpRequest queueing
// Predicates over an OpRequestRef. Some take it by reference and some by
// value, exactly as declared.
2500 bool can_discard_op(OpRequestRef& op);
2501 bool can_discard_scan(OpRequestRef op);
2502 bool can_discard_backfill(OpRequestRef op);
2503 bool can_discard_request(OpRequestRef& op);
// Template variant parameterised on a type T and an integer MSGTYPE.
2505 template<typename T, int MSGTYPE>
2506 bool can_discard_replica_op(OpRequestRef& op);
2508 bool old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch);
// Convenience wrapper: passes the event's sent epoch and requested epoch to
// old_peering_msg().
2509 bool old_peering_evt(CephPeeringEvtRef evt) {
2510 return old_peering_msg(evt->get_epoch_sent(), evt->get_epoch_requested());
// Static form: true when e is not newer than cur_epoch.
2512 static bool have_same_or_newer_map(epoch_t cur_epoch, epoch_t e) {
2513 return e <= cur_epoch;
// Member form: same comparison, against the epoch of get_osdmap().
2515 bool have_same_or_newer_map(epoch_t e) {
2516 return e <= get_osdmap()->get_epoch();
2519 bool op_has_sufficient_caps(OpRequestRef& op);
2523 void take_waiters();
// Peering event entry points: queue_* take the pieces of an event,
// handle_peering_event takes a ready-made one plus a RecoveryCtx.
2524 void queue_peering_event(CephPeeringEvtRef evt);
2525 void handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx);
2526 void queue_query(epoch_t msg_epoch, epoch_t query_epoch,
2527 pg_shard_t from, const pg_query_t& q);
2528 void queue_null(epoch_t msg_epoch, epoch_t query_epoch);
2529 void queue_flushed(epoch_t started_at);
// NOTE(review): the last parameter(s) of handle_advance_map are missing from
// this listing. newup and newacting are taken by non-const reference.
2530 void handle_advance_map(
2531 OSDMapRef osdmap, OSDMapRef lastmap,
2532 vector<int>& newup, int up_primary,
2533 vector<int>& newacting, int acting_primary,
// Handlers that each take a RecoveryCtx pointer.
2535 void handle_activate_map(RecoveryCtx *rctx);
2536 void handle_create(RecoveryCtx *rctx);
2537 void handle_loaded(RecoveryCtx *rctx);
// Takes a Formatter pointer instead of a RecoveryCtx.
2538 void handle_query_state(Formatter *f);
// Pure virtual interface (every complete declaration here ends in `= 0`).
// Implementations are not part of this listing.
2540 virtual void on_removal(ObjectStore::Transaction *t) = 0;
// NOTE(review): do_request, do_scan and do_command below are shown only in
// part; some of their parameter lines are missing from this listing.
2544 virtual void do_request(
2546 ThreadPool::TPHandle &handle
2549 virtual void do_op(OpRequestRef& op) = 0;
2550 virtual void do_sub_op(OpRequestRef op) = 0;
2551 virtual void do_sub_op_reply(OpRequestRef op) = 0;
2552 virtual void do_scan(
2554 ThreadPool::TPHandle &handle
2556 virtual void do_backfill(OpRequestRef op) = 0;
2557 virtual void snap_trimmer(epoch_t epoch_queued) = 0;
2559 virtual int do_command(
2565 ceph_tid_t tid) = 0;
// Lifecycle callbacks, named after the event they correspond to.
2567 virtual void on_role_change() = 0;
2568 virtual void on_pool_change() = 0;
2569 virtual void on_change(ObjectStore::Transaction *t) = 0;
2570 virtual void on_activate() = 0;
2571 virtual void on_flushed() = 0;
2572 virtual void on_shutdown() = 0;
2573 virtual void check_blacklisted_watchers() = 0;
// Takes a list of obj_watch_item_t by non-const reference -- presumably filled
// by the implementation; TODO confirm.
2574 virtual void get_watchers(std::list<obj_watch_item_t>&) = 0;
// agent_* hooks. agent_work has two overloads; the second adds
// agent_flush_quota.
2576 virtual bool agent_work(int max) = 0;
2577 virtual bool agent_work(int max, int agent_flush_quota) = 0;
2578 virtual void agent_stop() = 0;
2579 virtual void agent_delay() = 0;
2580 virtual void agent_clear() = 0;
2581 virtual void agent_choose_mode_restart() = 0;
// Stream-insertion operators for PG and PG::BackfillInterval (declared only).
2584 ostream& operator<<(ostream& out, const PG& pg);
2586 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi);