Fix some bugs found when testing the OpenSDS Ansible deployment
[stor4nfv.git] / src / ceph / src / osd / PG.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4  * Ceph - scalable distributed file system
5  *
6  * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7  *
8  * This is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License version 2.1, as published by the Free Software
11  * Foundation.  See file COPYING.
12  *
13  */
14
15 #include "PG.h"
16 // #include "msg/Messenger.h"
17 #include "messages/MOSDRepScrub.h"
18 // #include "common/cmdparse.h"
19 // #include "common/ceph_context.h"
20
21 #include "common/errno.h"
22 #include "common/config.h"
23 #include "OSD.h"
24 #include "OpRequest.h"
25 #include "ScrubStore.h"
26 #include "Session.h"
27
28 #include "common/Timer.h"
29 #include "common/perf_counters.h"
30
31 #include "messages/MOSDOp.h"
32 #include "messages/MOSDPGNotify.h"
33 // #include "messages/MOSDPGLog.h"
34 #include "messages/MOSDPGRemove.h"
35 #include "messages/MOSDPGInfo.h"
36 #include "messages/MOSDPGTrim.h"
37 #include "messages/MOSDPGScan.h"
38 #include "messages/MOSDPGBackfill.h"
39 #include "messages/MOSDPGBackfillRemove.h"
40 #include "messages/MBackfillReserve.h"
41 #include "messages/MRecoveryReserve.h"
42 #include "messages/MOSDPGPush.h"
43 #include "messages/MOSDPGPushReply.h"
44 #include "messages/MOSDPGPull.h"
45 #include "messages/MOSDECSubOpWrite.h"
46 #include "messages/MOSDECSubOpWriteReply.h"
47 #include "messages/MOSDECSubOpRead.h"
48 #include "messages/MOSDECSubOpReadReply.h"
49 #include "messages/MOSDPGUpdateLogMissing.h"
50 #include "messages/MOSDPGUpdateLogMissingReply.h"
51 #include "messages/MOSDBackoff.h"
52 #include "messages/MOSDScrubReserve.h"
53 #include "messages/MOSDSubOp.h"
54 #include "messages/MOSDRepOp.h"
55 #include "messages/MOSDSubOpReply.h"
56 #include "messages/MOSDRepOpReply.h"
57 #include "messages/MOSDRepScrubMap.h"
58 #include "messages/MOSDPGRecoveryDelete.h"
59 #include "messages/MOSDPGRecoveryDeleteReply.h"
60
61 #include "common/BackTrace.h"
62 #include "common/EventTrace.h"
63
64 #ifdef WITH_LTTNG
65 #define TRACEPOINT_DEFINE
66 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
67 #include "tracing/pg.h"
68 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
69 #undef TRACEPOINT_DEFINE
70 #else
71 #define tracepoint(...)
72 #endif
73
74 #include <sstream>
75
76 #define dout_context cct
77 #define dout_subsys ceph_subsys_osd
78 #undef dout_prefix
79 #define dout_prefix _prefix(_dout, this)
80
81 // prefix pgmeta_oid keys with _ so that PGLog::read_log_and_missing() can
82 // easily skip them
83 const string infover_key("_infover");
84 const string info_key("_info");
85 const string biginfo_key("_biginfo");
86 const string epoch_key("_epoch");
87 const string fastinfo_key("_fastinfo");
88
89 template <class T>
90 static ostream& _prefix(std::ostream *_dout, T *t)
91 {
92   return *_dout << t->gen_prefix();
93 }
94
95 MEMPOOL_DEFINE_OBJECT_FACTORY(PG::CephPeeringEvt, pg_peering_evt, osd);
96
97 void PGStateHistory::enter(PG* pg, const utime_t entime, const char* state)
98 {
99   // Ignore trimming state machine for now
100   if (::strstr(state, "Trimming") != NULL) {
101     return;
102   } else if (pi != nullptr) {
103     pi->enter_state(entime, state);
104   } else {
105     // Store current state since we can't reliably take the PG lock here
106     if (tmppi == nullptr) {
107       tmppi = std::unique_ptr<PGStateInstance>(new PGStateInstance);
108     }
109
110     thispg = pg;
111     tmppi->enter_state(entime, state);
112   }
113 }
114
115 void PGStateHistory::exit(const char* state) {
116   // Ignore trimming state machine for now
117   // Do nothing if PG is being destroyed!
118   if (::strstr(state, "Trimming") != NULL || pg_in_destructor) {
119     return;
120   } else {
121     bool ilocked = false;
122     if (!thispg->is_locked()) {
123       thispg->lock();
124       ilocked = true;
125     }
126     if (pi == nullptr) {
127       buffer.push_back(std::unique_ptr<PGStateInstance>(tmppi.release()));
128       pi = buffer.back().get();
129       pi->setepoch(thispg->get_osdmap()->get_epoch());
130     }
131
132     pi->exit_state(ceph_clock_now());
133     if (::strcmp(state, "Reset") == 0) {
134       this->reset();
135     }
136     if (ilocked) {
137       thispg->unlock();
138     }
139   }
140 }
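// A note on the enter()/exit() pairing above: enter() may be called without
// the PG lock held, so the sample is parked in tmppi until exit() can safely
// take the lock, stamp the osdmap epoch, and move it into the buffer that
// dump() later walks.  Roughly, for a hypothetical state name:
//
//   pgstate_history.enter(pg, ceph_clock_now(), "Started/Primary/Peering");
//   ...                                                // state machine runs
//   pgstate_history.exit("Started/Primary/Peering");   // records enter/exit times
//
// "Trimming" states and calls made while the PG is being destroyed are
// deliberately ignored, as the early returns in both functions show.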
141
142 void PGStateHistory::dump(Formatter* f) const {
143   f->open_array_section("history");
144   for (auto pi = buffer.begin(); pi != buffer.end(); ++pi) {
145     f->open_object_section("states");
146     f->dump_stream("epoch") << (*pi)->this_epoch;
147     for (auto she : (*pi)->state_history) {
148       f->dump_string("state", std::get<2>(she));
149       f->dump_stream("enter") << std::get<0>(she);
150       f->dump_stream("exit") << std::get<1>(she);
151     }
152     f->close_section();
153   }
154   f->close_section();
155 }
156
157 void PG::get(const char* tag)
158 {
159   ref++;
160 #ifdef PG_DEBUG_REFS
161   Mutex::Locker l(_ref_id_lock);
162   _tag_counts[tag]++;
163 #endif
164 }
165
166 void PG::put(const char* tag)
167 {
168 #ifdef PG_DEBUG_REFS
169   {
170     Mutex::Locker l(_ref_id_lock);
171     auto tag_counts_entry = _tag_counts.find(tag);
172     assert(tag_counts_entry != _tag_counts.end());
173     --tag_counts_entry->second;
174     if (tag_counts_entry->second == 0) {
175       _tag_counts.erase(tag_counts_entry);
176     }
177   }
178 #endif
179   if (--ref == 0)
180     delete this;
181 }
182
183 #ifdef PG_DEBUG_REFS
184 uint64_t PG::get_with_id()
185 {
186   ref++;
187   Mutex::Locker l(_ref_id_lock);
188   uint64_t id = ++_ref_id;
189   BackTrace bt(0);
190   stringstream ss;
191   bt.print(ss);
192   dout(20) << __func__ << ": " << info.pgid << " got id " << id << " (new) ref==" << ref << dendl;
193   assert(!_live_ids.count(id));
194   _live_ids.insert(make_pair(id, ss.str()));
195   return id;
196 }
197
198 void PG::put_with_id(uint64_t id)
199 {
200   dout(20) << __func__ << ": " << info.pgid << " put id " << id << " (current) ref==" << ref << dendl;
201   {
202     Mutex::Locker l(_ref_id_lock);
203     assert(_live_ids.count(id));
204     _live_ids.erase(id);
205   }
206   if (--ref == 0)
207     delete this;
208 }
209
210 void PG::dump_live_ids()
211 {
212   Mutex::Locker l(_ref_id_lock);
213   dout(0) << "\t" << __func__ << ": " << info.pgid << " live ids:" << dendl;
214   for (map<uint64_t, string>::iterator i = _live_ids.begin();
215        i != _live_ids.end();
216        ++i) {
217     dout(0) << "\t\tid: " << *i << dendl;
218   }
219   dout(0) << "\t" << __func__ << ": " << info.pgid << " live tags:" << dendl;
220   for (map<string, uint64_t>::iterator i = _tag_counts.begin();
221        i != _tag_counts.end();
222        ++i) {
223     dout(0) << "\t\ttag: " << *i << dendl;
224   }
225 }
226 #endif
227
228
229 void PGPool::update(OSDMapRef map)
230 {
231   const pg_pool_t *pi = map->get_pg_pool(id);
232   assert(pi);
233   info = *pi;
234   auid = pi->auid;
235   name = map->get_pool_name(id);
236   bool updated = false;
237   if ((map->get_epoch() != cached_epoch + 1) ||
238       (pi->get_snap_epoch() == map->get_epoch())) {
239     updated = true;
240     pi->build_removed_snaps(newly_removed_snaps);
241     interval_set<snapid_t> intersection;
242     intersection.intersection_of(newly_removed_snaps, cached_removed_snaps);
243     if (intersection == cached_removed_snaps) {
244         newly_removed_snaps.subtract(cached_removed_snaps);
245         cached_removed_snaps.union_of(newly_removed_snaps);
246     } else {
247         lgeneric_subdout(cct, osd, 0) << __func__
248           << " cached_removed_snaps shrank from " << cached_removed_snaps
249           << " to " << newly_removed_snaps << dendl;
250         cached_removed_snaps = newly_removed_snaps;
251         newly_removed_snaps.clear();
252     }
253     snapc = pi->get_snap_context();
254   } else {
255     /* 1) map->get_epoch() == cached_epoch + 1 &&
256      * 2) pi->get_snap_epoch() != map->get_epoch()
257      *
258      * Since we are in the else branch, 1 && 2 must both be true.  From 2, we
259      * this map didn't change the set of removed snaps.  From 1, we
260      * know that our cached_removed_snaps matches the previous map.
261      * Thus, from 1 && 2, cached_removed snaps matches the current
262      * set of removed snaps and all we have to do is clear
263      * newly_removed_snaps.
264      */
265     newly_removed_snaps.clear();
266   }
267   cached_epoch = map->get_epoch();
268   lgeneric_subdout(cct, osd, 20)
269     << "PGPool::update cached_removed_snaps "
270     << cached_removed_snaps
271     << " newly_removed_snaps "
272     << newly_removed_snaps
273     << " snapc " << snapc
274     << (updated ? " (updated)":" (no change)")
275     << dendl;
276 }
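// A small worked example (hypothetical values) of the removed-snaps
// bookkeeping above.  Suppose the previous map left
// cached_removed_snaps = [1~4] (snaps 1-4) and the new map's pool has
// removed snaps 1-6:
//
//   newly_removed_snaps = [1~6]                 // from build_removed_snaps()
//   intersection        = [1~4]                 // == cached_removed_snaps
//   newly_removed_snaps.subtract(cached_...)    // -> [5~2], the true delta
//   cached_removed_snaps.union_of(newly_...)    // -> [1~6]
//
// If the intersection were smaller than the cache, the cache would have
// shrunk, which should not normally happen; the else branch logs that case
// and rebuilds the cache from scratch.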
277
278 PG::PG(OSDService *o, OSDMapRef curmap,
279        const PGPool &_pool, spg_t p) :
280   osd(o),
281   cct(o->cct),
282   osdriver(osd->store, coll_t(), OSD::make_snapmapper_oid()),
283   snap_mapper(
284     cct,
285     &osdriver,
286     p.ps(),
287     p.get_split_bits(curmap->get_pg_num(_pool.id)),
288     _pool.id,
289     p.shard),
290   osdmap_ref(curmap), last_persisted_osdmap_ref(curmap), pool(_pool),
291   _lock("PG::_lock"),
292   #ifdef PG_DEBUG_REFS
293   _ref_id_lock("PG::_ref_id_lock"), _ref_id(0),
294   #endif
295   deleting(false),
296   trace_endpoint("0.0.0.0", 0, "PG"),
297   dirty_info(false), dirty_big_info(false),
298   info(p),
299   info_struct_v(0),
300   coll(p),
301   pg_log(cct),
302   pgmeta_oid(p.make_pgmeta_oid()),
303   missing_loc(this),
304   past_intervals(
305     curmap->get_pools().at(p.pgid.pool()).ec_pool(),
306     *curmap),
307   stat_queue_item(this),
308   scrub_queued(false),
309   recovery_queued(false),
310   recovery_ops_active(0),
311   role(-1),
312   state(0),
313   send_notify(false),
314   pg_whoami(osd->whoami, p.shard),
315   need_up_thru(false),
316   last_peering_reset(0),
317   heartbeat_peer_lock("PG::heartbeat_peer_lock"),
318   backfill_reserved(false),
319   backfill_reserving(false),
320   flushes_in_progress(0),
321   pg_stats_publish_lock("PG::pg_stats_publish_lock"),
322   pg_stats_publish_valid(false),
323   osr(osd->osr_registry.lookup_or_create(p, (stringify(p)))),
324   finish_sync_event(NULL),
325   backoff_lock("PG::backoff_lock"),
326   scrub_after_recovery(false),
327   active_pushes(0),
328   recovery_state(this),
329   pg_id(p),
330   peer_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
331   acting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
332   upacting_features(CEPH_FEATURES_SUPPORTED_DEFAULT),
333   last_epoch(0)
334 {
335 #ifdef PG_DEBUG_REFS
336   osd->add_pgid(p, this);
337 #endif
338 #ifdef WITH_BLKIN
339   std::stringstream ss;
340   ss << "PG " << info.pgid;
341   trace_endpoint.copy_name(ss.str());
342 #endif
343   osr->shard_hint = p;
344 }
345
346 PG::~PG()
347 {
348   pgstate_history.set_pg_in_destructor();
349 #ifdef PG_DEBUG_REFS
350   osd->remove_pgid(info.pgid, this);
351 #endif
352 }
353
354 void PG::lock_suspend_timeout(ThreadPool::TPHandle &handle)
355 {
356   handle.suspend_tp_timeout();
357   lock();
358   handle.reset_tp_timeout();
359 }
360
361 void PG::lock(bool no_lockdep) const
362 {
363   _lock.Lock(no_lockdep);
364   // if we have unrecorded dirty state with the lock dropped, there is a bug
365   assert(!dirty_info);
366   assert(!dirty_big_info);
367
368   dout(30) << "lock" << dendl;
369 }
370
371 std::string PG::gen_prefix() const
372 {
373   stringstream out;
374   OSDMapRef mapref = osdmap_ref;
375   if (_lock.is_locked_by_me()) {
376     out << "osd." << osd->whoami
377         << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
378         << " " << *this << " ";
379   } else {
380     out << "osd." << osd->whoami
381         << " pg_epoch: " << (mapref ? mapref->get_epoch():0)
382         << " pg[" << info.pgid << "(unlocked)] ";
383   }
384   return out.str();
385 }
386   
387 /********* PG **********/
388
389 void PG::proc_master_log(
390   ObjectStore::Transaction& t, pg_info_t &oinfo,
391   pg_log_t &olog, pg_missing_t& omissing, pg_shard_t from)
392 {
393   dout(10) << "proc_master_log for osd." << from << ": "
394            << olog << " " << omissing << dendl;
395   assert(!is_peered() && is_primary());
396
397   // merge log into our own log to build master log.  no need to
398   // make any adjustments to their missing map; we are taking their
399   // log to be authoritative (i.e., their entries are by definition
400   // non-divergent).
401   merge_log(t, oinfo, olog, from);
402   peer_info[from] = oinfo;
403   dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
404   might_have_unfound.insert(from);
405
406   // See doc/dev/osd_internals/last_epoch_started
407   if (oinfo.last_epoch_started > info.last_epoch_started) {
408     info.last_epoch_started = oinfo.last_epoch_started;
409     dirty_info = true;
410   }
411   if (oinfo.last_interval_started > info.last_interval_started) {
412     info.last_interval_started = oinfo.last_interval_started;
413     dirty_info = true;
414   }
415   update_history(oinfo.history);
416   assert(cct->_conf->osd_find_best_info_ignore_history_les ||
417          info.last_epoch_started >= info.history.last_epoch_started);
418
419   peer_missing[from].claim(omissing);
420 }
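// Context for proc_master_log(): it runs on the primary during peering, after
// choose_acting() has picked `from` as the authoritative log shard.  Merging
// that log brings our own log in line with the authoritative one, the peer's
// missing set is taken verbatim into peer_missing[from], and
// last_epoch_started / last_interval_started are only ever advanced
// (see doc/dev/osd_internals/last_epoch_started).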
421     
422 void PG::proc_replica_log(
423   pg_info_t &oinfo,
424   const pg_log_t &olog,
425   pg_missing_t& omissing,
426   pg_shard_t from)
427 {
428   dout(10) << "proc_replica_log for osd." << from << ": "
429            << oinfo << " " << olog << " " << omissing << dendl;
430
431   pg_log.proc_replica_log(oinfo, olog, omissing, from);
432
433   peer_info[from] = oinfo;
434   dout(10) << " peer osd." << from << " now " << oinfo << " " << omissing << dendl;
435   might_have_unfound.insert(from);
436
437   for (map<hobject_t, pg_missing_item>::const_iterator i =
438          omissing.get_items().begin();
439        i != omissing.get_items().end();
440        ++i) {
441     dout(20) << " after missing " << i->first << " need " << i->second.need
442              << " have " << i->second.have << dendl;
443   }
444   peer_missing[from].claim(omissing);
445 }
446
447 bool PG::proc_replica_info(
448   pg_shard_t from, const pg_info_t &oinfo, epoch_t send_epoch)
449 {
450   map<pg_shard_t, pg_info_t>::iterator p = peer_info.find(from);
451   if (p != peer_info.end() && p->second.last_update == oinfo.last_update) {
452     dout(10) << " got dup osd." << from << " info " << oinfo << ", identical to ours" << dendl;
453     return false;
454   }
455
456   if (!get_osdmap()->has_been_up_since(from.osd, send_epoch)) {
457     dout(10) << " got info " << oinfo << " from down osd." << from
458              << " discarding" << dendl;
459     return false;
460   }
461
462   dout(10) << " got osd." << from << " " << oinfo << dendl;
463   assert(is_primary());
464   peer_info[from] = oinfo;
465   might_have_unfound.insert(from);
466
467   update_history(oinfo.history);
468   
469   // stray?
470   if (!is_up(from) && !is_acting(from)) {
471     dout(10) << " osd." << from << " has stray content: " << oinfo << dendl;
472     stray_set.insert(from);
473     if (is_clean()) {
474       purge_strays();
475     }
476   }
477
478   // was this a new info?  if so, update peers!
479   if (p == peer_info.end())
480     update_heartbeat_peers();
481
482   return true;
483 }
484
485 void PG::remove_snap_mapped_object(
486   ObjectStore::Transaction &t, const hobject_t &soid)
487 {
488   t.remove(
489     coll,
490     ghobject_t(soid, ghobject_t::NO_GEN, pg_whoami.shard));
491   clear_object_snap_mapping(&t, soid);
492 }
493
494 void PG::clear_object_snap_mapping(
495   ObjectStore::Transaction *t, const hobject_t &soid)
496 {
497   OSDriver::OSTransaction _t(osdriver.get_transaction(t));
498   if (soid.snap < CEPH_MAXSNAP) {
499     int r = snap_mapper.remove_oid(
500       soid,
501       &_t);
502     if (!(r == 0 || r == -ENOENT)) {
503       derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
504       ceph_abort();
505     }
506   }
507 }
508
509 void PG::update_object_snap_mapping(
510   ObjectStore::Transaction *t, const hobject_t &soid, const set<snapid_t> &snaps)
511 {
512   OSDriver::OSTransaction _t(osdriver.get_transaction(t));
513   assert(soid.snap < CEPH_MAXSNAP);
514   int r = snap_mapper.remove_oid(
515     soid,
516     &_t);
517   if (!(r == 0 || r == -ENOENT)) {
518     derr << __func__ << ": remove_oid returned " << cpp_strerror(r) << dendl;
519     ceph_abort();
520   }
521   snap_mapper.add_oid(
522     soid,
523     snaps,
524     &_t);
525 }
526
527 void PG::merge_log(
528   ObjectStore::Transaction& t, pg_info_t &oinfo, pg_log_t &olog, pg_shard_t from)
529 {
530   PGLogEntryHandler rollbacker{this, &t};
531   pg_log.merge_log(
532     oinfo, olog, from, info, &rollbacker, dirty_info, dirty_big_info);
533 }
534
535 void PG::rewind_divergent_log(ObjectStore::Transaction& t, eversion_t newhead)
536 {
537   PGLogEntryHandler rollbacker{this, &t};
538   pg_log.rewind_divergent_log(
539     newhead, info, &rollbacker, dirty_info, dirty_big_info);
540 }
541
542 /*
543  * Process information from a replica to determine if it could have any
544  * objects that I need.
545  *
546  * TODO: if the missing set becomes very large, this could get expensive.
547  * Instead, we probably want to just iterate over our unfound set.
548  */
549 bool PG::search_for_missing(
550   const pg_info_t &oinfo, const pg_missing_t &omissing,
551   pg_shard_t from,
552   RecoveryCtx *ctx)
553 {
554   uint64_t num_unfound_before = missing_loc.num_unfound();
555   bool found_missing = missing_loc.add_source_info(
556     from, oinfo, omissing, ctx->handle);
557   if (found_missing && num_unfound_before != missing_loc.num_unfound())
558     publish_stats_to_osd();
559   if (found_missing &&
560       (get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, NULL) &
561        CEPH_FEATURE_OSD_ERASURE_CODES)) {
562     pg_info_t tinfo(oinfo);
563     tinfo.pgid.shard = pg_whoami.shard;
564     (*(ctx->info_map))[from.osd].push_back(
565       make_pair(
566         pg_notify_t(
567           from.shard, pg_whoami.shard,
568           get_osdmap()->get_epoch(),
569           get_osdmap()->get_epoch(),
570           tinfo),
571         past_intervals));
572   }
573   return found_missing;
574 }
575
576 bool PG::MissingLoc::readable_with_acting(
577   const hobject_t &hoid,
578   const set<pg_shard_t> &acting) const {
579   if (!needs_recovery(hoid))
580     return true;
581   if (is_deleted(hoid))
582     return false;
583   auto missing_loc_entry = missing_loc.find(hoid);
584   if (missing_loc_entry == missing_loc.end())
585     return false;
586   const set<pg_shard_t> &locs = missing_loc_entry->second;
587   ldout(pg->cct, 10) << __func__ << ": locs:" << locs << dendl;
588   set<pg_shard_t> have_acting;
589   for (set<pg_shard_t>::const_iterator i = locs.begin();
590        i != locs.end();
591        ++i) {
592     if (acting.count(*i))
593       have_acting.insert(*i);
594   }
595   return (*is_readable)(have_acting);
596 }
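// readable_with_acting() answers: can this object, which still needs
// recovery, be served with the shards currently in `acting`?  It intersects
// the object's known locations (missing_loc[hoid]) with the acting set and
// hands that subset to the backend-supplied is_readable predicate, which
// encodes the pool's real requirement (e.g. enough erasure-coded shards).
// Deleted objects and objects with no known location are never readable.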
597
598 void PG::MissingLoc::add_batch_sources_info(
599   const set<pg_shard_t> &sources, ThreadPool::TPHandle* handle)
600 {
601   ldout(pg->cct, 10) << __func__ << ": adding sources in batch "
602                      << sources.size() << dendl;
603   unsigned loop = 0;
604   for (map<hobject_t, pg_missing_item>::const_iterator i = needs_recovery_map.begin();
605       i != needs_recovery_map.end();
606       ++i) {
607     if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
608       handle->reset_tp_timeout();
609       loop = 0;
610     }
611     if (i->second.is_delete())
612       continue;
613     missing_loc[i->first].insert(sources.begin(), sources.end());
614     missing_loc_sources.insert(sources.begin(), sources.end());
615   }
616 }
617
618 bool PG::MissingLoc::add_source_info(
619   pg_shard_t fromosd,
620   const pg_info_t &oinfo,
621   const pg_missing_t &omissing,
622   ThreadPool::TPHandle* handle)
623 {
624   bool found_missing = false;
625   unsigned loop = 0;
626   // found items?
627   for (map<hobject_t,pg_missing_item>::const_iterator p = needs_recovery_map.begin();
628        p != needs_recovery_map.end();
629        ++p) {
630     const hobject_t &soid(p->first);
631     eversion_t need = p->second.need;
632     if (handle && ++loop >= pg->cct->_conf->osd_loop_before_reset_tphandle) {
633       handle->reset_tp_timeout();
634       loop = 0;
635     }
636     if (p->second.is_delete()) {
637       ldout(pg->cct, 10) << __func__ << " " << soid
638                          << " delete, ignoring source" << dendl;
639       found_missing = true;
640       continue;
641     }
642     if (oinfo.last_update < need) {
643       ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
644                          << " also missing on osd." << fromosd
645                          << " (last_update " << oinfo.last_update
646                          << " < needed " << need << ")" << dendl;
647       continue;
648     }
649     if (!oinfo.last_backfill.is_max() &&
650         !oinfo.last_backfill_bitwise) {
651       ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
652                          << " also missing on osd." << fromosd
653                          << " (last_backfill " << oinfo.last_backfill
654                          << " but with wrong sort order)"
655                          << dendl;
656       continue;
657     }
658     if (p->first >= oinfo.last_backfill) {
659       // FIXME: this is _probably_ true, although it could conceivably
660       // be in the undefined region!  Hmm!
661       ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
662                          << " also missing on osd." << fromosd
663                          << " (past last_backfill " << oinfo.last_backfill
664                          << ")" << dendl;
665       continue;
666     }
667     if (oinfo.last_complete < need) {
668       if (omissing.is_missing(soid)) {
669         ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
670                            << " also missing on osd." << fromosd << dendl;
671         continue;
672       }
673     }
674
675     ldout(pg->cct, 10) << "search_for_missing " << soid << " " << need
676                        << " is on osd." << fromosd << dendl;
677
678     missing_loc[soid].insert(fromosd);
679     missing_loc_sources.insert(fromosd);
680     found_missing = true;
681   }
682
683   ldout(pg->cct, 20) << "needs_recovery_map missing " << needs_recovery_map
684                      << dendl;
685   return found_missing;
686 }
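// Summary of the filters applied in add_source_info() above: `fromosd` is
// recorded as a location for an object only if its last_update is recent
// enough for the needed version, its last_backfill uses the current sort
// order, the object does not lie past its last_backfill horizon, and the
// peer does not itself list the object as missing.  Delete entries need no
// source at all and are simply marked found.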
687
688 void PG::discover_all_missing(map<int, map<spg_t,pg_query_t> > &query_map)
689 {
690   auto &missing = pg_log.get_missing();
691   uint64_t unfound = get_num_unfound();
692   assert(unfound > 0);
693
694   dout(10) << __func__ << " "
695            << missing.num_missing() << " missing, "
696            << unfound << " unfound"
697            << dendl;
698
699   std::set<pg_shard_t>::const_iterator m = might_have_unfound.begin();
700   std::set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
701   for (; m != mend; ++m) {
702     pg_shard_t peer(*m);
703     
704     if (!get_osdmap()->is_up(peer.osd)) {
705       dout(20) << __func__ << " skipping down osd." << peer << dendl;
706       continue;
707     }
708
709     map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(peer);
710     if (iter != peer_info.end() &&
711         (iter->second.is_empty() || iter->second.dne())) {
712       // ignore empty peers
713       continue;
714     }
715
716     // If we've requested any of this stuff, the pg_missing_t information
717     // should be on its way.
718     // TODO: coalesce requested_* into a single data structure
719     if (peer_missing.find(peer) != peer_missing.end()) {
720       dout(20) << __func__ << ": osd." << peer
721                << ": we already have pg_missing_t" << dendl;
722       continue;
723     }
724     if (peer_log_requested.find(peer) != peer_log_requested.end()) {
725       dout(20) << __func__ << ": osd." << peer
726                << ": in peer_log_requested" << dendl;
727       continue;
728     }
729     if (peer_missing_requested.find(peer) != peer_missing_requested.end()) {
730       dout(20) << __func__ << ": osd." << peer
731                << ": in peer_missing_requested" << dendl;
732       continue;
733     }
734
735     // Request missing
736     dout(10) << __func__ << ": osd." << peer << ": requesting pg_missing_t"
737              << dendl;
738     peer_missing_requested.insert(peer);
739     query_map[peer.osd][spg_t(info.pgid.pgid, peer.shard)] =
740       pg_query_t(
741         pg_query_t::FULLLOG,
742         peer.shard, pg_whoami.shard,
743         info.history, get_osdmap()->get_epoch());
744   }
745 }
746
747 /******* PG ***********/
748 bool PG::needs_recovery() const
749 {
750   assert(is_primary());
751
752   auto &missing = pg_log.get_missing();
753
754   if (missing.num_missing()) {
755     dout(10) << __func__ << " primary has " << missing.num_missing()
756       << " missing" << dendl;
757     return true;
758   }
759
760   assert(!actingbackfill.empty());
761   set<pg_shard_t>::const_iterator end = actingbackfill.end();
762   set<pg_shard_t>::const_iterator a = actingbackfill.begin();
763   for (; a != end; ++a) {
764     if (*a == get_primary()) continue;
765     pg_shard_t peer = *a;
766     map<pg_shard_t, pg_missing_t>::const_iterator pm = peer_missing.find(peer);
767     if (pm == peer_missing.end()) {
768       dout(10) << __func__ << " osd." << peer << " doesn't have missing set"
769         << dendl;
770       continue;
771     }
772     if (pm->second.num_missing()) {
773       dout(10) << __func__ << " osd." << peer << " has "
774         << pm->second.num_missing() << " missing" << dendl;
775       return true;
776     }
777   }
778
779   dout(10) << __func__ << " is recovered" << dendl;
780   return false;
781 }
782
783 bool PG::needs_backfill() const
784 {
785   assert(is_primary());
786
787   // We can assume that the only OSDs that might need backfill
788   // are those listed in backfill_targets.
789   set<pg_shard_t>::const_iterator end = backfill_targets.end();
790   set<pg_shard_t>::const_iterator a = backfill_targets.begin();
791   for (; a != end; ++a) {
792     pg_shard_t peer = *a;
793     map<pg_shard_t, pg_info_t>::const_iterator pi = peer_info.find(peer);
794     if (!pi->second.last_backfill.is_max()) {
795       dout(10) << __func__ << " osd." << peer << " has last_backfill " << pi->second.last_backfill << dendl;
796       return true;
797     }
798   }
799
800   dout(10) << __func__ << " does not need backfill" << dendl;
801   return false;
802 }
803
804
805 void PG::check_past_interval_bounds() const
806 {
807   auto rpib = get_required_past_interval_bounds(
808     info,
809     osd->get_superblock().oldest_map);
810   if (rpib.first >= rpib.second) {
811     if (!past_intervals.empty()) {
812       osd->clog->error() << info.pgid << " required past_interval bounds are"
813                          << " empty [" << rpib << ") but past_intervals is not: "
814                          << past_intervals;
815       derr << info.pgid << " required past_interval bounds are"
816            << " empty [" << rpib << ") but past_intervals is not: "
817            << past_intervals << dendl;
818     }
819   } else {
820     if (past_intervals.empty()) {
821       osd->clog->error() << info.pgid << " required past_interval bounds are"
822                          << " not empty [" << rpib << ") but past_intervals "
823                          << past_intervals << " is empty";
824       derr << info.pgid << " required past_interval bounds are"
825            << " not empty [" << rpib << ") but past_intervals "
826            << past_intervals << " is empty" << dendl;
827       assert(!past_intervals.empty());
828     }
829
830     auto apib = past_intervals.get_bounds();
831     if (apib.first > rpib.first) {
832       osd->clog->error() << info.pgid << " past_intervals [" << apib
833                          << ") start interval does not contain the required"
834                          << " bound [" << rpib << ") start";
835       derr << info.pgid << " past_intervals [" << apib
836            << ") start interval does not contain the required"
837            << " bound [" << rpib << ") start" << dendl;
838       assert(0 == "past_interval start interval mismatch");
839     }
840     if (apib.second != rpib.second) {
841       osd->clog->error() << info.pgid << " past_interval bound [" << apib
842                          << ") end does not match required [" << rpib
843                          << ") end";
844       derr << info.pgid << " past_interval bound [" << apib
845            << ") end does not match required [" << rpib
846            << ") end" << dendl;
847       assert(0 == "past_interval end mismatch");
848     }
849   }
850 }
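// The "required" bounds checked above are, roughly, the epoch span
// [history.last_epoch_clean, history.same_interval_since) over which peering
// may need prior-interval information.  An empty required range must come
// with empty past_intervals; a non-empty one must be covered: the recorded
// intervals may start earlier than required but never later, and must end
// exactly where the current interval begins.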
851
852 bool PG::adjust_need_up_thru(const OSDMapRef osdmap)
853 {
854   epoch_t up_thru = osdmap->get_up_thru(osd->whoami);
855   if (need_up_thru &&
856       up_thru >= info.history.same_interval_since) {
857     dout(10) << "adjust_need_up_thru now " << up_thru << ", need_up_thru now false" << dendl;
858     need_up_thru = false;
859     return true;
860   }
861   return false;
862 }
863
864 void PG::remove_down_peer_info(const OSDMapRef osdmap)
865 {
866   // Remove any downed osds from peer_info
867   bool removed = false;
868   map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
869   while (p != peer_info.end()) {
870     if (!osdmap->is_up(p->first.osd)) {
871       dout(10) << " dropping down osd." << p->first << " info " << p->second << dendl;
872       peer_missing.erase(p->first);
873       peer_log_requested.erase(p->first);
874       peer_missing_requested.erase(p->first);
875       peer_info.erase(p++);
876       removed = true;
877     } else
878       ++p;
879   }
880
881   // if we removed anyone, update peers (which include peer_info)
882   if (removed)
883     update_heartbeat_peers();
884   check_recovery_sources(osdmap);
885 }
886
887 /*
888  * Returns true unless there is a non-lost OSD in might_have_unfound.
889  */
890 bool PG::all_unfound_are_queried_or_lost(const OSDMapRef osdmap) const
891 {
892   assert(is_primary());
893
894   set<pg_shard_t>::const_iterator peer = might_have_unfound.begin();
895   set<pg_shard_t>::const_iterator mend = might_have_unfound.end();
896   for (; peer != mend; ++peer) {
897     if (peer_missing.count(*peer))
898       continue;
899     map<pg_shard_t, pg_info_t>::const_iterator iter = peer_info.find(*peer);
900     if (iter != peer_info.end() &&
901         (iter->second.is_empty() || iter->second.dne()))
902       continue;
903     if (!osdmap->exists(peer->osd))
904       continue;
905     const osd_info_t &osd_info(osdmap->get_info(peer->osd));
906     if (osd_info.lost_at <= osd_info.up_from) {
907       // If there is even one OSD in might_have_unfound that isn't lost, we
908       // still might retrieve our unfound.
909       return false;
910     }
911   }
912   dout(10) << "all_unfound_are_queried_or_lost all of might_have_unfound " << might_have_unfound 
913            << " have been queried or are marked lost" << dendl;
914   return true;
915 }
916
917 PastIntervals::PriorSet PG::build_prior()
918 {
919   if (1) {
920     // sanity check
921     for (map<pg_shard_t,pg_info_t>::iterator it = peer_info.begin();
922          it != peer_info.end();
923          ++it) {
924       assert(info.history.last_epoch_started >= it->second.history.last_epoch_started);
925     }
926   }
927
928   const OSDMap &osdmap = *get_osdmap();
929   PastIntervals::PriorSet prior = past_intervals.get_prior_set(
930     pool.info.ec_pool(),
931     info.history.last_epoch_started,
932     get_pgbackend()->get_is_recoverable_predicate(),
933     [&](epoch_t start, int osd, epoch_t *lost_at) {
934       const osd_info_t *pinfo = 0;
935       if (osdmap.exists(osd)) {
936         pinfo = &osdmap.get_info(osd);
937         if (lost_at)
938           *lost_at = pinfo->lost_at;
939       }
940
941       if (osdmap.is_up(osd)) {
942         return PastIntervals::UP;
943       } else if (!pinfo) {
944         return PastIntervals::DNE;
945       } else if (pinfo->lost_at > start) {
946         return PastIntervals::LOST;
947       } else {
948         return PastIntervals::DOWN;
949       }
950     },
951     up,
952     acting,
953     this);
954                                  
955   if (prior.pg_down) {
956     state_set(PG_STATE_DOWN);
957   }
958
959   if (get_osdmap()->get_up_thru(osd->whoami) < info.history.same_interval_since) {
960     dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
961              << " < same_since " << info.history.same_interval_since
962              << ", must notify monitor" << dendl;
963     need_up_thru = true;
964   } else {
965     dout(10) << "up_thru " << get_osdmap()->get_up_thru(osd->whoami)
966              << " >= same_since " << info.history.same_interval_since
967              << ", all is well" << dendl;
968     need_up_thru = false;
969   }
970   set_probe_targets(prior.probe);
971   return prior;
972 }
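// The lambda passed to get_prior_set() above classifies each OSD seen in a
// prior interval:
//
//   UP   - currently up, can be probed for info and logs
//   DNE  - no longer exists in the osdmap
//   LOST - marked lost after the interval started (lost_at > start), so its
//          data may be disregarded
//   DOWN - exists, is down, and is not marked lost; if such an OSD might
//          still hold needed data, prior.pg_down is set and the PG is
//          flagged PG_STATE_DOWN
//
// build_prior() also records whether the monitor must publish a new up_thru
// for this OSD (need_up_thru) before peering can complete.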
973
974 void PG::clear_primary_state()
975 {
976   dout(10) << "clear_primary_state" << dendl;
977
978   // clear peering state
979   stray_set.clear();
980   peer_log_requested.clear();
981   peer_missing_requested.clear();
982   peer_info.clear();
983   peer_missing.clear();
984   need_up_thru = false;
985   peer_last_complete_ondisk.clear();
986   peer_activated.clear();
987   min_last_complete_ondisk = eversion_t();
988   pg_trim_to = eversion_t();
989   might_have_unfound.clear();
990   projected_log = PGLog::IndexedLog();
991
992   last_update_ondisk = eversion_t();
993
994   snap_trimq.clear();
995
996   finish_sync_event = 0;  // so that _finish_recovery doesn't go off in another thread
997
998   missing_loc.clear();
999
1000   release_pg_backoffs();
1001
1002   pg_log.reset_recovery_pointers();
1003
1004   scrubber.reserved_peers.clear();
1005   scrub_after_recovery = false;
1006
1007   agent_clear();
1008 }
1009
1010 PG::Scrubber::Scrubber()
1011  : reserved(false), reserve_failed(false),
1012    epoch_start(0),
1013    active(false),
1014    waiting_on(0), shallow_errors(0), deep_errors(0), fixed(0),
1015    must_scrub(false), must_deep_scrub(false), must_repair(false),
1016    auto_repair(false),
1017    num_digest_updates_pending(0),
1018    state(INACTIVE),
1019    deep(false),
1020    seed(0)
1021 {}
1022
1023 PG::Scrubber::~Scrubber() {}
1024
1025 /**
1026  * find_best_info
1027  *
1028  * Returns an iterator to the best info in infos sorted by:
1029  *  1) Prefer newer last_update
1030  *  2) Prefer longer tail if it brings another info into contiguity
1031  *  3) Prefer current primary
1032  */
1033 map<pg_shard_t, pg_info_t>::const_iterator PG::find_best_info(
1034   const map<pg_shard_t, pg_info_t> &infos,
1035   bool restrict_to_up_acting,
1036   bool *history_les_bound) const
1037 {
1038   assert(history_les_bound);
1039   /* See doc/dev/osd_internals/last_epoch_started.rst before attempting
1040    * to make changes to this process.  Also, make sure to update it
1041    * when you find bugs! */
1042   eversion_t min_last_update_acceptable = eversion_t::max();
1043   epoch_t max_last_epoch_started_found = 0;
1044   for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1045        i != infos.end();
1046        ++i) {
1047     if (!cct->_conf->osd_find_best_info_ignore_history_les &&
1048         max_last_epoch_started_found < i->second.history.last_epoch_started) {
1049       *history_les_bound = true;
1050       max_last_epoch_started_found = i->second.history.last_epoch_started;
1051     }
1052     if (!i->second.is_incomplete() &&
1053         max_last_epoch_started_found < i->second.last_epoch_started) {
1054       max_last_epoch_started_found = i->second.last_epoch_started;
1055     }
1056   }
1057   for (map<pg_shard_t, pg_info_t>::const_iterator i = infos.begin();
1058        i != infos.end();
1059        ++i) {
1060     if (max_last_epoch_started_found <= i->second.last_epoch_started) {
1061       if (min_last_update_acceptable > i->second.last_update)
1062         min_last_update_acceptable = i->second.last_update;
1063     }
1064   }
1065   if (min_last_update_acceptable == eversion_t::max())
1066     return infos.end();
1067
1068   map<pg_shard_t, pg_info_t>::const_iterator best = infos.end();
1069   // find osd with newest last_update (oldest for ec_pool).
1070   // if there are multiples, prefer
1071   //  - a longer tail, if it brings another peer into log contiguity
1072   //  - the current primary
1073   for (map<pg_shard_t, pg_info_t>::const_iterator p = infos.begin();
1074        p != infos.end();
1075        ++p) {
1076     if (restrict_to_up_acting && !is_up(p->first) &&
1077         !is_acting(p->first))
1078       continue;
1079     // Only consider peers with last_update >= min_last_update_acceptable
1080     if (p->second.last_update < min_last_update_acceptable)
1081       continue;
1082     // Disqualify anyone with a too old last_epoch_started
1083     if (p->second.last_epoch_started < max_last_epoch_started_found)
1084       continue;
1085     // Disqualify anyone who is incomplete (not fully backfilled)
1086     if (p->second.is_incomplete())
1087       continue;
1088     if (best == infos.end()) {
1089       best = p;
1090       continue;
1091     }
1092     // Prefer newer last_update
1093     if (pool.info.require_rollback()) {
1094       if (p->second.last_update > best->second.last_update)
1095         continue;
1096       if (p->second.last_update < best->second.last_update) {
1097         best = p;
1098         continue;
1099       }
1100     } else {
1101       if (p->second.last_update < best->second.last_update)
1102         continue;
1103       if (p->second.last_update > best->second.last_update) {
1104         best = p;
1105         continue;
1106       }
1107     }
1108
1109     // Prefer longer tail
1110     if (p->second.log_tail > best->second.log_tail) {
1111       continue;
1112     } else if (p->second.log_tail < best->second.log_tail) {
1113       best = p;
1114       continue;
1115     }
1116
1117     // prefer current primary (usually the caller), all things being equal
1118     if (p->first == pg_whoami) {
1119       dout(10) << "calc_acting prefer osd." << p->first
1120                << " because it is current primary" << dendl;
1121       best = p;
1122       continue;
1123     }
1124   }
1125   return best;
1126 }
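// A compressed example (hypothetical values, replicated pool) of the ordering
// find_best_info() applies:
//
//   osd.0: last_update 50'12, log_tail 40'2, last_epoch_started 48
//   osd.1: last_update 50'12, log_tail 38'7, last_epoch_started 48
//   osd.2: last_update 49'20, log_tail 30'1, last_epoch_started 46
//
// osd.2 is disqualified (its last_epoch_started and last_update are both too
// old), osd.0 and osd.1 tie on last_update, and osd.1 wins on the longer
// (older) log tail, since that may bring another peer into log contiguity;
// the current primary is only preferred on a complete tie.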
1127
1128 void PG::calc_ec_acting(
1129   map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1130   unsigned size,
1131   const vector<int> &acting,
1132   pg_shard_t acting_primary,
1133   const vector<int> &up,
1134   pg_shard_t up_primary,
1135   const map<pg_shard_t, pg_info_t> &all_info,
1136   bool restrict_to_up_acting,
1137   vector<int> *_want,
1138   set<pg_shard_t> *backfill,
1139   set<pg_shard_t> *acting_backfill,
1140   pg_shard_t *want_primary,
1141   ostream &ss)
1142 {
1143   vector<int> want(size, CRUSH_ITEM_NONE);
1144   map<shard_id_t, set<pg_shard_t> > all_info_by_shard;
1145   unsigned usable = 0;
1146   for (map<pg_shard_t, pg_info_t>::const_iterator i = all_info.begin();
1147        i != all_info.end();
1148        ++i) {
1149     all_info_by_shard[i->first.shard].insert(i->first);
1150   }
1151   for (uint8_t i = 0; i < want.size(); ++i) {
1152     ss << "For position " << (unsigned)i << ": ";
1153     if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE &&
1154         !all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.is_incomplete() &&
1155         all_info.find(pg_shard_t(up[i], shard_id_t(i)))->second.last_update >=
1156         auth_log_shard->second.log_tail) {
1157       ss << " selecting up[i]: " << pg_shard_t(up[i], shard_id_t(i)) << std::endl;
1158       want[i] = up[i];
1159       ++usable;
1160       continue;
1161     }
1162     if (up.size() > (unsigned)i && up[i] != CRUSH_ITEM_NONE) {
1163       ss << " backfilling up[i]: " << pg_shard_t(up[i], shard_id_t(i))
1164          << " and ";
1165       backfill->insert(pg_shard_t(up[i], shard_id_t(i)));
1166     }
1167
1168     if (acting.size() > (unsigned)i && acting[i] != CRUSH_ITEM_NONE &&
1169         !all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.is_incomplete() &&
1170         all_info.find(pg_shard_t(acting[i], shard_id_t(i)))->second.last_update >=
1171         auth_log_shard->second.log_tail) {
1172       ss << " selecting acting[i]: " << pg_shard_t(acting[i], shard_id_t(i)) << std::endl;
1173       want[i] = acting[i];
1174       ++usable;
1175     } else if (!restrict_to_up_acting) {
1176       for (set<pg_shard_t>::iterator j = all_info_by_shard[shard_id_t(i)].begin();
1177            j != all_info_by_shard[shard_id_t(i)].end();
1178            ++j) {
1179         assert(j->shard == i);
1180         if (!all_info.find(*j)->second.is_incomplete() &&
1181             all_info.find(*j)->second.last_update >=
1182             auth_log_shard->second.log_tail) {
1183           ss << " selecting stray: " << *j << std::endl;
1184           want[i] = j->osd;
1185           ++usable;
1186           break;
1187         }
1188       }
1189       if (want[i] == CRUSH_ITEM_NONE)
1190         ss << " failed to fill position " << (int)i << std::endl;
1191     }
1192   }
1193
1194   bool found_primary = false;
1195   for (uint8_t i = 0; i < want.size(); ++i) {
1196     if (want[i] != CRUSH_ITEM_NONE) {
1197       acting_backfill->insert(pg_shard_t(want[i], shard_id_t(i)));
1198       if (!found_primary) {
1199         *want_primary = pg_shard_t(want[i], shard_id_t(i));
1200         found_primary = true;
1201       }
1202     }
1203   }
1204   acting_backfill->insert(backfill->begin(), backfill->end());
1205   _want->swap(want);
1206 }
1207
1208 /**
1209  * calculate the desired acting set.
1210  *
1211  * Choose an appropriate acting set.  Prefer up[0], unless it is
1212  * incomplete, or another osd has a longer tail that allows us to
1213  * bring other up nodes up to date.
1214  */
1215 void PG::calc_replicated_acting(
1216   map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard,
1217   unsigned size,
1218   const vector<int> &acting,
1219   pg_shard_t acting_primary,
1220   const vector<int> &up,
1221   pg_shard_t up_primary,
1222   const map<pg_shard_t, pg_info_t> &all_info,
1223   bool restrict_to_up_acting,
1224   vector<int> *want,
1225   set<pg_shard_t> *backfill,
1226   set<pg_shard_t> *acting_backfill,
1227   pg_shard_t *want_primary,
1228   ostream &ss)
1229 {
1230   ss << "calc_acting newest update on osd." << auth_log_shard->first
1231      << " with " << auth_log_shard->second
1232      << (restrict_to_up_acting ? " restrict_to_up_acting" : "") << std::endl;
1233   pg_shard_t auth_log_shard_id = auth_log_shard->first;
1234   
1235   // select primary
1236   map<pg_shard_t,pg_info_t>::const_iterator primary;
1237   if (up.size() &&
1238       !all_info.find(up_primary)->second.is_incomplete() &&
1239       all_info.find(up_primary)->second.last_update >=
1240         auth_log_shard->second.log_tail) {
1241     ss << "up_primary: " << up_primary << " selected as primary" << std::endl;
1242     primary = all_info.find(up_primary); // prefer up[0], all things being equal
1243   } else {
1244     assert(!auth_log_shard->second.is_incomplete());
1245     ss << "up[0] needs backfill, osd." << auth_log_shard_id
1246        << " selected as primary instead" << std::endl;
1247     primary = auth_log_shard;
1248   }
1249
1250   ss << "calc_acting primary is osd." << primary->first
1251      << " with " << primary->second << std::endl;
1252   *want_primary = primary->first;
1253   want->push_back(primary->first.osd);
1254   acting_backfill->insert(primary->first);
1255   unsigned usable = 1;
1256
1257   // select replicas that have log contiguity with primary.
1258   // prefer up, then acting, then any peer_info osds 
1259   for (vector<int>::const_iterator i = up.begin();
1260        i != up.end();
1261        ++i) {
1262     pg_shard_t up_cand = pg_shard_t(*i, shard_id_t::NO_SHARD);
1263     if (up_cand == primary->first)
1264       continue;
1265     const pg_info_t &cur_info = all_info.find(up_cand)->second;
1266     if (cur_info.is_incomplete() ||
1267       cur_info.last_update < MIN(
1268         primary->second.log_tail,
1269         auth_log_shard->second.log_tail)) {
1270       /* We include auth_log_shard->second.log_tail because in GetLog,
1271        * we will request logs back to the min last_update over our
1272        * acting_backfill set, which will result in our log being extended
1273        * as far backwards as necessary to pick up any peers which can
1274        * be log recovered by auth_log_shard's log */
1275       ss << " shard " << up_cand << " (up) backfill " << cur_info << std::endl;
1276       backfill->insert(up_cand);
1277       acting_backfill->insert(up_cand);
1278     } else {
1279       want->push_back(*i);
1280       acting_backfill->insert(up_cand);
1281       usable++;
1282       ss << " osd." << *i << " (up) accepted " << cur_info << std::endl;
1283     }
1284   }
1285
1286   // This loop no longer adds backfill OSDs; those were handled in the up[] loop above.
1287   for (vector<int>::const_iterator i = acting.begin();
1288        i != acting.end();
1289        ++i) {
1290     pg_shard_t acting_cand(*i, shard_id_t::NO_SHARD);
1291     if (usable >= size)
1292       break;
1293
1294     // skip up osds we already considered above
1295     if (acting_cand == primary->first)
1296       continue;
1297     vector<int>::const_iterator up_it = find(up.begin(), up.end(), acting_cand.osd);
1298     if (up_it != up.end())
1299       continue;
1300
1301     const pg_info_t &cur_info = all_info.find(acting_cand)->second;
1302     if (cur_info.is_incomplete() ||
1303         cur_info.last_update < primary->second.log_tail) {
1304       ss << " shard " << acting_cand << " (stray) REJECTED "
1305                << cur_info << std::endl;
1306     } else {
1307       want->push_back(*i);
1308       acting_backfill->insert(acting_cand);
1309       ss << " shard " << acting_cand << " (stray) accepted "
1310          << cur_info << std::endl;
1311       usable++;
1312     }
1313   }
1314
1315   if (restrict_to_up_acting) {
1316     return;
1317   }
1318   for (map<pg_shard_t,pg_info_t>::const_iterator i = all_info.begin();
1319        i != all_info.end();
1320        ++i) {
1321     if (usable >= size)
1322       break;
1323
1324     // skip up osds we already considered above
1325     if (i->first == primary->first)
1326       continue;
1327     vector<int>::const_iterator up_it = find(up.begin(), up.end(), i->first.osd);
1328     if (up_it != up.end())
1329       continue;
1330     vector<int>::const_iterator acting_it = find(
1331       acting.begin(), acting.end(), i->first.osd);
1332     if (acting_it != acting.end())
1333       continue;
1334
1335     if (i->second.is_incomplete() ||
1336         i->second.last_update < primary->second.log_tail) {
1337       ss << " shard " << i->first << " (stray) REJECTED "
1338          << i->second << std::endl;
1339     } else {
1340       want->push_back(i->first.osd);
1341       acting_backfill->insert(i->first);
1342       ss << " shard " << i->first << " (stray) accepted "
1343          << i->second << std::endl;
1344       usable++;
1345     }
1346   }
1347 }
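// A sketch (hypothetical values) of the selection above for a replicated
// pool of size 3.  Suppose up = [3,7,9], acting = [3,2,5], osd.7's info is
// incomplete, and everyone else has log contiguity with the chosen primary:
//
//   primary      -> osd.3 (up_primary, complete, contiguous log)
//   up OSDs      -> osd.7 goes to backfill (incomplete), osd.9 is accepted
//   acting OSDs  -> osd.2 is accepted to fill the remaining slot
//   result       -> want = [3,9,2], backfill = {7}, acting_backfill = {2,3,7,9}
//
// Strays from all_info are only considered when restrict_to_up_acting is
// false and the set is still short of `size`.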
1348
1349 /**
1350  * choose acting
1351  *
1352  * calculate the desired acting, and request a change with the monitor
1353  * if it differs from the current acting.
1354  *
1355  * if restrict_to_up_acting=true, we filter out anything that's not in
1356  * up/acting.  in order to lift this restriction, we need to
1357  *  1) check whether it's worth switching the acting set any time we get
1358  *     a new pg info (not just here, when recovery finishes)
1359  *  2) check whether anything in want_acting went down on each new map
1360  *     (and, if so, calculate a new want_acting)
1361  *  3) remove the assertion in PG::RecoveryState::Active::react(const AdvMap)
1362  * TODO!
1363  */
1364 bool PG::choose_acting(pg_shard_t &auth_log_shard_id,
1365                        bool restrict_to_up_acting,
1366                        bool *history_les_bound)
1367 {
1368   map<pg_shard_t, pg_info_t> all_info(peer_info.begin(), peer_info.end());
1369   all_info[pg_whoami] = info;
1370
1371   for (map<pg_shard_t, pg_info_t>::iterator p = all_info.begin();
1372        p != all_info.end();
1373        ++p) {
1374     dout(10) << "calc_acting osd." << p->first << " " << p->second << dendl;
1375   }
1376
1377   map<pg_shard_t, pg_info_t>::const_iterator auth_log_shard =
1378     find_best_info(all_info, restrict_to_up_acting, history_les_bound);
1379
1380   if (auth_log_shard == all_info.end()) {
1381     if (up != acting) {
1382       dout(10) << "choose_acting no suitable info found (incomplete backfills?),"
1383                << " reverting to up" << dendl;
1384       want_acting = up;
1385       vector<int> empty;
1386       osd->queue_want_pg_temp(info.pgid.pgid, empty);
1387     } else {
1388       dout(10) << "choose_acting failed" << dendl;
1389       assert(want_acting.empty());
1390     }
1391     return false;
1392   }
1393
1394   assert(!auth_log_shard->second.is_incomplete());
1395   auth_log_shard_id = auth_log_shard->first;
1396
1397   set<pg_shard_t> want_backfill, want_acting_backfill;
1398   vector<int> want;
1399   pg_shard_t want_primary;
1400   stringstream ss;
1401   if (!pool.info.ec_pool())
1402     calc_replicated_acting(
1403       auth_log_shard,
1404       get_osdmap()->get_pg_size(info.pgid.pgid),
1405       acting,
1406       primary,
1407       up,
1408       up_primary,
1409       all_info,
1410       restrict_to_up_acting,
1411       &want,
1412       &want_backfill,
1413       &want_acting_backfill,
1414       &want_primary,
1415       ss);
1416   else
1417     calc_ec_acting(
1418       auth_log_shard,
1419       get_osdmap()->get_pg_size(info.pgid.pgid),
1420       acting,
1421       primary,
1422       up,
1423       up_primary,
1424       all_info,
1425       restrict_to_up_acting,
1426       &want,
1427       &want_backfill,
1428       &want_acting_backfill,
1429       &want_primary,
1430       ss);
1431   dout(10) << ss.str() << dendl;
1432
1433   unsigned num_want_acting = 0;
1434   set<pg_shard_t> have;
1435   for (int i = 0; i < (int)want.size(); ++i) {
1436     if (want[i] != CRUSH_ITEM_NONE) {
1437       ++num_want_acting;
1438       have.insert(
1439         pg_shard_t(
1440           want[i],
1441           pool.info.ec_pool() ? shard_id_t(i) : shard_id_t::NO_SHARD));
1442     }
1443   }
1444
1445   // We go incomplete if below min_size for ec_pools since backfill
1446   // does not currently maintain rollbackability
1447   // Otherwise, we will go "peered", but not "active"
1448   if (num_want_acting < pool.info.min_size &&
1449       (pool.info.ec_pool() ||
1450        !cct->_conf->osd_allow_recovery_below_min_size)) {
1451     want_acting.clear();
1452     dout(10) << "choose_acting failed, below min size" << dendl;
1453     return false;
1454   }
1455
1456   /* Check whether we have enough acting shards to later perform recovery */
1457   boost::scoped_ptr<IsPGRecoverablePredicate> recoverable_predicate(
1458     get_pgbackend()->get_is_recoverable_predicate());
1459   if (!(*recoverable_predicate)(have)) {
1460     want_acting.clear();
1461     dout(10) << "choose_acting failed, not recoverable" << dendl;
1462     return false;
1463   }
1464
1465   if (want != acting) {
1466     dout(10) << "choose_acting want " << want << " != acting " << acting
1467              << ", requesting pg_temp change" << dendl;
1468     want_acting = want;
1469
1470     if (want_acting == up) {
1471       // There can't be any pending backfill if
1472       // want is the same as crush map up OSDs.
1473       assert(want_backfill.empty());
1474       vector<int> empty;
1475       osd->queue_want_pg_temp(info.pgid.pgid, empty);
1476     } else
1477       osd->queue_want_pg_temp(info.pgid.pgid, want);
1478     return false;
1479   }
1480   want_acting.clear();
1481   actingbackfill = want_acting_backfill;
1482   dout(10) << "actingbackfill is " << actingbackfill << dendl;
1483   assert(backfill_targets.empty() || backfill_targets == want_backfill);
1484   if (backfill_targets.empty()) {
1485     // Caller is GetInfo
1486     backfill_targets = want_backfill;
1487   }
1488   // backfill_targets will not change if already set, because up would have had to change
1489   // Verify that nothing in backfill is in stray_set
1490   for (set<pg_shard_t>::iterator i = want_backfill.begin();
1491       i != want_backfill.end();
1492       ++i) {
1493     assert(stray_set.find(*i) == stray_set.end());
1494   }
1495   dout(10) << "choose_acting want " << want << " (== acting) backfill_targets " 
1496            << want_backfill << dendl;
1497   return true;
1498 }
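// choose_acting() has three broad outcomes: if no authoritative log can be
// found it reverts want_acting to up (or fails outright) and returns false;
// if the computed want set differs from the current acting set it queues a
// pg_temp request for the monitor and returns false, expecting a new osdmap;
// only when want == acting does it commit actingbackfill / backfill_targets
// and return true so peering can proceed.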
1499
1500 /* Build the might_have_unfound set.
1501  *
1502  * This is used by the primary OSD during recovery.
1503  *
1504  * This set tracks the OSDs which might have unfound objects that the primary
1505  * OSD needs. As we receive pg_missing_t from each OSD in might_have_unfound, we
1506  * will remove the OSD from the set.
1507  */
1508 void PG::build_might_have_unfound()
1509 {
1510   assert(might_have_unfound.empty());
1511   assert(is_primary());
1512
1513   dout(10) << __func__ << dendl;
1514
1515   check_past_interval_bounds();
1516
1517   might_have_unfound = past_intervals.get_might_have_unfound(
1518     pg_whoami,
1519     pool.info.ec_pool());
1520
1521   // include any (stray) peers
1522   for (map<pg_shard_t, pg_info_t>::iterator p = peer_info.begin();
1523        p != peer_info.end();
1524        ++p)
1525     might_have_unfound.insert(p->first);
1526
1527   dout(15) << __func__ << ": built " << might_have_unfound << dendl;
1528 }
1529
1530 struct C_PG_ActivateCommitted : public Context {
1531   PGRef pg;
1532   epoch_t epoch;
1533   epoch_t activation_epoch;
1534   C_PG_ActivateCommitted(PG *p, epoch_t e, epoch_t ae)
1535     : pg(p), epoch(e), activation_epoch(ae) {}
1536   void finish(int r) override {
1537     pg->_activate_committed(epoch, activation_epoch);
1538   }
1539 };
1540
1541 void PG::activate(ObjectStore::Transaction& t,
1542                   epoch_t activation_epoch,
1543                   list<Context*>& tfin,
1544                   map<int, map<spg_t,pg_query_t> >& query_map,
1545                   map<int,
1546                       vector<
1547                         pair<pg_notify_t,
1548                              PastIntervals> > > *activator_map,
1549                   RecoveryCtx *ctx)
1550 {
1551   assert(!is_peered());
1552   assert(scrubber.callbacks.empty());
1553   assert(callbacks_for_degraded_object.empty());
1554
1555   // twiddle pg state
1556   state_clear(PG_STATE_DOWN);
1557
1558   send_notify = false;
1559
1560   if (is_primary()) {
1561     // only update primary last_epoch_started if we will go active
1562     if (acting.size() >= pool.info.min_size) {
1563       assert(cct->_conf->osd_find_best_info_ignore_history_les ||
1564              info.last_epoch_started <= activation_epoch);
1565       info.last_epoch_started = activation_epoch;
1566       info.last_interval_started = info.history.same_interval_since;
1567     }
1568   } else if (is_acting(pg_whoami)) {
1569     /* update last_epoch_started on acting replica to whatever the primary sent
1570      * unless it's smaller (could happen if we are going peered rather than
1571      * active, see doc/dev/osd_internals/last_epoch_started.rst) */
1572     if (info.last_epoch_started < activation_epoch) {
1573       info.last_epoch_started = activation_epoch;
1574       info.last_interval_started = info.history.same_interval_since;
1575     }
1576   }
1577
1578   auto &missing = pg_log.get_missing();
1579
1580   if (is_primary()) {
1581     last_update_ondisk = info.last_update;
1582     min_last_complete_ondisk = eversion_t(0,0);  // we don't know (yet)!
1583   }
1584   last_update_applied = info.last_update;
1585   last_rollback_info_trimmed_to_applied = pg_log.get_can_rollback_to();
1586
1587   need_up_thru = false;
1588
1589   // write pg info, log
1590   dirty_info = true;
1591   dirty_big_info = true; // maybe
1592
1593   // find out when we commit
1594   t.register_on_complete(
1595     new C_PG_ActivateCommitted(
1596       this,
1597       get_osdmap()->get_epoch(),
1598       activation_epoch));
1599   
1600   // initialize snap_trimq
1601   if (is_primary()) {
1602     dout(20) << "activate - purged_snaps " << info.purged_snaps
1603              << " cached_removed_snaps " << pool.cached_removed_snaps << dendl;
1604     snap_trimq = pool.cached_removed_snaps;
1605     interval_set<snapid_t> intersection;
1606     intersection.intersection_of(snap_trimq, info.purged_snaps);
1607     if (intersection == info.purged_snaps) {
1608       snap_trimq.subtract(info.purged_snaps);
1609     } else {
1610       dout(0) << "warning: info.purged_snaps (" << info.purged_snaps
1611               << ") is not a subset of pool.cached_removed_snaps ("
1612               << pool.cached_removed_snaps << ")" << dendl;
1613       snap_trimq.subtract(intersection);
1614     }
1615   }
1616
1617   // init complete pointer
1618   if (missing.num_missing() == 0) {
1619     dout(10) << "activate - no missing, moving last_complete " << info.last_complete 
1620              << " -> " << info.last_update << dendl;
1621     info.last_complete = info.last_update;
1622     pg_log.reset_recovery_pointers();
1623   } else {
1624     dout(10) << "activate - not complete, " << missing << dendl;
1625     pg_log.activate_not_complete(info);
1626   }
1627     
1628   log_weirdness();
1629
1630   // if primary..
1631   if (is_primary()) {
1632     assert(ctx);
1633     // start up replicas
1634
1635     assert(!actingbackfill.empty());
1636     for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1637          i != actingbackfill.end();
1638          ++i) {
1639       if (*i == pg_whoami) continue;
1640       pg_shard_t peer = *i;
1641       assert(peer_info.count(peer));
1642       pg_info_t& pi = peer_info[peer];
1643
1644       dout(10) << "activate peer osd." << peer << " " << pi << dendl;
1645
1646       MOSDPGLog *m = 0;
1647       assert(peer_missing.count(peer));
1648       pg_missing_t& pm = peer_missing[peer];
1649
1650       bool needs_past_intervals = pi.dne();
1651
1652       /*
1653        * cover case where peer sort order was different and
1654        * last_backfill cannot be interpreted
1655        */
1656       bool force_restart_backfill =
1657         !pi.last_backfill.is_max() &&
1658         !pi.last_backfill_bitwise;
1659
1660       if (pi.last_update == info.last_update && !force_restart_backfill) {
1661         // empty log
1662         if (!pi.last_backfill.is_max())
1663           osd->clog->info() << info.pgid << " continuing backfill to osd."
1664                             << peer
1665                             << " from (" << pi.log_tail << "," << pi.last_update
1666                             << "] " << pi.last_backfill
1667                             << " to " << info.last_update;
1668         if (!pi.is_empty() && activator_map) {
1669           dout(10) << "activate peer osd." << peer << " is up to date, queueing in pending_activators" << dendl;
1670           (*activator_map)[peer.osd].push_back(
1671             make_pair(
1672               pg_notify_t(
1673                 peer.shard, pg_whoami.shard,
1674                 get_osdmap()->get_epoch(),
1675                 get_osdmap()->get_epoch(),
1676                 info),
1677               past_intervals));
1678         } else {
1679           dout(10) << "activate peer osd." << peer << " is up to date, but sending pg_log anyway" << dendl;
1680           m = new MOSDPGLog(
1681             i->shard, pg_whoami.shard,
1682             get_osdmap()->get_epoch(), info);
1683         }
1684       } else if (
1685         pg_log.get_tail() > pi.last_update ||
1686         pi.last_backfill == hobject_t() ||
1687         force_restart_backfill ||
1688         (backfill_targets.count(*i) && pi.last_backfill.is_max())) {
1689         /* ^ This last case covers a situation where a replica is not contiguous
1690          * with the auth_log, but is contiguous with this replica.  Reshuffling
1691          * the active set to handle this would be tricky, so instead we just go
1692          * ahead and backfill it anyway.  This is probably preferable in any
1693          * case since the replica in question would have to be significantly
1694          * behind.
1695          */
1696         // backfill
1697         osd->clog->debug() << info.pgid << " starting backfill to osd." << peer
1698                            << " from (" << pi.log_tail << "," << pi.last_update
1699                            << "] " << pi.last_backfill
1700                            << " to " << info.last_update;
1701
1702         pi.last_update = info.last_update;
1703         pi.last_complete = info.last_update;
1704         pi.set_last_backfill(hobject_t());
1705         pi.last_epoch_started = info.last_epoch_started;
1706         pi.last_interval_started = info.last_interval_started;
1707         pi.history = info.history;
1708         pi.hit_set = info.hit_set;
1709         pi.stats.stats.clear();
1710
1711         // initialize peer with our purged_snaps.
1712         pi.purged_snaps = info.purged_snaps;
1713
1714         m = new MOSDPGLog(
1715           i->shard, pg_whoami.shard,
1716           get_osdmap()->get_epoch(), pi);
1717
1718         // send some recent log, so that op dup detection works well.
1719         m->log.copy_up_to(pg_log.get_log(), cct->_conf->osd_min_pg_log_entries);
1720         m->info.log_tail = m->log.tail;
1721         pi.log_tail = m->log.tail;  // sigh...
1722
1723         pm.clear();
1724       } else {
1725         // catch up
1726         assert(pg_log.get_tail() <= pi.last_update);
1727         m = new MOSDPGLog(
1728           i->shard, pg_whoami.shard,
1729           get_osdmap()->get_epoch(), info);
1730         // send new stuff to append to replicas log
1731         m->log.copy_after(pg_log.get_log(), pi.last_update);
1732       }
1733
1734       // share past_intervals if we are creating the pg on the replica
1735       // based on whether our info for that peer was dne() *before*
1736       // updating pi.history in the backfill block above.
1737       if (m && needs_past_intervals)
1738         m->past_intervals = past_intervals;
1739
1740       // update local version of peer's missing list!
1741       if (m && pi.last_backfill != hobject_t()) {
1742         for (list<pg_log_entry_t>::iterator p = m->log.log.begin();
1743              p != m->log.log.end();
1744              ++p) {
1745           if (p->soid <= pi.last_backfill &&
1746               !p->is_error()) {
1747             if (perform_deletes_during_peering() && p->is_delete()) {
1748               pm.rm(p->soid, p->version);
1749             } else {
1750               pm.add_next_event(*p);
1751             }
1752           }
1753         }
1754       }
1755
1756       if (m) {
1757         dout(10) << "activate peer osd." << peer << " sending " << m->log << dendl;
1758         //m->log.print(cout);
1759         osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
1760       }
1761
1762       // peer now has our last_update
1763       pi.last_update = info.last_update;
1764
1765       // update our missing
1766       if (pm.num_missing() == 0) {
1767         pi.last_complete = pi.last_update;
1768         dout(10) << "activate peer osd." << peer << " " << pi << " uptodate" << dendl;
1769       } else {
1770         dout(10) << "activate peer osd." << peer << " " << pi << " missing " << pm << dendl;
1771       }
1772     }
1773
1774     // Set up missing_loc
1775     set<pg_shard_t> complete_shards;
1776     for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1777          i != actingbackfill.end();
1778          ++i) {
1779       dout(20) << __func__ << " setting up missing_loc from shard " << *i << " " << dendl;
1780       if (*i == get_primary()) {
1781         missing_loc.add_active_missing(missing);
1782         if (!missing.have_missing())
1783           complete_shards.insert(*i);
1784       } else {
1785         auto peer_missing_entry = peer_missing.find(*i);
1786         assert(peer_missing_entry != peer_missing.end());
1787         missing_loc.add_active_missing(peer_missing_entry->second);
1788         if (!peer_missing_entry->second.have_missing() &&
1789             peer_info[*i].last_backfill.is_max())
1790           complete_shards.insert(*i);
1791       }
1792     }
1793     // If necessary, create might_have_unfound to help us find our unfound objects.
1794     // NOTE: It's important that we build might_have_unfound before trimming the
1795     // past intervals.
1796     might_have_unfound.clear();
1797     if (needs_recovery()) {
1798       // If only one shard has missing objects, we add all the other shards as
1799       // recovery sources. This is considered safe since the PGLogs have been
1800       // merged locally, and it covers the vast majority of use cases, such as
1801       // one OSD/host being down for a while for hardware repair.
1802       if (complete_shards.size() + 1 == actingbackfill.size()) {
1803         missing_loc.add_batch_sources_info(complete_shards, ctx->handle);
1804       } else {
1805         missing_loc.add_source_info(pg_whoami, info, pg_log.get_missing(),
1806                                     ctx->handle);
1807         for (set<pg_shard_t>::iterator i = actingbackfill.begin();
1808              i != actingbackfill.end();
1809              ++i) {
1810           if (*i == pg_whoami) continue;
1811           dout(10) << __func__ << ": adding " << *i << " as a source" << dendl;
1812           assert(peer_missing.count(*i));
1813           assert(peer_info.count(*i));
1814           missing_loc.add_source_info(
1815             *i,
1816             peer_info[*i],
1817             peer_missing[*i],
1818             ctx->handle);
1819         }
1820       }
1821       for (map<pg_shard_t, pg_missing_t>::iterator i = peer_missing.begin();
1822            i != peer_missing.end();
1823            ++i) {
1824         if (is_actingbackfill(i->first))
1825           continue;
1826         assert(peer_info.count(i->first));
1827         search_for_missing(
1828           peer_info[i->first],
1829           i->second,
1830           i->first,
1831           ctx);
1832       }
1833
1834       build_might_have_unfound();
1835
1836       state_set(PG_STATE_DEGRADED);
1837       if (have_unfound())
1838         discover_all_missing(query_map);
1839     }
1840
1841     // degraded?
1842     if (get_osdmap()->get_pg_size(info.pgid.pgid) > actingset.size()) {
1843       state_set(PG_STATE_DEGRADED);
1844       state_set(PG_STATE_UNDERSIZED);
1845     }
1846
1847     state_set(PG_STATE_ACTIVATING);
1848     release_pg_backoffs();
1849     projected_last_update = info.last_update;
1850   }
1851   if (acting.size() >= pool.info.min_size) {
1852     PGLogEntryHandler handler{this, &t};
1853     pg_log.roll_forward(&handler);
1854   }
1855 }
1856
1857 bool PG::op_has_sufficient_caps(OpRequestRef& op)
1858 {
1859   // only check MOSDOp
1860   if (op->get_req()->get_type() != CEPH_MSG_OSD_OP)
1861     return true;
1862
1863   const MOSDOp *req = static_cast<const MOSDOp*>(op->get_req());
1864
1865   Session *session = static_cast<Session*>(req->get_connection()->get_priv());
1866   if (!session) {
1867     dout(0) << "op_has_sufficient_caps: no session for op " << *req << dendl;
1868     return false;
1869   }
1870   OSDCap& caps = session->caps;
1871   session->put();
1872
1873   const string &key = req->get_hobj().get_key().empty() ?
1874     req->get_oid().name :
1875     req->get_hobj().get_key();
1876
1877   bool cap = caps.is_capable(pool.name, req->get_hobj().nspace,
1878                              pool.auid, key,
1879                              op->need_read_cap(),
1880                              op->need_write_cap(),
1881                              op->classes());
1882
1883   dout(20) << "op_has_sufficient_caps "
1884            << "session=" << session
1885            << " pool=" << pool.id << " (" << pool.name
1886            << " " << req->get_hobj().nspace
1887            << ") owner=" << pool.auid
1888            << " need_read_cap=" << op->need_read_cap()
1889            << " need_write_cap=" << op->need_write_cap()
1890            << " classes=" << op->classes()
1891            << " -> " << (cap ? "yes" : "NO")
1892            << dendl;
1893   return cap;
1894 }
1895
1896 void PG::_activate_committed(epoch_t epoch, epoch_t activation_epoch)
1897 {
1898   lock();
1899   if (pg_has_reset_since(epoch)) {
1900     dout(10) << "_activate_committed " << epoch
1901              << ", that was an old interval" << dendl;
1902   } else if (is_primary()) {
1903     peer_activated.insert(pg_whoami);
1904     dout(10) << "_activate_committed " << epoch
1905              << " peer_activated now " << peer_activated 
1906              << " last_interval_started " << info.history.last_interval_started
1907              << " last_epoch_started " << info.history.last_epoch_started
1908              << " same_interval_since " << info.history.same_interval_since << dendl;
1909     assert(!actingbackfill.empty());
1910     if (peer_activated.size() == actingbackfill.size())
1911       all_activated_and_committed();
1912   } else {
1913     dout(10) << "_activate_committed " << epoch << " telling primary" << dendl;
1914     MOSDPGInfo *m = new MOSDPGInfo(epoch);
1915     pg_notify_t i = pg_notify_t(
1916       get_primary().shard, pg_whoami.shard,
1917       get_osdmap()->get_epoch(),
1918       get_osdmap()->get_epoch(),
1919       info);
1920
1921     i.info.history.last_epoch_started = activation_epoch;
1922     i.info.history.last_interval_started = i.info.history.same_interval_since;
1923     if (acting.size() >= pool.info.min_size) {
1924       state_set(PG_STATE_ACTIVE);
1925     } else {
1926       state_set(PG_STATE_PEERED);
1927     }
1928
1929     m->pg_list.push_back(make_pair(i, PastIntervals()));
1930     osd->send_message_osd_cluster(get_primary().osd, m, get_osdmap()->get_epoch());
1931
1932     // waiters
1933     if (flushes_in_progress == 0) {
1934       requeue_ops(waiting_for_peered);
1935     }
1936   }
1937
1938   assert(!dirty_info);
1939
1940   unlock();
1941 }
1942
1943 /*
1944  * update info.history.last_epoch_started ONLY after we and all
1945  * replicas have activated AND committed the activate transaction
1946  * (i.e. the peering results are stable on disk).
1947  */
1948 void PG::all_activated_and_committed()
1949 {
1950   dout(10) << "all_activated_and_committed" << dendl;
1951   assert(is_primary());
1952   assert(peer_activated.size() == actingbackfill.size());
1953   assert(!actingbackfill.empty());
1954   assert(blocked_by.empty());
1955
1956   queue_peering_event(
1957     CephPeeringEvtRef(
1958       std::make_shared<CephPeeringEvt>(
1959         get_osdmap()->get_epoch(),
1960         get_osdmap()->get_epoch(),
1961         AllReplicasActivated())));
1962 }
1963
1964 bool PG::requeue_scrub(bool high_priority)
1965 {
1966   assert(is_locked());
1967   if (scrub_queued) {
1968     dout(10) << __func__ << ": already queued" << dendl;
1969     return false;
1970   } else {
1971     dout(10) << __func__ << ": queueing" << dendl;
1972     scrub_queued = true;
1973     osd->queue_for_scrub(this, high_priority);
1974     return true;
1975   }
1976 }
1977
1978 void PG::queue_recovery()
1979 {
1980   if (!is_primary() || !is_peered()) {
1981     dout(10) << "queue_recovery -- not primary or not peered " << dendl;
1982     assert(!recovery_queued);
1983   } else if (recovery_queued) {
1984     dout(10) << "queue_recovery -- already queued" << dendl;
1985   } else {
1986     dout(10) << "queue_recovery -- queuing" << dendl;
1987     recovery_queued = true;
1988     osd->queue_for_recovery(this);
1989   }
1990 }
1991
1992 bool PG::queue_scrub()
1993 {
1994   assert(is_locked());
1995   if (is_scrubbing()) {
1996     return false;
1997   }
1998   scrubber.priority = scrubber.must_scrub ?
1999          cct->_conf->osd_requested_scrub_priority : get_scrub_priority();
2000   scrubber.must_scrub = false;
2001   state_set(PG_STATE_SCRUBBING);
2002   if (scrubber.must_deep_scrub) {
2003     state_set(PG_STATE_DEEP_SCRUB);
2004     scrubber.must_deep_scrub = false;
2005   }
2006   if (scrubber.must_repair || scrubber.auto_repair) {
2007     state_set(PG_STATE_REPAIR);
2008     scrubber.must_repair = false;
2009   }
2010   requeue_scrub();
2011   return true;
2012 }
2013
2014 unsigned PG::get_scrub_priority()
2015 {
2016   // a higher value -> a higher priority
2017   int pool_scrub_priority = 0;
2018   pool.info.opts.get(pool_opts_t::SCRUB_PRIORITY, &pool_scrub_priority);
2019   return pool_scrub_priority > 0 ? pool_scrub_priority : cct->_conf->osd_scrub_priority;
2020 }
2021
2022 struct C_PG_FinishRecovery : public Context {
2023   PGRef pg;
2024   explicit C_PG_FinishRecovery(PG *p) : pg(p) {}
2025   void finish(int r) override {
2026     pg->_finish_recovery(this);
2027   }
2028 };
2029
2030 void PG::mark_clean()
2031 {
2032   if (actingset.size() == get_osdmap()->get_pg_size(info.pgid.pgid)) {
2033     state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
2034     state_set(PG_STATE_CLEAN);
2035     info.history.last_epoch_clean = get_osdmap()->get_epoch();
2036     info.history.last_interval_clean = info.history.same_interval_since;
2037     past_intervals.clear();
2038     dirty_big_info = true;
2039     dirty_info = true;
2040   }
2041
2042   kick_snap_trim();
2043 }
2044
2045 void PG::_change_recovery_force_mode(int new_mode, bool clear)
2046 {
2047   if (!deleting) {
2048     // we can't and shouldn't do anything if the PG is being deleted locally
2049     if (clear) {
2050       state_clear(new_mode);
2051     } else {
2052       state_set(new_mode);
2053     }
2054     publish_stats_to_osd();
2055   }
2056 }
2057
2058 inline int PG::clamp_recovery_priority(int priority)
2059 {
2060   static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range");
2061   static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type");
2062
2063   // Clamp to valid range
2064   if (priority > OSD_RECOVERY_PRIORITY_MAX) {
2065     return OSD_RECOVERY_PRIORITY_MAX;
2066   } else if (priority < OSD_RECOVERY_PRIORITY_MIN) {
2067     return OSD_RECOVERY_PRIORITY_MIN;
2068   } else {
2069     return priority;
2070   }
2071 }
2072
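// Recovery priority is the generic OSD_RECOVERY_PRIORITY_BASE plus any
// per-pool RECOVERY_PRIORITY override, clamped into
// [OSD_RECOVERY_PRIORITY_MIN, OSD_RECOVERY_PRIORITY_MAX]; e.g. a pool
// override of 5 yields clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + 5).
// Forced recovery skips the clamp and uses OSD_RECOVERY_PRIORITY_FORCED.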
2073 unsigned PG::get_recovery_priority()
2074 {
2075   // a higher value -> a higher priority
2076   int ret = 0;
2077
2078   if (state & PG_STATE_FORCED_RECOVERY) {
2079     ret = OSD_RECOVERY_PRIORITY_FORCED;
2080   } else {
2081     pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &ret);
2082     ret = clamp_recovery_priority(OSD_RECOVERY_PRIORITY_BASE + ret);
2083   }
2084   dout(20) << __func__ << " recovery priority for " << *this << " is " << ret << ", state is " << state << dendl;
2085   return static_cast<unsigned>(ret);
2086 }
2087
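// Backfill priority uses the same clamping scheme.  For example (values are
// illustrative): a 3-replica pool with min_size 2 and only two shards in the
// acting set is still active but undersized, so this returns
// clamp_recovery_priority(OSD_BACKFILL_DEGRADED_PRIORITY_BASE + 1 + any
// per-pool RECOVERY_PRIORITY override).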
2088 unsigned PG::get_backfill_priority()
2089 {
2090   // a higher value -> a higher priority
2091   int ret = OSD_BACKFILL_PRIORITY_BASE;
2092   if (state & PG_STATE_FORCED_BACKFILL) {
2093     ret = OSD_RECOVERY_PRIORITY_FORCED;
2094   } else {
2095     if (acting.size() < pool.info.min_size) {
2096       // inactive: no. of replicas < min_size, highest priority since it blocks IO
2097       ret = OSD_BACKFILL_INACTIVE_PRIORITY_BASE + (pool.info.min_size - acting.size());
2098
2099     } else if (is_undersized()) {
2100       // undersized: OSD_BACKFILL_DEGRADED_PRIORITY_BASE + num missing replicas
2101       assert(pool.info.size > actingset.size());
2102       ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE + (pool.info.size - actingset.size());
2103
2104     } else if (is_degraded()) {
2105       // degraded: baseline degraded
2106       ret = OSD_BACKFILL_DEGRADED_PRIORITY_BASE;
2107     }
2108
2109     // Adjust with pool's recovery priority
2110     int pool_recovery_priority = 0;
2111     pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority);
2112
2113     ret = clamp_recovery_priority(pool_recovery_priority + ret);
2114   }
2115
2116   return static_cast<unsigned>(ret);
2117 }
2118
2119 void PG::finish_recovery(list<Context*>& tfin)
2120 {
2121   dout(10) << "finish_recovery" << dendl;
2122   assert(info.last_complete == info.last_update);
2123
2124   clear_recovery_state();
2125
2126   /*
2127    * sync all this before purging strays.  but don't block!
2128    */
2129   finish_sync_event = new C_PG_FinishRecovery(this);
2130   tfin.push_back(finish_sync_event);
2131 }
2132
2133 void PG::_finish_recovery(Context *c)
2134 {
2135   lock();
2136   if (deleting) {
2137     unlock();
2138     return;
2139   }
2140   if (c == finish_sync_event) {
2141     dout(10) << "_finish_recovery" << dendl;
2142     finish_sync_event = 0;
2143     purge_strays();
2144
2145     publish_stats_to_osd();
2146
2147     if (scrub_after_recovery) {
2148       dout(10) << "_finish_recovery requeueing for scrub" << dendl;
2149       scrub_after_recovery = false;
2150       scrubber.must_deep_scrub = true;
2151       queue_scrub();
2152     }
2153   } else {
2154     dout(10) << "_finish_recovery -- stale" << dendl;
2155   }
2156   unlock();
2157 }
2158
2159 void PG::start_recovery_op(const hobject_t& soid)
2160 {
2161   dout(10) << "start_recovery_op " << soid
2162 #ifdef DEBUG_RECOVERY_OIDS
2163            << " (" << recovering_oids << ")"
2164 #endif
2165            << dendl;
2166   assert(recovery_ops_active >= 0);
2167   recovery_ops_active++;
2168 #ifdef DEBUG_RECOVERY_OIDS
2169   assert(recovering_oids.count(soid) == 0);
2170   recovering_oids.insert(soid);
2171 #endif
2172   osd->start_recovery_op(this, soid);
2173 }
2174
2175 void PG::finish_recovery_op(const hobject_t& soid, bool dequeue)
2176 {
2177   dout(10) << "finish_recovery_op " << soid
2178 #ifdef DEBUG_RECOVERY_OIDS
2179            << " (" << recovering_oids << ")" 
2180 #endif
2181            << dendl;
2182   assert(recovery_ops_active > 0);
2183   recovery_ops_active--;
2184 #ifdef DEBUG_RECOVERY_OIDS
2185   assert(recovering_oids.count(soid));
2186   recovering_oids.erase(soid);
2187 #endif
2188   osd->finish_recovery_op(this, soid, dequeue);
2189
2190   if (!dequeue) {
2191     queue_recovery();
2192   }
2193 }
2194
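// Split this PG into a child: the log is divided between the two, head/tail
// pointers are recomputed from the split logs, stats are copied but marked
// stats_invalid, and backfill is restarted on both halves unless the parent
// had already completed it (last_backfill == MAX).  The child's mapping is
// recomputed from the current OSDMap before the pg impl specific hook
// _split_into() runs.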
2195 void PG::split_into(pg_t child_pgid, PG *child, unsigned split_bits)
2196 {
2197   child->update_snap_mapper_bits(split_bits);
2198   child->update_osdmap_ref(get_osdmap());
2199
2200   child->pool = pool;
2201
2202   // Log
2203   pg_log.split_into(child_pgid, split_bits, &(child->pg_log));
2204   child->info.last_complete = info.last_complete;
2205
2206   info.last_update = pg_log.get_head();
2207   child->info.last_update = child->pg_log.get_head();
2208
2209   child->info.last_user_version = info.last_user_version;
2210
2211   info.log_tail = pg_log.get_tail();
2212   child->info.log_tail = child->pg_log.get_tail();
2213
2214   if (info.last_complete < pg_log.get_tail())
2215     info.last_complete = pg_log.get_tail();
2216   if (child->info.last_complete < child->pg_log.get_tail())
2217     child->info.last_complete = child->pg_log.get_tail();
2218
2219   // Info
2220   child->info.history = info.history;
2221   child->info.history.epoch_created = get_osdmap()->get_epoch();
2222   child->info.purged_snaps = info.purged_snaps;
2223
2224   if (info.last_backfill.is_max()) {
2225     child->info.set_last_backfill(hobject_t::get_max());
2226   } else {
2227     // restart backfill on parent and child to be safe.  we could
2228     // probably do better in the bitwise sort case, but it's more
2229     // fragile (there may be special work to do on backfill completion
2230     // in the future).
2231     info.set_last_backfill(hobject_t());
2232     child->info.set_last_backfill(hobject_t());
2233     // restarting backfill implies that the missing set is empty,
2234     // since it is only used for objects prior to last_backfill
2235     pg_log.reset_backfill();
2236     child->pg_log.reset_backfill();
2237   }
2238
2239   child->info.stats = info.stats;
2240   child->info.stats.parent_split_bits = split_bits;
2241   info.stats.stats_invalid = true;
2242   child->info.stats.stats_invalid = true;
2243   child->info.last_epoch_started = info.last_epoch_started;
2244   child->info.last_interval_started = info.last_interval_started;
2245
2246   child->snap_trimq = snap_trimq;
2247
2248   // There can't be recovery/backfill going on now
2249   int primary, up_primary;
2250   vector<int> newup, newacting;
2251   get_osdmap()->pg_to_up_acting_osds(
2252     child->info.pgid.pgid, &newup, &up_primary, &newacting, &primary);
2253   child->init_primary_up_acting(
2254     newup,
2255     newacting,
2256     up_primary,
2257     primary);
2258   child->role = OSDMap::calc_pg_role(osd->whoami, child->acting);
2259
2260   // this comparison includes primary rank via pg_shard_t
2261   if (get_primary() != child->get_primary())
2262     child->info.history.same_primary_since = get_osdmap()->get_epoch();
2263
2264   child->info.stats.up = up;
2265   child->info.stats.up_primary = up_primary;
2266   child->info.stats.acting = acting;
2267   child->info.stats.acting_primary = primary;
2268   child->info.stats.mapping_epoch = get_osdmap()->get_epoch();
2269
2270   // History
2271   child->past_intervals = past_intervals;
2272
2273   _split_into(child_pgid, child, split_bits);
2274
2275   // release all backoffs for simplicity
2276   release_backoffs(hobject_t(), hobject_t::get_max());
2277
2278   child->on_new_interval();
2279
2280   child->dirty_info = true;
2281   child->dirty_big_info = true;
2282   dirty_info = true;
2283   dirty_big_info = true;
2284 }
2285
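// Backoff lifecycle: add_backoff() attaches a Backoff for [begin,end) to the
// client session and sends CEPH_OSD_BACKOFF_OP_BLOCK; release_backoffs()
// detaches matching Backoffs and, while the session is still connected,
// sends CEPH_OSD_BACKOFF_OP_UNBLOCK so the client can resend any blocked
// requests; clear_backoffs() drops everything without notifying clients.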
2286 void PG::add_backoff(SessionRef s, const hobject_t& begin, const hobject_t& end)
2287 {
2288   ConnectionRef con = s->con;
2289   if (!con)   // OSD::ms_handle_reset clears s->con without a lock
2290     return;
2291   BackoffRef b(s->have_backoff(info.pgid, begin));
2292   if (b) {
2293     derr << __func__ << " already have backoff for " << s << " begin " << begin
2294          << " " << *b << dendl;
2295     ceph_abort();
2296   }
2297   Mutex::Locker l(backoff_lock);
2298   {
2299     b = new Backoff(info.pgid, this, s, ++s->backoff_seq, begin, end);
2300     backoffs[begin].insert(b);
2301     s->add_backoff(b);
2302     dout(10) << __func__ << " session " << s << " added " << *b << dendl;
2303   }
2304   con->send_message(
2305     new MOSDBackoff(
2306       info.pgid,
2307       get_osdmap()->get_epoch(),
2308       CEPH_OSD_BACKOFF_OP_BLOCK,
2309       b->id,
2310       begin,
2311       end));
2312 }
2313
2314 void PG::release_backoffs(const hobject_t& begin, const hobject_t& end)
2315 {
2316   dout(10) << __func__ << " [" << begin << "," << end << ")" << dendl;
2317   vector<BackoffRef> bv;
2318   {
2319     Mutex::Locker l(backoff_lock);
2320     auto p = backoffs.lower_bound(begin);
2321     while (p != backoffs.end()) {
2322       int r = cmp(p->first, end);
2323       dout(20) << __func__ << " ? " << r << " " << p->first
2324                << " " << p->second << dendl;
2325       // note: must still examine begin=end=p->first case
2326       if (r > 0 || (r == 0 && begin < end)) {
2327         break;
2328       }
2329       dout(20) << __func__ << " checking " << p->first
2330                << " " << p->second << dendl;
2331       auto q = p->second.begin();
2332       while (q != p->second.end()) {
2333         dout(20) << __func__ << " checking  " << *q << dendl;
2334         int r = cmp((*q)->begin, begin);
2335         if (r == 0 || (r > 0 && (*q)->end < end)) {
2336           bv.push_back(*q);
2337           q = p->second.erase(q);
2338         } else {
2339           ++q;
2340         }
2341       }
2342       if (p->second.empty()) {
2343         p = backoffs.erase(p);
2344       } else {
2345         ++p;
2346       }
2347     }
2348   }
2349   for (auto b : bv) {
2350     Mutex::Locker l(b->lock);
2351     dout(10) << __func__ << " " << *b << dendl;
2352     if (b->session) {
2353       assert(b->pg == this);
2354       ConnectionRef con = b->session->con;
2355       if (con) {   // OSD::ms_handle_reset clears s->con without a lock
2356         con->send_message(
2357           new MOSDBackoff(
2358             info.pgid,
2359             get_osdmap()->get_epoch(),
2360             CEPH_OSD_BACKOFF_OP_UNBLOCK,
2361             b->id,
2362             b->begin,
2363             b->end));
2364       }
2365       if (b->is_new()) {
2366         b->state = Backoff::STATE_DELETING;
2367       } else {
2368         b->session->rm_backoff(b);
2369         b->session.reset();
2370       }
2371       b->pg.reset();
2372     }
2373   }
2374 }
2375
2376 void PG::clear_backoffs()
2377 {
2378   dout(10) << __func__ << " " << dendl;
2379   map<hobject_t,set<BackoffRef>> ls;
2380   {
2381     Mutex::Locker l(backoff_lock);
2382     ls.swap(backoffs);
2383   }
2384   for (auto& p : ls) {
2385     for (auto& b : p.second) {
2386       Mutex::Locker l(b->lock);
2387       dout(10) << __func__ << " " << *b << dendl;
2388       if (b->session) {
2389         assert(b->pg == this);
2390         if (b->is_new()) {
2391           b->state = Backoff::STATE_DELETING;
2392         } else {
2393           b->session->rm_backoff(b);
2394           b->session.reset();
2395         }
2396         b->pg.reset();
2397       }
2398     }
2399   }
2400 }
2401
2402 // called by Session::clear_backoffs()
2403 void PG::rm_backoff(BackoffRef b)
2404 {
2405   dout(10) << __func__ << " " << *b << dendl;
2406   Mutex::Locker l(backoff_lock);
2407   assert(b->lock.is_locked_by_me());
2408   assert(b->pg == this);
2409   auto p = backoffs.find(b->begin);
2410   // may race with release_backoffs()
2411   if (p != backoffs.end()) {
2412     auto q = p->second.find(b);
2413     if (q != p->second.end()) {
2414       p->second.erase(q);
2415       if (p->second.empty()) {
2416         backoffs.erase(p);
2417       }
2418     }
2419   }
2420 }
2421
2422 void PG::clear_recovery_state() 
2423 {
2424   dout(10) << "clear_recovery_state" << dendl;
2425
2426   pg_log.reset_recovery_pointers();
2427   finish_sync_event = 0;
2428
2429   hobject_t soid;
2430   while (recovery_ops_active > 0) {
2431 #ifdef DEBUG_RECOVERY_OIDS
2432     soid = *recovering_oids.begin();
2433 #endif
2434     finish_recovery_op(soid, true);
2435   }
2436
2437   backfill_targets.clear();
2438   backfill_info.clear();
2439   peer_backfill_info.clear();
2440   waiting_on_backfill.clear();
2441   _clear_recovery_state();  // pg impl specific hook
2442 }
2443
2444 void PG::cancel_recovery()
2445 {
2446   dout(10) << "cancel_recovery" << dendl;
2447   clear_recovery_state();
2448 }
2449
2450
2451 void PG::purge_strays()
2452 {
2453   dout(10) << "purge_strays " << stray_set << dendl;
2454   
2455   bool removed = false;
2456   for (set<pg_shard_t>::iterator p = stray_set.begin();
2457        p != stray_set.end();
2458        ++p) {
2459     assert(!is_actingbackfill(*p));
2460     if (get_osdmap()->is_up(p->osd)) {
2461       dout(10) << "sending PGRemove to osd." << *p << dendl;
2462       vector<spg_t> to_remove;
2463       to_remove.push_back(spg_t(info.pgid.pgid, p->shard));
2464       MOSDPGRemove *m = new MOSDPGRemove(
2465         get_osdmap()->get_epoch(),
2466         to_remove);
2467       osd->send_message_osd_cluster(p->osd, m, get_osdmap()->get_epoch());
2468     } else {
2469       dout(10) << "not sending PGRemove to down osd." << *p << dendl;
2470     }
2471     peer_missing.erase(*p);
2472     peer_info.erase(*p);
2473     peer_purged.insert(*p);
2474     removed = true;
2475   }
2476
2477   // if we removed anyone, update heartbeat peers (they are derived in part from peer_info)
2478   if (removed)
2479     update_heartbeat_peers();
2480
2481   stray_set.clear();
2482
2483   // clear _requested maps; we may have to peer() again if we discover
2484   // (more) stray content
2485   peer_log_requested.clear();
2486   peer_missing_requested.clear();
2487 }
2488
2489 void PG::set_probe_targets(const set<pg_shard_t> &probe_set)
2490 {
2491   Mutex::Locker l(heartbeat_peer_lock);
2492   probe_targets.clear();
2493   for (set<pg_shard_t>::iterator i = probe_set.begin();
2494        i != probe_set.end();
2495        ++i) {
2496     probe_targets.insert(i->osd);
2497   }
2498 }
2499
2500 void PG::clear_probe_targets()
2501 {
2502   Mutex::Locker l(heartbeat_peer_lock);
2503   probe_targets.clear();
2504 }
2505
2506 void PG::update_heartbeat_peers()
2507 {
2508   assert(is_locked());
2509
2510   if (!is_primary())
2511     return;
2512
2513   set<int> new_peers;
2514   for (unsigned i=0; i<acting.size(); i++) {
2515     if (acting[i] != CRUSH_ITEM_NONE)
2516       new_peers.insert(acting[i]);
2517   }
2518   for (unsigned i=0; i<up.size(); i++) {
2519     if (up[i] != CRUSH_ITEM_NONE)
2520       new_peers.insert(up[i]);
2521   }
2522   for (map<pg_shard_t,pg_info_t>::iterator p = peer_info.begin();
2523     p != peer_info.end();
2524     ++p)
2525     new_peers.insert(p->first.osd);
2526
2527   bool need_update = false;
2528   heartbeat_peer_lock.Lock();
2529   if (new_peers == heartbeat_peers) {
2530     dout(10) << "update_heartbeat_peers " << heartbeat_peers << " unchanged" << dendl;
2531   } else {
2532     dout(10) << "update_heartbeat_peers " << heartbeat_peers << " -> " << new_peers << dendl;
2533     heartbeat_peers.swap(new_peers);
2534     need_update = true;
2535   }
2536   heartbeat_peer_lock.Unlock();
2537
2538   if (need_update)
2539     osd->need_heartbeat_peer_update();
2540 }
2541
2542
2543 bool PG::check_in_progress_op(
2544   const osd_reqid_t &r,
2545   eversion_t *version,
2546   version_t *user_version,
2547   int *return_code) const
2548 {
2549   return (
2550     projected_log.get_request(r, version, user_version, return_code) ||
2551     pg_log.get_log().get_request(r, version, user_version, return_code));
2552 }
2553
2554 void PG::_update_calc_stats()
2555 {
2556   info.stats.version = info.last_update;
2557   info.stats.created = info.history.epoch_created;
2558   info.stats.last_scrub = info.history.last_scrub;
2559   info.stats.last_scrub_stamp = info.history.last_scrub_stamp;
2560   info.stats.last_deep_scrub = info.history.last_deep_scrub;
2561   info.stats.last_deep_scrub_stamp = info.history.last_deep_scrub_stamp;
2562   info.stats.last_clean_scrub_stamp = info.history.last_clean_scrub_stamp;
2563   info.stats.last_epoch_clean = info.history.last_epoch_clean;
2564
2565   info.stats.log_size = pg_log.get_head().version - pg_log.get_tail().version;
2566   info.stats.ondisk_log_size = info.stats.log_size;
2567   info.stats.log_start = pg_log.get_tail();
2568   info.stats.ondisk_log_start = pg_log.get_tail();
2569
2570   // If actingset is larger than upset we will have misplaced objects,
2571   // so we will report based on actingset size.
2572
2573   // If upset is larger we will have degraded objects,
2574   // so we will report based on upset size.
2575
2576   // If target is the largest of them all, it will contribute to
2577   // the degraded count because num_object_copies is
2578   // computed using target and eventually used to get the degraded total.
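  //
  // Illustrative example (numbers are hypothetical): a 3-replica pool with
  // 100 objects, actingset = {0,1}, upset = {0,2}, and backfill target osd.2
  // holding 40 objects so far:
  //   num_object_copies = 100 * max(3, 2)                        = 300
  //   object_copies     = 100 (osd.0) + 100 (osd.1) - 0 missing
  //                       + 40 backfilled                        = 240
  //   misplaced         = max(0, 100 (on osd.1, not in up) - 40) = 60
  //   degraded          = max(0, 300 - 240)                      = 60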
2579
2580   unsigned target = get_osdmap()->get_pg_size(info.pgid.pgid);
2581   unsigned nrep = MAX(actingset.size(), upset.size());
2582   // calc num_object_copies
2583   info.stats.stats.calc_copies(MAX(target, nrep));
2584   info.stats.stats.sum.num_objects_degraded = 0;
2585   info.stats.stats.sum.num_objects_unfound = 0;
2586   info.stats.stats.sum.num_objects_misplaced = 0;
2587   if ((is_degraded() || is_undersized() || !is_clean()) && is_peered()) {
2588     // NOTE: we only generate copies, degraded, misplaced and unfound
2589     // values for the summation, not individual stat categories.
2590     int64_t num_objects = info.stats.stats.sum.num_objects;
2591
2592     // Total sum of all missing
2593     int64_t missing = 0;
2594     // Objects that have already been backfilled to up OSDs (not in acting)
2595     int64_t backfilled = 0;
2596     // A misplaced object is not stored on the correct OSD
2597     int64_t misplaced = 0;
2598     // Total of object copies/shards found
2599     int64_t object_copies = 0;
2600
2601     // num_objects_missing on each peer
2602     for (map<pg_shard_t, pg_info_t>::iterator pi =
2603         peer_info.begin();
2604         pi != peer_info.end();
2605         ++pi) {
2606       map<pg_shard_t, pg_missing_t>::const_iterator pm =
2607         peer_missing.find(pi->first);
2608       if (pm != peer_missing.end()) {
2609         pi->second.stats.stats.sum.num_objects_missing =
2610           pm->second.num_missing();
2611       }
2612     }
2613
2614     assert(!actingbackfill.empty());
2615     for (set<pg_shard_t>::iterator i = actingbackfill.begin();
2616          i != actingbackfill.end();
2617          ++i) {
2618       const pg_shard_t &p = *i;
2619
2620       bool in_up = (upset.find(p) != upset.end());
2621       bool in_acting = (actingset.find(p) != actingset.end());
2622       assert(in_up || in_acting);
2623
2624       // in acting                  Compute total objects excluding num_missing
2625       // in acting and not in up    Compute misplaced objects excluding num_missing
2626       // in up and not in acting    Compute total objects already backfilled
2627       if (in_acting) {
2628         unsigned osd_missing;
2629         // primary handling
2630         if (p == pg_whoami) {
2631           osd_missing = pg_log.get_missing().num_missing();
2632           info.stats.stats.sum.num_objects_missing_on_primary =
2633               osd_missing;
2634           object_copies += num_objects; // My local (primary) count
2635         } else {
2636           assert(peer_missing.count(p));
2637           osd_missing = peer_missing[p].num_missing();
2638           object_copies += peer_info[p].stats.stats.sum.num_objects;
2639         }
2640         missing += osd_missing;
2641         // Count non-missing objects not in up as misplaced
2642         if (!in_up && num_objects > osd_missing)
2643           misplaced += num_objects - osd_missing;
2644       } else {
2645         assert(in_up && !in_acting);
2646
2647         // If this peer has more objects than it should, ignore them
2648         backfilled += MIN(num_objects, peer_info[p].stats.stats.sum.num_objects);
2649       }
2650     }
2651
2652     // Any objects that have been backfilled to up OSDs can be deducted from misplaced
2653     misplaced = MAX(0, misplaced - backfilled);
2654
2655     // Deduct computed total missing on acting nodes
2656     object_copies -= missing;
2657     // Include computed backfilled objects on up nodes
2658     object_copies += backfilled;
2659     // a degraded object has fewer replicas or EC shards than the
2660     // pool specifies.  num_object_copies will never be smaller than target * num_objects.
2661     int64_t degraded = MAX(0, info.stats.stats.sum.num_object_copies - object_copies);
2662
2663     info.stats.stats.sum.num_objects_degraded = degraded;
2664     info.stats.stats.sum.num_objects_unfound = get_num_unfound();
2665     info.stats.stats.sum.num_objects_misplaced = misplaced;
2666   }
2667 }
2668
2669 void PG::_update_blocked_by()
2670 {
2671   // set a max on the number of blocking peers we report. if we go
2672   // over, report a random subset.  keep the result sorted.
2673   unsigned keep = MIN(blocked_by.size(), cct->_conf->osd_max_pg_blocked_by);
2674   unsigned skip = blocked_by.size() - keep;
2675   info.stats.blocked_by.clear();
2676   info.stats.blocked_by.resize(keep);
2677   unsigned pos = 0;
2678   for (set<int>::iterator p = blocked_by.begin();
2679        p != blocked_by.end() && keep > 0;
2680        ++p) {
2681     if (skip > 0 && (rand() % (skip + keep) < skip)) {
2682       --skip;
2683     } else {
2684       info.stats.blocked_by[pos++] = *p;
2685       --keep;
2686     }
2687   }
2688 }
2689
2690 void PG::publish_stats_to_osd()
2691 {
2692   if (!is_primary())
2693     return;
2694
2695   pg_stats_publish_lock.Lock();
2696
2697   if (info.stats.stats.sum.num_scrub_errors)
2698     state_set(PG_STATE_INCONSISTENT);
2699   else
2700     state_clear(PG_STATE_INCONSISTENT);
2701
2702   utime_t now = ceph_clock_now();
2703   if (info.stats.state != state) {
2704     info.stats.last_change = now;
2705     // Optimistic estimation: if we just found out the PG is inactive,
2706     // assume it was active until now.
2707     if (!(state & PG_STATE_ACTIVE) &&
2708         (info.stats.state & PG_STATE_ACTIVE))
2709       info.stats.last_active = now;
2710
2711     if ((state & PG_STATE_ACTIVE) &&
2712         !(info.stats.state & PG_STATE_ACTIVE))
2713       info.stats.last_became_active = now;
2714     if ((state & (PG_STATE_ACTIVE|PG_STATE_PEERED)) &&
2715         !(info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED)))
2716       info.stats.last_became_peered = now;
2717     if (!(state & PG_STATE_CREATING) &&
2718         (info.stats.state & PG_STATE_CREATING)) {
2719       osd->send_pg_created(get_pgid().pgid);
2720     }
2721     info.stats.state = state;
2722   }
2723
2724   _update_calc_stats();
2725   _update_blocked_by();
2726
2727   bool publish = false;
2728   pg_stat_t pre_publish = info.stats;
2729   pre_publish.stats.add(unstable_stats);
2730   utime_t cutoff = now;
2731   cutoff -= cct->_conf->osd_pg_stat_report_interval_max;
2732   if (pg_stats_publish_valid && pre_publish == pg_stats_publish &&
2733       info.stats.last_fresh > cutoff) {
2734     dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2735              << ": no change since " << info.stats.last_fresh << dendl;
2736   } else {
2737     // update our stat summary and timestamps
2738     info.stats.reported_epoch = get_osdmap()->get_epoch();
2739     ++info.stats.reported_seq;
2740
2741     info.stats.last_fresh = now;
2742
2743     if (info.stats.state & PG_STATE_CLEAN)
2744       info.stats.last_clean = now;
2745     if (info.stats.state & PG_STATE_ACTIVE)
2746       info.stats.last_active = now;
2747     if (info.stats.state & (PG_STATE_ACTIVE|PG_STATE_PEERED))
2748       info.stats.last_peered = now;
2749     info.stats.last_unstale = now;
2750     if ((info.stats.state & PG_STATE_DEGRADED) == 0)
2751       info.stats.last_undegraded = now;
2752     if ((info.stats.state & PG_STATE_UNDERSIZED) == 0)
2753       info.stats.last_fullsized = now;
2754
2755     // do not send pgstat to mon anymore once we are luminous, since mgr takes
2756     // care of this by sending MMonMgrReport to mon.
2757     publish =
2758       osd->osd->get_osdmap()->require_osd_release < CEPH_RELEASE_LUMINOUS;
2759     pg_stats_publish_valid = true;
2760     pg_stats_publish = pre_publish;
2761
2762     dout(15) << "publish_stats_to_osd " << pg_stats_publish.reported_epoch
2763              << ":" << pg_stats_publish.reported_seq << dendl;
2764   }
2765   pg_stats_publish_lock.Unlock();
2766
2767   if (publish)
2768     osd->pg_stat_queue_enqueue(this);
2769 }
2770
2771 void PG::clear_publish_stats()
2772 {
2773   dout(15) << "clear_stats" << dendl;
2774   pg_stats_publish_lock.Lock();
2775   pg_stats_publish_valid = false;
2776   pg_stats_publish_lock.Unlock();
2777
2778   osd->pg_stat_queue_dequeue(this);
2779 }
2780
2781 /**
2782  * initialize a newly instantiated pg
2783  *
2784  * Initialize PG state, as when a PG is initially created, or when it
2785  * is first instantiated on the current node.
2786  *
2787  * @param role our role/rank
2788  * @param newup up set
2789  * @param newacting acting set
2790  * @param history pg history
2791  * @param pi past_intervals
2792  * @param backfill true if info should be marked as backfill
2793  * @param t transaction to write out our new state in
2794  */
2795 void PG::init(
2796   int role,
2797   const vector<int>& newup, int new_up_primary,
2798   const vector<int>& newacting, int new_acting_primary,
2799   const pg_history_t& history,
2800   const PastIntervals& pi,
2801   bool backfill,
2802   ObjectStore::Transaction *t)
2803 {
2804   dout(10) << "init role " << role << " up " << newup << " acting " << newacting
2805            << " history " << history
2806            << " past_intervals " << pi
2807            << dendl;
2808
2809   set_role(role);
2810   acting = newacting;
2811   up = newup;
2812   init_primary_up_acting(
2813     newup,
2814     newacting,
2815     new_up_primary,
2816     new_acting_primary);
2817
2818   info.history = history;
2819   past_intervals = pi;
2820
2821   info.stats.up = up;
2822   info.stats.up_primary = new_up_primary;
2823   info.stats.acting = acting;
2824   info.stats.acting_primary = new_acting_primary;
2825   info.stats.mapping_epoch = info.history.same_interval_since;
2826
2827   if (backfill) {
2828     dout(10) << __func__ << ": Setting backfill" << dendl;
2829     info.set_last_backfill(hobject_t());
2830     info.last_complete = info.last_update;
2831     pg_log.mark_log_for_rewrite();
2832   }
2833
2834   on_new_interval();
2835
2836   dirty_info = true;
2837   dirty_big_info = true;
2838   write_if_dirty(*t);
2839 }
2840
2841 #pragma GCC diagnostic ignored "-Wpragmas"
2842 #pragma GCC diagnostic push
2843 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
2844
2845 void PG::upgrade(ObjectStore *store)
2846 {
2847   assert(info_struct_v <= 10);
2848   ObjectStore::Transaction t;
2849
2850   assert(info_struct_v >= 7);
2851
2852   // 7 -> 8
2853   if (info_struct_v <= 7) {
2854     pg_log.mark_log_for_rewrite();
2855     ghobject_t log_oid(OSD::make_pg_log_oid(pg_id));
2856     ghobject_t biginfo_oid(OSD::make_pg_biginfo_oid(pg_id));
2857     t.remove(coll_t::meta(), log_oid);
2858     t.remove(coll_t::meta(), biginfo_oid);
2859     t.touch(coll, pgmeta_oid);
2860   }
2861
2862   // 8 -> 9
2863   if (info_struct_v <= 8) {
2864     // no special action needed.
2865   }
2866
2867   // 9 -> 10
2868   if (info_struct_v <= 9) {
2869     // previous versions weren't (as) aggressively clearing past_intervals
2870     if (info.history.last_epoch_clean >= info.history.same_interval_since) {
2871       dout(20) << __func__ << " clearing past_intervals" << dendl;
2872       past_intervals.clear();
2873     }
2874   }
2875
2876   // update infover_key
2877   if (info_struct_v < cur_struct_v) {
2878     map<string,bufferlist> v;
2879     __u8 ver = cur_struct_v;
2880     ::encode(ver, v[infover_key]);
2881     t.omap_setkeys(coll, pgmeta_oid, v);
2882   }
2883
2884   dirty_info = true;
2885   dirty_big_info = true;
2886   write_if_dirty(t);
2887
2888   ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
2889                                       ObjectStore::Sequencer>("upgrade"));
2890   int r = store->apply_transaction(osr.get(), std::move(t));
2891   if (r != 0) {
2892     derr << __func__ << ": apply_transaction returned "
2893          << cpp_strerror(r) << dendl;
2894     ceph_abort();
2895   }
2896   assert(r == 0);
2897
2898   C_SaferCond waiter;
2899   if (!osr->flush_commit(&waiter)) {
2900     waiter.wait();
2901   }
2902 }
2903
2904 #pragma GCC diagnostic pop
2905 #pragma GCC diagnostic warning "-Wpragmas"
2906
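// Encode PG metadata as omap updates for the pgmeta object: the epoch under
// epoch_key when it changed, past_intervals + purged_snaps under biginfo_key
// when dirty_big_info, and the full pg_info_t under info_key.  If only the
// "fast" fields changed (and last_update advanced), a compact pg_fast_info_t
// is written under fastinfo_key instead of re-encoding the whole info.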
2907 int PG::_prepare_write_info(CephContext* cct,
2908                             map<string,bufferlist> *km,
2909                             epoch_t epoch,
2910                             pg_info_t &info, pg_info_t &last_written_info,
2911                             PastIntervals &past_intervals,
2912                             bool dirty_big_info,
2913                             bool dirty_epoch,
2914                             bool try_fast_info,
2915                             PerfCounters *logger)
2916 {
2917   if (dirty_epoch) {
2918     ::encode(epoch, (*km)[epoch_key]);
2919   }
2920
2921   if (logger)
2922     logger->inc(l_osd_pg_info);
2923
2924   // try to do info efficiently?
2925   if (!dirty_big_info && try_fast_info &&
2926       info.last_update > last_written_info.last_update) {
2927     pg_fast_info_t fast;
2928     fast.populate_from(info);
2929     bool did = fast.try_apply_to(&last_written_info);
2930     assert(did);  // we verified last_update increased above
2931     if (info == last_written_info) {
2932       ::encode(fast, (*km)[fastinfo_key]);
2933       if (logger)
2934         logger->inc(l_osd_pg_fastinfo);
2935       return 0;
2936     }
2937     generic_dout(30) << __func__ << " fastinfo failed, info:\n";
2938     {
2939       JSONFormatter jf(true);
2940       jf.dump_object("info", info);
2941       jf.flush(*_dout);
2942     }
2943     {
2944       *_dout << "\nlast_written_info:\n";
2945       JSONFormatter jf(true);
2946       jf.dump_object("last_written_info", last_written_info);
2947       jf.flush(*_dout);
2948     }
2949     *_dout << dendl;
2950   }
2951   last_written_info = info;
2952
2953   // info.  store purged_snaps separately.
2954   interval_set<snapid_t> purged_snaps;
2955   purged_snaps.swap(info.purged_snaps);
2956   ::encode(info, (*km)[info_key]);
2957   purged_snaps.swap(info.purged_snaps);
2958
2959   if (dirty_big_info) {
2960     // potentially big stuff
2961     bufferlist& bigbl = (*km)[biginfo_key];
2962     ::encode(past_intervals, bigbl);
2963     ::encode(info.purged_snaps, bigbl);
2964     //dout(20) << "write_info bigbl " << bigbl.length() << dendl;
2965     if (logger)
2966       logger->inc(l_osd_pg_biginfo);
2967   }
2968
2969   return 0;
2970 }
2971
2972 void PG::_create(ObjectStore::Transaction& t, spg_t pgid, int bits)
2973 {
2974   coll_t coll(pgid);
2975   t.create_collection(coll, bits);
2976 }
2977
2978 void PG::_init(ObjectStore::Transaction& t, spg_t pgid, const pg_pool_t *pool)
2979 {
2980   coll_t coll(pgid);
2981
2982   if (pool) {
2983     // Give a hint to the PG collection
2984     bufferlist hint;
2985     uint32_t pg_num = pool->get_pg_num();
2986     uint64_t expected_num_objects_pg = pool->expected_num_objects / pg_num;
2987     ::encode(pg_num, hint);
2988     ::encode(expected_num_objects_pg, hint);
2989     uint32_t hint_type = ObjectStore::Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS;
2990     t.collection_hint(coll, hint_type, hint);
2991   }
2992
2993   ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
2994   t.touch(coll, pgmeta_oid);
2995   map<string,bufferlist> values;
2996   __u8 struct_v = cur_struct_v;
2997   ::encode(struct_v, values[infover_key]);
2998   t.omap_setkeys(coll, pgmeta_oid, values);
2999 }
3000
3001 void PG::prepare_write_info(map<string,bufferlist> *km)
3002 {
3003   info.stats.stats.add(unstable_stats);
3004   unstable_stats.clear();
3005
3006   bool need_update_epoch = last_epoch < get_osdmap()->get_epoch();
3007   int ret = _prepare_write_info(cct, km, get_osdmap()->get_epoch(),
3008                                 info,
3009                                 last_written_info,
3010                                 past_intervals,
3011                                 dirty_big_info, need_update_epoch,
3012                                 cct->_conf->osd_fast_info,
3013                                 osd->logger);
3014   assert(ret == 0);
3015   if (need_update_epoch)
3016     last_epoch = get_osdmap()->get_epoch();
3017   last_persisted_osdmap_ref = osdmap_ref;
3018
3019   dirty_info = false;
3020   dirty_big_info = false;
3021 }
3022
3023 #pragma GCC diagnostic ignored "-Wpragmas"
3024 #pragma GCC diagnostic push
3025 #pragma GCC diagnostic ignored "-Wdeprecated-declarations"
3026
3027 bool PG::_has_removal_flag(ObjectStore *store,
3028                            spg_t pgid)
3029 {
3030   coll_t coll(pgid);
3031   ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3032
3033   // first try new way
3034   set<string> keys;
3035   keys.insert("_remove");
3036   map<string,bufferlist> values;
3037   if (store->omap_get_values(coll, pgmeta_oid, keys, &values) == 0 &&
3038       values.size() == 1)
3039     return true;
3040
3041   return false;
3042 }
3043
3044 int PG::peek_map_epoch(ObjectStore *store,
3045                        spg_t pgid,
3046                        epoch_t *pepoch,
3047                        bufferlist *bl)
3048 {
3049   coll_t coll(pgid);
3050   ghobject_t legacy_infos_oid(OSD::make_infos_oid());
3051   ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3052   epoch_t cur_epoch = 0;
3053
3054   assert(bl);
3055   {
3056     // validate collection name
3057     assert(coll.is_pg());
3058   }
3059
3060   // try for v8
3061   set<string> keys;
3062   keys.insert(infover_key);
3063   keys.insert(epoch_key);
3064   map<string,bufferlist> values;
3065   int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3066   if (r == 0) {
3067     assert(values.size() == 2);
3068
3069     // sanity check version
3070     bufferlist::iterator bp = values[infover_key].begin();
3071     __u8 struct_v = 0;
3072     ::decode(struct_v, bp);
3073     assert(struct_v >= 8);
3074
3075     // get epoch
3076     bp = values[epoch_key].begin();
3077     ::decode(cur_epoch, bp);
3078   } else {
3079     // probably bug 10617; see OSD::load_pgs()
3080     return -1;
3081   }
3082
3083   *pepoch = cur_epoch;
3084   return 0;
3085 }
3086
3087 #pragma GCC diagnostic pop
3088 #pragma GCC diagnostic warning "-Wpragmas"
3089
3090 void PG::write_if_dirty(ObjectStore::Transaction& t)
3091 {
3092   map<string,bufferlist> km;
3093   if (dirty_big_info || dirty_info)
3094     prepare_write_info(&km);
3095   pg_log.write_log_and_missing(t, &km, coll, pgmeta_oid, pool.info.require_rollback());
3096   if (!km.empty())
3097     t.omap_setkeys(coll, pgmeta_oid, km);
3098 }
3099
3100 void PG::trim_log()
3101 {
3102   assert(is_primary());
3103   calc_trim_to();
3104   dout(10) << __func__ << " to " << pg_trim_to << dendl;
3105   if (pg_trim_to != eversion_t()) {
3106     // inform peers to trim log
3107     assert(!actingbackfill.empty());
3108     for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3109          i != actingbackfill.end();
3110          ++i) {
3111       if (*i == pg_whoami) continue;
3112       osd->send_message_osd_cluster(
3113         i->osd,
3114         new MOSDPGTrim(
3115           get_osdmap()->get_epoch(),
3116           spg_t(info.pgid.pgid, i->shard),
3117           pg_trim_to),
3118         get_osdmap()->get_epoch());
3119     }
3120
3121     // trim primary as well
3122     pg_log.trim(pg_trim_to, info);
3123     dirty_info = true;
3124   }
3125 }
3126
3127 void PG::add_log_entry(const pg_log_entry_t& e, bool applied)
3128 {
3129   // raise last_complete only if we were previously up to date
3130   if (info.last_complete == info.last_update)
3131     info.last_complete = e.version;
3132   
3133   // raise last_update.
3134   assert(e.version > info.last_update);
3135   info.last_update = e.version;
3136
3137   // raise user_version, if it increased (it may not have been bumped
3138   // by all logged updates)
3139   if (e.user_version > info.last_user_version)
3140     info.last_user_version = e.user_version;
3141
3142   // log mutation
3143   pg_log.add(e, applied);
3144   dout(10) << "add_log_entry " << e << dendl;
3145 }
3146
3147
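/* Append freshly replicated log entries to the local log.  The history's
 * last_epoch_started/last_interval_started are synced first, callers that do
 * not apply the transaction (backfill peers) roll the log forward
 * immediately, rollback artifacts past last_backfill are rolled forward as
 * well, and finally the log is trimmed to trim_to and persisted via
 * write_if_dirty().
 */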
3148 void PG::append_log(
3149   const vector<pg_log_entry_t>& logv,
3150   eversion_t trim_to,
3151   eversion_t roll_forward_to,
3152   ObjectStore::Transaction &t,
3153   bool transaction_applied)
3154 {
3155   if (transaction_applied)
3156     update_snap_map(logv, t);
3157
3158   /* The primary has sent an info updating the history, but it may not
3159    * have arrived yet.  We want to make sure that we cannot remember this
3160    * write without remembering that it happened in an interval which went
3161    * active in epoch history.last_epoch_started.
3162    */
3163   if (info.last_epoch_started != info.history.last_epoch_started) {
3164     info.history.last_epoch_started = info.last_epoch_started;
3165   }
3166   if (info.last_interval_started != info.history.last_interval_started) {
3167     info.history.last_interval_started = info.last_interval_started;
3168   }
3169   dout(10) << "append_log " << pg_log.get_log() << " " << logv << dendl;
3170
3171   PGLogEntryHandler handler{this, &t};
3172   if (!transaction_applied) {
3173      /* We must be a backfill peer, so it's ok if we apply
3174       * out-of-turn since we won't be considered when
3175       * determining a min possible last_update.
3176       */
3177     pg_log.roll_forward(&handler);
3178   }
3179
3180   for (vector<pg_log_entry_t>::const_iterator p = logv.begin();
3181        p != logv.end();
3182        ++p) {
3183     add_log_entry(*p, transaction_applied);
3184
3185     /* We don't want to leave the rollforward artifacts around
3186      * here past last_backfill.  It's ok for the same reason as
3187      * above */
3188     if (transaction_applied &&
3189         p->soid > info.last_backfill) {
3190       pg_log.roll_forward(&handler);
3191     }
3192   }
3193   auto last = logv.rbegin();
3194   if (is_primary() && last != logv.rend()) {
3195     projected_log.skip_can_rollback_to_to_head();
3196     projected_log.trim(cct, last->version, nullptr, nullptr, nullptr);
3197   }
3198
3199   if (transaction_applied && roll_forward_to > pg_log.get_can_rollback_to()) {
3200     pg_log.roll_forward_to(
3201       roll_forward_to,
3202       &handler);
3203     t.register_on_applied(
3204       new C_UpdateLastRollbackInfoTrimmedToApplied(
3205         this,
3206         get_osdmap()->get_epoch(),
3207         roll_forward_to));
3208   }
3209
3210   pg_log.trim(trim_to, info);
3211
3212   // update the local pg, pg log
3213   dirty_info = true;
3214   write_if_dirty(t);
3215 }
3216
3217 bool PG::check_log_for_corruption(ObjectStore *store)
3218 {
3219   /// TODO: this method needs to work with the omap log
3220   return true;
3221 }
3222
3223 //! Get the name we're going to save our corrupt pg log as
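     //! e.g. (illustrative) "corrupt_log_2017-09-30_14:25_1.2a" for pg 1.2a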
3224 std::string PG::get_corrupt_pg_log_name() const
3225 {
3226   const int MAX_BUF = 512;
3227   char buf[MAX_BUF];
3228   struct tm tm_buf;
3229   time_t my_time(time(NULL));
3230   const struct tm *t = localtime_r(&my_time, &tm_buf);
3231   int ret = strftime(buf, sizeof(buf), "corrupt_log_%Y-%m-%d_%k:%M_", t);
3232   if (ret == 0) {
3233     dout(0) << "strftime failed" << dendl;
3234     return "corrupt_log_unknown_time";
3235   }
3236   string out(buf);
3237   out += stringify(info.pgid);
3238   return out;
3239 }
3240
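     /*
      * Read pg_info_t and PastIntervals for a pg.  For struct_v >= 8 they are
      * stored in the pgmeta object's omap (infover_key, info_key, biginfo_key,
      * fastinfo_key); for the legacy v7 format they are read from the per-OSD
      * infos object instead.
      */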
3241 int PG::read_info(
3242   ObjectStore *store, spg_t pgid, const coll_t &coll, bufferlist &bl,
3243   pg_info_t &info, PastIntervals &past_intervals,
3244   __u8 &struct_v)
3245 {
3246   // try for v8 or later
3247   set<string> keys;
3248   keys.insert(infover_key);
3249   keys.insert(info_key);
3250   keys.insert(biginfo_key);
3251   keys.insert(fastinfo_key);
3252   ghobject_t pgmeta_oid(pgid.make_pgmeta_oid());
3253   map<string,bufferlist> values;
3254   int r = store->omap_get_values(coll, pgmeta_oid, keys, &values);
3255   if (r == 0) {
3256     assert(values.size() == 3 ||
3257            values.size() == 4);
3258
3259     bufferlist::iterator p = values[infover_key].begin();
3260     ::decode(struct_v, p);
3261     assert(struct_v >= 8);
3262
3263     p = values[info_key].begin();
3264     ::decode(info, p);
3265
3266     p = values[biginfo_key].begin();
3267     if (struct_v >= 10) {
3268       ::decode(past_intervals, p);
3269     } else {
3270       past_intervals.decode_classic(p);
3271     }
3272     ::decode(info.purged_snaps, p);
3273
3274     p = values[fastinfo_key].begin();
3275     if (!p.end()) {
3276       pg_fast_info_t fast;
3277       ::decode(fast, p);
3278       fast.try_apply_to(&info);
3279     }
3280     return 0;
3281   }
3282
3283   // legacy (ver < 8)
3284   ghobject_t infos_oid(OSD::make_infos_oid());
3285   bufferlist::iterator p = bl.begin();
3286   ::decode(struct_v, p);
3287   assert(struct_v == 7);
3288
3289   // get info out of leveldb
3290   string k = get_info_key(info.pgid);
3291   string bk = get_biginfo_key(info.pgid);
3292   keys.clear();
3293   keys.insert(k);
3294   keys.insert(bk);
3295   values.clear();
3296   store->omap_get_values(coll_t::meta(), ghobject_t(infos_oid), keys, &values);
3297   assert(values.size() == 2);
3298
3299   p = values[k].begin();
3300   ::decode(info, p);
3301
3302   p = values[bk].begin();
3303   ::decode(past_intervals, p);
3304   interval_set<snapid_t> snap_collections;  // obsolete
3305   ::decode(snap_collections, p);
3306   ::decode(info.purged_snaps, p);
3307   return 0;
3308 }
3309
3310 void PG::read_state(ObjectStore *store, bufferlist &bl)
3311 {
3312   int r = read_info(store, pg_id, coll, bl, info, past_intervals,
3313                     info_struct_v);
3314   assert(r >= 0);
3315
3316   last_written_info = info;
3317
3318   // if we are upgrading from jewel, we need to force rebuild of
3319   // missing set.  v9 was fastinfo, added v11.0.2-331-g1d5dc29a13
3320   // (before kraken).  persisted missing set was circa
3321   // v11.0.0-866-gb0e239da95 (a bit earlier, also before kraken).
3322   // v8 was pre-jewel (per-pg meta object).
3323   bool force_rebuild_missing = info_struct_v < 9;
3324   if (force_rebuild_missing) {
3325     dout(10) << __func__ << " detected upgrade from jewel, force_rebuild_missing"
3326              << dendl;
3327   }
3328
3329   ostringstream oss;
3330   pg_log.read_log_and_missing(
3331     store,
3332     coll,
3333     info_struct_v < 8 ? coll_t::meta() : coll,
3334     ghobject_t(info_struct_v < 8 ? OSD::make_pg_log_oid(pg_id) : pgmeta_oid),
3335     info,
3336     force_rebuild_missing,
3337     oss,
3338     cct->_conf->osd_ignore_stale_divergent_priors,
3339     cct->_conf->osd_debug_verify_missing_on_start);
3340   if (oss.tellp())
3341     osd->clog->error() << oss.rdbuf();
3342
3343   if (force_rebuild_missing) {
3344     dout(10) << __func__ << " forced rebuild of missing got "
3345              << pg_log.get_missing()
3346              << dendl;
3347   }
3348
3349   // log any weirdness
3350   log_weirdness();
3351 }
3352
3353 void PG::log_weirdness()
3354 {
3355   if (pg_log.get_tail() != info.log_tail)
3356     osd->clog->error() << info.pgid
3357                        << " info mismatch, log.tail " << pg_log.get_tail()
3358                        << " != info.log_tail " << info.log_tail;
3359   if (pg_log.get_head() != info.last_update)
3360     osd->clog->error() << info.pgid
3361                        << " info mismatch, log.head " << pg_log.get_head()
3362                        << " != info.last_update " << info.last_update;
3363
3364   if (!pg_log.get_log().empty()) {
3365     // sloppy check
3366     if (pg_log.get_log().log.begin()->version <= pg_log.get_tail())
3367       osd->clog->error() << info.pgid
3368                          << " log bound mismatch, info (tail,head] ("
3369                          << pg_log.get_tail() << "," << pg_log.get_head() << "]"
3370                          << " actual ["
3371                          << pg_log.get_log().log.begin()->version << ","
3372                          << pg_log.get_log().log.rbegin()->version << "]";
3373   }
3374
3375   if (pg_log.get_log().caller_ops.size() > pg_log.get_log().log.size()) {
3376     osd->clog->error() << info.pgid
3377                        << " caller_ops.size " << pg_log.get_log().caller_ops.size()
3378                        << " > log size " << pg_log.get_log().log.size();
3379   }
3380 }
3381
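     /*
      * Keep the SnapMapper in sync with a batch of log entries: clones
      * (snap < CEPH_MAXSNAP) get their mapping removed on delete, added on
      * clone/promote, and updated on modify; head and snapdir objects are
      * skipped.
      */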
3382 void PG::update_snap_map(
3383   const vector<pg_log_entry_t> &log_entries,
3384   ObjectStore::Transaction &t)
3385 {
3386   for (vector<pg_log_entry_t>::const_iterator i = log_entries.begin();
3387        i != log_entries.end();
3388        ++i) {
3389     OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
3390     if (i->soid.snap < CEPH_MAXSNAP) {
3391       if (i->is_delete()) {
3392         int r = snap_mapper.remove_oid(
3393           i->soid,
3394           &_t);
3395         assert(r == 0);
3396       } else if (i->is_update()) {
3397         assert(i->snaps.length() > 0);
3398         vector<snapid_t> snaps;
3399         bufferlist snapbl = i->snaps;
3400         bufferlist::iterator p = snapbl.begin();
3401         try {
3402           ::decode(snaps, p);
3403         } catch (...) {
3404           snaps.clear();
3405         }
3406         set<snapid_t> _snaps(snaps.begin(), snaps.end());
3407
3408         if (i->is_clone() || i->is_promote()) {
3409           snap_mapper.add_oid(
3410             i->soid,
3411             _snaps,
3412             &_t);
3413         } else if (i->is_modify()) {
3414           assert(i->is_modify());
3415           int r = snap_mapper.update_snaps(
3416             i->soid,
3417             _snaps,
3418             0,
3419             &_t);
3420           assert(r == 0);
3421         } else {
3422           assert(i->is_clean());
3423         }
3424       }
3425     }
3426   }
3427 }
3428
3429 /**
3430  * filter trimming|trimmed snaps out of snapcontext
3431  */
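     // e.g. (hypothetical values): snaps = [8,6,4,2] with 6 and 2 in
     // snap_trimq|purged_snaps filters down to snaps = [8,4], preserving the
     // order of the surviving snaps.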
3432 void PG::filter_snapc(vector<snapid_t> &snaps)
3433 {
3434   // nothing needs trimming, so we can return immediately
3435   if (snap_trimq.empty() && info.purged_snaps.empty())
3436     return;
3437
3438   bool filtering = false;
3439   vector<snapid_t> newsnaps;
3440   for (vector<snapid_t>::iterator p = snaps.begin();
3441        p != snaps.end();
3442        ++p) {
3443     if (snap_trimq.contains(*p) || info.purged_snaps.contains(*p)) {
3444       if (!filtering) {
3445         // start building a new vector with what we've seen so far
3446         dout(10) << "filter_snapc filtering " << snaps << dendl;
3447         newsnaps.insert(newsnaps.begin(), snaps.begin(), p);
3448         filtering = true;
3449       }
3450       dout(20) << "filter_snapc  removing trimq|purged snap " << *p << dendl;
3451     } else {
3452       if (filtering)
3453         newsnaps.push_back(*p);  // continue building new vector
3454     }
3455   }
3456   if (filtering) {
3457     snaps.swap(newsnaps);
3458     dout(10) << "filter_snapc  result " << snaps << dendl;
3459   }
3460 }
3461
3462 void PG::requeue_object_waiters(map<hobject_t, list<OpRequestRef>>& m)
3463 {
3464   for (map<hobject_t, list<OpRequestRef>>::iterator it = m.begin();
3465        it != m.end();
3466        ++it)
3467     requeue_ops(it->second);
3468   m.clear();
3469 }
3470
3471 void PG::requeue_op(OpRequestRef op)
3472 {
3473   auto p = waiting_for_map.find(op->get_source());
3474   if (p != waiting_for_map.end()) {
3475     dout(20) << __func__ << " " << op << " (waiting_for_map " << p->first << ")"
3476              << dendl;
3477     p->second.push_front(op);
3478   } else {
3479     dout(20) << __func__ << " " << op << dendl;
3480     osd->enqueue_front(info.pgid, PGQueueable(op, get_osdmap()->get_epoch()));
3481   }
3482 }
3483
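     /*
      * Requeue a list of ops.  Iterating in reverse while pushing each op to
      * the front (of waiting_for_map or of the op queue) keeps the ops in
      * their original relative order at the head of the queue.
      */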
3484 void PG::requeue_ops(list<OpRequestRef> &ls)
3485 {
3486   for (list<OpRequestRef>::reverse_iterator i = ls.rbegin();
3487        i != ls.rend();
3488        ++i) {
3489     auto p = waiting_for_map.find((*i)->get_source());
3490     if (p != waiting_for_map.end()) {
3491       dout(20) << __func__ << " " << *i << " (waiting_for_map " << p->first
3492                << ")" << dendl;
3493       p->second.push_front(*i);
3494     } else {
3495       dout(20) << __func__ << " " << *i << dendl;
3496       osd->enqueue_front(info.pgid, PGQueueable(*i, get_osdmap()->get_epoch()));
3497     }
3498   }
3499   ls.clear();
3500 }
3501
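     /*
      * Requeue ops that were waiting for a newer OSDMap and can now proceed;
      * sources whose front op still needs a future epoch are left in
      * waiting_for_map.
      */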
3502 void PG::requeue_map_waiters()
3503 {
3504   epoch_t epoch = get_osdmap()->get_epoch();
3505   auto p = waiting_for_map.begin();
3506   while (p != waiting_for_map.end()) {
3507     if (epoch < p->second.front()->min_epoch) {
3508       dout(20) << __func__ << " " << p->first << " front op "
3509                << p->second.front() << " must still wait, doing nothing"
3510                << dendl;
3511       ++p;
3512     } else {
3513       dout(20) << __func__ << " " << p->first << " " << p->second << dendl;
3514       for (auto q = p->second.rbegin(); q != p->second.rend(); ++q) {
3515         osd->enqueue_front(info.pgid, PGQueueable(*q, epoch));
3516       }
3517       p = waiting_for_map.erase(p);
3518     }
3519   }
3520 }
3521
3522
3523 // ==========================================================================================
3524 // SCRUB
3525
3526 /*
3527  * when holding pg and sched_scrub_lock, then the states are:
3528  *   scheduling:
3529  *     scrubber.reserved = true
3530  *     scrubber.reserved_peers includes whoami
3531  *     osd->scrub_pending++
3532  *   scheduling, replica declined:
3533  *     scrubber.reserved = true
3534  *     scrubber.reserved_peers includes -1
3535  *     osd->scrub_pending++
3536  *   pending:
3537  *     scrubber.reserved = true
3538  *     scrubber.reserved_peers.size() == acting.size();
3539  *     pg on scrub_wq
3540  *     osd->scrub_pending++
3541  *   scrubbing:
3542  *     scrubber.reserved = false;
3543  *     scrubber.reserved_peers empty
3544  *     osd->scrubber.active++
3545  */
3546
3547 // returns false if the scrub could not be scheduled (e.g. reservation failed), true otherwise
3548 bool PG::sched_scrub()
3549 {
3550   bool nodeep_scrub = false;
3551   assert(is_locked());
3552   if (!(is_primary() && is_active() && is_clean() && !is_scrubbing())) {
3553     return false;
3554   }
3555
3556   double deep_scrub_interval = 0;
3557   pool.info.opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
3558   if (deep_scrub_interval <= 0) {
3559     deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
3560   }
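       // Deep-scrub if the last deep scrub is older than the configured
       // interval (pool option, falling back to osd_deep_scrub_interval), or
       // on a random coin flip for scrubs that were not requested by the user.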
3561   bool time_for_deep = ceph_clock_now() >=
3562     info.history.last_deep_scrub_stamp + deep_scrub_interval;
3563
3564   bool deep_coin_flip = false;
3565   // Only add random deep scrubs when NOT a user-initiated scrub
3566   if (!scrubber.must_scrub)
3567     deep_coin_flip = (rand() % 100) < cct->_conf->osd_deep_scrub_randomize_ratio * 100;
3568   dout(20) << __func__ << ": time_for_deep=" << time_for_deep << " deep_coin_flip=" << deep_coin_flip << dendl;
3569
3570   time_for_deep = (time_for_deep || deep_coin_flip);
3571
3572   // NODEEP_SCRUB set, so ignore time-initiated deep-scrub
3573   if (osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NODEEP_SCRUB) ||
3574       pool.info.has_flag(pg_pool_t::FLAG_NODEEP_SCRUB)) {
3575     time_for_deep = false;
3576     nodeep_scrub = true;
3577   }
3578
3579   if (!scrubber.must_scrub) {
3580     assert(!scrubber.must_deep_scrub);
3581
3582     // NOSCRUB set, so skip regular scrubs
3583     if ((osd->osd->get_osdmap()->test_flag(CEPH_OSDMAP_NOSCRUB) ||
3584          pool.info.has_flag(pg_pool_t::FLAG_NOSCRUB)) && !time_for_deep) {
3585       if (scrubber.reserved) {
3586         // cancel the scrub while it is still being scheduled, so pgs
3587         // from other pools where scrubs are still legal have a chance
3588         // to go ahead with scrubbing.
3589         clear_scrub_reserved();
3590         scrub_unreserve_replicas();
3591       }
3592       return false;
3593     }
3594   }
3595
3596   if (cct->_conf->osd_scrub_auto_repair
3597       && get_pgbackend()->auto_repair_supported()
3598       && time_for_deep
3599       // respect the command from the user, and do not auto-repair
3600       && !scrubber.must_repair
3601       && !scrubber.must_scrub
3602       && !scrubber.must_deep_scrub) {
3603     dout(20) << __func__ << ": auto repair with deep scrubbing" << dendl;
3604     scrubber.auto_repair = true;
3605   } else {
3606     // this happens when the user issues a scrub/repair command while
3607     // the scrub/repair is still being scheduled (e.g. requesting reservations)
3608     scrubber.auto_repair = false;
3609   }
3610
3611   bool ret = true;
3612   if (!scrubber.reserved) {
3613     assert(scrubber.reserved_peers.empty());
3614     if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3615          osd->inc_scrubs_pending()) {
3616       dout(20) << __func__ << ": reserved locally, reserving replicas" << dendl;
3617       scrubber.reserved = true;
3618       scrubber.reserved_peers.insert(pg_whoami);
3619       scrub_reserve_replicas();
3620     } else {
3621       dout(20) << __func__ << ": failed to reserve locally" << dendl;
3622       ret = false;
3623     }
3624   }
3625   if (scrubber.reserved) {
3626     if (scrubber.reserve_failed) {
3627       dout(20) << "sched_scrub: failed, a peer declined" << dendl;
3628       clear_scrub_reserved();
3629       scrub_unreserve_replicas();
3630       ret = false;
3631     } else if (scrubber.reserved_peers.size() == acting.size()) {
3632       dout(20) << "sched_scrub: success, reserved self and replicas" << dendl;
3633       if (time_for_deep) {
3634         dout(10) << "sched_scrub: scrub will be deep" << dendl;
3635         state_set(PG_STATE_DEEP_SCRUB);
3636       } else if (!scrubber.must_deep_scrub && info.stats.stats.sum.num_deep_scrub_errors) {
3637         if (!nodeep_scrub) {
3638           osd->clog->info() << "osd." << osd->whoami
3639                             << " pg " << info.pgid
3640                             << " Deep scrub errors, upgrading scrub to deep-scrub";
3641           state_set(PG_STATE_DEEP_SCRUB);
3642         } else if (!scrubber.must_scrub) {
3643           osd->clog->error() << "osd." << osd->whoami
3644                              << " pg " << info.pgid
3645                              << " Regular scrub skipped due to deep-scrub errors and nodeep-scrub set";
3646           clear_scrub_reserved();
3647           scrub_unreserve_replicas();
3648           return false;
3649         } else {
3650           osd->clog->error() << "osd." << osd->whoami
3651                              << " pg " << info.pgid
3652                              << " Regular scrub request, deep-scrub details will be lost";
3653         }
3654       }
3655       queue_scrub();
3656     } else {
3657       // none declined so far (reserve_failed is not set); keep waiting for replicas
3658       dout(20) << "sched_scrub: reserved " << scrubber.reserved_peers << ", waiting for replicas" << dendl;
3659     }
3660   }
3661
3662   return ret;
3663 }
3664
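     /*
      * Primary only: register this pg with the OSD's scrub scheduler.  A
      * must_scrub request (or invalid stats with osd_scrub_invalid_stats set)
      * is scheduled for "now"; otherwise scheduling is based on
      * last_scrub_stamp and the pool/global scrub interval options.
      */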
3665 void PG::reg_next_scrub()
3666 {
3667   if (!is_primary())
3668     return;
3669
3670   utime_t reg_stamp;
3671   if (scrubber.must_scrub ||
3672       (info.stats.stats_invalid && cct->_conf->osd_scrub_invalid_stats)) {
3673     reg_stamp = ceph_clock_now();
3674   } else {
3675     reg_stamp = info.history.last_scrub_stamp;
3676   }
3677   // note down the sched_time, so we can locate this scrub, and remove it
3678   // later on.
3679   double scrub_min_interval = 0, scrub_max_interval = 0;
3680   pool.info.opts.get(pool_opts_t::SCRUB_MIN_INTERVAL, &scrub_min_interval);
3681   pool.info.opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
3682   assert(scrubber.scrub_reg_stamp == utime_t());
3683   scrubber.scrub_reg_stamp = osd->reg_pg_scrub(info.pgid,
3684                                                reg_stamp,
3685                                                scrub_min_interval,
3686                                                scrub_max_interval,
3687                                                scrubber.must_scrub);
3688 }
3689
3690 void PG::unreg_next_scrub()
3691 {
3692   if (is_primary()) {
3693     osd->unreg_pg_scrub(info.pgid, scrubber.scrub_reg_stamp);
3694     scrubber.scrub_reg_stamp = utime_t();
3695   }
3696 }
3697
3698 void PG::do_replica_scrub_map(OpRequestRef op)
3699 {
3700   const MOSDRepScrubMap *m = static_cast<const MOSDRepScrubMap*>(op->get_req());
3701   dout(7) << __func__ << " " << *m << dendl;
3702   if (m->map_epoch < info.history.same_interval_since) {
3703     dout(10) << __func__ << " discarding old from "
3704              << m->map_epoch << " < " << info.history.same_interval_since
3705              << dendl;
3706     return;
3707   }
3708   if (!scrubber.is_chunky_scrub_active()) {
3709     dout(10) << __func__ << " scrub isn't active" << dendl;
3710     return;
3711   }
3712
3713   op->mark_started();
3714
3715   bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3716   scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3717   dout(10) << "map version is "
3718            << scrubber.received_maps[m->from].valid_through
3719            << dendl;
3720
3721   --scrubber.waiting_on;
3722   scrubber.waiting_on_whom.erase(m->from);
3723   if (scrubber.waiting_on == 0) {
3724     if (ops_blocked_by_scrub()) {
3725       requeue_scrub(true);
3726     } else {
3727       requeue_scrub(false);
3728     }
3729   }
3730 }
3731
3732 void PG::sub_op_scrub_map(OpRequestRef op)
3733 {
3734   // for legacy jewel compatibility only
3735   const MOSDSubOp *m = static_cast<const MOSDSubOp *>(op->get_req());
3736   assert(m->get_type() == MSG_OSD_SUBOP);
3737   dout(7) << "sub_op_scrub_map" << dendl;
3738
3739   if (m->map_epoch < info.history.same_interval_since) {
3740     dout(10) << "sub_op_scrub discarding old sub_op from "
3741              << m->map_epoch << " < " << info.history.same_interval_since << dendl;
3742     return;
3743   }
3744
3745   if (!scrubber.is_chunky_scrub_active()) {
3746     dout(10) << "sub_op_scrub_map scrub isn't active" << dendl;
3747     return;
3748   }
3749
3750   op->mark_started();
3751
3752   dout(10) << " got " << m->from << " scrub map" << dendl;
3753   bufferlist::iterator p = const_cast<bufferlist&>(m->get_data()).begin();
3754
3755   scrubber.received_maps[m->from].decode(p, info.pgid.pool());
3756   dout(10) << "map version is "
3757              << scrubber.received_maps[m->from].valid_through
3758              << dendl;
3759
3760   --scrubber.waiting_on;
3761   scrubber.waiting_on_whom.erase(m->from);
3762
3763   if (scrubber.waiting_on == 0) {
3764     if (ops_blocked_by_scrub()) {
3765       requeue_scrub(true);
3766     } else {
3767       requeue_scrub(false);
3768     }
3769   }
3770 }
3771
3772 // send scrub v3 messages (chunky scrub)
3773 void PG::_request_scrub_map(
3774   pg_shard_t replica, eversion_t version,
3775   hobject_t start, hobject_t end,
3776   bool deep, uint32_t seed)
3777 {
3778   assert(replica != pg_whoami);
3779   dout(10) << "scrub  requesting scrubmap from osd." << replica
3780            << " deep " << (int)deep << " seed " << seed << dendl;
3781   MOSDRepScrub *repscrubop = new MOSDRepScrub(
3782     spg_t(info.pgid.pgid, replica.shard), version,
3783     get_osdmap()->get_epoch(),
3784     get_last_peering_reset(),
3785     start, end, deep, seed);
3786   // default priority, we want the rep scrub processed prior to any recovery
3787   // or client io messages (we are holding a lock!)
3788   osd->send_message_osd_cluster(
3789     replica.osd, repscrubop, get_osdmap()->get_epoch());
3790 }
3791
3792 void PG::handle_scrub_reserve_request(OpRequestRef op)
3793 {
3794   dout(7) << __func__ << " " << *op->get_req() << dendl;
3795   op->mark_started();
3796   if (scrubber.reserved) {
3797     dout(10) << __func__ << " ignoring reserve request: Already reserved"
3798              << dendl;
3799     return;
3800   }
3801   if ((cct->_conf->osd_scrub_during_recovery || !osd->is_recovery_active()) &&
3802       osd->inc_scrubs_pending()) {
3803     scrubber.reserved = true;
3804   } else {
3805     dout(20) << __func__ << ": failed to reserve remotely" << dendl;
3806     scrubber.reserved = false;
3807   }
3808   if (op->get_req()->get_type() == MSG_OSD_SCRUB_RESERVE) {
3809     const MOSDScrubReserve *m =
3810       static_cast<const MOSDScrubReserve*>(op->get_req());
3811     Message *reply = new MOSDScrubReserve(
3812       spg_t(info.pgid.pgid, primary.shard),
3813       m->map_epoch,
3814       scrubber.reserved ? MOSDScrubReserve::GRANT : MOSDScrubReserve::REJECT,
3815       pg_whoami);
3816     osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3817   } else {
3818     // for jewel compat only
3819     const MOSDSubOp *req = static_cast<const MOSDSubOp*>(op->get_req());
3820     assert(req->get_type() == MSG_OSD_SUBOP);
3821     MOSDSubOpReply *reply = new MOSDSubOpReply(
3822       req, pg_whoami, 0, get_osdmap()->get_epoch(), CEPH_OSD_FLAG_ACK);
3823     ::encode(scrubber.reserved, reply->get_data());
3824     osd->send_message_osd_cluster(reply, op->get_req()->get_connection());
3825   }
3826 }
3827
3828 void PG::handle_scrub_reserve_grant(OpRequestRef op, pg_shard_t from)
3829 {
3830   dout(7) << __func__ << " " << *op->get_req() << dendl;
3831   op->mark_started();
3832   if (!scrubber.reserved) {
3833     dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3834     return;
3835   }
3836   if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3837     dout(10) << " already had osd." << from << " reserved" << dendl;
3838   } else {
3839     dout(10) << " osd." << from << " scrub reserve = success" << dendl;
3840     scrubber.reserved_peers.insert(from);
3841     sched_scrub();
3842   }
3843 }
3844
3845 void PG::handle_scrub_reserve_reject(OpRequestRef op, pg_shard_t from)
3846 {
3847   dout(7) << __func__ << " " << *op->get_req() << dendl;
3848   op->mark_started();
3849   if (!scrubber.reserved) {
3850     dout(10) << "ignoring obsolete scrub reserve reply" << dendl;
3851     return;
3852   }
3853   if (scrubber.reserved_peers.find(from) != scrubber.reserved_peers.end()) {
3854     dout(10) << " already had osd." << from << " reserved" << dendl;
3855   } else {
3856     /* One decline stops this pg from being scheduled for scrubbing. */
3857     dout(10) << " osd." << from << " scrub reserve = fail" << dendl;
3858     scrubber.reserve_failed = true;
3859     sched_scrub();
3860   }
3861 }
3862
3863 void PG::handle_scrub_reserve_release(OpRequestRef op)
3864 {
3865   dout(7) << __func__ << " " << *op->get_req() << dendl;
3866   op->mark_started();
3867   clear_scrub_reserved();
3868 }
3869
3870 void PG::reject_reservation()
3871 {
3872   osd->send_message_osd_cluster(
3873     primary.osd,
3874     new MBackfillReserve(
3875       MBackfillReserve::REJECT,
3876       spg_t(info.pgid.pgid, primary.shard),
3877       get_osdmap()->get_epoch()),
3878     get_osdmap()->get_epoch());
3879 }
3880
3881 void PG::schedule_backfill_retry(float delay)
3882 {
3883   Mutex::Locker lock(osd->recovery_request_lock);
3884   osd->recovery_request_timer.add_event_after(
3885     delay,
3886     new QueuePeeringEvt<RequestBackfill>(
3887       this, get_osdmap()->get_epoch(),
3888       RequestBackfill()));
3889 }
3890
3891 void PG::schedule_recovery_retry(float delay)
3892 {
3893   Mutex::Locker lock(osd->recovery_request_lock);
3894   osd->recovery_request_timer.add_event_after(
3895     delay,
3896     new QueuePeeringEvt<DoRecovery>(
3897       this, get_osdmap()->get_epoch(),
3898       DoRecovery()));
3899 }
3900
3901 void PG::clear_scrub_reserved()
3902 {
3903   scrubber.reserved_peers.clear();
3904   scrubber.reserve_failed = false;
3905
3906   if (scrubber.reserved) {
3907     scrubber.reserved = false;
3908     osd->dec_scrubs_pending();
3909   }
3910 }
3911
3912 void PG::scrub_reserve_replicas()
3913 {
3914   assert(backfill_targets.empty());
3915   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3916        i != actingbackfill.end();
3917        ++i) {
3918     if (*i == pg_whoami) continue;
3919     dout(10) << "scrub requesting reserve from osd." << *i << dendl;
3920     if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3921       osd->send_message_osd_cluster(
3922         i->osd,
3923         new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3924                              get_osdmap()->get_epoch(),
3925                              MOSDScrubReserve::REQUEST, pg_whoami),
3926         get_osdmap()->get_epoch());
3927     } else {
3928       // for jewel compat only
3929       vector<OSDOp> scrub(1);
3930       scrub[0].op.op = CEPH_OSD_OP_SCRUB_RESERVE;
3931       hobject_t poid;
3932       eversion_t v;
3933       osd_reqid_t reqid;
3934       MOSDSubOp *subop = new MOSDSubOp(
3935         reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
3936         get_osdmap()->get_epoch(), osd->get_tid(), v);
3937       subop->ops = scrub;
3938       osd->send_message_osd_cluster(
3939         i->osd, subop, get_osdmap()->get_epoch());
3940     }
3941   }
3942 }
3943
3944 void PG::scrub_unreserve_replicas()
3945 {
3946   assert(backfill_targets.empty());
3947   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
3948        i != actingbackfill.end();
3949        ++i) {
3950     if (*i == pg_whoami) continue;
3951     dout(10) << "scrub requesting unreserve from osd." << *i << dendl;
3952     if (HAVE_FEATURE(get_min_acting_features(), SERVER_LUMINOUS)) {
3953       osd->send_message_osd_cluster(
3954         i->osd,
3955         new MOSDScrubReserve(spg_t(info.pgid.pgid, i->shard),
3956                              get_osdmap()->get_epoch(),
3957                              MOSDScrubReserve::RELEASE, pg_whoami),
3958         get_osdmap()->get_epoch());
3959     } else {
3960       // for jewel compat only
3961       vector<OSDOp> scrub(1);
3962       scrub[0].op.op = CEPH_OSD_OP_SCRUB_UNRESERVE;
3963       hobject_t poid;
3964       eversion_t v;
3965       osd_reqid_t reqid;
3966       MOSDSubOp *subop = new MOSDSubOp(
3967         reqid, pg_whoami, spg_t(info.pgid.pgid, i->shard), poid, 0,
3968         get_osdmap()->get_epoch(), osd->get_tid(), v);
3969       subop->ops = scrub;
3970       osd->send_message_osd_cluster(i->osd, subop, get_osdmap()->get_epoch());
3971     }
3972   }
3973 }
3974
3975 void PG::_scan_rollback_obs(
3976   const vector<ghobject_t> &rollback_obs,
3977   ThreadPool::TPHandle &handle)
3978 {
3979   ObjectStore::Transaction t;
3980   eversion_t trimmed_to = last_rollback_info_trimmed_to_applied;
3981   for (vector<ghobject_t>::const_iterator i = rollback_obs.begin();
3982        i != rollback_obs.end();
3983        ++i) {
3984     if (i->generation < trimmed_to.version) {
3985       osd->clog->error() << "osd." << osd->whoami
3986                         << " pg " << info.pgid
3987                         << " found obsolete rollback obj "
3988                         << *i << " generation < trimmed_to "
3989                         << trimmed_to
3990                         << "...repaired";
3991       t.remove(coll, *i);
3992     }
3993   }
3994   if (!t.empty()) {
3995     derr << __func__ << ": queueing trans to clean up obsolete rollback objs"
3996          << dendl;
3997     osd->store->queue_transaction(osr.get(), std::move(t), NULL);
3998   }
3999 }
4000
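     /*
      * Walk the scrub map from the highest object downwards so that each
      * head/snapdir (and its SnapSet) is seen before its clones, then check
      * that the SnapMapper entry for every clone matches the snaps recorded
      * in the SnapSet (or, for legacy objects, in the object_info), repairing
      * the mapper and logging a cluster error on mismatch.
      */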
4001 void PG::_scan_snaps(ScrubMap &smap) 
4002 {
4003   hobject_t head;
4004   SnapSet snapset;
4005   for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4006        i != smap.objects.rend();
4007        ++i) {
4008     const hobject_t &hoid = i->first;
4009     ScrubMap::object &o = i->second;
4010
4011     if (hoid.is_head() || hoid.is_snapdir()) {
4012       // parse the SnapSet
4013       bufferlist bl;
4014       if (o.attrs.find(SS_ATTR) == o.attrs.end()) {
4015         continue;
4016       }
4017       bl.push_back(o.attrs[SS_ATTR]);
4018       auto p = bl.begin();
4019       try {
4020         ::decode(snapset, p);
4021       } catch(...) {
4022         continue;
4023       }
4024       head = hoid.get_head();
4025       // Make sure head_exists is correct for is_legacy() check
4026       if (hoid.is_head())
4027         snapset.head_exists = true;
4028       continue;
4029     }
4030     if (hoid.snap < CEPH_MAXSNAP) {
4031       // check and if necessary fix snap_mapper
4032       if (hoid.get_head() != head) {
4033         derr << __func__ << " no head for " << hoid << " (have " << head << ")"
4034              << dendl;
4035         continue;
4036       }
4037       set<snapid_t> obj_snaps;
4038       if (!snapset.is_legacy()) {
4039         auto p = snapset.clone_snaps.find(hoid.snap);
4040         if (p == snapset.clone_snaps.end()) {
4041           derr << __func__ << " no clone_snaps for " << hoid << " in " << snapset
4042                << dendl;
4043           continue;
4044         }
4045         obj_snaps.insert(p->second.begin(), p->second.end());
4046       } else {
4047         bufferlist bl;
4048         if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4049           continue;
4050         }
4051         bl.push_back(o.attrs[OI_ATTR]);
4052         object_info_t oi;
4053         try {
4054           oi.decode(bl);
4055         } catch(...) {
4056           continue;
4057         }
4058         obj_snaps.insert(oi.legacy_snaps.begin(), oi.legacy_snaps.end());
4059       }
4060       set<snapid_t> cur_snaps;
4061       int r = snap_mapper.get_snaps(hoid, &cur_snaps);
4062       if (r != 0 && r != -ENOENT) {
4063         derr << __func__ << ": get_snaps returned " << cpp_strerror(r) << dendl;
4064         ceph_abort();
4065       }
4066       if (r == -ENOENT || cur_snaps != obj_snaps) {
4067         ObjectStore::Transaction t;
4068         OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4069         if (r == 0) {
4070           r = snap_mapper.remove_oid(hoid, &_t);
4071           if (r != 0) {
4072             derr << __func__ << ": remove_oid returned " << cpp_strerror(r)
4073                  << dendl;
4074             ceph_abort();
4075           }
4076           osd->clog->error() << "osd." << osd->whoami
4077                             << " found snap mapper error on pg "
4078                             << info.pgid
4079                             << " oid " << hoid << " snaps in mapper: "
4080                             << cur_snaps << ", oi: "
4081                             << obj_snaps
4082                             << "...repaired";
4083         } else {
4084           osd->clog->error() << "osd." << osd->whoami
4085                             << " found snap mapper error on pg "
4086                             << info.pgid
4087                             << " oid " << hoid << " snaps missing in mapper"
4088                             << ", should be: "
4089                             << obj_snaps
4090                             << "...repaired";
4091         }
4092         snap_mapper.add_oid(hoid, obj_snaps, &_t);
4093         r = osd->store->apply_transaction(osr.get(), std::move(t));
4094         if (r != 0) {
4095           derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4096                << dendl;
4097         }
4098       }
4099     }
4100   }
4101 }
4102
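     /*
      * If the object_info_t stored in an object's OI_ATTR names a different
      * soid than the object we actually scanned, rewrite the attr in place
      * (both in the store and in the in-memory scrub map) and log the repair.
      */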
4103 void PG::_repair_oinfo_oid(ScrubMap &smap)
4104 {
4105   for (map<hobject_t, ScrubMap::object>::reverse_iterator i = smap.objects.rbegin();
4106        i != smap.objects.rend();
4107        ++i) {
4108     const hobject_t &hoid = i->first;
4109     ScrubMap::object &o = i->second;
4110
4111     bufferlist bl;
4112     if (o.attrs.find(OI_ATTR) == o.attrs.end()) {
4113       continue;
4114     }
4115     bl.push_back(o.attrs[OI_ATTR]);
4116     object_info_t oi;
4117     try {
4118       oi.decode(bl);
4119     } catch(...) {
4120       continue;
4121     }
4122     if (oi.soid != hoid) {
4123       ObjectStore::Transaction t;
4124       OSDriver::OSTransaction _t(osdriver.get_transaction(&t));
4125       osd->clog->error() << "osd." << osd->whoami
4126                             << " found object info error on pg "
4127                             << info.pgid
4128                             << " oid " << hoid << " oid in object info: "
4129                             << oi.soid
4130                             << "...repaired";
4131       // Fix object info
4132       oi.soid = hoid;
4133       bl.clear();
4134       ::encode(oi, bl, get_osdmap()->get_features(CEPH_ENTITY_TYPE_OSD, nullptr));
4135
4136       bufferptr bp(bl.c_str(), bl.length());
4137       o.attrs[OI_ATTR] = bp;
4138
4139       t.setattr(coll, ghobject_t(hoid), OI_ATTR, bl);
4140       int r = osd->store->apply_transaction(osr.get(), std::move(t));
4141       if (r != 0) {
4142         derr << __func__ << ": apply_transaction got " << cpp_strerror(r)
4143              << dendl;
4144       }
4145     }
4146   }
4147 }
4148
4149 /*
4150  * build a scrub map over a chunk without releasing the lock
4151  * only used by chunky scrub
4152  */
4153 int PG::build_scrub_map_chunk(
4154   ScrubMap &map,
4155   hobject_t start, hobject_t end, bool deep, uint32_t seed,
4156   ThreadPool::TPHandle &handle)
4157 {
4158   dout(10) << __func__ << " [" << start << "," << end << ") "
4159            << " seed " << seed << dendl;
4160
4161   map.valid_through = info.last_update;
4162
4163   // objects
4164   vector<hobject_t> ls;
4165   vector<ghobject_t> rollback_obs;
4166   int ret = get_pgbackend()->objects_list_range(
4167     start,
4168     end,
4169     0,
4170     &ls,
4171     &rollback_obs);
4172   if (ret < 0) {
4173     dout(5) << "objects_list_range error: " << ret << dendl;
4174     return ret;
4175   }
4176
4177
4178   get_pgbackend()->be_scan_list(map, ls, deep, seed, handle);
4179   _scan_rollback_obs(rollback_obs, handle);
4180   _scan_snaps(map);
4181   _repair_oinfo_oid(map);
4182
4183   dout(20) << __func__ << " done" << dendl;
4184   return 0;
4185 }
4186
4187 void PG::Scrubber::cleanup_store(ObjectStore::Transaction *t) {
4188   if (!store)
4189     return;
4190   struct OnComplete : Context {
4191     std::unique_ptr<Scrub::Store> store;
4192     OnComplete(
4193       std::unique_ptr<Scrub::Store> &&store)
4194       : store(std::move(store)) {}
4195     void finish(int) override {}
4196   };
4197   store->cleanup(t);
4198   t->register_on_complete(new OnComplete(std::move(store)));
4199   assert(!store);
4200 }
4201
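     /*
      * Mark a bad copy of soid for repair: record it as missing on the bad
      * peer (or in the primary's own missing set), and for EC pools or a bad
      * primary also register the good peers in missing_loc as recovery
      * sources.
      */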
4202 void PG::repair_object(
4203   const hobject_t& soid, list<pair<ScrubMap::object, pg_shard_t> > *ok_peers,
4204   pg_shard_t bad_peer)
4205 {
4206   list<pg_shard_t> op_shards;
4207   for (auto i : *ok_peers) {
4208     op_shards.push_back(i.second);
4209   }
4210   dout(10) << "repair_object " << soid << " bad_peer osd."
4211            << bad_peer << " ok_peers osd.{" << op_shards << "}" << dendl;
4212   ScrubMap::object &po = ok_peers->back().first;
4213   eversion_t v;
4214   bufferlist bv;
4215   bv.push_back(po.attrs[OI_ATTR]);
4216   object_info_t oi;
4217   try {
4218     bufferlist::iterator bliter = bv.begin();
4219     ::decode(oi, bliter);
4220   } catch (...) {
4221     dout(0) << __func__ << ": Need version of replica, bad object_info_t: " << soid << dendl;
4222     assert(0);
4223   }
4224   if (bad_peer != primary) {
4225     peer_missing[bad_peer].add(soid, oi.version, eversion_t(), false);
4226   } else {
4227     // We should only be scrubbing if the PG is clean.
4228     assert(waiting_for_unreadable_object.empty());
4229
4230     pg_log.missing_add(soid, oi.version, eversion_t());
4231
4232     pg_log.set_last_requested(0);
4233     dout(10) << __func__ << ": primary = " << primary << dendl;
4234   }
4235
4236   if (is_ec_pg() || bad_peer == primary) {
4237     // we'd better collect all shards for an EC pg, and prepare good peers
4238     // as the source of the pull in the case of a replicated pg.
4239     missing_loc.add_missing(soid, oi.version, eversion_t());
4240     list<pair<ScrubMap::object, pg_shard_t> >::iterator i;
4241     for (i = ok_peers->begin();
4242         i != ok_peers->end();
4243         ++i)
4244       missing_loc.add_location(soid, i->second);
4245   }
4246 }
4247
4248 /* replica_scrub
4249  *
4250  * Wait for last_update_applied to match msg->scrub_to as above. Wait
4251  * for pushes to complete in case of recent recovery. Build a single
4252  * scrubmap of objects that are in the range [msg->start, msg->end).
4253  */
4254 void PG::replica_scrub(
4255   OpRequestRef op,
4256   ThreadPool::TPHandle &handle)
4257 {
4258   const MOSDRepScrub *msg = static_cast<const MOSDRepScrub *>(op->get_req());
4259   assert(!scrubber.active_rep_scrub);
4260   dout(7) << "replica_scrub" << dendl;
4261
4262   if (msg->map_epoch < info.history.same_interval_since) {
4263     dout(10) << "replica_scrub discarding old replica_scrub from "
4264              << msg->map_epoch << " < " << info.history.same_interval_since 
4265              << dendl;
4266     return;
4267   }
4268
4269   ScrubMap map;
4270
4271   assert(msg->chunky);
4272   if (last_update_applied < msg->scrub_to) {
4273     dout(10) << "waiting for last_update_applied to catch up" << dendl;
4274     scrubber.active_rep_scrub = op;
4275     return;
4276   }
4277
4278   if (active_pushes > 0) {
4279     dout(10) << "waiting for active pushes to finish" << dendl;
4280     scrubber.active_rep_scrub = op;
4281     return;
4282   }
4283
4284   // compensate for hobject_t's with wrong pool from sloppy hammer OSDs
4285   hobject_t start = msg->start;
4286   hobject_t end = msg->end;
4287   if (!start.is_max())
4288     start.pool = info.pgid.pool();
4289   if (!end.is_max())
4290     end.pool = info.pgid.pool();
4291
4292   build_scrub_map_chunk(
4293     map, start, end, msg->deep, msg->seed,
4294     handle);
4295
4296   if (HAVE_FEATURE(acting_features, SERVER_LUMINOUS)) {
4297     MOSDRepScrubMap *reply = new MOSDRepScrubMap(
4298       spg_t(info.pgid.pgid, get_primary().shard),
4299       msg->map_epoch,
4300       pg_whoami);
4301     ::encode(map, reply->get_data());
4302     osd->send_message_osd_cluster(reply, msg->get_connection());
4303   } else {
4304     // for jewel compatibility
4305     vector<OSDOp> scrub(1);
4306     scrub[0].op.op = CEPH_OSD_OP_SCRUB_MAP;
4307     hobject_t poid;
4308     eversion_t v;
4309     osd_reqid_t reqid;
4310     MOSDSubOp *subop = new MOSDSubOp(
4311       reqid,
4312       pg_whoami,
4313       spg_t(info.pgid.pgid, get_primary().shard),
4314       poid,
4315       0,
4316       msg->map_epoch,
4317       osd->get_tid(),
4318       v);
4319     ::encode(map, subop->get_data());
4320     subop->ops = scrub;
4321     osd->send_message_osd_cluster(subop, msg->get_connection());
4322   }
4323 }
4324
4325 /* Scrub:
4326  * PG_STATE_SCRUBBING is set when the scrub is queued
4327  * 
4328  * scrub will be chunky if all OSDs in PG support chunky scrub
4329  * scrub will fail if OSDs are too old.
4330  */
4331 void PG::scrub(epoch_t queued, ThreadPool::TPHandle &handle)
4332 {
4333   if (cct->_conf->osd_scrub_sleep > 0 &&
4334       (scrubber.state == PG::Scrubber::NEW_CHUNK ||
4335        scrubber.state == PG::Scrubber::INACTIVE) &&
4336        scrubber.needs_sleep) {
4337     ceph_assert(!scrubber.sleeping);
4338     dout(20) << __func__ << " state is INACTIVE|NEW_CHUNK, sleeping" << dendl;
4339
4340     // Do an async sleep so we don't block the op queue
4341     OSDService *osds = osd;
4342     spg_t pgid = get_pgid();
4343     int state = scrubber.state;
4344     auto scrub_requeue_callback =
4345         new FunctionContext([osds, pgid, state](int r) {
4346           PG *pg = osds->osd->lookup_lock_pg(pgid);
4347           if (pg == nullptr) {
4348             lgeneric_dout(osds->osd->cct, 20)
4349                 << "scrub_requeue_callback: Could not find "
4350                 << "PG " << pgid << " can't complete scrub requeue after sleep"
4351                 << dendl;
4352             return;
4353           }
4354           pg->scrubber.sleeping = false;
4355           pg->scrubber.needs_sleep = false;
4356           lgeneric_dout(pg->cct, 20)
4357               << "scrub_requeue_callback: slept for "
4358               << ceph_clock_now() - pg->scrubber.sleep_start
4359               << ", re-queuing scrub with state " << state << dendl;
4360           pg->scrub_queued = false;
4361           pg->requeue_scrub();
4362           pg->scrubber.sleep_start = utime_t();
4363           pg->unlock();
4364         });
4365     Mutex::Locker l(osd->scrub_sleep_lock);
4366     osd->scrub_sleep_timer.add_event_after(cct->_conf->osd_scrub_sleep,
4367                                            scrub_requeue_callback);
4368     scrubber.sleeping = true;
4369     scrubber.sleep_start = ceph_clock_now();
4370     return;
4371   }
4372   if (pg_has_reset_since(queued)) {
4373     return;
4374   }
4375   assert(scrub_queued);
4376   scrub_queued = false;
4377   scrubber.needs_sleep = true;
4378
4379   if (!is_primary() || !is_active() || !is_clean() || !is_scrubbing()) {
4380     dout(10) << "scrub -- not primary or active or not clean" << dendl;
4381     state_clear(PG_STATE_SCRUBBING);
4382     state_clear(PG_STATE_REPAIR);
4383     state_clear(PG_STATE_DEEP_SCRUB);
4384     publish_stats_to_osd();
4385     return;
4386   }
4387
4388   if (!scrubber.active) {
4389     assert(backfill_targets.empty());
4390
4391     scrubber.deep = state_test(PG_STATE_DEEP_SCRUB);
4392
4393     dout(10) << "starting a new chunky scrub" << dendl;
4394   }
4395
4396   chunky_scrub(handle);
4397 }
4398
4399 /*
4400  * Chunky scrub scrubs objects one chunk at a time with writes blocked for that
4401  * chunk.
4402  *
4403  * The object store is partitioned into chunks which end on hash boundaries. For
4404  * each chunk, the following logic is performed:
4405  *
4406  *  (1) Block writes on the chunk
4407  *  (2) Request maps from replicas
4408  *  (3) Wait for pushes to be applied (after recovery)
4409  *  (4) Wait for writes to flush on the chunk
4410  *  (5) Wait for maps from replicas
4411  *  (6) Compare / repair all scrub maps
4412  *  (7) Wait for digest updates to apply
4413  *
4414  * This logic is encoded in the mostly linear state machine:
4415  *
4416  *           +------------------+
4417  *  _________v__________        |
4418  * |                    |       |
4419  * |      INACTIVE      |       |
4420  * |____________________|       |
4421  *           |                  |
4422  *           |   +----------+   |
4423  *  _________v___v______    |   |
4424  * |                    |   |   |
4425  * |      NEW_CHUNK     |   |   |
4426  * |____________________|   |   |
4427  *           |              |   |
4428  *  _________v__________    |   |
4429  * |                    |   |   |
4430  * |     WAIT_PUSHES    |   |   |
4431  * |____________________|   |   |
4432  *           |              |   |
4433  *  _________v__________    |   |
4434  * |                    |   |   |
4435  * |  WAIT_LAST_UPDATE  |   |   |
4436  * |____________________|   |   |
4437  *           |              |   |
4438  *  _________v__________    |   |
4439  * |                    |   |   |
4440  * |      BUILD_MAP     |   |   |
4441  * |____________________|   |   |
4442  *           |              |   |
4443  *  _________v__________    |   |
4444  * |                    |   |   |
4445  * |    WAIT_REPLICAS   |   |   |
4446  * |____________________|   |   |
4447  *           |              |   |
4448  *  _________v__________    |   |
4449  * |                    |   |   |
4450  * |    COMPARE_MAPS    |   |   |
4451  * |____________________|   |   |
4452  *           |              |   |
4453  *           |              |   |
4454  *  _________v__________    |   |
4455  * |                    |   |   |
4456  * |WAIT_DIGEST_UPDATES |   |   |
4457  * |____________________|   |   |
4458  *           |   |          |   |
4459  *           |   +----------+   |
4460  *  _________v__________        |
4461  * |                    |       |
4462  * |       FINISH       |       |
4463  * |____________________|       |
4464  *           |                  |
4465  *           +------------------+
4466  *
4467  * The primary determines the last update from the subset by walking the log. If
4468  * it sees a log entry pertaining to an object in the chunk, it tells the replicas
4469  * to wait until that update is applied before building a scrub map. Both the
4470  * primary and replicas will wait for any active pushes to be applied.
4471  *
4472  * In contrast to classic_scrub, chunky_scrub is entirely handled by scrub_wq.
4473  *
4474  * scrubber.state encodes the current state of the scrub (refer to state diagram
4475  * for details).
4476  */
4477 void PG::chunky_scrub(ThreadPool::TPHandle &handle)
4478 {
4479   // check for map changes
4480   if (scrubber.is_chunky_scrub_active()) {
4481     if (scrubber.epoch_start != info.history.same_interval_since) {
4482       dout(10) << "scrub  pg changed, aborting" << dendl;
4483       scrub_clear_state();
4484       scrub_unreserve_replicas();
4485       return;
4486     }
4487   }
4488
4489   bool done = false;
4490   int ret;
4491
4492   while (!done) {
4493     dout(20) << "scrub state " << Scrubber::state_string(scrubber.state)
4494              << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4495
4496     switch (scrubber.state) {
4497       case PG::Scrubber::INACTIVE:
4498         dout(10) << "scrub start" << dendl;
4499
4500         publish_stats_to_osd();
4501         scrubber.epoch_start = info.history.same_interval_since;
4502         scrubber.active = true;
4503
4504         osd->inc_scrubs_active(scrubber.reserved);
4505         if (scrubber.reserved) {
4506           scrubber.reserved = false;
4507           scrubber.reserved_peers.clear();
4508         }
4509
4510         {
4511           ObjectStore::Transaction t;
4512           scrubber.cleanup_store(&t);
4513           scrubber.store.reset(Scrub::Store::create(osd->store, &t,
4514                                                     info.pgid, coll));
4515           osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4516         }
4517
4518         // Don't include temporary objects when scrubbing
4519         scrubber.start = info.pgid.pgid.get_hobj_start();
4520         scrubber.state = PG::Scrubber::NEW_CHUNK;
4521
4522         {
4523           bool repair = state_test(PG_STATE_REPAIR);
4524           bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4525           const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4526           stringstream oss;
4527           oss << info.pgid.pgid << " " << mode << " starts" << std::endl;
4528           osd->clog->debug(oss);
4529         }
4530
4531         scrubber.seed = -1;
4532
4533         break;
4534
4535       case PG::Scrubber::NEW_CHUNK:
4536         scrubber.primary_scrubmap = ScrubMap();
4537         scrubber.received_maps.clear();
4538
4539         {
4540           /* get the start and end of our scrub chunk
4541            *
4542            * Our scrub chunk has an important restriction we're going to need to
4543            * respect. We can't let head or snapdir be start or end.
4544            * Using a half-open interval means that if end == head|snapdir,
4545            * we'd scrub/lock head and the clone right next to head in different
4546            * chunks which would allow us to miss clones created between
4547            * scrubbing that chunk and scrubbing the chunk including head.
4548            * This isn't true for any of the other clones since clones can
4549            * only be created "just to the left of" head.  There is one exception
4550            * to this: promotion of clones which always happens to the left of the
4551            * left-most clone, but promote_object checks the scrubber in that
4552            * case, so it should be ok.  Also, it's ok to "miss" clones at the
4553            * left end of the range if we are a tier because they may legitimately
4554            * not exist (see _scrub).
4555            */
4556           int min = MAX(3, cct->_conf->osd_scrub_chunk_min);
4557           hobject_t start = scrubber.start;
4558           hobject_t candidate_end;
4559           vector<hobject_t> objects;
4560           ret = get_pgbackend()->objects_list_partial(
4561             start,
4562             min,
4563             MAX(min, cct->_conf->osd_scrub_chunk_max),
4564             &objects,
4565             &candidate_end);
4566           assert(ret >= 0);
4567
4568           if (!objects.empty()) {
4569             hobject_t back = objects.back();
4570             while (candidate_end.has_snapset() &&
4571                       candidate_end.get_head() == back.get_head()) {
4572               candidate_end = back;
4573               objects.pop_back();
4574               if (objects.empty()) {
4575                 assert(0 ==
4576                       "Somehow we got more than 2 objects which "
4577                       "have the same head but are not clones");
4578               }
4579               back = objects.back();
4580             }
4581             if (candidate_end.has_snapset()) {
4582               assert(candidate_end.get_head() != back.get_head());
4583               candidate_end = candidate_end.get_object_boundary();
4584             }
4585           } else {
4586             assert(candidate_end.is_max());
4587           }
4588
4589           if (!_range_available_for_scrub(scrubber.start, candidate_end)) {
4590             // we'll be requeued by whatever made us unavailable for scrub
4591             dout(10) << __func__ << ": scrub blocked somewhere in range "
4592                      << "[" << scrubber.start << ", " << candidate_end << ")"
4593                      << dendl;
4594             done = true;
4595             break;
4596           }
4597           scrubber.end = candidate_end;
4598         }
4599
4600         // walk the log to find the latest update that affects our chunk
4601         scrubber.subset_last_update = eversion_t();
4602         for (auto p = projected_log.log.rbegin();
4603              p != projected_log.log.rend();
4604              ++p) {
4605           if (p->soid >= scrubber.start &&
4606               p->soid < scrubber.end) {
4607             scrubber.subset_last_update = p->version;
4608             break;
4609           }
4610         }
4611         if (scrubber.subset_last_update == eversion_t()) {
4612           for (list<pg_log_entry_t>::const_reverse_iterator p =
4613                  pg_log.get_log().log.rbegin();
4614                p != pg_log.get_log().log.rend();
4615                ++p) {
4616             if (p->soid >= scrubber.start &&
4617                 p->soid < scrubber.end) {
4618               scrubber.subset_last_update = p->version;
4619               break;
4620             }
4621           }
4622         }
4623
4624         // ask replicas to wait until
4625         // last_update_applied >= scrubber.subset_last_update and then scan
4626         scrubber.waiting_on_whom.insert(pg_whoami);
4627         ++scrubber.waiting_on;
4628
4629         // request maps from replicas
4630         for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4631              i != actingbackfill.end();
4632              ++i) {
4633           if (*i == pg_whoami) continue;
4634           _request_scrub_map(*i, scrubber.subset_last_update,
4635                              scrubber.start, scrubber.end, scrubber.deep,
4636                              scrubber.seed);
4637           scrubber.waiting_on_whom.insert(*i);
4638           ++scrubber.waiting_on;
4639         }
4640
4641         scrubber.state = PG::Scrubber::WAIT_PUSHES;
4642
4643         break;
4644
4645       case PG::Scrubber::WAIT_PUSHES:
4646         if (active_pushes == 0) {
4647           scrubber.state = PG::Scrubber::WAIT_LAST_UPDATE;
4648         } else {
4649           dout(15) << "wait for pushes to apply" << dendl;
4650           done = true;
4651         }
4652         break;
4653
4654       case PG::Scrubber::WAIT_LAST_UPDATE:
4655         if (last_update_applied >= scrubber.subset_last_update) {
4656           scrubber.state = PG::Scrubber::BUILD_MAP;
4657         } else {
4658           // will be requeued by op_applied
4659           dout(15) << "wait for writes to flush" << dendl;
4660           done = true;
4661         }
4662         break;
4663
4664       case PG::Scrubber::BUILD_MAP:
4665         assert(last_update_applied >= scrubber.subset_last_update);
4666
4667         // build my own scrub map
4668         ret = build_scrub_map_chunk(scrubber.primary_scrubmap,
4669                                     scrubber.start, scrubber.end,
4670                                     scrubber.deep, scrubber.seed,
4671                                     handle);
4672         if (ret < 0) {
4673           dout(5) << "error building scrub map: " << ret << ", aborting" << dendl;
4674           scrub_clear_state();
4675           scrub_unreserve_replicas();
4676           return;
4677         }
4678
4679         --scrubber.waiting_on;
4680         scrubber.waiting_on_whom.erase(pg_whoami);
4681
4682         scrubber.state = PG::Scrubber::WAIT_REPLICAS;
4683         break;
4684
4685       case PG::Scrubber::WAIT_REPLICAS:
4686         if (scrubber.waiting_on > 0) {
4687           // will be requeued by sub_op_scrub_map
4688           dout(10) << "wait for replicas to build scrub map" << dendl;
4689           done = true;
4690         } else {
4691           scrubber.state = PG::Scrubber::COMPARE_MAPS;
4692         }
4693         break;
4694
4695       case PG::Scrubber::COMPARE_MAPS:
4696         assert(last_update_applied >= scrubber.subset_last_update);
4697         assert(scrubber.waiting_on == 0);
4698
4699         scrub_compare_maps();
4700         scrubber.start = scrubber.end;
4701         scrubber.run_callbacks();
4702
4703         // requeue the writes from the chunk that just finished
4704         requeue_ops(waiting_for_scrub);
4705
4706         scrubber.state = PG::Scrubber::WAIT_DIGEST_UPDATES;
4707
4708         // fall-thru
4709
4710       case PG::Scrubber::WAIT_DIGEST_UPDATES:
4711         if (scrubber.num_digest_updates_pending) {
4712           dout(10) << __func__ << " waiting on "
4713                    << scrubber.num_digest_updates_pending
4714                    << " digest updates" << dendl;
4715           done = true;
4716           break;
4717         }
4718
4719         if (!(scrubber.end.is_max())) {
4720           scrubber.state = PG::Scrubber::NEW_CHUNK;
4721           requeue_scrub();
4722           done = true;
4723         } else {
4724           scrubber.state = PG::Scrubber::FINISH;
4725         }
4726
4727         break;
4728
4729       case PG::Scrubber::FINISH:
4730         scrub_finish();
4731         scrubber.state = PG::Scrubber::INACTIVE;
4732         done = true;
4733
4734         if (!snap_trimq.empty()) {
4735           dout(10) << "scrub finished, requeuing snap_trimmer" << dendl;
4736           snap_trimmer_scrub_complete();
4737         }
4738
4739         break;
4740
4741       default:
4742         ceph_abort();
4743     }
4744   }
4745   dout(20) << "scrub final state " << Scrubber::state_string(scrubber.state)
4746            << " [" << scrubber.start << "," << scrubber.end << ")" << dendl;
4747 }
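     // Rough shape of the per-chunk loop above (states from PG::Scrubber), as
     // visible in this switch; each "done = true" exits and waits to be requeued
     // (by op_applied, sub_op_scrub_map, etc.), so the scrub advances one chunk
     // at a time:
     //
     //   NEW_CHUNK           -> request scrub maps from the replicas, then WAIT_PUSHES
     //   WAIT_PUSHES         -> wait for active_pushes == 0
     //   WAIT_LAST_UPDATE    -> wait until last_update_applied >= subset_last_update
     //   BUILD_MAP           -> build the primary's own map for [start, end); on error,
     //                          clear scrub state and unreserve the replicas
     //   WAIT_REPLICAS       -> wait until all replica maps have arrived (waiting_on == 0)
     //   COMPARE_MAPS        -> scrub_compare_maps(), requeue ops blocked on scrub,
     //                          fall through to WAIT_DIGEST_UPDATES
     //   WAIT_DIGEST_UPDATES -> once pending digest updates drain, loop back to
     //                          NEW_CHUNK, or go to FINISH when end is max
     //   FINISH              -> scrub_finish(), back to INACTIVE (and requeue the
     //                          snap trimmer if snap_trimq is non-empty)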
4748
4749 void PG::scrub_clear_state()
4750 {
4751   assert(is_locked());
4752   state_clear(PG_STATE_SCRUBBING);
4753   state_clear(PG_STATE_REPAIR);
4754   state_clear(PG_STATE_DEEP_SCRUB);
4755   publish_stats_to_osd();
4756
4757   // active -> nothing.
4758   if (scrubber.active)
4759     osd->dec_scrubs_active();
4760
4761   requeue_ops(waiting_for_scrub);
4762
4763   scrubber.reset();
4764
4765   // type-specific state clear
4766   _scrub_clear_state();
4767 }
4768
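     // Compare the primary's scrub map for this chunk against the replica maps.
     // be_compare_scrubmaps() fills in the missing/inconsistent sets and picks an
     // authoritative peer per broken object; the authoritative copies are folded
     // back into cleaned_meta_map, which (minus any objects sharing the final,
     // possibly chunk-straddling head) is handed to scrub_snapshot_metadata() for
     // the pg-type-specific checks. Accumulated scrub-store entries are persisted
     // unless we are repairing, in which case they are discarded.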
4769 void PG::scrub_compare_maps() 
4770 {
4771   dout(10) << __func__ << " has maps, analyzing" << dendl;
4772
4773   // construct authoritative scrub map for type specific scrubbing
4774   scrubber.cleaned_meta_map.insert(scrubber.primary_scrubmap);
4775   map<hobject_t, pair<uint32_t, uint32_t>> missing_digest;
4776
4777   if (acting.size() > 1) {
4778     dout(10) << __func__ << "  comparing replica scrub maps" << dendl;
4779
4780     stringstream ss;
4781
4782     // Map from object with errors to good peer
4783     map<hobject_t, list<pg_shard_t>> authoritative;
4784     map<pg_shard_t, ScrubMap *> maps;
4785
4786     dout(2) << __func__ << "   osd." << acting[0] << " has "
4787             << scrubber.primary_scrubmap.objects.size() << " items" << dendl;
4788     maps[pg_whoami] = &scrubber.primary_scrubmap;
4789
4790     for (set<pg_shard_t>::iterator i = actingbackfill.begin();
4791          i != actingbackfill.end();
4792          ++i) {
4793       if (*i == pg_whoami) continue;
4794       dout(2) << __func__ << " replica " << *i << " has "
4795               << scrubber.received_maps[*i].objects.size()
4796               << " items" << dendl;
4797       maps[*i] = &scrubber.received_maps[*i];
4798     }
4799
4800     get_pgbackend()->be_compare_scrubmaps(
4801       maps,
4802       state_test(PG_STATE_REPAIR),
4803       scrubber.missing,
4804       scrubber.inconsistent,
4805       authoritative,
4806       missing_digest,
4807       scrubber.shallow_errors,
4808       scrubber.deep_errors,
4809       scrubber.store.get(),
4810       info.pgid, acting,
4811       ss);
4812     dout(2) << ss.str() << dendl;
4813
4814     if (!ss.str().empty()) {
4815       osd->clog->error(ss);
4816     }
4817
4818     for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4819          i != authoritative.end();
4820          ++i) {
4821       list<pair<ScrubMap::object, pg_shard_t> > good_peers;
4822       for (list<pg_shard_t>::const_iterator j = i->second.begin();
4823            j != i->second.end();
4824            ++j) {
4825         good_peers.push_back(make_pair(maps[*j]->objects[i->first], *j));
4826       }
4827       scrubber.authoritative.insert(
4828         make_pair(
4829           i->first,
4830           good_peers));
4831     }
4832
4833     for (map<hobject_t, list<pg_shard_t>>::iterator i = authoritative.begin();
4834          i != authoritative.end();
4835          ++i) {
4836       scrubber.cleaned_meta_map.objects.erase(i->first);
4837       scrubber.cleaned_meta_map.objects.insert(
4838         *(maps[i->second.back()]->objects.find(i->first))
4839         );
4840     }
4841   }
4842
4843   ScrubMap for_meta_scrub;
4844   if (scrubber.end.is_max() ||
4845       scrubber.cleaned_meta_map.objects.empty()) {
4846     scrubber.cleaned_meta_map.swap(for_meta_scrub);
4847   } else {
4848     auto iter = scrubber.cleaned_meta_map.objects.end();
4849     --iter; // not empty, see if clause
4850     auto begin = scrubber.cleaned_meta_map.objects.begin();
4851     while (iter != begin) {
4852       auto next = iter--;
4853       if (next->first.get_head() != iter->first.get_head()) {
4854         ++iter;
4855         break;
4856       }
4857     }
4858     for_meta_scrub.objects.insert(begin, iter);
4859     scrubber.cleaned_meta_map.objects.erase(begin, iter);
4860   }
4861
4862   // ok, do the pg-type specific scrubbing
4863   scrub_snapshot_metadata(for_meta_scrub, missing_digest);
4864   if (!scrubber.store->empty()) {
4865     if (state_test(PG_STATE_REPAIR)) {
4866       dout(10) << __func__ << ": discarding scrub results" << dendl;
4867       scrubber.store->flush(nullptr);
4868     } else {
4869       dout(10) << __func__ << ": updating scrub object" << dendl;
4870       ObjectStore::Transaction t;
4871       scrubber.store->flush(&t);
4872       osd->store->queue_transaction(osr.get(), std::move(t), nullptr);
4873     }
4874   }
4875 }
4876
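     // Walk scrubber.authoritative and, when repair is set, call repair_object()
     // for every shard recorded as missing or inconsistent for that object.
     // Returns true only if this was a repair and there was something
     // authoritative to act on, i.e. the caller (scrub_finish) should kick off
     // recovery afterwards.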
4877 bool PG::scrub_process_inconsistent()
4878 {
4879   dout(10) << __func__ << ": checking authoritative" << dendl;
4880   bool repair = state_test(PG_STATE_REPAIR);
4881   bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4882   const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4883   
4884   // scrubber.authoritative only stores objects that are missing or inconsistent.
4885   if (!scrubber.authoritative.empty()) {
4886     stringstream ss;
4887     ss << info.pgid << " " << mode << " "
4888        << scrubber.missing.size() << " missing, "
4889        << scrubber.inconsistent.size() << " inconsistent objects";
4890     dout(2) << ss.str() << dendl;
4891     osd->clog->error(ss);
4892     if (repair) {
4893       state_clear(PG_STATE_CLEAN);
4894       for (map<hobject_t, list<pair<ScrubMap::object, pg_shard_t> >>::iterator i =
4895              scrubber.authoritative.begin();
4896            i != scrubber.authoritative.end();
4897            ++i) {
4898         set<pg_shard_t>::iterator j;
4899
4900         auto missing_entry = scrubber.missing.find(i->first);
4901         if (missing_entry != scrubber.missing.end()) {
4902           for (j = missing_entry->second.begin();
4903                j != missing_entry->second.end();
4904                ++j) {
4905             repair_object(
4906               i->first,
4907               &(i->second),
4908               *j);
4909             ++scrubber.fixed;
4910           }
4911         }
4912         if (scrubber.inconsistent.count(i->first)) {
4913           for (j = scrubber.inconsistent[i->first].begin(); 
4914                j != scrubber.inconsistent[i->first].end(); 
4915                ++j) {
4916             repair_object(i->first, 
4917               &(i->second),
4918               *j);
4919             ++scrubber.fixed;
4920           }
4921         }
4922       }
4923     }
4924   }
4925   return (!scrubber.authoritative.empty() && repair);
4926 }
4927
4928 bool PG::ops_blocked_by_scrub() const {
4929   return (waiting_for_scrub.size() != 0);
4930 }
4931
4932 // the part that actually finalizes a scrub
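     // (tallies shallow/deep error counts into the pg stats, updates the
     // last_scrub/last_deep_scrub history stamps, persists the dirtied info, and
     // queues a DoRecovery event when a repair actually fixed anything)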
4933 void PG::scrub_finish() 
4934 {
4935   bool repair = state_test(PG_STATE_REPAIR);
4936   // if the repair request came from auto-repair and there are too many
4937   // errors, cancel the auto-repair
4938   if (repair && scrubber.auto_repair
4939       && scrubber.authoritative.size() > cct->_conf->osd_scrub_auto_repair_num_errors) {
4940     state_clear(PG_STATE_REPAIR);
4941     repair = false;
4942   }
4943   bool deep_scrub = state_test(PG_STATE_DEEP_SCRUB);
4944   const char *mode = (repair ? "repair": (deep_scrub ? "deep-scrub" : "scrub"));
4945
4946   // type-specific finish (can tally more errors)
4947   _scrub_finish();
4948
4949   bool has_error = scrub_process_inconsistent();
4950
4951   {
4952     stringstream oss;
4953     oss << info.pgid.pgid << " " << mode << " ";
4954     int total_errors = scrubber.shallow_errors + scrubber.deep_errors;
4955     if (total_errors)
4956       oss << total_errors << " errors";
4957     else
4958       oss << "ok";
4959     if (!deep_scrub && info.stats.stats.sum.num_deep_scrub_errors)
4960       oss << " ( " << info.stats.stats.sum.num_deep_scrub_errors
4961           << " remaining deep scrub error details lost)";
4962     if (repair)
4963       oss << ", " << scrubber.fixed << " fixed";
4964     if (total_errors)
4965       osd->clog->error(oss);
4966     else
4967       osd->clog->debug(oss);
4968   }
4969
4970   // finish up
4971   unreg_next_scrub();
4972   utime_t now = ceph_clock_now();
4973   info.history.last_scrub = info.last_update;
4974   info.history.last_scrub_stamp = now;
4975   if (scrubber.deep) {
4976     info.history.last_deep_scrub = info.last_update;
4977     info.history.last_deep_scrub_stamp = now;
4978   }
4979   // Since we don't know which errors were fixed, we can only clear them
4980   // when every one has been fixed.
4981   if (repair) {
4982     if (scrubber.fixed == scrubber.shallow_errors + scrubber.deep_errors) {
4983       assert(deep_scrub);
4984       scrubber.shallow_errors = scrubber.deep_errors = 0;
4985     } else {
4986       // Deep scrub in order to get corrected error counts
4987       scrub_after_recovery = true;
4988     }
4989   }
4990   if (deep_scrub) {
4991     if ((scrubber.shallow_errors == 0) && (scrubber.deep_errors == 0))
4992       info.history.last_clean_scrub_stamp = now;
4993     info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
4994     info.stats.stats.sum.num_deep_scrub_errors = scrubber.deep_errors;
4995   } else {
4996     info.stats.stats.sum.num_shallow_scrub_errors = scrubber.shallow_errors;
4997     // XXX: last_clean_scrub_stamp doesn't mean the pg is not inconsistent
4998     // because of deep-scrub errors
4999     if (scrubber.shallow_errors == 0)
5000       info.history.last_clean_scrub_stamp = now;
5001   }
5002   info.stats.stats.sum.num_scrub_errors = 
5003     info.stats.stats.sum.num_shallow_scrub_errors +
5004     info.stats.stats.sum.num_deep_scrub_errors;
5005   reg_next_scrub();
5006
5007   {
5008     ObjectStore::Transaction t;
5009     dirty_info = true;
5010     write_if_dirty(t);
5011     int tr = osd->store->queue_transaction(osr.get(), std::move(t), NULL);
5012     assert(tr == 0);
5013   }
5014
5015
5016   if (has_error) {
5017     queue_peering_event(
5018       CephPeeringEvtRef(
5019         std::make_shared<CephPeeringEvt>(
5020           get_osdmap()->get_epoch(),
5021           get_osdmap()->get_epoch(),
5022           DoRecovery())));
5023   }
5024
5025   scrub_clear_state();
5026   scrub_unreserve_replicas();
5027
5028   if (is_active() && is_primary()) {
5029     share_pg_info();
5030   }
5031 }
5032
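     // Push the primary's current pg_info_t to every peer in actingbackfill via
     // MOSDPGInfo, first folding our last_epoch_started/last_interval_started and
     // history into the cached peer_info so the local view matches what was sent.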
5033 void PG::share_pg_info()
5034 {
5035   dout(10) << "share_pg_info" << dendl;
5036
5037   // share new pg_info_t with replicas
5038   assert(!actingbackfill.empty());
5039   for (set<pg_shard_t>::iterator i = actingbackfill.begin();
5040        i != actingbackfill.end();
5041        ++i) {
5042     if (*i == pg_whoami) continue;
5043     pg_shard_t peer = *i;
5044     if (peer_info.count(peer)) {
5045       peer_info[peer].last_epoch_started = info.last_epoch_started;
5046       peer_info[peer].last_interval_started = info.last_interval_started;
5047       peer_info[peer].history.merge(info.history);
5048     }
5049     MOSDPGInfo *m = new MOSDPGInfo(get_osdmap()->get_epoch());
5050     m->pg_list.push_back(
5051       make_pair(
5052         pg_notify_t(
5053           peer.shard, pg_whoami.shard,
5054           get_osdmap()->get_epoch(),
5055           get_osdmap()->get_epoch(),
5056           info),
5057         PastIntervals()));
5058     osd->send_message_osd_cluster(peer.osd, m, get_osdmap()->get_epoch());
5059   }
5060 }
5061
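     // Append a batch of new log entries to the local pg log, updating the
     // missing set through PGLogEntryHandler and advancing last_update (and
     // last_complete, when nothing is missing). Returns true if the appended
     // entries invalidated the pg stats.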
5062 bool PG::append_log_entries_update_missing(
5063   const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5064   ObjectStore::Transaction &t)
5065 {
5066   assert(!entries.empty());
5067   assert(entries.begin()->version > info.last_update);
5068
5069   PGLogEntryHandler rollbacker{this, &t};
5070   bool invalidate_stats =
5071     pg_log.append_new_log_entries(info.last_backfill,
5072                                   info.last_backfill_bitwise,
5073                                   entries,
5074                                   &rollbacker);
5075   info.last_update = pg_log.get_head();
5076
5077   if (pg_log.get_missing().num_missing() == 0) {
5078     // advance last_complete since nothing else is missing!
5079     info.last_complete = info.last_update;
5080   }
5081
5082   info.stats.stats_invalid = info.stats.stats_invalid || invalidate_stats;
5083   dirty_info = true;
5084   write_if_dirty(t);
5085   return invalidate_stats;
5086 }
5087
5088
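     // Primary-side wrapper around append_log_entries_update_missing(): apply the
     // entries locally, mirror them into each peer's cached peer_missing and
     // peer_info, and, if any of those appends invalidated stats, rebuild
     // missing_loc for the affected objects.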
5089 void PG::merge_new_log_entries(
5090   const mempool::osd_pglog::list<pg_log_entry_t> &entries,
5091   ObjectStore::Transaction &t)
5092 {
5093   dout(10) << __func__ << " " << entries << dendl;
5094   assert(is_primary());
5095
5096   bool rebuild_missing = append_log_entries_update_missing(entries, t);
5097   for (set<pg_shard_t>::const_iterator i = actingbackfill.begin();
5098        i != actingbackfill.end();
5099        ++i) {
5100     pg_shard_t peer(*i);
5101     if (peer == pg_whoami) continue;
5102     assert(peer_missing.count(peer));
5103     assert(peer_info.count(peer));
5104     pg_missing_t& pmissing(peer_missing[peer]);
5105     dout(20) << __func__ << " peer_missing for " << peer << " = " << pmissing << dendl;
5106     pg_info_t& pinfo(peer_info[peer]);
5107     bool invalidate_stats = PGLog::append_log_entries_update_missing(
5108       pinfo.last_backfill,
5109       info.last_backfill_bitwise,
5110       entries,
5111       true,
5112       NULL,
5113       pmissing,
5114       NULL,
5115       this);
5116     pinfo.last_update = info.last_update;
5117     pinfo.stats.stats_invalid = pinfo.stats.stats_invalid || invalidate_stats;
5118     rebuild_missing = rebuild_missing || invalidate_stats;
5119   }
5120
5121   if (!rebuild_missing) {
5122     return;
5123   }
5124
5125   for (auto &&i: entries) {
5126     missing_loc.rebuild(
5127       i.soid,
5128       pg_whoami,
5129       actingbackfill,
5130       info,
5131       pg_log.get_missing(),
5132       peer_missing,
5133       peer_info);
5134   }
5135 }
5136
5137 void PG::update_history(const pg_history_t& new_history)
5138 {
5139   unreg_next_scrub();
5140   if (info.history.merge(new_history)) {
5141     dout(20) << __func__ << " advanced history from " << new_history << dendl;
5142     dirty_info = true;
5143     if (info.history.last_epoch_clean >= info.history.same_interval_since) {
5144       dout(20) << __func__ << " clearing past_intervals" << dendl;
5145       past_intervals.clear();
5146       dirty_big_info = true;
5147     }
5148   }
5149   reg_next_scrub();
5150 }
5151
5152 void PG::fulfill_info(
5153   pg_shard_t from, const pg_query_t &query,
5154   pair<pg_shard_t, pg_info_t> &notify_info)
5155 {
5156   assert(from == primary);
5157   assert(query.type == pg_query_t::INFO);
5158
5159   // info
5160   dout(10) << "sending info" << dendl;
5161   notify_info = make_pair(from, info);
5162 }
5163
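     // Answer a pg_query_t::LOG or FULLLOG from the primary with an MOSDPGLog
     // carrying our info and missing set; for LOG we copy the log since
     // query.since, falling back to the full log if the requested bound is older
     // than our log tail.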
5164 void PG::fulfill_log(
5165   pg_shard_t from, const pg_query_t &query, epoch_t query_epoch)
5166 {
5167   dout(10) << "log request from " << from << dendl;
5168   assert(from == primary);
5169   assert(query.type != pg_query_t::INFO);
5170   ConnectionRef con = osd->get_con_osd_cluster(
5171     from.osd, get_osdmap()->get_epoch());
5172   if (!con) return;
5173
5174   MOSDPGLog *mlog = new MOSDPGLog(
5175     from.shard, pg_whoami.shard,
5176     get_osdmap()->get_epoch(),
5177     info, query_epoch);
5178   mlog->missing = pg_log.get_missing();
5179
5180   // primary -> other, when building master log
5181   if (query.type == pg_query_t::LOG) {
5182     dout(10) << " sending info+missing+log since " << query.since
5183              << dendl;
5184     if (query.since != eversion_t() && query.since < pg_log.get_tail()) {
5185       osd->clog->error() << info.pgid << " got broken pg_query_t::LOG since " << query.since
5186                         << " when my log.tail is " << pg_log.get_tail()
5187                         << ", sending full log instead";
5188       mlog->log = pg_log.get_log();           // primary should not have requested this!!
5189     } else
5190       mlog->log.copy_after(pg_log.get_log(), query.since);
5191   }
5192   else if (query.type == pg_query_t::FULLLOG) {
5193     dout(10) << " sending info+missing+full log" << dendl;
5194     mlog->log = pg_log.get_log();
5195   }
5196
5197   dout(10) << " sending " << mlog->log << " " << mlog->missing << dendl;
5198
5199   osd->share_map_peer(from.osd, con.get(), get_osdmap());
5200   osd->send_message_osd_cluster(mlog, con.get());
5201 }
5202
5203 void PG::check_full_transition(OSDMapRef lastmap, OSDMapRef osdmap)
5204 {
5205   bool changed = false;
5206   if (osdmap->test_flag(CEPH_OSDMAP_FULL) &&
5207       !lastmap->test_flag(CEPH_OSDMAP_FULL)) {
5208     dout(10) << " cluster was marked full in " << osdmap->get_epoch() << dendl;
5209     changed = true;
5210   }
5211   const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5212   assert(pi);
5213   if (pi->has_flag(pg_pool_t::FLAG_FULL)) {
5214     const pg_pool_t *opi = lastmap->get_pg_pool(info.pgid.pool());
5215     if (!opi || !opi->has_flag(pg_pool_t::FLAG_FULL)) {
5216       dout(10) << " pool was marked full in " << osdmap->get_epoch() << dendl;
5217       changed = true;
5218     }
5219   }
5220   if (changed) {
5221     info.history.last_epoch_marked_full = osdmap->get_epoch();
5222     dirty_info = true;
5223   }
5224 }
5225
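     // Peering must restart whenever the mapping for this pg changes enough to
     // start a new past interval: a different up or acting set, a different
     // primary, or the other conditions tracked by PastIntervals::is_new_interval().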
5226 bool PG::should_restart_peering(
5227   int newupprimary,
5228   int newactingprimary,
5229   const vector<int>& newup,
5230   const vector<int>& newacting,
5231   OSDMapRef lastmap,
5232   OSDMapRef osdmap)
5233 {
5234   if (PastIntervals::is_new_interval(
5235         primary.osd,
5236         newactingprimary,
5237         acting,
5238         newacting,
5239         up_primary.osd,
5240         newupprimary,
5241         up,
5242         newup,
5243         osdmap,
5244         lastmap,
5245         info.pgid.pgid)) {
5246     dout(20) << "new interval newup " << newup
5247              << " newacting " << newacting << dendl;
5248     return true;
5249   } else {
5250     return false;
5251   }
5252 }
5253
5254 bool PG::old_peering_msg(epoch_t reply_epoch, epoch_t query_epoch)
5255 {
5256   if (last_peering_reset > reply_epoch ||
5257       last_peering_reset > query_epoch) {
5258     dout(10) << "old_peering_msg reply_epoch " << reply_epoch << " query_epoch " << query_epoch
5259              << " last_peering_reset " << last_peering_reset
5260              << dendl;
5261     return true;
5262   }
5263   return false;
5264 }
5265
5266 void PG::set_last_peering_reset()
5267 {
5268   dout(20) << "set_last_peering_reset " << get_osdmap()->get_epoch() << dendl;
5269   if (last_peering_reset != get_osdmap()->get_epoch()) {
5270     last_peering_reset = get_osdmap()->get_epoch();
5271     reset_interval_flush();
5272   }
5273 }
5274
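     // RAII trigger used by start_flush() below: the same FlushStateRef is held
     // by both the on_applied and on_safe contexts, so the destructor runs only
     // after both have completed, at which point it queues a flushed event for
     // the recorded epoch (unless the pg has been reset since).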
5275 struct FlushState {
5276   PGRef pg;
5277   epoch_t epoch;
5278   FlushState(PG *pg, epoch_t epoch) : pg(pg), epoch(epoch) {}
5279   ~FlushState() {
5280     pg->lock();
5281     if (!pg->pg_has_reset_since(epoch))
5282       pg->queue_flushed(epoch);
5283     pg->unlock();
5284   }
5285 };
5286 typedef ceph::shared_ptr<FlushState> FlushStateRef;
5287
5288 void PG::start_flush(ObjectStore::Transaction *t,
5289                      list<Context *> *on_applied,
5290                      list<Context *> *on_safe)
5291 {
5292   // flush in progress ops
5293   FlushStateRef flush_trigger (std::make_shared<FlushState>(
5294                                this, get_osdmap()->get_epoch()));
5295   t->nop();
5296   flushes_in_progress++;
5297   on_applied->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5298   on_safe->push_back(new ContainerContext<FlushStateRef>(flush_trigger));
5299 }
5300
5301 void PG::reset_interval_flush()
5302 {
5303   dout(10) << "Clearing blocked outgoing recovery messages" << dendl;
5304   recovery_state.clear_blocked_outgoing();
5305   
5306   Context *c = new QueuePeeringEvt<IntervalFlush>(
5307     this, get_osdmap()->get_epoch(), IntervalFlush());
5308   if (!osr->flush_commit(c)) {
5309     dout(10) << "Beginning to block outgoing recovery messages" << dendl;
5310     recovery_state.begin_block_outgoing();
5311   } else {
5312     dout(10) << "Not blocking outgoing recovery messages" << dendl;
5313     delete c;
5314   }
5315 }
5316
5317 /* Called before initializing peering during advance_map */
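     /* Swaps in the new up/acting sets, records any new interval in
      * past_intervals, clears per-interval state (peer_purged, actingbackfill,
      * primary state, blocked ops), and decides whether we should notify the new
      * primary that we are here. */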
5318 void PG::start_peering_interval(
5319   const OSDMapRef lastmap,
5320   const vector<int>& newup, int new_up_primary,
5321   const vector<int>& newacting, int new_acting_primary,
5322   ObjectStore::Transaction *t)
5323 {
5324   const OSDMapRef osdmap = get_osdmap();
5325
5326   set_last_peering_reset();
5327
5328   vector<int> oldacting, oldup;
5329   int oldrole = get_role();
5330
5331   unreg_next_scrub();
5332
5333   pg_shard_t old_acting_primary = get_primary();
5334   pg_shard_t old_up_primary = up_primary;
5335   bool was_old_primary = is_primary();
5336
5337   acting.swap(oldacting);
5338   up.swap(oldup);
5339   init_primary_up_acting(
5340     newup,
5341     newacting,
5342     new_up_primary,
5343     new_acting_primary);
5344
5345   if (info.stats.up != up ||
5346       info.stats.acting != acting ||
5347       info.stats.up_primary != new_up_primary ||
5348       info.stats.acting_primary != new_acting_primary) {
5349     info.stats.up = up;
5350     info.stats.up_primary = new_up_primary;
5351     info.stats.acting = acting;
5352     info.stats.acting_primary = new_acting_primary;
5353     info.stats.mapping_epoch = osdmap->get_epoch();
5354   }
5355
5356   pg_stats_publish_lock.Lock();
5357   pg_stats_publish_valid = false;
5358   pg_stats_publish_lock.Unlock();
5359
5360   // This will now be flagged as remapped during a backfill in cases
5361   // where it would not have been before.
5362   if (up != acting)
5363     state_set(PG_STATE_REMAPPED);
5364   else
5365     state_clear(PG_STATE_REMAPPED);
5366
5367   int role = osdmap->calc_pg_role(osd->whoami, acting, acting.size());
5368   if (pool.info.is_replicated() || role == pg_whoami.shard)
5369     set_role(role);
5370   else
5371     set_role(-1);
5372
5373   // did acting, up, primary|acker change?
5374   if (!lastmap) {
5375     dout(10) << " no lastmap" << dendl;
5376     dirty_info = true;
5377     dirty_big_info = true;
5378     info.history.same_interval_since = osdmap->get_epoch();
5379   } else {
5380     std::stringstream debug;
5381     assert(info.history.same_interval_since != 0);
5382     boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
5383       get_is_recoverable_predicate());
5384     bool new_interval = PastIntervals::check_new_interval(
5385       old_acting_primary.osd,
5386       new_acting_primary,
5387       oldacting, newacting,
5388       old_up_primary.osd,
5389       new_up_primary,
5390       oldup, newup,
5391       info.history.same_interval_since,
5392       info.history.last_epoch_clean,
5393       osdmap,
5394       lastmap,
5395       info.pgid.pgid,
5396       recoverable.get(),
5397       &past_intervals,
5398       &debug);
5399     dout(10) << __func__ << ": check_new_interval output: "
5400              << debug.str() << dendl;
5401     if (new_interval) {
5402       if (osdmap->get_epoch() == osd->get_superblock().oldest_map &&
5403           info.history.last_epoch_clean < osdmap->get_epoch()) {
5404         dout(10) << " map gap, clearing past_intervals and faking" << dendl;
5405         // our information is incomplete and useless; someone else was clean
5406         // after everything we know if osdmaps were trimmed.
5407         past_intervals.clear();
5408       } else {
5409         dout(10) << " noting past " << past_intervals << dendl;
5410       }
5411       dirty_info = true;
5412       dirty_big_info = true;
5413       info.history.same_interval_since = osdmap->get_epoch();
5414       if (info.pgid.pgid.is_split(lastmap->get_pg_num(info.pgid.pgid.pool()),
5415                                   osdmap->get_pg_num(info.pgid.pgid.pool()),
5416                                   nullptr)) {
5417         info.history.last_epoch_split = osdmap->get_epoch();
5418       }
5419     }
5420   }
5421
5422   if (old_up_primary != up_primary ||
5423       oldup != up) {
5424     info.history.same_up_since = osdmap->get_epoch();
5425   }
5426   // this comparison includes primary rank via pg_shard_t
5427   if (old_acting_primary != get_primary()) {
5428     info.history.same_primary_since = osdmap->get_epoch();
5429   }
5430
5431   on_new_interval();
5432
5433   dout(1) << __func__ << " up " << oldup << " -> " << up
5434            << ", acting " << oldacting << " -> " << acting 
5435            << ", acting_primary " << old_acting_primary << " -> " << new_acting_primary
5436            << ", up_primary " << old_up_primary << " -> " << new_up_primary
5437            << ", role " << oldrole << " -> " << role
5438            << ", features acting " << acting_features
5439            << " upacting " << upacting_features
5440            << dendl;
5441
5442   // deactivate.
5443   state_clear(PG_STATE_ACTIVE);
5444   state_clear(PG_STATE_PEERED);
5445   state_clear(PG_STATE_DOWN);
5446   state_clear(PG_STATE_RECOVERY_WAIT);
5447   state_clear(PG_STATE_RECOVERY_TOOFULL);
5448   state_clear(PG_STATE_RECOVERING);
5449
5450   peer_purged.clear();
5451   actingbackfill.clear();
5452   scrub_queued = false;
5453
5454   // reset primary state?
5455   if (was_old_primary || is_primary()) {
5456     osd->remove_want_pg_temp(info.pgid.pgid);
5457   }
5458   clear_primary_state();
5459
5460     
5461   // pg->on_*
5462   on_change(t);
5463
5464   projected_last_update = eversion_t();
5465
5466   assert(!deleting);
5467
5468   // should we tell the primary we are here?
5469   send_notify = !is_primary();
5470
5471   if (role != oldrole ||
5472       was_old_primary != is_primary()) {
5473     // did primary change?
5474     if (was_old_primary != is_primary()) {
5475       state_clear(PG_STATE_CLEAN);
5476       clear_publish_stats();
5477     }
5478
5479     on_role_change();
5480
5481     // take active waiters
5482     requeue_ops(waiting_for_peered);
5483
5484   } else {
5485     // no role change.
5486     // did primary change?
5487     if (get_primary() != old_acting_primary) {    
5488       dout(10) << *this << " " << oldacting << " -> " << acting 
5489                << ", acting primary " 
5490                << old_acting_primary << " -> " << get_primary() 
5491                << dendl;
5492     } else {
5493       // primary is the same.
5494       if (is_primary()) {
5495         // i am (still) primary. but my replica set changed.
5496         state_clear(PG_STATE_CLEAN);
5497           
5498         dout(10) << oldacting << " -> " << acting
5499                  << ", replicas changed" << dendl;
5500       }
5501     }
5502   }
5503   cancel_recovery();
5504
5505   if (acting.empty() && !up.empty() && up_primary == pg_whoami) {
5506     dout(10) << " acting empty, but i am up[0], clearing pg_temp" << dendl;
5507     osd->queue_want_pg_temp(info.pgid.pgid, acting);
5508   }
5509 }
5510
5511 void PG::on_new_interval()
5512 {
5513   const OSDMapRef osdmap = get_osdmap();
5514
5515   reg_next_scrub();
5516
5517   // initialize features
5518   acting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5519   upacting_features = CEPH_FEATURES_SUPPORTED_DEFAULT;
5520   for (vector<int>::iterator p = acting.begin(); p != acting.end(); ++p) {
5521     if (*p == CRUSH_ITEM_NONE)
5522       continue;
5523     uint64_t f = osdmap->get_xinfo(*p).features;
5524     acting_features &= f;
5525     upacting_features &= f;
5526   }
5527   for (vector<int>::iterator p = up.begin(); p != up.end(); ++p) {
5528     if (*p == CRUSH_ITEM_NONE)
5529       continue;
5530     upacting_features &= osdmap->get_xinfo(*p).features;
5531   }
5532
5533   _on_new_interval();
5534 }
5535
5536 void PG::proc_primary_info(ObjectStore::Transaction &t, const pg_info_t &oinfo)
5537 {
5538   assert(!is_primary());
5539
5540   update_history(oinfo.history);
5541
5542   if (last_complete_ondisk.epoch >= info.history.last_epoch_started) {
5543     // DEBUG: verify that the snaps are empty in snap_mapper
5544     if (cct->_conf->osd_debug_verify_snaps_on_info) {
5545       interval_set<snapid_t> p;
5546       p.union_of(oinfo.purged_snaps, info.purged_snaps);
5547       p.subtract(info.purged_snaps);
5548       if (!p.empty()) {
5549         for (interval_set<snapid_t>::iterator i = p.begin();
5550              i != p.end();
5551              ++i) {
5552           for (snapid_t snap = i.get_start();
5553                snap != i.get_len() + i.get_start();
5554                ++snap) {
5555             vector<hobject_t> hoids;
5556             int r = snap_mapper.get_next_objects_to_trim(snap, 1, &hoids);
5557             if (r != 0 && r != -ENOENT) {
5558               derr << __func__ << ": snap_mapper get_next_object_to_trim returned "
5559                    << cpp_strerror(r) << dendl;
5560               ceph_abort();
5561             } else if (r != -ENOENT) {
5562               assert(!hoids.empty());
5563               derr << __func__ << ": snap_mapper get_next_object_to_trim returned "
5564                    << cpp_strerror(r) << " for object "
5565                    << hoids[0] << " on snap " << snap
5566                    << " which should have been fully trimmed " << dendl;
5567               ceph_abort();
5568             }
5569           }
5570         }
5571       }
5572     }
5573     info.purged_snaps = oinfo.purged_snaps;
5574     dirty_info = true;
5575     dirty_big_info = true;
5576   }
5577 }
5578
5579 ostream& operator<<(ostream& out, const PG& pg)
5580 {
5581   out << "pg[" << pg.info
5582       << " " << pg.up;
5583   if (pg.acting != pg.up)
5584     out << "/" << pg.acting;
5585   out << " r=" << pg.get_role();
5586   out << " lpr=" << pg.get_last_peering_reset();
5587
5588   if (!pg.past_intervals.empty()) {
5589     out << " pi=[" << pg.past_intervals.get_bounds()
5590         << ")/" << pg.past_intervals.size();
5591   }
5592
5593   if (pg.is_peered()) {
5594     if (pg.last_update_ondisk != pg.info.last_update)
5595       out << " luod=" << pg.last_update_ondisk;
5596     if (pg.last_update_applied != pg.info.last_update)
5597       out << " lua=" << pg.last_update_applied;
5598   }
5599
5600   if (pg.recovery_ops_active)
5601     out << " rops=" << pg.recovery_ops_active;
5602
5603   if (pg.pg_log.get_tail() != pg.info.log_tail ||
5604       pg.pg_log.get_head() != pg.info.last_update)
5605     out << " (info mismatch, " << pg.pg_log.get_log() << ")";
5606
5607   if (!pg.pg_log.get_log().empty()) {
5608     if ((pg.pg_log.get_log().log.begin()->version <= pg.pg_log.get_tail())) {
5609       out << " (log bound mismatch, actual=["
5610           << pg.pg_log.get_log().log.begin()->version << ","
5611           << pg.pg_log.get_log().log.rbegin()->version << "]";
5612       out << ")";
5613     }
5614   }
5615
5616   if (!pg.backfill_targets.empty())
5617     out << " bft=" << pg.backfill_targets;
5618   out << " crt=" << pg.pg_log.get_can_rollback_to();
5619
5620   if (pg.last_complete_ondisk != pg.info.last_complete)
5621     out << " lcod " << pg.last_complete_ondisk;
5622
5623   if (pg.is_primary()) {
5624     out << " mlcod " << pg.min_last_complete_ondisk;
5625   }
5626
5627   out << " " << pg_state_string(pg.get_state());
5628   if (pg.should_send_notify())
5629     out << " NOTIFY";
5630
5631   if (pg.scrubber.must_repair)
5632     out << " MUST_REPAIR";
5633   if (pg.scrubber.auto_repair)
5634     out << " AUTO_REPAIR";
5635   if (pg.scrubber.must_deep_scrub)
5636     out << " MUST_DEEP_SCRUB";
5637   if (pg.scrubber.must_scrub)
5638     out << " MUST_SCRUB";
5639
5640   //out << " (" << pg.pg_log.get_tail() << "," << pg.pg_log.get_head() << "]";
5641   if (pg.pg_log.get_missing().num_missing()) {
5642     out << " m=" << pg.pg_log.get_missing().num_missing();
5643     if (pg.is_primary()) {
5644       uint64_t unfound = pg.get_num_unfound();
5645       if (unfound)
5646         out << " u=" << unfound;
5647     }
5648   }
5649   if (pg.snap_trimq.size())
5650     out << " snaptrimq=" << pg.snap_trimq;
5651
5652   out << "]";
5653
5654
5655   return out;
5656 }
5657
5658 bool PG::can_discard_op(OpRequestRef& op)
5659 {
5660   const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
5661   if (cct->_conf->osd_discard_disconnected_ops && OSD::op_is_discardable(m)) {
5662     dout(20) << " discard " << *m << dendl;
5663     return true;
5664   }
5665
5666   if (m->get_map_epoch() < info.history.same_primary_since) {
5667     dout(7) << " changed after " << m->get_map_epoch()
5668             << ", dropping " << *m << dendl;
5669     return true;
5670   }
5671
5672   if (m->get_connection()->has_feature(CEPH_FEATURE_RESEND_ON_SPLIT)) {
5673     if (m->get_map_epoch() < pool.info.get_last_force_op_resend()) {
5674       dout(7) << __func__ << " sent before last_force_op_resend "
5675               << pool.info.last_force_op_resend << ", dropping" << *m << dendl;
5676       return true;
5677     }
5678     if (m->get_map_epoch() < info.history.last_epoch_split) {
5679       dout(7) << __func__ << " pg split in "
5680               << info.history.last_epoch_split << ", dropping" << dendl;
5681       return true;
5682     }
5683   } else if (m->get_connection()->has_feature(CEPH_FEATURE_OSD_POOLRESEND)) {
5684     if (m->get_map_epoch() < pool.info.get_last_force_op_resend_preluminous()) {
5685       dout(7) << __func__ << " sent before last_force_op_resend_preluminous "
5686               << pool.info.last_force_op_resend_preluminous
5687               << ", dropping" << *m << dendl;
5688       return true;
5689     }
5690   }
5691
5692   return false;
5693 }
5694
5695 template<typename T, int MSGTYPE>
5696 bool PG::can_discard_replica_op(OpRequestRef& op)
5697 {
5698   const T *m = static_cast<const T *>(op->get_req());
5699   assert(m->get_type() == MSGTYPE);
5700
5701   int from = m->get_source().num();
5702
5703   // if a repop is replied after a replica goes down in a new osdmap, and
5704   // before the pg advances to this new osdmap, the repop replies before this
5705   // repop can be discarded by that replica OSD, because the primary resets the
5706   // connection to it when handling the new osdmap marking it down, and also
5707   // resets the messenger sesssion when the replica reconnects. to avoid the
5708   // out-of-order replies, the messages from that replica should be discarded.
5709   if (osd->get_osdmap()->is_down(from))
5710     return true;
5711   /* Mostly, this overlaps with the old_peering_msg
5712    * condition.  An important exception is pushes
5713    * sent by replicas not in the acting set, since
5714    * if such a replica goes down it does not cause
5715    * a new interval. */
5716   if (get_osdmap()->get_down_at(from) >= m->map_epoch)
5717     return true;
5718
5719   // same pg?
5720   //  if pg changes _at all_, we reset and repeer!
5721   if (old_peering_msg(m->map_epoch, m->map_epoch)) {
5722     dout(10) << "can_discard_replica_op pg changed " << info.history
5723              << " after " << m->map_epoch
5724              << ", dropping" << dendl;
5725     return true;
5726   }
5727   return false;
5728 }
5729
5730 bool PG::can_discard_scan(OpRequestRef op)
5731 {
5732   const MOSDPGScan *m = static_cast<const MOSDPGScan *>(op->get_req());
5733   assert(m->get_type() == MSG_OSD_PG_SCAN);
5734
5735   if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5736     dout(10) << " got old scan, ignoring" << dendl;
5737     return true;
5738   }
5739   return false;
5740 }
5741
5742 bool PG::can_discard_backfill(OpRequestRef op)
5743 {
5744   const MOSDPGBackfill *m = static_cast<const MOSDPGBackfill *>(op->get_req());
5745   assert(m->get_type() == MSG_OSD_PG_BACKFILL);
5746
5747   if (old_peering_msg(m->map_epoch, m->query_epoch)) {
5748     dout(10) << " got old backfill, ignoring" << dendl;
5749     return true;
5750   }
5751
5752   return false;
5753
5754 }
5755
5756 bool PG::can_discard_request(OpRequestRef& op)
5757 {
5758   switch (op->get_req()->get_type()) {
5759   case CEPH_MSG_OSD_OP:
5760     return can_discard_op(op);
5761   case CEPH_MSG_OSD_BACKOFF:
5762     return false; // never discard
5763   case MSG_OSD_SUBOP:
5764     return can_discard_replica_op<MOSDSubOp, MSG_OSD_SUBOP>(op);
5765   case MSG_OSD_REPOP:
5766     return can_discard_replica_op<MOSDRepOp, MSG_OSD_REPOP>(op);
5767   case MSG_OSD_PG_PUSH:
5768     return can_discard_replica_op<MOSDPGPush, MSG_OSD_PG_PUSH>(op);
5769   case MSG_OSD_PG_PULL:
5770     return can_discard_replica_op<MOSDPGPull, MSG_OSD_PG_PULL>(op);
5771   case MSG_OSD_PG_PUSH_REPLY:
5772     return can_discard_replica_op<MOSDPGPushReply, MSG_OSD_PG_PUSH_REPLY>(op);
5773   case MSG_OSD_SUBOPREPLY:
5774     return can_discard_replica_op<MOSDSubOpReply, MSG_OSD_SUBOPREPLY>(op);
5775   case MSG_OSD_REPOPREPLY:
5776     return can_discard_replica_op<MOSDRepOpReply, MSG_OSD_REPOPREPLY>(op);
5777   case MSG_OSD_PG_RECOVERY_DELETE:
5778     return can_discard_replica_op<MOSDPGRecoveryDelete, MSG_OSD_PG_RECOVERY_DELETE>(op);
5779
5780   case MSG_OSD_PG_RECOVERY_DELETE_REPLY:
5781     return can_discard_replica_op<MOSDPGRecoveryDeleteReply, MSG_OSD_PG_RECOVERY_DELETE_REPLY>(op);
5782
5783   case MSG_OSD_EC_WRITE:
5784     return can_discard_replica_op<MOSDECSubOpWrite, MSG_OSD_EC_WRITE>(op);
5785   case MSG_OSD_EC_WRITE_REPLY:
5786     return can_discard_replica_op<MOSDECSubOpWriteReply, MSG_OSD_EC_WRITE_REPLY>(op);
5787   case MSG_OSD_EC_READ:
5788     return can_discard_replica_op<MOSDECSubOpRead, MSG_OSD_EC_READ>(op);
5789   case MSG_OSD_EC_READ_REPLY:
5790     return can_discard_replica_op<MOSDECSubOpReadReply, MSG_OSD_EC_READ_REPLY>(op);
5791   case MSG_OSD_REP_SCRUB:
5792     return can_discard_replica_op<MOSDRepScrub, MSG_OSD_REP_SCRUB>(op);
5793   case MSG_OSD_SCRUB_RESERVE:
5794     return can_discard_replica_op<MOSDScrubReserve, MSG_OSD_SCRUB_RESERVE>(op);
5795   case MSG_OSD_REP_SCRUBMAP:
5796     return can_discard_replica_op<MOSDRepScrubMap, MSG_OSD_REP_SCRUBMAP>(op);
5797   case MSG_OSD_PG_UPDATE_LOG_MISSING:
5798     return can_discard_replica_op<
5799       MOSDPGUpdateLogMissing, MSG_OSD_PG_UPDATE_LOG_MISSING>(op);
5800   case MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY:
5801     return can_discard_replica_op<
5802       MOSDPGUpdateLogMissingReply, MSG_OSD_PG_UPDATE_LOG_MISSING_REPLY>(op);
5803
5804   case MSG_OSD_PG_SCAN:
5805     return can_discard_scan(op);
5806   case MSG_OSD_PG_BACKFILL:
5807     return can_discard_backfill(op);
5808   case MSG_OSD_PG_BACKFILL_REMOVE:
5809     return can_discard_replica_op<MOSDPGBackfillRemove,
5810                                   MSG_OSD_PG_BACKFILL_REMOVE>(op);
5811   }
5812   return true;
5813 }
5814
5815 void PG::take_waiters()
5816 {
5817   dout(10) << "take_waiters" << dendl;
5818   requeue_map_waiters();
5819   for (list<CephPeeringEvtRef>::iterator i = peering_waiters.begin();
5820        i != peering_waiters.end();
5821        ++i) osd->queue_for_peering(this);
5822   peering_queue.splice(peering_queue.begin(), peering_waiters,
5823                        peering_waiters.begin(), peering_waiters.end());
5824 }
5825
5826 void PG::handle_peering_event(CephPeeringEvtRef evt, RecoveryCtx *rctx)
5827 {
5828   dout(10) << "handle_peering_event: " << evt->get_desc() << dendl;
5829   if (!have_same_or_newer_map(evt->get_epoch_sent())) {
5830     dout(10) << "deferring event " << evt->get_desc() << dendl;
5831     peering_waiters.push_back(evt);
5832     return;
5833   }
5834   if (old_peering_evt(evt))
5835     return;
5836   recovery_state.handle_event(evt, rctx);
5837 }
5838
5839 void PG::queue_peering_event(CephPeeringEvtRef evt)
5840 {
5841   if (old_peering_evt(evt))
5842     return;
5843   peering_queue.push_back(evt);
5844   osd->queue_for_peering(this);
5845 }
5846
5847 void PG::queue_null(epoch_t msg_epoch,
5848                     epoch_t query_epoch)
5849 {
5850   dout(10) << "null" << dendl;
5851   queue_peering_event(
5852     CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5853                                          NullEvt())));
5854 }
5855
5856 void PG::queue_flushed(epoch_t e)
5857 {
5858   dout(10) << "flushed" << dendl;
5859   queue_peering_event(
5860     CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(e, e,
5861                                          FlushedEvt())));
5862 }
5863
5864 void PG::queue_query(epoch_t msg_epoch,
5865                      epoch_t query_epoch,
5866                      pg_shard_t from, const pg_query_t& q)
5867 {
5868   dout(10) << "handle_query " << q << " from replica " << from << dendl;
5869   queue_peering_event(
5870     CephPeeringEvtRef(std::make_shared<CephPeeringEvt>(msg_epoch, query_epoch,
5871                                          MQuery(from, q, query_epoch))));
5872 }
5873
5874 void PG::handle_advance_map(
5875   OSDMapRef osdmap, OSDMapRef lastmap,
5876   vector<int>& newup, int up_primary,
5877   vector<int>& newacting, int acting_primary,
5878   RecoveryCtx *rctx)
5879 {
5880   assert(lastmap->get_epoch() == osdmap_ref->get_epoch());
5881   assert(lastmap == osdmap_ref);
5882   dout(10) << "handle_advance_map "
5883            << newup << "/" << newacting
5884            << " -- " << up_primary << "/" << acting_primary
5885            << dendl;
5886   update_osdmap_ref(osdmap);
5887   pool.update(osdmap);
5888   past_intervals.update_type_from_map(pool.info.ec_pool(), *osdmap);
5889   if (cct->_conf->osd_debug_verify_cached_snaps) {
5890     interval_set<snapid_t> actual_removed_snaps;
5891     const pg_pool_t *pi = osdmap->get_pg_pool(info.pgid.pool());
5892     assert(pi);
5893     pi->build_removed_snaps(actual_removed_snaps);
5894     if (!(actual_removed_snaps == pool.cached_removed_snaps)) {
5895       derr << __func__ << ": mismatch between the actual removed snaps "
5896            << actual_removed_snaps << " and pool.cached_removed_snaps "
5897            << " pool.cached_removed_snaps " << pool.cached_removed_snaps
5898            << dendl;
5899     }
5900     assert(actual_removed_snaps == pool.cached_removed_snaps);
5901   }
5902   AdvMap evt(
5903     osdmap, lastmap, newup, up_primary,
5904     newacting, acting_primary);
5905   recovery_state.handle_event(evt, rctx);
5906   if (pool.info.last_change == osdmap_ref->get_epoch()) {
5907     on_pool_change();
5908     update_store_with_options();
5909   }
5910 }
5911
5912 void PG::handle_activate_map(RecoveryCtx *rctx)
5913 {
5914   dout(10) << "handle_activate_map " << dendl;
5915   ActMap evt;
5916   recovery_state.handle_event(evt, rctx);
5917   if (osdmap_ref->get_epoch() - last_persisted_osdmap_ref->get_epoch() >
5918     cct->_conf->osd_pg_epoch_persisted_max_stale) {
5919     dout(20) << __func__ << ": Dirtying info: last_persisted is "
5920              << last_persisted_osdmap_ref->get_epoch()
5921              << " while current is " << osdmap_ref->get_epoch() << dendl;
5922     dirty_info = true;
5923   } else {
5924     dout(20) << __func__ << ": Not dirtying info: last_persisted is "
5925              << last_persisted_osdmap_ref->get_epoch()
5926              << " while current is " << osdmap_ref->get_epoch() << dendl;
5927   }
5928   if (osdmap_ref->check_new_blacklist_entries()) check_blacklisted_watchers();
5929 }
5930
5931 void PG::handle_loaded(RecoveryCtx *rctx)
5932 {
5933   dout(10) << "handle_loaded" << dendl;
5934   Load evt;
5935   recovery_state.handle_event(evt, rctx);
5936 }
5937
5938 void PG::handle_create(RecoveryCtx *rctx)
5939 {
5940   dout(10) << "handle_create" << dendl;
5941   rctx->created_pgs.insert(this);
5942   Initialize evt;
5943   recovery_state.handle_event(evt, rctx);
5944   ActMap evt2;
5945   recovery_state.handle_event(evt2, rctx);
5946 }
5947
5948 void PG::handle_query_state(Formatter *f)
5949 {
5950   dout(10) << "handle_query_state" << dendl;
5951   QueryState q(f);
5952   recovery_state.handle_event(q, 0);
5953 }
5954
5955 void PG::update_store_with_options()
5956 {
5957   auto r = osd->store->set_collection_opts(coll, pool.info.opts);
5958   if (r < 0 && r != -EOPNOTSUPP) {
5959     derr << __func__ << " set_collection_opts returns error: " << r << dendl;
5960   }
5961 }
5962
5963 void PG::update_store_on_load()
5964 {
5965   if (osd->store->get_type() == "filestore") {
5966     // legacy filestore didn't store collection bit width; fix.
5967     int bits = osd->store->collection_bits(coll);
5968     if (bits < 0) {
5969       assert(!coll.is_meta()); // otherwise OSD::load_pgs() did a bad thing
5970       bits = info.pgid.get_split_bits(pool.info.get_pg_num());
5971       lderr(cct) << __func__ << " setting bit width to " << bits << dendl;
5972       ObjectStore::Transaction t;
5973       t.collection_set_bits(coll, bits);
5974       osd->store->apply_transaction(osr.get(), std::move(t));
5975     }
5976   }
5977 }
5978
5979 /*------------ Recovery State Machine----------------*/
5980 #undef dout_prefix
5981 #define dout_prefix (*_dout << context< RecoveryMachine >().pg->gen_prefix() \
5982                      << "state<" << get_state_name() << ">: ")
5983
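     // Each state below is a boost::statechart state nested under RecoveryMachine
     // (named "Started/Primary/...", etc.); it logs its entry and exit and, on
     // exit, accounts the time spent in the state to a recoverystate_perf latency
     // counter. Events such as AdvMap, ActMap and MNotifyRec are delivered through
     // recovery_state.handle_event() and are either handled in the state or
     // forwarded to the enclosing state.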
5984 /*------Crashed-------*/
5985 PG::RecoveryState::Crashed::Crashed(my_context ctx)
5986   : my_base(ctx),
5987     NamedState(context< RecoveryMachine >().pg, "Crashed")
5988 {
5989   context< RecoveryMachine >().log_enter(state_name);
5990   assert(0 == "we got a bad state machine event");
5991 }
5992
5993
5994 /*------Initial-------*/
5995 PG::RecoveryState::Initial::Initial(my_context ctx)
5996   : my_base(ctx),
5997     NamedState(context< RecoveryMachine >().pg, "Initial")
5998 {
5999   context< RecoveryMachine >().log_enter(state_name);
6000 }
6001
6002 boost::statechart::result PG::RecoveryState::Initial::react(const Load& l)
6003 {
6004   PG *pg = context< RecoveryMachine >().pg;
6005
6006   // do we tell someone we're here?
6007   pg->send_notify = (!pg->is_primary());
6008   pg->update_store_with_options();
6009
6010   pg->update_store_on_load();
6011
6012   return transit< Reset >();
6013 }
6014
6015 boost::statechart::result PG::RecoveryState::Initial::react(const MNotifyRec& notify)
6016 {
6017   PG *pg = context< RecoveryMachine >().pg;
6018   pg->proc_replica_info(
6019     notify.from, notify.notify.info, notify.notify.epoch_sent);
6020   pg->set_last_peering_reset();
6021   return transit< Primary >();
6022 }
6023
6024 boost::statechart::result PG::RecoveryState::Initial::react(const MInfoRec& i)
6025 {
6026   PG *pg = context< RecoveryMachine >().pg;
6027   assert(!pg->is_primary());
6028   post_event(i);
6029   return transit< Stray >();
6030 }
6031
6032 boost::statechart::result PG::RecoveryState::Initial::react(const MLogRec& i)
6033 {
6034   PG *pg = context< RecoveryMachine >().pg;
6035   assert(!pg->is_primary());
6036   post_event(i);
6037   return transit< Stray >();
6038 }
6039
6040 void PG::RecoveryState::Initial::exit()
6041 {
6042   context< RecoveryMachine >().log_exit(state_name, enter_time);
6043   PG *pg = context< RecoveryMachine >().pg;
6044   utime_t dur = ceph_clock_now() - enter_time;
6045   pg->osd->recoverystate_perf->tinc(rs_initial_latency, dur);
6046 }
6047
6048 /*------Started-------*/
6049 PG::RecoveryState::Started::Started(my_context ctx)
6050   : my_base(ctx),
6051     NamedState(context< RecoveryMachine >().pg, "Started")
6052 {
6053   context< RecoveryMachine >().log_enter(state_name);
6054 }
6055
6056 boost::statechart::result
6057 PG::RecoveryState::Started::react(const IntervalFlush&)
6058 {
6059   PG *pg = context< RecoveryMachine >().pg;
6060   ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6061   context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6062   return discard_event();
6063 }
6064
6065
6066 boost::statechart::result
6067 PG::RecoveryState::Started::react(const FlushedEvt&)
6068 {
6069   PG *pg = context< RecoveryMachine >().pg;
6070   pg->on_flushed();
6071   return discard_event();
6072 }
6073
6074
6075 boost::statechart::result PG::RecoveryState::Started::react(const AdvMap& advmap)
6076 {
6077   PG *pg = context< RecoveryMachine >().pg;
6078   ldout(pg->cct, 10) << "Started advmap" << dendl;
6079   pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6080   if (pg->should_restart_peering(
6081         advmap.up_primary,
6082         advmap.acting_primary,
6083         advmap.newup,
6084         advmap.newacting,
6085         advmap.lastmap,
6086         advmap.osdmap)) {
6087     ldout(pg->cct, 10) << "should_restart_peering, transitioning to Reset"
6088                        << dendl;
6089     post_event(advmap);
6090     return transit< Reset >();
6091   }
6092   pg->remove_down_peer_info(advmap.osdmap);
6093   return discard_event();
6094 }
6095
6096 boost::statechart::result PG::RecoveryState::Started::react(const QueryState& q)
6097 {
6098   q.f->open_object_section("state");
6099   q.f->dump_string("name", state_name);
6100   q.f->dump_stream("enter_time") << enter_time;
6101   q.f->close_section();
6102   return discard_event();
6103 }
6104
6105 void PG::RecoveryState::Started::exit()
6106 {
6107   context< RecoveryMachine >().log_exit(state_name, enter_time);
6108   PG *pg = context< RecoveryMachine >().pg;
6109   utime_t dur = ceph_clock_now() - enter_time;
6110   pg->osd->recoverystate_perf->tinc(rs_started_latency, dur);
6111 }
6112
6113 /*--------Reset---------*/
6114 PG::RecoveryState::Reset::Reset(my_context ctx)
6115   : my_base(ctx),
6116     NamedState(context< RecoveryMachine >().pg, "Reset")
6117 {
6118   context< RecoveryMachine >().log_enter(state_name);
6119   PG *pg = context< RecoveryMachine >().pg;
6120
6121   pg->flushes_in_progress = 0;
6122   pg->set_last_peering_reset();
6123 }
6124
6125 boost::statechart::result
6126 PG::RecoveryState::Reset::react(const FlushedEvt&)
6127 {
6128   PG *pg = context< RecoveryMachine >().pg;
6129   pg->on_flushed();
6130   return discard_event();
6131 }
6132
6133 boost::statechart::result
6134 PG::RecoveryState::Reset::react(const IntervalFlush&)
6135 {
6136   PG *pg = context< RecoveryMachine >().pg;
6137   ldout(pg->cct, 10) << "Ending blocked outgoing recovery messages" << dendl;
6138   context< RecoveryMachine >().pg->recovery_state.end_block_outgoing();
6139   return discard_event();
6140 }
6141
6142 boost::statechart::result PG::RecoveryState::Reset::react(const AdvMap& advmap)
6143 {
6144   PG *pg = context< RecoveryMachine >().pg;
6145   ldout(pg->cct, 10) << "Reset advmap" << dendl;
6146
6147   pg->check_full_transition(advmap.lastmap, advmap.osdmap);
6148
6149   if (pg->should_restart_peering(
6150         advmap.up_primary,
6151         advmap.acting_primary,
6152         advmap.newup,
6153         advmap.newacting,
6154         advmap.lastmap,
6155         advmap.osdmap)) {
6156     ldout(pg->cct, 10) << "should restart peering, calling start_peering_interval again"
6157                        << dendl;
6158     pg->start_peering_interval(
6159       advmap.lastmap,
6160       advmap.newup, advmap.up_primary,
6161       advmap.newacting, advmap.acting_primary,
6162       context< RecoveryMachine >().get_cur_transaction());
6163   }
6164   pg->remove_down_peer_info(advmap.osdmap);
6165   pg->check_past_interval_bounds();
6166   return discard_event();
6167 }
6168
6169 boost::statechart::result PG::RecoveryState::Reset::react(const ActMap&)
6170 {
6171   PG *pg = context< RecoveryMachine >().pg;
6172   if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
6173     context< RecoveryMachine >().send_notify(
6174       pg->get_primary(),
6175       pg_notify_t(
6176         pg->get_primary().shard, pg->pg_whoami.shard,
6177         pg->get_osdmap()->get_epoch(),
6178         pg->get_osdmap()->get_epoch(),
6179         pg->info),
6180       pg->past_intervals);
6181   }
6182
6183   pg->update_heartbeat_peers();
6184   pg->take_waiters();
6185
6186   return transit< Started >();
6187 }
6188
6189 boost::statechart::result PG::RecoveryState::Reset::react(const QueryState& q)
6190 {
6191   q.f->open_object_section("state");
6192   q.f->dump_string("name", state_name);
6193   q.f->dump_stream("enter_time") << enter_time;
6194   q.f->close_section();
6195   return discard_event();
6196 }
6197
6198 void PG::RecoveryState::Reset::exit()
6199 {
6200   context< RecoveryMachine >().log_exit(state_name, enter_time);
6201   PG *pg = context< RecoveryMachine >().pg;
6202   utime_t dur = ceph_clock_now() - enter_time;
6203   pg->osd->recoverystate_perf->tinc(rs_reset_latency, dur);
6204 }
6205
6206 /*-------Start---------*/
6207 PG::RecoveryState::Start::Start(my_context ctx)
6208   : my_base(ctx),
6209     NamedState(context< RecoveryMachine >().pg, "Start")
6210 {
6211   context< RecoveryMachine >().log_enter(state_name);
6212
6213   PG *pg = context< RecoveryMachine >().pg;
6214   if (pg->is_primary()) {
6215     ldout(pg->cct, 1) << "transitioning to Primary" << dendl;
6216     post_event(MakePrimary());
6217   } else { //is_stray
6218     ldout(pg->cct, 1) << "transitioning to Stray" << dendl;
6219     post_event(MakeStray());
6220   }
6221 }
6222
6223 void PG::RecoveryState::Start::exit()
6224 {
6225   context< RecoveryMachine >().log_exit(state_name, enter_time);
6226   PG *pg = context< RecoveryMachine >().pg;
6227   utime_t dur = ceph_clock_now() - enter_time;
6228   pg->osd->recoverystate_perf->tinc(rs_start_latency, dur);
6229 }
6230
6231 /*---------Primary--------*/
6232 PG::RecoveryState::Primary::Primary(my_context ctx)
6233   : my_base(ctx),
6234     NamedState(context< RecoveryMachine >().pg, "Started/Primary")
6235 {
6236   context< RecoveryMachine >().log_enter(state_name);
6237   PG *pg = context< RecoveryMachine >().pg;
6238   assert(pg->want_acting.empty());
6239
6240   // set CREATING bit until we have peered for the first time.
6241   if (pg->info.history.last_epoch_started == 0) {
6242     pg->state_set(PG_STATE_CREATING);
6243     // use the history timestamp, which ultimately comes from the
6244     // monitor in the create case.
6245     utime_t t = pg->info.history.last_scrub_stamp;
6246     pg->info.stats.last_fresh = t;
6247     pg->info.stats.last_active = t;
6248     pg->info.stats.last_change = t;
6249     pg->info.stats.last_peered = t;
6250     pg->info.stats.last_clean = t;
6251     pg->info.stats.last_unstale = t;
6252     pg->info.stats.last_undegraded = t;
6253     pg->info.stats.last_fullsized = t;
6254     pg->info.stats.last_scrub_stamp = t;
6255     pg->info.stats.last_deep_scrub_stamp = t;
6256     pg->info.stats.last_clean_scrub_stamp = t;
6257   }
6258 }
6259
6260 boost::statechart::result PG::RecoveryState::Primary::react(const MNotifyRec& notevt)
6261 {
6262   PG *pg = context< RecoveryMachine >().pg;
6263   ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
6264   pg->proc_replica_info(
6265     notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
6266   return discard_event();
6267 }
6268
6269 boost::statechart::result PG::RecoveryState::Primary::react(const ActMap&)
6270 {
6271   PG *pg = context< RecoveryMachine >().pg;
6272   ldout(pg->cct, 7) << "handle ActMap primary" << dendl;
6273   pg->publish_stats_to_osd();
6274   pg->take_waiters();
6275   return discard_event();
6276 }
6277
6278 void PG::RecoveryState::Primary::exit()
6279 {
6280   context< RecoveryMachine >().log_exit(state_name, enter_time);
6281   PG *pg = context< RecoveryMachine >().pg;
6282   pg->want_acting.clear();
6283   utime_t dur = ceph_clock_now() - enter_time;
6284   pg->osd->recoverystate_perf->tinc(rs_primary_latency, dur);
6285   pg->clear_primary_state();
6286   pg->state_clear(PG_STATE_CREATING);
6287 }
6288
6289 /*---------Peering--------*/
6290 PG::RecoveryState::Peering::Peering(my_context ctx)
6291   : my_base(ctx),
6292     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering"),
6293     history_les_bound(false)
6294 {
6295   context< RecoveryMachine >().log_enter(state_name);
6296
6297   PG *pg = context< RecoveryMachine >().pg;
6298   assert(!pg->is_peered());
6299   assert(!pg->is_peering());
6300   assert(pg->is_primary());
6301   pg->state_set(PG_STATE_PEERING);
6302 }
6303
6304 boost::statechart::result PG::RecoveryState::Peering::react(const AdvMap& advmap) 
6305 {
6306   PG *pg = context< RecoveryMachine >().pg;
6307   ldout(pg->cct, 10) << "Peering advmap" << dendl;
6308   if (prior_set.affected_by_map(*(advmap.osdmap), pg)) {
6309     ldout(pg->cct, 1) << "Peering, affected_by_map, going to Reset" << dendl;
6310     post_event(advmap);
6311     return transit< Reset >();
6312   }
6313   
6314   pg->adjust_need_up_thru(advmap.osdmap);
6315   
6316   return forward_event();
6317 }
6318
6319 boost::statechart::result PG::RecoveryState::Peering::react(const QueryState& q)
6320 {
6321   PG *pg = context< RecoveryMachine >().pg;
6322
6323   q.f->open_object_section("state");
6324   q.f->dump_string("name", state_name);
6325   q.f->dump_stream("enter_time") << enter_time;
6326
6327   q.f->open_array_section("past_intervals");
6328   pg->past_intervals.dump(q.f);
6329   q.f->close_section();
6330
6331   q.f->open_array_section("probing_osds");
6332   for (set<pg_shard_t>::iterator p = prior_set.probe.begin();
6333        p != prior_set.probe.end();
6334        ++p)
6335     q.f->dump_stream("osd") << *p;
6336   q.f->close_section();
6337
6338   if (prior_set.pg_down)
6339     q.f->dump_string("blocked", "peering is blocked due to down osds");
6340
6341   q.f->open_array_section("down_osds_we_would_probe");
6342   for (set<int>::iterator p = prior_set.down.begin();
6343        p != prior_set.down.end();
6344        ++p)
6345     q.f->dump_int("osd", *p);
6346   q.f->close_section();
6347
6348   q.f->open_array_section("peering_blocked_by");
6349   for (map<int,epoch_t>::iterator p = prior_set.blocked_by.begin();
6350        p != prior_set.blocked_by.end();
6351        ++p) {
6352     q.f->open_object_section("osd");
6353     q.f->dump_int("osd", p->first);
6354     q.f->dump_int("current_lost_at", p->second);
6355     q.f->dump_string("comment", "starting or marking this osd lost may let us proceed");
6356     q.f->close_section();
6357   }
6358   q.f->close_section();
6359
6360   if (history_les_bound) {
6361     q.f->open_array_section("peering_blocked_by_detail");
6362     q.f->open_object_section("item");
6363     q.f->dump_string("detail","peering_blocked_by_history_les_bound");
6364     q.f->close_section();
6365     q.f->close_section();
6366   }
6367
6368   q.f->close_section();
6369   return forward_event();
6370 }
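// The section built above is the per-state "state" object reported for this
// PG while peering (e.g. via pg query).  A representative shape (values are
// illustrative only; the field names come from the dump calls above):
//   { "name": "Started/Primary/Peering",
//     "enter_time": "...",
//     "past_intervals": [ ... ],
//     "probing_osds": [ "0", "1", ... ],
//     "down_osds_we_would_probe": [ 2 ],
//     "peering_blocked_by": [ { "osd": 2, "current_lost_at": 0,
//         "comment": "starting or marking this osd lost may let us proceed" } ] }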
6371
6372 void PG::RecoveryState::Peering::exit()
6373 {
6374   PG *pg = context< RecoveryMachine >().pg;
6375   ldout(pg->cct, 10) << "Leaving Peering" << dendl;
6376   context< RecoveryMachine >().log_exit(state_name, enter_time);
6377   pg->state_clear(PG_STATE_PEERING);
6378   pg->clear_probe_targets();
6379
6380   utime_t dur = ceph_clock_now() - enter_time;
6381   pg->osd->recoverystate_perf->tinc(rs_peering_latency, dur);
6382 }
6383
6384
6385 /*------Backfilling-------*/
6386 PG::RecoveryState::Backfilling::Backfilling(my_context ctx)
6387   : my_base(ctx),
6388     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Backfilling")
6389 {
6390   context< RecoveryMachine >().log_enter(state_name);
6391   PG *pg = context< RecoveryMachine >().pg;
6392   pg->backfill_reserved = true;
6393   pg->queue_recovery();
6394   pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
6395   pg->state_clear(PG_STATE_BACKFILL_WAIT);
6396   pg->state_set(PG_STATE_BACKFILLING);
6397   pg->publish_stats_to_osd();
6398 }
6399
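// DeferBackfill (posted, for instance, by the preemption callback registered
// in WaitLocalBackfillReserved below): give up the local reservation, send
// REJECT to every backfill target so they drop theirs too, flip BACKFILLING
// back to BACKFILL_WAIT, and retry from NotBackfilling after c.delay seconds.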
6400 boost::statechart::result
6401 PG::RecoveryState::Backfilling::react(const DeferBackfill &c)
6402 {
6403   PG *pg = context< RecoveryMachine >().pg;
6404   ldout(pg->cct, 10) << "defer backfill, retry delay " << c.delay << dendl;
6405   pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6406
6407   pg->state_set(PG_STATE_BACKFILL_WAIT);
6408   pg->state_clear(PG_STATE_BACKFILLING);
6409
6410   for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6411        it != pg->backfill_targets.end();
6412        ++it) {
6413     assert(*it != pg->pg_whoami);
6414     ConnectionRef con = pg->osd->get_con_osd_cluster(
6415       it->osd, pg->get_osdmap()->get_epoch());
6416     if (con) {
6417       pg->osd->send_message_osd_cluster(
6418         new MBackfillReserve(
6419           MBackfillReserve::REJECT,
6420           spg_t(pg->info.pgid.pgid, it->shard),
6421           pg->get_osdmap()->get_epoch()),
6422         con.get());
6423     }
6424   }
6425
6426
6427   if (!pg->waiting_on_backfill.empty()) {
6428     pg->waiting_on_backfill.clear();
6429     pg->finish_recovery_op(hobject_t::get_max());
6430   }
6431
6432   pg->schedule_backfill_retry(c.delay);
6433   return transit<NotBackfilling>();
6434 }
6435
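// A remote shard rejected its backfill reservation (typically because it is
// too full): same cleanup as DeferBackfill above, but mark the PG
// BACKFILL_TOOFULL and retry after osd_recovery_retry_interval.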
6436 boost::statechart::result
6437 PG::RecoveryState::Backfilling::react(const RemoteReservationRejected &)
6438 {
6439   PG *pg = context< RecoveryMachine >().pg;
6440   pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6441   pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6442
6443   for (set<pg_shard_t>::iterator it = pg->backfill_targets.begin();
6444        it != pg->backfill_targets.end();
6445        ++it) {
6446     assert(*it != pg->pg_whoami);
6447     ConnectionRef con = pg->osd->get_con_osd_cluster(
6448       it->osd, pg->get_osdmap()->get_epoch());
6449     if (con) {
6450       pg->osd->send_message_osd_cluster(
6451         new MBackfillReserve(
6452           MBackfillReserve::REJECT,
6453           spg_t(pg->info.pgid.pgid, it->shard),
6454           pg->get_osdmap()->get_epoch()),
6455         con.get());
6456     }
6457   }
6458
6459   if (!pg->waiting_on_backfill.empty()) {
6460     pg->waiting_on_backfill.clear();
6461     pg->finish_recovery_op(hobject_t::get_max());
6462   }
6463
6464   pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
6465   return transit<NotBackfilling>();
6466 }
6467
6468 void PG::RecoveryState::Backfilling::exit()
6469 {
6470   context< RecoveryMachine >().log_exit(state_name, enter_time);
6471   PG *pg = context< RecoveryMachine >().pg;
6472   pg->backfill_reserved = false;
6473   pg->backfill_reserving = false;
6474   pg->state_clear(PG_STATE_BACKFILLING);
6475   pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
6476   utime_t dur = ceph_clock_now() - enter_time;
6477   pg->osd->recoverystate_perf->tinc(rs_backfilling_latency, dur);
6478 }
6479
6480 /*--WaitRemoteBackfillReserved--*/
6481
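// Reserve backfill slots on each remote shard, one at a time: the constructor
// posts a synthetic RemoteBackfillReserved so that react() below sends the
// first REQUEST; each grant that comes back advances backfill_osd_it and
// requests the next shard, and once the list is exhausted AllBackfillsReserved
// is posted.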
6482 PG::RecoveryState::WaitRemoteBackfillReserved::WaitRemoteBackfillReserved(my_context ctx)
6483   : my_base(ctx),
6484     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteBackfillReserved"),
6485     backfill_osd_it(context< Active >().remote_shards_to_reserve_backfill.begin())
6486 {
6487   context< RecoveryMachine >().log_enter(state_name);
6488   PG *pg = context< RecoveryMachine >().pg;
6489   pg->state_set(PG_STATE_BACKFILL_WAIT);
6490   pg->publish_stats_to_osd();
6491   post_event(RemoteBackfillReserved());
6492 }
6493
6494 boost::statechart::result
6495 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteBackfillReserved &evt)
6496 {
6497   PG *pg = context< RecoveryMachine >().pg;
6498
6499   if (backfill_osd_it != context< Active >().remote_shards_to_reserve_backfill.end()) {
6500     // The primary never backfills itself
6501     assert(*backfill_osd_it != pg->pg_whoami);
6502     ConnectionRef con = pg->osd->get_con_osd_cluster(
6503       backfill_osd_it->osd, pg->get_osdmap()->get_epoch());
6504     if (con) {
6505       pg->osd->send_message_osd_cluster(
6506         new MBackfillReserve(
6507         MBackfillReserve::REQUEST,
6508         spg_t(pg->info.pgid.pgid, backfill_osd_it->shard),
6509         pg->get_osdmap()->get_epoch(),
6510         pg->get_backfill_priority()),
6511       con.get());
6512     }
6513     ++backfill_osd_it;
6514   } else {
6515     post_event(AllBackfillsReserved());
6516   }
6517   return discard_event();
6518 }
6519
6520 void PG::RecoveryState::WaitRemoteBackfillReserved::exit()
6521 {
6522   context< RecoveryMachine >().log_exit(state_name, enter_time);
6523   PG *pg = context< RecoveryMachine >().pg;
6524   utime_t dur = ceph_clock_now() - enter_time;
6525   pg->osd->recoverystate_perf->tinc(rs_waitremotebackfillreserved_latency, dur);
6526 }
6527
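// One of the remote shards refused: drop the local reservation, send REJECT
// to every shard that already granted (the loop stops just before the shard
// that rejected), mark the PG BACKFILL_TOOFULL, and retry later from
// NotBackfilling.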
6528 boost::statechart::result
6529 PG::RecoveryState::WaitRemoteBackfillReserved::react(const RemoteReservationRejected &evt)
6530 {
6531   PG *pg = context< RecoveryMachine >().pg;
6532   pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6533
6534   // Send REJECT to all previously acquired reservations
6535   set<pg_shard_t>::const_iterator it, begin, end, next;
6536   begin = context< Active >().remote_shards_to_reserve_backfill.begin();
6537   end = context< Active >().remote_shards_to_reserve_backfill.end();
6538   assert(begin != end);
6539   for (next = it = begin, ++next ; next != backfill_osd_it; ++it, ++next) {
6540     // The primary never backfills itself
6541     assert(*it != pg->pg_whoami);
6542     ConnectionRef con = pg->osd->get_con_osd_cluster(
6543       it->osd, pg->get_osdmap()->get_epoch());
6544     if (con) {
6545       pg->osd->send_message_osd_cluster(
6546         new MBackfillReserve(
6547         MBackfillReserve::REJECT,
6548         spg_t(pg->info.pgid.pgid, it->shard),
6549         pg->get_osdmap()->get_epoch()),
6550       con.get());
6551     }
6552   }
6553
6554   pg->state_clear(PG_STATE_BACKFILL_WAIT);
6555   pg->state_set(PG_STATE_BACKFILL_TOOFULL);
6556   pg->publish_stats_to_osd();
6557
6558   pg->schedule_backfill_retry(pg->cct->_conf->osd_recovery_retry_interval);
6559
6560   return transit<NotBackfilling>();
6561 }
6562
6563 /*--WaitLocalBackfillReserved--*/
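// Ask the local AsyncReserver for a backfill reservation.  The first
// QueuePeeringEvt queues LocalBackfillReserved when the slot is granted; the
// second queues DeferBackfill(0.0) if the reservation is preempted (or
// otherwise revoked), which Backfilling::react(DeferBackfill) above handles
// by releasing everything and retrying.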
6564 PG::RecoveryState::WaitLocalBackfillReserved::WaitLocalBackfillReserved(my_context ctx)
6565   : my_base(ctx),
6566     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalBackfillReserved")
6567 {
6568   context< RecoveryMachine >().log_enter(state_name);
6569   PG *pg = context< RecoveryMachine >().pg;
6570   pg->state_set(PG_STATE_BACKFILL_WAIT);
6571   pg->osd->local_reserver.request_reservation(
6572     pg->info.pgid,
6573     new QueuePeeringEvt<LocalBackfillReserved>(
6574       pg, pg->get_osdmap()->get_epoch(),
6575       LocalBackfillReserved()),
6576     pg->get_backfill_priority(),
6577     new QueuePeeringEvt<DeferBackfill>(
6578       pg, pg->get_osdmap()->get_epoch(),
6579       DeferBackfill(0.0)));
6580   pg->publish_stats_to_osd();
6581 }
6582
6583 void PG::RecoveryState::WaitLocalBackfillReserved::exit()
6584 {
6585   context< RecoveryMachine >().log_exit(state_name, enter_time);
6586   PG *pg = context< RecoveryMachine >().pg;
6587   utime_t dur = ceph_clock_now() - enter_time;
6588   pg->osd->recoverystate_perf->tinc(rs_waitlocalbackfillreserved_latency, dur);
6589 }
6590
6591 /*----NotBackfilling------*/
6592 PG::RecoveryState::NotBackfilling::NotBackfilling(my_context ctx)
6593   : my_base(ctx),
6594     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotBackfilling")
6595 {
6596   context< RecoveryMachine >().log_enter(state_name);
6597   PG *pg = context< RecoveryMachine >().pg;
6598   pg->publish_stats_to_osd();
6599 }
6600
6601 boost::statechart::result
6602 PG::RecoveryState::NotBackfilling::react(const RemoteBackfillReserved &evt)
6603 {
6604   return discard_event();
6605 }
6606
6607 boost::statechart::result
6608 PG::RecoveryState::NotBackfilling::react(const RemoteReservationRejected &evt)
6609 {
6610   return discard_event();
6611 }
6612
6613 void PG::RecoveryState::NotBackfilling::exit()
6614 {
6615   context< RecoveryMachine >().log_exit(state_name, enter_time);
6616   PG *pg = context< RecoveryMachine >().pg;
6617   utime_t dur = ceph_clock_now() - enter_time;
6618   pg->osd->recoverystate_perf->tinc(rs_notbackfilling_latency, dur);
6619 }
6620
6621 /*----NotRecovering------*/
6622 PG::RecoveryState::NotRecovering::NotRecovering(my_context ctx)
6623   : my_base(ctx),
6624     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/NotRecovering")
6625 {
6626   context< RecoveryMachine >().log_enter(state_name);
6627   PG *pg = context< RecoveryMachine >().pg;
6628   pg->publish_stats_to_osd();
6629 }
6630
6631 void PG::RecoveryState::NotRecovering::exit()
6632 {
6633   context< RecoveryMachine >().log_exit(state_name, enter_time);
6634   PG *pg = context< RecoveryMachine >().pg;
6635   utime_t dur = ceph_clock_now() - enter_time;
6636   pg->osd->recoverystate_perf->tinc(rs_notrecovering_latency, dur);
6637 }
6638
6639 /*---RepNotRecovering----*/
6640 PG::RecoveryState::RepNotRecovering::RepNotRecovering(my_context ctx)
6641   : my_base(ctx),
6642     NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepNotRecovering")
6643 {
6644   context< RecoveryMachine >().log_enter(state_name);
6645 }
6646
6647 boost::statechart::result
6648 PG::RecoveryState::RepNotRecovering::react(const RejectRemoteReservation &evt)
6649 {
6650   PG *pg = context< RecoveryMachine >().pg;
6651   pg->reject_reservation();
6652   post_event(RemoteReservationRejected());
6653   return discard_event();
6654 }
6655
6656 void PG::RecoveryState::RepNotRecovering::exit()
6657 {
6658   context< RecoveryMachine >().log_exit(state_name, enter_time);
6659   PG *pg = context< RecoveryMachine >().pg;
6660   utime_t dur = ceph_clock_now() - enter_time;
6661   pg->osd->recoverystate_perf->tinc(rs_repnotrecovering_latency, dur);
6662 }
6663
6664 /*---RepWaitRecoveryReserved--*/
6665 PG::RecoveryState::RepWaitRecoveryReserved::RepWaitRecoveryReserved(my_context ctx)
6666   : my_base(ctx),
6667     NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitRecoveryReserved")
6668 {
6669   context< RecoveryMachine >().log_enter(state_name);
6670   PG *pg = context< RecoveryMachine >().pg;
6671
6672   pg->osd->remote_reserver.request_reservation(
6673     pg->info.pgid,
6674     new QueuePeeringEvt<RemoteRecoveryReserved>(
6675       pg, pg->get_osdmap()->get_epoch(),
6676       RemoteRecoveryReserved()),
6677     pg->get_recovery_priority());
6678 }
6679
6680 boost::statechart::result
6681 PG::RecoveryState::RepWaitRecoveryReserved::react(const RemoteRecoveryReserved &evt)
6682 {
6683   PG *pg = context< RecoveryMachine >().pg;
6684   pg->osd->send_message_osd_cluster(
6685     pg->primary.osd,
6686     new MRecoveryReserve(
6687       MRecoveryReserve::GRANT,
6688       spg_t(pg->info.pgid.pgid, pg->primary.shard),
6689       pg->get_osdmap()->get_epoch()),
6690     pg->get_osdmap()->get_epoch());
6691   return transit<RepRecovering>();
6692 }
6693
6694 boost::statechart::result
6695 PG::RecoveryState::RepWaitRecoveryReserved::react(
6696   const RemoteReservationCanceled &evt)
6697 {
6698   PG *pg = context< RecoveryMachine >().pg;
6699   pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6700   return transit<RepNotRecovering>();
6701 }
6702
6703 void PG::RecoveryState::RepWaitRecoveryReserved::exit()
6704 {
6705   context< RecoveryMachine >().log_exit(state_name, enter_time);
6706   PG *pg = context< RecoveryMachine >().pg;
6707   utime_t dur = ceph_clock_now() - enter_time;
6708   pg->osd->recoverystate_perf->tinc(rs_repwaitrecoveryreserved_latency, dur);
6709 }
6710
6711 /*-RepWaitBackfillReserved*/
6712 PG::RecoveryState::RepWaitBackfillReserved::RepWaitBackfillReserved(my_context ctx)
6713   : my_base(ctx),
6714     NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepWaitBackfillReserved")
6715 {
6716   context< RecoveryMachine >().log_enter(state_name);
6717 }
6718
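// Note: this handler still belongs to RepNotRecovering even though it is
// defined here.  The primary asked us to reserve for backfill: we may reject
// immediately (failure injection, or this OSD is too full for backfill);
// otherwise we ask the remote reserver for a slot.  Either way we move to
// RepWaitBackfillReserved, where the eventual grant or rejection is handled.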
6719 boost::statechart::result
6720 PG::RecoveryState::RepNotRecovering::react(const RequestBackfillPrio &evt)
6721 {
6722   PG *pg = context< RecoveryMachine >().pg;
6723   ostringstream ss;
6724
6725   if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6726       (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6727     ldout(pg->cct, 10) << "backfill reservation rejected: failure injection"
6728                        << dendl;
6729     post_event(RejectRemoteReservation());
6730   } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6731       pg->osd->check_backfill_full(ss)) {
6732     ldout(pg->cct, 10) << "backfill reservation rejected: "
6733                        << ss.str() << dendl;
6734     post_event(RejectRemoteReservation());
6735   } else {
6736     pg->osd->remote_reserver.request_reservation(
6737       pg->info.pgid,
6738       new QueuePeeringEvt<RemoteBackfillReserved>(
6739         pg, pg->get_osdmap()->get_epoch(),
6740         RemoteBackfillReserved()), evt.priority);
6741   }
6742   return transit<RepWaitBackfillReserved>();
6743 }
6744
6745 void PG::RecoveryState::RepWaitBackfillReserved::exit()
6746 {
6747   context< RecoveryMachine >().log_exit(state_name, enter_time);
6748   PG *pg = context< RecoveryMachine >().pg;
6749   utime_t dur = ceph_clock_now() - enter_time;
6750   pg->osd->recoverystate_perf->tinc(rs_repwaitbackfillreserved_latency, dur);
6751 }
6752
6753 boost::statechart::result
6754 PG::RecoveryState::RepWaitBackfillReserved::react(const RemoteBackfillReserved &evt)
6755 {
6756   PG *pg = context< RecoveryMachine >().pg;
6757
6758   ostringstream ss;
6759   if (pg->cct->_conf->osd_debug_reject_backfill_probability > 0 &&
6760       (rand()%1000 < (pg->cct->_conf->osd_debug_reject_backfill_probability*1000.0))) {
6761     ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6762                        << "failure injection" << dendl;
6763     post_event(RejectRemoteReservation());
6764     return discard_event();
6765   } else if (!pg->cct->_conf->osd_debug_skip_full_check_in_backfill_reservation &&
6766              pg->osd->check_backfill_full(ss)) {
6767     ldout(pg->cct, 10) << "backfill reservation rejected after reservation: "
6768                        << ss.str() << dendl;
6769     post_event(RejectRemoteReservation());
6770     return discard_event();
6771   } else {
6772     pg->osd->send_message_osd_cluster(
6773       pg->primary.osd,
6774       new MBackfillReserve(
6775         MBackfillReserve::GRANT,
6776         spg_t(pg->info.pgid.pgid, pg->primary.shard),
6777         pg->get_osdmap()->get_epoch()),
6778       pg->get_osdmap()->get_epoch());
6779     return transit<RepRecovering>();
6780   }
6781 }
6782
6783 boost::statechart::result
6784 PG::RecoveryState::RepWaitBackfillReserved::react(
6785   const RejectRemoteReservation &evt)
6786 {
6787   PG *pg = context< RecoveryMachine >().pg;
6788   pg->reject_reservation();
6789   post_event(RemoteReservationRejected());
6790   return discard_event();
6791 }
6792
6793 boost::statechart::result
6794 PG::RecoveryState::RepWaitBackfillReserved::react(
6795   const RemoteReservationRejected &evt)
6796 {
6797   PG *pg = context< RecoveryMachine >().pg;
6798   pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6799   return transit<RepNotRecovering>();
6800 }
6801
6802 boost::statechart::result
6803 PG::RecoveryState::RepWaitBackfillReserved::react(
6804   const RemoteReservationCanceled &evt)
6805 {
6806   PG *pg = context< RecoveryMachine >().pg;
6807   pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6808   return transit<RepNotRecovering>();
6809 }
6810
6811 /*---RepRecovering-------*/
6812 PG::RecoveryState::RepRecovering::RepRecovering(my_context ctx)
6813   : my_base(ctx),
6814     NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive/RepRecovering")
6815 {
6816   context< RecoveryMachine >().log_enter(state_name);
6817 }
6818
6819 boost::statechart::result
6820 PG::RecoveryState::RepRecovering::react(const BackfillTooFull &)
6821 {
6822   PG *pg = context< RecoveryMachine >().pg;
6823   pg->reject_reservation();
6824   return discard_event();
6825 }
6826
6827 void PG::RecoveryState::RepRecovering::exit()
6828 {
6829   context< RecoveryMachine >().log_exit(state_name, enter_time);
6830   PG *pg = context< RecoveryMachine >().pg;
6831   pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
6832   utime_t dur = ceph_clock_now() - enter_time;
6833   pg->osd->recoverystate_perf->tinc(rs_reprecovering_latency, dur);
6834 }
6835
6836 /*------Activating--------*/
6837 PG::RecoveryState::Activating::Activating(my_context ctx)
6838   : my_base(ctx),
6839     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Activating")
6840 {
6841   context< RecoveryMachine >().log_enter(state_name);
6842 }
6843
6844 void PG::RecoveryState::Activating::exit()
6845 {
6846   context< RecoveryMachine >().log_exit(state_name, enter_time);
6847   PG *pg = context< RecoveryMachine >().pg;
6848   utime_t dur = ceph_clock_now() - enter_time;
6849   pg->osd->recoverystate_perf->tinc(rs_activating_latency, dur);
6850 }
6851
6852 PG::RecoveryState::WaitLocalRecoveryReserved::WaitLocalRecoveryReserved(my_context ctx)
6853   : my_base(ctx),
6854     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitLocalRecoveryReserved")
6855 {
6856   context< RecoveryMachine >().log_enter(state_name);
6857   PG *pg = context< RecoveryMachine >().pg;
6858
6859   // Make sure all nodes that are part of the recovery aren't full
6860   if (!pg->cct->_conf->osd_debug_skip_full_check_in_recovery &&
6861       pg->osd->check_osdmap_full(pg->actingbackfill)) {
6862     post_event(RecoveryTooFull());
6863     return;
6864   }
6865
6866   pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6867   pg->state_set(PG_STATE_RECOVERY_WAIT);
6868   pg->osd->local_reserver.request_reservation(
6869     pg->info.pgid,
6870     new QueuePeeringEvt<LocalRecoveryReserved>(
6871       pg, pg->get_osdmap()->get_epoch(),
6872       LocalRecoveryReserved()),
6873     pg->get_recovery_priority(),
6874     new QueuePeeringEvt<DeferRecovery>(
6875       pg, pg->get_osdmap()->get_epoch(),
6876       DeferRecovery(0.0)));
6877   pg->publish_stats_to_osd();
6878 }
6879
6880 boost::statechart::result
6881 PG::RecoveryState::WaitLocalRecoveryReserved::react(const RecoveryTooFull &evt)
6882 {
6883   PG *pg = context< RecoveryMachine >().pg;
6884   pg->state_set(PG_STATE_RECOVERY_TOOFULL);
6885   pg->schedule_recovery_retry(pg->cct->_conf->osd_recovery_retry_interval);
6886   return transit<NotRecovering>();
6887 }
6888
6889 void PG::RecoveryState::WaitLocalRecoveryReserved::exit()
6890 {
6891   context< RecoveryMachine >().log_exit(state_name, enter_time);
6892   PG *pg = context< RecoveryMachine >().pg;
6893   utime_t dur = ceph_clock_now() - enter_time;
6894   pg->osd->recoverystate_perf->tinc(rs_waitlocalrecoveryreserved_latency, dur);
6895 }
6896
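// Same round-robin pattern as WaitRemoteBackfillReserved, but for recovery:
// post a synthetic RemoteRecoveryReserved to send the first REQUEST, walk
// remote_shards_to_reserve_recovery on each grant, then post
// AllRemotesReserved once every remote shard is reserved.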
6897 PG::RecoveryState::WaitRemoteRecoveryReserved::WaitRemoteRecoveryReserved(my_context ctx)
6898   : my_base(ctx),
6899     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/WaitRemoteRecoveryReserved"),
6900     remote_recovery_reservation_it(context< Active >().remote_shards_to_reserve_recovery.begin())
6901 {
6902   context< RecoveryMachine >().log_enter(state_name);
6903   post_event(RemoteRecoveryReserved());
6904 }
6905
6906 boost::statechart::result
6907 PG::RecoveryState::WaitRemoteRecoveryReserved::react(const RemoteRecoveryReserved &evt) {
6908   PG *pg = context< RecoveryMachine >().pg;
6909
6910   if (remote_recovery_reservation_it != context< Active >().remote_shards_to_reserve_recovery.end()) {
6911     assert(*remote_recovery_reservation_it != pg->pg_whoami);
6912     ConnectionRef con = pg->osd->get_con_osd_cluster(
6913       remote_recovery_reservation_it->osd, pg->get_osdmap()->get_epoch());
6914     if (con) {
6915       pg->osd->send_message_osd_cluster(
6916         new MRecoveryReserve(
6917           MRecoveryReserve::REQUEST,
6918           spg_t(pg->info.pgid.pgid, remote_recovery_reservation_it->shard),
6919           pg->get_osdmap()->get_epoch()),
6920         con.get());
6921     }
6922     ++remote_recovery_reservation_it;
6923   } else {
6924     post_event(AllRemotesReserved());
6925   }
6926   return discard_event();
6927 }
6928
6929 void PG::RecoveryState::WaitRemoteRecoveryReserved::exit()
6930 {
6931   context< RecoveryMachine >().log_exit(state_name, enter_time);
6932   PG *pg = context< RecoveryMachine >().pg;
6933   utime_t dur = ceph_clock_now() - enter_time;
6934   pg->osd->recoverystate_perf->tinc(rs_waitremoterecoveryreserved_latency, dur);
6935 }
6936
6937 PG::RecoveryState::Recovering::Recovering(my_context ctx)
6938   : my_base(ctx),
6939     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovering")
6940 {
6941   context< RecoveryMachine >().log_enter(state_name);
6942
6943   PG *pg = context< RecoveryMachine >().pg;
6944   pg->state_clear(PG_STATE_RECOVERY_WAIT);
6945   pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
6946   pg->state_set(PG_STATE_RECOVERING);
6947   pg->publish_stats_to_osd();
6948   pg->queue_recovery();
6949 }
6950
6951 void PG::RecoveryState::Recovering::release_reservations(bool cancel)
6952 {
6953   PG *pg = context< RecoveryMachine >().pg;
6954   assert(cancel || !pg->pg_log.get_missing().have_missing());
6955
6956   // release remote reservations
6957   for (set<pg_shard_t>::const_iterator i =
6958          context< Active >().remote_shards_to_reserve_recovery.begin();
6959         i != context< Active >().remote_shards_to_reserve_recovery.end();
6960         ++i) {
6961     if (*i == pg->pg_whoami) // skip myself
6962       continue;
6963     ConnectionRef con = pg->osd->get_con_osd_cluster(
6964       i->osd, pg->get_osdmap()->get_epoch());
6965     if (con) {
6966       pg->osd->send_message_osd_cluster(
6967         new MRecoveryReserve(
6968           MRecoveryReserve::RELEASE,
6969           spg_t(pg->info.pgid.pgid, i->shard),
6970           pg->get_osdmap()->get_epoch()),
6971         con.get());
6972     }
6973   }
6974 }
6975
6976 boost::statechart::result
6977 PG::RecoveryState::Recovering::react(const AllReplicasRecovered &evt)
6978 {
6979   PG *pg = context< RecoveryMachine >().pg;
6980   pg->state_clear(PG_STATE_RECOVERING);
6981   pg->state_clear(PG_STATE_FORCED_RECOVERY);
6982   release_reservations();
6983   pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6984   return transit<Recovered>();
6985 }
6986
6987 boost::statechart::result
6988 PG::RecoveryState::Recovering::react(const RequestBackfill &evt)
6989 {
6990   PG *pg = context< RecoveryMachine >().pg;
6991   pg->state_clear(PG_STATE_RECOVERING);
6992   pg->state_clear(PG_STATE_FORCED_RECOVERY);
6993   release_reservations();
6994   pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
6995   return transit<WaitLocalBackfillReserved>();
6996 }
6997
6998 boost::statechart::result
6999 PG::RecoveryState::Recovering::react(const DeferRecovery &evt)
7000 {
7001   PG *pg = context< RecoveryMachine >().pg;
7002   ldout(pg->cct, 10) << "defer recovery, retry delay " << evt.delay << dendl;
7003   pg->state_clear(PG_STATE_RECOVERING);
7004   pg->state_set(PG_STATE_RECOVERY_WAIT);
7005   pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7006   release_reservations(true);
7007   pg->schedule_recovery_retry(evt.delay);
7008   return transit<NotRecovering>();
7009 }
7010
7011 void PG::RecoveryState::Recovering::exit()
7012 {
7013   context< RecoveryMachine >().log_exit(state_name, enter_time);
7014   PG *pg = context< RecoveryMachine >().pg;
7015   utime_t dur = ceph_clock_now() - enter_time;
7016   pg->osd->recoverystate_perf->tinc(rs_recovering_latency, dur);
7017 }
7018
7019 PG::RecoveryState::Recovered::Recovered(my_context ctx)
7020   : my_base(ctx),
7021     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Recovered")
7022 {
7023   pg_shard_t auth_log_shard;
7024
7025   context< RecoveryMachine >().log_enter(state_name);
7026
7027   PG *pg = context< RecoveryMachine >().pg;
7028
7029   assert(!pg->needs_recovery());
7030
7031   // if we finished backfill, all acting are active; recheck if
7032   // DEGRADED | UNDERSIZED is appropriate.
7033   assert(!pg->actingbackfill.empty());
7034   if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <=
7035       pg->actingbackfill.size()) {
7036     pg->state_clear(PG_STATE_DEGRADED);
7037     pg->state_clear(PG_STATE_FORCED_BACKFILL | PG_STATE_FORCED_RECOVERY);
7038     pg->publish_stats_to_osd();
7039   }
7040
7041   // trim pglog on recovered
7042   pg->trim_log();
7043
7044   // adjust acting set?  (e.g. because backfill completed...)
7045   bool history_les_bound = false;
7046   if (pg->acting != pg->up && !pg->choose_acting(auth_log_shard,
7047                                                  true, &history_les_bound))
7048     assert(pg->want_acting.size());
7049
7050   if (context< Active >().all_replicas_activated)
7051     post_event(GoClean());
7052 }
7053
7054 void PG::RecoveryState::Recovered::exit()
7055 {
7056   context< RecoveryMachine >().log_exit(state_name, enter_time);
7057   PG *pg = context< RecoveryMachine >().pg;
7058   utime_t dur = ceph_clock_now() - enter_time;
7059   pg->osd->recoverystate_perf->tinc(rs_recovered_latency, dur);
7060 }
7061
7062 PG::RecoveryState::Clean::Clean(my_context ctx)
7063   : my_base(ctx),
7064     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active/Clean")
7065 {
7066   context< RecoveryMachine >().log_enter(state_name);
7067
7068   PG *pg = context< RecoveryMachine >().pg;
7069
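  // A clean PG has recovered every object, so last_complete must have caught
  // up with last_update; anything else here is a bug.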
7070   if (pg->info.last_complete != pg->info.last_update) {
7071     ceph_abort();
7072   }
7073   pg->finish_recovery(*context< RecoveryMachine >().get_on_safe_context_list());
7074
7075   if (pg->is_active()) {
7076     pg->mark_clean();
7077   }
7078
7079   pg->share_pg_info();
7080   pg->publish_stats_to_osd();
7081   pg->requeue_ops(pg->waiting_for_clean_to_primary_repair);
7082 }
7083
7084 void PG::RecoveryState::Clean::exit()
7085 {
7086   context< RecoveryMachine >().log_exit(state_name, enter_time);
7087   PG *pg = context< RecoveryMachine >().pg;
7088   pg->state_clear(PG_STATE_CLEAN);
7089   utime_t dur = ceph_clock_now() - enter_time;
7090   pg->osd->recoverystate_perf->tinc(rs_clean_latency, dur);
7091 }
7092
7093 template <typename T>
7094 set<pg_shard_t> unique_osd_shard_set(const pg_shard_t & skip, const T &in)
7095 {
7096   set<int> osds_found;
7097   set<pg_shard_t> out;
7098   for (typename T::const_iterator i = in.begin();
7099        i != in.end();
7100        ++i) {
7101     if (*i != skip && !osds_found.count(i->osd)) {
7102       osds_found.insert(i->osd);
7103       out.insert(*i);
7104     }
7105   }
7106   return out;
7107 }
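// Used just below to build Active's reservation target sets: at most one
// shard per OSD (an OSD holding several shards only gets asked once), with
// the local pg_whoami shard skipped.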
7108
7109 /*---------Active---------*/
7110 PG::RecoveryState::Active::Active(my_context ctx)
7111   : my_base(ctx),
7112     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Active"),
7113     remote_shards_to_reserve_recovery(
7114       unique_osd_shard_set(
7115         context< RecoveryMachine >().pg->pg_whoami,
7116         context< RecoveryMachine >().pg->actingbackfill)),
7117     remote_shards_to_reserve_backfill(
7118       unique_osd_shard_set(
7119         context< RecoveryMachine >().pg->pg_whoami,
7120         context< RecoveryMachine >().pg->backfill_targets)),
7121     all_replicas_activated(false)
7122 {
7123   context< RecoveryMachine >().log_enter(state_name);
7124
7125   PG *pg = context< RecoveryMachine >().pg;
7126
7127   assert(!pg->backfill_reserving);
7128   assert(!pg->backfill_reserved);
7129   assert(pg->is_primary());
7130   ldout(pg->cct, 10) << "In Active, about to call activate" << dendl;
7131   pg->start_flush(
7132     context< RecoveryMachine >().get_cur_transaction(),
7133     context< RecoveryMachine >().get_on_applied_context_list(),
7134     context< RecoveryMachine >().get_on_safe_context_list());
7135   pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7136                pg->get_osdmap()->get_epoch(),
7137                *context< RecoveryMachine >().get_on_safe_context_list(),
7138                *context< RecoveryMachine >().get_query_map(),
7139                context< RecoveryMachine >().get_info_map(),
7140                context< RecoveryMachine >().get_recovery_ctx());
7141
7142   // everyone has to commit/ack before we are truly active
7143   pg->blocked_by.clear();
7144   for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7145        p != pg->actingbackfill.end();
7146        ++p) {
7147     if (p->shard != pg->pg_whoami.shard) {
7148       pg->blocked_by.insert(p->shard);
7149     }
7150   }
7151   pg->publish_stats_to_osd();
7152   ldout(pg->cct, 10) << "Activate Finished" << dendl;
7153 }
7154
7155 boost::statechart::result PG::RecoveryState::Active::react(const AdvMap& advmap)
7156 {
7157   PG *pg = context< RecoveryMachine >().pg;
7158   ldout(pg->cct, 10) << "Active advmap" << dendl;
7159   if (!pg->pool.newly_removed_snaps.empty()) {
7160     pg->snap_trimq.union_of(pg->pool.newly_removed_snaps);
7161     ldout(pg->cct, 10) << *pg << " snap_trimq now " << pg->snap_trimq << dendl;
7162     pg->dirty_info = true;
7163     pg->dirty_big_info = true;
7164   }
7165
7166   for (size_t i = 0; i < pg->want_acting.size(); i++) {
7167     int osd = pg->want_acting[i];
7168     if (!advmap.osdmap->is_up(osd)) {
7169       pg_shard_t osd_with_shard(osd, shard_id_t(i));
7170       assert(pg->is_acting(osd_with_shard) || pg->is_up(osd_with_shard));
7171     }
7172   }
7173
7174   bool need_publish = false;
7175   /* Check for changes in pool size (if the acting set changed as a result,
7176    * this does not matter) */
7177   if (advmap.lastmap->get_pg_size(pg->info.pgid.pgid) !=
7178       pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid)) {
7179     if (pg->get_osdmap()->get_pg_size(pg->info.pgid.pgid) <= pg->actingset.size()) {
7180       pg->state_clear(PG_STATE_UNDERSIZED);
7181       if (pg->needs_recovery()) {
7182         pg->state_set(PG_STATE_DEGRADED);
7183       } else {
7184         pg->state_clear(PG_STATE_DEGRADED);
7185       }
7186     } else {
7187       pg->state_set(PG_STATE_UNDERSIZED);
7188       pg->state_set(PG_STATE_DEGRADED);
7189     }
7190     need_publish = true; // degraded may have changed
7191   }
7192
7193   // if we haven't reported our PG stats in a long time, do so now.
7194   if (pg->info.stats.reported_epoch + pg->cct->_conf->osd_pg_stat_report_interval_max < advmap.osdmap->get_epoch()) {
7195     ldout(pg->cct, 20) << "reporting stats to osd after " << (advmap.osdmap->get_epoch() - pg->info.stats.reported_epoch)
7196                        << " epochs" << dendl;
7197     need_publish = true;
7198   }
7199
7200   if (need_publish)
7201     pg->publish_stats_to_osd();
7202
7203   return forward_event();
7204 }
7205     
7206 boost::statechart::result PG::RecoveryState::Active::react(const ActMap&)
7207 {
7208   PG *pg = context< RecoveryMachine >().pg;
7209   ldout(pg->cct, 10) << "Active: handling ActMap" << dendl;
7210   assert(pg->is_primary());
7211
7212   if (pg->have_unfound()) {
7213     // object may have become unfound
7214     pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7215   }
7216
7217   if (pg->cct->_conf->osd_check_for_log_corruption)
7218     pg->check_log_for_corruption(pg->osd->store);
7219
7220   uint64_t unfound = pg->missing_loc.num_unfound();
7221   if (unfound > 0 &&
7222       pg->all_unfound_are_queried_or_lost(pg->get_osdmap())) {
7223     if (pg->cct->_conf->osd_auto_mark_unfound_lost) {
7224       pg->osd->clog->error() << pg->info.pgid.pgid << " has " << unfound
7225                             << " objects unfound and apparently lost, would automatically "
7226                             << "mark these objects lost but this feature is not yet implemented "
7227                             << "(osd_auto_mark_unfound_lost)";
7228     } else
7229       pg->osd->clog->error() << pg->info.pgid.pgid << " has "
7230                              << unfound << " objects unfound and apparently lost";
7231   }
7232
7233   if (pg->is_active()) {
7234     ldout(pg->cct, 10) << "Active: kicking snap trim" << dendl;
7235     pg->kick_snap_trim();
7236   }
7237
7238   if (pg->is_peered() &&
7239       !pg->is_clean() &&
7240       !pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOBACKFILL) &&
7241       (!pg->get_osdmap()->test_flag(CEPH_OSDMAP_NOREBALANCE) || pg->is_degraded())) {
7242     pg->queue_recovery();
7243   }
7244   return forward_event();
7245 }
7246
7247 boost::statechart::result PG::RecoveryState::Active::react(const MNotifyRec& notevt)
7248 {
7249   PG *pg = context< RecoveryMachine >().pg;
7250   assert(pg->is_primary());
7251   if (pg->peer_info.count(notevt.from)) {
7252     ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7253                        << ", already have info from that osd, ignoring"
7254                        << dendl;
7255   } else if (pg->peer_purged.count(notevt.from)) {
7256     ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7257                        << ", already purged that peer, ignoring"
7258                        << dendl;
7259   } else {
7260     ldout(pg->cct, 10) << "Active: got notify from " << notevt.from
7261                        << ", calling proc_replica_info and discover_all_missing"
7262                        << dendl;
7263     pg->proc_replica_info(
7264       notevt.from, notevt.notify.info, notevt.notify.epoch_sent);
7265     if (pg->have_unfound()) {
7266       pg->discover_all_missing(*context< RecoveryMachine >().get_query_map());
7267     }
7268   }
7269   return discard_event();
7270 }
7271
7272 boost::statechart::result PG::RecoveryState::Active::react(const MInfoRec& infoevt)
7273 {
7274   PG *pg = context< RecoveryMachine >().pg;
7275   assert(pg->is_primary());
7276
7277   assert(!pg->actingbackfill.empty());
7278   // don't update history (yet) if we are active and primary; the replica
7279   // may be telling us they have activated (and committed) but we can't
7280   // share that until _everyone_ does the same.
7281   if (pg->is_actingbackfill(infoevt.from)) {
7282     ldout(pg->cct, 10) << " peer osd." << infoevt.from
7283                        << " activated and committed" << dendl;
7284     pg->peer_activated.insert(infoevt.from);
7285     pg->blocked_by.erase(infoevt.from.shard);
7286     pg->publish_stats_to_osd();
7287     if (pg->peer_activated.size() == pg->actingbackfill.size()) {
7288       pg->all_activated_and_committed();
7289     }
7290   }
7291   return discard_event();
7292 }
7293
7294 boost::statechart::result PG::RecoveryState::Active::react(const MLogRec& logevt)
7295 {
7296   PG *pg = context< RecoveryMachine >().pg;
7297   ldout(pg->cct, 10) << "searching osd." << logevt.from
7298                      << " log for unfound items" << dendl;
7299   pg->proc_replica_log(
7300     logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
7301   bool got_missing = pg->search_for_missing(
7302     pg->peer_info[logevt.from],
7303     pg->peer_missing[logevt.from],
7304     logevt.from,
7305     context< RecoveryMachine >().get_recovery_ctx());
7306   if (pg->is_peered() &&
7307       got_missing)
7308     pg->queue_recovery();
7309   return discard_event();
7310 }
7311
7312 boost::statechart::result PG::RecoveryState::Active::react(const QueryState& q)
7313 {
7314   PG *pg = context< RecoveryMachine >().pg;
7315
7316   q.f->open_object_section("state");
7317   q.f->dump_string("name", state_name);
7318   q.f->dump_stream("enter_time") << enter_time;
7319
7320   {
7321     q.f->open_array_section("might_have_unfound");
7322     for (set<pg_shard_t>::iterator p = pg->might_have_unfound.begin();
7323          p != pg->might_have_unfound.end();
7324          ++p) {
7325       q.f->open_object_section("osd");
7326       q.f->dump_stream("osd") << *p;
7327       if (pg->peer_missing.count(*p)) {
7328         q.f->dump_string("status", "already probed");
7329       } else if (pg->peer_missing_requested.count(*p)) {
7330         q.f->dump_string("status", "querying");
7331       } else if (!pg->get_osdmap()->is_up(p->osd)) {
7332         q.f->dump_string("status", "osd is down");
7333       } else {
7334         q.f->dump_string("status", "not queried");
7335       }
7336       q.f->close_section();
7337     }
7338     q.f->close_section();
7339   }
7340   {
7341     q.f->open_object_section("recovery_progress");
7342     pg->dump_recovery_info(q.f);
7343     q.f->close_section();
7344   }
7345
7346   {
7347     q.f->open_object_section("scrub");
7348     q.f->dump_stream("scrubber.epoch_start") << pg->scrubber.epoch_start;
7349     q.f->dump_bool("scrubber.active", pg->scrubber.active);
7350     q.f->dump_string("scrubber.state", Scrubber::state_string(pg->scrubber.state));
7351     q.f->dump_stream("scrubber.start") << pg->scrubber.start;
7352     q.f->dump_stream("scrubber.end") << pg->scrubber.end;
7353     q.f->dump_stream("scrubber.subset_last_update") << pg->scrubber.subset_last_update;
7354     q.f->dump_bool("scrubber.deep", pg->scrubber.deep);
7355     q.f->dump_unsigned("scrubber.seed", pg->scrubber.seed);
7356     q.f->dump_int("scrubber.waiting_on", pg->scrubber.waiting_on);
7357     {
7358       q.f->open_array_section("scrubber.waiting_on_whom");
7359       for (set<pg_shard_t>::iterator p = pg->scrubber.waiting_on_whom.begin();
7360            p != pg->scrubber.waiting_on_whom.end();
7361            ++p) {
7362         q.f->dump_stream("shard") << *p;
7363       }
7364       q.f->close_section();
7365     }
7366     q.f->close_section();
7367   }
7368
7369   q.f->close_section();
7370   return forward_event();
7371 }
7372
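// Every shard in actingbackfill has activated and committed: the PG becomes
// ACTIVE (or merely PEERED if the acting set is below the pool's min_size),
// history.last_epoch_started/last_interval_started are brought up to date,
// and ops that were waiting for peering are requeued.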
7373 boost::statechart::result PG::RecoveryState::Active::react(const AllReplicasActivated &evt)
7374 {
7375   PG *pg = context< RecoveryMachine >().pg;
7376   all_replicas_activated = true;
7377
7378   pg->state_clear(PG_STATE_ACTIVATING);
7379   pg->state_clear(PG_STATE_CREATING);
7380   if (pg->acting.size() >= pg->pool.info.min_size) {
7381     pg->state_set(PG_STATE_ACTIVE);
7382   } else {
7383     pg->state_set(PG_STATE_PEERED);
7384   }
7385
7386   // info.last_epoch_started is set during activate()
7387   pg->info.history.last_epoch_started = pg->info.last_epoch_started;
7388   pg->info.history.last_interval_started = pg->info.last_interval_started;
7389   pg->dirty_info = true;
7390
7391   pg->share_pg_info();
7392   pg->publish_stats_to_osd();
7393
7394   pg->check_local();
7395
7396   // waiters
7397   if (pg->flushes_in_progress == 0) {
7398     pg->requeue_ops(pg->waiting_for_peered);
7399   }
7400
7401   pg->on_activate();
7402
7403   return discard_event();
7404 }
7405
7406 void PG::RecoveryState::Active::exit()
7407 {
7408   context< RecoveryMachine >().log_exit(state_name, enter_time);
7409   PG *pg = context< RecoveryMachine >().pg;
7410   pg->osd->local_reserver.cancel_reservation(pg->info.pgid);
7411
7412   pg->blocked_by.clear();
7413   pg->backfill_reserved = false;
7414   pg->backfill_reserving = false;
7415   pg->state_clear(PG_STATE_ACTIVATING);
7416   pg->state_clear(PG_STATE_DEGRADED);
7417   pg->state_clear(PG_STATE_UNDERSIZED);
7418   pg->state_clear(PG_STATE_BACKFILL_TOOFULL);
7419   pg->state_clear(PG_STATE_BACKFILL_WAIT);
7420   pg->state_clear(PG_STATE_RECOVERY_WAIT);
7421   pg->state_clear(PG_STATE_RECOVERY_TOOFULL);
7422   utime_t dur = ceph_clock_now() - enter_time;
7423   pg->osd->recoverystate_perf->tinc(rs_active_latency, dur);
7424   pg->agent_stop();
7425 }
7426
7427 /*------ReplicaActive-----*/
7428 PG::RecoveryState::ReplicaActive::ReplicaActive(my_context ctx) 
7429   : my_base(ctx),
7430     NamedState(context< RecoveryMachine >().pg, "Started/ReplicaActive")
7431 {
7432   context< RecoveryMachine >().log_enter(state_name);
7433
7434   PG *pg = context< RecoveryMachine >().pg;
7435   pg->start_flush(
7436     context< RecoveryMachine >().get_cur_transaction(),
7437     context< RecoveryMachine >().get_on_applied_context_list(),
7438     context< RecoveryMachine >().get_on_safe_context_list());
7439 }
7440
7441
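// Activate on a replica: activate() is called with a throwaway local
// query_map and NULL info-map/recovery-context pointers, presumably because a
// replica's activation does not need to generate queries or info messages of
// its own here.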
7442 boost::statechart::result PG::RecoveryState::ReplicaActive::react(
7443   const Activate& actevt) {
7444   PG *pg = context< RecoveryMachine >().pg;
7445   ldout(pg->cct, 10) << "In ReplicaActive, about to call activate" << dendl;
7446   map<int, map<spg_t, pg_query_t> > query_map;
7447   pg->activate(*context< RecoveryMachine >().get_cur_transaction(),
7448                actevt.activation_epoch,
7449                *context< RecoveryMachine >().get_on_safe_context_list(),
7450                query_map, NULL, NULL);
7451   ldout(pg->cct, 10) << "Activate Finished" << dendl;
7452   return discard_event();
7453 }
7454
7455 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MInfoRec& infoevt)
7456 {
7457   PG *pg = context< RecoveryMachine >().pg;
7458   pg->proc_primary_info(*context<RecoveryMachine>().get_cur_transaction(),
7459                         infoevt.info);
7460   return discard_event();
7461 }
7462
7463 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MLogRec& logevt)
7464 {
7465   PG *pg = context< RecoveryMachine >().pg;
7466   ldout(pg->cct, 10) << "received log from " << logevt.from << dendl;
7467   ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7468   pg->merge_log(*t, logevt.msg->info, logevt.msg->log, logevt.from);
7469   assert(pg->pg_log.get_head() == pg->info.last_update);
7470
7471   return discard_event();
7472 }
7473
7474 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const ActMap&)
7475 {
7476   PG *pg = context< RecoveryMachine >().pg;
7477   if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7478     context< RecoveryMachine >().send_notify(
7479       pg->get_primary(),
7480       pg_notify_t(
7481         pg->get_primary().shard, pg->pg_whoami.shard,
7482         pg->get_osdmap()->get_epoch(),
7483         pg->get_osdmap()->get_epoch(),
7484         pg->info),
7485       pg->past_intervals);
7486   }
7487   pg->take_waiters();
7488   return discard_event();
7489 }
7490
7491 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const MQuery& query)
7492 {
7493   PG *pg = context< RecoveryMachine >().pg;
7494   if (query.query.type == pg_query_t::MISSING) {
7495     pg->update_history(query.query.history);
7496     pg->fulfill_log(query.from, query.query, query.query_epoch);
7497   } // else: from prior to activation, safe to ignore
7498   return discard_event();
7499 }
7500
7501 boost::statechart::result PG::RecoveryState::ReplicaActive::react(const QueryState& q)
7502 {
7503   q.f->open_object_section("state");
7504   q.f->dump_string("name", state_name);
7505   q.f->dump_stream("enter_time") << enter_time;
7506   q.f->close_section();
7507   return forward_event();
7508 }
7509
7510 void PG::RecoveryState::ReplicaActive::exit()
7511 {
7512   context< RecoveryMachine >().log_exit(state_name, enter_time);
7513   PG *pg = context< RecoveryMachine >().pg;
7514   pg->osd->remote_reserver.cancel_reservation(pg->info.pgid);
7515   utime_t dur = ceph_clock_now() - enter_time;
7516   pg->osd->recoverystate_perf->tinc(rs_replicaactive_latency, dur);
7517 }
7518
7519 /*-------Stray---*/
7520 PG::RecoveryState::Stray::Stray(my_context ctx) 
7521   : my_base(ctx),
7522     NamedState(context< RecoveryMachine >().pg, "Started/Stray")
7523 {
7524   context< RecoveryMachine >().log_enter(state_name);
7525
7526   PG *pg = context< RecoveryMachine >().pg;
7527   assert(!pg->is_peered());
7528   assert(!pg->is_peering());
7529   assert(!pg->is_primary());
7530   pg->start_flush(
7531     context< RecoveryMachine >().get_cur_transaction(),
7532     context< RecoveryMachine >().get_on_applied_context_list(),
7533     context< RecoveryMachine >().get_on_safe_context_list());
7534 }
7535
7536 boost::statechart::result PG::RecoveryState::Stray::react(const MLogRec& logevt)
7537 {
7538   PG *pg = context< RecoveryMachine >().pg;
7539   MOSDPGLog *msg = logevt.msg.get();
7540   ldout(pg->cct, 10) << "got info+log from osd." << logevt.from << " " << msg->info << " " << msg->log << dendl;
7541
7542   ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
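  // A zero last_backfill from the primary means we are being (re)used as a
  // backfill target from scratch: adopt the primary's info wholesale and
  // claim its log as authoritative rather than merging.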
7543   if (msg->info.last_backfill == hobject_t()) {
7544     // restart backfill
7545     pg->unreg_next_scrub();
7546     pg->info = msg->info;
7547     pg->reg_next_scrub();
7548     pg->dirty_info = true;
7549     pg->dirty_big_info = true;  // maybe overkill, but info was just replaced wholesale
7550
7551     PGLogEntryHandler rollbacker{pg, t};
7552     pg->pg_log.reset_backfill_claim_log(msg->log, &rollbacker);
7553
7554     pg->pg_log.reset_backfill();
7555   } else {
7556     pg->merge_log(*t, msg->info, msg->log, logevt.from);
7557   }
7558
7559   assert(pg->pg_log.get_head() == pg->info.last_update);
7560
7561   post_event(Activate(logevt.msg->info.last_epoch_started));
7562   return transit<ReplicaActive>();
7563 }
7564
7565 boost::statechart::result PG::RecoveryState::Stray::react(const MInfoRec& infoevt)
7566 {
7567   PG *pg = context< RecoveryMachine >().pg;
7568   ldout(pg->cct, 10) << "got info from osd." << infoevt.from << " " << infoevt.info << dendl;
7569
7570   if (pg->info.last_update > infoevt.info.last_update) {
7571     // rewind divergent log entries
7572     ObjectStore::Transaction* t = context<RecoveryMachine>().get_cur_transaction();
7573     pg->rewind_divergent_log(*t, infoevt.info.last_update);
7574     pg->info.stats = infoevt.info.stats;
7575     pg->info.hit_set = infoevt.info.hit_set;
7576   }
7577   
7578   assert(infoevt.info.last_update == pg->info.last_update);
7579   assert(pg->pg_log.get_head() == pg->info.last_update);
7580
7581   post_event(Activate(infoevt.info.last_epoch_started));
7582   return transit<ReplicaActive>();
7583 }
7584
7585 boost::statechart::result PG::RecoveryState::Stray::react(const MQuery& query)
7586 {
7587   PG *pg = context< RecoveryMachine >().pg;
7588   if (query.query.type == pg_query_t::INFO) {
7589     pair<pg_shard_t, pg_info_t> notify_info;
7590     pg->update_history(query.query.history);
7591     pg->fulfill_info(query.from, query.query, notify_info);
7592     context< RecoveryMachine >().send_notify(
7593       notify_info.first,
7594       pg_notify_t(
7595         notify_info.first.shard, pg->pg_whoami.shard,
7596         query.query_epoch,
7597         pg->get_osdmap()->get_epoch(),
7598         notify_info.second),
7599       pg->past_intervals);
7600   } else {
7601     pg->fulfill_log(query.from, query.query, query.query_epoch);
7602   }
7603   return discard_event();
7604 }
7605
7606 boost::statechart::result PG::RecoveryState::Stray::react(const ActMap&)
7607 {
7608   PG *pg = context< RecoveryMachine >().pg;
7609   if (pg->should_send_notify() && pg->get_primary().osd >= 0) {
7610     context< RecoveryMachine >().send_notify(
7611       pg->get_primary(),
7612       pg_notify_t(
7613         pg->get_primary().shard, pg->pg_whoami.shard,
7614         pg->get_osdmap()->get_epoch(),
7615         pg->get_osdmap()->get_epoch(),
7616         pg->info),
7617       pg->past_intervals);
7618   }
7619   pg->take_waiters();
7620   return discard_event();
7621 }
7622
7623 void PG::RecoveryState::Stray::exit()
7624 {
7625   context< RecoveryMachine >().log_exit(state_name, enter_time);
7626   PG *pg = context< RecoveryMachine >().pg;
7627   utime_t dur = ceph_clock_now() - enter_time;
7628   pg->osd->recoverystate_perf->tinc(rs_stray_latency, dur);
7629 }
7630
7631 /*--------GetInfo---------*/
7632 PG::RecoveryState::GetInfo::GetInfo(my_context ctx)
7633   : my_base(ctx),
7634     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetInfo")
7635 {
7636   context< RecoveryMachine >().log_enter(state_name);
7637
7638   PG *pg = context< RecoveryMachine >().pg;
7639   pg->check_past_interval_bounds();
7640   PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7641
7642   assert(pg->blocked_by.empty());
7643
7644   prior_set = pg->build_prior();
7645
7646   pg->reset_min_peer_features();
7647   get_infos();
7648   if (prior_set.pg_down) {
7649     post_event(IsDown());
7650   } else if (peer_info_requested.empty()) {
7651     post_event(GotInfo());
7652   }
7653 }
7654
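// Send an INFO query to every osd in the prior set's probe list that we do
// not already have info from; every peer we are still waiting on is recorded
// both in peer_info_requested and in blocked_by (for reporting).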
7655 void PG::RecoveryState::GetInfo::get_infos()
7656 {
7657   PG *pg = context< RecoveryMachine >().pg;
7658   PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7659
7660   pg->blocked_by.clear();
7661   for (set<pg_shard_t>::const_iterator it = prior_set.probe.begin();
7662        it != prior_set.probe.end();
7663        ++it) {
7664     pg_shard_t peer = *it;
7665     if (peer == pg->pg_whoami) {
7666       continue;
7667     }
7668     if (pg->peer_info.count(peer)) {
7669       ldout(pg->cct, 10) << " have osd." << peer << " info " << pg->peer_info[peer] << dendl;
7670       continue;
7671     }
7672     if (peer_info_requested.count(peer)) {
7673       ldout(pg->cct, 10) << " already requested info from osd." << peer << dendl;
7674       pg->blocked_by.insert(peer.osd);
7675     } else if (!pg->get_osdmap()->is_up(peer.osd)) {
7676       ldout(pg->cct, 10) << " not querying info from down osd." << peer << dendl;
7677     } else {
7678       ldout(pg->cct, 10) << " querying info from osd." << peer << dendl;
7679       context< RecoveryMachine >().send_query(
7680         peer, pg_query_t(pg_query_t::INFO,
7681                          it->shard, pg->pg_whoami.shard,
7682                          pg->info.history,
7683                          pg->get_osdmap()->get_epoch()));
7684       peer_info_requested.insert(peer);
7685       pg->blocked_by.insert(peer.osd);
7686     }
7687   }
7688
7689   pg->publish_stats_to_osd();
7690 }
7691
7692 boost::statechart::result PG::RecoveryState::GetInfo::react(const MNotifyRec& infoevt) 
7693 {
7694   PG *pg = context< RecoveryMachine >().pg;
7695
7696   set<pg_shard_t>::iterator p = peer_info_requested.find(infoevt.from);
7697   if (p != peer_info_requested.end()) {
7698     peer_info_requested.erase(p);
7699     pg->blocked_by.erase(infoevt.from.osd);
7700   }
7701
7702   epoch_t old_start = pg->info.history.last_epoch_started;
7703   if (pg->proc_replica_info(
7704         infoevt.from, infoevt.notify.info, infoevt.notify.epoch_sent)) {
7705     // we got something new ...
7706     PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
7707     if (old_start < pg->info.history.last_epoch_started) {
7708       ldout(pg->cct, 10) << " last_epoch_started moved forward, rebuilding prior" << dendl;
7709       prior_set = pg->build_prior();
7710
7711       // filter out any osds that got dropped from the probe set from
7712       // peer_info_requested.  this is less expensive than restarting
7713       // peering (which would re-probe everyone).
7714       set<pg_shard_t>::iterator p = peer_info_requested.begin();
7715       while (p != peer_info_requested.end()) {
7716         if (prior_set.probe.count(*p) == 0) {
7717           ldout(pg->cct, 20) << " dropping osd." << *p << " from info_requested, no longer in probe set" << dendl;
7718           peer_info_requested.erase(p++);
7719         } else {
7720           ++p;
7721         }
7722       }
7723       get_infos();
7724     }
7725     ldout(pg->cct, 20) << "Adding osd: " << infoevt.from.osd << " peer features: "
7726                        << hex << infoevt.features << dec << dendl;
7727     pg->apply_peer_features(infoevt.features);
7728
7729     // are we done getting everything?
7730     if (peer_info_requested.empty() && !prior_set.pg_down) {
7731       ldout(pg->cct, 20) << "Common peer features: " << hex << pg->get_min_peer_features() << dec << dendl;
7732       ldout(pg->cct, 20) << "Common acting features: " << hex << pg->get_min_acting_features() << dec << dendl;
7733       ldout(pg->cct, 20) << "Common upacting features: " << hex << pg->get_min_upacting_features() << dec << dendl;
7734       post_event(GotInfo());
7735     }
7736   }
7737   return discard_event();
7738 }
7739
7740 boost::statechart::result PG::RecoveryState::GetInfo::react(const QueryState& q)
7741 {
7742   PG *pg = context< RecoveryMachine >().pg;
7743   q.f->open_object_section("state");
7744   q.f->dump_string("name", state_name);
7745   q.f->dump_stream("enter_time") << enter_time;
7746
7747   q.f->open_array_section("requested_info_from");
7748   for (set<pg_shard_t>::iterator p = peer_info_requested.begin();
7749        p != peer_info_requested.end();
7750        ++p) {
7751     q.f->open_object_section("osd");
7752     q.f->dump_stream("osd") << *p;
7753     if (pg->peer_info.count(*p)) {
7754       q.f->open_object_section("got_info");
7755       pg->peer_info[*p].dump(q.f);
7756       q.f->close_section();
7757     }
7758     q.f->close_section();
7759   }
7760   q.f->close_section();
7761
7762   q.f->close_section();
7763   return forward_event();
7764 }
7765
7766 void PG::RecoveryState::GetInfo::exit()
7767 {
7768   context< RecoveryMachine >().log_exit(state_name, enter_time);
7769   PG *pg = context< RecoveryMachine >().pg;
7770   utime_t dur = ceph_clock_now() - enter_time;
7771   pg->osd->recoverystate_perf->tinc(rs_getinfo_latency, dur);
7772   pg->blocked_by.clear();
7773   pg->publish_stats_to_osd();
7774 }
7775
7776 /*------GetLog------------*/
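// GetLog: the primary now has everyone's pg_info and picks an authoritative
// log shard via choose_acting().  Depending on the outcome it either waits for
// an acting-set change, declares the PG incomplete, uses its own log, or asks
// the chosen shard for the log segment needed to reconcile the known peers.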
7777 PG::RecoveryState::GetLog::GetLog(my_context ctx)
7778   : my_base(ctx),
7779     NamedState(
7780       context< RecoveryMachine >().pg, "Started/Primary/Peering/GetLog"),
7781     msg(0)
7782 {
7783   context< RecoveryMachine >().log_enter(state_name);
7784
7785   PG *pg = context< RecoveryMachine >().pg;
7786
7787   // adjust acting?
7788   if (!pg->choose_acting(auth_log_shard, false,
7789                          &context< Peering >().history_les_bound)) {
7790     if (!pg->want_acting.empty()) {
7791       post_event(NeedActingChange());
7792     } else {
7793       post_event(IsIncomplete());
7794     }
7795     return;
7796   }
7797
7798   // am i the best?
7799   if (auth_log_shard == pg->pg_whoami) {
7800     post_event(GotLog());
7801     return;
7802   }
7803
7804   const pg_info_t& best = pg->peer_info[auth_log_shard];
7805
7806   // am i broken?
7807   if (pg->info.last_update < best.log_tail) {
7808     ldout(pg->cct, 10) << " not contiguous with osd." << auth_log_shard << ", down" << dendl;
7809     post_event(IsIncomplete());
7810     return;
7811   }
7812
7813   // how much log to request?
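  // Start from our own last_update and walk request_log_from back to the
  // oldest peer last_update that our own log cannot cover (it is below our
  // log_tail) but that still lies within the auth shard's log, so the one
  // request can also serve those peers later.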
7814   eversion_t request_log_from = pg->info.last_update;
7815   assert(!pg->actingbackfill.empty());
7816   for (set<pg_shard_t>::iterator p = pg->actingbackfill.begin();
7817        p != pg->actingbackfill.end();
7818        ++p) {
7819     if (*p == pg->pg_whoami) continue;
7820     pg_info_t& ri = pg->peer_info[*p];
7821     if (ri.last_update < pg->info.log_tail && ri.last_update >= best.log_tail &&
7822         ri.last_update < request_log_from)
7823       request_log_from = ri.last_update;
7824   }
7825
7826   // request it from the auth log shard
7827   ldout(pg->cct, 10) << " requesting log from osd." << auth_log_shard << dendl;
7828   context<RecoveryMachine>().send_query(
7829     auth_log_shard,
7830     pg_query_t(
7831       pg_query_t::LOG,
7832       auth_log_shard.shard, pg->pg_whoami.shard,
7833       request_log_from, pg->info.history,
7834       pg->get_osdmap()->get_epoch()));
7835
7836   assert(pg->blocked_by.empty());
7837   pg->blocked_by.insert(auth_log_shard.osd);
7838   pg->publish_stats_to_osd();
7839 }
7840
7841 boost::statechart::result PG::RecoveryState::GetLog::react(const AdvMap& advmap)
7842 {
7843   PG *pg = context< RecoveryMachine >().pg;
7844   // make sure our log source didn't go down.  we need to check
7845   // explicitly because it may not be part of the prior set, which
7846   // means the Peering state check won't catch it going down.
7847   if (!advmap.osdmap->is_up(auth_log_shard.osd)) {
7848     ldout(pg->cct, 10) << "GetLog: auth_log_shard osd."
7849                        << auth_log_shard.osd << " went down" << dendl;
7850     post_event(advmap);
7851     return transit< Reset >();
7852   }
7853
7854   // let the Peering state do its checks.
7855   return forward_event();
7856 }
7857
7858 boost::statechart::result PG::RecoveryState::GetLog::react(const MLogRec& logevt)
7859 {
7860   PG *pg = context< RecoveryMachine >().pg;
7861   assert(!msg);
7862   if (logevt.from != auth_log_shard) {
7863     ldout(pg->cct, 10) << "GetLog: discarding log from "
7864                        << "non-auth_log_shard osd." << logevt.from << dendl;
7865     return discard_event();
7866   }
7867   ldout(pg->cct, 10) << "GetLog: received master log from osd."
7868                      << logevt.from << dendl;
7869   msg = logevt.msg;
7870   post_event(GotLog());
7871   return discard_event();
7872 }
7873
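// GotLog is posted either because we already hold the authoritative log
// (msg stays null) or because the MLogRec above arrived.  Merge the master
// log if we fetched one, start a flush of prior transactions, and move on to
// GetMissing.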
7874 boost::statechart::result PG::RecoveryState::GetLog::react(const GotLog&)
7875 {
7876   PG *pg = context< RecoveryMachine >().pg;
7877   ldout(pg->cct, 10) << "leaving GetLog" << dendl;
7878   if (msg) {
7879     ldout(pg->cct, 10) << "processing master log" << dendl;
7880     pg->proc_master_log(*context<RecoveryMachine>().get_cur_transaction(),
7881                         msg->info, msg->log, msg->missing,
7882                         auth_log_shard);
7883   }
7884   pg->start_flush(
7885     context< RecoveryMachine >().get_cur_transaction(),
7886     context< RecoveryMachine >().get_on_applied_context_list(),
7887     context< RecoveryMachine >().get_on_safe_context_list());
7888   return transit< GetMissing >();
7889 }
7890
7891 boost::statechart::result PG::RecoveryState::GetLog::react(const QueryState& q)
7892 {
7893   q.f->open_object_section("state");
7894   q.f->dump_string("name", state_name);
7895   q.f->dump_stream("enter_time") << enter_time;
7896   q.f->dump_stream("auth_log_shard") << auth_log_shard;
7897   q.f->close_section();
7898   return forward_event();
7899 }
7900
7901 void PG::RecoveryState::GetLog::exit()
7902 {
7903   context< RecoveryMachine >().log_exit(state_name, enter_time);
7904   PG *pg = context< RecoveryMachine >().pg;
7905   utime_t dur = ceph_clock_now() - enter_time;
7906   pg->osd->recoverystate_perf->tinc(rs_getlog_latency, dur);
7907   pg->blocked_by.clear();
7908   pg->publish_stats_to_osd();
7909 }
7910
7911 /*------WaitActingChange--------*/
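// WaitActingChange: choose_acting() decided it wants a different acting set,
// so we sit here until an osdmap reflecting that change (or a failure of one
// of the wanted OSDs) arrives.  Log/info/notify messages received meanwhile
// are discarded; peering restarts once the acting set actually changes.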
7912 PG::RecoveryState::WaitActingChange::WaitActingChange(my_context ctx)
7913   : my_base(ctx),
7914     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitActingChange")
7915 {
7916   context< RecoveryMachine >().log_enter(state_name);
7917 }
7918
7919 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const AdvMap& advmap)
7920 {
7921   PG *pg = context< RecoveryMachine >().pg;
7922   OSDMapRef osdmap = advmap.osdmap;
7923
7924   ldout(pg->cct, 10) << "verifying no want_acting " << pg->want_acting << " targets went down" << dendl;
7925   for (vector<int>::iterator p = pg->want_acting.begin(); p != pg->want_acting.end(); ++p) {
7926     if (!osdmap->is_up(*p)) {
7927       ldout(pg->cct, 10) << " want_acting target osd." << *p << " went down, resetting" << dendl;
7928       post_event(advmap);
7929       return transit< Reset >();
7930     }
7931   }
7932   return forward_event();
7933 }
7934
7935 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MLogRec& logevt)
7936 {
7937   PG *pg = context< RecoveryMachine >().pg;
7938   ldout(pg->cct, 10) << "In WaitActingChange, ignoring MLogRec" << dendl;
7939   return discard_event();
7940 }
7941
7942 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MInfoRec& evt)
7943 {
7944   PG *pg = context< RecoveryMachine >().pg;
7945   ldout(pg->cct, 10) << "In WaitActingChange, ignoring MInfoRec" << dendl;
7946   return discard_event();
7947 }
7948
7949 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const MNotifyRec& evt)
7950 {
7951   PG *pg = context< RecoveryMachine >().pg;
7952   ldout(pg->cct, 10) << "In WaitActingChange, ignoring MNotifyRec" << dendl;
7953   return discard_event();
7954 }
7955
7956 boost::statechart::result PG::RecoveryState::WaitActingChange::react(const QueryState& q)
7957 {
7958   q.f->open_object_section("state");
7959   q.f->dump_string("name", state_name);
7960   q.f->dump_stream("enter_time") << enter_time;
7961   q.f->dump_string("comment", "waiting for pg acting set to change");
7962   q.f->close_section();
7963   return forward_event();
7964 }
7965
7966 void PG::RecoveryState::WaitActingChange::exit()
7967 {
7968   context< RecoveryMachine >().log_exit(state_name, enter_time);
7969   PG *pg = context< RecoveryMachine >().pg;
7970   utime_t dur = ceph_clock_now() - enter_time;
7971   pg->osd->recoverystate_perf->tinc(rs_waitactingchange_latency, dur);
7972 }
7973
7974 /*------Down--------*/
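// Down: peering cannot make progress because prior-interval OSDs that may
// hold necessary updates are down.  We mark the PG down, record those OSDs
// in blocked_by so operators can see what we are waiting for, and stay here
// until the map changes.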
7975 PG::RecoveryState::Down::Down(my_context ctx)
7976   : my_base(ctx),
7977     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Down")
7978 {
7979   context< RecoveryMachine >().log_enter(state_name);
7980   PG *pg = context< RecoveryMachine >().pg;
7981
7982   pg->state_clear(PG_STATE_PEERING);
7983   pg->state_set(PG_STATE_DOWN);
7984
7985   auto &prior_set = context< Peering >().prior_set;
7986   assert(pg->blocked_by.empty());
7987   pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
7988   pg->publish_stats_to_osd();
7989 }
7990
7991 void PG::RecoveryState::Down::exit()
7992 {
7993   context< RecoveryMachine >().log_exit(state_name, enter_time);
7994   PG *pg = context< RecoveryMachine >().pg;
7995
7996   pg->state_clear(PG_STATE_DOWN);
7997   utime_t dur = ceph_clock_now() - enter_time;
7998   pg->osd->recoverystate_perf->tinc(rs_down_latency, dur);
7999
8000   pg->blocked_by.clear();
8001   pg->publish_stats_to_osd();
8002 }
8003
8004 boost::statechart::result PG::RecoveryState::Down::react(const QueryState& q)
8005 {
8006   q.f->open_object_section("state");
8007   q.f->dump_string("name", state_name);
8008   q.f->dump_stream("enter_time") << enter_time;
8009   q.f->dump_string("comment",
8010                    "not enough up instances of this PG to go active");
8011   q.f->close_section();
8012   return forward_event();
8013 }
8014
8015 /*------Incomplete--------*/
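// Incomplete: no usable authoritative log could be found among the peers we
// can reach.  We can leave this state if a newly reported replica info gives
// us something to work with (retry GetLog) or if the pool's min_size is
// lowered (reset and re-peer).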
8016 PG::RecoveryState::Incomplete::Incomplete(my_context ctx)
8017   : my_base(ctx),
8018     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/Incomplete")
8019 {
8020   context< RecoveryMachine >().log_enter(state_name);
8021   PG *pg = context< RecoveryMachine >().pg;
8022
8023   pg->state_clear(PG_STATE_PEERING);
8024   pg->state_set(PG_STATE_INCOMPLETE);
8025
8026   PastIntervals::PriorSet &prior_set = context< Peering >().prior_set;
8027   assert(pg->blocked_by.empty());
8028   pg->blocked_by.insert(prior_set.down.begin(), prior_set.down.end());
8029   pg->publish_stats_to_osd();
8030 }
8031
8032 boost::statechart::result PG::RecoveryState::Incomplete::react(const AdvMap &advmap) {
8033   PG *pg = context< RecoveryMachine >().pg;
8034   int64_t poolnum = pg->info.pgid.pool();
8035
8036   // Reset if min_size turned smaller than the previous value; the pg might now be able to go active
8037   if (advmap.lastmap->get_pools().find(poolnum)->second.min_size >
8038       advmap.osdmap->get_pools().find(poolnum)->second.min_size) {
8039     post_event(advmap);
8040     return transit< Reset >();
8041   }
8042
8043   return forward_event();
8044 }
8045
8046 boost::statechart::result PG::RecoveryState::Incomplete::react(const MNotifyRec& notevt) {
8047   PG *pg = context< RecoveryMachine >().pg;
8048   ldout(pg->cct, 7) << "handle_pg_notify from osd." << notevt.from << dendl;
8049   if (pg->proc_replica_info(
8050     notevt.from, notevt.notify.info, notevt.notify.epoch_sent)) {
8051     // We got something new, try again!
8052     return transit< GetLog >();
8053   } else {
8054     return discard_event();
8055   }
8056 }
8057
8058 boost::statechart::result PG::RecoveryState::Incomplete::react(
8059   const QueryState& q)
8060 {
8061   q.f->open_object_section("state");
8062   q.f->dump_string("name", state_name);
8063   q.f->dump_stream("enter_time") << enter_time;
8064   q.f->dump_string("comment", "not enough complete instances of this PG");
8065   q.f->close_section();
8066   return forward_event();
8067 }
8068
8069 void PG::RecoveryState::Incomplete::exit()
8070 {
8071   context< RecoveryMachine >().log_exit(state_name, enter_time);
8072   PG *pg = context< RecoveryMachine >().pg;
8073
8074   pg->state_clear(PG_STATE_INCOMPLETE);
8075   utime_t dur = ceph_clock_now() - enter_time;
8076   pg->osd->recoverystate_perf->tinc(rs_incomplete_latency, dur);
8077
8078   pg->blocked_by.clear();
8079   pg->publish_stats_to_osd();
8080 }
8081
8082 /*------GetMissing--------*/
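// GetMissing: with the authoritative log in hand, figure out what each
// acting/backfill peer is missing.  Peers whose missing set can be inferred
// from their info (empty, about to be backfilled, or fully up to date) are
// skipped; everyone else is asked for log+missing since their
// last_epoch_started.  When nothing is outstanding we either wait for an
// up_thru update or post Activate.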
8083 PG::RecoveryState::GetMissing::GetMissing(my_context ctx)
8084   : my_base(ctx),
8085     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/GetMissing")
8086 {
8087   context< RecoveryMachine >().log_enter(state_name);
8088
8089   PG *pg = context< RecoveryMachine >().pg;
8090   assert(!pg->actingbackfill.empty());
8091   eversion_t since;
8092   for (set<pg_shard_t>::iterator i = pg->actingbackfill.begin();
8093        i != pg->actingbackfill.end();
8094        ++i) {
8095     if (*i == pg->get_primary()) continue;
8096     const pg_info_t& pi = pg->peer_info[*i];
8097     // reset this to make sure the pg_missing_t is initialized and
8098     // has the correct semantics even if we don't need to get a
8099     // missing set from a shard. This way later additions due to
8100     // lost+unfound delete work properly.
8101     pg->peer_missing[*i].may_include_deletes = !pg->perform_deletes_during_peering();
8102
8103     if (pi.is_empty())
8104       continue;                                // no pg data, nothing divergent
8105
8106     if (pi.last_update < pg->pg_log.get_tail()) {
8107       ldout(pg->cct, 10) << " osd." << *i << " is not contiguous, will restart backfill" << dendl;
8108       pg->peer_missing[*i].clear();
8109       continue;
8110     }
8111     if (pi.last_backfill == hobject_t()) {
8112       ldout(pg->cct, 10) << " osd." << *i << " will fully backfill; can infer empty missing set" << dendl;
8113       pg->peer_missing[*i].clear();
8114       continue;
8115     }
8116
8117     if (pi.last_update == pi.last_complete &&  // peer has no missing
8118         pi.last_update == pg->info.last_update) {  // peer is up to date
8119       // replica has no missing and identical log as us.  no need to
8120       // pull anything.
8121       // FIXME: we can do better here.  if last_update==last_complete we
8122       //        can infer the rest!
8123       ldout(pg->cct, 10) << " osd." << *i << " has no missing, identical log" << dendl;
8124       pg->peer_missing[*i].clear();
8125       continue;
8126     }
8127
8128     // We pull the log from the peer's last_epoch_started to ensure we
8129     // get enough log to detect divergent updates.
8130     since.epoch = pi.last_epoch_started;
8131     assert(pi.last_update >= pg->info.log_tail);  // or else choose_acting() did a bad thing
8132     if (pi.log_tail <= since) {
8133       ldout(pg->cct, 10) << " requesting log+missing since " << since << " from osd." << *i << dendl;
8134       context< RecoveryMachine >().send_query(
8135         *i,
8136         pg_query_t(
8137           pg_query_t::LOG,
8138           i->shard, pg->pg_whoami.shard,
8139           since, pg->info.history,
8140           pg->get_osdmap()->get_epoch()));
8141     } else {
8142       ldout(pg->cct, 10) << " requesting fulllog+missing from osd." << *i
8143                          << " (want since " << since << " < log.tail "
8144                          << pi.log_tail << ")" << dendl;
8145       context< RecoveryMachine >().send_query(
8146         *i, pg_query_t(
8147           pg_query_t::FULLLOG,
8148           i->shard, pg->pg_whoami.shard,
8149           pg->info.history, pg->get_osdmap()->get_epoch()));
8150     }
8151     peer_missing_requested.insert(*i);
8152     pg->blocked_by.insert(i->osd);
8153   }
8154
8155   if (peer_missing_requested.empty()) {
8156     if (pg->need_up_thru) {
8157       ldout(pg->cct, 10) << " still need up_thru update before going active"
8158                          << dendl;
8159       post_event(NeedUpThru());
8160       return;
8161     }
8162
8163     // all good!
8164     post_event(Activate(pg->get_osdmap()->get_epoch()));
8165   } else {
8166     pg->publish_stats_to_osd();
8167   }
8168 }
8169
8170 boost::statechart::result PG::RecoveryState::GetMissing::react(const MLogRec& logevt)
8171 {
8172   PG *pg = context< RecoveryMachine >().pg;
8173
8174   peer_missing_requested.erase(logevt.from);
8175   pg->proc_replica_log(logevt.msg->info, logevt.msg->log, logevt.msg->missing, logevt.from);
8176   
8177   if (peer_missing_requested.empty()) {
8178     if (pg->need_up_thru) {
8179       ldout(pg->cct, 10) << " still need up_thru update before going active"
8180                          << dendl;
8181       post_event(NeedUpThru());
8182     } else {
8183       ldout(pg->cct, 10) << "Got last missing, don't need missing; "
8184                          << "posting Activate" << dendl;
8185       post_event(Activate(pg->get_osdmap()->get_epoch()));
8186     }
8187   }
8188   return discard_event();
8189 }
8190
8191 boost::statechart::result PG::RecoveryState::GetMissing::react(const QueryState& q)
8192 {
8193   PG *pg = context< RecoveryMachine >().pg;
8194   q.f->open_object_section("state");
8195   q.f->dump_string("name", state_name);
8196   q.f->dump_stream("enter_time") << enter_time;
8197
8198   q.f->open_array_section("peer_missing_requested");
8199   for (set<pg_shard_t>::iterator p = peer_missing_requested.begin();
8200        p != peer_missing_requested.end();
8201        ++p) {
8202     q.f->open_object_section("osd");
8203     q.f->dump_stream("osd") << *p;
8204     if (pg->peer_missing.count(*p)) {
8205       q.f->open_object_section("got_missing");
8206       pg->peer_missing[*p].dump(q.f);
8207       q.f->close_section();
8208     }
8209     q.f->close_section();
8210   }
8211   q.f->close_section();
8212
8213   q.f->close_section();
8214   return forward_event();
8215 }
8216
8217 void PG::RecoveryState::GetMissing::exit()
8218 {
8219   context< RecoveryMachine >().log_exit(state_name, enter_time);
8220   PG *pg = context< RecoveryMachine >().pg;
8221   utime_t dur = ceph_clock_now() - enter_time;
8222   pg->osd->recoverystate_perf->tinc(rs_getmissing_latency, dur);
8223   pg->blocked_by.clear();
8224   pg->publish_stats_to_osd();
8225 }
8226
8227 /*------WaitUpThru--------*/
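// WaitUpThru: before going active the primary needs the osdmap to record an
// up_thru for this OSD, so that later peering can tell whether this interval
// could have served writes.  Each ActMap re-checks need_up_thru; any
// log/missing replies that straggle in are still recorded.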
8228 PG::RecoveryState::WaitUpThru::WaitUpThru(my_context ctx)
8229   : my_base(ctx),
8230     NamedState(context< RecoveryMachine >().pg, "Started/Primary/Peering/WaitUpThru")
8231 {
8232   context< RecoveryMachine >().log_enter(state_name);
8233 }
8234
8235 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const ActMap& am)
8236 {
8237   PG *pg = context< RecoveryMachine >().pg;
8238   if (!pg->need_up_thru) {
8239     post_event(Activate(pg->get_osdmap()->get_epoch()));
8240   }
8241   return forward_event();
8242 }
8243
8244 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const MLogRec& logevt)
8245 {
8246   PG *pg = context< RecoveryMachine >().pg;
8247   ldout(pg->cct, 10) << "Noting missing from osd." << logevt.from << dendl;
8248   pg->peer_missing[logevt.from].claim(logevt.msg->missing);
8249   pg->peer_info[logevt.from] = logevt.msg->info;
8250   return discard_event();
8251 }
8252
8253 boost::statechart::result PG::RecoveryState::WaitUpThru::react(const QueryState& q)
8254 {
8255   q.f->open_object_section("state");
8256   q.f->dump_string("name", state_name);
8257   q.f->dump_stream("enter_time") << enter_time;
8258   q.f->dump_string("comment", "waiting for osdmap to reflect a new up_thru for this osd");
8259   q.f->close_section();
8260   return forward_event();
8261 }
8262
8263 void PG::RecoveryState::WaitUpThru::exit()
8264 {
8265   context< RecoveryMachine >().log_exit(state_name, enter_time);
8266   PG *pg = context< RecoveryMachine >().pg;
8267   utime_t dur = ceph_clock_now() - enter_time;
8268   pg->osd->recoverystate_perf->tinc(rs_waitupthru_latency, dur);
8269 }
8270
8271 /*----RecoveryState::RecoveryMachine Methods-----*/
8272 #undef dout_prefix
8273 #define dout_prefix *_dout << pg->gen_prefix()
8274
8275 void PG::RecoveryState::RecoveryMachine::log_enter(const char *state_name)
8276 {
8277   PG *pg = context< RecoveryMachine >().pg;
8278   ldout(pg->cct, 5) << "enter " << state_name << dendl;
8279   pg->osd->pg_recovery_stats.log_enter(state_name);
8280 }
8281
8282 void PG::RecoveryState::RecoveryMachine::log_exit(const char *state_name, utime_t enter_time)
8283 {
8284   utime_t dur = ceph_clock_now() - enter_time;
8285   PG *pg = context< RecoveryMachine >().pg;
8286   ldout(pg->cct, 5) << "exit " << state_name << " " << dur << " " << event_count << " " << event_time << dendl;
8287   pg->osd->pg_recovery_stats.log_exit(state_name, ceph_clock_now() - enter_time,
8288                                       event_count, event_time);
8289   event_count = 0;
8290   event_time = utime_t();
8291 }
8292
8293
8294 /*---------------------------------------------------*/
8295 #undef dout_prefix
8296 #define dout_prefix (*_dout << (debug_pg ? debug_pg->gen_prefix() : string()) << " PriorSet: ")
8297
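// start_handle() installs the RecoveryCtx (transaction plus outgoing message
// lists) for the peering event being processed.  begin_block_outgoing() swaps
// in a BufferedRecoveryMessages holder so that notifies/queries/infos are
// parked rather than sent immediately; end_block_outgoing() hands anything
// buffered back to the real context, and end_handle() accounts the event's
// duration into the state machine's counters.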
8298 void PG::RecoveryState::start_handle(RecoveryCtx *new_ctx) {
8299   assert(!rctx);
8300   assert(!orig_ctx);
8301   orig_ctx = new_ctx;
8302   if (new_ctx) {
8303     if (messages_pending_flush) {
8304       rctx = RecoveryCtx(*messages_pending_flush, *new_ctx);
8305     } else {
8306       rctx = *new_ctx;
8307     }
8308     rctx->start_time = ceph_clock_now();
8309   }
8310 }
8311
8312 void PG::RecoveryState::begin_block_outgoing() {
8313   assert(!messages_pending_flush);
8314   assert(orig_ctx);
8315   assert(rctx);
8316   messages_pending_flush = BufferedRecoveryMessages();
8317   rctx = RecoveryCtx(*messages_pending_flush, *orig_ctx);
8318 }
8319
8320 void PG::RecoveryState::clear_blocked_outgoing() {
8321   assert(orig_ctx);
8322   assert(rctx);
8323   messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8324 }
8325
8326 void PG::RecoveryState::end_block_outgoing() {
8327   assert(messages_pending_flush);
8328   assert(orig_ctx);
8329   assert(rctx);
8330
8331   rctx = RecoveryCtx(*orig_ctx);
8332   rctx->accept_buffered_messages(*messages_pending_flush);
8333   messages_pending_flush = boost::optional<BufferedRecoveryMessages>();
8334 }
8335
8336 void PG::RecoveryState::end_handle() {
8337   if (rctx) {
8338     utime_t dur = ceph_clock_now() - rctx->start_time;
8339     machine.event_time += dur;
8340   }
8341
8342   machine.event_count++;
8343   rctx = boost::optional<RecoveryCtx>();
8344   orig_ctx = NULL;
8345 }
8346
8347 ostream& operator<<(ostream& out, const PG::BackfillInterval& bi)
8348 {
8349   out << "BackfillInfo(" << bi.begin << "-" << bi.end
8350       << " " << bi.objects.size() << " objects";
8351   if (!bi.objects.empty())
8352     out << " " << bi.objects;
8353   out << ")";
8354   return out;
8355 }
8356
8357 void intrusive_ptr_add_ref(PG *pg) { pg->get("intptr"); }
8358 void intrusive_ptr_release(PG *pg) { pg->put("intptr"); }
8359
8360 #ifdef PG_DEBUG_REFS
8361   uint64_t get_with_id(PG *pg) { return pg->get_with_id(); }
8362   void put_with_id(PG *pg, uint64_t id) { return pg->put_with_id(id); }
8363 #endif