1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 * Copyright (C) 2017 OVH
9 * This is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License version 2.1, as published by the Free Software
12 * Foundation. See file COPYING.
23 #include <boost/scoped_ptr.hpp>
25 #ifdef HAVE_SYS_PARAM_H
26 #include <sys/param.h>
29 #ifdef HAVE_SYS_MOUNT_H
30 #include <sys/mount.h>
35 #include "include/types.h"
36 #include "include/compat.h"
41 #include "osdc/Objecter.h"
43 #include "common/errno.h"
44 #include "common/ceph_argparse.h"
45 #include "common/ceph_time.h"
46 #include "common/version.h"
47 #include "common/io_priority.h"
48 #include "common/pick_address.h"
50 #include "os/ObjectStore.h"
52 #include "os/FuseStore.h"
55 #include "PrimaryLogPG.h"
58 #include "msg/Messenger.h"
59 #include "msg/Message.h"
61 #include "mon/MonClient.h"
63 #include "messages/MLog.h"
65 #include "messages/MGenericMessage.h"
66 #include "messages/MOSDPing.h"
67 #include "messages/MOSDFailure.h"
68 #include "messages/MOSDMarkMeDown.h"
69 #include "messages/MOSDFull.h"
70 #include "messages/MOSDOp.h"
71 #include "messages/MOSDOpReply.h"
72 #include "messages/MOSDBackoff.h"
73 #include "messages/MOSDBeacon.h"
74 #include "messages/MOSDRepOp.h"
75 #include "messages/MOSDRepOpReply.h"
76 #include "messages/MOSDBoot.h"
77 #include "messages/MOSDPGTemp.h"
79 #include "messages/MOSDMap.h"
80 #include "messages/MMonGetOSDMap.h"
81 #include "messages/MOSDPGNotify.h"
82 #include "messages/MOSDPGQuery.h"
83 #include "messages/MOSDPGLog.h"
84 #include "messages/MOSDPGRemove.h"
85 #include "messages/MOSDPGInfo.h"
86 #include "messages/MOSDPGCreate.h"
87 #include "messages/MOSDPGTrim.h"
88 #include "messages/MOSDPGScan.h"
89 #include "messages/MOSDPGBackfill.h"
90 #include "messages/MBackfillReserve.h"
91 #include "messages/MRecoveryReserve.h"
92 #include "messages/MOSDForceRecovery.h"
93 #include "messages/MOSDECSubOpWrite.h"
94 #include "messages/MOSDECSubOpWriteReply.h"
95 #include "messages/MOSDECSubOpRead.h"
96 #include "messages/MOSDECSubOpReadReply.h"
97 #include "messages/MOSDPGCreated.h"
98 #include "messages/MOSDPGUpdateLogMissing.h"
99 #include "messages/MOSDPGUpdateLogMissingReply.h"
101 #include "messages/MOSDAlive.h"
103 #include "messages/MOSDScrub.h"
104 #include "messages/MOSDScrubReserve.h"
105 #include "messages/MOSDRepScrub.h"
107 #include "messages/MMonCommand.h"
108 #include "messages/MCommand.h"
109 #include "messages/MCommandReply.h"
111 #include "messages/MPGStats.h"
112 #include "messages/MPGStatsAck.h"
114 #include "messages/MWatchNotify.h"
115 #include "messages/MOSDPGPush.h"
116 #include "messages/MOSDPGPushReply.h"
117 #include "messages/MOSDPGPull.h"
119 #include "common/perf_counters.h"
120 #include "common/Timer.h"
121 #include "common/LogClient.h"
122 #include "common/AsyncReserver.h"
123 #include "common/HeartbeatMap.h"
124 #include "common/admin_socket.h"
125 #include "common/ceph_context.h"
127 #include "global/signal_handler.h"
128 #include "global/pidfile.h"
130 #include "include/color.h"
131 #include "perfglue/cpu_profiler.h"
132 #include "perfglue/heap_profiler.h"
134 #include "osd/OpRequest.h"
136 #include "auth/AuthAuthorizeHandler.h"
137 #include "auth/RotatingKeyRing.h"
138 #include "common/errno.h"
140 #include "objclass/objclass.h"
142 #include "common/cmdparse.h"
143 #include "include/str_list.h"
144 #include "include/util.h"
146 #include "include/assert.h"
147 #include "common/config.h"
148 #include "common/EventTrace.h"
151 #define TRACEPOINT_DEFINE
152 #define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
153 #include "tracing/osd.h"
154 #undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
155 #undef TRACEPOINT_DEFINE
157 #define tracepoint(...)
160 #define dout_context cct
161 #define dout_subsys ceph_subsys_osd
163 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
// Period (in seconds) of the OSD's main housekeeping tick timer.
166 const double OSD::OSD_TICK_INTERVAL = 1.0;
// Debug-log prefix helper: every dout() line is prefixed with
// "osd.<whoami> <epoch> " (wired in via the dout_prefix macro above).
// NOTE(review): the closing brace is elided in this line-sampled listing.
168 static ostream& _prefix(std::ostream* _dout, int whoami, epoch_t epoch) {
169 return *_dout << "osd." << whoami << " " << epoch << " ";
172 //Initial features in new superblock.
173 //Features here are also automatically upgraded
// Builds the CompatSet written into a freshly-created OSD superblock.
// Only the "incompat" set is populated here; the plain-compat and
// ro-compat sets are left empty.  Each INCOMPAT flag marks an on-disk
// format change that older OSD code cannot read.
174 CompatSet OSD::get_osd_initial_compat_set() {
175 CompatSet::FeatureSet ceph_osd_feature_compat;
176 CompatSet::FeatureSet ceph_osd_feature_ro_compat;
177 CompatSet::FeatureSet ceph_osd_feature_incompat;
178 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
179 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGINFO);
180 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_OLOC);
181 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEC);
182 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_CATEGORIES);
183 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HOBJECTPOOL);
184 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BIGINFO);
185 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBINFO);
186 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_LEVELDBLOG);
187 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SNAPMAPPER);
188 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_HINTS);
189 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_PGMETA);
190 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_MISSING);
191 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_FASTINFO);
192 ceph_osd_feature_incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES);
193 return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat,
194 ceph_osd_feature_incompat);
197 //Features are added here that this OSD supports.
// Full feature set of this binary: the initial superblock set plus
// features that may be enabled at runtime (here: SHARDS), which are
// deliberately NOT part of a brand-new superblock.
198 CompatSet OSD::get_osd_compat_set() {
199 CompatSet compat = get_osd_initial_compat_set();
200 //Any features here can be set in code, but not in initial superblock
201 compat.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
// OSDService constructor: wires the service object to its owning OSD,
// borrowing the OSD's messengers, log client, perf counters and work
// queues, and initializing the service's own locks, timers, caches and
// reservers.  Members are initializer-list only; locks are named with
// their "OSDService::<name>" string for lockdep identification.
// NOTE(review): this line-sampled listing elides several initializer
// entries (e.g. between lines 213/215, 278/280) — consult the full file.
205 OSDService::OSDService(OSD *osd) :
208 meta_osr(new ObjectStore::Sequencer("meta")),
209 whoami(osd->whoami), store(osd->store),
210 log_client(osd->log_client), clog(osd->clog),
211 pg_recovery_stats(osd->pg_recovery_stats),
212 cluster_messenger(osd->cluster_messenger),
213 client_messenger(osd->client_messenger),
215 recoverystate_perf(osd->recoverystate_perf),
217 peering_wq(osd->peering_wq),
218 recovery_gen_wq("recovery_gen_wq", cct->_conf->osd_recovery_thread_timeout,
220 class_handler(osd->class_handler),
221 pg_epoch_lock("OSDService::pg_epoch_lock"),
222 publish_lock("OSDService::publish_lock"),
223 pre_publish_lock("OSDService::pre_publish_lock"),
225 peer_map_epoch_lock("OSDService::peer_map_epoch_lock"),
226 sched_scrub_lock("OSDService::sched_scrub_lock"), scrubs_pending(0),
228 agent_lock("OSDService::agent_lock"),
229 agent_valid_iterator(false),
231 flush_mode_high_count(0),
234 agent_stop_flag(false),
235 agent_timer_lock("OSDService::agent_timer_lock"),
236 agent_timer(osd->client_messenger->cct, agent_timer_lock),
237 last_recalibrate(ceph_clock_now()),
238 promote_max_objects(0),
239 promote_max_bytes(0),
240 objecter(new Objecter(osd->client_messenger->cct, osd->objecter_messenger, osd->monc, NULL, 0, 0)),
241 objecter_finisher(osd->client_messenger->cct),
242 watch_lock("OSDService::watch_lock"),
243 watch_timer(osd->client_messenger->cct, watch_lock),
245 recovery_request_lock("OSDService::recovery_request_lock"),
246 recovery_request_timer(cct, recovery_request_lock, false),
247 recovery_sleep_lock("OSDService::recovery_sleep_lock"),
248 recovery_sleep_timer(cct, recovery_sleep_lock, false),
249 reserver_finisher(cct),
250 local_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
251 cct->_conf->osd_min_recovery_priority),
252 remote_reserver(cct, &reserver_finisher, cct->_conf->osd_max_backfills,
253 cct->_conf->osd_min_recovery_priority),
254 pg_temp_lock("OSDService::pg_temp_lock"),
255 snap_sleep_lock("OSDService::snap_sleep_lock"),
257 osd->client_messenger->cct, snap_sleep_lock, false /* relax locking */),
258 scrub_sleep_lock("OSDService::scrub_sleep_lock"),
260 osd->client_messenger->cct, scrub_sleep_lock, false /* relax locking */),
261 snap_reserver(cct, &reserver_finisher,
262 cct->_conf->osd_max_trimming_pgs),
263 recovery_lock("OSDService::recovery_lock"),
264 recovery_ops_active(0),
265 recovery_ops_reserved(0),
266 recovery_paused(false),
267 map_cache_lock("OSDService::map_cache_lock"),
268 map_cache(cct, cct->_conf->osd_map_cache_size),
269 map_bl_cache(cct->_conf->osd_map_cache_size),
270 map_bl_inc_cache(cct->_conf->osd_map_cache_size),
271 in_progress_split_lock("OSDService::in_progress_split_lock"),
272 stat_lock("OSDService::stat_lock"),
273 full_status_lock("OSDService::full_status_lock"),
276 epoch_lock("OSDService::epoch_lock"),
277 boot_epoch(0), up_epoch(0), bind_epoch(0),
278 is_stopping_lock("OSDService::is_stopping_lock")
280 , pgid_lock("OSDService::pgid_lock")
// Destructor body elided in this listing.
286 OSDService::~OSDService()
// --- PG reference tracking (debug aid) ---
// pgid_tracker counts live references per spg_t; live_pgs remembers a PG*
// per pgid so dump_live_pgids() can ask each PG to dump its live ids.
// All three functions serialize on pgid_lock.

// Record one more reference to `pgid` (first insert elided in listing).
294 void OSDService::add_pgid(spg_t pgid, PG *pg){
295 Mutex::Locker l(pgid_lock);
296 if (!pgid_tracker.count(pgid)) {
299 pgid_tracker[pgid]++;
// Drop one reference; when the count reaches zero, forget the pgid
// entirely.  Asserts catch unbalanced add/remove pairs.
301 void OSDService::remove_pgid(spg_t pgid, PG *pg)
303 Mutex::Locker l(pgid_lock);
304 assert(pgid_tracker.count(pgid));
305 assert(pgid_tracker[pgid] > 0);
306 pgid_tracker[pgid]--;
307 if (pgid_tracker[pgid] == 0) {
308 pgid_tracker.erase(pgid);
309 live_pgs.erase(pgid);
// Emit every tracked pgid (and its PG's live ids) to the error log.
312 void OSDService::dump_live_pgids()
314 Mutex::Locker l(pgid_lock);
315 derr << "live pgids:" << dendl;
316 for (map<spg_t, int>::const_iterator i = pgid_tracker.cbegin();
317 i != pgid_tracker.cend();
319 derr << "\t" << *i << dendl;
320 live_pgs[i->first]->dump_live_ids();
// Register `children` as pending splits of `parent`.  Maintains both the
// forward map (child -> parent, in pending_splits) and the reverse map
// (parent -> children, in rev_pending_splits).  Caller must hold
// in_progress_split_lock (leading underscore convention in this file).
326 void OSDService::_start_split(spg_t parent, const set<spg_t> &children)
328 for (set<spg_t>::const_iterator i = children.begin();
331 dout(10) << __func__ << ": Starting split on pg " << *i
332 << ", parent=" << parent << dendl;
333 assert(!pending_splits.count(*i));
334 assert(!in_progress_splits.count(*i));
335 pending_splits.insert(make_pair(*i, parent));
337 assert(!rev_pending_splits[parent].count(*i));
338 rev_pending_splits[parent].insert(*i);
// Promote `children` of `parent` from the pending-split maps into
// in_progress_splits, removing the parent's reverse entry once empty.
// Takes in_progress_split_lock itself.
342 void OSDService::mark_split_in_progress(spg_t parent, const set<spg_t> &children)
344 Mutex::Locker l(in_progress_split_lock);
345 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
346 assert(piter != rev_pending_splits.end());
347 for (set<spg_t>::const_iterator i = children.begin();
350 assert(piter->second.count(*i));
351 assert(pending_splits.count(*i));
352 assert(!in_progress_splits.count(*i));
353 assert(pending_splits[*i] == parent);
355 pending_splits.erase(*i);
356 piter->second.erase(*i);
357 in_progress_splits.insert(*i);
359 if (piter->second.empty())
360 rev_pending_splits.erase(piter);
// Public wrapper: take the lock and cancel all pending splits under
// `parent` (recursively, since a child may itself be a pending parent).
363 void OSDService::cancel_pending_splits_for_parent(spg_t parent)
365 Mutex::Locker l(in_progress_split_lock);
366 _cancel_pending_splits_for_parent(parent);
// Lock-held worker: erase each pending child of `parent`, recursing so
// grandchildren queued under a cancelled child are cancelled too; then
// drop the parent's reverse-map entry.  No-op if parent has no entry.
369 void OSDService::_cancel_pending_splits_for_parent(spg_t parent)
371 map<spg_t, set<spg_t> >::iterator piter = rev_pending_splits.find(parent);
372 if (piter == rev_pending_splits.end())
375 for (set<spg_t>::iterator i = piter->second.begin();
376 i != piter->second.end();
378 assert(pending_splits.count(*i));
379 assert(!in_progress_splits.count(*i));
380 pending_splits.erase(*i);
381 dout(10) << __func__ << ": Completing split on pg " << *i
382 << " for parent: " << parent << dendl;
383 _cancel_pending_splits_for_parent(*i);
385 rev_pending_splits.erase(piter);
// If pg_num for `pgid`'s pool grew between old_map and new_map such that
// pgid splits, register the split children via _start_split().  If pgid
// is itself beyond the old pg_num (i.e. a new child), just assert it is
// valid in the new map.
388 void OSDService::_maybe_split_pgid(OSDMapRef old_map,
392 assert(old_map->have_pg_pool(pgid.pool()));
393 int old_pgnum = old_map->get_pg_num(pgid.pool());
394 if (pgid.ps() < static_cast<unsigned>(old_pgnum)) {
396 if (pgid.is_split(old_pgnum,
397 new_map->get_pg_num(pgid.pool()), &children)) {
398 _start_split(pgid, children); }
400 assert(pgid.ps() < static_cast<unsigned>(new_map->get_pg_num(pgid.pool())));
// Walk every osdmap epoch in (frommap, tomap] and register splits of
// `pgid` (and, transitively, of its split children) that occurred along
// the way.  The cheap is_split() check on the endpoint maps lets us skip
// the epoch-by-epoch walk entirely when no split happened.
404 void OSDService::init_splits_between(spg_t pgid,
408 // First, check whether we can avoid this potentially expensive check
409 if (tomap->have_pg_pool(pgid.pool()) &&
411 frommap->get_pg_num(pgid.pool()),
412 tomap->get_pg_num(pgid.pool()),
414 // Ok, a split happened, so we need to walk the osdmaps
415 set<spg_t> new_pgs; // pgs to scan on each map
416 new_pgs.insert(pgid);
417 OSDMapRef curmap(get_map(frommap->get_epoch()));
418 for (epoch_t e = frommap->get_epoch() + 1;
419 e <= tomap->get_epoch();
// try_get_map may legitimately fail; handling of a null map is elided
// in this listing.
421 OSDMapRef nextmap(try_get_map(e));
424 set<spg_t> even_newer_pgs; // pgs added in this loop
425 for (set<spg_t>::iterator i = new_pgs.begin(); i != new_pgs.end(); ++i) {
426 set<spg_t> split_pgs;
427 if (i->is_split(curmap->get_pg_num(i->pool()),
428 nextmap->get_pg_num(i->pool()),
430 start_split(*i, split_pgs);
431 even_newer_pgs.insert(split_pgs.begin(), split_pgs.end());
// Children found at this epoch must themselves be scanned at later
// epochs, so fold them into the working set.
434 new_pgs.insert(even_newer_pgs.begin(), even_newer_pgs.end());
437 assert(curmap == tomap); // we must have had both frommap and tomap
// On each new osdmap, revisit both in-progress and pending splits:
// drop entries whose pool vanished, and re-run _maybe_split_pgid so a
// further pg_num increase cascades into additional splits.  Note the
// erase(i++) idiom to erase safely while iterating.
441 void OSDService::expand_pg_num(OSDMapRef old_map,
444 Mutex::Locker l(in_progress_split_lock);
445 for (set<spg_t>::iterator i = in_progress_splits.begin();
446 i != in_progress_splits.end();
448 if (!new_map->have_pg_pool(i->pool())) {
449 in_progress_splits.erase(i++);
451 _maybe_split_pgid(old_map, new_map, *i);
455 for (map<spg_t, spg_t>::iterator i = pending_splits.begin();
456 i != pending_splits.end();
458 if (!new_map->have_pg_pool(i->first.pool())) {
// pending_splits maps child -> parent, so the reverse map is keyed by
// i->second (the parent).
459 rev_pending_splits.erase(i->second);
460 pending_splits.erase(i++);
462 _maybe_split_pgid(old_map, new_map, i->first);
// True while `pgid` is anywhere in the split pipeline (pending or
// in progress).
468 bool OSDService::splitting(spg_t pgid)
470 Mutex::Locker l(in_progress_split_lock);
471 return in_progress_splits.count(pgid) ||
472 pending_splits.count(pgid);
// Mark each pg in `pgs` as done splitting: it must already have moved
// from pending to in-progress (see mark_split_in_progress).
475 void OSDService::complete_split(const set<spg_t> &pgs)
477 Mutex::Locker l(in_progress_split_lock);
478 for (set<spg_t>::const_iterator i = pgs.begin();
481 dout(10) << __func__ << ": Completing split on pg " << *i << dendl;
482 assert(!pending_splits.count(*i));
483 assert(in_progress_splits.count(*i));
484 in_progress_splits.erase(*i);
// Thin forwarders onto the owning OSD.
488 void OSDService::need_heartbeat_peer_update()
490 osd->need_heartbeat_peer_update();
493 void OSDService::pg_stat_queue_enqueue(PG *pg)
495 osd->pg_stat_queue_enqueue(pg);
498 void OSDService::pg_stat_queue_dequeue(PG *pg)
500 osd->pg_stat_queue_dequeue(pg);
// First phase of shutdown: stop the timers that could otherwise fire
// while the rest of teardown proceeds.  Each timer is shut down under
// its own lock (scoped via Mutex::Locker blocks).
503 void OSDService::start_shutdown()
506 Mutex::Locker l(agent_timer_lock);
507 agent_timer.shutdown();
511 Mutex::Locker l(recovery_sleep_lock);
512 recovery_sleep_timer.shutdown();
// Drain and stop the reserver finisher (separate from shutdown() so the
// reservers can be flushed at the right point in OSD teardown).
516 void OSDService::shutdown_reserver()
518 reserver_finisher.wait_for_empty();
519 reserver_finisher.stop();
// Main service teardown: watch timer, objecter + its finisher, the
// recovery-request, snap-sleep and scrub-sleep timers, then release the
// cached osdmap references so the maps can be freed.
522 void OSDService::shutdown()
525 Mutex::Locker l(watch_lock);
526 watch_timer.shutdown();
529 objecter->shutdown();
530 objecter_finisher.wait_for_empty();
531 objecter_finisher.stop();
534 Mutex::Locker l(recovery_request_lock);
535 recovery_request_timer.shutdown();
539 Mutex::Locker l(snap_sleep_lock);
540 snap_sleep_timer.shutdown();
544 Mutex::Locker l(scrub_sleep_lock);
545 scrub_sleep_timer.shutdown();
548 osdmap = OSDMapRef();
549 next_osdmap = OSDMapRef();
// Start the service's helper threads/timers: finishers, objecter,
// sleep timers and the tiering-agent thread.  Optionally defers
// recovery by the configured delay.
552 void OSDService::init()
554 reserver_finisher.start();
555 objecter_finisher.start();
556 objecter->set_client_incarnation(0);
558 // deprioritize objecter in daemonperf output
559 objecter->get_logger()->set_prio_adjust(-3);
563 snap_sleep_timer.init();
564 scrub_sleep_timer.init();
566 agent_thread.create("osd_srv_agent");
568 if (cct->_conf->osd_recovery_delay_start)
569 defer_recovery(cct->_conf->osd_recovery_delay_start);
// Called once the initial osdmap is known: start the objecter on it.
572 void OSDService::final_init()
574 objecter->start(osdmap.get());
// React to a newly published osdmap (body largely elided in this
// listing; the visible condition gates the tiering agent on the
// NOTIERAGENT flag).
577 void OSDService::activate_map()
579 // wake/unwake the tiering agent
582 !osdmap->test_flag(CEPH_OSDMAP_NOTIERAGENT) &&
// Ask the monitor for osdmap epoch `e` (non-continuous subscription).
588 void OSDService::request_osdmap_update(epoch_t e)
590 osd->osdmap_subscribe(e, false);
// Timer callback used by agent_entry(): when it fires, ask the PG to
// re-evaluate its tiering-agent mode (after the configured delay).
593 class AgentTimeoutCB : public Context {
596 explicit AgentTimeoutCB(PGRef _pg) : pg(_pg) {}
597 void finish(int) override {
598 pg->agent_choose_mode_restart();
// Body of the tiering-agent thread (started in init() as
// "osd_srv_agent").  Repeatedly picks the highest-priority tier level
// from agent_queue and runs agent_work() on its PGs, throttled by
// osd_agent_max_ops / osd_agent_max_low_ops; waits on agent_cond when
// idle or out of quota.  Runs under agent_lock (acquisition elided in
// this listing — the Wait() calls imply it is held).
602 void OSDService::agent_entry()
604 dout(10) << __func__ << " start" << dendl;
607 while (!agent_stop_flag) {
608 if (agent_queue.empty()) {
609 dout(20) << __func__ << " empty queue" << dendl;
610 agent_cond.Wait(agent_lock);
// agent_queue is a map keyed by level; rbegin() is the highest level.
613 uint64_t level = agent_queue.rbegin()->first;
614 set<PGRef>& top = agent_queue.rbegin()->second;
616 << " tiers " << agent_queue.size()
617 << ", top is " << level
618 << " with pgs " << top.size()
619 << ", ops " << agent_ops << "/"
620 << cct->_conf->osd_agent_max_ops
621 << (agent_active ? " active" : " NOT ACTIVE")
623 dout(20) << __func__ << " oids " << agent_oids << dendl;
// Remaining op budget; the flush quota drops to the "low" limit when
// no PG is in high-flush mode.
624 int max = cct->_conf->osd_agent_max_ops - agent_ops;
625 int agent_flush_quota = max;
626 if (!flush_mode_high_count)
627 agent_flush_quota = cct->_conf->osd_agent_max_low_ops - agent_ops;
628 if (agent_flush_quota <= 0 || top.empty() || !agent_active) {
629 agent_cond.Wait(agent_lock);
// Round-robin position within the top tier; reset when the set changed
// under us (agent_valid_iterator cleared) or we ran off the end.
633 if (!agent_valid_iterator || agent_queue_pos == top.end()) {
634 agent_queue_pos = top.begin();
635 agent_valid_iterator = true;
637 PGRef pg = *agent_queue_pos;
638 dout(10) << "high_count " << flush_mode_high_count
639 << " agent_ops " << agent_ops
640 << " flush_quota " << agent_flush_quota << dendl;
// If the PG had nothing to do, schedule a delayed retry via
// AgentTimeoutCB instead of spinning.
642 if (!pg->agent_work(max, agent_flush_quota)) {
643 dout(10) << __func__ << " " << pg->get_pgid()
644 << " no agent_work, delay for " << cct->_conf->osd_agent_delay_time
645 << " seconds" << dendl;
647 osd->logger->inc(l_osd_tier_delay);
648 // Queue a timer to call agent_choose_mode for this pg in 5 seconds
649 agent_timer_lock.Lock();
650 Context *cb = new AgentTimeoutCB(pg);
651 agent_timer.add_event_after(cct->_conf->osd_agent_delay_time, cb);
652 agent_timer_lock.Unlock();
657 dout(10) << __func__ << " finish" << dendl;
// Stop the tiering-agent thread.  By the time this runs all agent ops
// must be cancelled and all PGs dequeued; otherwise we abort loudly.
// Sets agent_stop_flag under agent_lock so agent_entry()'s loop exits
// (the wakeup/join steps are elided in this listing).
660 void OSDService::agent_stop()
663 Mutex::Locker l(agent_lock);
665 // By this time all ops should be cancelled
666 assert(agent_ops == 0);
667 // By this time all PGs are shutdown and dequeued
668 if (!agent_queue.empty()) {
669 set<PGRef>& top = agent_queue.rbegin()->second;
670 derr << "agent queue not empty, for example " << (*top.begin())->info.pgid << dendl;
671 assert(0 == "agent queue not empty");
674 agent_stop_flag = true;
680 // -------------------------------------
// Periodically (per OSD tick) recompute promote_probability_millis, the
// per-mille probability that a read triggers a cache-tier promotion, so
// that actual promotions track the configured object/sec and bytes/sec
// targets.  Also sets hard per-interval caps (promote_max_objects/bytes)
// to damp stampedes.
682 void OSDService::promote_throttle_recalibrate()
684 utime_t now = ceph_clock_now();
685 double dur = now - last_recalibrate;
686 last_recalibrate = now;
687 unsigned prob = promote_probability_millis;
689 uint64_t target_obj_sec = cct->_conf->osd_tier_promote_max_objects_sec;
690 uint64_t target_bytes_sec = cct->_conf->osd_tier_promote_max_bytes_sec;
692 unsigned min_prob = 1;
// Sample (and decay) the counters accumulated since the last call.
694 uint64_t attempts, obj, bytes;
695 promote_counter.sample_and_attenuate(&attempts, &obj, &bytes);
696 dout(10) << __func__ << " " << attempts << " attempts, promoted "
697 << obj << " objects and " << pretty_si_t(bytes) << " bytes; target "
698 << target_obj_sec << " obj/sec or "
699 << pretty_si_t(target_bytes_sec) << " bytes/sec"
702 // calculate what the probability *should* be, given the targets
// new_prob's declaration is elided in this listing; po/pb are the
// probabilities (per mille) implied by the object and byte targets.
704 if (attempts && dur > 0) {
705 uint64_t avg_size = 1;
707 avg_size = MAX(bytes / obj, 1);
708 unsigned po = (double)target_obj_sec * dur * 1000.0 / (double)attempts;
709 unsigned pb = (double)target_bytes_sec / (double)avg_size * dur * 1000.0
711 dout(20) << __func__ << " po " << po << " pb " << pb << " avg_size "
712 << avg_size << dendl;
// Respect whichever target(s) are configured; both -> the stricter one.
713 if (target_obj_sec && target_bytes_sec)
714 new_prob = MIN(po, pb);
715 else if (target_obj_sec)
717 else if (target_bytes_sec)
724 dout(20) << __func__ << " new_prob " << new_prob << dendl;
726 // correct for persistent skew between target rate and actual rate, adjust
// actual/ratio declarations elided; ratio compares the measured promote
// rate against the currently configured probability.
729 if (attempts && obj) {
730 actual = obj * 1000 / attempts;
731 ratio = (double)actual / (double)prob;
732 new_prob = (double)new_prob / ratio;
734 new_prob = MAX(new_prob, min_prob);
735 new_prob = MIN(new_prob, 1000);
// Move halfway toward the computed value, clamped to [min_prob, 1000],
// to smooth oscillation.
738 prob = (prob + new_prob) / 2;
739 prob = MAX(prob, min_prob);
740 prob = MIN(prob, 1000);
741 dout(10) << __func__ << " actual " << actual
742 << ", actual/prob ratio " << ratio
743 << ", adjusted new_prob " << new_prob
744 << ", prob " << promote_probability_millis << " -> " << prob
746 promote_probability_millis = prob;
748 // set hard limits for this interval to mitigate stampedes
749 promote_max_objects = target_obj_sec * OSD::OSD_TICK_INTERVAL * 2;
750 promote_max_bytes = target_bytes_sec * OSD::OSD_TICK_INTERVAL * 2;
753 // -------------------------------------
// Configured failsafe-full ratio, normalized to [0,1]: values given as
// percentages (> 1.0) are divided by 100.
755 float OSDService::get_failsafe_full_ratio()
757 float full_ratio = cct->_conf->osd_failsafe_full_ratio;
758 if (full_ratio > 1.0) full_ratio /= 100.0;
// Classify the current utilization `ratio` into NONE / NEARFULL /
// BACKFILLFULL / FULL / FAILSAFE using thresholds from the osdmap
// (monotonically ordered via std::max so each state's threshold is at
// least the previous one's), honor injectfull test overrides, and log
// state transitions.  Serialized on full_status_lock.
762 void OSDService::check_full_status(float ratio)
764 Mutex::Locker l(full_status_lock);
768 // The OSDMap ratios take precendence. So if the failsafe is .95 and
769 // the admin sets the cluster full to .96, the failsafe moves up to .96
770 // too. (Not that having failsafe == full is ideal, but it's better than
771 // dropping writes before the clusters appears full.)
772 OSDMapRef osdmap = get_osdmap();
// Bail out if no usable map yet (early return elided in this listing).
773 if (!osdmap || osdmap->get_epoch() == 0) {
777 float nearfull_ratio = osdmap->get_nearfull_ratio();
778 float backfillfull_ratio = std::max(osdmap->get_backfillfull_ratio(), nearfull_ratio);
779 float full_ratio = std::max(osdmap->get_full_ratio(), backfillfull_ratio);
780 float failsafe_ratio = std::max(get_failsafe_full_ratio(), full_ratio);
782 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
783 // use the failsafe for nearfull and full; the mon isn't using the
784 // flags anyway because we're mid-upgrade.
785 full_ratio = failsafe_ratio;
786 backfillfull_ratio = failsafe_ratio;
787 nearfull_ratio = failsafe_ratio;
788 } else if (full_ratio <= 0 ||
789 backfillfull_ratio <= 0 ||
790 nearfull_ratio <= 0) {
791 derr << __func__ << " full_ratio, backfillfull_ratio or nearfull_ratio is <= 0" << dendl;
792 // use failsafe flag. ick. the monitor did something wrong or the user
793 // did something stupid.
794 full_ratio = failsafe_ratio;
795 backfillfull_ratio = failsafe_ratio;
796 nearfull_ratio = failsafe_ratio;
// new_state declaration elided; injected state takes precedence over
// the measured ratio.
801 if (injectfull_state > NONE && injectfull) {
802 new_state = injectfull_state;
803 inject = "(Injected)";
804 } else if (ratio > failsafe_ratio) {
805 new_state = FAILSAFE;
806 } else if (ratio > full_ratio) {
808 } else if (ratio > backfillfull_ratio) {
809 new_state = BACKFILLFULL;
810 } else if (ratio > nearfull_ratio) {
811 new_state = NEARFULL;
815 dout(20) << __func__ << " cur ratio " << ratio
816 << ". nearfull_ratio " << nearfull_ratio
817 << ". backfillfull_ratio " << backfillfull_ratio
818 << ", full_ratio " << full_ratio
819 << ", failsafe_ratio " << failsafe_ratio
820 << ", new state " << get_full_state_name(new_state)
// Log entry/exit of the failsafe state to the cluster log, then commit
// the new state.
825 if (cur_state != new_state) {
826 dout(10) << __func__ << " " << get_full_state_name(cur_state)
827 << " -> " << get_full_state_name(new_state) << dendl;
828 if (new_state == FAILSAFE) {
829 clog->error() << "full status failsafe engaged, dropping updates, now "
830 << (int)roundf(ratio * 100) << "% full";
831 } else if (cur_state == FAILSAFE) {
832 clog->error() << "full status failsafe disengaged, no longer dropping "
833 << "updates, now " << (int)roundf(ratio * 100) << "% full";
835 cur_state = new_state;
// Decide whether our locally observed fullness disagrees with what the
// osdmap currently records for this OSD (FULL / BACKFILLFULL / NEARFULL
// state bits), i.e. whether we should tell the monitor.  The returned
// comparison logic is partially elided in this listing.
839 bool OSDService::need_fullness_update()
841 OSDMapRef osdmap = get_osdmap();
843 if (osdmap->exists(whoami)) {
844 if (osdmap->get_state(whoami) & CEPH_OSD_FULL) {
846 } else if (osdmap->get_state(whoami) & CEPH_OSD_BACKFILLFULL) {
848 } else if (osdmap->get_state(whoami) & CEPH_OSD_NEARFULL) {
855 else if (is_backfillfull())
857 else if (is_nearfull())
// Shared implementation for the check_*full() predicates: report (and
// describe in `ss`) whether we are at least as full as `type`, honoring
// the injectfull test override.
862 bool OSDService::_check_full(s_names type, ostream &ss) const
864 Mutex::Locker l(full_status_lock);
866 if (injectfull && injectfull_state >= type) {
867 // injectfull is either a count of the number of times to return failsafe full
868 // or if -1 then always return full
871 ss << "Injected " << get_full_state_name(type) << " OSD ("
872 << (injectfull < 0 ? "set" : std::to_string(injectfull)) << ")";
// cur_ratio's computation is elided in this listing.
876 ss << "current usage is " << cur_ratio;
877 return cur_state >= type;
// Convenience wrappers for each fullness level.
880 bool OSDService::check_failsafe_full(ostream &ss) const
882 return _check_full(FAILSAFE, ss);
885 bool OSDService::check_full(ostream &ss) const
887 return _check_full(FULL, ss);
890 bool OSDService::check_backfill_full(ostream &ss) const
892 return _check_full(BACKFILLFULL, ss);
895 bool OSDService::check_nearfull(ostream &ss) const
897 return _check_full(NEARFULL, ss);
// Lock-protected accessors for the cached fullness state.  Note that
// is_full()/is_backfillfull()/is_nearfull() are >= comparisons: a more
// severe state implies all milder ones.
900 bool OSDService::is_failsafe_full() const
902 Mutex::Locker l(full_status_lock);
903 return cur_state == FAILSAFE;
906 bool OSDService::is_full() const
908 Mutex::Locker l(full_status_lock);
909 return cur_state >= FULL;
912 bool OSDService::is_backfillfull() const
914 Mutex::Locker l(full_status_lock);
915 return cur_state >= BACKFILLFULL;
918 bool OSDService::is_nearfull() const
920 Mutex::Locker l(full_status_lock);
921 return cur_state >= NEARFULL;
// Test hook: force a fullness state (`count` handling elided in this
// listing; see _check_full for how injectfull is consumed).
924 void OSDService::set_injectfull(s_names type, int64_t count)
926 Mutex::Locker l(full_status_lock);
927 injectfull_state = type;
// Refresh the cached osd_stat_t from a store statfs result: publish the
// byte counters to perf counters, then (under stat_lock) fill in kb
// totals, heartbeat peers, op-age histogram and PG count.  Returns the
// updated stat (return statement elided in this listing).
931 osd_stat_t OSDService::set_osd_stat(const struct store_statfs_t &stbuf,
932 vector<int>& hb_peers,
935 uint64_t bytes = stbuf.total;
936 uint64_t used = bytes - stbuf.available;
937 uint64_t avail = stbuf.available;
939 osd->logger->set(l_osd_stat_bytes, bytes);
940 osd->logger->set(l_osd_stat_bytes_used, used);
941 osd->logger->set(l_osd_stat_bytes_avail, avail);
944 Mutex::Locker l(stat_lock);
// swap() takes ownership of the caller's hb_peers vector (caller's copy
// is left with our old contents).
945 osd_stat.hb_peers.swap(hb_peers);
946 osd->op_tracker.get_age_ms_histogram(&osd_stat.op_queue_age_hist);
947 osd_stat.kb = bytes >> 10;
948 osd_stat.kb_used = used >> 10;
949 osd_stat.kb_avail = avail >> 10;
950 osd_stat.num_pgs = num_pgs;
// statfs the object store, refresh osd_stat via set_osd_stat(), and
// re-evaluate the fullness state from the resulting used/total ratio.
955 void OSDService::update_osd_stat(vector<int>& hb_peers)
957 // load osd stats first
958 struct store_statfs_t stbuf;
959 int r = osd->store->statfs(&stbuf);
961 derr << "statfs() failed: " << cpp_strerror(r) << dendl;
965 auto new_stat = set_osd_stat(stbuf, hb_peers, osd->get_num_pgs());
966 dout(20) << "update_osd_stat " << new_stat << dendl;
968 float ratio = ((float)new_stat.kb_used) / ((float)new_stat.kb);
969 check_full_status(ratio);
// True if any OSD in `missing_on` is flagged FULL in the current osdmap
// (return statements elided in this listing).
972 bool OSDService::check_osdmap_full(const set<pg_shard_t> &missing_on)
974 OSDMapRef osdmap = get_osdmap();
975 for (auto shard : missing_on) {
976 if (osdmap->get_state(shard.osd) & CEPH_OSD_FULL)
// Send `m` to `peer` over the cluster messenger, but only if the peer is
// still up as of `from_epoch`; shares our map with the peer first.  All
// three functions below reserve next_osdmap (get_nextmap_reserved /
// release_map pairs) so the map cannot be pruned mid-use.
982 void OSDService::send_message_osd_cluster(int peer, Message *m, epoch_t from_epoch)
984 OSDMapRef next_map = get_nextmap_reserved();
985 // service map is always newer/newest
986 assert(from_epoch <= next_map->get_epoch());
// Peer went down (or restarted) after from_epoch: drop the message.
// NOTE(review): the elided branch presumably also releases/frees m.
988 if (next_map->is_down(peer) ||
989 next_map->get_info(peer).up_from > from_epoch) {
991 release_map(next_map);
994 const entity_inst_t& peer_inst = next_map->get_cluster_inst(peer);
995 ConnectionRef peer_con = osd->cluster_messenger->get_connection(peer_inst);
996 share_map_peer(peer, peer_con.get(), next_map);
997 peer_con->send_message(m);
998 release_map(next_map);
// Like above but only returns the cluster connection (null when the
// peer is down as of from_epoch; returns elided in this listing).
1001 ConnectionRef OSDService::get_con_osd_cluster(int peer, epoch_t from_epoch)
1003 OSDMapRef next_map = get_nextmap_reserved();
1004 // service map is always newer/newest
1005 assert(from_epoch <= next_map->get_epoch());
1007 if (next_map->is_down(peer) ||
1008 next_map->get_info(peer).up_from > from_epoch) {
1009 release_map(next_map);
1012 ConnectionRef con = osd->cluster_messenger->get_connection(next_map->get_cluster_inst(peer));
1013 release_map(next_map);
// Heartbeat connections to `peer`: back connection always, front
// connection only when the peer advertises a front address.
1017 pair<ConnectionRef,ConnectionRef> OSDService::get_con_osd_hb(int peer, epoch_t from_epoch)
1019 OSDMapRef next_map = get_nextmap_reserved();
1020 // service map is always newer/newest
1021 assert(from_epoch <= next_map->get_epoch());
1023 pair<ConnectionRef,ConnectionRef> ret;
1024 if (next_map->is_down(peer) ||
1025 next_map->get_info(peer).up_from > from_epoch) {
1026 release_map(next_map);
1029 ret.first = osd->hb_back_client_messenger->get_connection(next_map->get_hb_back_inst(peer));
1030 if (next_map->get_hb_front_addr(peer) != entity_addr_t())
1031 ret.second = osd->hb_front_client_messenger->get_connection(next_map->get_hb_front_inst(peer));
1032 release_map(next_map);
// --- pg_temp request bookkeeping ---
// pg_temp_wanted: mappings we still need to send to the monitor;
// pg_temp_pending: mappings sent but not yet reflected in an osdmap.
// All access serialized on pg_temp_lock.

// Queue a pg_temp mapping unless an identical one is already pending.
1037 void OSDService::queue_want_pg_temp(pg_t pgid, vector<int>& want)
1039 Mutex::Locker l(pg_temp_lock);
1040 map<pg_t,vector<int> >::iterator p = pg_temp_pending.find(pgid);
1041 if (p == pg_temp_pending.end() ||
1042 p->second != want) {
1043 pg_temp_wanted[pgid] = want;
// Forget a pgid entirely (both wanted and pending).
1047 void OSDService::remove_want_pg_temp(pg_t pgid)
1049 Mutex::Locker l(pg_temp_lock);
1050 pg_temp_wanted.erase(pgid);
1051 pg_temp_pending.erase(pgid);
// After a send, move everything from wanted into pending.  Leading
// underscore: caller holds pg_temp_lock.
1054 void OSDService::_sent_pg_temp()
1056 for (map<pg_t,vector<int> >::iterator p = pg_temp_wanted.begin();
1057 p != pg_temp_wanted.end();
1059 pg_temp_pending[p->first] = p->second;
1060 pg_temp_wanted.clear();
// Re-queue everything pending for (re)send, e.g. after a mon reconnect.
1063 void OSDService::requeue_pg_temp()
1065 Mutex::Locker l(pg_temp_lock);
1066 // wanted overrides pending. note that remove_want_pg_temp
1067 // clears the item out of both.
1068 unsigned old_wanted = pg_temp_wanted.size();
1069 unsigned old_pending = pg_temp_pending.size();
// NOTE(review): the merge of wanted into pending before this swap is
// elided in this listing.
1071 pg_temp_wanted.swap(pg_temp_pending);
1072 dout(10) << __func__ << " " << old_wanted << " + " << old_pending << " -> "
1073 << pg_temp_wanted.size() << dendl;
// Send all wanted pg_temp mappings to the monitor in one MOSDPGTemp
// message (followed by _sent_pg_temp(), elided in this listing).
1076 void OSDService::send_pg_temp()
1078 Mutex::Locker l(pg_temp_lock);
1079 if (pg_temp_wanted.empty())
1081 dout(10) << "send_pg_temp " << pg_temp_wanted << dendl;
1082 MOSDPGTemp *m = new MOSDPGTemp(osdmap->get_epoch());
1083 m->pg_temp = pg_temp_wanted;
1084 monc->send_mon_message(m);
// Tell the monitor a PG finished creating; only meaningful (and only
// understood) on luminous+ monitors.
1088 void OSDService::send_pg_created(pg_t pgid)
1090 dout(20) << __func__ << dendl;
1091 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
1092 monc->send_mon_message(new MOSDPGCreated(pgid));
1096 // --------------------------------------
// --- per-peer osdmap-epoch cache (peer_map_epoch, under
// peer_map_epoch_lock): tracks the newest epoch we believe each peer
// OSD has, to avoid resending maps it already knows. ---

// Last known epoch for `peer`; 0 if unknown (return paths elided).
1099 epoch_t OSDService::get_peer_epoch(int peer)
1101 Mutex::Locker l(peer_map_epoch_lock);
1102 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1103 if (p == peer_map_epoch.end())
// Record that `peer` has at least epoch `e`; never moves the cached
// value backwards.  Returns the (possibly newer) recorded epoch.
1108 epoch_t OSDService::note_peer_epoch(int peer, epoch_t e)
1110 Mutex::Locker l(peer_map_epoch_lock);
1111 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1112 if (p != peer_map_epoch.end()) {
1113 if (p->second < e) {
1114 dout(10) << "note_peer_epoch osd." << peer << " has " << e << dendl;
1117 dout(30) << "note_peer_epoch osd." << peer << " has " << p->second << " >= " << e << dendl;
1121 dout(10) << "note_peer_epoch osd." << peer << " now has " << e << dendl;
1122 peer_map_epoch[peer] = e;
// Invalidate the cache entry for `peer`, but only if it isn't already
// newer than `as_of` (e.g. on connection reset at that epoch).
1127 void OSDService::forget_peer_epoch(int peer, epoch_t as_of)
1129 Mutex::Locker l(peer_map_epoch_lock);
1130 map<int,epoch_t>::iterator p = peer_map_epoch.find(peer);
1131 if (p != peer_map_epoch.end()) {
1132 if (p->second <= as_of) {
1133 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1134 << " had " << p->second << dendl;
1135 peer_map_epoch.erase(p);
1137 dout(10) << "forget_peer_epoch osd." << peer << " as_of " << as_of
1138 << " has " << p->second << " - not forgetting" << dendl;
// Decide whether we should push our (newer) osdmap to the peer behind
// `con`.  Clients: share when their session's last_sent_epoch lags our
// map.  Peer OSDs (identified by a genuine cluster/heartbeat back
// address, excluding our own loopback): share when the cached peer
// epoch lags.  Return statements are elided in this listing.
1143 bool OSDService::should_share_map(entity_name_t name, Connection *con,
1144 epoch_t epoch, const OSDMapRef& osdmap,
1145 const epoch_t *sent_epoch_p)
1147 dout(20) << "should_share_map "
1148 << name << " " << con->get_peer_addr()
1149 << " " << epoch << dendl;
1151 // does client have old map?
1152 if (name.is_client()) {
1153 bool message_sendmap = epoch < osdmap->get_epoch();
1154 if (message_sendmap && sent_epoch_p) {
1155 dout(20) << "client session last_sent_epoch: "
1157 << " versus osdmap epoch " << osdmap->get_epoch() << dendl;
1158 if (*sent_epoch_p < osdmap->get_epoch()) {
1160 } // else we don't need to send it out again
1164 if (con->get_messenger() == osd->cluster_messenger &&
1165 con != osd->cluster_messenger->get_loopback_connection() &&
1166 osdmap->is_up(name.num()) &&
1167 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1168 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
// The peer has at least max(cached epoch, epoch from this message).
1170 epoch_t has = MAX(get_peer_epoch(name.num()), epoch);
1173 if (has < osdmap->get_epoch()) {
1174 dout(10) << name << " " << con->get_peer_addr()
1175 << " has old map " << epoch << " < "
1176 << osdmap->get_epoch() << dendl;
// Push an incremental OSDMap update to the sender of a stale message.
// For clients, *sent_epoch_p is advanced to the epoch we send; for peer
// OSDs, the per-peer epoch cache is updated via note_peer_epoch().
// Does nothing when this OSD is not active.
1184 void OSDService::share_map(
1189 epoch_t *sent_epoch_p)
1191 dout(20) << "share_map "
1192 << name << " " << con->get_peer_addr()
1193 << " " << epoch << dendl;
1195 if (!osd->is_active()) {
1196 /*It is safe not to proceed as OSD is not in healthy state*/
// should_share_map applies the client/peer staleness rules above.
1200 bool want_shared = should_share_map(name, con, epoch,
1201 osdmap, sent_epoch_p);
1204 if (name.is_client()) {
1205 dout(10) << name << " has old map " << epoch
1206 << " < " << osdmap->get_epoch() << dendl;
1207 // we know the Session is valid or we wouldn't be sending
1209 *sent_epoch_p = osdmap->get_epoch();
1211 send_incremental_map(epoch, con, osdmap);
1212 } else if (con->get_messenger() == osd->cluster_messenger &&
1213 osdmap->is_up(name.num()) &&
1214 (osdmap->get_cluster_addr(name.num()) == con->get_peer_addr() ||
1215 osdmap->get_hb_back_addr(name.num()) == con->get_peer_addr())) {
1216 dout(10) << name << " " << con->get_peer_addr()
1217 << " has old map " << epoch << " < "
1218 << osdmap->get_epoch() << dendl;
// Record that the peer will now have our current epoch, then send it.
1219 note_peer_epoch(name.num(), osdmap->get_epoch());
1220 send_incremental_map(epoch, con, osdmap);
// Share the given map with a peer OSD over an already-chosen connection.
// If our cached record of the peer's epoch lags the map, send the
// incremental range and bump the cache; if we have no record at all,
// nothing is sent (see trailing comment).
1225 void OSDService::share_map_peer(int peer, Connection *con, OSDMapRef map)
1231 epoch_t pe = get_peer_epoch(peer);
1233 if (pe < map->get_epoch()) {
1234 send_incremental_map(pe, con, map);
1235 note_peer_epoch(peer, map->get_epoch());
1237 dout(20) << "share_map_peer " << con << " already has epoch " << pe << dendl;
1239 dout(20) << "share_map_peer " << con << " don't know epoch, doing nothing" << dendl;
1240 // no idea about peer's epoch.
1241 // ??? send recent ???
// Check (without reserving) whether another scrub reservation would fit
// under osd_max_scrubs, counting both pending and active scrubs.
1246 bool OSDService::can_inc_scrubs_pending()
1248 bool can_inc = false;
1249 Mutex::Locker l(sched_scrub_lock);
1251 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1252 dout(20) << __func__ << " " << scrubs_pending << " -> " << (scrubs_pending+1)
1253 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1256 dout(20) << __func__ << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
// Try to reserve a pending-scrub slot.  Succeeds only while
// pending + active stays below osd_max_scrubs; returns whether the
// reservation was taken.
1262 bool OSDService::inc_scrubs_pending()
1264 bool result = false;
1266 sched_scrub_lock.Lock();
1267 if (scrubs_pending + scrubs_active < cct->_conf->osd_max_scrubs) {
1268 dout(20) << "inc_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending+1)
1269 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
1273 dout(20) << "inc_scrubs_pending " << scrubs_pending << " + " << scrubs_active << " active >= max " << cct->_conf->osd_max_scrubs << dendl;
1275 sched_scrub_lock.Unlock();
// Release a pending-scrub reservation taken by inc_scrubs_pending().
1280 void OSDService::dec_scrubs_pending()
1282 sched_scrub_lock.Lock();
1283 dout(20) << "dec_scrubs_pending " << scrubs_pending << " -> " << (scrubs_pending-1)
1284 << " (max " << cct->_conf->osd_max_scrubs << ", active " << scrubs_active << ")" << dendl;
// The counter must never go negative after the (elided) decrement.
1286 assert(scrubs_pending >= 0);
1287 sched_scrub_lock.Unlock();
// Promote a scrub to active.  When `reserved` is true the slot was
// pre-reserved via inc_scrubs_pending(), so the pending count is moved
// to active; otherwise only the active count grows.
1290 void OSDService::inc_scrubs_active(bool reserved)
1292 sched_scrub_lock.Lock();
// Reserved path: log the pending -> active handoff (increments elided).
1296 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1297 << " (max " << cct->_conf->osd_max_scrubs
1298 << ", pending " << (scrubs_pending+1) << " -> " << scrubs_pending << ")" << dendl;
1299 assert(scrubs_pending >= 0);
1301 dout(20) << "inc_scrubs_active " << (scrubs_active-1) << " -> " << scrubs_active
1302 << " (max " << cct->_conf->osd_max_scrubs
1303 << ", pending " << scrubs_pending << ")" << dendl;
1305 sched_scrub_lock.Unlock();
// Release an active-scrub slot when a scrub finishes.
1308 void OSDService::dec_scrubs_active()
1310 sched_scrub_lock.Lock();
1311 dout(20) << "dec_scrubs_active " << scrubs_active << " -> " << (scrubs_active-1)
1312 << " (max " << cct->_conf->osd_max_scrubs << ", pending " << scrubs_pending << ")" << dendl;
1314 assert(scrubs_active >= 0);
1315 sched_scrub_lock.Unlock();
// Snapshot the boot/up/bind epochs atomically under epoch_lock.
// NOTE(review): the gaps between the assignments suggest elided
// null-guards on the out-pointers (callers may pass nullptr for values
// they do not want) — confirm against the full source.
1318 void OSDService::retrieve_epochs(epoch_t *_boot_epoch, epoch_t *_up_epoch,
1319 epoch_t *_bind_epoch) const
1321 Mutex::Locker l(epoch_lock);
1323 *_boot_epoch = boot_epoch;
1325 *_up_epoch = up_epoch;
1327 *_bind_epoch = bind_epoch;
// Update the boot/up/bind epochs atomically under epoch_lock.  Each
// epoch may only be reset to 0 or move forward — the asserts enforce
// monotonicity.
1330 void OSDService::set_epochs(const epoch_t *_boot_epoch, const epoch_t *_up_epoch,
1331 const epoch_t *_bind_epoch)
1333 Mutex::Locker l(epoch_lock);
1335 assert(*_boot_epoch == 0 || *_boot_epoch >= boot_epoch);
1336 boot_epoch = *_boot_epoch;
1339 assert(*_up_epoch == 0 || *_up_epoch >= up_epoch);
1340 up_epoch = *_up_epoch;
1343 assert(*_bind_epoch == 0 || *_bind_epoch >= bind_epoch);
1344 bind_epoch = *_bind_epoch;
// Begin a clean shutdown.  If the map shows us up, tell the monitor we
// are going down (MOSDMarkMeDown) and wait up to
// osd_mon_shutdown_timeout for the ack (got_stop_ack signals the cond);
// either way we end in STOPPING.  No-op if a stop is already underway.
1348 bool OSDService::prepare_to_stop()
1350 Mutex::Locker l(is_stopping_lock);
1351 if (get_state() != NOT_STOPPING)
1354 OSDMapRef osdmap = get_osdmap();
1355 if (osdmap && osdmap->is_up(whoami)) {
1356 dout(0) << __func__ << " telling mon we are shutting down" << dendl;
1357 set_state(PREPARING_TO_STOP);
1358 monc->send_mon_message(new MOSDMarkMeDown(monc->get_fsid(),
1359 osdmap->get_inst(whoami),
1360 osdmap->get_epoch(),
// Bounded wait for the monitor's mark-me-down ack.
1363 utime_t now = ceph_clock_now();
1365 timeout.set_from_double(now + cct->_conf->osd_mon_shutdown_timeout);
1366 while ((ceph_clock_now() < timeout) &&
1367 (get_state() != STOPPING)) {
1368 is_stopping_cond.WaitUntil(is_stopping_lock, timeout);
1371 dout(0) << __func__ << " starting shutdown" << dendl;
1372 set_state(STOPPING);
// Monitor acknowledged our mark-me-down request: move to STOPPING and
// wake prepare_to_stop(), which is waiting on is_stopping_cond.
1376 void OSDService::got_stop_ack()
1378 Mutex::Locker l(is_stopping_lock);
1379 if (get_state() == PREPARING_TO_STOP) {
1380 dout(0) << __func__ << " starting shutdown" << dendl;
1381 set_state(STOPPING);
1382 is_stopping_cond.Signal();
1384 dout(10) << __func__ << " ignoring msg" << dendl;
// Build an MOSDMap carrying epochs (since, to].  For each epoch, prefer
// the incremental map blob; fall back to the full map; complain loudly
// if neither is available on disk.
1388 MOSDMap *OSDService::build_incremental_map_msg(epoch_t since, epoch_t to,
1389 OSDSuperblock& sblock)
1391 MOSDMap *m = new MOSDMap(monc->get_fsid());
1392 m->oldest_map = max_oldest_map;
1393 m->newest_map = sblock.newest_map;
// Walk newest-to-oldest so we can stop at since.
1395 for (epoch_t e = to; e > since; e--) {
1397 if (e > m->oldest_map && get_inc_map_bl(e, bl)) {
1398 m->incremental_maps[e].claim(bl);
1399 } else if (get_map_bl(e, bl)) {
1400 m->maps[e].claim(bl);
1403 derr << "since " << since << " to " << to
1404 << " oldest " << m->oldest_map << " newest " << m->newest_map
// Hand a prepared MOSDMap message off to the given connection
// (the connection takes ownership of the message).
1414 void OSDService::send_map(MOSDMap *m, Connection *con)
1416 con->send_message(m);
// Send maps (since, current] to a connection.  If `since` predates the
// oldest map we still have, fall back to sending just the latest full
// map.  Otherwise the range is clamped by osd_map_share_max_epochs and
// chunked by osd_map_message_max before being built and sent.
1419 void OSDService::send_incremental_map(epoch_t since, Connection *con,
1422 epoch_t to = osdmap->get_epoch();
1423 dout(10) << "send_incremental_map " << since << " -> " << to
1424 << " to " << con << " " << con->get_peer_addr() << dendl;
1428 OSDSuperblock sblock(get_superblock());
1429 if (since < sblock.oldest_map) {
1430 // just send latest full map
1431 MOSDMap *m = new MOSDMap(monc->get_fsid());
1432 m->oldest_map = max_oldest_map;
1433 m->newest_map = sblock.newest_map;
1434 get_map_bl(to, m->maps[to]);
// Too far behind: skip ahead and only share the most recent window.
1439 if (to > since && (int64_t)(to - since) > cct->_conf->osd_map_share_max_epochs) {
1440 dout(10) << " " << (to - since) << " > max " << cct->_conf->osd_map_share_max_epochs
1441 << ", only sending most recent" << dendl;
1442 since = to - cct->_conf->osd_map_share_max_epochs;
// Cap the number of epochs per message.
1445 if (to - since > (epoch_t)cct->_conf->osd_map_message_max)
1446 to = since + cct->_conf->osd_map_message_max;
1447 m = build_incremental_map_msg(since, to, sblock);
// Fetch the encoded full map for epoch e: try the in-memory bufferlist
// cache first, then read the osdmap object from the meta collection.
// Leading underscore: presumably the caller holds map_cache_lock (the
// public get_inc_map_bl below takes it) — TODO confirm.
1452 bool OSDService::_get_map_bl(epoch_t e, bufferlist& bl)
1454 bool found = map_bl_cache.lookup(e, &bl);
1457 logger->inc(l_osd_map_bl_cache_hit);
1461 logger->inc(l_osd_map_bl_cache_miss);
1462 found = store->read(coll_t::meta(),
1463 OSD::get_osdmap_pobject_name(e), 0, 0, bl,
1464 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
// Fetch the encoded incremental map for epoch e, preferring the cache
// and falling back to a store read; a successful disk read is inserted
// into the incremental-map cache.
1471 bool OSDService::get_inc_map_bl(epoch_t e, bufferlist& bl)
1473 Mutex::Locker l(map_cache_lock);
1474 bool found = map_bl_inc_cache.lookup(e, &bl);
1477 logger->inc(l_osd_map_bl_cache_hit);
1481 logger->inc(l_osd_map_bl_cache_miss);
1482 found = store->read(coll_t::meta(),
1483 OSD::get_inc_osdmap_pobject_name(e), 0, 0, bl,
1484 CEPH_OSD_OP_FLAG_FADVISE_WILLNEED) >= 0;
1486 _add_map_inc_bl(e, bl);
// Insert a full-map blob into the cache.  The buffer is made contiguous
// first (rebuild step elided) and accounted to the osd_mapbl mempool.
1493 void OSDService::_add_map_bl(epoch_t e, bufferlist& bl)
1494 dout(10) << "add_map_bl " << e << " " << bl.length() << " bytes" << dendl;
1495 // cache a contiguous buffer
1496 if (bl.get_num_buffers() > 1) {
1498 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1499 map_bl_cache.add(e, bl);
// Incremental-map counterpart of _add_map_bl: make the buffer
// contiguous, account it to the osd_mapbl mempool, and cache it.
1502 void OSDService::_add_map_inc_bl(epoch_t e, bufferlist& bl)
1504 dout(10) << "add_map_inc_bl " << e << " " << bl.length() << " bytes" << dendl;
1505 // cache a contiguous buffer
1506 if (bl.get_num_buffers() > 1) {
1509 bl.try_assign_to_mempool(mempool::mempool_osd_mapbl);
1510 map_bl_inc_cache.add(e, bl);
// Pin an incremental-map blob in the cache so it cannot be evicted
// (released later via clear_map_bl_cache_pins).
1513 void OSDService::pin_map_inc_bl(epoch_t e, bufferlist &bl)
1515 Mutex::Locker l(map_cache_lock);
1516 // cache a contiguous buffer
1517 if (bl.get_num_buffers() > 1) {
1520 map_bl_inc_cache.pin(e, bl);
// Pin a full-map blob in the cache so it cannot be evicted
// (released later via clear_map_bl_cache_pins).
1523 void OSDService::pin_map_bl(epoch_t e, bufferlist &bl)
1525 Mutex::Locker l(map_cache_lock);
1526 // cache a contiguous buffer
1527 if (bl.get_num_buffers() > 1) {
1530 map_bl_cache.pin(e, bl);
// Release pins on both map-blob caches up through epoch e.
1533 void OSDService::clear_map_bl_cache_pins(epoch_t e)
1535 Mutex::Locker l(map_cache_lock);
1536 map_bl_inc_cache.clear_pinned(e);
1537 map_bl_cache.clear_pinned(e);
// Insert a decoded OSDMap into the map cache, optionally deduplicating
// shared structures against a nearby cached epoch (osd_map_dedup).
// Returns the (possibly pre-existing) cached reference for that epoch.
1540 OSDMapRef OSDService::_add_map(OSDMap *o)
1542 epoch_t e = o->get_epoch();
1544 if (cct->_conf->osd_map_dedup) {
1545 // Dedup against an existing map at a nearby epoch
1546 OSDMapRef for_dedup = map_cache.lower_bound(e);
1548 OSDMap::dedup(for_dedup.get(), o);
1552 OSDMapRef l = map_cache.add(e, o, &existed);
// Look up a decoded OSDMap for `epoch`.  On a cache hit, return it; on a
// miss, load the encoded blob from the store, decode it, and cache it
// via _add_map.  Misses below the cache's lower bound are counted
// separately (the blob may already be trimmed).
1559 OSDMapRef OSDService::try_get_map(epoch_t epoch)
1561 Mutex::Locker l(map_cache_lock);
1562 OSDMapRef retval = map_cache.lookup(epoch);
1564 dout(30) << "get_map " << epoch << " -cached" << dendl;
1566 logger->inc(l_osd_map_cache_hit);
1571 logger->inc(l_osd_map_cache_miss);
1572 epoch_t lb = map_cache.cached_key_lower_bound();
1574 dout(30) << "get_map " << epoch << " - miss, below lower bound" << dendl;
1575 logger->inc(l_osd_map_cache_miss_low);
1576 logger->inc(l_osd_map_cache_miss_low_avg, lb - epoch);
// Slow path: read the blob off disk and decode it.
1580 OSDMap *map = new OSDMap;
1582 dout(20) << "get_map " << epoch << " - loading and decoding " << map << dendl;
1584 if (!_get_map_bl(epoch, bl) || bl.length() == 0) {
1585 derr << "failed to load OSD map for epoch " << epoch << ", got " << bl.length() << " bytes" << dendl;
1591 dout(20) << "get_map " << epoch << " - return initial " << map << dendl;
1593 return _add_map(map);
// Convenience overload: reply with an error and no version information.
1599 void OSDService::reply_op_error(OpRequestRef op, int err)
1601 reply_op_error(op, err, eversion_t(), 0);
// Send an MOSDOpReply carrying error `err` (with replied versions v/uv)
// back on the connection the CEPH_MSG_OSD_OP arrived on.  Only the
// ACK/ONDISK flags of the original request are echoed back.
1604 void OSDService::reply_op_error(OpRequestRef op, int err, eversion_t v,
1607 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1608 assert(m->get_type() == CEPH_MSG_OSD_OP);
1610 flags = m->get_flags() & (CEPH_OSD_FLAG_ACK|CEPH_OSD_FLAG_ONDISK);
1612 MOSDOpReply *reply = new MOSDOpReply(m, err, osdmap->get_epoch(), flags,
1614 reply->set_reply_versions(v, uv);
1615 m->get_connection()->send_message(reply);
// Debug aid (gated by osd_debug_misdirected_ops): when an op lands on a
// PG that is not primary for it, either recognize the known EC-split
// race below and drop silently, or log a cluster warning.
1618 void OSDService::handle_misdirected_op(PG *pg, OpRequestRef op)
1620 if (!cct->_conf->osd_debug_misdirected_ops) {
1624 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
1625 assert(m->get_type() == CEPH_MSG_OSD_OP);
1627 assert(m->get_map_epoch() >= pg->info.history.same_primary_since);
1629 if (pg->is_ec_pg()) {
1631 * OSD recomputes op target based on current OSDMap. With an EC pg, we
1632 * can get this result:
1633 * 1) client at map 512 sends an op to osd 3, pg_t 3.9 based on mapping
1634 * [CRUSH_ITEM_NONE, 2, 3]/3
1635 * 2) OSD 3 at map 513 remaps op to osd 3, spg_t 3.9s0 based on mapping
1637 * 3) PG 3.9s0 dequeues the op at epoch 512 and notices that it isn't primary
1639 * 4) client resends and this time PG 3.9s0 having caught up to 513 gets
1642 * We can't compute the op target based on the sending map epoch due to
1643 * splitting. The simplest thing is to detect such cases here and drop
1644 * them without an error (the client will resend anyway).
1646 assert(m->get_map_epoch() <= superblock.newest_map);
1647 OSDMapRef opmap = try_get_map(m->get_map_epoch());
1649 dout(7) << __func__ << ": " << *pg << " no longer have map for "
1650 << m->get_map_epoch() << ", dropping" << dendl;
// Recompute the op's target shard under the sender's map epoch; if the
// primary shard differs from this PG's shard, it is the race above.
1653 pg_t _pgid = m->get_raw_pg();
1655 if ((m->get_flags() & CEPH_OSD_FLAG_PGOP) == 0)
1656 _pgid = opmap->raw_pg_to_pg(_pgid);
1657 if (opmap->get_primary_shard(_pgid, &pgid) &&
1658 pgid.shard != pg->info.pgid.shard) {
1659 dout(7) << __func__ << ": " << *pg << " primary changed since "
1660 << m->get_map_epoch() << ", dropping" << dendl;
// Genuinely misdirected: surface it in the cluster log.
1665 dout(7) << *pg << " misdirected op in " << m->get_map_epoch() << dendl;
1666 clog->warn() << m->get_source_inst() << " misdirected " << m->get_reqid()
1667 << " pg " << m->get_raw_pg()
1668 << " to osd." << whoami
1669 << " not " << pg->acting
1670 << " in e" << m->get_map_epoch() << "/" << osdmap->get_epoch();
// Queue a work item at the back of the sharded op work queue.
1673 void OSDService::enqueue_back(spg_t pgid, PGQueueable qi)
1675 osd->op_shardedwq.queue(make_pair(pgid, qi));
// Queue a work item at the front of the sharded op work queue
// (used for requeues that must run before newly arriving work).
1678 void OSDService::enqueue_front(spg_t pgid, PGQueueable qi)
1680 osd->op_shardedwq.queue_front(make_pair(pgid, qi));
// Hand a PG to the peering work queue.
1683 void OSDService::queue_for_peering(PG *pg)
1685 peering_wq.queue(pg);
// Schedule snapshot trimming for a PG on the sharded op queue, using
// the configured snap-trim cost and priority at the PG's current epoch.
1688 void OSDService::queue_for_snap_trim(PG *pg)
1690 dout(10) << "queueing " << *pg << " for snaptrim" << dendl;
1691 osd->op_shardedwq.queue(
1695 PGSnapTrim(pg->get_osdmap()->get_epoch()),
1696 cct->_conf->osd_snap_trim_cost,
1697 cct->_conf->osd_snap_trim_priority,
1700 pg->get_osdmap()->get_epoch())));
1704 // ====================================================================
1708 #define dout_prefix *_dout
1710 // Commands shared between OSD's console and admin console:
1712 namespace osd_cmds {
1714 int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os);
1716 }} // namespace ceph::osd_cmds
// Initialize a fresh OSD data store: mkfs+mount the ObjectStore, then
// either validate an existing superblock (osd id and cluster fsid must
// match what was provided) or create and persist a new one, flush, and
// finally write the ondisk meta files via write_meta().
1718 int OSD::mkfs(CephContext *cct, ObjectStore *store, const string &dev,
1719 uuid_d fsid, int whoami)
1723 ceph::shared_ptr<ObjectStore::Sequencer> osr(
1724 new ObjectStore::Sequencer("mkfs"));
1729 // if we are fed a uuid for this osd, use it.
1730 store->set_fsid(cct->_conf->osd_uuid);
1732 ret = store->mkfs();
1734 derr << "OSD::mkfs: ObjectStore::mkfs failed with error "
1735 << cpp_strerror(ret) << dendl;
1739 store->set_cache_shards(1); // doesn't matter for mkfs!
1741 ret = store->mount();
1743 derr << "OSD::mkfs: couldn't mount ObjectStore: error "
1744 << cpp_strerror(ret) << dendl;
// Probe for an existing superblock object.
1748 ret = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, sbbl);
1750 /* if we already have superblock, check content of superblock */
1751 dout(0) << " have superblock" << dendl;
1752 bufferlist::iterator p;
1755 if (whoami != sb.whoami) {
1756 derr << "provided osd id " << whoami << " != superblock's " << sb.whoami
1761 if (fsid != sb.cluster_fsid) {
1762 derr << "provided cluster fsid " << fsid
1763 << " != superblock's " << sb.cluster_fsid << dendl;
1768 // create superblock
1769 sb.cluster_fsid = fsid;
1770 sb.osd_fsid = store->get_fsid();
1772 sb.compat_features = get_osd_initial_compat_set();
// Persist the new superblock in the meta collection.
1777 ObjectStore::Transaction t;
1778 t.create_collection(coll_t::meta(), 0);
1779 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
1780 ret = store->apply_transaction(osr.get(), std::move(t));
1782 derr << "OSD::mkfs: error while writing OSD_SUPERBLOCK_GOBJECT: "
1783 << "apply_transaction returned " << cpp_strerror(ret) << dendl;
1788 if (!osr->flush_commit(&waiter)) {
1792 ret = write_meta(cct, store, sb.cluster_fsid, sb.osd_fsid, whoami);
1794 derr << "OSD::mkfs: failed to write fsid file: error "
1795 << cpp_strerror(ret) << dendl;
1806 int OSD::write_meta(CephContext *cct, ObjectStore *store, uuid_d& cluster_fsid, uuid_d& osd_fsid, int whoami)
1811 snprintf(val, sizeof(val), "%s", CEPH_OSD_ONDISK_MAGIC);
1812 r = store->write_meta("magic", val);
1816 snprintf(val, sizeof(val), "%d", whoami);
1817 r = store->write_meta("whoami", val);
1821 cluster_fsid.print(val);
1822 r = store->write_meta("ceph_fsid", val);
1826 string key = cct->_conf->get_val<string>("key");
1827 lderr(cct) << "key " << key << dendl;
1829 r = store->write_meta("osd_key", key);
1834 r = store->write_meta("ready", "ready");
// Read back the ondisk meta files written by write_meta(): magic,
// whoami, cluster fsid, and osd fsid.  A missing "fsid" file leaves
// osd_fsid zeroed rather than failing.
1841 int OSD::peek_meta(ObjectStore *store, std::string& magic,
1842 uuid_d& cluster_fsid, uuid_d& osd_fsid, int& whoami)
1846 int r = store->read_meta("magic", &val);
1851 r = store->read_meta("whoami", &val);
1854 whoami = atoi(val.c_str());
1856 r = store->read_meta("ceph_fsid", &val);
1859 r = cluster_fsid.parse(val.c_str());
1863 r = store->read_meta("fsid", &val);
1865 osd_fsid = uuid_d();
1867 r = osd_fsid.parse(val.c_str());
1877 #define dout_prefix _prefix(_dout, whoami, get_osdmap_epoch())
// OSD constructor: wires up the messengers (client, cluster, objecter,
// four heartbeat), auth registries, thread pools/work queues, locks,
// and the op tracker, then applies the tracker's configured thresholds
// and names the trace endpoint after this osd id.  Initialization-list
// entries for some members are elided in this excerpt.
1881 OSD::OSD(CephContext *cct_, ObjectStore *store_,
1883 Messenger *internal_messenger,
1884 Messenger *external_messenger,
1885 Messenger *hb_client_front,
1886 Messenger *hb_client_back,
1887 Messenger *hb_front_serverm,
1888 Messenger *hb_back_serverm,
1889 Messenger *osdc_messenger,
1891 const std::string &dev, const std::string &jdev) :
1893 osd_lock("OSD::osd_lock"),
1894 tick_timer(cct, osd_lock),
1895 tick_timer_lock("OSD::tick_timer_lock"),
1896 tick_timer_without_osd_lock(cct, tick_timer_lock),
1897 authorize_handler_cluster_registry(new AuthAuthorizeHandlerRegistry(cct,
1898 cct->_conf->auth_supported.empty() ?
1899 cct->_conf->auth_cluster_required :
1900 cct->_conf->auth_supported)),
1901 authorize_handler_service_registry(new AuthAuthorizeHandlerRegistry(cct,
1902 cct->_conf->auth_supported.empty() ?
1903 cct->_conf->auth_service_required :
1904 cct->_conf->auth_supported)),
1905 cluster_messenger(internal_messenger),
1906 client_messenger(external_messenger),
1907 objecter_messenger(osdc_messenger),
1909 mgrc(cct_, client_messenger),
1911 recoverystate_perf(NULL),
1913 log_client(cct, client_messenger, &mc->monmap, LogClient::NO_FLAGS),
1914 clog(log_client.create_channel()),
1916 dev_path(dev), journal_path(jdev),
1917 store_is_rotational(store->is_rotational()),
1918 trace_endpoint("0.0.0.0", 0, "osd"),
1920 osd_compat(get_osd_compat_set()),
1921 peering_tp(cct, "OSD::peering_tp", "tp_peering",
1922 cct->_conf->osd_peering_wq_threads,
1923 "osd_peering_tp_threads"),
1924 osd_op_tp(cct, "OSD::osd_op_tp", "tp_osd_tp",
1925 get_num_op_threads()),
1926 disk_tp(cct, "OSD::disk_tp", "tp_osd_disk", cct->_conf->osd_disk_threads, "osd_disk_threads"),
1927 command_tp(cct, "OSD::command_tp", "tp_osd_cmd", 1),
1928 session_waiting_lock("OSD::session_waiting_lock"),
1929 osdmap_subscribe_lock("OSD::osdmap_subscribe_lock"),
1930 heartbeat_lock("OSD::heartbeat_lock"),
1931 heartbeat_stop(false),
1932 heartbeat_need_update(true),
1933 hb_front_client_messenger(hb_client_front),
1934 hb_back_client_messenger(hb_client_back),
1935 hb_front_server_messenger(hb_front_serverm),
1936 hb_back_server_messenger(hb_back_serverm),
1938 heartbeat_thread(this),
1939 heartbeat_dispatcher(this),
1940 op_tracker(cct, cct->_conf->osd_enable_op_tracker,
1941 cct->_conf->osd_num_op_tracker_shard),
1942 test_ops_hook(NULL),
1943 op_queue(get_io_queue()),
1944 op_prio_cutoff(get_io_prio_cut()),
1946 get_num_op_shards(),
1948 cct->_conf->osd_op_thread_timeout,
1949 cct->_conf->osd_op_thread_suicide_timeout,
1953 cct->_conf->osd_op_thread_timeout,
1954 cct->_conf->osd_op_thread_suicide_timeout,
1956 map_lock("OSD::map_lock"),
1957 pg_map_lock("OSD::pg_map_lock"),
1958 last_pg_create_epoch(0),
1959 mon_report_lock("OSD::mon_report_lock"),
1960 stats_ack_timeout(cct->_conf->osd_mon_ack_timeout),
1962 requested_full_first(0),
1963 requested_full_last(0),
1964 pg_stat_queue_lock("OSD::pg_stat_queue_lock"),
1965 osd_stat_updated(false),
1966 pg_stat_tid(0), pg_stat_tid_flushed(0),
1969 cct->_conf->osd_command_thread_timeout,
1970 cct->_conf->osd_command_thread_suicide_timeout,
1975 cct->_conf->osd_remove_thread_timeout,
1976 cct->_conf->osd_remove_thread_suicide_timeout,
// Constructor body: point the mon client at the public messenger and
// configure op-tracker warning/history thresholds from config.
1980 monc->set_messenger(client_messenger);
1981 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
1982 cct->_conf->osd_op_log_threshold);
1983 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
1984 cct->_conf->osd_op_history_duration);
1985 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
1986 cct->_conf->osd_op_history_slow_op_threshold);
1988 std::stringstream ss;
1989 ss << "osd." << whoami;
1990 trace_endpoint.copy_name(ss.str());
// Destructor body fragment (the ~OSD() signature line is elided from
// this excerpt): unregister perf counters before deleting them, and
// free the auth registries and class handler.
1996 delete authorize_handler_cluster_registry;
1997 delete authorize_handler_service_registry;
1998 delete class_handler;
1999 cct->get_perfcounters_collection()->remove(recoverystate_perf);
2000 cct->get_perfcounters_collection()->remove(logger);
2001 delete recoverystate_perf;
// Forward declaration for the object-class subsystem initializer.
2006 void cls_initialize(ClassHandler *ch);
// Fatal-signal handler: only SIGINT/SIGTERM are expected; log and
// (in the elided remainder) begin shutdown.
2008 void OSD::handle_signal(int signum)
2010 assert(signum == SIGINT || signum == SIGTERM);
2011 derr << "*** Got signal " << sig_str(signum) << " ***" << dendl;
// pre_init() body fragment (the signature line is elided from this
// excerpt): refuse to start if the object store is already mounted by
// another process, then register this OSD as a config observer.
2017 Mutex::Locker lock(osd_lock);
2021 if (store->test_mount_in_use()) {
2022 derr << "OSD::pre_init: object store '" << dev_path << "' is "
2023 << "currently in use. (Is ceph-osd already running?)" << dendl;
2027 cct->_conf->add_observer(this);
// Admin-socket hook that forwards asok commands to OSD::asok_command.
2033 class OSDSocketHook : public AdminSocketHook {
2036 explicit OSDSocketHook(OSD *o) : osd(o) {}
2037 bool call(std::string admin_command, cmdmap_t& cmdmap, std::string format,
2038 bufferlist& out) override {
2040 bool r = osd->asok_command(admin_command, cmdmap, format, ss);
// Dispatcher for the OSD's admin-socket commands.  Each branch formats
// its answer through a Formatter created from `format` (defaulting to
// json-pretty).  Commands cover status, journal/cache flushing, op
// tracker dumps, blacklist/watcher/reservation dumps, heap profiling,
// scrub state, objectstore stats, pg state history, and compaction.
2046 bool OSD::asok_command(string admin_command, cmdmap_t& cmdmap, string format,
2049 Formatter *f = Formatter::create(format, "json-pretty", "json-pretty");
2050 if (admin_command == "status") {
2051 f->open_object_section("status");
2052 f->dump_stream("cluster_fsid") << superblock.cluster_fsid;
2053 f->dump_stream("osd_fsid") << superblock.osd_fsid;
2054 f->dump_unsigned("whoami", superblock.whoami);
2055 f->dump_string("state", get_state_name(get_state()));
2056 f->dump_unsigned("oldest_map", superblock.oldest_map);
2057 f->dump_unsigned("newest_map", superblock.newest_map);
2059 RWLock::RLocker l(pg_map_lock);
2060 f->dump_unsigned("num_pgs", pg_map.size());
2063 } else if (admin_command == "flush_journal") {
2064 store->flush_journal();
2065 } else if (admin_command == "dump_ops_in_flight" ||
2066 admin_command == "ops" ||
2067 admin_command == "dump_blocked_ops" ||
2068 admin_command == "dump_historic_ops" ||
2069 admin_command == "dump_historic_ops_by_duration" ||
2070 admin_command == "dump_historic_slow_ops") {
2072 const string error_str = "op_tracker tracking is not enabled now, so no ops are tracked currently, \
2073 even those get stuck. Please enable \"osd_enable_op_tracker\", and the tracker \
2074 will start to track new ops received afterwards.";
// Optional filter strings narrow which ops are dumped.
2076 set<string> filters;
2077 vector<string> filter_str;
2078 if (cmd_getval(cct, cmdmap, "filterstr", filter_str)) {
2079 copy(filter_str.begin(), filter_str.end(),
2080 inserter(filters, filters.end()));
2083 if (admin_command == "dump_ops_in_flight" ||
2084 admin_command == "ops") {
2085 if (!op_tracker.dump_ops_in_flight(f, false, filters)) {
2089 if (admin_command == "dump_blocked_ops") {
2090 if (!op_tracker.dump_ops_in_flight(f, true, filters)) {
2094 if (admin_command == "dump_historic_ops") {
2095 if (!op_tracker.dump_historic_ops(f, false, filters)) {
2099 if (admin_command == "dump_historic_ops_by_duration") {
2100 if (!op_tracker.dump_historic_ops(f, true, filters)) {
2104 if (admin_command == "dump_historic_slow_ops") {
2105 if (!op_tracker.dump_historic_slow_ops(f, filters)) {
2109 } else if (admin_command == "dump_op_pq_state") {
2110 f->open_object_section("pq");
2111 op_shardedwq.dump(f);
2113 } else if (admin_command == "dump_blacklist") {
2114 list<pair<entity_addr_t,utime_t> > bl;
2115 OSDMapRef curmap = service.get_osdmap();
2117 f->open_array_section("blacklist");
2118 curmap->get_blacklist(&bl);
2119 for (list<pair<entity_addr_t,utime_t> >::iterator it = bl.begin();
2120 it != bl.end(); ++it) {
2121 f->open_object_section("entry");
2122 f->open_object_section("entity_addr_t");
2124 f->close_section(); //entity_addr_t
2125 it->second.localtime(f->dump_stream("expire_time"));
2126 f->close_section(); //entry
2128 f->close_section(); //blacklist
2129 } else if (admin_command == "dump_watchers") {
2130 list<obj_watch_item_t> watchers;
// Collect watchers per PG while holding osd_lock + pg_map_lock, then
// format outside the per-PG loop.
2133 Mutex::Locker l(osd_lock);
2134 RWLock::RLocker l2(pg_map_lock);
2135 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2139 list<obj_watch_item_t> pg_watchers;
2140 PG *pg = it->second;
2142 pg->get_watchers(pg_watchers);
2144 watchers.splice(watchers.end(), pg_watchers);
2148 f->open_array_section("watchers");
2149 for (list<obj_watch_item_t>::iterator it = watchers.begin();
2150 it != watchers.end(); ++it) {
2152 f->open_object_section("watch");
2154 f->dump_string("namespace", it->obj.nspace);
2155 f->dump_string("object", it->obj.oid.name);
2157 f->open_object_section("entity_name");
2158 it->wi.name.dump(f);
2159 f->close_section(); //entity_name_t
2161 f->dump_unsigned("cookie", it->wi.cookie);
2162 f->dump_unsigned("timeout", it->wi.timeout_seconds);
2164 f->open_object_section("entity_addr_t");
2165 it->wi.addr.dump(f);
2166 f->close_section(); //entity_addr_t
2168 f->close_section(); //watch
2171 f->close_section(); //watchers
2172 } else if (admin_command == "dump_reservations") {
2173 f->open_object_section("reservations");
2174 f->open_object_section("local_reservations");
2175 service.local_reserver.dump(f);
2177 f->open_object_section("remote_reservations");
2178 service.remote_reserver.dump(f);
2181 } else if (admin_command == "get_latest_osdmap") {
2182 get_latest_osdmap();
2183 } else if (admin_command == "heap") {
2184 auto result = ceph::osd_cmds::heap(*cct, cmdmap, *f, ss);
2186 // Note: Failed heap profile commands won't necessarily trigger an error:
2187 f->open_object_section("result");
2188 f->dump_string("error", cpp_strerror(result));
2189 f->dump_bool("success", result >= 0);
2191 } else if (admin_command == "set_heap_property") {
2195 bool success = false;
2196 if (!cmd_getval(cct, cmdmap, "property", property)) {
2197 error = "unable to get property";
2199 } else if (!cmd_getval(cct, cmdmap, "value", value)) {
2200 error = "unable to get value";
2202 } else if (value < 0) {
2203 error = "negative value not allowed";
2205 } else if (!ceph_heap_set_numeric_property(property.c_str(), (size_t)value)) {
2206 error = "invalid property";
2211 f->open_object_section("result");
2212 f->dump_string("error", error);
2213 f->dump_bool("success", success);
2215 } else if (admin_command == "get_heap_property") {
2219 bool success = false;
2220 if (!cmd_getval(cct, cmdmap, "property", property)) {
2221 error = "unable to get property";
2223 } else if (!ceph_heap_get_numeric_property(property.c_str(), &value)) {
2224 error = "invalid property";
2229 f->open_object_section("result");
2230 f->dump_string("error", error);
2231 f->dump_bool("success", success);
2232 f->dump_int("value", value);
2234 } else if (admin_command == "dump_objectstore_kv_stats") {
2235 store->get_db_statistics(f);
2236 } else if (admin_command == "dump_scrubs") {
2237 service.dumps_scrub(f);
2238 } else if (admin_command == "calc_objectstore_db_histogram") {
2239 store->generate_db_histogram(f);
2240 } else if (admin_command == "flush_store_cache") {
2241 store->flush_cache();
2242 } else if (admin_command == "dump_pgstate_history") {
2243 f->open_object_section("pgstate_history");
2244 RWLock::RLocker l2(pg_map_lock);
2245 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
2249 PG *pg = it->second;
2250 f->dump_stream("pg") << pg->get_pgid();
2252 pg->pgstate_history.dump(f);
2256 } else if (admin_command == "compact") {
// Manual objectstore compaction; report wall-clock duration.
2257 dout(1) << "triggering manual compaction" << dendl;
2258 auto start = ceph::coarse_mono_clock::now();
2260 auto end = ceph::coarse_mono_clock::now();
2261 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
2262 dout(1) << "finished manual compaction in "
2263 << time_span.count()
2264 << " seconds" << dendl;
2265 f->open_object_section("compact_result");
2266 f->dump_float("elapsed_time", time_span.count());
// Any unknown command means the asok registration is out of sync.
2269 assert(0 == "broken asok registration");
// Admin-socket hook for test/debug operations: forwards the command to
// the free function test_ops() declared below.
2276 class TestOpsSocketHook : public AdminSocketHook {
2277 OSDService *service;
2280 TestOpsSocketHook(OSDService *s, ObjectStore *st) : service(s), store(st) {}
2281 bool call(std::string command, cmdmap_t& cmdmap, std::string format,
2282 bufferlist& out) override {
2284 test_ops(service, store, command, cmdmap, ss);
2288 void test_ops(OSDService *service, ObjectStore *store,
2289 const std::string &command, cmdmap_t& cmdmap, ostream &ss);
// Timer callback for the periodic tick that runs under osd_lock.
2293 class OSD::C_Tick : public Context {
2296 explicit C_Tick(OSD *o) : osd(o) {}
2297 void finish(int r) override {
// Timer callback for the periodic tick that must NOT take osd_lock
// (driven by tick_timer_without_osd_lock).
2302 class OSD::C_Tick_WithoutOSDLock : public Context {
2305 explicit C_Tick_WithoutOSDLock(OSD *o) : osd(o) {}
2306 void finish(int r) override {
2307 osd->tick_without_osd_lock();
// Start or stop the FUSE view of the objectstore under <osd_data>/fuse,
// reconciling the current fuse_store state with stop and the
// osd_objectstore_fuse config option.
2311 int OSD::enable_disable_fuse(bool stop)
2315 string mntpath = cct->_conf->osd_data + "/fuse";
2316 if (fuse_store && (stop || !cct->_conf->osd_objectstore_fuse)) {
2317 dout(1) << __func__ << " disabling" << dendl;
// Tear down: remove the mountpoint directory after stopping.
2321 r = ::rmdir(mntpath.c_str());
2324 derr << __func__ << " failed to rmdir " << mntpath << ": "
2325 << cpp_strerror(r) << dendl;
2330 if (!fuse_store && cct->_conf->osd_objectstore_fuse) {
2331 dout(1) << __func__ << " enabling" << dendl;
2332 r = ::mkdir(mntpath.c_str(), 0700);
// A pre-existing mountpoint directory is fine.
2335 if (r < 0 && r != -EEXIST) {
2336 derr << __func__ << " unable to create " << mntpath << ": "
2337 << cpp_strerror(r) << dendl;
2340 fuse_store = new FuseStore(store, mntpath);
2341 r = fuse_store->start();
2343 derr << __func__ << " unable to start fuse: " << cpp_strerror(r) << dendl;
2349 #endif // HAVE_LIBFUSE
2353 int OSD::get_num_op_shards()
2355 if (cct->_conf->osd_op_num_shards)
2356 return cct->_conf->osd_op_num_shards;
2357 if (store_is_rotational)
2358 return cct->_conf->osd_op_num_shards_hdd;
2360 return cct->_conf->osd_op_num_shards_ssd;
2363 int OSD::get_num_op_threads()
2365 if (cct->_conf->osd_op_num_threads_per_shard)
2366 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard;
2367 if (store_is_rotational)
2368 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_hdd;
2370 return get_num_op_shards() * cct->_conf->osd_op_num_threads_per_shard_ssd;
2373 float OSD::get_osd_recovery_sleep()
2375 if (cct->_conf->osd_recovery_sleep)
2376 return cct->_conf->osd_recovery_sleep;
2377 if (!store_is_rotational && !journal_is_rotational)
2378 return cct->_conf->osd_recovery_sleep_ssd;
2379 else if (store_is_rotational && !journal_is_rotational)
2380 return cct->_conf->get_val<double>("osd_recovery_sleep_hybrid");
2382 return cct->_conf->osd_recovery_sleep_hdd;
// NOTE(review): body of OSD::init() -- the signature and a number of
// intermediate lines (opening/closing braces, `if (r < 0)` error guards,
// early returns) were lost in extraction, as the gaps in the fused line
// numbers show.  Code lines are preserved byte-identical; comments only
// added.  Overall flow: mount the object store, validate configuration and
// superblock, load the current osdmap and PGs, wire up messengers and the
// mgr stats callback, authenticate with the monitors, and kick off boot.
2387 CompatSet initial, diff;
2388 Mutex::Locker lock(osd_lock);
// Timers: the without-osd-lock ticker plus the recovery service timers.
2393 tick_timer_without_osd_lock.init();
2394 service.recovery_request_timer.init();
2395 service.recovery_sleep_timer.init();
2398 dout(2) << "init " << dev_path
2399 << " (looks like " << (store_is_rotational ? "hdd" : "ssd") << ")"
2401 dout(2) << "journal " << journal_path << dendl;
2402 assert(store); // call pre_init() first!
// Shard the store's caches to match the op queue sharding, then mount.
2404 store->set_cache_shards(get_num_op_shards());
2406 int r = store->mount();
2408 derr << "OSD:init: unable to mount object store" << dendl;
// Journal medium is only known after mount.
2411 journal_is_rotational = store->is_journal_rotational();
2412 dout(2) << "journal looks like " << (journal_is_rotational ? "hdd" : "ssd")
2415 enable_disable_fuse(false);
2417 dout(2) << "boot" << dendl;
2419 // initialize the daily loadavg with current 15min loadavg
2421 if (getloadavg(loadavgs, 3) == 3) {
2422 daily_loadavg = loadavgs[2];
2424 derr << "OSD::init() : couldn't read loadavgs\n" << dendl;
// Fall back to a neutral load if getloadavg() fails.
2425 daily_loadavg = 1.0;
2428 int rotating_auth_attempts = 0;
2430 // sanity check long object name handling
// Build a maximal hobject (name/key/namespace at configured limits) and ask
// the backend whether it can store it.
2433 l.oid.name = string(cct->_conf->osd_max_object_name_len, 'n');
2434 l.set_key(string(cct->_conf->osd_max_object_name_len, 'k'));
2435 l.nspace = string(cct->_conf->osd_max_object_namespace_len, 's');
2436 r = store->validate_hobject_key(l);
2438 derr << "backend (" << store->get_type() << ") is unable to support max "
2439 << "object name[space] len" << dendl;
2440 derr << " osd max object name len = "
2441 << cct->_conf->osd_max_object_name_len << dendl;
2442 derr << " osd max object namespace len = "
2443 << cct->_conf->osd_max_object_namespace_len << dendl;
2444 derr << cpp_strerror(r) << dendl;
// Failing the check is fatal only when configured to enforce it at startup.
2445 if (cct->_conf->osd_check_max_object_name_len_on_startup) {
2448 derr << "osd_check_max_object_name_len_on_startup = false, starting anyway"
2451 dout(20) << "configured osd_max_object_name[space]_len looks ok" << dendl;
// Superblock: read it and verify on-disk features/identity match the daemon.
2456 r = read_superblock();
2458 derr << "OSD::init() : unable to read osd superblock" << dendl;
2463 if (osd_compat.compare(superblock.compat_features) < 0) {
2464 derr << "The disk uses features unsupported by the executable." << dendl;
2465 derr << " ondisk features " << superblock.compat_features << dendl;
2466 derr << " daemon features " << osd_compat << dendl;
2468 if (osd_compat.writeable(superblock.compat_features)) {
2469 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2470 derr << "it is still writeable, though. Missing features: " << diff << dendl;
2475 CompatSet diff = osd_compat.unsupported(superblock.compat_features);
2476 derr << "Cannot write to disk! Missing features: " << diff << dendl;
// The superblock's recorded OSD id must match the id we were started as.
2482 assert_warn(whoami == superblock.whoami);
2483 if (whoami != superblock.whoami) {
2484 derr << "OSD::init: superblock says osd"
2485 << superblock.whoami << " but I am osd." << whoami << dendl;
// Merge any newly-supported compat features into the superblock and persist.
2490 initial = get_osd_initial_compat_set();
2491 diff = superblock.compat_features.unsupported(initial);
2492 if (superblock.compat_features.merge(initial)) {
2493 // We need to persist the new compat_set before we
2495 dout(5) << "Upgrading superblock adding: " << diff << dendl;
2496 ObjectStore::Transaction t;
2497 write_superblock(t);
2498 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
2503 // make sure snap mapper object exists
2504 if (!store->exists(coll_t::meta(), OSD::make_snapmapper_oid())) {
2505 dout(10) << "init creating/touching snapmapper object" << dendl;
2506 ObjectStore::Transaction t;
2507 t.touch(coll_t::meta(), OSD::make_snapmapper_oid());
2508 r = store->apply_transaction(service.meta_osr.get(), std::move(t));
// Object-class (cls) plugin infrastructure; optionally preload all classes.
2513 class_handler = new ClassHandler(cct);
2514 cls_initialize(class_handler);
2516 if (cct->_conf->osd_open_classes_on_start) {
2517 int r = class_handler->open_all_classes();
2519 dout(1) << "warning: got an error loading one or more classes: " << cpp_strerror(r) << dendl;
2522 // load up "current" osdmap
2523 assert_warn(!osdmap);
2525 derr << "OSD::init: unable to read current osdmap" << dendl;
2529 osdmap = get_map(superblock.current_epoch);
2530 check_osdmap_features(store);
2532 create_recoverystate_perf();
2535 epoch_t bind_epoch = osdmap->get_epoch();
2536 service.set_epochs(NULL, NULL, &bind_epoch);
2539 clear_temp_objects();
2541 // initialize osdmap references in sharded wq
2542 op_shardedwq.prune_pg_waiters(osdmap, whoami);
2544 // load up pgs (as they previously existed)
2547 dout(2) << "superblock: I am osd." << superblock.whoami << dendl;
2548 dout(0) << "using " << op_queue << " op queue with priority op cut off at " <<
2549 op_prio_cutoff << "." << dendl;
// Attach this OSD and the heartbeat/objecter dispatchers to the messengers.
2554 client_messenger->add_dispatcher_head(this);
2555 cluster_messenger->add_dispatcher_head(this);
2557 hb_front_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2558 hb_back_client_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2559 hb_front_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2560 hb_back_server_messenger->add_dispatcher_head(&heartbeat_dispatcher);
2562 objecter_messenger->add_dispatcher_head(service.objecter);
2564 monc->set_want_keys(CEPH_ENTITY_TYPE_MON | CEPH_ENTITY_TYPE_OSD
2565 | CEPH_ENTITY_TYPE_MGR);
2571 * FIXME: this is a placeholder implementation that unconditionally
2572 * sends every is_primary PG's stats every time we're called, unlike
2573 * the existing mon PGStats mechanism that uses pg_stat_queue and acks.
2574 * This has equivalent cost to the existing worst case where all
2575 * PGs are busy and their stats are always enqueued for sending.
// mgr stats callback: builds an MPGStats from every primary PG whose
// published stats are valid, and tracks the min last-epoch-clean.
2577 mgrc.set_pgstats_cb([this](){
2578 RWLock::RLocker l(map_lock);
2580 utime_t had_for = ceph_clock_now() - had_map_since;
2581 osd_stat_t cur_stat = service.get_osd_stat();
2582 cur_stat.os_perf_stat = store->get_cur_stats();
2584 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
2585 m->osd_stat = cur_stat;
2587 Mutex::Locker lec{min_last_epoch_clean_lock};
2588 min_last_epoch_clean = osdmap->get_epoch();
2589 min_last_epoch_clean_pgs.clear();
2590 RWLock::RLocker lpg(pg_map_lock);
2591 for (const auto &i : pg_map) {
2593 if (!pg->is_primary()) {
// pg_stats_publish is guarded by its own lock, not the PG lock.
2597 pg->pg_stats_publish_lock.Lock();
2598 if (pg->pg_stats_publish_valid) {
2599 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
2600 const auto lec = pg->pg_stats_publish.get_effective_last_epoch_clean();
2601 min_last_epoch_clean = min(min_last_epoch_clean, lec);
2602 min_last_epoch_clean_pgs.push_back(pg->info.pgid.pgid);
2604 pg->pg_stats_publish_lock.Unlock();
2611 client_messenger->add_dispatcher_head(&mgrc);
2613 // tell monc about log_client so it will know about mon session resets
2614 monc->set_log_client(&log_client);
2615 update_log_config();
2622 set_disk_tp_priority();
2624 // start the heartbeat
2625 heartbeat_thread.create("osd_srv_heartbt");
// First ticks for both timers (with and without osd_lock).
2628 tick_timer.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick(this));
2630 Mutex::Locker l(tick_timer_lock);
2631 tick_timer_without_osd_lock.add_event_after(cct->_conf->osd_heartbeat_interval, new C_Tick_WithoutOSDLock(this));
2635 service.publish_map(osdmap);
2636 service.publish_superblock(superblock);
2637 service.max_oldest_map = superblock.oldest_map;
// Authenticate with the monitors; rotating service keys are retried with a
// bounded attempt count before giving up.
2641 r = monc->authenticate();
2643 derr << __func__ << " authentication failed: " << cpp_strerror(r)
2645 osd_lock.Lock(); // locker is going to unlock this on function exit
2651 while (monc->wait_auth_rotating(30.0) < 0) {
2652 derr << "unable to obtain rotating service keys; retrying" << dendl;
2653 ++rotating_auth_attempts;
2654 if (rotating_auth_attempts > g_conf->max_rotating_auth_attempts) {
2655 derr << __func__ << " wait_auth_rotating timed out" << dendl;
2656 osd_lock.Lock(); // make locker happy
2657 if (!is_stopping()) {
// Register our device class and crush location with the cluster.
2664 r = update_crush_device_class();
2666 derr << __func__ << " unable to update_crush_device_class: "
2667 << cpp_strerror(r) << dendl;
2672 r = update_crush_location();
2674 derr << __func__ << " unable to update_crush_location: "
2675 << cpp_strerror(r) << dendl;
2684 // start objecter *after* we have authenticated, so that we don't ignore
2685 // the OSDMaps it requests.
2686 service.final_init();
2690 dout(10) << "ensuring pgs have consumed prior maps" << dendl;
2694 dout(0) << "done with init, starting boot process" << dendl;
2696 // subscribe to any pg creations
2697 monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0);
2699 // MgrClient needs this (it doesn't have MonClient reference itself)
2700 monc->sub_want("mgrmap", 0, 0);
2702 // we don't need to ask for an osdmap here; objecter will
2703 //monc->sub_want("osdmap", osdmap->get_epoch(), CEPH_SUBSCRIBE_ONETIME);
2714 enable_disable_fuse(true);
// Registers all of the OSD's admin-socket commands: the OSDSocketHook for
// status/ops/dump commands, and the TestOpsSocketHook for test/fault-
// injection commands.  NOTE(review): extraction dropped intermediate lines
// (the `assert(r == 0)` checks after each registration, the opening brace,
// and some command-name arguments), as the gaps in the fused line numbers
// show; code lines below are preserved byte-identical, comments only added.
2721 void OSD::final_init()
2723 AdminSocket *admin_socket = cct->get_admin_socket();
2724 asok_hook = new OSDSocketHook(this);
// --- general status / ops introspection commands ---
2725 int r = admin_socket->register_command("status", "status", asok_hook,
2726 "high-level status of OSD");
2728 r = admin_socket->register_command("flush_journal", "flush_journal",
2730 "flush the journal to permanent store");
2732 r = admin_socket->register_command("dump_ops_in_flight",
2733 "dump_ops_in_flight " \
2734 "name=filterstr,type=CephString,n=N,req=false",
2736 "show the ops currently in flight");
// "ops" is an alias with the same semantics as dump_ops_in_flight.
2738 r = admin_socket->register_command("ops",
2740 "name=filterstr,type=CephString,n=N,req=false",
2742 "show the ops currently in flight");
2744 r = admin_socket->register_command("dump_blocked_ops",
2745 "dump_blocked_ops " \
2746 "name=filterstr,type=CephString,n=N,req=false",
2748 "show the blocked ops currently in flight");
2750 r = admin_socket->register_command("dump_historic_ops",
2751 "dump_historic_ops " \
2752 "name=filterstr,type=CephString,n=N,req=false",
2756 r = admin_socket->register_command("dump_historic_slow_ops",
2757 "dump_historic_slow_ops " \
2758 "name=filterstr,type=CephString,n=N,req=false",
2760 "show slowest recent ops");
2762 r = admin_socket->register_command("dump_historic_ops_by_duration",
2763 "dump_historic_ops_by_duration " \
2764 "name=filterstr,type=CephString,n=N,req=false",
2766 "show slowest recent ops, sorted by duration");
2768 r = admin_socket->register_command("dump_op_pq_state", "dump_op_pq_state",
2770 "dump op priority queue state");
2772 r = admin_socket->register_command("dump_blacklist", "dump_blacklist",
2774 "dump blacklisted clients and times");
2776 r = admin_socket->register_command("dump_watchers", "dump_watchers",
2778 "show clients which have active watches,"
2779 " and on which objects");
2781 r = admin_socket->register_command("dump_reservations", "dump_reservations",
2783 "show recovery reservations");
2785 r = admin_socket->register_command("get_latest_osdmap", "get_latest_osdmap",
2787 "force osd to update the latest map from "
// --- heap / objectstore diagnostics ---
2791 r = admin_socket->register_command( "heap",
2793 "name=heapcmd,type=CephString",
2795 "show heap usage info (available only if "
2796 "compiled with tcmalloc)");
2799 r = admin_socket->register_command("set_heap_property",
2800 "set_heap_property " \
2801 "name=property,type=CephString " \
2802 "name=value,type=CephInt",
2804 "update malloc extension heap property");
2807 r = admin_socket->register_command("get_heap_property",
2808 "get_heap_property " \
2809 "name=property,type=CephString",
2811 "get malloc extension heap property");
2814 r = admin_socket->register_command("dump_objectstore_kv_stats",
2815 "dump_objectstore_kv_stats",
2817 "print statistics of kvdb which used by bluestore");
2820 r = admin_socket->register_command("dump_scrubs",
2823 "print scheduled scrubs");
2826 r = admin_socket->register_command("calc_objectstore_db_histogram",
2827 "calc_objectstore_db_histogram",
2829 "Generate key value histogram of kvdb(rocksdb) which used by bluestore");
2832 r = admin_socket->register_command("flush_store_cache",
2833 "flush_store_cache",
2835 "Flush bluestore internal cache");
2837 r = admin_socket->register_command("dump_pgstate_history", "dump_pgstate_history",
2839 "show recent state history");
// NOTE(review): "Commpact" is a typo in the user-visible help string; left
// untouched here since fixing it changes runtime output.
2842 r = admin_socket->register_command("compact", "compact",
2844 "Commpact object store's omap."
2845 " WARNING: Compaction probably slows your requests");
// --- test/fault-injection commands, dispatched to TestOpsSocketHook ---
2848 test_ops_hook = new TestOpsSocketHook(&(this->service), this->store);
2849 // Note: pools are CephString instead of CephPoolname because
2850 // these commands traditionally support both pool names and numbers
2851 r = admin_socket->register_command(
2854 "name=pool,type=CephString " \
2855 "name=objname,type=CephObjectname " \
2856 "name=key,type=CephString "\
2857 "name=val,type=CephString",
2861 r = admin_socket->register_command(
2864 "name=pool,type=CephString " \
2865 "name=objname,type=CephObjectname " \
2866 "name=key,type=CephString",
2870 r = admin_socket->register_command(
2873 "name=pool,type=CephString " \
2874 "name=objname,type=CephObjectname " \
2875 "name=header,type=CephString",
2880 r = admin_socket->register_command(
2883 "name=pool,type=CephString " \
2884 "name=objname,type=CephObjectname",
2886 "output entire object map");
2889 r = admin_socket->register_command(
2892 "name=pool,type=CephString " \
2893 "name=objname,type=CephObjectname " \
2894 "name=len,type=CephInt",
2896 "truncate object to length");
2899 r = admin_socket->register_command(
2902 "name=pool,type=CephString " \
2903 "name=objname,type=CephObjectname " \
2904 "name=shardid,type=CephInt,req=false,range=0|255",
2906 "inject data error to an object");
2909 r = admin_socket->register_command(
2912 "name=pool,type=CephString " \
2913 "name=objname,type=CephObjectname " \
2914 "name=shardid,type=CephInt,req=false,range=0|255",
2916 "inject metadata error to an object");
2918 r = admin_socket->register_command(
2919 "set_recovery_delay",
2920 "set_recovery_delay " \
2921 "name=utime,type=CephInt,req=false",
2923 "Delay osd recovery by specified seconds");
2925 r = admin_socket->register_command(
2928 "name=pgid,type=CephString ",
2930 "Trigger a scheduled scrub ");
2932 r = admin_socket->register_command(
2935 "name=type,type=CephString,req=false " \
2936 "name=count,type=CephInt,req=false ",
2938 "Inject a full disk (optional count times)");
// Builds the OSD's "osd" perf-counter set (client op counts/latencies,
// subop/recovery stats, osdmap cache stats, cache-tiering stats, ...) and
// registers it with the CephContext perf-counters collection.
// NOTE(review): extraction dropped some lines (braces, a few
// `osd_plb.add_u64*(` call prefixes, blank lines), as the gaps in the fused
// line numbers show; code lines below are byte-identical, comments only
// added.
2942 void OSD::create_logger()
2944 dout(10) << "create_logger" << dendl;
2946 PerfCountersBuilder osd_plb(cct, "osd", l_osd_first, l_osd_last);
// Shared axis configs for the 2D latency-vs-size operation histograms.
2948 // Latency axis configuration for op histograms, values are in nanoseconds
2949 PerfHistogramCommon::axis_config_d op_hist_x_axis_config{
2951 PerfHistogramCommon::SCALE_LOG2, ///< Latency in logarithmic scale
2953 100000, ///< Quantization unit is 100usec
2954 32, ///< Enough to cover much longer than slow requests
2957 // Op size axis configuration for op histograms, values are in bytes
2958 PerfHistogramCommon::axis_config_d op_hist_y_axis_config{
2959 "Request size (bytes)",
2960 PerfHistogramCommon::SCALE_LOG2, ///< Request size in logarithmic scale
2962 512, ///< Quantization unit is 512 bytes
2963 32, ///< Enough to cover requests larger than GB
2967 // All the basic OSD operation stats are to be considered useful
2968 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
2971 l_osd_op_wip, "op_wip",
2972 "Replication operations currently being processed (primary)");
2973 osd_plb.add_u64_counter(
2975 "Client operations",
2976 "ops", PerfCountersBuilder::PRIO_CRITICAL);
2977 osd_plb.add_u64_counter(
2978 l_osd_op_inb, "op_in_bytes",
2979 "Client operations total write size",
2980 "wr", PerfCountersBuilder::PRIO_INTERESTING);
2981 osd_plb.add_u64_counter(
2982 l_osd_op_outb, "op_out_bytes",
2983 "Client operations total read size",
2984 "rd", PerfCountersBuilder::PRIO_INTERESTING);
2985 osd_plb.add_time_avg(
2986 l_osd_op_lat, "op_latency",
2987 "Latency of client operations (including queue time)",
2989 osd_plb.add_time_avg(
2990 l_osd_op_process_lat, "op_process_latency",
2991 "Latency of client operations (excluding queue time)");
2992 osd_plb.add_time_avg(
2993 l_osd_op_prepare_lat, "op_prepare_latency",
2994 "Latency of client operations (excluding queue time and wait for finished)");
// Per-direction breakdowns: read, write, then read-modify-write ops.
2996 osd_plb.add_u64_counter(
2997 l_osd_op_r, "op_r", "Client read operations");
2998 osd_plb.add_u64_counter(
2999 l_osd_op_r_outb, "op_r_out_bytes", "Client data read");
3000 osd_plb.add_time_avg(
3001 l_osd_op_r_lat, "op_r_latency",
3002 "Latency of read operation (including queue time)");
3003 osd_plb.add_u64_counter_histogram(
3004 l_osd_op_r_lat_outb_hist, "op_r_latency_out_bytes_histogram",
3005 op_hist_x_axis_config, op_hist_y_axis_config,
3006 "Histogram of operation latency (including queue time) + data read");
3007 osd_plb.add_time_avg(
3008 l_osd_op_r_process_lat, "op_r_process_latency",
3009 "Latency of read operation (excluding queue time)");
3010 osd_plb.add_time_avg(
3011 l_osd_op_r_prepare_lat, "op_r_prepare_latency",
3012 "Latency of read operations (excluding queue time and wait for finished)");
3013 osd_plb.add_u64_counter(
3014 l_osd_op_w, "op_w", "Client write operations");
3015 osd_plb.add_u64_counter(
3016 l_osd_op_w_inb, "op_w_in_bytes", "Client data written");
3017 osd_plb.add_time_avg(
3018 l_osd_op_w_lat, "op_w_latency",
3019 "Latency of write operation (including queue time)");
3020 osd_plb.add_u64_counter_histogram(
3021 l_osd_op_w_lat_inb_hist, "op_w_latency_in_bytes_histogram",
3022 op_hist_x_axis_config, op_hist_y_axis_config,
3023 "Histogram of operation latency (including queue time) + data written");
3024 osd_plb.add_time_avg(
3025 l_osd_op_w_process_lat, "op_w_process_latency",
3026 "Latency of write operation (excluding queue time)");
3027 osd_plb.add_time_avg(
3028 l_osd_op_w_prepare_lat, "op_w_prepare_latency",
3029 "Latency of write operations (excluding queue time and wait for finished)");
3030 osd_plb.add_u64_counter(
3031 l_osd_op_rw, "op_rw",
3032 "Client read-modify-write operations");
3033 osd_plb.add_u64_counter(
3034 l_osd_op_rw_inb, "op_rw_in_bytes",
3035 "Client read-modify-write operations write in");
3036 osd_plb.add_u64_counter(
3037 l_osd_op_rw_outb,"op_rw_out_bytes",
3038 "Client read-modify-write operations read out ");
3039 osd_plb.add_time_avg(
3040 l_osd_op_rw_lat, "op_rw_latency",
3041 "Latency of read-modify-write operation (including queue time)");
3042 osd_plb.add_u64_counter_histogram(
3043 l_osd_op_rw_lat_inb_hist, "op_rw_latency_in_bytes_histogram",
3044 op_hist_x_axis_config, op_hist_y_axis_config,
3045 "Histogram of rw operation latency (including queue time) + data written");
3046 osd_plb.add_u64_counter_histogram(
3047 l_osd_op_rw_lat_outb_hist, "op_rw_latency_out_bytes_histogram",
3048 op_hist_x_axis_config, op_hist_y_axis_config,
3049 "Histogram of rw operation latency (including queue time) + data read");
3050 osd_plb.add_time_avg(
3051 l_osd_op_rw_process_lat, "op_rw_process_latency",
3052 "Latency of read-modify-write operation (excluding queue time)");
3053 osd_plb.add_time_avg(
3054 l_osd_op_rw_prepare_lat, "op_rw_prepare_latency",
3055 "Latency of read-modify-write operations (excluding queue time and wait for finished)");
3057 // Now we move on to some more obscure stats, revert to assuming things
3058 // are low priority unless otherwise specified.
3059 osd_plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
3061 osd_plb.add_time_avg(l_osd_op_before_queue_op_lat, "op_before_queue_op_lat",
3062 "Latency of IO before calling queue(before really queue into ShardedOpWq)"); // client io before queue op_wq latency
3063 osd_plb.add_time_avg(l_osd_op_before_dequeue_op_lat, "op_before_dequeue_op_lat",
3064 "Latency of IO before calling dequeue_op(already dequeued and get PG lock)"); // client io before dequeue_op latency
// Replicated suboperation (subop) stats.
3066 osd_plb.add_u64_counter(
3067 l_osd_sop, "subop", "Suboperations");
3068 osd_plb.add_u64_counter(
3069 l_osd_sop_inb, "subop_in_bytes", "Suboperations total size");
3070 osd_plb.add_time_avg(l_osd_sop_lat, "subop_latency", "Suboperations latency");
3072 osd_plb.add_u64_counter(l_osd_sop_w, "subop_w", "Replicated writes");
3073 osd_plb.add_u64_counter(
3074 l_osd_sop_w_inb, "subop_w_in_bytes", "Replicated written data size");
3075 osd_plb.add_time_avg(
3076 l_osd_sop_w_lat, "subop_w_latency", "Replicated writes latency");
3077 osd_plb.add_u64_counter(
3078 l_osd_sop_pull, "subop_pull", "Suboperations pull requests");
3079 osd_plb.add_time_avg(
3080 l_osd_sop_pull_lat, "subop_pull_latency", "Suboperations pull latency");
3081 osd_plb.add_u64_counter(
3082 l_osd_sop_push, "subop_push", "Suboperations push messages");
3083 osd_plb.add_u64_counter(
3084 l_osd_sop_push_inb, "subop_push_in_bytes", "Suboperations pushed size");
3085 osd_plb.add_time_avg(
3086 l_osd_sop_push_lat, "subop_push_latency", "Suboperations push latency");
// Recovery traffic.
3088 osd_plb.add_u64_counter(l_osd_pull, "pull", "Pull requests sent");
3089 osd_plb.add_u64_counter(l_osd_push, "push", "Push messages sent");
3090 osd_plb.add_u64_counter(l_osd_push_outb, "push_out_bytes", "Pushed size");
3092 osd_plb.add_u64_counter(
3093 l_osd_rop, "recovery_ops",
3094 "Started recovery operations",
3095 "rop", PerfCountersBuilder::PRIO_INTERESTING);
// Gauges: load, buffers, crc cache.
3097 osd_plb.add_u64(l_osd_loadavg, "loadavg", "CPU load");
3098 osd_plb.add_u64(l_osd_buf, "buffer_bytes", "Total allocated buffer size");
3099 osd_plb.add_u64(l_osd_history_alloc_bytes, "history_alloc_Mbytes");
3100 osd_plb.add_u64(l_osd_history_alloc_num, "history_alloc_num");
3102 l_osd_cached_crc, "cached_crc", "Total number getting crc from crc_cache");
3104 l_osd_cached_crc_adjusted, "cached_crc_adjusted",
3105 "Total number getting crc from crc_cache with adjusting");
3106 osd_plb.add_u64(l_osd_missed_crc, "missed_crc",
3107 "Total number of crc cache misses");
// PG population gauges.
3109 osd_plb.add_u64(l_osd_pg, "numpg", "Placement groups",
3110 "pgs", PerfCountersBuilder::PRIO_USEFUL);
3112 l_osd_pg_primary, "numpg_primary",
3113 "Placement groups for which this osd is primary");
3115 l_osd_pg_replica, "numpg_replica",
3116 "Placement groups for which this osd is replica");
3118 l_osd_pg_stray, "numpg_stray",
3119 "Placement groups ready to be deleted from this osd");
3121 l_osd_hb_to, "heartbeat_to_peers", "Heartbeat (ping) peers we send to");
// OSDMap message handling and osdmap/bufferlist cache stats.
3122 osd_plb.add_u64_counter(l_osd_map, "map_messages", "OSD map messages");
3123 osd_plb.add_u64_counter(l_osd_mape, "map_message_epochs", "OSD map epochs");
3124 osd_plb.add_u64_counter(
3125 l_osd_mape_dup, "map_message_epoch_dups", "OSD map duplicates");
3126 osd_plb.add_u64_counter(
3127 l_osd_waiting_for_map, "messages_delayed_for_map",
3128 "Operations waiting for OSD map");
3130 osd_plb.add_u64_counter(
3131 l_osd_map_cache_hit, "osd_map_cache_hit", "osdmap cache hit");
3132 osd_plb.add_u64_counter(
3133 l_osd_map_cache_miss, "osd_map_cache_miss", "osdmap cache miss");
3134 osd_plb.add_u64_counter(
3135 l_osd_map_cache_miss_low, "osd_map_cache_miss_low",
3136 "osdmap cache miss below cache lower bound");
3137 osd_plb.add_u64_avg(
3138 l_osd_map_cache_miss_low_avg, "osd_map_cache_miss_low_avg",
3139 "osdmap cache miss, avg distance below cache lower bound");
3140 osd_plb.add_u64_counter(
3141 l_osd_map_bl_cache_hit, "osd_map_bl_cache_hit",
3142 "OSDMap buffer cache hits");
3143 osd_plb.add_u64_counter(
3144 l_osd_map_bl_cache_miss, "osd_map_bl_cache_miss",
3145 "OSDMap buffer cache misses");
// Capacity gauges (total/used/available bytes).
3148 l_osd_stat_bytes, "stat_bytes", "OSD size", "size",
3149 PerfCountersBuilder::PRIO_USEFUL);
3151 l_osd_stat_bytes_used, "stat_bytes_used", "Used space", "used",
3152 PerfCountersBuilder::PRIO_USEFUL);
3153 osd_plb.add_u64(l_osd_stat_bytes_avail, "stat_bytes_avail", "Available space");
3155 osd_plb.add_u64_counter(
3156 l_osd_copyfrom, "copyfrom", "Rados \"copy-from\" operations");
// Cache-tiering (flush/evict/promote/proxy) and tiering-agent stats.
3158 osd_plb.add_u64_counter(l_osd_tier_promote, "tier_promote", "Tier promotions");
3159 osd_plb.add_u64_counter(l_osd_tier_flush, "tier_flush", "Tier flushes");
3160 osd_plb.add_u64_counter(
3161 l_osd_tier_flush_fail, "tier_flush_fail", "Failed tier flushes");
3162 osd_plb.add_u64_counter(
3163 l_osd_tier_try_flush, "tier_try_flush", "Tier flush attempts");
3164 osd_plb.add_u64_counter(
3165 l_osd_tier_try_flush_fail, "tier_try_flush_fail",
3166 "Failed tier flush attempts");
3167 osd_plb.add_u64_counter(
3168 l_osd_tier_evict, "tier_evict", "Tier evictions");
3169 osd_plb.add_u64_counter(
3170 l_osd_tier_whiteout, "tier_whiteout", "Tier whiteouts");
3171 osd_plb.add_u64_counter(
3172 l_osd_tier_dirty, "tier_dirty", "Dirty tier flag set");
3173 osd_plb.add_u64_counter(
3174 l_osd_tier_clean, "tier_clean", "Dirty tier flag cleaned");
3175 osd_plb.add_u64_counter(
3176 l_osd_tier_delay, "tier_delay", "Tier delays (agent waiting)");
3177 osd_plb.add_u64_counter(
3178 l_osd_tier_proxy_read, "tier_proxy_read", "Tier proxy reads");
3179 osd_plb.add_u64_counter(
3180 l_osd_tier_proxy_write, "tier_proxy_write", "Tier proxy writes");
3182 osd_plb.add_u64_counter(
3183 l_osd_agent_wake, "agent_wake", "Tiering agent wake up");
3184 osd_plb.add_u64_counter(
3185 l_osd_agent_skip, "agent_skip", "Objects skipped by agent");
3186 osd_plb.add_u64_counter(
3187 l_osd_agent_flush, "agent_flush", "Tiering agent flushes");
3188 osd_plb.add_u64_counter(
3189 l_osd_agent_evict, "agent_evict", "Tiering agent evictions");
// Object-context cache and tiering latency stats.
3191 osd_plb.add_u64_counter(
3192 l_osd_object_ctx_cache_hit, "object_ctx_cache_hit", "Object context cache hits");
3193 osd_plb.add_u64_counter(
3194 l_osd_object_ctx_cache_total, "object_ctx_cache_total", "Object context cache lookups");
3196 osd_plb.add_u64_counter(l_osd_op_cache_hit, "op_cache_hit");
3197 osd_plb.add_time_avg(
3198 l_osd_tier_flush_lat, "osd_tier_flush_lat", "Object flush latency");
3199 osd_plb.add_time_avg(
3200 l_osd_tier_promote_lat, "osd_tier_promote_lat", "Object promote latency");
3201 osd_plb.add_time_avg(
3202 l_osd_tier_r_lat, "osd_tier_r_lat", "Object proxy read latency");
// PG info persistence stats.
3204 osd_plb.add_u64_counter(
3205 l_osd_pg_info, "osd_pg_info", "PG updated its info (using any method)");
3206 osd_plb.add_u64_counter(
3207 l_osd_pg_fastinfo, "osd_pg_fastinfo",
3208 "PG updated its info using fastinfo attr");
3209 osd_plb.add_u64_counter(
3210 l_osd_pg_biginfo, "osd_pg_biginfo", "PG updated its biginfo attr");
// Materialize the counters and register them globally.
3212 logger = osd_plb.create_perf_counters();
3213 cct->get_perfcounters_collection()->add(logger);
3216 void OSD::create_recoverystate_perf()
3218 dout(10) << "create_recoverystate_perf" << dendl;
3220 PerfCountersBuilder rs_perf(cct, "recoverystate_perf", rs_first, rs_last);
3222 rs_perf.add_time_avg(rs_initial_latency, "initial_latency", "Initial recovery state latency");
3223 rs_perf.add_time_avg(rs_started_latency, "started_latency", "Started recovery state latency");
3224 rs_perf.add_time_avg(rs_reset_latency, "reset_latency", "Reset recovery state latency");
3225 rs_perf.add_time_avg(rs_start_latency, "start_latency", "Start recovery state latency");
3226 rs_perf.add_time_avg(rs_primary_latency, "primary_latency", "Primary recovery state latency");
3227 rs_perf.add_time_avg(rs_peering_latency, "peering_latency", "Peering recovery state latency");
3228 rs_perf.add_time_avg(rs_backfilling_latency, "backfilling_latency", "Backfilling recovery state latency");
3229 rs_perf.add_time_avg(rs_waitremotebackfillreserved_latency, "waitremotebackfillreserved_latency", "Wait remote backfill reserved recovery state latency");
3230 rs_perf.add_time_avg(rs_waitlocalbackfillreserved_latency, "waitlocalbackfillreserved_latency", "Wait local backfill reserved recovery state latency");
3231 rs_perf.add_time_avg(rs_notbackfilling_latency, "notbackfilling_latency", "Notbackfilling recovery state latency");
3232 rs_perf.add_time_avg(rs_repnotrecovering_latency, "repnotrecovering_latency", "Repnotrecovering recovery state latency");
3233 rs_perf.add_time_avg(rs_repwaitrecoveryreserved_latency, "repwaitrecoveryreserved_latency", "Rep wait recovery reserved recovery state latency");
3234 rs_perf.add_time_avg(rs_repwaitbackfillreserved_latency, "repwaitbackfillreserved_latency", "Rep wait backfill reserved recovery state latency");
3235 rs_perf.add_time_avg(rs_reprecovering_latency, "reprecovering_latency", "RepRecovering recovery state latency");
3236 rs_perf.add_time_avg(rs_activating_latency, "activating_latency", "Activating recovery state latency");
3237 rs_perf.add_time_avg(rs_waitlocalrecoveryreserved_latency, "waitlocalrecoveryreserved_latency", "Wait local recovery reserved recovery state latency");
3238 rs_perf.add_time_avg(rs_waitremoterecoveryreserved_latency, "waitremoterecoveryreserved_latency", "Wait remote recovery reserved recovery state latency");
3239 rs_perf.add_time_avg(rs_recovering_latency, "recovering_latency", "Recovering recovery state latency");
3240 rs_perf.add_time_avg(rs_recovered_latency, "recovered_latency", "Recovered recovery state latency");
3241 rs_perf.add_time_avg(rs_clean_latency, "clean_latency", "Clean recovery state latency");
3242 rs_perf.add_time_avg(rs_active_latency, "active_latency", "Active recovery state latency");
3243 rs_perf.add_time_avg(rs_replicaactive_latency, "replicaactive_latency", "Replicaactive recovery state latency");
3244 rs_perf.add_time_avg(rs_stray_latency, "stray_latency", "Stray recovery state latency");
3245 rs_perf.add_time_avg(rs_getinfo_latency, "getinfo_latency", "Getinfo recovery state latency");
3246 rs_perf.add_time_avg(rs_getlog_latency, "getlog_latency", "Getlog recovery state latency");
3247 rs_perf.add_time_avg(rs_waitactingchange_latency, "waitactingchange_latency", "Waitactingchange recovery state latency");
3248 rs_perf.add_time_avg(rs_incomplete_latency, "incomplete_latency", "Incomplete recovery state latency");
3249 rs_perf.add_time_avg(rs_down_latency, "down_latency", "Down recovery state latency");
3250 rs_perf.add_time_avg(rs_getmissing_latency, "getmissing_latency", "Getmissing recovery state latency");
3251 rs_perf.add_time_avg(rs_waitupthru_latency, "waitupthru_latency", "Waitupthru recovery state latency");
3252 rs_perf.add_time_avg(rs_notrecovering_latency, "notrecovering_latency", "Notrecovering recovery state latency");
3254 recoverystate_perf = rs_perf.create_perf_counters();
3255 cct->get_perfcounters_collection()->add(recoverystate_perf);
// NOTE(review): body of OSD::shutdown() -- the signature, several
// intermediate lines (braces, conditions, thread-pool stop calls) and the
// tail of the function were lost in extraction; the function also continues
// past the end of this chunk.  Code lines are preserved byte-identical,
// comments only added.  Flow: flip to STATE_STOPPING, drain the op queue,
// run on_shutdown() on every PG, unregister admin-socket commands, stop
// heartbeat/threadpools/timers, persist a clean-unmount superblock, and
// release remaining PG references.
3260 if (!service.prepare_to_stop())
3261 return 0; // already shutting down
3263 if (is_stopping()) {
3267 derr << "shutdown" << dendl;
3269 set_state(STATE_STOPPING);
// Optionally crank debug levels so the shutdown sequence is fully logged.
3272 if (cct->_conf->get_val<bool>("osd_debug_shutdown")) {
3273 cct->_conf->set_val("debug_osd", "100");
3274 cct->_conf->set_val("debug_journal", "100");
3275 cct->_conf->set_val("debug_filestore", "100");
3276 cct->_conf->set_val("debug_bluestore", "100");
3277 cct->_conf->set_val("debug_ms", "100");
3278 cct->_conf->apply_changes(NULL);
3281 // stop MgrClient earlier as it's more like an internal consumer of OSD
3284 service.start_shutdown();
3286 // stop sending work to pgs. this just prevents any new work in _process
3287 // from racing with on_shutdown and potentially entering the pg after.
3288 op_shardedwq.drain();
// First pass over pg_map: notify every PG it is shutting down and flush
// its op sequencer.
3292 RWLock::RLocker l(pg_map_lock);
3293 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3296 dout(20) << " kicking pg " << p->first << dendl;
3298 p->second->on_shutdown();
3299 p->second->unlock();
3300 p->second->osr->flush();
3303 clear_pg_stat_queue();
3305 // drain op queue again (in case PGs requeued something)
3306 op_shardedwq.drain();
3308 finished.clear(); // zap waiters (bleh, this is messy)
3311 op_shardedwq.clear_pg_slots();
3313 // unregister commands
// Mirrors the registrations performed in final_init().
3314 cct->get_admin_socket()->unregister_command("status");
3315 cct->get_admin_socket()->unregister_command("flush_journal");
3316 cct->get_admin_socket()->unregister_command("dump_ops_in_flight");
3317 cct->get_admin_socket()->unregister_command("ops");
3318 cct->get_admin_socket()->unregister_command("dump_blocked_ops");
3319 cct->get_admin_socket()->unregister_command("dump_historic_ops");
3320 cct->get_admin_socket()->unregister_command("dump_historic_ops_by_duration");
3321 cct->get_admin_socket()->unregister_command("dump_historic_slow_ops");
3322 cct->get_admin_socket()->unregister_command("dump_op_pq_state");
3323 cct->get_admin_socket()->unregister_command("dump_blacklist");
3324 cct->get_admin_socket()->unregister_command("dump_watchers");
3325 cct->get_admin_socket()->unregister_command("dump_reservations");
3326 cct->get_admin_socket()->unregister_command("get_latest_osdmap");
3327 cct->get_admin_socket()->unregister_command("heap");
3328 cct->get_admin_socket()->unregister_command("set_heap_property");
3329 cct->get_admin_socket()->unregister_command("get_heap_property");
3330 cct->get_admin_socket()->unregister_command("dump_objectstore_kv_stats");
3331 cct->get_admin_socket()->unregister_command("dump_scrubs");
3332 cct->get_admin_socket()->unregister_command("calc_objectstore_db_histogram");
3333 cct->get_admin_socket()->unregister_command("flush_store_cache");
3334 cct->get_admin_socket()->unregister_command("dump_pgstate_history");
3335 cct->get_admin_socket()->unregister_command("compact");
3339 cct->get_admin_socket()->unregister_command("setomapval");
3340 cct->get_admin_socket()->unregister_command("rmomapkey");
3341 cct->get_admin_socket()->unregister_command("setomapheader");
3342 cct->get_admin_socket()->unregister_command("getomap");
3343 cct->get_admin_socket()->unregister_command("truncobj");
3344 cct->get_admin_socket()->unregister_command("injectdataerr");
3345 cct->get_admin_socket()->unregister_command("injectmdataerr");
3346 cct->get_admin_socket()->unregister_command("set_recovery_delay");
3347 cct->get_admin_socket()->unregister_command("trigger_scrub");
3348 cct->get_admin_socket()->unregister_command("injectfull");
3349 delete test_ops_hook;
3350 test_ops_hook = NULL;
// Stop the heartbeat thread: signal under the lock, then join.
3354 heartbeat_lock.Lock();
3355 heartbeat_stop = true;
3356 heartbeat_cond.Signal();
3357 heartbeat_lock.Unlock();
3358 heartbeat_thread.join();
3363 dout(10) << "osd tp stopped" << dendl;
3367 dout(10) << "op sharded tp stopped" << dendl;
3371 dout(10) << "command tp stopped" << dendl;
3375 dout(10) << "disk tp paused (new)" << dendl;
3377 dout(10) << "stopping agent" << dendl;
3378 service.agent_stop();
3382 reset_heartbeat_peers();
// Shut both tick timers down; the no-osd-lock timer needs its own lock.
3384 tick_timer.shutdown();
3387 Mutex::Locker l(tick_timer_lock);
3388 tick_timer_without_osd_lock.shutdown();
3391 // note unmount epoch
// Record a clean unmount in the superblock so the next start knows the
// OSD was stopped cleanly at this epoch.
3392 dout(10) << "noting clean unmount in epoch " << osdmap->get_epoch() << dendl;
3393 superblock.mounted = service.get_boot_epoch();
3394 superblock.clean_thru = osdmap->get_epoch();
3395 ObjectStore::Transaction t;
3396 write_superblock(t);
3397 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
3399 derr << "OSD::shutdown: error writing superblock: "
3400 << cpp_strerror(r) << dendl;
3405 Mutex::Locker l(pg_stat_queue_lock);
3406 assert(pg_stat_queue.empty());
3409 service.shutdown_reserver();
// Second pass over pg_map: drop the map's reference on each PG, warning
// (and optionally asserting) if anything else still holds one.
3412 #ifdef PG_DEBUG_REFS
3413 service.dump_live_pgids();
3416 RWLock::RLocker l(pg_map_lock);
3417 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
3420 dout(20) << " kicking pg " << p->first << dendl;
3422 if (p->second->ref != 1) {
3423 derr << "pgid " << p->first << " has ref count of "
3424 << p->second->ref << dendl;
3425 #ifdef PG_DEBUG_REFS
3426 p->second->dump_live_ids();
3428 if (cct->_conf->osd_shutdown_pgref_assert) {
3432 p->second->unlock();
3433 p->second->put("PGMap");
3437 #ifdef PG_DEBUG_REFS
3438 service.dump_live_pgids();
3440 cct->_conf->remove_observer(this);
3442 dout(10) << "syncing store" << dendl;
3443 enable_disable_fuse(true);
// (Function continues past the end of this chunk.)
3445 if (cct->_conf->osd_journal_flush_on_shutdown) {
3446 dout(10) << "flushing journal" << dendl;
3447 store->flush_journal();
3453 dout(10) << "Store synced" << dendl;
3458 osdmap = OSDMapRef();
3460 op_tracker.on_shutdown();
3462 class_handler->shutdown();
3463 client_messenger->shutdown();
3464 cluster_messenger->shutdown();
3465 hb_front_client_messenger->shutdown();
3466 hb_back_client_messenger->shutdown();
3467 objecter_messenger->shutdown();
3468 hb_front_server_messenger->shutdown();
3469 hb_back_server_messenger->shutdown();
// Send a mon command; if it fails with -ENOENT (this OSD id does not exist
// in the cluster map yet), issue an "osd create" for our id/fsid and retry
// once. Returns the mon command result (tail of function hidden by line gaps).
// NOTE(review): excerpt is sampled; code lines preserved verbatim.
3476 int OSD::mon_cmd_maybe_osd_create(string &cmd)
3478 bool created = false;
3480 dout(10) << __func__ << " cmd: " << cmd << dendl;
3481 vector<string> vcmd{cmd};
3485 monc->start_mon_command(vcmd, inbl, NULL, &outs, &w);
// -ENOENT and not yet created: register the osd id with the monitor first
3488 if (r == -ENOENT && !created) {
3489 string newcmd = "{\"prefix\": \"osd create\", \"id\": " + stringify(whoami)
3490 + ", \"uuid\": \"" + stringify(superblock.osd_fsid) + "\"}";
3491 vector<string> vnewcmd{newcmd};
3495 monc->start_mon_command(vnewcmd, inbl, NULL, &outs, &w);
3498 derr << __func__ << " fail: osd does not exist and created failed: "
3499 << cpp_strerror(r) << dendl;
3505 derr << __func__ << " fail: '" << outs << "': " << cpp_strerror(r) << dendl;
// Place this OSD at its configured CRUSH location with an appropriate weight
// via "osd crush create-or-move". Weight comes from osd_crush_initial_weight
// if set (>= 0), otherwise derived from the store's total size in TiB.
// No-op when osd_crush_update_on_start is false.
3514 int OSD::update_crush_location()
3516 if (!cct->_conf->osd_crush_update_on_start) {
3517 dout(10) << __func__ << " osd_crush_update_on_start = false" << dendl;
3522 if (cct->_conf->osd_crush_initial_weight >= 0) {
3523 snprintf(weight, sizeof(weight), "%.4lf", cct->_conf->osd_crush_initial_weight);
3525 struct store_statfs_t st;
3526 int r = store->statfs(&st);
3528 derr << "statfs: " << cpp_strerror(r) << dendl;
// weight = total bytes / 2^40, i.e. capacity expressed in TiB
3531 snprintf(weight, sizeof(weight), "%.4lf",
3533 (double)(st.total) /
3534 (double)(1ull << 40 /* TB */)));
3537 std::multimap<string,string> loc = cct->crush_location.get_location();
3538 dout(10) << __func__ << " crush location is " << loc << dendl;
// build the JSON mon command by string concatenation
3541 string("{\"prefix\": \"osd crush create-or-move\", ") +
3542 string("\"id\": ") + stringify(whoami) + string(", ") +
3543 string("\"weight\":") + weight + string(", ") +
3544 string("\"args\": [");
3545 for (multimap<string,string>::iterator p = loc.begin(); p != loc.end(); ++p) {
3546 if (p != loc.begin())
3548 cmd += "\"" + p->first + "=" + p->second + "\"";
// delegate to the helper that auto-creates the osd id on -ENOENT
3552 return mon_cmd_maybe_osd_create(cmd);
// Report this OSD's device class (hdd/ssd/...) to the monitor with
// "osd crush set-device-class". Class is read from the "crush_device_class"
// meta key, falling back to the store's default class. No-op when
// osd_class_update_on_start is false or no class is known.
3555 int OSD::update_crush_device_class()
3557 if (!cct->_conf->osd_class_update_on_start) {
3558 dout(10) << __func__ << " osd_class_update_on_start = false" << dendl;
3562 string device_class;
3563 int r = store->read_meta("crush_device_class", &device_class);
3564 if (r < 0 || device_class.empty()) {
3565 device_class = store->get_default_device_class();
3568 if (device_class.empty()) {
3569 dout(20) << __func__ << " no device class stored locally" << dendl;
3574 string("{\"prefix\": \"osd crush set-device-class\", ") +
3575 string("\"class\": \"") + device_class + string("\", ") +
3576 string("\"ids\": [\"") + stringify(whoami) + string("\"]}");
3578 r = mon_cmd_maybe_osd_create(cmd);
// result deliberately not checked — see rationale below
3579 // the above cmd can fail for various reasons, e.g.:
3580 // (1) we are connecting to a pre-luminous monitor
3581 // (2) user manually specify a class other than
3582 // 'ceph-disk prepare --crush-device-class'
3583 // simply skip result-checking for now
// Queue a write of the OSD superblock into the given transaction (caller
// applies it). Ensures the baseline incompat feature bit is always present.
3587 void OSD::write_superblock(ObjectStore::Transaction& t)
3589 dout(10) << "write_superblock " << superblock << dendl;
3591 //hack: at minimum it's using the baseline feature set
3592 if (!superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_BASE))
3593 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_BASE);
// encode and write at offset 0 of the superblock object in the meta collection
3596 ::encode(superblock, bl);
3597 t.write(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, bl.length(), bl);
// Read and decode the OSD superblock from the meta collection into
// this->superblock (error-return path hidden by line gaps).
3600 int OSD::read_superblock()
// length 0 means "read the whole object"
3603 int r = store->read(coll_t::meta(), OSD_SUPERBLOCK_GOBJECT, 0, 0, bl);
3607 bufferlist::iterator p = bl.begin();
3608 ::decode(superblock, p);
3610 dout(10) << "read_superblock " << superblock << dendl;
// Scan every PG collection for leftover temp objects (and legacy pool==-1
// objects from Hammer) and remove them in batches bounded by
// osd_target_transaction_size.
3615 void OSD::clear_temp_objects()
3617 dout(10) << __func__ << dendl;
3619 store->list_collections(ls);
3620 for (vector<coll_t>::iterator p = ls.begin(); p != ls.end(); ++p) {
3622 if (!p->is_pg(&pgid))
3625 // list temp objects
3626 dout(20) << " clearing temps in " << *p << " pgid " << pgid << dendl;
3628 vector<ghobject_t> temps;
// page through the collection listing (loop header hidden by line gaps)
3631 vector<ghobject_t> objects;
3632 store->collection_list(*p, next, ghobject_t::get_max(),
3633 store->get_ideal_list_max(),
3635 if (objects.empty())
3637 vector<ghobject_t>::iterator q;
3638 for (q = objects.begin(); q != objects.end(); ++q) {
3639 // Hammer set pool for temps to -1, so check for clean-up
3640 if (q->hobj.is_temp() || (q->hobj.pool == -1)) {
3641 temps.push_back(*q);
3646 // If we saw a non-temp object and hit the break above we can
3647 // break out of the while loop too.
3648 if (q != objects.end())
3651 if (!temps.empty()) {
3652 ObjectStore::Transaction t;
3654 for (vector<ghobject_t>::iterator q = temps.begin(); q != temps.end(); ++q) {
3655 dout(20) << " removing " << *p << " object " << *q << dendl;
// flush the transaction periodically so it does not grow unbounded
3657 if (++removed > cct->_conf->osd_target_transaction_size) {
3658 store->apply_transaction(service.meta_osr.get(), std::move(t));
3659 t = ObjectStore::Transaction();
// apply whatever remains in the final (partial) batch
3664 store->apply_transaction(service.meta_osr.get(), std::move(t));
// Remove an entire PG collection: delete each object (also clearing its
// snap-mapper entries), batching by osd_target_transaction_size, then remove
// the collection itself and wait for the sequencer to flush/commit.
3670 void OSD::recursive_remove_collection(CephContext* cct,
3671 ObjectStore *store, spg_t pgid,
3677 make_snapmapper_oid());
// dedicated sequencer so this removal is ordered independently
3679 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
3680 ObjectStore::Sequencer>("rm"));
3681 ObjectStore::Transaction t;
3682 SnapMapper mapper(cct, &driver, 0, 0, 0, pgid.shard);
3684 vector<ghobject_t> objects;
3685 store->collection_list(tmp, ghobject_t(), ghobject_t::get_max(),
3686 INT_MAX, &objects, 0);
3687 generic_dout(10) << __func__ << " " << objects << dendl;
3690 for (vector<ghobject_t>::iterator p = objects.begin();
// remove the object's snap mapping; -ENOENT is fine (no mapping existed)
3693 OSDriver::OSTransaction _t(driver.get_transaction(&t));
3694 int r = mapper.remove_oid(p->hobj, &_t);
3695 if (r != 0 && r != -ENOENT)
3698 if (removed > cct->_conf->osd_target_transaction_size) {
3699 int r = store->apply_transaction(osr.get(), std::move(t));
3701 t = ObjectStore::Transaction();
// final transaction drops the (now empty) collection
3705 t.remove_collection(tmp);
3706 int r = store->apply_transaction(osr.get(), std::move(t));
// block until the sequencer has committed everything
3710 if (!osr->flush_commit(&waiter)) {
3716 // ======================================================
// Build a PGPool wrapper for pool `id` as defined in `createmap`
// (error path when the pool is absent is partly hidden by line gaps).
3719 PGPool OSD::_get_pool(int id, OSDMapRef createmap)
3721 if (!createmap->have_pg_pool(id)) {
3722 dout(5) << __func__ << ": the OSDmap does not contain a PG pool with id = "
3727 PGPool p = PGPool(cct, createmap, id);
3729 dout(10) << "_get_pool " << p.id << dendl;
// Construct a PG object for `pgid` against `createmap`, register it in
// pg_map (taking the "PGMap" ref) and return it locked. Caller must unlock.
// Requires osd_lock held.
3733 PG *OSD::_open_lock_pg(
3734 OSDMapRef createmap,
3735 spg_t pgid, bool no_lockdep_check)
3737 assert(osd_lock.is_locked());
3739 PG* pg = _make_pg(createmap, pgid);
3741 RWLock::WLocker l(pg_map_lock);
3742 pg->lock(no_lockdep_check);
3744 pg->get("PGMap"); // because it's in pg_map
3745 service.pg_add_epoch(pg->info.pgid, createmap->get_epoch());
// NOTE(review): lines below belong to the companion _make_pg overload whose
// signature line is hidden by the excerpt's line gaps; instantiates a
// PrimaryLogPG for replicated and erasure pools.
3751 OSDMapRef createmap,
3754 dout(10) << "_open_lock_pg " << pgid << dendl;
3755 PGPool pool = _get_pool(pgid.pool(), createmap);
3759 if (createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_REPLICATED ||
3760 createmap->get_pg_type(pgid.pgid) == pg_pool_t::TYPE_ERASURE)
3761 pg = new PrimaryLogPG(&service, createmap, pool, pgid);
// Register a PG that was just created by splitting: add it to pg_map, replay
// its load-time handling, queue a null peering event, and wake any peering
// events that were parked in peering_wait_for_split waiting for this child.
3769 void OSD::add_newly_split_pg(PG *pg, PG::RecoveryCtx *rctx)
3771 epoch_t e(service.get_osdmap()->get_epoch());
3772 pg->get("PGMap"); // For pg_map
3773 pg_map[pg->info.pgid] = pg;
3774 service.pg_add_epoch(pg->info.pgid, pg->get_osdmap()->get_epoch());
3776 dout(10) << "Adding newly split pg " << *pg << dendl;
3777 pg->handle_loaded(rctx);
3778 pg->write_if_dirty(*(rctx->transaction));
// null event at the current epoch kicks the PG's state machine
3779 pg->queue_null(e, e);
3780 map<spg_t, list<PG::CephPeeringEvtRef> >::iterator to_wake =
3781 peering_wait_for_split.find(pg->info.pgid);
3782 if (to_wake != peering_wait_for_split.end()) {
3783 for (list<PG::CephPeeringEvtRef>::iterator i =
3784 to_wake->second.begin();
3785 i != to_wake->second.end();
3787 pg->queue_peering_event(*i);
3789 peering_wait_for_split.erase(to_wake);
// pool vanished from the map while splitting (consequence hidden by gaps)
3791 if (!service.get_osdmap()->have_pg_pool(pg->info.pgid.pool()))
// If `pgid` (or an ancestor of it) is currently being deleted, try to halt
// that deletion so the PG can be resurrected instead of recreated from
// scratch. Outputs the resurrected pgid and the old PG state; returns a
// res_result (RES_NONE / self / parent — some branches hidden by line gaps).
3795 OSD::res_result OSD::_try_resurrect_pg(
3796 OSDMapRef curmap, spg_t pgid, spg_t *resurrected, PGRef *old_pg_state)
3798 assert(resurrected);
3799 assert(old_pg_state);
3800 // find nearest ancestor
3801 DeletingStateRef df;
// walk up the split ancestry looking for a pg in deleting_pgs
3804 df = service.deleting_pgs.lookup(cur);
3809 cur = cur.get_parent();
3812 return RES_NONE; // good to go
// snapshot the map the deleting PG was last at (needed for split checks)
3814 df->old_pg_state->lock();
3815 OSDMapRef create_map = df->old_pg_state->get_osdmap();
3816 df->old_pg_state->unlock();
3818 set<spg_t> children;
// exact match: try to stop deletion of pgid itself
3820 if (df->try_stop_deletion()) {
3821 dout(10) << __func__ << ": halted deletion on pg " << pgid << dendl;
3823 *old_pg_state = df->old_pg_state;
3824 service.deleting_pgs.remove(pgid); // PG is no longer being removed!
3827 // raced, ensure we don't see DeletingStateRef when we try to
3829 service.deleting_pgs.remove(pgid);
// ancestor case: pgid is a split child of the deleting ancestor `cur`
3832 } else if (cur.is_split(create_map->get_pg_num(cur.pool()),
3833 curmap->get_pg_num(cur.pool()),
3835 children.count(pgid)) {
3836 if (df->try_stop_deletion()) {
3837 dout(10) << __func__ << ": halted deletion on ancestor pg " << pgid
3840 *old_pg_state = df->old_pg_state;
3841 service.deleting_pgs.remove(cur); // PG is no longer being removed!
3844 /* this is not a problem, failing to cancel proves that all objects
3845 * have been removed, so no hobject_t overlap is possible
// Create a brand-new PG (bypassing lockdep on the fresh lock), initialize it
// from the supplied up/acting sets, history and past intervals (init call
// hidden by line gaps), and return it locked. Requires osd_lock held.
3853 PG *OSD::_create_lock_pg(
3854 OSDMapRef createmap,
3859 vector<int>& up, int up_primary,
3860 vector<int>& acting, int acting_primary,
3861 pg_history_t history,
3862 const PastIntervals& pi,
3863 ObjectStore::Transaction& t)
3865 assert(osd_lock.is_locked());
3866 dout(20) << "_create_lock_pg pgid " << pgid << dendl;
// no_lockdep_check=true: the PG lock is brand new, lockdep would misfire
3868 PG *pg = _open_lock_pg(createmap, pgid, true);
3870 service.init_splits_between(pgid, pg->get_osdmap(), service.get_osdmap());
3883 dout(7) << "_create_lock_pg " << *pg << dendl;
// Look up `pgid` in pg_map under the read lock; return the PG (locking step
// hidden by line gaps) or null when absent. Caller must unlock the PG.
3887 PG *OSD::_lookup_lock_pg(spg_t pgid)
3889 RWLock::RLocker l(pg_map_lock);
3891 auto pg_map_entry = pg_map.find(pgid);
3892 if (pg_map_entry == pg_map.end())
3894 PG *pg = pg_map_entry->second;
// Public wrapper around _lookup_lock_pg().
3899 PG *OSD::lookup_lock_pg(spg_t pgid)
3901 return _lookup_lock_pg(pgid);
// Like _lookup_lock_pg() but the caller already holds pg_map_lock; the pg
// must exist (asserted).
3904 PG *OSD::_lookup_lock_pg_with_map_lock_held(spg_t pgid)
3906 assert(pg_map.count(pgid));
3907 PG *pg = pg_map[pgid];
// Boot-time PG loading: enumerate store collections, drop temp/flagged-for-
// removal collections, open each surviving PG against its recorded map
// epoch, read its on-disk state, wire up its current mapping/role, and
// finally rebuild past_intervals in parallel. Requires osd_lock held.
// NOTE(review): excerpt is sampled; code lines preserved verbatim.
3912 void OSD::load_pgs()
3914 assert(osd_lock.is_locked());
3915 dout(0) << "load_pgs" << dendl;
3917 RWLock::RLocker l(pg_map_lock);
3918 assert(pg_map.empty());
3922 int r = store->list_collections(ls);
3924 derr << "failed to list pgs: " << cpp_strerror(-r) << dendl;
3927 bool has_upgraded = false;
3929 for (vector<coll_t>::iterator it = ls.begin();
// temp collections and PGs already flagged for removal are wiped here
3933 if (it->is_temp(&pgid) ||
3934 (it->is_pg(&pgid) && PG::_has_removal_flag(store, pgid))) {
3935 dout(10) << "load_pgs " << *it << " clearing temp" << dendl;
3936 recursive_remove_collection(cct, store, pgid, *it);
3940 if (!it->is_pg(&pgid)) {
3941 dout(10) << "load_pgs ignoring unrecognized " << *it << dendl;
3945 if (pgid.preferred() >= 0) {
3946 dout(10) << __func__ << ": skipping localized PG " << pgid << dendl;
3947 // FIXME: delete it too, eventually
3951 dout(10) << "pgid " << pgid << " coll " << coll_t(pgid) << dendl;
3953 epoch_t map_epoch = 0;
3954 int r = PG::peek_map_epoch(store, pgid, &map_epoch, &bl);
3956 derr << __func__ << " unable to peek at " << pgid << " metadata, skipping"
// open against the map the PG was last persisted at, if we still have it
3962 if (map_epoch > 0) {
3963 OSDMapRef pgosdmap = service.try_get_map(map_epoch);
3965 if (!osdmap->have_pg_pool(pgid.pool())) {
3966 derr << __func__ << ": could not find map for epoch " << map_epoch
3967 << " on pg " << pgid << ", but the pool is not present in the "
3968 << "current map, so this is probably a result of bug 10617. "
3969 << "Skipping the pg for now, you can use ceph-objectstore-tool "
3970 << "to clean it up later." << dendl;
3973 derr << __func__ << ": have pgid " << pgid << " at epoch "
3974 << map_epoch << ", but missing map. Crashing."
3976 assert(0 == "Missing map in load_pgs");
3979 pg = _open_lock_pg(pgosdmap, pgid);
3981 pg = _open_lock_pg(osdmap, pgid);
3983 // there can be no waiters here, so we don't call wake_pg_waiters
3985 pg->ch = store->open_collection(pg->coll);
3987 // read pg state, log
3988 pg->read_state(store, bl);
// refuse to run PGs whose on-disk format is too old to upgrade in place
3990 if (pg->must_upgrade()) {
3991 if (!pg->can_upgrade()) {
3992 derr << "PG needs upgrade, but on-disk data is too old; upgrade to"
3993 << " an older version first." << dendl;
3994 assert(0 == "PG too old to upgrade");
3996 if (!has_upgraded) {
3997 derr << "PGs are upgrading" << dendl;
3998 has_upgraded = true;
4000 dout(10) << "PG " << pg->info.pgid
4001 << " must upgrade..." << dendl;
4005 service.init_splits_between(pg->info.pgid, pg->get_osdmap(), osdmap);
4007 // generate state for PG's current mapping
4008 int primary, up_primary;
4009 vector<int> acting, up;
4010 pg->get_osdmap()->pg_to_up_acting_osds(
4011 pgid.pgid, &up, &up_primary, &acting, &primary);
4012 pg->init_primary_up_acting(
4017 int role = OSDMap::calc_pg_role(whoami, pg->acting);
4018 if (pg->pool.info.is_replicated() || role == pg->pg_whoami.shard)
4023 pg->reg_next_scrub();
4025 PG::RecoveryCtx rctx(0, 0, 0, 0, 0, 0);
4026 pg->handle_loaded(&rctx);
4028 dout(10) << "load_pgs loaded " << *pg << " " << pg->pg_log.get_log() << dendl;
// persist any log/metadata fixed up during load
4029 if (pg->pg_log.is_dirty()) {
4030 ObjectStore::Transaction t;
4031 pg->write_if_dirty(t);
4032 store->apply_transaction(pg->osr.get(), std::move(t));
4037 RWLock::RLocker l(pg_map_lock);
4038 dout(0) << "load_pgs opened " << pg_map.size() << " pgs" << dendl;
4041 // clean up old infos object?
4042 if (has_upgraded && store->exists(coll_t::meta(), OSD::make_infos_oid())) {
4043 dout(1) << __func__ << " removing legacy infos object" << dendl;
4044 ObjectStore::Transaction t;
4045 t.remove(coll_t::meta(), OSD::make_infos_oid());
4046 int r = store->apply_transaction(service.meta_osr.get(), std::move(t));
4048 derr << __func__ << ": apply_transaction returned "
4049 << cpp_strerror(r) << dendl;
4054 build_past_intervals_parallel();
4059 * build past_intervals efficiently on old, degraded, and buried
4060 * clusters. this is important for efficiently catching up osds that
4061 * are way behind on maps to the current cluster state.
4063 * this is a parallel version of PG::generate_past_intervals().
4064 * follow the same logic, but do all pgs at the same time so that we
4065 * can make a single pass across the osdmap history.
// Rebuild past_intervals for every loaded PG that needs it, in a single pass
// over the osdmap history (instead of one history walk per PG). For each
// epoch in the union of required ranges, recompute each PG's up/acting and
// record a new interval when it changed; finally repair same_interval_since
// and persist everything in size-bounded transactions.
// NOTE(review): excerpt is sampled; code lines preserved verbatim.
4067 void OSD::build_past_intervals_parallel()
// per-PG progress state for the parallel walk (struct body partly hidden)
4071 vector<int> old_acting, old_up;
4072 epoch_t same_interval_since;
4076 map<PG*,pistate> pis;
4078 // calculate junction of map range
4079 epoch_t end_epoch = superblock.oldest_map;
4080 epoch_t cur_epoch = superblock.newest_map;
4082 RWLock::RLocker l(pg_map_lock);
4083 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4088 // Ignore PGs only partially created (DNE)
4089 if (pg->info.dne()) {
4093 auto rpib = pg->get_required_past_interval_bounds(
4095 superblock.oldest_map);
// empty required range and no stored intervals: nothing to rebuild,
// just backfill same_interval_since if it was cleared (e.g. by import)
4096 if (rpib.first >= rpib.second && pg->past_intervals.empty()) {
4097 if (pg->info.history.same_interval_since == 0) {
4098 pg->info.history.same_interval_since = rpib.second;
// stored intervals already cover the required range: skip this PG
4102 auto apib = pg->past_intervals.get_bounds();
4103 if (apib.second >= rpib.second &&
4104 apib.first <= rpib.first) {
4105 if (pg->info.history.same_interval_since == 0) {
4106 pg->info.history.same_interval_since = rpib.second;
4112 dout(10) << pg->info.pgid << " needs " << rpib.first << "-"
4113 << rpib.second << dendl;
4114 pistate& p = pis[pg];
4115 p.start = rpib.first;
4116 p.end = rpib.second;
4117 p.same_interval_since = 0;
// widen the global epoch range to cover this PG's needs
4119 if (rpib.first < cur_epoch)
4120 cur_epoch = rpib.first;
4121 if (rpib.second > end_epoch)
4122 end_epoch = rpib.second;
4126 dout(10) << __func__ << " nothing to build" << dendl;
4130 dout(1) << __func__ << " over " << cur_epoch << "-" << end_epoch << dendl;
4131 assert(cur_epoch <= end_epoch);
// --- single pass across the osdmap history ---
4133 OSDMapRef cur_map, last_map;
4134 for ( ; cur_epoch <= end_epoch; cur_epoch++) {
4135 dout(10) << __func__ << " epoch " << cur_epoch << dendl;
4137 cur_map = get_map(cur_epoch);
4139 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4141 pistate& p = i->second;
4143 if (cur_epoch < p.start || cur_epoch > p.end)
4146 vector<int> acting, up;
// before a split, the mapping must be evaluated on the ancestor pgid
4149 pg_t pgid = pg->info.pgid.pgid;
4150 if (p.same_interval_since && last_map->get_pools().count(pgid.pool()))
4151 pgid = pgid.get_ancestor(last_map->get_pg_num(pgid.pool()));
4152 cur_map->pg_to_up_acting_osds(
4153 pgid, &up, &up_primary, &acting, &primary);
// first epoch we process for this pg: just record the baseline
4155 if (p.same_interval_since == 0) {
4156 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4157 << " first map, acting " << acting
4158 << " up " << up << ", same_interval_since = " << cur_epoch << dendl;
4159 p.same_interval_since = cur_epoch;
4161 p.old_acting = acting;
4162 p.primary = primary;
4163 p.up_primary = up_primary;
// subsequent epochs: let PastIntervals decide if an interval ended
4168 boost::scoped_ptr<IsPGRecoverablePredicate> recoverable(
4169 pg->get_is_recoverable_predicate());
4170 std::stringstream debug;
4171 bool new_interval = PastIntervals::check_new_interval(
4174 p.old_acting, acting,
4178 p.same_interval_since,
4179 pg->info.history.last_epoch_clean,
4183 &pg->past_intervals,
4186 dout(10) << __func__ << " epoch " << cur_epoch << " pg " << pg->info.pgid
4187 << " " << debug.str() << dendl;
4189 p.old_acting = acting;
4190 p.primary = primary;
4191 p.up_primary = up_primary;
4192 p.same_interval_since = cur_epoch;
4197 // Now that past_intervals have been recomputed let's fix the same_interval_since
4198 // if it was cleared by import.
4199 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4201 pistate& p = i->second;
4203 if (pg->info.history.same_interval_since == 0) {
4204 assert(p.same_interval_since);
4205 dout(10) << __func__ << " fix same_interval_since " << p.same_interval_since << " pg " << *pg << dendl;
4206 dout(10) << __func__ << " past_intervals " << pg->past_intervals << dendl;
4208 pg->info.history.same_interval_since = p.same_interval_since;
4212 // write info only at the end. this is necessary because we check
4213 // whether the past_intervals go far enough back or forward in time,
4214 // but we don't check for holes. we could avoid it by discarding
4215 // the previous past_intervals and rebuilding from scratch, or we
4216 // can just do this and commit all our work at the end.
4217 ObjectStore::Transaction t;
4219 for (map<PG*,pistate>::iterator i = pis.begin(); i != pis.end(); ++i) {
4222 pg->dirty_big_info = true;
4223 pg->dirty_info = true;
4224 pg->write_if_dirty(t);
4227 // don't let the transaction get too big
4228 if (++num >= cct->_conf->osd_target_transaction_size) {
4229 store->apply_transaction(service.meta_osr.get(), std::move(t));
4230 t = ObjectStore::Transaction();
// apply the final partial batch
4235 store->apply_transaction(service.meta_osr.get(), std::move(t));
4239 * look up a pg. if we have it, great. if not, consider creating it IF the pg mapping
4240 * hasn't changed since the given epoch and we are the primary.
// Route a peering event to its PG, creating (or resurrecting) the PG first
// if we don't have it: park the event if the pgid is mid-split; otherwise
// look the PG up; if absent, validate the projected history, respect the
// max-pg limit, then either create a fresh PG, resurrect a deleting one, or
// resurrect a deleting ancestor and wait for the split.
// NOTE(review): excerpt is sampled; code lines preserved verbatim.
4242 int OSD::handle_pg_peering_evt(
4244 const pg_history_t& orig_history,
4245 const PastIntervals& pi,
4247 PG::CephPeeringEvtRef evt)
// pgid is being split: queue the event until the child PG exists
4249 if (service.splitting(pgid)) {
4250 peering_wait_for_split[pgid].push_back(evt);
4254 PG *pg = _lookup_lock_pg(pgid);
// --- PG not in pg_map: decide whether/how to create it ---
4257 if (!osdmap->have_pg_pool(pgid.pool()))
4259 int up_primary, acting_primary;
4260 vector<int> up, acting;
4261 osdmap->pg_to_up_acting_osds(
4262 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4264 pg_history_t history = orig_history;
4265 bool valid_history = project_pg_history(
4266 pgid, history, epoch, up, up_primary, acting, acting_primary);
// the event predates the current interval: it is stale, drop it
4268 if (!valid_history || epoch < history.same_interval_since) {
4269 dout(10) << __func__ << pgid << " acting changed in "
4270 << history.same_interval_since << " (msg from " << epoch << ")"
4275 if (service.splitting(pgid)) {
// mon-originated creates arrive as NullEvt
4279 const bool is_mon_create =
4280 evt->get_event().dynamic_type() == PG::NullEvt::static_type();
4281 if (maybe_wait_for_max_pg(pgid, is_mon_create)) {
4284 // do we need to resurrect a deleting pg?
4287 res_result result = _try_resurrect_pg(
4288 service.get_osdmap(),
4293 PG::RecoveryCtx rctx = create_context();
// --- case: brand-new PG ---
4296 const pg_pool_t* pp = osdmap->get_pg_pool(pgid.pool());
4297 if (pp->has_flag(pg_pool_t::FLAG_EC_OVERWRITES) &&
4298 store->get_type() != "bluestore") {
4299 clog->warn() << "pg " << pgid
4300 << " is at risk of silent data corruption: "
4301 << "the pool allows ec overwrites but is not stored in "
4302 << "bluestore, so deep scrubbing will not detect bitrot";
4304 PG::_create(*rctx.transaction, pgid, pgid.get_split_bits(pp->get_pg_num()));
4305 PG::_init(*rctx.transaction, pgid, pp);
4307 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
4308 if (!pp->is_replicated() && role != pgid.shard)
4311 pg = _create_lock_pg(
4316 acting, acting_primary,
4319 pg->handle_create(&rctx);
4320 pg->write_if_dirty(*rctx.transaction);
4321 dispatch_context(rctx, pg, osdmap);
4323 dout(10) << *pg << " is new" << dendl;
4325 pg->queue_peering_event(evt);
4326 wake_pg_waiters(pg);
// --- case: resurrect the same PG whose deletion we just halted ---
4331 old_pg_state->lock();
4332 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4333 int old_role = old_pg_state->role;
4334 vector<int> old_up = old_pg_state->up;
4335 int old_up_primary = old_pg_state->up_primary.osd;
4336 vector<int> old_acting = old_pg_state->acting;
4337 int old_primary = old_pg_state->primary.osd;
4338 pg_history_t old_history = old_pg_state->info.history;
4339 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4340 old_pg_state->unlock();
4341 pg = _create_lock_pg(
4354 pg->handle_create(&rctx);
4355 pg->write_if_dirty(*rctx.transaction);
4356 dispatch_context(rctx, pg, osdmap);
4358 dout(10) << *pg << " is new (resurrected)" << dendl;
4360 pg->queue_peering_event(evt);
4361 wake_pg_waiters(pg);
// --- case: resurrect a deleting ancestor; the wanted pgid will appear
// when the ancestor splits, so park the event until then ---
4366 assert(old_pg_state);
4367 old_pg_state->lock();
4368 OSDMapRef old_osd_map = old_pg_state->get_osdmap();
4369 int old_role = old_pg_state->role;
4370 vector<int> old_up = old_pg_state->up;
4371 int old_up_primary = old_pg_state->up_primary.osd;
4372 vector<int> old_acting = old_pg_state->acting;
4373 int old_primary = old_pg_state->primary.osd;
4374 pg_history_t old_history = old_pg_state->info.history;
4375 PastIntervals old_past_intervals = old_pg_state->past_intervals;
4376 old_pg_state->unlock();
4377 PG *parent = _create_lock_pg(
4391 parent->handle_create(&rctx);
4392 parent->write_if_dirty(*rctx.transaction);
4393 dispatch_context(rctx, parent, osdmap);
4395 dout(10) << *parent << " is new" << dendl;
4397 assert(service.splitting(pgid));
4398 peering_wait_for_split[pgid].push_back(evt);
4400 //parent->queue_peering_event(evt);
4401 parent->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
4402 wake_pg_waiters(parent);
// --- PG already exists: drop stale events, otherwise queue ---
4411 // already had it. did the mapping change?
4412 if (epoch < pg->info.history.same_interval_since) {
4413 dout(10) << *pg << __func__ << " acting changed in "
4414 << pg->info.history.same_interval_since
4415 << " (msg from " << epoch << ")" << dendl;
4417 pg->queue_peering_event(evt);
// Enforce the per-OSD PG cap (mon_max_pg_per_osd * hard ratio). Returns true
// when creation of `pgid` must be withheld; the pending create is recorded
// (mon creates as a counter, osd creates by pgid) so resume_creating_pg()
// can retry once capacity frees up.
4424 bool OSD::maybe_wait_for_max_pg(spg_t pgid, bool is_mon_create)
4426 const auto max_pgs_per_osd =
4427 (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
4428 cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4430 RWLock::RLocker pg_map_locker{pg_map_lock};
4431 if (pg_map.size() < max_pgs_per_osd) {
4434 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
4435 if (is_mon_create) {
4436 pending_creates_from_mon++;
4438 pending_creates_from_osd.emplace(pgid.pgid);
4440 dout(5) << __func__ << " withhold creation of pg " << pgid
4441 << ": " << pg_map.size() << " >= "<< max_pgs_per_osd << dendl;
4445 // to re-trigger a peering, we have to twiddle the pg mapping a little bit,
4446 // see PG::should_restart_peering(). OSDMap::pg_to_up_acting_osds() will turn
4447 // to up set if pg_temp is empty. so an empty pg_temp won't work.
// Produce a pg_temp value different from `acting` (append -1) so the mapping
// change forces a new peering interval; single-element handling is in the
// lines hidden by the excerpt's gaps.
4448 static vector<int32_t> twiddle(const vector<int>& acting) {
4449 if (acting.size() > 1) {
4452 vector<int32_t> twiddled(acting.begin(), acting.end());
4453 twiddled.push_back(-1);
// Counterpart of maybe_wait_for_max_pg(): when PG count drops below the cap,
// resume withheld creates — re-subscribe to mon pg-create messages for
// mon-originated ones, and send a forced MOSDPGTemp (with twiddle()d
// mappings) to re-trigger peering for osd-originated ones.
4458 void OSD::resume_creating_pg()
4460 bool do_sub_pg_creates = false;
4461 MOSDPGTemp *pgtemp = nullptr;
4463 const auto max_pgs_per_osd =
4464 (cct->_conf->get_val<uint64_t>("mon_max_pg_per_osd") *
4465 cct->_conf->get_val<double>("osd_max_pg_per_osd_hard_ratio"));
4466 RWLock::RLocker l(pg_map_lock);
4467 if (max_pgs_per_osd <= pg_map.size()) {
4468 // this could happen if admin decreases this setting before a PG is removed
4471 unsigned spare_pgs = max_pgs_per_osd - pg_map.size();
4472 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
// mon-side creates consume spare slots first
4473 if (pending_creates_from_mon > 0) {
4474 do_sub_pg_creates = true;
4475 if (pending_creates_from_mon >= spare_pgs) {
4476 spare_pgs = pending_creates_from_mon = 0;
4478 spare_pgs -= pending_creates_from_mon;
4479 pending_creates_from_mon = 0;
// osd-side creates: one pg_temp entry per resumed pgid, up to spare_pgs
4482 auto pg = pending_creates_from_osd.cbegin();
4483 while (spare_pgs > 0 && pg != pending_creates_from_osd.cend()) {
4485 pgtemp = new MOSDPGTemp{osdmap->get_epoch()};
4488 osdmap->pg_to_up_acting_osds(*pg, nullptr, nullptr, &acting, nullptr);
4489 pgtemp->pg_temp[*pg] = twiddle(acting);
4490 pg = pending_creates_from_osd.erase(pg);
4494 if (do_sub_pg_creates) {
4495 if (monc->sub_want("osd_pg_creates", last_pg_create_epoch, 0)) {
4496 dout(4) << __func__ << ": resolicit pg creates from mon since "
4497 << last_pg_create_epoch << dendl;
// forced so the mon accepts a pg_temp that may equal a previous one
4502 pgtemp->forced = true;
4503 monc->send_mon_message(pgtemp);
// Construct the initial pg_history_t (and past intervals) for a PG created
// at epoch `created`: seed every "same_*_since" field and the scrub stamps
// at creation, then walk each map epoch from created+1 to the current one,
// advancing the history fields whenever up/acting/primary changed or the pg
// split. NOTE(review): excerpt is sampled; code lines preserved verbatim.
4507 void OSD::build_initial_pg_history(
4510 utime_t created_stamp,
4514 dout(10) << __func__ << " " << pgid << " created " << created << dendl;
4515 h->epoch_created = created;
4516 h->epoch_pool_created = created;
4517 h->same_interval_since = created;
4518 h->same_up_since = created;
4519 h->same_primary_since = created;
4520 h->last_scrub_stamp = created_stamp;
4521 h->last_deep_scrub_stamp = created_stamp;
4522 h->last_clean_scrub_stamp = created_stamp;
// mapping at the creation epoch is the baseline for the walk
4524 OSDMapRef lastmap = service.get_map(created);
4525 int up_primary, acting_primary;
4526 vector<int> up, acting;
4527 lastmap->pg_to_up_acting_osds(
4528 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
4530 ostringstream debug;
4531 for (epoch_t e = created + 1; e <= osdmap->get_epoch(); ++e) {
// shadows the member osdmap with the map for epoch e (intentional)
4532 OSDMapRef osdmap = service.get_map(e);
4533 int new_up_primary, new_acting_primary;
4534 vector<int> new_up, new_acting;
4535 osdmap->pg_to_up_acting_osds(
4536 pgid.pgid, &new_up, &new_up_primary, &new_acting, &new_acting_primary);
4538 // this is a bit imprecise, but sufficient?
4539 struct min_size_predicate_t : public IsPGRecoverablePredicate {
4540 const pg_pool_t *pi;
4541 bool operator()(const set<pg_shard_t> &have) const {
4542 return have.size() >= pi->min_size;
4544 min_size_predicate_t(const pg_pool_t *i) : pi(i) {}
4545 } min_size_predicate(osdmap->get_pg_pool(pgid.pgid.pool()));
4547 bool new_interval = PastIntervals::check_new_interval(
4554 h->same_interval_since,
4555 h->last_epoch_clean,
4559 &min_size_predicate,
4563 h->same_interval_since = e;
4565 h->same_up_since = e;
4567 if (acting_primary != new_acting_primary) {
4568 h->same_primary_since = e;
4570 if (pgid.pgid.is_split(lastmap->get_pg_num(pgid.pgid.pool()),
4571 osdmap->get_pg_num(pgid.pgid.pool()),
4573 h->last_epoch_split = e;
// carry the new mapping forward as the baseline for the next epoch
4576 acting = new_acting;
4577 up_primary = new_up_primary;
4578 acting_primary = new_acting_primary;
4582 dout(20) << __func__ << " " << debug.str() << dendl;
4583 dout(10) << __func__ << " " << *h << " " << *pi
4584 << " [" << (pi->empty() ? pair<epoch_t,epoch_t>(0,0) :
4585 pi->get_bounds()) << ")"
4590 * Fill in the passed history so you know same_interval_since, same_up_since,
4591 * and same_primary_since.
// Project the history fields (same_interval_since / same_up_since /
// same_primary_since) for `pgid` by walking osdmaps BACKWARD from the
// current epoch toward `from`, comparing each older epoch's mapping to the
// supplied current up/acting sets. Returns false if a needed old map is no
// longer available (map gap). NOTE(review): excerpt is sampled; code lines
// preserved verbatim.
4593 bool OSD::project_pg_history(spg_t pgid, pg_history_t& h, epoch_t from,
4594 const vector<int>& currentup,
4595 int currentupprimary,
4596 const vector<int>& currentacting,
4597 int currentactingprimary)
4599 dout(15) << "project_pg_history " << pgid
4600 << " from " << from << " to " << osdmap->get_epoch()
// walk epochs newest -> oldest (loop condition/decrement hidden by gaps)
4605 for (e = osdmap->get_epoch();
4608 // verify during intermediate epoch (e-1)
4609 OSDMapRef oldmap = service.try_get_map(e-1);
4611 dout(15) << __func__ << ": found map gap, returning false" << dendl;
4614 assert(oldmap->have_pg_pool(pgid.pool()));
4616 int upprimary, actingprimary;
4617 vector<int> up, acting;
4618 oldmap->pg_to_up_acting_osds(
4625 // acting set change?
4626 if ((actingprimary != currentactingprimary ||
4627 upprimary != currentupprimary ||
4628 acting != currentacting ||
4629 up != currentup) && e > h.same_interval_since) {
4630 dout(15) << "project_pg_history " << pgid << " acting|up changed in " << e
4631 << " from " << acting << "/" << up
4632 << " " << actingprimary << "/" << upprimary
4633 << " -> " << currentacting << "/" << currentup
4634 << " " << currentactingprimary << "/" << currentupprimary
4636 h.same_interval_since = e;
// a split also ends the interval
4639 if (pgid.is_split(oldmap->get_pg_num(pgid.pool()),
4640 osdmap->get_pg_num(pgid.pool()),
4641 0) && e > h.same_interval_since) {
4642 h.same_interval_since = e;
4645 if ((up != currentup || upprimary != currentupprimary)
4646 && e > h.same_up_since) {
4647 dout(15) << "project_pg_history " << pgid << " up changed in " << e
4648 << " from " << up << " " << upprimary
4649 << " -> " << currentup << " " << currentupprimary << dendl;
4650 h.same_up_since = e;
4654 if (OSDMap::primary_changed(
4657 currentactingprimary,
4659 e > h.same_primary_since) {
4660 dout(15) << "project_pg_history " << pgid << " primary changed in " << e << dendl;
4661 h.same_primary_since = e;
// all three floors found: nothing older can change them, stop early
4664 if (h.same_interval_since >= e && h.same_up_since >= e && h.same_primary_since >= e)
4668 // base case: these floors should be the pg creation epoch if we didn't
4669 // find any changes.
4670 if (e == h.epoch_created) {
4671 if (!h.same_interval_since)
4672 h.same_interval_since = e;
4673 if (!h.same_up_since)
4674 h.same_up_since = e;
4675 if (!h.same_primary_since)
4676 h.same_primary_since = e;
4679 dout(15) << "project_pg_history end " << h << dendl;
// Register osd.p as a heartbeat peer if it is not already tracked.
// Opens back (and, when available, front) heartbeat connections via
// OSDService::get_con_osd_hb(), attaches a HeartbeatSession as the
// connection priv, and records the osdmap epoch of registration.
// NOTE(review): several lines are elided in this extraction (e.g. the
// branch that decides whether cons.second exists) — confirm against
// the full source before relying on the exact control flow.
4685 void OSD::_add_heartbeat_peer(int p)
4691 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(p);
4692 if (i == heartbeat_peers.end()) {
// Not yet a peer: obtain back/front heartbeat connections for p at the
// current osdmap epoch.
4693 pair<ConnectionRef,ConnectionRef> cons = service.get_con_osd_hb(p, osdmap->get_epoch());
4696 hi = &heartbeat_peers[p];
// One session object is shared (via get()) by both connections' priv.
4698 HeartbeatSession *s = new HeartbeatSession(p);
4699 hi->con_back = cons.first.get();
4700 hi->con_back->set_priv(s->get());
// Branch with a front connection: set its priv and log both addresses.
4702 hi->con_front = cons.second.get();
4703 hi->con_front->set_priv(s->get());
4704 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4705 << " " << hi->con_back->get_peer_addr()
4706 << " " << hi->con_front->get_peer_addr()
// Branch without a front connection: clear con_front and log back only.
4709 hi->con_front.reset(NULL);
4710 dout(10) << "_add_heartbeat_peer: new peer osd." << p
4711 << " " << hi->con_back->get_peer_addr()
// Remember the epoch at which this peer was (re)registered; used later
// by maybe_update_heartbeat_peers() to classify stale peers as extras.
4718 hi->epoch = osdmap->get_epoch();
// Drop osd.n from the heartbeat peer set: mark both of its heartbeat
// connections down (front may be absent) and erase the map entry.
// The peer must currently be tracked (asserted).
4721 void OSD::_remove_heartbeat_peer(int n)
4723 map<int,HeartbeatInfo>::iterator q = heartbeat_peers.find(n);
4724 assert(q != heartbeat_peers.end());
// Log entity_addr_t() (a default/blank address) when there is no front con.
4725 dout(20) << " removing heartbeat peer osd." << n
4726 << " " << q->second.con_back->get_peer_addr()
4727 << " " << (q->second.con_front ? q->second.con_front->get_peer_addr() : entity_addr_t())
4729 q->second.con_back->mark_down();
4730 if (q->second.con_front) {
4731 q->second.con_front->mark_down();
4733 heartbeat_peers.erase(q);
// Flag the heartbeat peer set as stale; maybe_update_heartbeat_peers()
// rebuilds it the next time it runs.
4736 void OSD::need_heartbeat_peer_update()
4740 dout(20) << "need_heartbeat_peer_update" << dendl;
4741 heartbeat_set_peers_need_update();
// Rebuild the heartbeat peer set when it has been flagged stale.
// Peers come from three sources: (1) each PG's heartbeat_peers and
// probe_targets, (2) the next/previous up OSDs in the osdmap (so the
// cluster heartbeat graph stays connected), and (3) random up OSDs to
// reach osd_heartbeat_min_peers. Down peers are removed and surplus
// "extras" (peers registered under an older epoch) are trimmed.
// Caller must hold osd_lock (asserted).
4744 void OSD::maybe_update_heartbeat_peers()
4746 assert(osd_lock.is_locked());
// While waiting-for-healthy, periodically force a full resample of
// peers if osd_heartbeat_grace has elapsed since the last resample.
4748 if (is_waiting_for_healthy()) {
4749 utime_t now = ceph_clock_now();
4750 if (last_heartbeat_resample == utime_t()) {
4751 last_heartbeat_resample = now;
4752 heartbeat_set_peers_need_update();
4753 } else if (!heartbeat_peers_need_update()) {
4754 utime_t dur = now - last_heartbeat_resample;
4755 if (dur > cct->_conf->osd_heartbeat_grace) {
4756 dout(10) << "maybe_update_heartbeat_peers forcing update after " << dur << " seconds" << dendl;
4757 heartbeat_set_peers_need_update();
4758 last_heartbeat_resample = now;
4759 reset_heartbeat_peers(); // we want *new* peers!
// Nothing to do unless the stale flag is set.
4764 if (!heartbeat_peers_need_update())
4766 heartbeat_clear_peers_need_update();
4768 Mutex::Locker l(heartbeat_lock);
4770 dout(10) << "maybe_update_heartbeat_peers updating" << dendl;
// (1) add each PG's heartbeat peers and probe targets (up OSDs only).
4773 // build heartbeat from set
4775 RWLock::RLocker l(pg_map_lock);
4776 for (ceph::unordered_map<spg_t, PG*>::iterator i = pg_map.begin();
4780 pg->heartbeat_peer_lock.Lock();
4781 dout(20) << i->first << " heartbeat_peers " << pg->heartbeat_peers << dendl;
4782 for (set<int>::iterator p = pg->heartbeat_peers.begin();
4783 p != pg->heartbeat_peers.end();
4785 if (osdmap->is_up(*p))
4786 _add_heartbeat_peer(*p);
4787 for (set<int>::iterator p = pg->probe_targets.begin();
4788 p != pg->probe_targets.end();
4790 if (osdmap->is_up(*p))
4791 _add_heartbeat_peer(*p);
4792 pg->heartbeat_peer_lock.Unlock();
// (2) neighbors in osd-id order, for a connected heartbeat graph.
4796 // include next and previous up osds to ensure we have a fully-connected set
4797 set<int> want, extras;
4798 int next = osdmap->get_next_up_osd_after(whoami);
4801 int prev = osdmap->get_previous_up_osd_before(whoami);
4802 if (prev >= 0 && prev != next)
4805 for (set<int>::iterator p = want.begin(); p != want.end(); ++p) {
4806 dout(10) << " adding neighbor peer osd." << *p << dendl;
4808 _add_heartbeat_peer(*p);
// Drop peers that are now down; peers registered at an older epoch are
// candidates for trimming below.
4811 // remove down peers; enumerate extras
4812 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
4813 while (p != heartbeat_peers.end()) {
4814 if (!osdmap->is_up(p->first)) {
4817 _remove_heartbeat_peer(o);
4820 if (p->second.epoch < osdmap->get_epoch()) {
4821 extras.insert(p->first);
// (3) walk the up-osd ring from our successor, adding peers until we
// reach osd_heartbeat_min_peers or come back around to the start.
4827 int start = osdmap->get_next_up_osd_after(whoami);
4828 for (int n = start; n >= 0; ) {
4829 if ((int)heartbeat_peers.size() >= cct->_conf->osd_heartbeat_min_peers)
4831 if (!extras.count(n) && !want.count(n) && n != whoami) {
4832 dout(10) << " adding random peer osd." << n << dendl;
4834 _add_heartbeat_peer(n);
4836 n = osdmap->get_next_up_osd_after(n);
4838 break; // came full circle; stop
// Trim stale extras while we remain above the minimum peer count.
4842 for (set<int>::iterator p = extras.begin();
4843 (int)heartbeat_peers.size() > cct->_conf->osd_heartbeat_min_peers && p != extras.end();
4847 _remove_heartbeat_peer(*p);
4850 dout(10) << "maybe_update_heartbeat_peers " << heartbeat_peers.size() << " peers, extras " << extras << dendl;
// Tear down every heartbeat peer: mark all back/front connections down,
// empty the peer map, and discard any queued failure reports (they refer
// to peers we are no longer monitoring). Caller must hold osd_lock.
4853 void OSD::reset_heartbeat_peers()
4855 assert(osd_lock.is_locked());
4856 dout(10) << "reset_heartbeat_peers" << dendl;
4857 Mutex::Locker l(heartbeat_lock);
4858 while (!heartbeat_peers.empty()) {
4859 HeartbeatInfo& hi = heartbeat_peers.begin()->second;
4860 hi.con_back->mark_down();
// Front connection is optional; the elided line above this one
// presumably checks hi.con_front before this mark_down.
4862 hi.con_front->mark_down();
4864 heartbeat_peers.erase(heartbeat_peers.begin());
4866 failure_queue.clear();
// Process an incoming heartbeat message (MOSDPing) from a peer OSD.
// PING: optionally debug-drop, verify our internal heartbeat health,
//   then answer with PING_REPLY (or YOU_DIED if the sender is marked
//   down/nonexistent in our map).
// PING_REPLY: record rx timestamps on the matching back/front
//   connection and cancel queued/in-flight failure reports for a peer
//   that is healthy again.
// YOU_DIED: a peer says our map shows us down; subscribe to newer maps.
// Messages with a mismatched cluster fsid are logged and ignored.
4869 void OSD::handle_osd_ping(MOSDPing *m)
4871 if (superblock.cluster_fsid != m->fsid) {
4872 dout(20) << "handle_osd_ping from " << m->get_source_inst()
4873 << " bad fsid " << m->fsid << " != " << superblock.cluster_fsid << dendl;
4878 int from = m->get_source().num();
4880 heartbeat_lock.Lock();
4881 if (is_stopping()) {
4882 heartbeat_lock.Unlock();
4887 OSDMapRef curmap = service.get_osdmap();
4889 heartbeat_lock.Unlock();
4896 case MOSDPing::PING:
// Testing hook: osd_debug_drop_ping_probability lets us drop a run of
// pings from a given peer to exercise failure detection.
4898 if (cct->_conf->osd_debug_drop_ping_probability > 0) {
4899 auto heartbeat_drop = debug_heartbeat_drops_remaining.find(from);
4900 if (heartbeat_drop != debug_heartbeat_drops_remaining.end()) {
4901 if (heartbeat_drop->second == 0) {
4902 debug_heartbeat_drops_remaining.erase(heartbeat_drop);
4904 --heartbeat_drop->second;
4905 dout(5) << "Dropping heartbeat from " << from
4906 << ", " << heartbeat_drop->second
4907 << " remaining to drop" << dendl;
4910 } else if (cct->_conf->osd_debug_drop_ping_probability >
4911 ((((double)(rand()%100))/100.0))) {
4913 debug_heartbeat_drops_remaining.insert(std::make_pair(from,
4914 cct->_conf->osd_debug_drop_ping_duration)).first;
4915 dout(5) << "Dropping heartbeat from " << from
4916 << ", " << heartbeat_drop->second
4917 << " remaining to drop" << dendl;
// If our own worker threads are unhealthy, don't answer: letting the
// peer time us out is the correct signal.
4922 if (!cct->get_heartbeat_map()->is_healthy()) {
4923 dout(10) << "internal heartbeat not healthy, dropping ping request" << dendl;
4927 Message *r = new MOSDPing(monc->get_fsid(),
4928 curmap->get_epoch(),
4929 MOSDPing::PING_REPLY, m->stamp,
4930 cct->_conf->osd_heartbeat_min_size);
4931 m->get_connection()->send_message(r);
// Share newer osdmaps with an up peer that is behind.
4933 if (curmap->is_up(from)) {
4934 service.note_peer_epoch(from, m->map_epoch);
4936 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
4938 service.share_map_peer(from, con.get());
4941 } else if (!curmap->exists(from) ||
4942 curmap->get_down_at(from) > m->map_epoch) {
4943 // tell them they have died
4944 Message *r = new MOSDPing(monc->get_fsid(),
4945 curmap->get_epoch(),
4948 cct->_conf->osd_heartbeat_min_size);
4949 m->get_connection()->send_message(r);
4954 case MOSDPing::PING_REPLY:
4956 map<int,HeartbeatInfo>::iterator i = heartbeat_peers.find(from);
4957 if (i != heartbeat_peers.end()) {
// Stamp whichever connection (back or front) the reply arrived on.
4958 if (m->get_connection() == i->second.con_back) {
4959 dout(25) << "handle_osd_ping got reply from osd." << from
4960 << " first_tx " << i->second.first_tx
4961 << " last_tx " << i->second.last_tx
4962 << " last_rx_back " << i->second.last_rx_back << " -> " << m->stamp
4963 << " last_rx_front " << i->second.last_rx_front
4965 i->second.last_rx_back = m->stamp;
4966 // if there is no front con, set both stamps.
4967 if (i->second.con_front == NULL)
4968 i->second.last_rx_front = m->stamp;
4969 } else if (m->get_connection() == i->second.con_front) {
4970 dout(25) << "handle_osd_ping got reply from osd." << from
4971 << " first_tx " << i->second.first_tx
4972 << " last_tx " << i->second.last_tx
4973 << " last_rx_back " << i->second.last_rx_back
4974 << " last_rx_front " << i->second.last_rx_front << " -> " << m->stamp
4976 i->second.last_rx_front = m->stamp;
// Peer is healthy again within the grace window: retract any failure
// report we queued locally or already sent to the monitor.
4979 utime_t cutoff = ceph_clock_now();
4980 cutoff -= cct->_conf->osd_heartbeat_grace;
4981 if (i->second.is_healthy(cutoff)) {
4982 // Cancel false reports
4983 auto failure_queue_entry = failure_queue.find(from);
4984 if (failure_queue_entry != failure_queue.end()) {
4985 dout(10) << "handle_osd_ping canceling queued "
4986 << "failure report for osd." << from << dendl;
4987 failure_queue.erase(failure_queue_entry);
4990 auto failure_pending_entry = failure_pending.find(from);
4991 if (failure_pending_entry != failure_pending.end()) {
4992 dout(10) << "handle_osd_ping canceling in-flight "
4993 << "failure report for osd." << from << dendl;
4994 send_still_alive(curmap->get_epoch(),
4995 failure_pending_entry->second.second);
4996 failure_pending.erase(failure_pending_entry);
// Share newer maps with the peer when appropriate (condition partially
// elided here — see full source).
5002 curmap->is_up(from)) {
5003 service.note_peer_epoch(from, m->map_epoch);
5005 ConnectionRef con = service.get_con_osd_cluster(from, curmap->get_epoch());
5007 service.share_map_peer(from, con.get());
5014 case MOSDPing::YOU_DIED:
5015 dout(10) << "handle_osd_ping " << m->get_source_inst()
5016 << " says i am down in " << m->map_epoch << dendl;
5017 osdmap_subscribe(curmap->get_epoch()+1, false);
5021 heartbeat_lock.Unlock();
// Heartbeat thread main loop: send a round of pings, then sleep for a
// randomly jittered interval (0.5 + [0..1) * osd_heartbeat_interval
// seconds) on heartbeat_cond until heartbeat_stop is set. The jitter
// keeps peers' ping schedules from synchronizing.
5025 void OSD::heartbeat_entry()
5027 Mutex::Locker l(heartbeat_lock);
5030 while (!heartbeat_stop) {
5033 double wait = .5 + ((float)(rand() % 10)/10.0) * (float)cct->_conf->osd_heartbeat_interval;
5035 w.set_from_double(wait);
5036 dout(30) << "heartbeat_entry sleeping for " << wait << dendl;
// WaitInterval releases heartbeat_lock while sleeping; a signal (e.g.
// shutdown) wakes us early.
5037 heartbeat_cond.WaitInterval(heartbeat_lock, w);
5040 dout(30) << "heartbeat_entry woke up" << dendl;
// Scan heartbeat peers for ones that have not replied within
// osd_heartbeat_grace and enqueue failure reports for them in
// failure_queue (later forwarded to the monitor). Peers we have never
// pinged are skipped. Caller must hold heartbeat_lock (asserted).
5044 void OSD::heartbeat_check()
5046 assert(heartbeat_lock.is_locked());
5047 utime_t now = ceph_clock_now();
5049 // check for heartbeat replies (move me elsewhere?)
5050 utime_t cutoff = now;
5051 cutoff -= cct->_conf->osd_heartbeat_grace;
5052 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5053 p != heartbeat_peers.end();
5056 if (p->second.first_tx == utime_t()) {
5057 dout(25) << "heartbeat_check we haven't sent ping to osd." << p->first
5058 << "yet, skipping" << dendl;
5062 dout(25) << "heartbeat_check osd." << p->first
5063 << " first_tx " << p->second.first_tx
5064 << " last_tx " << p->second.last_tx
5065 << " last_rx_back " << p->second.last_rx_back
5066 << " last_rx_front " << p->second.last_rx_front
5068 if (p->second.is_unhealthy(cutoff)) {
// Never heard from this peer on either channel since our first ping.
5069 if (p->second.last_rx_back == utime_t() ||
5070 p->second.last_rx_front == utime_t()) {
// NOTE(review): con_front is dereferenced here for logging, but other
// code in this file guards con_front against null (single-network
// setups) — confirm a null front connection cannot reach this path.
5071 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
5072 << " osd." << p->first << " ever on either front or back, first ping sent "
5073 << p->second.first_tx << " (cutoff " << cutoff << ")" << dendl;
// Report failure as of the last ping we transmitted.
5075 failure_queue[p->first] = p->second.last_tx;
5077 derr << "heartbeat_check: no reply from " << p->second.con_front->get_peer_addr().get_sockaddr()
5078 << " osd." << p->first << " since back " << p->second.last_rx_back
5079 << " front " << p->second.last_rx_front
5080 << " (cutoff " << cutoff << ")" << dendl;
// Report failure as of the older of the two last-received stamps.
5082 failure_queue[p->first] = MIN(p->second.last_rx_back, p->second.last_rx_front);
5088 void OSD::heartbeat()
5090 dout(30) << "heartbeat" << dendl;
5094 int n_samples = 86400 / cct->_conf->osd_heartbeat_interval;
5095 if (getloadavg(loadavgs, 1) == 1) {
5096 logger->set(l_osd_loadavg, 100 * loadavgs[0]);
5097 daily_loadavg = (daily_loadavg * (n_samples - 1) + loadavgs[0]) / n_samples;
5098 dout(30) << "heartbeat: daily_loadavg " << daily_loadavg << dendl;
5101 dout(30) << "heartbeat checking stats" << dendl;
5104 vector<int> hb_peers;
5105 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5106 p != heartbeat_peers.end();
5108 hb_peers.push_back(p->first);
5109 service.update_osd_stat(hb_peers);
5111 dout(5) << "heartbeat: " << service.get_osd_stat() << dendl;
5113 utime_t now = ceph_clock_now();
5116 for (map<int,HeartbeatInfo>::iterator i = heartbeat_peers.begin();
5117 i != heartbeat_peers.end();
5119 int peer = i->first;
5120 i->second.last_tx = now;
5121 if (i->second.first_tx == utime_t())
5122 i->second.first_tx = now;
5123 dout(30) << "heartbeat sending ping to osd." << peer << dendl;
5124 i->second.con_back->send_message(new MOSDPing(monc->get_fsid(),
5125 service.get_osdmap()->get_epoch(),
5126 MOSDPing::PING, now,
5127 cct->_conf->osd_heartbeat_min_size));
5129 if (i->second.con_front)
5130 i->second.con_front->send_message(new MOSDPing(monc->get_fsid(),
5131 service.get_osdmap()->get_epoch(),
5132 MOSDPing::PING, now,
5133 cct->_conf->osd_heartbeat_min_size));
5136 logger->set(l_osd_hb_to, heartbeat_peers.size());
5138 // hmm.. am i all alone?
5139 dout(30) << "heartbeat lonely?" << dendl;
5140 if (heartbeat_peers.empty()) {
5141 if (now - last_mon_heartbeat > cct->_conf->osd_mon_heartbeat_interval && is_active()) {
5142 last_mon_heartbeat = now;
5143 dout(10) << "i have no heartbeat peers; checking mon for new map" << dendl;
5144 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5148 dout(30) << "heartbeat done" << dendl;
// Messenger callback: a heartbeat connection was reset. If the failed
// connection still belongs to a tracked peer, reopen fresh back/front
// connections at the peer's recorded epoch (or drop the peer if the
// osdmap raced ahead and no connection can be made); otherwise just
// note the stale connection. Returns under heartbeat_lock protection.
5151 bool OSD::heartbeat_reset(Connection *con)
5153 HeartbeatSession *s = static_cast<HeartbeatSession*>(con->get_priv());
5155 heartbeat_lock.Lock();
5156 if (is_stopping()) {
5157 heartbeat_lock.Unlock();
5161 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(s->peer);
5162 if (p != heartbeat_peers.end() &&
5163 (p->second.con_back == con ||
5164 p->second.con_front == con)) {
5165 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5166 << ", reopening" << dendl;
// Tear down the surviving sibling connection too, so both are rebuilt
// as a pair.
5167 if (con != p->second.con_back) {
5168 p->second.con_back->mark_down();
5170 p->second.con_back.reset(NULL);
5171 if (p->second.con_front && con != p->second.con_front) {
5172 p->second.con_front->mark_down();
5174 p->second.con_front.reset(NULL);
// Reuse the recorded registration epoch when reconnecting.
5175 pair<ConnectionRef,ConnectionRef> newcon = service.get_con_osd_hb(p->second.peer, p->second.epoch);
5177 p->second.con_back = newcon.first.get();
5178 p->second.con_back->set_priv(s->get());
5179 if (newcon.second) {
5180 p->second.con_front = newcon.second.get();
5181 p->second.con_front->set_priv(s->get());
// No usable connection (elided else-branch): the osdmap moved on, so
// stop tracking this peer.
5184 dout(10) << "heartbeat_reset failed hb con " << con << " for osd." << p->second.peer
5185 << ", raced with osdmap update, closing out peer" << dendl;
5186 heartbeat_peers.erase(p);
5189 dout(10) << "heartbeat_reset closing (old) failed hb con " << con << dendl;
5191 heartbeat_lock.Unlock();
5199 // =========================================
// Body of OSD::tick() — the signature line is elided in this extraction.
// Periodic housekeeping under osd_lock: refresh heartbeat peers while
// active/waiting-for-healthy, retry boot once the monitors all support
// the LUMINOUS feature, and re-arm the timer for the next tick.
5203 assert(osd_lock.is_locked());
5204 dout(10) << "tick" << dendl;
5206 if (is_active() || is_waiting_for_healthy()) {
5207 maybe_update_heartbeat_peers();
5210 if (is_waiting_for_healthy()) {
// We stalled in preboot waiting for a mon upgrade; now that the monmap
// requires LUMINOUS, booting can proceed (elided call follows).
5212 } else if (is_preboot() &&
5213 waiting_for_luminous_mons &&
5214 monc->monmap.get_required_features().contains_all(
5215 ceph::features::mon::FEATURE_LUMINOUS)) {
5216 // mon upgrade finished!
// Schedule the next tick.
5222 tick_timer.add_event_after(OSD_TICK_INTERVAL, new C_Tick(this));
// Periodic housekeeping that deliberately avoids osd_lock: update
// buffer/CRC perf counters, run heartbeat checks, decide whether to
// send PG-stat reports to the monitor (with ack-timeout backoff and an
// in-flight cap), maybe start a scrub, send the OSD beacon when due,
// check slow ops, kick recovery, and re-arm this timer.
// Caller must hold tick_timer_lock (asserted).
5225 void OSD::tick_without_osd_lock()
5227 assert(tick_timer_lock.is_locked());
5228 dout(10) << "tick_without_osd_lock" << dendl;
5230 logger->set(l_osd_buf, buffer::get_total_alloc());
5231 logger->set(l_osd_history_alloc_bytes, SHIFT_ROUND_UP(buffer::get_history_alloc_bytes(), 20));
5232 logger->set(l_osd_history_alloc_num, buffer::get_history_alloc_num());
5233 logger->set(l_osd_cached_crc, buffer::get_cached_crc());
5234 logger->set(l_osd_cached_crc_adjusted, buffer::get_cached_crc_adjusted());
5235 logger->set(l_osd_missed_crc, buffer::get_missed_crc());
5237 // osd_lock is not being held, which means the OSD state
5238 // might change when doing the monitor report
5239 if (is_active() || is_waiting_for_healthy()) {
5240 heartbeat_lock.Lock();
5242 heartbeat_lock.Unlock();
5244 map_lock.get_read();
5245 Mutex::Locker l(mon_report_lock);
// Decide whether to send the monitor a stats report this tick.
5249 bool report = false;
5250 utime_t now = ceph_clock_now();
5251 pg_stat_queue_lock.Lock();
// Backoff scales the minimum report interval by how far the ack
// timeout has already been stretched.
5252 double backoff = stats_ack_timeout / cct->_conf->osd_mon_ack_timeout;
5253 double adjusted_min = cct->_conf->osd_mon_report_interval_min * backoff;
5254 // note: we shouldn't adjust max because it must remain < the
5255 // mon's mon_osd_report_timeout (which defaults to 1.5x our
5257 double max = cct->_conf->osd_mon_report_interval_max;
// Mon has gone quiet on our PGStats: reconnect to another mon, reset
// the clocks, and stretch the ack timeout.
5258 if (!outstanding_pg_stats.empty() &&
5259 (now - stats_ack_timeout) > last_pg_stats_ack) {
5260 dout(1) << __func__ << " mon hasn't acked PGStats in "
5261 << now - last_pg_stats_ack
5262 << " seconds, reconnecting elsewhere" << dendl;
5264 last_pg_stats_ack = now; // reset clock
5265 last_pg_stats_sent = utime_t();
5267 MAX(cct->_conf->osd_mon_ack_timeout,
5268 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_factor);
5269 outstanding_pg_stats.clear();
5271 if (now - last_pg_stats_sent > max) {
5272 osd_stat_updated = true;
5274 } else if (service.need_fullness_update()) {
// Too many unacked reports: hold off rather than pile on.
5276 } else if ((int)outstanding_pg_stats.size() >=
5277 cct->_conf->osd_mon_report_max_in_flight) {
5278 dout(20) << __func__ << " have max " << outstanding_pg_stats
5279 << " stats updates in flight" << dendl;
5281 if (now - last_mon_report > adjusted_min) {
5282 dout(20) << __func__ << " stats backoff " << backoff
5283 << " adjusted_min " << adjusted_min << " - sending report"
5285 osd_stat_updated = true;
5289 pg_stat_queue_lock.Unlock();
5292 monc->reopen_session();
5293 } else if (report) {
5294 last_mon_report = now;
5296 // do any pending reports
5299 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5303 map_lock.put_read();
5307 if (!scrub_random_backoff()) {
5310 service.promote_throttle_recalibrate();
5311 resume_creating_pg();
// Send a beacon at most once per osd_beacon_report_interval.
5312 bool need_send_beacon = false;
5313 const auto now = ceph::coarse_mono_clock::now();
5315 // borrow lec lock to protect last_sent_beacon from changing
5316 Mutex::Locker l{min_last_epoch_clean_lock};
5317 const auto elapsed = now - last_sent_beacon;
5318 if (chrono::duration_cast<chrono::seconds>(elapsed).count() >
5319 cct->_conf->osd_beacon_report_interval) {
5320 need_send_beacon = true;
5323 if (need_send_beacon) {
5328 check_ops_in_flight();
5329 service.kick_recovery_queue();
5330 tick_timer_without_osd_lock.add_event_after(OSD_TICK_INTERVAL, new C_Tick_WithoutOSDLock(this));
// Ask the op tracker for slow-op warnings and iterate over them (the
// per-warning emission lines are elided in this extraction — they
// presumably log each warning to the cluster log).
5333 void OSD::check_ops_in_flight()
5335 vector<string> warnings;
5336 if (op_tracker.check_ops_in_flight(warnings)) {
5337 for (vector<string>::iterator i = warnings.begin();
5338 i != warnings.end();
5346 // setomapval <pool-id> [namespace/]<obj-name> <key> <val>
5347 // rmomapkey <pool-id> [namespace/]<obj-name> <key>
5348 // setomapheader <pool-id> [namespace/]<obj-name> <header>
5349 // getomap <pool> [namespace/]<obj-name>
5350 // truncobj <pool-id> [namespace/]<obj-name> <newlen>
5351 // injectmdataerr [namespace/]<obj-name> [shardid]
5352 // injectdataerr [namespace/]<obj-name> [shardid]
5354 // set_recovery_delay [utime]
// Admin-socket test/debug command dispatcher. Lets a developer mutate a
// single OSD's on-disk state directly: set/remove omap keys and
// headers, dump an object's omap, truncate an object, inject data or
// metadata errors, tweak the recovery delay, force-schedule a scrub, or
// inject fullness states. Results/errors are written to `ss`.
5355 void TestOpsSocketHook::test_ops(OSDService *service, ObjectStore *store,
5356 const std::string &command, cmdmap_t& cmdmap, ostream &ss)
5359 //Support changing the omap on a single osd by using the Admin Socket to
5360 //directly request the osd make a change.
5361 if (command == "setomapval" || command == "rmomapkey" ||
5362 command == "setomapheader" || command == "getomap" ||
5363 command == "truncobj" || command == "injectmdataerr" ||
5364 command == "injectdataerr"
5368 OSDMapRef curmap = service->get_osdmap();
// Resolve the pool by name first, then fall back to a numeric id.
5373 cmd_getval(service->cct, cmdmap, "pool", poolstr);
5374 pool = curmap->lookup_pg_pool_name(poolstr);
5375 //If we can't find it by name then maybe id specified
5376 if (pool < 0 && isdigit(poolstr[0]))
5377 pool = atoll(poolstr.c_str());
5379 ss << "Invalid pool '" << poolstr << "''";
// An optional "namespace/" prefix on objname selects the namespace.
5383 string objname, nspace;
5384 cmd_getval(service->cct, cmdmap, "objname", objname);
5385 std::size_t found = objname.find_first_of('/');
5386 if (found != string::npos) {
5387 nspace = objname.substr(0, found);
5388 objname = objname.substr(found+1);
5390 object_locator_t oloc(pool, nspace);
5391 r = curmap->object_locator_to_pg(object_t(objname), oloc, rawpg);
5394 ss << "Invalid namespace/objname";
// Build the ghobject/spg the commands below operate on.
5399 cmd_getval(service->cct, cmdmap, "shardid", shardid, int64_t(shard_id_t::NO_SHARD));
5400 hobject_t obj(object_t(objname), string(""), CEPH_NOSNAP, rawpg.ps(), pool, nspace);
5401 ghobject_t gobj(obj, ghobject_t::NO_GEN, shard_id_t(uint8_t(shardid)));
5402 spg_t pgid(curmap->raw_pg_to_pg(rawpg), shard_id_t(shardid));
// Only the error-injection commands are valid on EC pools.
5403 if (curmap->pg_is_ec(rawpg)) {
5404 if ((command != "injectdataerr") && (command != "injectmdataerr")) {
5405 ss << "Must not call on ec pool, except injectdataerr or injectmdataerr";
5410 ObjectStore::Transaction t;
5412 if (command == "setomapval") {
5413 map<string, bufferlist> newattrs;
5416 cmd_getval(service->cct, cmdmap, "key", key);
5417 cmd_getval(service->cct, cmdmap, "val", valstr);
5420 newattrs[key] = val;
5421 t.omap_setkeys(coll_t(pgid), ghobject_t(obj), newattrs);
// Each mutation is applied synchronously via apply_transaction on the
// meta sequencer; nonzero r is reported to the caller.
5422 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5424 ss << "error=" << r;
5427 } else if (command == "rmomapkey") {
5430 cmd_getval(service->cct, cmdmap, "key", key);
5433 t.omap_rmkeys(coll_t(pgid), ghobject_t(obj), keys);
5434 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5436 ss << "error=" << r;
5439 } else if (command == "setomapheader") {
5440 bufferlist newheader;
5443 cmd_getval(service->cct, cmdmap, "header", headerstr);
5444 newheader.append(headerstr);
5445 t.omap_setheader(coll_t(pgid), ghobject_t(obj), newheader);
5446 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5448 ss << "error=" << r;
5451 } else if (command == "getomap") {
5452 //Debug: Output entire omap
5454 map<string, bufferlist> keyvals;
5455 r = store->omap_get(coll_t(pgid), ghobject_t(obj), &hdrbl, &keyvals);
5457 ss << "header=" << string(hdrbl.c_str(), hdrbl.length());
5458 for (map<string, bufferlist>::iterator it = keyvals.begin();
5459 it != keyvals.end(); ++it)
5460 ss << " key=" << (*it).first << " val="
5461 << string((*it).second.c_str(), (*it).second.length());
5463 ss << "error=" << r;
5465 } else if (command == "truncobj") {
5467 cmd_getval(service->cct, cmdmap, "len", trunclen);
5468 t.truncate(coll_t(pgid), ghobject_t(obj), trunclen);
5469 r = store->apply_transaction(service->meta_osr.get(), std::move(t));
5471 ss << "error=" << r;
5474 } else if (command == "injectdataerr") {
5475 store->inject_data_error(gobj);
5477 } else if (command == "injectmdataerr") {
5478 store->inject_mdata_error(gobj);
// Adjust osd_recovery_delay_start at runtime (default delay 0).
5483 if (command == "set_recovery_delay") {
5485 cmd_getval(service->cct, cmdmap, "utime", delay, (int64_t)0);
5488 int r = service->cct->_conf->set_val("osd_recovery_delay_start",
5491 ss << "set_recovery_delay: error setting "
5492 << "osd_recovery_delay_start to '" << delay << "': error "
5496 service->cct->_conf->apply_changes(NULL);
5497 ss << "set_recovery_delay: set osd_recovery_delay_start "
5498 << "to " << service->cct->_conf->osd_recovery_delay_start;
// Force a scrub soon by backdating the PG's last_scrub_stamp past the
// pool/global scrub_max_interval (instead of setting must_scrub).
5501 if (command == "trigger_scrub") {
5503 OSDMapRef curmap = service->get_osdmap();
5507 cmd_getval(service->cct, cmdmap, "pgid", pgidstr);
5508 if (!pgid.parse(pgidstr.c_str())) {
5509 ss << "Invalid pgid specified";
5513 PG *pg = service->osd->_lookup_lock_pg(pgid);
5514 if (pg == nullptr) {
5515 ss << "Can't find pg " << pgid;
5519 if (pg->is_primary()) {
5520 pg->unreg_next_scrub();
5521 const pg_pool_t *p = curmap->get_pg_pool(pgid.pool());
5522 double pool_scrub_max_interval = 0;
5523 p->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &pool_scrub_max_interval);
5524 double scrub_max_interval = pool_scrub_max_interval > 0 ?
5525 pool_scrub_max_interval : g_conf->osd_scrub_max_interval;
5526 // Instead of marking must_scrub force a schedule scrub
5527 utime_t stamp = ceph_clock_now();
5528 stamp -= scrub_max_interval;
5529 stamp -= 100.0; // push back last scrub more for good measure
5530 pg->info.history.last_scrub_stamp = stamp;
5531 pg->reg_next_scrub();
5534 ss << "Not primary";
// Simulate fullness: type in {none, nearfull, backfillfull, full,
// failsafe}; count limits how many checks report that state (-1 =
// unlimited, 0/none clears).
5539 if (command == "injectfull") {
5542 OSDService::s_names state;
5543 cmd_getval(service->cct, cmdmap, "type", type, string("full"));
5544 cmd_getval(service->cct, cmdmap, "count", count, (int64_t)-1);
5545 if (type == "none" || count == 0) {
5549 state = service->get_full_state(type);
5550 if (state == OSDService::s_names::INVALID) {
5551 ss << "Invalid type use (none, nearfull, backfillfull, full, failsafe)";
5554 service->set_injectfull(state, count);
5557 ss << "Internal error - command=" << command;
5560 // =========================================
// Body of the file-local remove_dir() helper (its opening line, with
// the function name and first parameters, is elided in this
// extraction; it is invoked as remove_dir(...) by RemoveWQ::_process).
// Deletes one batch of objects from a PG collection: lists up to
// get_ideal_list_max() objects, removes each from the snap mapper and
// the store in transactions capped at osd_target_transaction_size,
// pausing/resuming clearing around each queued transaction so the
// deletion can be throttled. Sets *finished when the listing reached
// the end of the collection; returns whether clearing should continue.
5563 ObjectStore *store, SnapMapper *mapper,
5565 ObjectStore::Sequencer *osr,
5566 coll_t coll, DeletingStateRef dstate,
5568 ThreadPool::TPHandle &handle)
5570 vector<ghobject_t> olist;
5572 ObjectStore::Transaction t;
5574 handle.reset_tp_timeout();
5575 store->collection_list(
5578 ghobject_t::get_max(),
5579 store->get_ideal_list_max(),
5582 generic_dout(10) << __func__ << " " << olist << dendl;
5583 // default cont to true, this is safe because caller(OSD::RemoveWQ::_process())
5584 // will recheck the answer before it really goes on.
5586 for (vector<ghobject_t>::iterator i = olist.begin();
// Remove the object's snap-mapper entry in the same transaction;
// -ENOENT is tolerated (object had no mapping).
5591 OSDriver::OSTransaction _t(osdriver->get_transaction(&t));
5592 int r = mapper->remove_oid(i->hobj, &_t);
5593 if (r != 0 && r != -ENOENT) {
// Flush the batch once it reaches osd_target_transaction_size.
5597 if (++num >= cct->_conf->osd_target_transaction_size) {
5599 store->queue_transaction(osr, std::move(t), &waiter);
// Suspend the thread-pool timeout while (presumably) waiting for the
// transaction to apply, and let dstate pause/resume the clearing.
5600 cont = dstate->pause_clearing();
5601 handle.suspend_tp_timeout();
5603 handle.reset_tp_timeout();
5605 cont = dstate->resume_clearing();
// Start a fresh transaction for the next batch.
5608 t = ObjectStore::Transaction();
// Flush whatever remains in the final partial batch.
5614 store->queue_transaction(osr, std::move(t), &waiter);
5615 cont = dstate->pause_clearing();
5616 handle.suspend_tp_timeout();
5618 handle.reset_tp_timeout();
5620 cont = dstate->resume_clearing();
5622 // whether there are more objects to remove in the collection
5623 *finished = next.is_max();
// Work-queue item: delete a PG's data from the object store. Repeatedly
// calls remove_dir() to clear the collection's objects (respecting the
// DeletingState pause/resume protocol), then removes the PG's info/log
// keys and the collection itself in a final transaction. A PGRef is
// held via ContainerContext until that transaction completes so the
// sequencer stays alive.
5627 void OSD::RemoveWQ::_process(
5628 pair<PGRef, DeletingStateRef> item,
5629 ThreadPool::TPHandle &handle)
5632 PGRef pg(item.first);
5633 SnapMapper &mapper = pg->snap_mapper;
5634 OSDriver &driver = pg->osdriver;
5635 coll_t coll = coll_t(pg->info.pgid);
5637 bool finished = false;
// Bail if another actor (e.g. PG resurrection) cancelled the deletion.
5639 if (!item.second->start_or_resume_clearing())
5642 bool cont = remove_dir(
5643 pg->cct, store, &mapper, &driver, pg->osr.get(), coll, item.second,
5648 if (item.second->pause_clearing())
// All objects gone: transition to deleting the collection metadata.
5653 if (!item.second->start_deleting())
5656 ObjectStore::Transaction t;
5657 PGLog::clear_info_log(pg->info.pgid, &t);
// Test hook: optionally crash here to exercise pg-removal recovery.
5659 if (cct->_conf->osd_inject_failure_on_pg_removal) {
5660 generic_derr << "osd_inject_failure_on_pg_removal" << dendl;
5663 t.remove_collection(coll);
5665 // We need the sequencer to stick around until the op is complete
5666 store->queue_transaction(
5671 0, // onreadable sync
5672 new ContainerContext<PGRef>(pg),
5675 item.second->finish_deleting();
5677 // =========================================
// Messenger callback: a connection we initiated is (re)established.
// Only monitor connections get special handling — on a new mon session
// we resend the boot message (if booting) or, under map/report locks,
// resend everything session-scoped: pg_temp claims, the fullness state,
// the beacon, and any outstanding full-map requests.
5679 void OSD::ms_handle_connect(Connection *con)
5681 dout(10) << __func__ << " con " << con << dendl;
5682 if (con->get_peer_type() == CEPH_ENTITY_TYPE_MON) {
5683 Mutex::Locker l(osd_lock);
5686 dout(10) << __func__ << " on mon" << dendl;
5690 } else if (is_booting()) {
5691 _send_boot(); // resend boot message
5693 map_lock.get_read();
5694 Mutex::Locker l2(mon_report_lock);
5696 utime_t now = ceph_clock_now();
5697 last_mon_report = now;
5699 // resend everything, it's a new session
5702 service.requeue_pg_temp();
5703 service.send_pg_temp();
5706 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS) {
5710 map_lock.put_read();
5712 send_beacon(ceph::coarse_mono_clock::now());
5716 // full map requests may happen while active or pre-boot
5717 if (requested_full_first) {
5718 rerequest_full_maps();
// Fast-dispatch callback for outgoing connections: for non-mon/non-mgr
// peers, ensure the connection carries a Session priv. Only OSD peers
// are expected here (asserted) — we never connect out to clients.
5723 void OSD::ms_handle_fast_connect(Connection *con)
5725 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5726 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5727 Session *s = static_cast<Session*>(con->get_priv());
// No session yet (elided null check above): create and attach one.
5729 s = new Session(cct);
5730 con->set_priv(s->get());
5732 dout(10) << " new session (outgoing) " << s << " con=" << s->con
5733 << " addr=" << s->con->get_peer_addr() << dendl;
5734 // we don't connect to clients
5735 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5736 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
// Fast-dispatch callback for accepted (incoming) connections: mirror of
// ms_handle_fast_connect — attach a Session priv for non-mon/non-mgr
// peers; only OSD peers are expected on this path (asserted).
5742 void OSD::ms_handle_fast_accept(Connection *con)
5744 if (con->get_peer_type() != CEPH_ENTITY_TYPE_MON &&
5745 con->get_peer_type() != CEPH_ENTITY_TYPE_MGR) {
5746 Session *s = static_cast<Session*>(con->get_priv());
// No session yet (elided null check above): create and attach one.
5748 s = new Session(cct);
5749 con->set_priv(s->get());
5751 dout(10) << "new session (incoming)" << s << " con=" << con
5752 << " addr=" << con->get_peer_addr()
5753 << " must have raced with connect" << dendl;
5754 assert(con->get_peer_type() == CEPH_ENTITY_TYPE_OSD);
5755 s->entity_name.set_type(CEPH_ENTITY_TYPE_OSD);
// Messenger callback: a connection with a session was reset. Reset the
// session's watch state and break the con<->session reference cycle
// *before* running session_handle_reset, to avoid racing with backoff
// bookkeeping (see comment below).
5761 bool OSD::ms_handle_reset(Connection *con)
5763 Session *session = static_cast<Session*>(con->get_priv());
5764 dout(2) << "ms_handle_reset con " << con << " session " << session << dendl;
5767 session->wstate.reset(con);
5768 session->con.reset(NULL); // break con <-> session ref cycle
5769 // note that we break session->con *before* the session_handle_reset
5770 // cleanup below. this avoids a race between us and
5771 // PG::add_backoff, Session::check_backoff, etc.
5772 session_handle_reset(session);
// Messenger callback: a peer actively refused our connection. When
// osd_fast_fail_on_connection_refused is enabled and the refusing peer
// is an OSD still marked up in our map, report it to the monitor
// immediately (FLAG_IMMEDIATE|FLAG_FAILED) instead of waiting for the
// heartbeat grace to expire.
5777 bool OSD::ms_handle_refused(Connection *con)
5779 if (!cct->_conf->osd_fast_fail_on_connection_refused)
5782 Session *session = static_cast<Session*>(con->get_priv());
5783 dout(2) << "ms_handle_refused con " << con << " session " << session << dendl;
5786 int type = con->get_peer_type();
5787 // handle only OSD failures here
5788 if (monc && (type == CEPH_ENTITY_TYPE_OSD)) {
5789 OSDMapRef osdmap = get_osdmap();
5791 int id = osdmap->identify_osd_on_all_channels(con->get_peer_addr());
5792 if (id >= 0 && osdmap->is_up(id)) {
5793 // I'm cheating mon heartbeat grace logic, because we know it's not going
5794 // to respawn alone. +1 so we won't hit any boundary case.
5795 monc->send_mon_message(new MOSDFailure(monc->get_fsid(),
5796 osdmap->get_inst(id),
5797 cct->_conf->osd_heartbeat_grace + 1,
5798 osdmap->get_epoch(),
5799 MOSDFailure::FLAG_IMMEDIATE | MOSDFailure::FLAG_FAILED
// Completion for monc->get_version("osdmap", ...): the monitor fills in
// the oldest/newest osdmap epochs it has, and finish() hands them to
// OSD::_got_mon_epochs() to drive the preboot state machine.
5808 struct C_OSD_GetVersion : public Context {
5810 uint64_t oldest, newest;
5811 explicit C_OSD_GetVersion(OSD *o) : osd(o), oldest(0), newest(0) {}
5812 void finish(int r) override {
// Elided guard above this call presumably checks r >= 0.
5814 osd->_got_mon_epochs(oldest, newest);
// Begin the boot sequence. If internal heartbeats say we are unhealthy,
// defer booting (enter WAITING_FOR_HEALTHY and ping peers sooner).
// Otherwise enter PREBOOT and ask the monitor which osdmap epochs it
// holds; the C_OSD_GetVersion completion continues via _got_mon_epochs.
5818 void OSD::start_boot()
5820 if (!_is_healthy()) {
5821 // if we are not healthy, do not mark ourselves up (yet)
5822 dout(1) << "not healthy; waiting to boot" << dendl;
5823 if (!is_waiting_for_healthy())
5824 start_waiting_for_healthy();
5825 // send pings sooner rather than later
5829 dout(1) << __func__ << dendl;
5830 set_state(STATE_PREBOOT);
5831 waiting_for_luminous_mons = false;
5832 dout(10) << "start_boot - have maps " << superblock.oldest_map
5833 << ".." << superblock.newest_map << dendl;
5834 C_OSD_GetVersion *c = new C_OSD_GetVersion(this);
5835 monc->get_version("osdmap", &c->newest, &c->oldest, c);
// Callback target of C_OSD_GetVersion: under osd_lock, forward the
// monitor's oldest/newest osdmap epochs to _preboot() (an elided guard
// presumably checks we are still in PREBOOT).
5838 void OSD::_got_mon_epochs(epoch_t oldest, epoch_t newest)
5840 Mutex::Locker l(osd_lock);
5842 _preboot(oldest, newest);
// Decide whether we can send MOSDBoot yet, given the monitor's span of
// osdmaps [oldest, newest]. Booting is blocked while: we have no map,
// we are destroyed in the map, NOUP is set, SORTBITWISE is unset,
// require_osd_release < jewel, the monitors don't all speak LUMINOUS,
// or our fullness state is stale. If our map is recent enough we boot
// (elided branch); otherwise we subscribe to catch up on maps.
5846 void OSD::_preboot(epoch_t oldest, epoch_t newest)
5848 assert(is_preboot());
5849 dout(10) << __func__ << " _preboot mon has osdmaps "
5850 << oldest << ".." << newest << dendl;
5852 // ensure our local fullness awareness is accurate
5855 // if our map within recent history, try to add ourselves to the osdmap.
5856 if (osdmap->get_epoch() == 0) {
5857 derr << "waiting for initial osdmap" << dendl;
5858 } else if (osdmap->is_destroyed(whoami)) {
5859 derr << "osdmap says I am destroyed, exiting" << dendl;
5861 } else if (osdmap->test_flag(CEPH_OSDMAP_NOUP) || osdmap->is_noup(whoami)) {
5862 derr << "osdmap NOUP flag is set, waiting for it to clear" << dendl;
5863 } else if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE)) {
5864 derr << "osdmap SORTBITWISE OSDMap flag is NOT set; please set it"
5866 } else if (osdmap->require_osd_release < CEPH_RELEASE_JEWEL) {
5867 derr << "osdmap REQUIRE_JEWEL OSDMap flag is NOT set; please set it"
5869 } else if (!monc->monmap.get_required_features().contains_all(
5870 ceph::features::mon::FEATURE_LUMINOUS)) {
5871 derr << "monmap REQUIRE_LUMINOUS is NOT set; must upgrade all monitors to "
5872 << "Luminous or later before Luminous OSDs will boot" << dendl;
// tick() watches this flag and retries boot once the mons upgrade.
5873 waiting_for_luminous_mons = true;
5874 } else if (service.need_fullness_update()) {
5875 derr << "osdmap fullness state needs update" << dendl;
// Our map is within osd_map_message_max of the mon's newest: close
// enough to boot (elided body).
5877 } else if (osdmap->get_epoch() >= oldest - 1 &&
5878 osdmap->get_epoch() + cct->_conf->osd_map_message_max > newest) {
// Otherwise catch up: incremental maps if our next epoch is still
// available, else request full maps from the oldest the mon has.
5883 // get all the latest maps
5884 if (osdmap->get_epoch() + 1 >= oldest)
5885 osdmap_subscribe(osdmap->get_epoch() + 1, false);
5887 osdmap_subscribe(oldest - 1, true);
// Report our current fullness level (full > backfillfull > nearfull) to the
// mon via MOSDFull, but only when service says an update is actually needed.
5890 void OSD::send_full_update()
5892 if (!service.need_fullness_update())
// pick the most severe applicable state; the branches are ordered so
// that "full" wins over "backfillfull" wins over "nearfull".
5895 if (service.is_full()) {
5896 state = CEPH_OSD_FULL;
5897 } else if (service.is_backfillfull()) {
5898 state = CEPH_OSD_BACKFILLFULL;
5899 } else if (service.is_nearfull()) {
5900 state = CEPH_OSD_NEARFULL;
5903 OSDMap::calc_state_set(state, s);
5904 dout(10) << __func__ << " want state " << s << dendl;
5905 monc->send_mon_message(new MOSDFull(osdmap->get_epoch(), state));
// Enter STATE_WAITING_FOR_HEALTHY: reset the heartbeat resample timestamp
// and subscribe to osdmap updates so we learn if our peers are marked down
// (which would make our peer-health ratio pass again).
5908 void OSD::start_waiting_for_healthy()
5910 dout(1) << "start_waiting_for_healthy" << dendl;
5911 set_state(STATE_WAITING_FOR_HEALTHY);
5912 last_heartbeat_resample = utime_t();
5914 // subscribe to osdmap updates, in case our peers really are known to be dead
5915 osdmap_subscribe(osdmap->get_epoch() + 1, false);
// Health gate used before booting: (1) the internal heartbeat map must be
// healthy; (2) while waiting-for-healthy, at least
// osd_heartbeat_min_healthy_ratio of our heartbeat peers must have been
// heard from within the osd_heartbeat_grace window.
5918 bool OSD::_is_healthy()
5920 if (!cct->get_heartbeat_map()->is_healthy()) {
5921 dout(1) << "is_healthy false -- internal heartbeat failed" << dendl;
5925 if (is_waiting_for_healthy()) {
5926 Mutex::Locker l(heartbeat_lock);
// peers whose last reply is older than now - grace count as down
5927 utime_t cutoff = ceph_clock_now();
5928 cutoff -= cct->_conf->osd_heartbeat_grace;
5929 int num = 0, up = 0;
5930 for (map<int,HeartbeatInfo>::iterator p = heartbeat_peers.begin();
5931 p != heartbeat_peers.end();
5933 if (p->second.is_healthy(cutoff))
5937 if ((float)up < (float)num * cct->_conf->osd_heartbeat_min_healthy_ratio) {
5938 dout(1) << "is_healthy false -- only " << up << "/" << num << " up peers (less than "
5939 << int(cct->_conf->osd_heartbeat_min_healthy_ratio * 100.0) << "%)" << dendl;
// Build and send MOSDBoot to the mon.  For each of our three server-side
// messengers (cluster, hb_back, hb_front) whose bound address has a blank
// IP, borrow the IP from a sibling messenger while keeping the original
// port, and prime the loopback connection's session so local fast delivery
// works before we are marked up.
5947 void OSD::_send_boot()
5949 dout(10) << "_send_boot" << dendl;
// cluster address: fall back to the client (public) messenger's IP
5950 entity_addr_t cluster_addr = cluster_messenger->get_myaddr();
5951 Connection *local_connection = cluster_messenger->get_loopback_connection().get();
5952 if (cluster_addr.is_blank_ip()) {
5953 int port = cluster_addr.get_port();
5954 cluster_addr = client_messenger->get_myaddr();
5955 cluster_addr.set_port(port);
5956 cluster_messenger->set_addr_unknowns(cluster_addr);
5957 dout(10) << " assuming cluster_addr ip matches client_addr" << dendl;
5959 Session *s = static_cast<Session*>(local_connection->get_priv());
5963 cluster_messenger->ms_deliver_handle_fast_connect(local_connection);
// heartbeat back address: fall back to the cluster address's IP
5966 entity_addr_t hb_back_addr = hb_back_server_messenger->get_myaddr();
5967 local_connection = hb_back_server_messenger->get_loopback_connection().get();
5968 if (hb_back_addr.is_blank_ip()) {
5969 int port = hb_back_addr.get_port();
5970 hb_back_addr = cluster_addr;
5971 hb_back_addr.set_port(port);
5972 hb_back_server_messenger->set_addr_unknowns(hb_back_addr);
5973 dout(10) << " assuming hb_back_addr ip matches cluster_addr" << dendl;
5975 Session *s = static_cast<Session*>(local_connection->get_priv());
5979 hb_back_server_messenger->ms_deliver_handle_fast_connect(local_connection);
// heartbeat front address: fall back to the client (public) IP
5982 entity_addr_t hb_front_addr = hb_front_server_messenger->get_myaddr();
5983 local_connection = hb_front_server_messenger->get_loopback_connection().get();
5984 if (hb_front_addr.is_blank_ip()) {
5985 int port = hb_front_addr.get_port();
5986 hb_front_addr = client_messenger->get_myaddr();
5987 hb_front_addr.set_port(port);
5988 hb_front_server_messenger->set_addr_unknowns(hb_front_addr);
5989 dout(10) << " assuming hb_front_addr ip matches client_addr" << dendl;
5991 Session *s = static_cast<Session*>(local_connection->get_priv());
5995 hb_front_server_messenger->ms_deliver_handle_fast_connect(local_connection);
// assemble the boot message, attach host metadata, and go BOOTING
5998 MOSDBoot *mboot = new MOSDBoot(superblock, get_osdmap_epoch(), service.get_boot_epoch(),
5999 hb_back_addr, hb_front_addr, cluster_addr,
6001 dout(10) << " client_addr " << client_messenger->get_myaddr()
6002 << ", cluster_addr " << cluster_addr
6003 << ", hb_back_addr " << hb_back_addr
6004 << ", hb_front_addr " << hb_front_addr
6006 _collect_metadata(&mboot->metadata);
6007 monc->send_mon_message(mboot);
6008 set_state(STATE_BOOTING);
// Fill *pm with this OSD's metadata for the boot message: data/journal
// paths, the four messenger addresses, objectstore type/rotational info,
// device class, store-specific metadata, system info, and the network
// interfaces behind the public/cluster addresses.
6011 void OSD::_collect_metadata(map<string,string> *pm)
6014 (*pm)["osd_data"] = dev_path;
6015 if (store->get_type() == "filestore") {
6016 // not applicable for bluestore
6017 (*pm)["osd_journal"] = journal_path;
6019 (*pm)["front_addr"] = stringify(client_messenger->get_myaddr());
6020 (*pm)["back_addr"] = stringify(cluster_messenger->get_myaddr());
6021 (*pm)["hb_front_addr"] = stringify(hb_front_server_messenger->get_myaddr());
6022 (*pm)["hb_back_addr"] = stringify(hb_back_server_messenger->get_myaddr());
6025 (*pm)["osd_objectstore"] = store->get_type();
6026 (*pm)["rotational"] = store_is_rotational ? "1" : "0";
6027 (*pm)["journal_rotational"] = journal_is_rotational ? "1" : "0";
6028 (*pm)["default_device_class"] = store->get_default_device_class();
6029 store->collect_metadata(pm);
6031 collect_sys_info(pm, cct);
6033 std::string front_iface, back_iface;
6036 CEPH_PICK_ADDRESS_PUBLIC | CEPH_PICK_ADDRESS_CLUSTER,
6037 &front_iface, &back_iface);
// NOTE(review): dropped lines hide which branch uses pick_iface vs the
// pick_addresses call above; both resolve addresses to interface names.
6039 (*pm)["front_iface"] = pick_iface(cct,
6040 client_messenger->get_myaddr().get_sockaddr_storage());
6041 (*pm)["back_iface"] = pick_iface(cct,
6042 cluster_messenger->get_myaddr().get_sockaddr_storage());
6044 dout(10) << __func__ << " " << *pm << dendl;
// Record that we want the mon to bump our up_thru to at least 'want'; only
// raises up_thru_wanted monotonically.  The actual MOSDAlive is sent
// elsewhere (send_alive) under mon_report_lock.
6047 void OSD::queue_want_up_thru(epoch_t want)
6049 map_lock.get_read();
6050 epoch_t cur = osdmap->get_up_thru(whoami);
6051 Mutex::Locker l(mon_report_lock);
6052 if (want > up_thru_wanted) {
6053 dout(10) << "queue_want_up_thru now " << want << " (was " << up_thru_wanted << ")"
6054 << ", currently " << cur
6056 up_thru_wanted = want;
// want <= already-queued value: nothing to update, just log
6059 dout(10) << "queue_want_up_thru want " << want << " <= queued " << up_thru_wanted
6060 << ", currently " << cur
6063 map_lock.put_read();
// If the osdmap's recorded up_thru for us is behind what we want, send an
// MOSDAlive so the mon advances it.  Caller must hold mon_report_lock.
6066 void OSD::send_alive()
6068 assert(mon_report_lock.is_locked());
6069 if (!osdmap->exists(whoami))
6071 epoch_t up_thru = osdmap->get_up_thru(whoami);
6072 dout(10) << "send_alive up_thru currently " << up_thru << " want " << up_thru_wanted << dendl;
6073 if (up_thru_wanted > up_thru) {
6074 dout(10) << "send_alive want " << up_thru_wanted << dendl;
6075 monc->send_mon_message(new MOSDAlive(osdmap->get_epoch(), up_thru_wanted));
// Ask the mon for full (non-incremental) osdmaps [first, last], folding the
// range into any outstanding request (requested_full_first/last) so we
// never re-request epochs already in flight and never ask for older maps.
6079 void OSD::request_full_map(epoch_t first, epoch_t last)
6081 dout(10) << __func__ << " " << first << ".." << last
6082 << ", previously requested "
6083 << requested_full_first << ".." << requested_full_last << dendl;
6084 assert(osd_lock.is_locked());
6085 assert(first > 0 && last > 0);
6086 assert(first <= last);
6087 assert(first >= requested_full_first); // we shouldn't ever ask for older maps
6088 if (requested_full_first == 0) {
// nothing outstanding: request the whole range
6090 requested_full_first = first;
6091 requested_full_last = last;
6092 } else if (last <= requested_full_last) {
// fully covered by the outstanding request: nothing to send
6096 // additional request
6097 first = requested_full_last + 1;
6098 requested_full_last = last;
6100 MMonGetOSDMap *req = new MMonGetOSDMap;
6101 req->request_full(first, last);
6102 monc->send_mon_message(req);
// Note receipt of full map epoch e against the outstanding request range:
// ignore stale/unrequested epochs, reset the range once the last requested
// epoch arrives, otherwise advance requested_full_first past e.
6105 void OSD::got_full_map(epoch_t e)
6107 assert(requested_full_first <= requested_full_last);
6108 assert(osd_lock.is_locked());
6109 if (requested_full_first == 0) {
6110 dout(20) << __func__ << " " << e << ", nothing requested" << dendl;
6113 if (e < requested_full_first) {
6114 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6115 << ".." << requested_full_last
6116 << ", ignoring" << dendl;
6119 if (e >= requested_full_last) {
6120 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6121 << ".." << requested_full_last << ", resetting" << dendl;
6122 requested_full_first = requested_full_last = 0;
// still expecting more maps after e
6126 requested_full_first = e + 1;
6128 dout(10) << __func__ << " " << e << ", requested " << requested_full_first
6129 << ".." << requested_full_last
6130 << ", still need more" << dendl;
// Move every pending (already-reported) failure back onto failure_queue so
// send_failures() will re-report them to the mon, e.g. after reconnecting.
6133 void OSD::requeue_failures()
6135 Mutex::Locker l(heartbeat_lock);
6136 unsigned old_queue = failure_queue.size();
6137 unsigned old_pending = failure_pending.size();
6138 for (map<int,pair<utime_t,entity_inst_t> >::iterator p =
6139 failure_pending.begin();
6140 p != failure_pending.end(); ) {
6141 failure_queue[p->first] = p->second.first;
// post-increment erase keeps the iterator valid across removal
6142 failure_pending.erase(p++);
6144 dout(10) << __func__ << " " << old_queue << " + " << old_pending << " -> "
6145 << failure_queue.size() << dendl;
// Drain failure_queue: for each failed peer not already reported, send an
// MOSDFailure with how long it has been unresponsive, and remember it in
// failure_pending so a duplicate report isn't sent.  Requires map_lock and
// mon_report_lock held by the caller; takes heartbeat_lock itself.
6148 void OSD::send_failures()
6150 assert(map_lock.is_locked());
6151 assert(mon_report_lock.is_locked());
6152 Mutex::Locker l(heartbeat_lock);
6153 utime_t now = ceph_clock_now();
6154 while (!failure_queue.empty()) {
6155 int osd = failure_queue.begin()->first;
6156 if (!failure_pending.count(osd)) {
6157 entity_inst_t i = osdmap->get_inst(osd);
// queued timestamp = when we first noticed the failure
6158 int failed_for = (int)(double)(now - failure_queue.begin()->second);
6159 monc->send_mon_message(new MOSDFailure(monc->get_fsid(), i, failed_for,
6160 osdmap->get_epoch()));
6161 failure_pending[osd] = make_pair(failure_queue.begin()->second, i);
6163 failure_queue.erase(osd);
// Retract a previously reported failure: tell the mon the given OSD is in
// fact alive (MOSDFailure with FLAG_ALIVE).
6167 void OSD::send_still_alive(epoch_t epoch, const entity_inst_t &i)
6169 MOSDFailure *m = new MOSDFailure(monc->get_fsid(), i, 0, epoch, MOSDFailure::FLAG_ALIVE);
6170 monc->send_mon_message(m);
// Pre-luminous stats path (asserted below): batch the queued per-PG stats
// plus our osd_stat into one MPGStats message for the mon.  Each batch gets
// a tid that is tracked in outstanding_pg_stats until acked
// (handle_pg_stats_ack).  PGs we are no longer primary for are dropped from
// the queue; PGs without valid published stats stay queued but unsent.
6173 void OSD::send_pg_stats(const utime_t &now)
6175 assert(map_lock.is_locked());
6176 assert(osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS);
6177 dout(20) << "send_pg_stats" << dendl;
6179 osd_stat_t cur_stat = service.get_osd_stat();
6181 cur_stat.os_perf_stat = store->get_cur_stats();
6183 pg_stat_queue_lock.Lock();
6185 if (osd_stat_updated || !pg_stat_queue.empty()) {
6186 last_pg_stats_sent = now;
6187 osd_stat_updated = false;
6189 dout(10) << "send_pg_stats - " << pg_stat_queue.size() << " pgs updated" << dendl;
6191 utime_t had_for(now);
6192 had_for -= had_map_since;
6194 MPGStats *m = new MPGStats(monc->get_fsid(), osdmap->get_epoch(), had_for);
6196 uint64_t tid = ++pg_stat_tid;
6198 m->osd_stat = cur_stat;
6200 xlist<PG*>::iterator p = pg_stat_queue.begin();
6204 if (!pg->is_primary()) { // we hold map_lock; role is stable.
6205 pg->stat_queue_item.remove_myself();
6206 pg->put("pg_stat_queue");
6209 pg->pg_stats_publish_lock.Lock();
6210 if (pg->pg_stats_publish_valid) {
6211 m->pg_stat[pg->info.pgid.pgid] = pg->pg_stats_publish;
6212 dout(25) << " sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6213 << pg->pg_stats_publish.reported_seq << dendl;
6215 dout(25) << " NOT sending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch << ":"
6216 << pg->pg_stats_publish.reported_seq << ", not valid" << dendl;
6218 pg->pg_stats_publish_lock.Unlock();
// only (re)arm the ack timestamp when no batch is already in flight
6221 if (last_pg_stats_ack == utime_t() || !outstanding_pg_stats.empty()) {
6222 last_pg_stats_ack = ceph_clock_now();
6224 outstanding_pg_stats.insert(tid);
6225 dout(20) << __func__ << " updates pending: " << outstanding_pg_stats << dendl;
6227 monc->send_mon_message(m);
6230 pg_stat_queue_lock.Unlock();
6233 void OSD::handle_pg_stats_ack(MPGStatsAck *ack)
6235 dout(10) << "handle_pg_stats_ack " << dendl;
6237 if (!require_mon_peer(ack)) {
6242 // NOTE: we may get replies from a previous mon even while
6243 // outstanding_pg_stats is empty if reconnecting races with replies
6246 pg_stat_queue_lock.Lock();
6248 last_pg_stats_ack = ceph_clock_now();
6250 // decay timeout slowly (analogous to TCP)
6252 MAX(cct->_conf->osd_mon_ack_timeout,
6253 stats_ack_timeout * cct->_conf->osd_stats_ack_timeout_decay);
6254 dout(20) << __func__ << " timeout now " << stats_ack_timeout << dendl;
6256 if (ack->get_tid() > pg_stat_tid_flushed) {
6257 pg_stat_tid_flushed = ack->get_tid();
6258 pg_stat_queue_cond.Signal();
6261 xlist<PG*>::iterator p = pg_stat_queue.begin();
6267 auto acked = ack->pg_stat.find(pg->info.pgid.pgid);
6268 if (acked != ack->pg_stat.end()) {
6269 pg->pg_stats_publish_lock.Lock();
6270 if (acked->second.first == pg->pg_stats_publish.reported_seq &&
6271 acked->second.second == pg->pg_stats_publish.reported_epoch) {
6272 dout(25) << " ack on " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6273 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6274 pg->stat_queue_item.remove_myself();
6275 pg->put("pg_stat_queue");
6277 dout(25) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6278 << ":" << pg->pg_stats_publish.reported_seq << " > acked "
6279 << acked->second << dendl;
6281 pg->pg_stats_publish_lock.Unlock();
6283 dout(30) << " still pending " << pg->info.pgid << " " << pg->pg_stats_publish.reported_epoch
6284 << ":" << pg->pg_stats_publish.reported_seq << dendl;
6288 outstanding_pg_stats.erase(ack->get_tid());
6289 dout(20) << __func__ << " still pending: " << outstanding_pg_stats << dendl;
6291 pg_stat_queue_lock.Unlock();
// Synchronously flush PG stats: trigger a send (under map/mon_report
// locks), then block on pg_stat_queue_cond until the mon has acked the tid
// current at entry (pg_stat_tid_flushed catches up, see handle_pg_stats_ack).
6296 void OSD::flush_pg_stats()
6298 dout(10) << "flush_pg_stats" << dendl;
6300 utime_t now = ceph_clock_now();
6301 map_lock.get_read();
6302 mon_report_lock.Lock();
6304 mon_report_lock.Unlock();
6305 map_lock.put_read();
6308 pg_stat_queue_lock.Lock();
6309 uint64_t tid = pg_stat_tid;
6310 dout(10) << "flush_pg_stats waiting for stats tid " << tid << " to flush" << dendl;
6311 while (tid > pg_stat_tid_flushed)
6312 pg_stat_queue_cond.Wait(pg_stat_queue_lock);
6313 dout(10) << "flush_pg_stats finished waiting for stats tid " << tid << " to flush" << dendl;
6314 pg_stat_queue_lock.Unlock();
// Periodic MOSDBeacon to the mon, reporting min_last_epoch_clean and the
// PGs backing it.  Only sent once the monmap is known to require luminous;
// a zero-epoch (uninitialized) monmap suppresses the beacon.
6319 void OSD::send_beacon(const ceph::coarse_mono_clock::time_point& now)
6321 const auto& monmap = monc->monmap;
6322 // send beacon to mon even if we are just connected, and the monmap is not
6323 // initialized yet by then.
6324 if (monmap.epoch > 0 &&
6325 monmap.get_required_features().contains_all(
6326 ceph::features::mon::FEATURE_LUMINOUS)) {
6327 dout(20) << __func__ << " sending" << dendl;
6328 MOSDBeacon* beacon = nullptr;
// build the beacon under min_last_epoch_clean_lock, send outside it
6330 Mutex::Locker l{min_last_epoch_clean_lock};
6331 beacon = new MOSDBeacon(osdmap->get_epoch(), min_last_epoch_clean);
6332 std::swap(beacon->pgs, min_last_epoch_clean_pgs);
6333 last_sent_beacon = now;
6335 monc->send_mon_message(beacon);
6337 dout(20) << __func__ << " not sending" << dendl;
// Mon-originated command: verify the sender really is a mon, then queue the
// command for asynchronous execution on the command work queue (no reply
// connection — NULL con).
6341 void OSD::handle_command(MMonCommand *m)
6343 if (!require_mon_peer(m)) {
6348 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), NULL);
6349 command_wq.queue(c);
// Client-originated command (MCommand): requires a session with allow-all
// caps and must not come from a mon; otherwise reply -EPERM.  Accepted
// commands are queued on the command work queue with the reply connection.
6353 void OSD::handle_command(MCommand *m)
6355 ConnectionRef con = m->get_connection();
6356 Session *session = static_cast<Session *>(con->get_priv());
// no session attached to this connection -> reject
6358 con->send_message(new MCommandReply(m, -EPERM));
6363 OSDCap& caps = session->caps;
6366 if (!caps.allow_all() || m->get_source().is_mon()) {
6367 con->send_message(new MCommandReply(m, -EPERM));
6372 Command *c = new Command(m->cmd, m->get_tid(), m->get_data(), con.get());
6373 command_wq.queue(c);
// Table of commands this OSD accepts (via mon/MCommand -> do_command).
// Each COMMAND() row supplies: parse signature (cmdstring), help text,
// module, required permission, and availability ("cli", "rest", or both).
// NOTE(review): the struct OSDCommand's earlier fields are in lines dropped
// from this listing; only the trailing 'availability' member is visible.
6383 string availability;
6384 } osd_commands[] = {
6386 #define COMMAND(parsesig, helptext, module, perm, availability) \
6387 {parsesig, helptext, module, perm, availability},
6389 // yes, these are really pg commands, but there's a limit to how
6390 // much work it's worth. The OSD returns all of them. Make this
6391 // form (pg <pgid> <cmd>) valid only for the cli.
6392 // Rest uses "tell <pgid> <cmd>"
6395 "name=pgid,type=CephPgid " \
6396 "name=cmd,type=CephChoices,strings=query", \
6397 "show details of a specific pg", "osd", "r", "cli")
6399 "name=pgid,type=CephPgid " \
6400 "name=cmd,type=CephChoices,strings=mark_unfound_lost " \
6401 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6402 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6405 "name=pgid,type=CephPgid " \
6406 "name=cmd,type=CephChoices,strings=list_missing " \
6407 "name=offset,type=CephString,req=false",
6408 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6411 // new form: tell <pgid> <cmd> for both cli and rest
6414 "show details of a specific pg", "osd", "r", "cli,rest")
6415 COMMAND("mark_unfound_lost " \
6416 "name=mulcmd,type=CephChoices,strings=revert|delete", \
6417 "mark all unfound objects in this pg as lost, either removing or reverting to a prior version if one is available",
6418 "osd", "rw", "cli,rest")
6419 COMMAND("list_missing " \
6420 "name=offset,type=CephString,req=false",
6421 "list missing objects on this pg, perhaps starting at an offset given in JSON",
6422 "osd", "r", "cli,rest")
6423 COMMAND("perf histogram dump "
6424 "name=logger,type=CephString,req=false "
6425 "name=counter,type=CephString,req=false",
6426 "Get histogram data",
6427 "osd", "r", "cli,rest")
6429 // tell <osd.n> commands. Validation of osd.n must be special-cased in client
6430 COMMAND("version", "report version of OSD", "osd", "r", "cli,rest")
6431 COMMAND("get_command_descriptions", "list commands descriptions", "osd", "r", "cli,rest")
6432 COMMAND("injectargs " \
6433 "name=injected_args,type=CephString,n=N",
6434 "inject configuration arguments into running OSD",
6435 "osd", "rw", "cli,rest")
6436 COMMAND("config set " \
6437 "name=key,type=CephString name=value,type=CephString",
6438 "Set a configuration option at runtime (not persistent)",
6439 "osd", "rw", "cli,rest")
6440 COMMAND("cluster_log " \
6441 "name=level,type=CephChoices,strings=error,warning,info,debug " \
6442 "name=message,type=CephString,n=N",
6443 "log a message to the cluster log",
6444 "osd", "rw", "cli,rest")
6446 "name=count,type=CephInt,req=false " \
6447 "name=size,type=CephInt,req=false " \
6448 "name=object_size,type=CephInt,req=false " \
6449 "name=object_num,type=CephInt,req=false ", \
6450 "OSD benchmark: write <count> <size>-byte objects, " \
6451 "(default 1G size 4MB). Results in log.",
6452 "osd", "rw", "cli,rest")
6453 COMMAND("flush_pg_stats", "flush pg stats", "osd", "rw", "cli,rest")
6455 "name=heapcmd,type=CephChoices,strings=dump|start_profiler|stop_profiler|release|stats", \
6456 "show heap usage info (available only if compiled with tcmalloc)", \
6457 "osd", "rw", "cli,rest")
6458 COMMAND("debug dump_missing " \
6459 "name=filename,type=CephFilepath",
6460 "dump missing objects to a named file", "osd", "r", "cli,rest")
6461 COMMAND("debug kick_recovery_wq " \
6462 "name=delay,type=CephInt,range=0",
6463 "set osd_recovery_delay_start to <val>", "osd", "rw", "cli,rest")
6464 COMMAND("cpu_profiler " \
6465 "name=arg,type=CephChoices,strings=status|flush",
6466 "run cpu profiling on daemon", "osd", "rw", "cli,rest")
6467 COMMAND("dump_pg_recovery_stats", "dump pg recovery statistics",
6468 "osd", "r", "cli,rest")
6469 COMMAND("reset_pg_recovery_stats", "reset pg recovery statistics",
6470 "osd", "rw", "cli,rest")
6472 "compact object store's omap. "
6473 "WARNING: Compaction probably slows your requests",
6474 "osd", "rw", "cli,rest")
// Execute a single queued admin command (see osd_commands) and send an
// MCommandReply on 'con'.  Parses cmd via cmdmap_from_json, dispatches on
// the "prefix" value, collects human output in ss/rs and payload in
// ds/odata.  Runs on the command work queue thread.
6477 void OSD::do_command(Connection *con, ceph_tid_t tid, vector<string>& cmd, bufferlist& data)
6480 stringstream ss, ds;
6484 dout(20) << "do_command tid " << tid << " " << cmd << dendl;
6486 map<string, cmd_vartype> cmdmap;
6490 boost::scoped_ptr<Formatter> f;
6493 ss << "no command given";
6497 if (!cmdmap_from_json(cmd, &cmdmap, ss)) {
6502 cmd_getval(cct, cmdmap, "prefix", prefix);
// self-describe: dump every entry of osd_commands as JSON
6504 if (prefix == "get_command_descriptions") {
6506 JSONFormatter *f = new JSONFormatter();
6507 f->open_object_section("command_descriptions");
6508 for (OSDCommand *cp = osd_commands;
6509 cp < &osd_commands[ARRAY_SIZE(osd_commands)]; cp++) {
6511 ostringstream secname;
6512 secname << "cmd" << setfill('0') << std::setw(3) << cmdnum;
6513 dump_cmddesc_to_json(f, secname.str(), cp->cmdstring, cp->helpstring,
6514 cp->module, cp->perm, cp->availability, 0);
6517 f->close_section(); // command_descriptions
6524 cmd_getval(cct, cmdmap, "format", format);
6525 f.reset(Formatter::create(format));
6527 if (prefix == "version") {
6529 f->open_object_section("version");
6530 f->dump_string("version", pretty_version_to_str());
6534 ds << pretty_version_to_str();
// join the n=N args into one space-separated string before injecting
6538 else if (prefix == "injectargs") {
6539 vector<string> argsvec;
6540 cmd_getval(cct, cmdmap, "injected_args", argsvec);
6542 if (argsvec.empty()) {
6544 ss << "ignoring empty injectargs";
6547 string args = argsvec.front();
6548 for (vector<string>::iterator a = ++argsvec.begin(); a != argsvec.end(); ++a)
6551 r = cct->_conf->injectargs(args, &ss);
6554 else if (prefix == "config set") {
6557 cmd_getval(cct, cmdmap, "key", key);
6558 cmd_getval(cct, cmdmap, "value", val);
6560 r = cct->_conf->set_val(key, val, true, &ss);
6562 cct->_conf->apply_changes(nullptr);
6566 else if (prefix == "cluster_log") {
6568 cmd_getval(cct, cmdmap, "message", msg);
6571 ss << "ignoring empty log message";
6574 string message = msg.front();
6575 for (vector<string>::iterator a = ++msg.begin(); a != msg.end(); ++a)
6576 message += " " + *a;
6578 cmd_getval(cct, cmdmap, "level", lvl);
6579 clog_type level = string_to_clog_type(lvl);
6582 ss << "unknown level '" << lvl << "'";
6585 clog->do_log(level, message);
6588 // either 'pg <pgid> <command>' or
6589 // 'tell <pgid>' (which comes in without any of that prefix)?
6591 else if (prefix == "pg" ||
6592 prefix == "query" ||
6593 prefix == "mark_unfound_lost" ||
6594 prefix == "list_missing"
6598 if (!cmd_getval(cct, cmdmap, "pgid", pgidstr)) {
6599 ss << "no pgid specified";
6601 } else if (!pgid.parse(pgidstr.c_str())) {
6602 ss << "couldn't parse pgid '" << pgidstr << "'";
6607 if (osdmap->get_primary_shard(pgid, &pcand) &&
6608 (pg = _lookup_lock_pg(pcand))) {
6609 if (pg->is_primary()) {
6610 // simulate pg <pgid> cmd= for pg->do-command
6612 cmd_putval(cct, cmdmap, "cmd", prefix);
6613 r = pg->do_command(cmdmap, ss, data, odata, con, tid);
6616 // don't reply, pg will do so async
6620 ss << "not primary for pgid " << pgid;
6622 // send them the latest diff to ensure they realize the mapping
6624 service.send_incremental_map(osdmap->get_epoch() - 1, con, osdmap);
6626 // do not reply; they will get newer maps and realize they
6633 ss << "i don't have pgid " << pgid;
// synchronous objectstore write benchmark, with sanity caps on block
// size and total count so the OSD doesn't stall past its timeouts
6639 else if (prefix == "bench") {
6642 int64_t osize, onum;
6643 // default count 1G, size 4MB
6644 cmd_getval(cct, cmdmap, "count", count, (int64_t)1 << 30);
6645 cmd_getval(cct, cmdmap, "size", bsize, (int64_t)4 << 20);
6646 cmd_getval(cct, cmdmap, "object_size", osize, (int64_t)0);
6647 cmd_getval(cct, cmdmap, "object_num", onum, (int64_t)0);
6649 ceph::shared_ptr<ObjectStore::Sequencer> osr (std::make_shared<
6650 ObjectStore::Sequencer>("bench"));
6652 uint32_t duration = cct->_conf->osd_bench_duration;
6654 if (bsize > (int64_t) cct->_conf->osd_bench_max_block_size) {
6655 // let us limit the block size because the next checks rely on it
6656 // having a sane value. If we allow any block size to be set things
6657 // can still go sideways.
6658 ss << "block 'size' values are capped at "
6659 << prettybyte_t(cct->_conf->osd_bench_max_block_size) << ". If you wish to use"
6660 << " a higher value, please adjust 'osd_bench_max_block_size'";
6663 } else if (bsize < (int64_t) (1 << 20)) {
6664 // entering the realm of small block sizes.
6665 // limit the count to a sane value, assuming a configurable amount of
6666 // IOPS and duration, so that the OSD doesn't get hung up on this,
6667 // preventing timeouts from going off
6669 bsize * duration * cct->_conf->osd_bench_small_size_max_iops;
6670 if (count > max_count) {
6671 ss << "'count' values greater than " << max_count
6672 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6673 << cct->_conf->osd_bench_small_size_max_iops << " IOPS,"
6674 << " for " << duration << " seconds,"
6675 << " can cause ill effects on osd. "
6676 << " Please adjust 'osd_bench_small_size_max_iops' with a higher"
6677 << " value if you wish to use a higher 'count'.";
6682 // 1MB block sizes are big enough so that we get more stuff done.
6683 // However, to avoid the osd from getting hung on this and having
6684 // timers being triggered, we are going to limit the count assuming
6685 // a configurable throughput and duration.
6686 // NOTE: max_count is the total amount of bytes that we believe we
6687 // will be able to write during 'duration' for the given
6688 // throughput. The block size hardly impacts this unless it's
6689 // way too big. Given we already check how big the block size
6690 // is, it's safe to assume everything will check out.
6692 cct->_conf->osd_bench_large_size_max_throughput * duration;
6693 if (count > max_count) {
6694 ss << "'count' values greater than " << max_count
6695 << " for a block size of " << prettybyte_t(bsize) << ", assuming "
6696 << prettybyte_t(cct->_conf->osd_bench_large_size_max_throughput) << "/s,"
6697 << " for " << duration << " seconds,"
6698 << " can cause ill effects on osd. "
6699 << " Please adjust 'osd_bench_large_size_max_throughput'"
6700 << " with a higher value if you wish to use a higher 'count'.";
6706 if (osize && bsize > osize)
6709 dout(1) << " bench count " << count
6710 << " bsize " << prettybyte_t(bsize) << dendl;
6712 ObjectStore::Transaction cleanupt;
// optional pre-population: create onum objects of osize bytes so the
// timed loop below can overwrite existing objects at random offsets
6714 if (osize && onum) {
6716 bufferptr bp(osize);
6718 bl.push_back(std::move(bp));
6719 bl.rebuild_page_aligned();
6720 for (int i=0; i<onum; ++i) {
6722 snprintf(nm, sizeof(nm), "disk_bw_test_%d", i);
6724 hobject_t soid(sobject_t(oid, 0));
6725 ObjectStore::Transaction t;
6726 t.write(coll_t(), ghobject_t(soid), 0, osize, bl);
6727 store->queue_transaction(osr.get(), std::move(t), NULL);
6728 cleanupt.remove(coll_t(), ghobject_t(soid));
6733 bufferptr bp(bsize);
6735 bl.push_back(std::move(bp));
6736 bl.rebuild_page_aligned();
6740 if (!osr->flush_commit(&waiter)) {
// timed write loop: either random offsets into the prepared objects,
// or one fresh object per position (cleaned up afterwards)
6745 utime_t start = ceph_clock_now();
6746 for (int64_t pos = 0; pos < count; pos += bsize) {
6748 unsigned offset = 0;
6749 if (onum && osize) {
6750 snprintf(nm, sizeof(nm), "disk_bw_test_%d", (int)(rand() % onum));
6751 offset = rand() % (osize / bsize) * bsize;
6753 snprintf(nm, sizeof(nm), "disk_bw_test_%lld", (long long)pos);
6756 hobject_t soid(sobject_t(oid, 0));
6757 ObjectStore::Transaction t;
6758 t.write(coll_t::meta(), ghobject_t(soid), offset, bsize, bl);
6759 store->queue_transaction(osr.get(), std::move(t), NULL);
6760 if (!onum || !osize)
6761 cleanupt.remove(coll_t::meta(), ghobject_t(soid));
6766 if (!osr->flush_commit(&waiter)) {
6770 utime_t end = ceph_clock_now();
// remove everything we wrote, then report bytes/sec
6773 store->queue_transaction(osr.get(), std::move(cleanupt), NULL);
6776 if (!osr->flush_commit(&waiter)) {
6781 uint64_t rate = (double)count / (end - start);
6783 f->open_object_section("osd_bench_results");
6784 f->dump_int("bytes_written", count);
6785 f->dump_int("blocksize", bsize);
6786 f->dump_unsigned("bytes_per_sec", rate);
6790 ss << "bench: wrote " << prettybyte_t(count)
6791 << " in blocks of " << prettybyte_t(bsize) << " in "
6792 << (end-start) << " sec at " << prettybyte_t(rate) << "/sec";
// luminous+: stats go via the mgr client instead of the mon
6796 else if (prefix == "flush_pg_stats") {
6797 if (osdmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
6798 mgrc.send_pgstats();
6799 ds << service.get_osd_stat_seq() << "\n";
6805 else if (prefix == "heap") {
6806 r = ceph::osd_cmds::heap(*cct, cmdmap, *f, ds);
// write every PG's missing-set (and known locations) to a local file
6809 else if (prefix == "debug dump_missing") {
6811 cmd_getval(cct, cmdmap, "filename", file_name);
6812 std::ofstream fout(file_name.c_str());
6813 if (!fout.is_open()) {
6814 ss << "failed to open file '" << file_name << "'";
6819 fout << "*** osd " << whoami << ": dump_missing ***" << std::endl;
6820 RWLock::RLocker l(pg_map_lock);
6821 for (ceph::unordered_map<spg_t, PG*>::const_iterator pg_map_e = pg_map.begin();
6822 pg_map_e != pg_map.end(); ++pg_map_e) {
6823 PG *pg = pg_map_e->second;
6826 fout << *pg << std::endl;
6827 std::map<hobject_t, pg_missing_item>::const_iterator mend =
6828 pg->pg_log.get_missing().get_items().end();
6829 std::map<hobject_t, pg_missing_item>::const_iterator mi =
6830 pg->pg_log.get_missing().get_items().begin();
6831 for (; mi != mend; ++mi) {
6832 fout << mi->first << " -> " << mi->second << std::endl;
6833 if (!pg->missing_loc.needs_recovery(mi->first))
6835 if (pg->missing_loc.is_unfound(mi->first))
6836 fout << " unfound ";
6837 const set<pg_shard_t> &mls(pg->missing_loc.get_locations(mi->first));
6840 fout << "missing_loc: " << mls << std::endl;
6848 else if (prefix == "debug kick_recovery_wq") {
6850 cmd_getval(cct, cmdmap, "delay", delay);
6853 r = cct->_conf->set_val("osd_recovery_delay_start", oss.str().c_str());
6855 ss << "kick_recovery_wq: error setting "
6856 << "osd_recovery_delay_start to '" << delay << "': error "
6860 cct->_conf->apply_changes(NULL);
6861 ss << "kicking recovery queue. set osd_recovery_delay_start "
6862 << "to " << cct->_conf->osd_recovery_delay_start;
6865 else if (prefix == "cpu_profiler") {
6867 cmd_getval(cct, cmdmap, "arg", arg);
6868 vector<string> argvec;
6869 get_str_vec(arg, argvec);
6870 cpu_profiler_handle_command(argvec, ds);
6873 else if (prefix == "dump_pg_recovery_stats") {
6876 pg_recovery_stats.dump_formatted(f.get());
6879 pg_recovery_stats.dump(s);
6880 ds << "dump pg recovery stats: " << s.str();
6884 else if (prefix == "reset_pg_recovery_stats") {
6885 ss << "reset pg recovery stats";
6886 pg_recovery_stats.reset();
6889 else if (prefix == "perf histogram dump") {
6891 std::string counter;
6892 cmd_getval(cct, cmdmap, "logger", logger);
6893 cmd_getval(cct, cmdmap, "counter", counter);
6895 cct->get_perfcounters_collection()->dump_formatted_histograms(
6896 f.get(), false, logger, counter);
6901 else if (prefix == "compact") {
6902 dout(1) << "triggering manual compaction" << dendl;
6903 auto start = ceph::coarse_mono_clock::now();
6905 auto end = ceph::coarse_mono_clock::now();
6906 auto time_span = chrono::duration_cast<chrono::duration<double>>(end - start);
6907 dout(1) << "finished manual compaction in "
6908 << time_span.count()
6909 << " seconds" << dendl;
6910 ss << "compacted omap in " << time_span.count() << " seconds";
6914 ss << "unrecognized command! " << cmd;
6921 dout(0) << "do_command r=" << r << " " << rs << dendl;
// single reply path: result code, human-readable status, and payload
6924 MCommandReply *reply = new MCommandReply(r, rs);
6925 reply->set_tid(tid);
6926 reply->set_data(odata);
6927 con->send_message(reply);
// Dispatch entry for the heartbeat messengers: handles MOSDPing (and, per
// the first case, a plain ping); anything else is logged and dropped.
6931 bool OSD::heartbeat_dispatch(Message *m)
6933 dout(30) << "heartbeat_dispatch " << m << dendl;
6934 switch (m->get_type()) {
6937 dout(10) << "ping from " << m->get_source_inst() << dendl;
6942 handle_osd_ping(static_cast<MOSDPing*>(m));
6946 dout(0) << "dropping unexpected message " << *m << " from " << m->get_source_inst() << dendl;
// Slow-path (non-fast) dispatch.  MOSDMarkMeDown is acknowledged to the
// stop machinery first; messages arriving while stopping are handled by the
// is_stopping() branch (body in lines dropped from this listing).
6953 bool OSD::ms_dispatch(Message *m)
6955 dout(20) << "OSD::ms_dispatch: " << *m << dendl;
6956 if (m->get_type() == MSG_OSD_MARK_ME_DOWN) {
6957 service.got_stop_ack();
6965 if (is_stopping()) {
// If this op was flagged check_send_map, share any newer osdmap epochs with
// the peer.  last_sent_epoch is read and (monotonically) updated per
// session under sent_epoch_lock so concurrent sharers never move it back.
6979 void OSD::maybe_share_map(
6984 if (!op->check_send_map) {
6987 epoch_t last_sent_epoch = 0;
6989 session->sent_epoch_lock.lock();
6990 last_sent_epoch = session->last_sent_epoch;
6991 session->sent_epoch_lock.unlock();
6993 const Message *m = op->get_req();
6996 m->get_connection().get(),
6999 session ? &last_sent_epoch : NULL);
7001 session->sent_epoch_lock.lock();
7002 if (session->last_sent_epoch < last_sent_epoch) {
7003 session->last_sent_epoch = last_sent_epoch;
7005 session->sent_epoch_lock.unlock();
// mark handled so we don't share again for this op
7007 op->check_send_map = false;
// Drain a session's waiting_on_map queue against the given osdmap: ops
// whose min_epoch is now covered are mapped to an spg_t (legacy MOSDOp via
// raw_pg_to_pg + primary shard) and enqueued; ops needing a newer map stay
// queued and the session is (re)registered as waiting on a map.
7010 void OSD::dispatch_session_waiting(Session *session, OSDMapRef osdmap)
7012 assert(session->session_dispatch_lock.is_locked());
7014 auto i = session->waiting_on_map.begin();
7015 while (i != session->waiting_on_map.end()) {
7016 OpRequestRef op = &(*i);
7017 assert(ms_can_fast_dispatch(op->get_req()));
7018 const MOSDFastDispatchOp *m = static_cast<const MOSDFastDispatchOp*>(
// op requires a newer map than we have: stop draining here so
// delivery order within the session is preserved
7020 if (m->get_min_epoch() > osdmap->get_epoch()) {
7023 session->waiting_on_map.erase(i++);
7027 if (m->get_type() == CEPH_MSG_OSD_OP) {
7028 pg_t actual_pgid = osdmap->raw_pg_to_pg(
7029 static_cast<const MOSDOp*>(m)->get_pg());
7030 if (!osdmap->get_primary_shard(actual_pgid, &pgid)) {
7034 pgid = m->get_spg();
7036 enqueue_op(pgid, op, m->get_map_epoch());
7039 if (session->waiting_on_map.empty()) {
7040 clear_session_waiting_on_map(session);
7042 register_session_waiting_on_map(session);
// Fast (lockless) dispatch path. Wraps the message in an OpRequest,
// records sender/min epochs, and either enqueues directly (clients that
// resend on split, or non-MOSDOp messages already carrying an spg_t) or
// routes through the per-session queue to preserve ordering for legacy
// clients whose MOSDOp lacks an explicit spg_t.
7046 void OSD::ms_fast_dispatch(Message *m)
7049 if (service.is_stopping()) {
7053 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7056 osd_reqid_t reqid = op->get_reqid();
7058 tracepoint(osd, ms_fast_dispatch, reqid.name._type,
7059 reqid.name._num, reqid.tid, reqid.inc);
7063 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7065 // note sender epoch, min req'd epoch
7066 op->sent_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch();
7067 op->min_epoch = static_cast<MOSDFastDispatchOp*>(m)->get_min_epoch();
7068 assert(op->min_epoch <= op->sent_epoch); // sanity check!
7070 service.maybe_inject_dispatch_delay();
7072 if (m->get_connection()->has_features(CEPH_FEATUREMASK_RESEND_ON_SPLIT) ||
7073 m->get_type() != CEPH_MSG_OSD_OP) {
7074 // queue it directly
7076 static_cast<MOSDFastDispatchOp*>(m)->get_spg(),
7078 static_cast<MOSDFastDispatchOp*>(m)->get_map_epoch());
7080 // legacy client, and this is an MOSDOp (the *only* fast dispatch
7081 // message that didn't have an explicit spg_t); we need to map
7082 // them to an spg_t while preserving delivery order.
7083 Session *session = static_cast<Session*>(m->get_connection()->get_priv());
7086 Mutex::Locker l(session->session_dispatch_lock);
7088 session->waiting_on_map.push_back(*op);
  // dispatch against a reserved "next" map so splits are seen atomically
7089 OSDMapRef nextmap = service.get_nextmap_reserved();
7090 dispatch_session_waiting(session, nextmap);
7091 service.release_map(nextmap);
7096 OID_EVENT_TRACE_WITH_MSG(m, "MS_FAST_DISPATCH_END", false);
// Pre-dispatch peek: when a peer OSD sends us an OSDMap, remember the
// newest epoch it advertised on that session (under received_map_lock)
// so later logic knows what the peer has already seen.
7099 void OSD::ms_fast_preprocess(Message *m)
7101 if (m->get_connection()->get_peer_type() == CEPH_ENTITY_TYPE_OSD) {
7102 if (m->get_type() == CEPH_MSG_OSD_MAP) {
7103 MOSDMap *mm = static_cast<MOSDMap*>(m);
7104 Session *s = static_cast<Session*>(m->get_connection()->get_priv());
7106 s->received_map_lock.lock();
7107 s->received_map_epoch = mm->get_last();
7108 s->received_map_lock.unlock();
// Build an authorizer for an outgoing connection to dest_type.
// Bails early when shutting down or when the destination is a monitor
// (the MonClient handles its own auth); waits for rotating keys first.
7115 bool OSD::ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new)
7117 dout(10) << "OSD::ms_get_authorizer type=" << ceph_entity_type_name(dest_type) << dendl;
7119 if (is_stopping()) {
7120 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7124 if (dest_type == CEPH_ENTITY_TYPE_MON)
7128 /* the MonClient checks keys every tick(), so we should just wait for that cycle
7130 if (monc->wait_auth_rotating(10) < 0) {
7131 derr << "OSD::ms_get_authorizer wait_auth_rotating failed" << dendl;
7136 *authorizer = monc->build_authorizer(dest_type);
7137 return *authorizer != NULL;
// Verify an incoming connection's authorizer. Cluster-internal peers
// (MDS/OSD/MGR) use the cluster handler registry, everyone else the
// service registry. On success a Session is attached to the connection
// and its entity name and caps are populated from the auth result.
7141 bool OSD::ms_verify_authorizer(Connection *con, int peer_type,
7142 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
7143 bool& isvalid, CryptoKey& session_key)
7145 AuthAuthorizeHandler *authorize_handler = 0;
7146 switch (peer_type) {
7147 case CEPH_ENTITY_TYPE_MDS:
7149 * note: mds is technically a client from our perspective, but
7150 * this makes the 'cluster' consistent w/ monitor's usage.
7152 case CEPH_ENTITY_TYPE_OSD:
7153 case CEPH_ENTITY_TYPE_MGR:
7154 authorize_handler = authorize_handler_cluster_registry->get_handler(protocol);
7157 authorize_handler = authorize_handler_service_registry->get_handler(protocol);
7159 if (!authorize_handler) {
7160 dout(0) << "No AuthAuthorizeHandler found for protocol " << protocol << dendl;
7165 AuthCapsInfo caps_info;
7168 uint64_t auid = CEPH_AUTH_UID_DEFAULT;
  // rotating service keys are required to verify; deny until we have them
7170 RotatingKeyRing *keys = monc->rotating_secrets.get();
7172 isvalid = authorize_handler->verify_authorizer(
7174 authorizer_data, authorizer_reply, name, global_id, caps_info, session_key,
7177 dout(10) << __func__ << " no rotating_keys (yet), denied" << dendl;
  // attach (or create) the Session holding this peer's identity and caps
7182 Session *s = static_cast<Session *>(con->get_priv());
7184 s = new Session(cct);
7185 con->set_priv(s->get());
7187 dout(10) << " new session " << s << " con=" << s->con << " addr=" << s->con->get_peer_addr() << dendl;
7190 s->entity_name = name;
7191 if (caps_info.allow_all)
7192 s->caps.set_allow_all();
7195 if (caps_info.caps.length() > 0) {
7196 bufferlist::iterator p = caps_info.caps.begin();
7201 catch (buffer::error& e) {
7203 bool success = s->caps.parse(str);
7205 dout(10) << " session " << s << " " << s->entity_name << " has caps " << s->caps << " '" << str << "'" << dendl;
7207 dout(10) << " session " << s << " " << s->entity_name << " failed to parse caps '" << str << "'" << dendl;
// Re-dispatch ops that were previously deferred (e.g. while waiting for
// an OSDMap). Requires osd_lock; drains the 'finished' queue in order.
7215 void OSD::do_waiters()
7217 assert(osd_lock.is_locked());
7219 dout(10) << "do_waiters -- start" << dendl;
7220 while (!finished.empty()) {
7221 OpRequestRef next = finished.front();
7222 finished.pop_front();
7225 dout(10) << "do_waiters -- finish" << dendl;
// Route a slow-path PG-related op to its type-specific handler.
7228 void OSD::dispatch_op(OpRequestRef op)
7230 switch (op->get_req()->get_type()) {
7232 case MSG_OSD_PG_CREATE:
7233 handle_pg_create(op);
7235 case MSG_OSD_PG_NOTIFY:
7236 handle_pg_notify(op);
7238 case MSG_OSD_PG_QUERY:
7239 handle_pg_query(op);
7241 case MSG_OSD_PG_LOG:
7244 case MSG_OSD_PG_REMOVE:
7245 handle_pg_remove(op);
7247 case MSG_OSD_PG_INFO:
7250 case MSG_OSD_PG_TRIM:
7253 case MSG_OSD_BACKFILL_RESERVE:
7254 handle_pg_backfill_reserve(op);
7256 case MSG_OSD_RECOVERY_RESERVE:
7257 handle_pg_recovery_reserve(op);
// Slow-path dispatch under osd_lock. Messages that need no OSDMap
// (osdmap updates, pg-stats acks, commands, scrub, force-recovery) are
// handled inline; PG messages are wrapped as OpRequests and deferred to
// waiting_for_osdmap when we have no map yet.
7262 void OSD::_dispatch(Message *m)
7264 assert(osd_lock.is_locked());
7265 dout(20) << "_dispatch " << m << " " << *m << dendl;
7267 switch (m->get_type()) {
7269 // -- don't need lock --
7271 dout(10) << "ping from " << m->get_source() << dendl;
7275 // -- don't need OSDMap --
7277 // map and replication
7278 case CEPH_MSG_OSD_MAP:
7279 handle_osd_map(static_cast<MOSDMap*>(m));
7283 case MSG_PGSTATSACK:
7284 handle_pg_stats_ack(static_cast<MPGStatsAck*>(m));
7287 case MSG_MON_COMMAND:
7288 handle_command(static_cast<MMonCommand*>(m));
7291 handle_command(static_cast<MCommand*>(m));
7295 handle_scrub(static_cast<MOSDScrub*>(m));
7298 case MSG_OSD_FORCE_RECOVERY:
7299 handle_force_recovery(m);
7302 // -- need OSDMap --
7304 case MSG_OSD_PG_CREATE:
7305 case MSG_OSD_PG_NOTIFY:
7306 case MSG_OSD_PG_QUERY:
7307 case MSG_OSD_PG_LOG:
7308 case MSG_OSD_PG_REMOVE:
7309 case MSG_OSD_PG_INFO:
7310 case MSG_OSD_PG_TRIM:
7311 case MSG_OSD_BACKFILL_RESERVE:
7312 case MSG_OSD_RECOVERY_RESERVE:
7314 OpRequestRef op = op_tracker.create_request<OpRequest, Message*>(m);
7316 op->osd_trace.init("osd op", &trace_endpoint, &m->trace);
7317 // no map? starting up?
7319 dout(7) << "no OSDMap, not booted" << dendl;
7320 logger->inc(l_osd_waiting_for_map);
7321 waiting_for_osdmap.push_back(op);
7322 op->mark_delayed("no osdmap");
// Mark a single PG for (deep/repair) scrub per the MOSDScrub request.
// Only acts if we are the PG's primary; re-registers the next scrub so
// the scheduler picks up the 'must' flags.
7332 void OSD::handle_pg_scrub(MOSDScrub *m, PG *pg)
7335 if (pg->is_primary()) {
7336 pg->unreg_next_scrub();
7337 pg->scrubber.must_scrub = true;
7338 pg->scrubber.must_deep_scrub = m->deep || m->repair;
7339 pg->scrubber.must_repair = m->repair;
7340 pg->reg_next_scrub();
7341 dout(10) << "marking " << *pg << " for scrub" << dendl;
// Handle a scrub request from a mon/mgr: validate origin and fsid, then
// mark either every local PG (empty scrub_pgs) or only the listed ones.
// pg_map is read under pg_map_lock.
7346 void OSD::handle_scrub(MOSDScrub *m)
7348 dout(10) << "handle_scrub " << *m << dendl;
7349 if (!require_mon_or_mgr_peer(m)) {
7353 if (m->fsid != monc->get_fsid()) {
7354 dout(0) << "handle_scrub fsid " << m->fsid << " != " << monc->get_fsid() << dendl;
7359 RWLock::RLocker l(pg_map_lock);
7360 if (m->scrub_pgs.empty()) {
7361 for (ceph::unordered_map<spg_t, PG*>::iterator p = pg_map.begin();
7364 handle_pg_scrub(m, p->second);
7366 for (vector<pg_t>::iterator p = m->scrub_pgs.begin();
7367 p != m->scrub_pgs.end();
  // resolve the raw pg_t to the primary shard we might hold
7370 if (osdmap->get_primary_shard(*p, &pcand)) {
7371 auto pg_map_entry = pg_map.find(pcand);
7372 if (pg_map_entry != pg_map.end()) {
7373 handle_pg_scrub(m, pg_map_entry->second);
// Randomized backoff for scrub scheduling: with probability
// osd_scrub_backoff_ratio we skip this scheduling round.
7382 bool OSD::scrub_random_backoff()
7384 bool coin_flip = (rand() / (double)RAND_MAX >=
7385 cct->_conf->osd_scrub_backoff_ratio);
7387 dout(20) << "scrub_random_backoff lost coin flip, randomly backing off" << dendl;
// Build a scrub-schedule entry for a PG. Pool-level min/max intervals
// override the global conf when positive. Unless the scrub was
// explicitly requested ('must'), sched_time is pushed out by the min
// interval plus a random fraction (randomize_ratio) to spread load, and
// deadline by the max interval.
7393 OSDService::ScrubJob::ScrubJob(CephContext* cct,
7394 const spg_t& pg, const utime_t& timestamp,
7395 double pool_scrub_min_interval,
7396 double pool_scrub_max_interval, bool must)
7399 sched_time(timestamp),
7402 // if not explicitly requested, postpone the scrub with a random delay
7404 double scrub_min_interval = pool_scrub_min_interval > 0 ?
7405 pool_scrub_min_interval : cct->_conf->osd_scrub_min_interval;
7406 double scrub_max_interval = pool_scrub_max_interval > 0 ?
7407 pool_scrub_max_interval : cct->_conf->osd_scrub_max_interval;
7409 sched_time += scrub_min_interval;
7410 double r = rand() / (double)RAND_MAX;
7412 scrub_min_interval * cct->_conf->osd_scrub_interval_randomize_ratio * r;
7413 deadline += scrub_max_interval;
// Ordering for the scrub schedule: earlier sched_time first, pgid as
// tie-breaker. (The doubled ScrubJob::ScrubJob:: qualification is
// redundant but legal via the injected class name.)
7417 bool OSDService::ScrubJob::ScrubJob::operator<(const OSDService::ScrubJob& rhs) const {
7418 if (sched_time < rhs.sched_time)
7420 if (sched_time > rhs.sched_time)
7422 return pgid < rhs.pgid;
// True when the local hour-of-day falls inside the configured scrub
// window [osd_scrub_begin_hour, osd_scrub_end_hour). The second branch
// handles windows that wrap past midnight (begin >= end).
7425 bool OSD::scrub_time_permit(utime_t now)
7428 time_t tt = now.sec();
7429 localtime_r(&tt, &bdt);
7430 bool time_permit = false;
7431 if (cct->_conf->osd_scrub_begin_hour < cct->_conf->osd_scrub_end_hour) {
7432 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour && bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7436 if (bdt.tm_hour >= cct->_conf->osd_scrub_begin_hour || bdt.tm_hour < cct->_conf->osd_scrub_end_hour) {
7441 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7442 << " - " << cct->_conf->osd_scrub_end_hour
7443 << " now " << bdt.tm_hour << " = no" << dendl;
7445 dout(20) << __func__ << " should run between " << cct->_conf->osd_scrub_begin_hour
7446 << " - " << cct->_conf->osd_scrub_end_hour
7447 << " now " << bdt.tm_hour << " = yes" << dendl;
// Decide whether system load permits scrubbing: allow if the 1-minute
// loadavg is below osd_scrub_load_threshold, or below the daily average
// while trending down (1m < 15m). Fails closed if getloadavg() fails.
7452 bool OSD::scrub_load_below_threshold()
7455 if (getloadavg(loadavgs, 3) != 3) {
7456 dout(10) << __func__ << " couldn't read loadavgs\n" << dendl;
7460 // allow scrub if below configured threshold
7461 if (loadavgs[0] < cct->_conf->osd_scrub_load_threshold) {
7462 dout(20) << __func__ << " loadavg " << loadavgs[0]
7463 << " < max " << cct->_conf->osd_scrub_load_threshold
7464 << " = yes" << dendl;
7468 // allow scrub if below daily avg and currently decreasing
7469 if (loadavgs[0] < daily_loadavg && loadavgs[0] < loadavgs[2]) {
7470 dout(20) << __func__ << " loadavg " << loadavgs[0]
7471 << " < daily_loadavg " << daily_loadavg
7472 << " and < 15m avg " << loadavgs[2]
7473 << " = yes" << dendl;
7477 dout(20) << __func__ << " loadavg " << loadavgs[0]
7478 << " >= max " << cct->_conf->osd_scrub_load_threshold
7479 << " and ( >= daily_loadavg " << daily_loadavg
7480 << " or >= 15m avg " << loadavgs[2]
7481 << ") = no" << dendl;
// Walk the scrub schedule (ordered by sched_time) and kick off scrubs.
// Skips entirely when no scrub slots are available or recovery is active
// (unless configured otherwise). Per job: stop once sched_time is in the
// future; before the deadline, require both time window and low load.
7485 void OSD::sched_scrub()
7487 // if not permitted, fail fast
7488 if (!service.can_inc_scrubs_pending()) {
7491 if (!cct->_conf->osd_scrub_during_recovery && service.is_recovery_active()) {
7492 dout(20) << __func__ << " not scheduling scrubs due to active recovery" << dendl;
7497 utime_t now = ceph_clock_now();
7498 bool time_permit = scrub_time_permit(now);
7499 bool load_is_low = scrub_load_below_threshold();
7500 dout(20) << "sched_scrub load_is_low=" << (int)load_is_low << dendl;
7502 OSDService::ScrubJob scrub;
7503 if (service.first_scrub_stamp(&scrub)) {
7505 dout(30) << "sched_scrub examine " << scrub.pgid << " at " << scrub.sched_time << dendl;
7507 if (scrub.sched_time > now) {
7508 // save ourselves some effort
7509 dout(10) << "sched_scrub " << scrub.pgid << " scheduled at " << scrub.sched_time
7510 << " > " << now << dendl;
  // past the deadline we scrub regardless of time-of-day/load gating
7514 if ((scrub.deadline >= now) && !(time_permit && load_is_low)) {
7515 dout(10) << __func__ << " not scheduling scrub for " << scrub.pgid << " due to "
7516 << (!time_permit ? "time not permit" : "high load") << dendl;
7520 PG *pg = _lookup_lock_pg(scrub.pgid);
7523 if (pg->get_pgbackend()->scrub_supported() && pg->is_active()) {
7524 dout(10) << "sched_scrub scrubbing " << scrub.pgid << " at " << scrub.sched_time
7525 << (pg->scrubber.must_scrub ? ", explicitly requested" :
7526 (load_is_low ? ", load_is_low" : " deadline < now"))
7528 if (pg->sched_scrub()) {
7534 } while (service.next_scrub_stamp(scrub, &scrub));
7536 dout(20) << "sched_scrub done" << dendl;
7541 // =====================================================
// Park an op until a newer OSDMap arrives. Subscribes to the next epoch
// only when the wait queue was empty (one subscription per batch).
7544 void OSD::wait_for_new_map(OpRequestRef op)
7547 if (waiting_for_osdmap.empty()) {
7548 osdmap_subscribe(osdmap->get_epoch() + 1, false);
7551 logger->inc(l_osd_waiting_for_map);
7552 waiting_for_osdmap.push_back(op);
7553 op->mark_delayed("wait for new map");
7558 * assimilate new OSDMap(s). scan pgs, etc.
// React to a peer OSD going down: sever its cluster connection, drop any
// pending failure reports for it, and tear down its heartbeat
// connections/state under heartbeat_lock. Requires osd_lock.
7561 void OSD::note_down_osd(int peer)
7563 assert(osd_lock.is_locked());
7564 cluster_messenger->mark_down(osdmap->get_cluster_addr(peer));
7566 heartbeat_lock.Lock();
7567 failure_queue.erase(peer);
7568 failure_pending.erase(peer);
7569 map<int,HeartbeatInfo>::iterator p = heartbeat_peers.find(peer);
7570 if (p != heartbeat_peers.end()) {
7571 p->second.con_back->mark_down();
7572 if (p->second.con_front) {
7573 p->second.con_front->mark_down();
7575 heartbeat_peers.erase(p);
7577 heartbeat_lock.Unlock();
// React to a peer OSD coming (back) up: forget its stale epoch info and
// flag the heartbeat peer set for refresh.
7580 void OSD::note_up_osd(int peer)
7582 service.forget_peer_epoch(peer, osdmap->get_epoch() - 1);
7583 heartbeat_set_peers_need_update();
// Completion fired once newly-received OSDMaps are committed to disk;
// forwards to OSD::_committed_osd_maps with the epoch range and message.
7586 struct C_OnMapCommit : public Context {
7588 epoch_t first, last;
7590 C_OnMapCommit(OSD *o, epoch_t f, epoch_t l, MOSDMap *m)
7591 : osd(o), first(f), last(l), msg(m) {}
7592 void finish(int r) override {
7593 osd->_committed_osd_maps(first, last, msg);
// Completion fired when map transactions are applied: releases the
// bufferlist cache pins; the held OSDMapRefs (pinned_maps) keep the maps
// in cache until then.
7598 struct C_OnMapApply : public Context {
7599 OSDService *service;
7600 list<OSDMapRef> pinned_maps;
7602 C_OnMapApply(OSDService *service,
7603 const list<OSDMapRef> &pinned_maps,
7605 : service(service), pinned_maps(pinned_maps), e(e) {}
7606 void finish(int r) override {
7607 service->clear_map_bl_cache_pins(e);
// Subscribe to OSDMap updates starting at 'epoch'. Deduplicates against
// latest_subscribed_epoch unless force_request; protected by
// osdmap_subscribe_lock.
7611 void OSD::osdmap_subscribe(version_t epoch, bool force_request)
7613 Mutex::Locker l(osdmap_subscribe_lock);
7614 if (latest_subscribed_epoch >= epoch && !force_request)
7617 latest_subscribed_epoch = MAX(epoch, latest_subscribed_epoch);
7619 if (monc->sub_want_increment("osdmap", epoch, CEPH_SUBSCRIBE_ONETIME) ||
// Delete stored OSDMap epochs below min(oldest, lowest cached epoch),
// advancing superblock.oldest_map as we go. Work is batched into
// transactions of at most osd_target_transaction_size deletions (but at
// least nreceived, so trimming keeps pace with incoming maps).
7625 void OSD::trim_maps(epoch_t oldest, int nreceived, bool skip_maps)
7627 epoch_t min = std::min(oldest, service.map_cache.cached_key_lower_bound());
7628 if (min <= superblock.oldest_map)
7632 ObjectStore::Transaction t;
7633 for (epoch_t e = superblock.oldest_map; e < min; ++e) {
7634 dout(20) << " removing old osdmap epoch " << e << dendl;
7635 t.remove(coll_t::meta(), get_osdmap_pobject_name(e));
7636 t.remove(coll_t::meta(), get_inc_osdmap_pobject_name(e));
7637 superblock.oldest_map = e + 1;
7639 if (num >= cct->_conf->osd_target_transaction_size && num >= nreceived) {
7640 service.publish_superblock(superblock);
7641 write_superblock(t);
7642 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7646 // skip_maps leaves us with a range of old maps if we fail to remove all
7647 // of them before moving superblock.oldest_map forward to the first map
7648 // in the incoming MOSDMap msg. so we should continue removing them in
7649 // this case, even we could do huge series of delete transactions all at
7656 service.publish_superblock(superblock);
7657 write_superblock(t);
7658 int tr = store->queue_transaction(service.meta_osr.get(), std::move(t), nullptr);
7661 // we should not remove the cached maps
7662 assert(min <= service.map_cache.cached_key_lower_bound());
// Ingest an MOSDMap: validate its origin (fsid, mon/osd peer), decode
// full and incremental maps for each new epoch, verify CRCs (requesting
// full maps on mismatch), write them to the meta collection, trim old
// epochs, update the superblock, and queue the whole thing as one
// transaction whose completions fire C_OnMapApply / C_OnMapCommit.
// NOTE(review): elided listing — statements between the visible line
// numbers are not shown.
7665 void OSD::handle_osd_map(MOSDMap *m)
7667 assert(osd_lock.is_locked());
7668 // Keep a ref in the list until we get the newly received map written
7669 // onto disk. This is important because as long as the refs are alive,
7670 // the OSDMaps will be pinned in the cache and we won't try to read it
7671 // off of disk. Otherwise these maps will probably not stay in the cache,
7672 // and reading those OSDMaps before they are actually written can result
7674 list<OSDMapRef> pinned_maps;
7675 if (m->fsid != monc->get_fsid()) {
7676 dout(0) << "handle_osd_map fsid " << m->fsid << " != "
7677 << monc->get_fsid() << dendl;
7681 if (is_initializing()) {
7682 dout(0) << "ignoring osdmap until we have initialized" << dendl;
  // only accept maps from mon or osd peers
7687 Session *session = static_cast<Session *>(m->get_connection()->get_priv());
7688 if (session && !(session->entity_name.is_mon() ||
7689 session->entity_name.is_osd())) {
7691 dout(10) << "got osd map from Session " << session
7692 << " which we can't take maps from (not a mon or osd)" << dendl;
7700 // share with the objecter
7702 service.objecter->handle_osd_map(m);
7704 epoch_t first = m->get_first();
7705 epoch_t last = m->get_last();
7706 dout(3) << "handle_osd_map epochs [" << first << "," << last << "], i have "
7707 << superblock.newest_map
7708 << ", src has [" << m->oldest_map << "," << m->newest_map << "]"
7711 logger->inc(l_osd_map);
7712 logger->inc(l_osd_mape, last - first + 1);
7713 if (first <= superblock.newest_map)
7714 logger->inc(l_osd_mape_dup, superblock.newest_map - first + 1);
7715 if (service.max_oldest_map < m->oldest_map) {
7716 service.max_oldest_map = m->oldest_map;
7717 assert(service.max_oldest_map >= superblock.oldest_map);
7720 // make sure there is something new, here, before we bother flushing
7721 // the queues and such
7722 if (last <= superblock.newest_map) {
7723 dout(10) << " no new maps here, dropping" << dendl;
  // detect a gap between what we have and what the message starts at
7729 bool skip_maps = false;
7730 if (first > superblock.newest_map + 1) {
7731 dout(10) << "handle_osd_map message skips epochs "
7732 << superblock.newest_map + 1 << ".." << (first-1) << dendl;
7733 if (m->oldest_map <= superblock.newest_map + 1) {
7734 osdmap_subscribe(superblock.newest_map + 1, false);
7738 // always try to get the full range of maps--as many as we can. this
7739 // 1- is good to have
7740 // 2- is at present the only way to ensure that we get a *full* map as
7742 if (m->oldest_map < first) {
7743 osdmap_subscribe(m->oldest_map - 1, true);
7750 ObjectStore::Transaction t;
7751 uint64_t txn_size = 0;
7753 // store new maps: queue for disk and put in the osdmap cache
7754 epoch_t start = MAX(superblock.newest_map + 1, first);
7755 for (epoch_t e = start; e <= last; e++) {
  // guard against transaction byte-count wraparound
7756 if (txn_size >= t.get_num_bytes()) {
7757 derr << __func__ << " transaction size overflowed" << dendl;
7758 assert(txn_size < t.get_num_bytes());
7760 txn_size = t.get_num_bytes();
7761 map<epoch_t,bufferlist>::iterator p;
7762 p = m->maps.find(e);
7763 if (p != m->maps.end()) {
7764 dout(10) << "handle_osd_map got full map for epoch " << e << dendl;
7765 OSDMap *o = new OSDMap;
7766 bufferlist& bl = p->second;
7770 ghobject_t fulloid = get_osdmap_pobject_name(e);
7771 t.write(coll_t::meta(), fulloid, 0, bl.length(), bl);
7773 pinned_maps.push_back(add_map(o));
7779 p = m->incremental_maps.find(e);
7780 if (p != m->incremental_maps.end()) {
7781 dout(10) << "handle_osd_map got inc map for epoch " << e << dendl;
7782 bufferlist& bl = p->second;
7783 ghobject_t oid = get_inc_osdmap_pobject_name(e);
7784 t.write(coll_t::meta(), oid, 0, bl.length(), bl);
7785 pin_map_inc_bl(e, bl);
  // rebuild the full map for epoch e from e-1 plus the incremental
7787 OSDMap *o = new OSDMap;
7790 bool got = get_map_bl(e - 1, obl);
7795 OSDMap::Incremental inc;
7796 bufferlist::iterator p = bl.begin();
7798 if (o->apply_incremental(inc) < 0) {
7799 derr << "ERROR: bad fsid? i have " << osdmap->get_fsid() << " and inc has " << inc.fsid << dendl;
7800 assert(0 == "bad fsid");
7804 o->encode(fbl, inc.encode_features | CEPH_FEATURE_RESERVED);
  // optional fault injection to exercise the crc-mismatch path
7806 bool injected_failure = false;
7807 if (cct->_conf->osd_inject_bad_map_crc_probability > 0 &&
7808 (rand() % 10000) < cct->_conf->osd_inject_bad_map_crc_probability*10000.0) {
7809 derr << __func__ << " injecting map crc failure" << dendl;
7810 injected_failure = true;
  // crc mismatch: our re-encoded full map differs; ask for full maps
7813 if ((inc.have_crc && o->get_crc() != inc.full_crc) || injected_failure) {
7814 dout(2) << "got incremental " << e
7815 << " but failed to encode full with correct crc; requesting"
7817 clog->warn() << "failed to encode map e" << e << " with expected crc";
7818 dout(20) << "my encoded map was:\n";
7819 fbl.hexdump(*_dout);
7822 request_full_map(e, last);
7828 ghobject_t fulloid = get_osdmap_pobject_name(e);
7829 t.write(coll_t::meta(), fulloid, 0, fbl.length(), fbl);
7831 pinned_maps.push_back(add_map(o));
7835 assert(0 == "MOSDMap lied about what maps it had?");
7838 // even if this map isn't from a mon, we may have satisfied our subscription
7839 monc->sub_got("osdmap", last);
7841 if (!m->maps.empty() && requested_full_first) {
7842 dout(10) << __func__ << " still missing full maps " << requested_full_first
7843 << ".." << requested_full_last << dendl;
7844 rerequest_full_maps();
7847 if (superblock.oldest_map) {
7848 // make sure we at least keep pace with incoming maps
7849 trim_maps(m->oldest_map, last - first + 1, skip_maps);
7852 if (!superblock.oldest_map || skip_maps)
7853 superblock.oldest_map = first;
7854 superblock.newest_map = last;
7855 superblock.current_epoch = last;
7857 // note in the superblock that we were clean thru the prior epoch
7858 epoch_t boot_epoch = service.get_boot_epoch();
7859 if (boot_epoch && boot_epoch >= superblock.mounted) {
7860 superblock.mounted = boot_epoch;
7861 superblock.clean_thru = last;
7864 // superblock and commit
7865 write_superblock(t);
7866 store->queue_transaction(
7867 service.meta_osr.get(),
7869 new C_OnMapApply(&service, pinned_maps, last),
7870 new C_OnMapCommit(this, start, last, m), 0);
7871 service.publish_superblock(superblock);
// Runs after newly received maps [first..last] are committed to disk.
// Under osd_lock + map_lock(write): advances through each epoch, kills
// connections to newly-down peers, handles NOUP transitions and the
// luminous require_osd_release crossing, tracks boot/up epochs, and —
// if the map says we are down or our addresses are wrong — rebinds the
// cluster/heartbeat messengers and restarts, or shuts down when marked
// down too many times within osd_max_markdown_period.
// NOTE(review): elided listing — statements between the visible line
// numbers are not shown.
7874 void OSD::_committed_osd_maps(epoch_t first, epoch_t last, MOSDMap *m)
7876 dout(10) << __func__ << " " << first << ".." << last << dendl;
7877 if (is_stopping()) {
7878 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7881 Mutex::Locker l(osd_lock);
  // re-check after acquiring osd_lock; shutdown may have raced us
7882 if (is_stopping()) {
7883 dout(10) << __func__ << " bailing, we are shutting down" << dendl;
7886 map_lock.get_write();
7888 bool do_shutdown = false;
7889 bool do_restart = false;
7890 bool network_error = false;
7892 // advance through the new maps
7893 for (epoch_t cur = first; cur <= last; cur++) {
7894 dout(10) << " advance to epoch " << cur
7895 << " (<= last " << last
7896 << " <= newest_map " << superblock.newest_map
7899 OSDMapRef newmap = get_map(cur);
7900 assert(newmap); // we just cached it above!
7902 // start blacklisting messages sent to peers that go down.
7903 service.pre_publish_map(newmap);
7905 // kill connections to newly down osds
7906 bool waited_for_reservations = false;
7908 osdmap->get_all_osds(old);
7909 for (set<int>::iterator p = old.begin(); p != old.end(); ++p) {
7911 osdmap->is_up(*p) && // in old map
7912 newmap->is_down(*p)) { // but not the new one
7913 if (!waited_for_reservations) {
7914 service.await_reserved_maps();
7915 waited_for_reservations = true;
7918 } else if (*p != whoami &&
7919 osdmap->is_down(*p) &&
7920 newmap->is_up(*p)) {
  // NOUP flag (global or per-osd) toggled: retry boot
7925 if ((osdmap->test_flag(CEPH_OSDMAP_NOUP) !=
7926 newmap->test_flag(CEPH_OSDMAP_NOUP)) ||
7927 (osdmap->is_noup(whoami) != newmap->is_noup(whoami))) {
7928 dout(10) << __func__ << " NOUP flag changed in " << newmap->get_epoch()
7931 // this captures the case where we sent the boot message while
7932 // NOUP was being set on the mon and our boot request was
7933 // dropped, and then later it is cleared. it imperfectly
7934 // handles the case where our original boot message was not
7935 // dropped and we restart even though we might have booted, but
7936 // that is harmless (boot will just take slightly longer).
  // cluster-wide luminous transition: stop pre-luminous pg stats flow
7940 if (osdmap->require_osd_release < CEPH_RELEASE_LUMINOUS &&
7941 newmap->require_osd_release >= CEPH_RELEASE_LUMINOUS) {
7942 dout(10) << __func__ << " require_osd_release reached luminous in "
7943 << newmap->get_epoch() << dendl;
7944 clear_pg_stat_queue();
7945 clear_outstanding_pg_stats();
  // record boot/up epochs once the map shows us up at our address
7951 service.retrieve_epochs(&boot_epoch, &up_epoch, NULL);
7953 osdmap->is_up(whoami) &&
7954 osdmap->get_inst(whoami) == client_messenger->get_myinst()) {
7955 up_epoch = osdmap->get_epoch();
7956 dout(10) << "up_epoch is " << up_epoch << dendl;
7958 boot_epoch = osdmap->get_epoch();
7959 dout(10) << "boot_epoch is " << boot_epoch << dendl;
7961 service.set_epochs(&boot_epoch, &up_epoch, NULL);
7965 had_map_since = ceph_clock_now();
7967 epoch_t _bind_epoch = service.get_bind_epoch();
7968 if (osdmap->is_up(whoami) &&
7969 osdmap->get_addr(whoami) == client_messenger->get_myaddr() &&
7970 _bind_epoch < osdmap->get_up_from(whoami)) {
7973 dout(1) << "state: booting -> active" << dendl;
7974 set_state(STATE_ACTIVE);
7976 // set incarnation so that osd_reqid_t's we generate for our
7977 // objecter requests are unique across restarts.
7978 service.objecter->set_client_incarnation(osdmap->get_epoch());
  // check whether the map removed us, marked us down, or shows stale addrs
7982 if (osdmap->get_epoch() > 0 &&
7984 if (!osdmap->exists(whoami)) {
7985 dout(0) << "map says i do not exist. shutting down." << dendl;
7986 do_shutdown = true; // don't call shutdown() while we have
7987 // everything paused
7988 } else if (!osdmap->is_up(whoami) ||
7989 !osdmap->get_addr(whoami).probably_equals(
7990 client_messenger->get_myaddr()) ||
7991 !osdmap->get_cluster_addr(whoami).probably_equals(
7992 cluster_messenger->get_myaddr()) ||
7993 !osdmap->get_hb_back_addr(whoami).probably_equals(
7994 hb_back_server_messenger->get_myaddr()) ||
7995 (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
7996 !osdmap->get_hb_front_addr(whoami).probably_equals(
7997 hb_front_server_messenger->get_myaddr()))) {
7998 if (!osdmap->is_up(whoami)) {
7999 if (service.is_preparing_to_stop() || service.is_stopping()) {
8000 service.got_stop_ack();
8002 clog->warn() << "Monitor daemon marked osd." << whoami << " down, "
8003 "but it is still running";
8004 clog->debug() << "map e" << osdmap->get_epoch()
8005 << " wrongly marked me down at e"
8006 << osdmap->get_down_at(whoami);
8008 } else if (!osdmap->get_addr(whoami).probably_equals(
8009 client_messenger->get_myaddr())) {
8010 clog->error() << "map e" << osdmap->get_epoch()
8011 << " had wrong client addr (" << osdmap->get_addr(whoami)
8012 << " != my " << client_messenger->get_myaddr() << ")";
8013 } else if (!osdmap->get_cluster_addr(whoami).probably_equals(
8014 cluster_messenger->get_myaddr())) {
8015 clog->error() << "map e" << osdmap->get_epoch()
8016 << " had wrong cluster addr ("
8017 << osdmap->get_cluster_addr(whoami)
8018 << " != my " << cluster_messenger->get_myaddr() << ")";
8019 } else if (!osdmap->get_hb_back_addr(whoami).probably_equals(
8020 hb_back_server_messenger->get_myaddr())) {
8021 clog->error() << "map e" << osdmap->get_epoch()
8022 << " had wrong heartbeat back addr ("
8023 << osdmap->get_hb_back_addr(whoami)
8024 << " != my " << hb_back_server_messenger->get_myaddr()
8026 } else if (osdmap->get_hb_front_addr(whoami) != entity_addr_t() &&
8027 !osdmap->get_hb_front_addr(whoami).probably_equals(
8028 hb_front_server_messenger->get_myaddr())) {
8029 clog->error() << "map e" << osdmap->get_epoch()
8030 << " had wrong heartbeat front addr ("
8031 << osdmap->get_hb_front_addr(whoami)
8032 << " != my " << hb_front_server_messenger->get_myaddr()
8036 if (!service.is_stopping()) {
8037 epoch_t up_epoch = 0;
8038 epoch_t bind_epoch = osdmap->get_epoch();
8039 service.set_epochs(NULL,&up_epoch, &bind_epoch);
  // throttle: too many mark-downs within the grace window => shut down
8043 utime_t now = ceph_clock_now();
8044 utime_t grace = utime_t(cct->_conf->osd_max_markdown_period, 0);
8045 osd_markdown_log.push_back(now);
8046 //clear all out-of-date log
8047 while (!osd_markdown_log.empty() &&
8048 osd_markdown_log.front() + grace < now)
8049 osd_markdown_log.pop_front();
8050 if ((int)osd_markdown_log.size() > cct->_conf->osd_max_markdown_count) {
8051 dout(0) << __func__ << " marked down "
8052 << osd_markdown_log.size()
8053 << " > osd_max_markdown_count "
8054 << cct->_conf->osd_max_markdown_count
8055 << " in last " << grace << " seconds, shutting down"
8061 start_waiting_for_healthy();
8063 set<int> avoid_ports;
8064 #if defined(__FreeBSD__)
8065 // prevent FreeBSD from grabbing the client_messenger port during
8066 // rebinding. In which case a cluster_messenger will connect also
8068 avoid_ports.insert(client_messenger->get_myaddr().get_port());
8070 avoid_ports.insert(cluster_messenger->get_myaddr().get_port());
8071 avoid_ports.insert(hb_back_server_messenger->get_myaddr().get_port());
8072 avoid_ports.insert(hb_front_server_messenger->get_myaddr().get_port());
  // rebind the three server messengers on fresh ports
8074 int r = cluster_messenger->rebind(avoid_ports);
8076 do_shutdown = true; // FIXME: do_restart?
8077 network_error = true;
8078 dout(0) << __func__ << " marked down:"
8079 << " rebind cluster_messenger failed" << dendl;
8082 r = hb_back_server_messenger->rebind(avoid_ports);
8084 do_shutdown = true; // FIXME: do_restart?
8085 network_error = true;
8086 dout(0) << __func__ << " marked down:"
8087 << " rebind hb_back_server_messenger failed" << dendl;
8090 r = hb_front_server_messenger->rebind(avoid_ports);
8092 do_shutdown = true; // FIXME: do_restart?
8093 network_error = true;
8094 dout(0) << __func__ << " marked down:"
8095 << " rebind hb_front_server_messenger failed" << dendl;
8098 hb_front_client_messenger->mark_down_all();
8099 hb_back_client_messenger->mark_down_all();
8101 reset_heartbeat_peers();
8106 map_lock.put_write();
8108 check_osdmap_features(store);
8113 if (is_active() || is_waiting_for_healthy())
8114 maybe_update_heartbeat_peers();
8117 dout(10) << " not yet active; waiting for peering wq to drain" << dendl;
  // on network error, cancel in-flight failure reports before shutdown
8124 if (network_error) {
8125 Mutex::Locker l(heartbeat_lock);
8126 map<int,pair<utime_t,entity_inst_t>>::iterator it =
8127 failure_pending.begin();
8128 while (it != failure_pending.end()) {
8129 dout(10) << "handle_osd_ping canceling in-flight failure report for osd."
8130 << it->first << dendl;
8131 send_still_alive(osdmap->get_epoch(), it->second.second);
8132 failure_pending.erase(it++);
8135 // trigger shutdown in a different thread
8136 dout(0) << __func__ << " shutdown OSD via async signal" << dendl;
8137 queue_async_signal(SIGINT);
8139 else if (m->newest_map && m->newest_map > last) {
8140 dout(10) << " msg say newest map is " << m->newest_map
8141 << ", requesting more" << dendl;
8142 osdmap_subscribe(osdmap->get_epoch()+1, false);
8144 else if (is_preboot()) {
8145 if (m->get_source().is_mon())
8146 _preboot(m->oldest_map, m->newest_map);
8150 else if (do_restart)
// Align messenger feature-bit requirements with the current osdmap
// (client default policy, mon policy, osd cluster policy), and persist
// the on-disk SHARDS compat flag once erasure coding appears.
8155 void OSD::check_osdmap_features(ObjectStore *fs)
8157 // adjust required feature bits?
8159 // we have to be a bit careful here, because we are accessing the
8160 // Policy structures without taking any lock. in particular, only
8161 // modify integer values that can safely be read by a racing CPU.
8162 // since we are only accessing existing Policy structures a their
8163 // current memory location, and setting or clearing bits in integer
8164 // fields, and we are the only writer, this is not a problem.
8167 Messenger::Policy p = client_messenger->get_default_policy();
8169 uint64_t features = osdmap->get_features(entity_name_t::TYPE_CLIENT, &mask);
8170 if ((p.features_required & mask) != features) {
8171 dout(0) << "crush map has features " << features
8172 << ", adjusting msgr requires for clients" << dendl;
8173 p.features_required = (p.features_required & ~mask) | features;
8174 client_messenger->set_default_policy(p);
8178 Messenger::Policy p = client_messenger->get_policy(entity_name_t::TYPE_MON);
8180 uint64_t features = osdmap->get_features(entity_name_t::TYPE_MON, &mask);
8181 if ((p.features_required & mask) != features) {
8182 dout(0) << "crush map has features " << features
8183 << " was " << p.features_required
8184 << ", adjusting msgr requires for mons" << dendl;
8185 p.features_required = (p.features_required & ~mask) | features;
8186 client_messenger->set_policy(entity_name_t::TYPE_MON, p);
8190 Messenger::Policy p = cluster_messenger->get_policy(entity_name_t::TYPE_OSD);
8192 uint64_t features = osdmap->get_features(entity_name_t::TYPE_OSD, &mask);
8194 if ((p.features_required & mask) != features) {
8195 dout(0) << "crush map has features " << features
8196 << ", adjusting msgr requires for osds" << dendl;
8197 p.features_required = (p.features_required & ~mask) | features;
8198 cluster_messenger->set_policy(entity_name_t::TYPE_OSD, p);
  // persist the SHARDS incompat flag once EC pools exist in the map
8201 if ((features & CEPH_FEATURE_OSD_ERASURE_CODES) &&
8202 !superblock.compat_features.incompat.contains(CEPH_OSD_FEATURE_INCOMPAT_SHARDS)) {
8203 dout(0) << __func__ << " enabling on-disk ERASURE CODES compat feature" << dendl;
8204 superblock.compat_features.incompat.insert(CEPH_OSD_FEATURE_INCOMPAT_SHARDS);
8205 ObjectStore::Transaction t;
8206 write_superblock(t);
8207 int err = store->queue_transaction(service.meta_osr.get(), std::move(t), NULL);
// Advance a (locked) PG's osdmap epoch-by-epoch toward osd_epoch, bounded
// by osd_map_max_advance past the service's minimum PG epoch.  For each
// intermediate map it recomputes up/acting sets, feeds the PG state machine
// (handle_advance_map), and starts splits when pg_num grew.  Returns a bool;
// the exact return statements fall outside the visible lines — presumably
// false when the PG must be requeued to continue advancing (see the
// "will requeue" log below) — TODO confirm against the full source.
8213 bool OSD::advance_pg(
8214 epoch_t osd_epoch, PG *pg,
8215 ThreadPool::TPHandle &handle,
8216 PG::RecoveryCtx *rctx,
8217 set<PGRef> *new_pgs)
8219 assert(pg->is_locked());
8220 epoch_t next_epoch = pg->get_osdmap()->get_epoch() + 1;
8221 OSDMapRef lastmap = pg->get_osdmap();
// Already caught up: nothing to do.
8223 if (lastmap->get_epoch() == osd_epoch)
8225 assert(lastmap->get_epoch() < osd_epoch);
// Cap how far we advance in one pass so no PG races too far ahead of
// the slowest PG on this OSD.
8227 epoch_t min_epoch = service.get_min_pg_epoch();
8230 max = min_epoch + cct->_conf->osd_map_max_advance;
8232 max = next_epoch + cct->_conf->osd_map_max_advance;
8236 next_epoch <= osd_epoch && next_epoch <= max;
8238 OSDMapRef nextmap = service.try_get_map(next_epoch);
8240 dout(20) << __func__ << " missing map " << next_epoch << dendl;
8241 // make sure max is bumped up so that we can get past any
8243 max = MAX(max, next_epoch + cct->_conf->osd_map_max_advance);
// Recompute the PG's up/acting sets under the next map and drive the
// peering state machine forward one epoch.
8247 vector<int> newup, newacting;
8248 int up_primary, acting_primary;
8249 nextmap->pg_to_up_acting_osds(
8251 &newup, &up_primary,
8252 &newacting, &acting_primary);
8253 pg->handle_advance_map(
8254 nextmap, lastmap, newup, up_primary,
8255 newacting, acting_primary, rctx);
// If pg_num grew between lastmap and nextmap this PG splits; register
// the children so incoming ops wait until the split completes.
8258 set<spg_t> children;
8259 spg_t parent(pg->info.pgid);
8260 if (parent.is_split(
8261 lastmap->get_pg_num(pg->pool.id),
8262 nextmap->get_pg_num(pg->pool.id),
8264 service.mark_split_in_progress(pg->info.pgid, children);
8266 pg, children, new_pgs, lastmap, nextmap,
// Long loop: keep the thread-pool heartbeat alive.
8271 handle.reset_tp_timeout();
8273 service.pg_update_epoch(pg->info.pgid, lastmap->get_epoch());
8274 pg->handle_activate_map(rctx);
8275 if (next_epoch <= osd_epoch) {
8276 dout(10) << __func__ << " advanced to max " << max
8277 << " past min epoch " << min_epoch
8278 << " ... will requeue " << *pg << dendl;
// Apply a newly received osdmap: sanity-check flags, census the local PGs
// (primary/replica/stray counters), queue PGs of deleted pools for removal,
// set up pending splits, publish the map to OSDService, wake map-waiting
// sessions, prune stale waiters, and kick every PG with a null event so it
// advances to the new epoch.  Caller must hold osd_lock (asserted).
8284 void OSD::consume_map()
8286 assert(osd_lock.is_locked());
8287 dout(7) << "consume_map version " << osdmap->get_epoch() << dendl;
8289 /** make sure the cluster is speaking in SORTBITWISE, because we don't
8290 * speak the older sorting version any more. Be careful not to force
8291 * a shutdown if we are merely processing old maps, though.
8293 if (!osdmap->test_flag(CEPH_OSDMAP_SORTBITWISE) && is_active()) {
8294 derr << __func__ << " SORTBITWISE flag is not set" << dendl;
8298 int num_pg_primary = 0, num_pg_replica = 0, num_pg_stray = 0;
8299 list<PGRef> to_remove;
// Census pass under the read lock: count roles, collect PGs whose pool
// no longer exists, and record any splits between the published map and
// the new one.
8303 RWLock::RLocker l(pg_map_lock);
8304 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8307 PG *pg = it->second;
8309 if (pg->is_primary())
8311 else if (pg->is_replica())
8316 if (!osdmap->have_pg_pool(pg->info.pgid.pool())) {
8318 to_remove.push_back(PGRef(pg));
8320 service.init_splits_between(it->first, service.get_osdmap(), osdmap);
// Drop pending OSD-requested creates for PGs we are no longer acting for.
8326 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
8327 for (auto pg = pending_creates_from_osd.cbegin();
8328 pg != pending_creates_from_osd.cend();) {
8329 if (osdmap->get_pg_acting_rank(*pg, whoami) < 0) {
8330 pg = pending_creates_from_osd.erase(pg);
// Remove PGs belonging to deleted pools (write lock per iteration).
8337 for (list<PGRef>::iterator i = to_remove.begin();
8338 i != to_remove.end();
8339 to_remove.erase(i++)) {
8340 RWLock::WLocker locker(pg_map_lock);
8346 service.expand_pg_num(service.get_osdmap(), osdmap);
8348 service.pre_publish_map(osdmap);
8349 service.await_reserved_maps();
8350 service.publish_map(osdmap);
// maybe_inject_dispatch_delay() calls are test hooks for race injection.
8352 service.maybe_inject_dispatch_delay();
8354 dispatch_sessions_waiting_on_map();
8356 service.maybe_inject_dispatch_delay();
8358 // remove any PGs which we no longer host from the session waiting_for_pg lists
8359 dout(20) << __func__ << " checking waiting_for_pg" << dendl;
8360 op_shardedwq.prune_pg_waiters(osdmap, whoami);
8362 service.maybe_inject_dispatch_delay();
// Kick every PG with a null event at the new epoch so it re-peers.
8366 RWLock::RLocker l(pg_map_lock);
8367 for (ceph::unordered_map<spg_t,PG*>::iterator it = pg_map.begin();
8370 PG *pg = it->second;
8372 pg->queue_null(osdmap->get_epoch(), osdmap->get_epoch());
8376 logger->set(l_osd_pg, pg_map.size());
8378 logger->set(l_osd_pg_primary, num_pg_primary);
8379 logger->set(l_osd_pg_replica, num_pg_replica);
8380 logger->set(l_osd_pg_stray, num_pg_stray);
// React to osdmap flags after the map is consumed: subscribe for a newer
// map while the cluster is flagged FULL, and pause/unpause recovery to
// track the NORECOVER flag.  Caller must hold osd_lock (asserted).
8383 void OSD::activate_map()
8385 assert(osd_lock.is_locked());
8387 dout(7) << "activate_map version " << osdmap->get_epoch() << dendl;
8389 if (osdmap->test_flag(CEPH_OSDMAP_FULL)) {
8390 dout(10) << " osdmap flagged full, doing onetime osdmap subscribe" << dendl;
8391 osdmap_subscribe(osdmap->get_epoch() + 1, false);
// Keep the local recovery-paused state in sync with the NORECOVER flag,
// logging only on actual transitions.
8395 if (osdmap->test_flag(CEPH_OSDMAP_NORECOVER)) {
8396 if (!service.recovery_is_paused()) {
8397 dout(1) << "pausing recovery (NORECOVER flag set)" << dendl;
8398 service.pause_recovery();
8401 if (service.recovery_is_paused()) {
8402 dout(1) << "unpausing recovery (NORECOVER flag unset)" << dendl;
8403 service.unpause_recovery();
8407 service.activate_map();
// Requeue anything that was blocked waiting for this map.
8410 take_waiters(waiting_for_osdmap);
// Gate helper: log and reject (return false — the return lines are outside
// the visible chunk) any message whose connection peer is not a monitor.
8413 bool OSD::require_mon_peer(const Message *m)
8415 if (!m->get_connection()->peer_is_mon()) {
8416 dout(0) << "require_mon_peer received from non-mon "
8417 << m->get_connection()->get_peer_addr()
8418 << " " << *m << dendl;
// Gate helper: like require_mon_peer, but also accepts a manager peer.
// Logs and rejects messages from anything that is neither mon nor mgr.
8424 bool OSD::require_mon_or_mgr_peer(const Message *m)
8426 if (!m->get_connection()->peer_is_mon() &&
8427 !m->get_connection()->peer_is_mgr()) {
8428 dout(0) << "require_mon_or_mgr_peer received from non-mon, non-mgr "
8429 << m->get_connection()->get_peer_addr()
8430 << " " << *m << dendl;
// Gate helper: log and reject any message whose connection peer is not
// another OSD (used before processing cluster-internal PG messages).
8436 bool OSD::require_osd_peer(const Message *m)
8438 if (!m->get_connection()->peer_is_osd()) {
8439 dout(0) << "require_osd_peer received from non-osd "
8440 << m->get_connection()->get_peer_addr()
8441 << " " << *m << dendl;
// Gate helper: drop messages stamped with an epoch from before this OSD
// came up, or received while the OSD is still booting.
8447 bool OSD::require_self_aliveness(const Message *m, epoch_t epoch)
8449 epoch_t up_epoch = service.get_up_epoch();
8450 if (epoch < up_epoch) {
8451 dout(7) << "from pre-up epoch " << epoch << " < " << up_epoch << dendl;
8456 dout(7) << "still in boot state, dropping message " << *m << dendl;
// Gate helper: verify the sending OSD is still the same live instance we
// know from `map` (up, and at the expected cluster address).  On mismatch
// it logs, clears the session's map-waiters, and breaks the connection<->
// session reference cycle.  is_fast_dispatch controls whether the session
// dispatch lock is taken (fast dispatch already runs under it — presumably;
// TODO confirm against the dispatch paths).
8463 bool OSD::require_same_peer_instance(const Message *m, OSDMapRef& map,
8464 bool is_fast_dispatch)
8466 int from = m->get_source().num();
8468 if (map->is_down(from) ||
8469 (map->get_cluster_addr(from) != m->get_source_inst().addr)) {
8470 dout(5) << "from dead osd." << from << ", marking down, "
8471 << " msg was " << m->get_source_inst().addr
8472 << " expected " << (map->is_up(from) ?
8473 map->get_cluster_addr(from) : entity_addr_t())
8475 ConnectionRef con = m->get_connection();
8477 Session *s = static_cast<Session*>(con->get_priv());
8479 if (!is_fast_dispatch)
8480 s->session_dispatch_lock.Lock();
8481 clear_session_waiting_on_map(s);
8482 con->set_priv(NULL); // break ref <-> session cycle, if any
8483 if (!is_fast_dispatch)
8484 s->session_dispatch_lock.Unlock();
8494 * require that we have same (or newer) map, and that
8495 * the source is the pg primary.
// If the sender's epoch is newer than ours, park the op until the map
// arrives; otherwise also enforce self-aliveness and, for cluster-side
// connections, that the peer instance is still current.
8497 bool OSD::require_same_or_newer_map(OpRequestRef& op, epoch_t epoch,
8498 bool is_fast_dispatch)
8500 const Message *m = op->get_req();
8501 dout(15) << "require_same_or_newer_map " << epoch
8502 << " (i am " << osdmap->get_epoch() << ") " << m << dendl;
8504 assert(osd_lock.is_locked());
8506 // do they have a newer map?
8507 if (epoch > osdmap->get_epoch()) {
8508 dout(7) << "waiting for newer map epoch " << epoch
8509 << " > my " << osdmap->get_epoch() << " with " << m << dendl;
8510 wait_for_new_map(op);
8514 if (!require_self_aliveness(op->get_req(), epoch)) {
8518 // ok, our map is same or newer.. do they still exist?
// Peer-instance check only applies to cluster-messenger traffic
// (i.e. messages from other OSDs), not client connections.
8519 if (m->get_connection()->get_messenger() == cluster_messenger &&
8520 !require_same_peer_instance(op->get_req(), osdmap, is_fast_dispatch)) {
8531 // ----------------------------------------
// Materialize the child PGs produced by a pg_num increase: split the
// parent's stats proportionally, create each child PG, split the on-disk
// collections, and persist both children and parent via the RecoveryCtx
// transaction.  updated_stats has childpgids.size()+1 entries: one per
// child plus the parent's remainder (consumed last).
8534 void OSD::split_pgs(
8536 const set<spg_t> &childpgids, set<PGRef> *out_pgs,
8539 PG::RecoveryCtx *rctx)
8541 unsigned pg_num = nextmap->get_pg_num(
// Narrow the snap mapper to the parent's post-split hash bits.
8543 parent->update_snap_mapper_bits(
8544 parent->info.pgid.get_split_bits(pg_num)
8547 vector<object_stat_sum_t> updated_stats(childpgids.size() + 1);
8548 parent->info.stats.stats.sum.split(updated_stats);
8550 vector<object_stat_sum_t>::iterator stat_iter = updated_stats.begin();
8551 for (set<spg_t>::const_iterator i = childpgids.begin();
8552 i != childpgids.end();
8554 assert(stat_iter != updated_stats.end());
8555 dout(10) << "Splitting " << *parent << " into " << *i << dendl;
8556 assert(service.splitting(*i));
8557 PG* child = _make_pg(nextmap, *i);
8559 out_pgs->insert(child);
8560 rctx->created_pgs.insert(child);
8562 unsigned split_bits = i->get_split_bits(pg_num);
8563 dout(10) << "pg_num is " << pg_num << dendl;
8564 dout(10) << "m_seed " << i->ps() << dendl;
8565 dout(10) << "split_bits is " << split_bits << dendl;
8567 parent->split_colls(
8577 child->info.stats.stats.sum = *stat_iter;
8579 child->write_if_dirty(*(rctx->transaction));
// The final stats slot is the parent's share after the split.
8582 assert(stat_iter != updated_stats.end());
8583 parent->info.stats.stats.sum = *stat_iter;
8584 parent->write_if_dirty(*(rctx->transaction));
// Handle MOSDPGCreate from a monitor: for each requested pg, skip splits,
// localized pgs, and deleted pools; verify we are still the acting primary
// under the current map; build an initial pg history; and hand a Null
// peering event to handle_pg_peering_evt to instantiate the PG, acking the
// mon with send_pg_created on success.
8590 void OSD::handle_pg_create(OpRequestRef op)
8592 const MOSDPGCreate *m = static_cast<const MOSDPGCreate*>(op->get_req());
8593 assert(m->get_type() == MSG_OSD_PG_CREATE);
8595 dout(10) << "handle_pg_create " << *m << dendl;
// Only monitors may create PGs.
8597 if (!require_mon_peer(op->get_req())) {
8601 if (!require_same_or_newer_map(op, m->epoch, false))
// mkpg and ctimes run in lockstep: same keys, same order (asserted).
8606 map<pg_t,utime_t>::const_iterator ci = m->ctimes.begin();
8607 for (map<pg_t,pg_create_t>::const_iterator p = m->mkpg.begin();
8610 assert(ci != m->ctimes.end() && ci->first == p->first);
8611 epoch_t created = p->second.created;
8612 if (p->second.split_bits) // Skip split pgs
8616 if (on.preferred() >= 0) {
8617 dout(20) << "ignoring localized pg " << on << dendl;
8621 if (!osdmap->have_pg_pool(on.pool())) {
8622 dout(20) << "ignoring pg on deleted pool " << on << dendl;
8626 dout(20) << "mkpg " << on << " e" << created << "@" << ci->second << dendl;
8628 // is it still ours?
8629 vector<int> up, acting;
8630 int up_primary = -1;
8631 int acting_primary = -1;
8632 osdmap->pg_to_up_acting_osds(on, &up, &up_primary, &acting, &acting_primary);
8633 int role = osdmap->calc_pg_role(whoami, acting, acting.size());
8635 if (acting_primary != whoami) {
8636 dout(10) << "mkpg " << on << " not acting_primary (" << acting_primary
8637 << "), my role=" << role << ", skipping" << dendl;
8642 bool mapped = osdmap->get_primary_shard(on, &pgid);
8646 osdmap->get_pools().at(pgid.pool()).ec_pool(),
8648 pg_history_t history;
8649 build_initial_pg_history(pgid, created, ci->second, &history, &pi);
8651 // The mon won't resend unless the primary changed, so
8652 // we ignore same_interval_since. We'll pass this history
8653 // to handle_pg_peering_evt with the current epoch as the
8654 // event -- the project_pg_history check in
8655 // handle_pg_peering_evt will be a noop.
8656 if (history.same_primary_since > m->epoch) {
8657 dout(10) << __func__ << ": got obsolete pg create on pgid "
8658 << pgid << " from epoch " << m->epoch
8659 << ", primary changed in " << history.same_primary_since
8663 if (handle_pg_peering_evt(
8667 osdmap->get_epoch(),
8668 PG::CephPeeringEvtRef(
8669 new PG::CephPeeringEvt(
8670 osdmap->get_epoch(),
8671 osdmap->get_epoch(),
8674 service.send_pg_created(pgid.pgid);
// Track the newest fully-processed create epoch so duplicate creates
// from the mon can be ignored.
8679 lock_guard<mutex> pending_creates_locker{pending_creates_lock};
8680 if (pending_creates_from_mon == 0) {
8681 last_pg_create_epoch = m->epoch;
8684 maybe_update_heartbeat_peers();
8688 // ----------------------------------------
8689 // peering and recovery
// Build a fresh RecoveryCtx: a new ObjectStore transaction, on_applied /
// on_safe completion context lists, and empty query/notify/info maps.
// Ownership of all the raw allocations passes to the ctx; they are freed
// by dispatch_context()/dispatch_context_transaction().
8691 PG::RecoveryCtx OSD::create_context()
8693 ObjectStore::Transaction *t = new ObjectStore::Transaction;
8694 C_Contexts *on_applied = new C_Contexts(cct);
8695 C_Contexts *on_safe = new C_Contexts(cct);
8696 map<int, map<spg_t,pg_query_t> > *query_map =
8697 new map<int, map<spg_t, pg_query_t> >;
8698 map<int,vector<pair<pg_notify_t, PastIntervals> > > *notify_list =
8699 new map<int, vector<pair<pg_notify_t, PastIntervals> > >;
8700 map<int,vector<pair<pg_notify_t, PastIntervals> > > *info_map =
8701 new map<int,vector<pair<pg_notify_t, PastIntervals> > >;
8702 PG::RecoveryCtx rctx(query_map, info_map, notify_list,
8703 on_applied, on_safe, t);
// Completion context: once the creating transaction is applied, open the
// on-disk collection handle for each newly created PG that is still
// registered in pg_map (under the pg_map read lock).
8707 struct C_OpenPGs : public Context {
8711 C_OpenPGs(set<PGRef>& p, ObjectStore *s, OSD* o) : store(s), osd(o) {
8714 void finish(int r) override {
8715 RWLock::RLocker l(osd->pg_map_lock);
8716 for (auto p : pgs) {
// A PG may have been removed from pg_map before this ran; skip it.
8717 if (osd->pg_map.count(p->info.pgid)) {
8718 p->ch = store->open_collection(p->coll);
// Flush the RecoveryCtx's accumulated transaction for one PG: queue it on
// the PG's sequencer with the on_applied/on_safe callbacks, then re-arm the
// ctx with fresh transaction and context lists so it can keep accumulating.
// No-op when the transaction is empty.
8725 void OSD::dispatch_context_transaction(PG::RecoveryCtx &ctx, PG *pg,
8726 ThreadPool::TPHandle *handle)
8728 if (!ctx.transaction->empty()) {
// Newly created PGs need their collections opened once the txn applies.
8729 if (!ctx.created_pgs.empty()) {
8730 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8732 int tr = store->queue_transaction(
8734 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL,
8735 TrackedOpRef(), handle);
8736 delete (ctx.transaction);
8738 ctx.transaction = new ObjectStore::Transaction;
8739 ctx.on_applied = new C_Contexts(cct);
8740 ctx.on_safe = new C_Contexts(cct);
// Final flush of a RecoveryCtx: send the queued notifies/queries/infos
// (only while this OSD is up in the published map), free the message maps,
// and either discard an empty transaction or queue it on the PG's
// sequencer.  Unlike dispatch_context_transaction, the ctx is consumed,
// not re-armed.
8744 void OSD::dispatch_context(PG::RecoveryCtx &ctx, PG *pg, OSDMapRef curmap,
8745 ThreadPool::TPHandle *handle)
8747 if (service.get_osdmap()->is_up(whoami) &&
8749 do_notifies(*ctx.notify_list, curmap);
8750 do_queries(*ctx.query_map, curmap);
8751 do_infos(*ctx.info_map, curmap);
8753 delete ctx.notify_list;
8754 delete ctx.query_map;
8755 delete ctx.info_map;
// Nothing to persist (or no PG to sequence it on): just free everything.
8756 if ((ctx.on_applied->empty() &&
8757 ctx.on_safe->empty() &&
8758 ctx.transaction->empty() &&
8759 ctx.created_pgs.empty()) || !pg) {
8760 delete ctx.transaction;
8761 delete ctx.on_applied;
8763 assert(ctx.created_pgs.empty());
8765 if (!ctx.created_pgs.empty()) {
8766 ctx.on_applied->add(new C_OpenPGs(ctx.created_pgs, store, this));
8768 int tr = store->queue_transaction(
8770 std::move(*ctx.transaction), ctx.on_applied, ctx.on_safe, NULL, TrackedOpRef(),
8772 delete (ctx.transaction);
8778 * Send an MOSDPGNotify to a primary, with a list of PGs that I have
8779 * content for, and they are primary for.
// For each target OSD in notify_list: skip peers that are down or have no
// cluster connection, share our map with the peer, then batch all its
// notifies into one MOSDPGNotify.
8782 void OSD::do_notifies(
8783 map<int,vector<pair<pg_notify_t,PastIntervals> > >& notify_list,
8787 vector<pair<pg_notify_t,PastIntervals> > >::iterator it =
8788 notify_list.begin();
8789 it != notify_list.end();
8791 if (!curmap->is_up(it->first)) {
8792 dout(20) << __func__ << " skipping down osd." << it->first << dendl;
8795 ConnectionRef con = service.get_con_osd_cluster(
8796 it->first, curmap->get_epoch());
8798 dout(20) << __func__ << " skipping osd." << it->first
8799 << " (NULL con)" << dendl;
8802 service.share_map_peer(it->first, con.get(), curmap);
8803 dout(7) << __func__ << " osd." << it->first
8804 << " on " << it->second.size() << " PGs" << dendl;
8805 MOSDPGNotify *m = new MOSDPGNotify(curmap->get_epoch(),
8807 con->send_message(m);
8813 * send out pending queries for info | summaries
// Mirror of do_notifies for queries: per target OSD, skip down/unreachable
// peers, share our map, and batch that peer's queries into one MOSDPGQuery.
8815 void OSD::do_queries(map<int, map<spg_t,pg_query_t> >& query_map,
8818 for (map<int, map<spg_t,pg_query_t> >::iterator pit = query_map.begin();
8819 pit != query_map.end();
8821 if (!curmap->is_up(pit->first)) {
8822 dout(20) << __func__ << " skipping down osd." << pit->first << dendl;
8825 int who = pit->first;
8826 ConnectionRef con = service.get_con_osd_cluster(who, curmap->get_epoch());
8828 dout(20) << __func__ << " skipping osd." << who
8829 << " (NULL con)" << dendl;
8832 service.share_map_peer(who, con.get(), curmap);
8833 dout(7) << __func__ << " querying osd." << who
8834 << " on " << pit->second.size() << " PGs" << dendl;
8835 MOSDPGQuery *m = new MOSDPGQuery(curmap->get_epoch(), pit->second);
8836 con->send_message(m);
// Mirror of do_notifies for info messages: per target OSD, skip down or
// unreachable peers, share our map, and batch that peer's infos into one
// MOSDPGInfo.
8841 void OSD::do_infos(map<int,
8842 vector<pair<pg_notify_t, PastIntervals> > >& info_map,
8846 vector<pair<pg_notify_t, PastIntervals> > >::iterator p =
8848 p != info_map.end();
8850 if (!curmap->is_up(p->first)) {
8851 dout(20) << __func__ << " skipping down osd." << p->first << dendl;
// Debug-only loop: logs each info individually before sending the batch.
8854 for (vector<pair<pg_notify_t,PastIntervals> >::iterator i = p->second.begin();
8855 i != p->second.end();
8857 dout(20) << __func__ << " sending info " << i->first.info
8858 << " to shard " << p->first << dendl;
8860 ConnectionRef con = service.get_con_osd_cluster(
8861 p->first, curmap->get_epoch());
8863 dout(20) << __func__ << " skipping osd." << p->first
8864 << " (NULL con)" << dendl;
8867 service.share_map_peer(p->first, con.get(), curmap);
8868 MOSDPGInfo *m = new MOSDPGInfo(curmap->get_epoch());
8869 m->pg_list = p->second;
8870 con->send_message(m);
8877 * from non-primary to primary
8878 * includes pg_info_t.
8879 * NOTE: called with opqueue active.
// Handle MOSDPGNotify: after peer/map gating, turn each notify entry into
// an MNotifyRec peering event for the addressed shard (skipping localized
// pgs) and feed it to handle_pg_peering_evt.
8881 void OSD::handle_pg_notify(OpRequestRef op)
8883 const MOSDPGNotify *m = static_cast<const MOSDPGNotify*>(op->get_req());
8884 assert(m->get_type() == MSG_OSD_PG_NOTIFY);
8886 dout(7) << "handle_pg_notify from " << m->get_source() << dendl;
8887 int from = m->get_source().num();
8889 if (!require_osd_peer(op->get_req()))
8892 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8897 for (auto it = m->get_pg_list().begin();
8898 it != m->get_pg_list().end();
8900 if (it->first.info.pgid.preferred() >= 0) {
8901 dout(20) << "ignoring localized pg " << it->first.info.pgid << dendl;
8905 handle_pg_peering_evt(
8906 spg_t(it->first.info.pgid.pgid, it->first.to),
8907 it->first.info.history, it->second,
8908 it->first.query_epoch,
8909 PG::CephPeeringEvtRef(
8910 new PG::CephPeeringEvt(
8911 it->first.epoch_sent, it->first.query_epoch,
8912 PG::MNotifyRec(pg_shard_t(from, it->first.from), it->first,
8913 op->get_req()->get_connection()->get_features())))
// Handle MOSDPGLog: after peer/map gating and the localized-pg check,
// wrap the log into an MLogRec peering event for the addressed shard and
// feed it to handle_pg_peering_evt.
8918 void OSD::handle_pg_log(OpRequestRef op)
// Non-const req: MLogRec keeps a reference to the message itself.
8920 MOSDPGLog *m = static_cast<MOSDPGLog*>(op->get_nonconst_req());
8921 assert(m->get_type() == MSG_OSD_PG_LOG);
8922 dout(7) << "handle_pg_log " << *m << " from " << m->get_source() << dendl;
8924 if (!require_osd_peer(op->get_req()))
8927 int from = m->get_source().num();
8928 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8931 if (m->info.pgid.preferred() >= 0) {
8932 dout(10) << "ignoring localized pg " << m->info.pgid << dendl;
8937 handle_pg_peering_evt(
8938 spg_t(m->info.pgid.pgid, m->to),
8939 m->info.history, m->past_intervals, m->get_epoch(),
8940 PG::CephPeeringEvtRef(
8941 new PG::CephPeeringEvt(
8942 m->get_epoch(), m->get_query_epoch(),
8943 PG::MLogRec(pg_shard_t(from, m->from), m)))
// Handle MOSDPGInfo: after peer/map gating, turn each entry into an
// MInfoRec peering event for the addressed shard (skipping localized pgs)
// and feed it to handle_pg_peering_evt.
8947 void OSD::handle_pg_info(OpRequestRef op)
8949 const MOSDPGInfo *m = static_cast<const MOSDPGInfo *>(op->get_req());
8950 assert(m->get_type() == MSG_OSD_PG_INFO);
8951 dout(7) << "handle_pg_info " << *m << " from " << m->get_source() << dendl;
8953 if (!require_osd_peer(op->get_req()))
8956 int from = m->get_source().num();
8957 if (!require_same_or_newer_map(op, m->get_epoch(), false))
8962 for (auto p = m->pg_list.begin();
8963 p != m->pg_list.end();
8965 if (p->first.info.pgid.preferred() >= 0) {
8966 dout(10) << "ignoring localized pg " << p->first.info.pgid << dendl;
8970 handle_pg_peering_evt(
8971 spg_t(p->first.info.pgid.pgid, p->first.to),
8972 p->first.info.history, p->second, p->first.epoch_sent,
8973 PG::CephPeeringEvtRef(
8974 new PG::CephPeeringEvt(
8975 p->first.epoch_sent, p->first.query_epoch,
8978 from, p->first.from), p->first.info, p->first.epoch_sent)))
// Handle MOSDPGTrim.  The message is dual-purpose: received by a primary
// it records the replica's last_complete_ondisk (lcod); received by a
// replica it is an instruction from the primary to trim the pg log to
// trim_to, persisted via a queued transaction.
8983 void OSD::handle_pg_trim(OpRequestRef op)
8985 const MOSDPGTrim *m = static_cast<const MOSDPGTrim*>(op->get_req());
8986 assert(m->get_type() == MSG_OSD_PG_TRIM);
8988 dout(7) << "handle_pg_trim " << *m << " from " << m->get_source() << dendl;
8990 if (!require_osd_peer(op->get_req()))
8993 int from = m->get_source().num();
8994 if (!require_same_or_newer_map(op, m->epoch, false))
8997 if (m->pgid.preferred() >= 0) {
8998 dout(10) << "ignoring localized pg " << m->pgid << dendl;
9004 PG *pg = _lookup_lock_pg(m->pgid);
9006 dout(10) << " don't have pg " << m->pgid << dendl;
// Stale trim from a previous interval: ignore.
9010 if (m->epoch < pg->info.history.same_interval_since) {
9011 dout(10) << *pg << " got old trim to " << m->trim_to << ", ignoring" << dendl;
9016 if (pg->is_primary()) {
9017 // peer is informing us of their last_complete_ondisk
9018 dout(10) << *pg << " replica osd." << from << " lcod " << m->trim_to << dendl;
9019 pg->peer_last_complete_ondisk[pg_shard_t(from, m->pgid.shard)] =
9021 // trim log when the pg is recovered
9022 pg->calc_min_last_complete_ondisk();
9024 // primary is instructing us to trim
9025 ObjectStore::Transaction t;
9026 pg->pg_log.trim(m->trim_to, pg->info);
9027 pg->dirty_info = true;
9028 pg->write_if_dirty(t);
9029 int tr = store->queue_transaction(pg->osr.get(), std::move(t), NULL);
// Handle MBackfillReserve: translate the message type (REQUEST / GRANT /
// REJECT) into the matching backfill-reservation peering event, then queue
// it on the target PG — or park it in peering_wait_for_split if the PG is
// mid-split.
9035 void OSD::handle_pg_backfill_reserve(OpRequestRef op)
9037 const MBackfillReserve *m = static_cast<const MBackfillReserve*>(op->get_req());
9038 assert(m->get_type() == MSG_OSD_BACKFILL_RESERVE);
9040 if (!require_osd_peer(op->get_req()))
9042 if (!require_same_or_newer_map(op, m->query_epoch, false))
9045 PG::CephPeeringEvtRef evt;
9046 if (m->type == MBackfillReserve::REQUEST) {
9047 evt = PG::CephPeeringEvtRef(
9048 new PG::CephPeeringEvt(
9051 PG::RequestBackfillPrio(m->priority)));
9052 } else if (m->type == MBackfillReserve::GRANT) {
9053 evt = PG::CephPeeringEvtRef(
9054 new PG::CephPeeringEvt(
9057 PG::RemoteBackfillReserved()));
9058 } else if (m->type == MBackfillReserve::REJECT) {
9059 // NOTE: this is replica -> primary "i reject your request"
9060 // and also primary -> replica "cancel my previously-granted request"
9061 evt = PG::CephPeeringEvtRef(
9062 new PG::CephPeeringEvt(
9065 PG::RemoteReservationRejected()));
// If the PG is splitting, defer the event until the split completes.
9070 if (service.splitting(m->pgid)) {
9071 peering_wait_for_split[m->pgid].push_back(evt);
9075 PG *pg = _lookup_lock_pg(m->pgid);
9077 dout(10) << " don't have pg " << m->pgid << dendl;
9081 pg->queue_peering_event(evt);
// Handle MRecoveryReserve: translate the message type (REQUEST / GRANT /
// RELEASE) into the matching recovery-reservation peering event, then queue
// it on the target PG — or park it in peering_wait_for_split if the PG is
// mid-split.  Structure mirrors handle_pg_backfill_reserve.
9085 void OSD::handle_pg_recovery_reserve(OpRequestRef op)
9087 const MRecoveryReserve *m = static_cast<const MRecoveryReserve*>(op->get_req());
9088 assert(m->get_type() == MSG_OSD_RECOVERY_RESERVE);
9090 if (!require_osd_peer(op->get_req()))
9092 if (!require_same_or_newer_map(op, m->query_epoch, false))
9095 PG::CephPeeringEvtRef evt;
9096 if (m->type == MRecoveryReserve::REQUEST) {
9097 evt = PG::CephPeeringEvtRef(
9098 new PG::CephPeeringEvt(
9101 PG::RequestRecovery()));
9102 } else if (m->type == MRecoveryReserve::GRANT) {
9103 evt = PG::CephPeeringEvtRef(
9104 new PG::CephPeeringEvt(
9107 PG::RemoteRecoveryReserved()));
9108 } else if (m->type == MRecoveryReserve::RELEASE) {
9109 evt = PG::CephPeeringEvtRef(
9110 new PG::CephPeeringEvt(
9113 PG::RecoveryDone()));
// Defer the event if the PG is splitting.
9118 if (service.splitting(m->pgid)) {
9119 peering_wait_for_split[m->pgid].push_back(evt);
9123 PG *pg = _lookup_lock_pg(m->pgid);
9125 dout(10) << " don't have pg " << m->pgid << dendl;
9129 pg->queue_peering_event(evt);
// Handle MOSDForceRecovery: resolve each requested pg to the local primary
// shard, collect the ones we actually host (under the pg_map read lock),
// and forward them with the requested flags to adjust_pg_priorities.
9133 void OSD::handle_force_recovery(Message *m)
9135 MOSDForceRecovery *msg = static_cast<MOSDForceRecovery*>(m);
9136 assert(msg->get_type() == MSG_OSD_FORCE_RECOVERY);
9138 vector<PGRef> local_pgs;
9139 local_pgs.reserve(msg->forced_pgs.size());
9142 RWLock::RLocker l(pg_map_lock);
9143 for (auto& i : msg->forced_pgs) {
9145 if (osdmap->get_primary_shard(i, &locpg)) {
9146 auto pg_map_entry = pg_map.find(locpg);
9147 if (pg_map_entry != pg_map.end()) {
9148 local_pgs.push_back(pg_map_entry->second);
9154 if (local_pgs.size()) {
9155 service.adjust_pg_priorities(local_pgs, msg->options);
9162 * from primary to replica | stray
9163 * NOTE: called with opqueue active.
// Handle MOSDPGQuery from a primary.  For each queried pg: defer if
// splitting; if we have the PG, queue an MQuery peering event on it;
// otherwise answer on the PG's behalf — either an empty MOSDPGLog (for
// LOG/FULLLOG queries) or an empty notify — after projecting the history
// to confirm the query is not from a stale interval.
9165 void OSD::handle_pg_query(OpRequestRef op)
9167 assert(osd_lock.is_locked());
9169 const MOSDPGQuery *m = static_cast<const MOSDPGQuery*>(op->get_req());
9170 assert(m->get_type() == MSG_OSD_PG_QUERY);
9172 if (!require_osd_peer(op->get_req()))
9175 dout(7) << "handle_pg_query from " << m->get_source() << " epoch " << m->get_epoch() << dendl;
9176 int from = m->get_source().num();
9178 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9183 map< int, vector<pair<pg_notify_t, PastIntervals> > > notify_list;
9185 for (auto it = m->pg_list.begin();
9186 it != m->pg_list.end();
9188 spg_t pgid = it->first;
9190 if (pgid.preferred() >= 0) {
9191 dout(10) << "ignoring localized pg " << pgid << dendl;
// Mid-split: park the query until the split completes.
9195 if (service.splitting(pgid)) {
9196 peering_wait_for_split[pgid].push_back(
9197 PG::CephPeeringEvtRef(
9198 new PG::CephPeeringEvt(
9199 it->second.epoch_sent, it->second.epoch_sent,
9200 PG::MQuery(pg_shard_t(from, it->second.from),
9201 it->second, it->second.epoch_sent))));
// We host the PG: deliver the query as a peering event.
9206 RWLock::RLocker l(pg_map_lock);
9207 if (pg_map.count(pgid)) {
9209 pg = _lookup_lock_pg_with_map_lock_held(pgid);
9211 it->second.epoch_sent, it->second.epoch_sent,
9212 pg_shard_t(from, it->second.from), it->second);
// From here on the PG does not exist locally.
9218 if (!osdmap->have_pg_pool(pgid.pool()))
9221 // get active crush mapping
9222 int up_primary, acting_primary;
9223 vector<int> up, acting;
9224 osdmap->pg_to_up_acting_osds(
9225 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
// Project the sender's history forward; a query from before the current
// interval is obsolete and dropped.
9228 pg_history_t history = it->second.history;
9229 bool valid_history = project_pg_history(
9230 pgid, history, it->second.epoch_sent,
9231 up, up_primary, acting, acting_primary);
9233 if (!valid_history ||
9234 it->second.epoch_sent < history.same_interval_since) {
9235 dout(10) << " pg " << pgid << " dne, and pg has changed in "
9236 << history.same_interval_since
9237 << " (msg from " << it->second.epoch_sent << ")" << dendl;
9241 dout(10) << " pg " << pgid << " dne" << dendl;
9242 pg_info_t empty(spg_t(pgid.pgid, it->second.to));
9243 /* This is racy, but that should be ok: if we complete the deletion
9244 * before the pg is recreated, we'll just start it off backfilling
9245 * instead of just empty */
9246 if (service.deleting_pgs.lookup(pgid))
9247 empty.set_last_backfill(hobject_t());
9248 if (it->second.type == pg_query_t::LOG ||
9249 it->second.type == pg_query_t::FULLLOG) {
9250 ConnectionRef con = service.get_con_osd_cluster(from, osdmap->get_epoch());
9252 MOSDPGLog *mlog = new MOSDPGLog(
9253 it->second.from, it->second.to,
9254 osdmap->get_epoch(), empty,
9255 it->second.epoch_sent);
9256 service.share_map_peer(from, con.get(), osdmap);
9257 con->send_message(mlog);
// Non-log queries get an empty notify, batched below.
9260 notify_list[from].push_back(
9263 it->second.from, it->second.to,
9264 it->second.epoch_sent,
9265 osdmap->get_epoch(),
9268 osdmap->get_pools().at(pgid.pool()).ec_pool(),
9272 do_notifies(notify_list, osdmap);
// Handle MOSDPGRemove: for each pg, under the pg_map write lock, verify
// via projected history that the interval has not changed since the
// primary sent the remove (and that the sender really is our primary),
// then delete the PG; otherwise ignore the stale request.
9278 void OSD::handle_pg_remove(OpRequestRef op)
9280 const MOSDPGRemove *m = static_cast<const MOSDPGRemove *>(op->get_req());
9281 assert(m->get_type() == MSG_OSD_PG_REMOVE);
9282 assert(osd_lock.is_locked());
9284 if (!require_osd_peer(op->get_req()))
9287 dout(7) << "handle_pg_remove from " << m->get_source() << " on "
9288 << m->pg_list.size() << " pgs" << dendl;
9290 if (!require_same_or_newer_map(op, m->get_epoch(), false))
9295 for (auto it = m->pg_list.begin();
9296 it != m->pg_list.end();
9299 if (pgid.preferred() >= 0) {
9300 dout(10) << "ignoring localized pg " << pgid << dendl;
9304 RWLock::WLocker l(pg_map_lock);
9305 if (pg_map.count(pgid) == 0) {
9306 dout(10) << " don't have pg " << pgid << dendl;
9309 dout(5) << "queue_pg_for_deletion: " << pgid << dendl;
9310 PG *pg = _lookup_lock_pg_with_map_lock_held(pgid);
9311 pg_history_t history = pg->info.history;
9312 int up_primary, acting_primary;
9313 vector<int> up, acting;
9314 osdmap->pg_to_up_acting_osds(
9315 pgid.pgid, &up, &up_primary, &acting, &acting_primary);
9316 bool valid_history = project_pg_history(
9317 pg->info.pgid, history, pg->get_osdmap()->get_epoch(),
9318 up, up_primary, acting, acting_primary);
// Only honor the remove if the interval is unchanged since the sender's
// epoch and the sender is still this PG's primary.
9319 if (valid_history &&
9320 history.same_interval_since <= m->get_epoch()) {
9321 assert(pg->get_primary().osd == m->get_source().num());
9326 dout(10) << *pg << " ignoring remove request, pg changed in epoch "
9327 << history.same_interval_since
9328 << " > " << m->get_epoch() << dendl;
// Tear down a PG locally: run on_removal into a transaction (must happen
// together with the pg_map erase under the pg lock — see comment below),
// cancel pending child splits, queue the removal transaction on the PG's
// sequencer, hand the PG to remove_wq for background deletion, and drop
// the pg_map reference.
9332 void OSD::_remove_pg(PG *pg)
9334 ObjectStore::Transaction rmt ;
9336 // on_removal, which calls remove_watchers_and_notifies, and the erasure from
9337 // the pg_map must be done together without unlocking the pg lock,
9338 // to avoid racing with watcher cleanup in ms_handle_reset
9339 // and handle_notify_timeout
9340 pg->on_removal(&rmt);
9342 service.cancel_pending_splits_for_parent(pg->info.pgid);
// The ContainerContext holds the SequencerRef alive until the txn completes.
9343 int tr = store->queue_transaction(
9344 pg->osr.get(), std::move(rmt), NULL,
9345 new ContainerContext<
9346 SequencerRef>(pg->osr));
9349 DeletingStateRef deleting = service.deleting_pgs.lookup_or_create(
9355 remove_wq.queue(make_pair(PGRef(pg), deleting));
9357 service.pg_remove_epoch(pg->info.pgid);
9359 // dereference from op_wq
9360 op_shardedwq.clear_pg_pointer(pg->info.pgid);
9363 pg_map.erase(pg->info.pgid);
9364 pg->put("PGMap"); // since we've taken it out of map
9367 // =========================================================
// Drain awaiting_throttle while _recover_now permits: start up to
// osd_recovery_max_single_start pushes (capped by available_pushes) per
// queued PG and account them in recovery_ops_reserved.  Caller must hold
// recovery_lock (asserted).
9370 void OSDService::_maybe_queue_recovery() {
9371 assert(recovery_lock.is_locked_by_me());
9372 uint64_t available_pushes;
9373 while (!awaiting_throttle.empty() &&
9374 _recover_now(&available_pushes)) {
9375 uint64_t to_start = MIN(
9377 cct->_conf->osd_recovery_max_single_start);
9378 _queue_for_recovery(awaiting_throttle.front(), to_start);
9379 awaiting_throttle.pop_front();
9380 recovery_ops_reserved += to_start;
// Decide whether recovery work may start now.  Returns false while
// recovery is deferred (defer_recovery_until), paused, or the
// osd_recovery_max_active budget is consumed by active + reserved ops.
// On success, *available_pushes (if non-null) receives the remaining
// budget; it is zeroed up front so failure paths report 0.
9384 bool OSDService::_recover_now(uint64_t *available_pushes)
9386 if (available_pushes)
9387 *available_pushes = 0;
9389 if (ceph_clock_now() < defer_recovery_until) {
9390 dout(15) << __func__ << " defer until " << defer_recovery_until << dendl;
9394 if (recovery_paused) {
9395 dout(15) << __func__ << " paused" << dendl;
9399 uint64_t max = cct->_conf->osd_recovery_max_active;
9400 if (max <= recovery_ops_active + recovery_ops_reserved) {
9401 dout(15) << __func__ << " active " << recovery_ops_active
9402 << " + reserved " << recovery_ops_reserved
9403 << " >= max " << max << dendl;
9407 if (available_pushes)
9408 *available_pushes = max - recovery_ops_active - recovery_ops_reserved;
// Apply force-recovery/backfill flags to a set of PGs.  newflags selects
// FORCED_BACKFILL or FORCED_RECOVERY (backfill wins if both are set);
// OFR_CANCEL clears the forced state instead.  Forcing is only applied to
// PGs already in a compatible degraded/waiting/active state so the flag
// can't get stuck forever (see comment below).
9414 void OSDService::adjust_pg_priorities(const vector<PGRef>& pgs, int newflags)
9416 if (!pgs.size() || !(newflags & (OFR_BACKFILL | OFR_RECOVERY)))
9420 if (newflags & OFR_BACKFILL) {
9421 newstate = PG_STATE_FORCED_BACKFILL;
9422 } else if (newflags & OFR_RECOVERY) {
9423 newstate = PG_STATE_FORCED_RECOVERY;
9426 // debug output here may get large, don't generate it if debug level is below
9427 // 10 and use abbreviated pg ids otherwise
9428 if ((cct)->_conf->subsys.should_gather(ceph_subsys_osd, 10)) {
9431 for (auto& i : pgs) {
9432 ss << i->get_pgid() << " ";
9435 dout(10) << __func__ << " working on " << ss.str() << dendl;
// Cancel path: unconditionally clear the forced mode on every PG.
9438 if (newflags & OFR_CANCEL) {
9439 for (auto& i : pgs) {
9441 i->_change_recovery_force_mode(newstate, true);
9445 for (auto& i : pgs) {
9446 // make sure the PG is in correct state before forcing backfill or recovery, or
9447 // else we'll make PG keeping FORCE_* flag forever, requiring osds restart
9448 // or forcing somehow recovery/backfill.
9450 int pgstate = i->get_state();
9451 if ( ((newstate == PG_STATE_FORCED_RECOVERY) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_RECOVERY_WAIT | PG_STATE_RECOVERING))) ||
9452 ((newstate == PG_STATE_FORCED_BACKFILL) && (pgstate & (PG_STATE_DEGRADED | PG_STATE_BACKFILL_WAIT | PG_STATE_BACKFILLING))) )
9453 i->_change_recovery_force_mode(newstate, false);
// Run up to `reserved_pushes` recovery operations on one PG.
//
// Called from a worker thread with the recovery item dequeued for `pg` at
// epoch `queued`.  Honors the osd_recovery_sleep throttle by re-queuing the
// work via a timer instead of running immediately, skips stale work if the
// PG was reset since it was queued, and — when no progress could be made —
// either defers recovery/backfill via a peering event or re-queues the PG.
// Always releases the reserved push slots before returning.
9459 void OSD::do_recovery(
9460 PG *pg, epoch_t queued, uint64_t reserved_pushes,
9461 ThreadPool::TPHandle &handle)
9463 uint64_t started = 0;
9466 * When the value of osd_recovery_sleep is set greater than zero, recovery
9467 * ops are scheduled after osd_recovery_sleep amount of time from the previous
9468 * recovery event's schedule time. This is done by adding a
9469 * recovery_requeue_callback event, which re-queues the recovery op using
9470 * queue_recovery_after_sleep.
9472 float recovery_sleep = get_osd_recovery_sleep();
9473 if (recovery_sleep > 0 && service.recovery_needs_sleep) {
// Timer callback: clear the sleep flag and put the recovery work back on
// the queue.  pgref keeps the PG alive until the callback fires.
9475 auto recovery_requeue_callback = new FunctionContext([this, pgref, queued, reserved_pushes](int r) {
9476 dout(20) << "do_recovery wake up at "
9478 << ", re-queuing recovery" << dendl;
9479 service.recovery_needs_sleep = false;
9480 service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
// recovery_sleep_lock protects recovery_schedule_time and the timer.
9482 Mutex::Locker l(service.recovery_sleep_lock);
9484 // This is true for the first recovery op and when the previous recovery op
9485 // has been scheduled in the past. The next recovery op is scheduled after
9486 // completing the sleep from now.
9487 if (service.recovery_schedule_time < ceph_clock_now()) {
9488 service.recovery_schedule_time = ceph_clock_now();
9490 service.recovery_schedule_time += recovery_sleep;
9491 service.recovery_sleep_timer.add_event_at(service.recovery_schedule_time,
9492 recovery_requeue_callback);
9493 dout(20) << "Recovery event scheduled at "
9494 << service.recovery_schedule_time << dendl;
// From here on we are actually doing recovery work; the next op must
// sleep again.
9499 service.recovery_needs_sleep = true;
// The queued item is stale if the PG went through an interval change
// since it was enqueued.
9500 if (pg->pg_has_reset_since(queued)) {
9504 assert(!pg->deleting);
9505 assert(pg->is_peered() && pg->is_primary());
9507 assert(pg->recovery_queued);
9508 pg->recovery_queued = false;
9510 dout(10) << "do_recovery starting " << reserved_pushes << " " << *pg << dendl;
9511 #ifdef DEBUG_RECOVERY_OIDS
9512 dout(20) << " active was " << service.recovery_oids[pg->info.pgid] << dendl;
// `more` == true means the PG still has recovery ops to do;
// `started` counts ops actually begun this round.
9515 bool more = pg->start_recovery_ops(reserved_pushes, handle, &started);
9516 dout(10) << "do_recovery started " << started << "/" << reserved_pushes
9517 << " on " << *pg << dendl;
9519 // If no recovery op is started, don't bother to manipulate the RecoveryCtx
9520 if (!started && (more || !pg->have_unfound())) {
9524 PG::RecoveryCtx rctx = create_context();
9525 rctx.handle = &handle;
9528 * if we couldn't start any recovery ops and things are still
9529 * unfound, see if we can discover more missing object locations.
9530 * It may be that our initial locations were bad and we errored
9531 * out while trying to pull.
9533 if (!more && pg->have_unfound()) {
9534 pg->discover_all_missing(*rctx.query_map);
// No peers left to query: give up for now and defer via a peering
// event (backfill or recovery, whichever state the PG is in).
9535 if (rctx.query_map->empty()) {
9537 if (pg->state_test(PG_STATE_BACKFILLING)) {
9538 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9541 PG::DeferBackfill(cct->_conf->osd_recovery_retry_interval)));
9542 pg->queue_peering_event(evt);
9543 action = "in backfill";
9544 } else if (pg->state_test(PG_STATE_RECOVERING)) {
9545 auto evt = PG::CephPeeringEvtRef(new PG::CephPeeringEvt(
9548 PG::DeferRecovery(cct->_conf->osd_recovery_retry_interval)));
9549 pg->queue_peering_event(evt);
9550 action = "in recovery";
9552 action = "already out of recovery/backfill";
9554 dout(10) << __func__ << ": no luck, giving up on this pg for now (" << action << ")" << dendl;
9556 dout(10) << __func__ << ": no luck, giving up on this pg for now (queue_recovery)" << dendl;
9557 pg->queue_recovery();
// Persist any PG state changes and dispatch the accumulated context
// (queries, notifies, transaction).
9561 pg->write_if_dirty(*rctx.transaction);
9562 OSDMapRef curmap = pg->get_osdmap();
9563 dispatch_context(rctx, pg, curmap);
// Return the unused push reservations so other PGs can recover.
9567 assert(started <= reserved_pushes);
9568 service.release_reserved_pushes(reserved_pushes);
// Account the start of one recovery op for `soid` on `pg`: bump the
// global active-op counter (bounded elsewhere by osd_recovery_max_active)
// and, in debug builds, record the oid for double-start detection.
9571 void OSDService::start_recovery_op(PG *pg, const hobject_t& soid)
// recovery_lock serializes recovery_ops_active and recovery_oids.
9573 Mutex::Locker l(recovery_lock);
9574 dout(10) << "start_recovery_op " << *pg << " " << soid
9575 << " (" << recovery_ops_active << "/"
9576 << cct->_conf->osd_recovery_max_active << " rops)"
9578 recovery_ops_active++;
9580 #ifdef DEBUG_RECOVERY_OIDS
9581 dout(20) << " active was " << recovery_oids[pg->info.pgid] << dendl;
// The same object must not have two concurrent recovery ops.
9582 assert(recovery_oids[pg->info.pgid].count(soid) == 0);
9583 recovery_oids[pg->info.pgid].insert(soid);
// Account the completion of one recovery op for `soid` on `pg`:
// decrement the active-op counter, drop the debug oid record, and
// possibly kick the recovery queue now that a slot freed up.
// NOTE(review): the `dequeue` flag is only logged in the visible code —
// confirm its use against the elided lines.
9587 void OSDService::finish_recovery_op(PG *pg, const hobject_t& soid, bool dequeue)
9589 Mutex::Locker l(recovery_lock);
9590 dout(10) << "finish_recovery_op " << *pg << " " << soid
9591 << " dequeue=" << dequeue
9592 << " (" << recovery_ops_active << "/" << cct->_conf->osd_recovery_max_active << " rops)"
// Must pair with a prior start_recovery_op().
9596 assert(recovery_ops_active > 0);
9597 recovery_ops_active--;
9599 #ifdef DEBUG_RECOVERY_OIDS
9600 dout(20) << " active oids was " << recovery_oids[pg->info.pgid] << dendl;
9601 assert(recovery_oids[pg->info.pgid].count(soid));
9602 recovery_oids[pg->info.pgid].erase(soid);
// A slot just freed up; see if more recovery work can be scheduled.
9605 _maybe_queue_recovery();
// True if any backfill reservation is currently held, either as the
// primary (local_reserver) or on behalf of a remote primary
// (remote_reserver).
9608 bool OSDService::is_recovery_active()
9610 return local_reserver.has_reservation() || remote_reserver.has_reservation();
9613 // =========================================================
// Decide whether a client op can simply be dropped: if the client's
// connection is gone it can never receive a reply, so there is no point
// processing the request.
9616 bool OSD::op_is_discardable(const MOSDOp *op)
9618 // drop client request if they are not connected and can't get the
9620 if (!op->get_connection()->is_connected()) {
// Place a client/peer op on the sharded op work queue for PG `pg`,
// recording queueing latency and tracing info on the way in.
9626 void OSD::enqueue_op(spg_t pg, OpRequestRef& op, epoch_t epoch)
// Latency so far: time since the message was received off the wire.
9628 utime_t latency = ceph_clock_now() - op->get_req()->get_recv_stamp();
9629 dout(15) << "enqueue_op " << op << " prio " << op->get_req()->get_priority()
9630 << " cost " << op->get_req()->get_cost()
9631 << " latency " << latency
9632 << " epoch " << epoch
9633 << " " << *(op->get_req()) << dendl;
9634 op->osd_trace.event("enqueue op");
9635 op->osd_trace.keyval("priority", op->get_req()->get_priority());
9636 op->osd_trace.keyval("cost", op->get_req()->get_cost());
9637 op->mark_queued_for_pg();
9638 logger->tinc(l_osd_op_before_queue_op_lat, latency);
// The op travels with the epoch it was queued at so stale items can be
// detected at dequeue time.
9639 op_shardedwq.queue(make_pair(pg, PGQueueable(op, epoch)));
// Hand one queued op to its PG for processing.
9645 * NOTE: dequeue called in worker thread, with pg lock
9647 void OSD::dequeue_op(
9648 PGRef pg, OpRequestRef op,
9649 ThreadPool::TPHandle &handle)
9652 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_BEGIN", false);
9654 utime_t now = ceph_clock_now();
9655 op->set_dequeued_time(now);
// Total time the op spent between receive and dequeue.
9656 utime_t latency = now - op->get_req()->get_recv_stamp();
9657 dout(10) << "dequeue_op " << op << " prio " << op->get_req()->get_priority()
9658 << " cost " << op->get_req()->get_cost()
9659 << " latency " << latency
9660 << " " << *(op->get_req())
9661 << " pg " << *pg << dendl;
9663 logger->tinc(l_osd_op_before_dequeue_op_lat, latency);
// Give the client a newer osdmap if its map epoch lags ours
// (maybe_share_map checks the session's last-sent epoch).
9665 Session *session = static_cast<Session *>(
9666 op->get_req()->get_connection()->get_priv());
9668 maybe_share_map(session, op, pg->get_osdmap());
9675 op->mark_reached_pg();
9676 op->osd_trace.event("dequeue_op");
// Actual op execution happens inside the PG.
9678 pg->do_request(op, handle);
9681 dout(10) << "dequeue_op " << op << " finish" << dendl;
9682 OID_EVENT_TRACE_WITH_MSG(op->get_req(), "DEQUEUE_OP_END", false);
// Completion callback that registers newly-split child PGs with the OSD:
// installs each new PG in the pg map, marks its split complete, requeues
// any ops that were waiting for it, and dispatches the resulting
// recovery context.  Runs under osd_lock.
9686 struct C_CompleteSplits : public Context {
9689 C_CompleteSplits(OSD *osd, const set<PGRef> &in)
9690 : osd(osd), pgs(in) {}
9691 void finish(int r) override {
9692 Mutex::Locker l(osd->osd_lock);
// Bail out early if the OSD is shutting down.
9693 if (osd->is_stopping())
9695 PG::RecoveryCtx rctx = osd->create_context();
9696 for (set<PGRef>::iterator i = pgs.begin();
// pg_map is modified below, so take the write side of the lock.
9699 osd->pg_map_lock.get_write();
9702 osd->add_newly_split_pg(pg, &rctx);
// Deleting PGs must not be reported as completed splits.
9703 if (!((*i)->deleting)) {
9704 set<spg_t> to_complete;
9705 to_complete.insert((*i)->info.pgid);
9706 osd->service.complete_split(to_complete);
9708 osd->pg_map_lock.put_write();
9709 osd->dispatch_context_transaction(rctx, pg);
// Wake any ops queued against the child PG's id before it existed.
9710 osd->wake_pg_waiters(*i);
9714 osd->dispatch_context(rctx, 0, osd->service.get_osdmap());
// Drain one peering event from each PG in the batch (called by the
// peering work queue).  Advances each PG to the current osdmap first;
// accumulates need_up_thru / same_interval_since across the batch and
// dispatches the shared RecoveryCtx at the end.
9718 void OSD::process_peering_events(
9719 const list<PG*> &pgs,
9720 ThreadPool::TPHandle &handle
9723 bool need_up_thru = false;
9724 epoch_t same_interval_since = 0;
9726 PG::RecoveryCtx rctx = create_context();
9727 rctx.handle = &handle;
9728 for (list<PG*>::const_iterator i = pgs.begin();
9731 set<PGRef> split_pgs;
// Suspend the heartbeat timeout while waiting for the PG lock so a
// slow lock acquisition doesn't trip the thread-pool watchdog.
9733 pg->lock_suspend_timeout(handle);
9734 curmap = service.get_osdmap();
// advance_pg catches the PG up to curmap, collecting any split
// children in split_pgs; a false return means it could not finish.
9739 if (!advance_pg(curmap->get_epoch(), pg, handle, &rctx, &split_pgs)) {
9740 // we need to requeue the PG explicitly since we didn't actually
9742 peering_wq.queue(pg);
// Process exactly one queued peering event for this PG per pass.
9744 assert(!pg->peering_queue.empty());
9745 PG::CephPeeringEvtRef evt = pg->peering_queue.front();
9746 pg->peering_queue.pop_front();
9747 pg->handle_peering_event(evt, &rctx);
9749 need_up_thru = pg->need_up_thru || need_up_thru;
9750 same_interval_since = MAX(pg->info.history.same_interval_since,
9751 same_interval_since);
9752 pg->write_if_dirty(*rctx.transaction);
// Completing splits requires osd_lock, which we don't hold here, so
// defer it to a callback run when the transaction applies.
9753 if (!split_pgs.empty()) {
9754 rctx.on_applied->add(new C_CompleteSplits(this, split_pgs));
9757 dispatch_context_transaction(rctx, pg, &handle);
9761 queue_want_up_thru(same_interval_since);
9762 dispatch_context(rctx, 0, curmap, &handle);
// Flush any pg_temp requests generated during peering to the monitor.
9764 service.send_pg_temp();
9767 // --------------------------------
// md_config_obs_t interface: list the config options this observer wants
// change notifications for.  Keep in sync with handle_conf_change() below
// — an option handled there but missing here will never be delivered.
9769 const char** OSD::get_tracked_conf_keys() const
9771 static const char* KEYS[] = {
9772 "osd_max_backfills",
9773 "osd_min_recovery_priority",
9774 "osd_max_trimming_pgs",
9775 "osd_op_complaint_time",
9776 "osd_op_log_threshold",
9777 "osd_op_history_size",
9778 "osd_op_history_duration",
9779 "osd_op_history_slow_op_size",
9780 "osd_op_history_slow_op_threshold",
9781 "osd_enable_op_tracker",
9782 "osd_map_cache_size",
9783 "osd_map_max_advance",
9784 "osd_pg_epoch_persisted_max_stale",
9785 "osd_disk_thread_ioprio_class",
9786 "osd_disk_thread_ioprio_priority",
9787 // clog & admin clog
9790 "clog_to_syslog_facility",
9791 "clog_to_syslog_level",
9792 "osd_objectstore_fuse",
9794 "clog_to_graylog_host",
9795 "clog_to_graylog_port",
9798 "osd_recovery_delay_start",
9799 "osd_client_message_size_cap",
9800 "osd_client_message_cap",
9801 "osd_heartbeat_min_size",
9802 "osd_heartbeat_interval",
// md_config_obs_t interface: react to runtime changes of the options
// listed in get_tracked_conf_keys().  Each branch pushes the new value
// into the subsystem that caches it (reservers, op tracker, map caches,
// clog, messenger throttles, ...).
9808 void OSD::handle_conf_change(const struct md_config_t *conf,
9809 const std::set <std::string> &changed)
// Backfill concurrency limits (both as primary and as replica).
9811 if (changed.count("osd_max_backfills")) {
9812 service.local_reserver.set_max(cct->_conf->osd_max_backfills);
9813 service.remote_reserver.set_max(cct->_conf->osd_max_backfills);
9815 if (changed.count("osd_min_recovery_priority")) {
9816 service.local_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9817 service.remote_reserver.set_min_priority(cct->_conf->osd_min_recovery_priority);
9819 if (changed.count("osd_max_trimming_pgs")) {
9820 service.snap_reserver.set_max(cct->_conf->osd_max_trimming_pgs);
// Op tracker tuning: slow-op complaints and history retention.
9822 if (changed.count("osd_op_complaint_time") ||
9823 changed.count("osd_op_log_threshold")) {
9824 op_tracker.set_complaint_and_threshold(cct->_conf->osd_op_complaint_time,
9825 cct->_conf->osd_op_log_threshold);
9827 if (changed.count("osd_op_history_size") ||
9828 changed.count("osd_op_history_duration")) {
9829 op_tracker.set_history_size_and_duration(cct->_conf->osd_op_history_size,
9830 cct->_conf->osd_op_history_duration);
9832 if (changed.count("osd_op_history_slow_op_size") ||
9833 changed.count("osd_op_history_slow_op_threshold")) {
9834 op_tracker.set_history_slow_op_size_and_threshold(cct->_conf->osd_op_history_slow_op_size,
9835 cct->_conf->osd_op_history_slow_op_threshold);
9837 if (changed.count("osd_enable_op_tracker")) {
9838 op_tracker.set_tracking(cct->_conf->osd_enable_op_tracker);
9840 if (changed.count("osd_disk_thread_ioprio_class") ||
9841 changed.count("osd_disk_thread_ioprio_priority")) {
9842 set_disk_tp_priority();
// One knob resizes all three osdmap caches (full maps, full-map
// buffers, incremental buffers).
9844 if (changed.count("osd_map_cache_size")) {
9845 service.map_cache.set_size(cct->_conf->osd_map_cache_size);
9846 service.map_bl_cache.set_size(cct->_conf->osd_map_cache_size);
9847 service.map_bl_inc_cache.set_size(cct->_conf->osd_map_cache_size);
// Any cluster-log routing option funnels into one full reconfigure.
9849 if (changed.count("clog_to_monitors") ||
9850 changed.count("clog_to_syslog") ||
9851 changed.count("clog_to_syslog_level") ||
9852 changed.count("clog_to_syslog_facility") ||
9853 changed.count("clog_to_graylog") ||
9854 changed.count("clog_to_graylog_host") ||
9855 changed.count("clog_to_graylog_port") ||
9856 changed.count("host") ||
9857 changed.count("fsid")) {
9858 update_log_config();
9862 if (changed.count("osd_objectstore_fuse")) {
// `false` = not at startup; re-evaluates whether the FUSE mount of
// the objectstore should be up.
9864 enable_disable_fuse(false);
9869 if (changed.count("osd_recovery_delay_start")) {
9870 service.defer_recovery(cct->_conf->osd_recovery_delay_start);
9871 service.kick_recovery_queue();
// Client message throttles: only apply positive caps to an existing
// throttler; confirm behavior for newval == 0 against the elided code.
9874 if (changed.count("osd_client_message_cap")) {
9875 uint64_t newval = cct->_conf->osd_client_message_cap;
9876 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9877 if (pol.throttler_messages && newval > 0) {
9878 pol.throttler_messages->reset_max(newval);
9881 if (changed.count("osd_client_message_size_cap")) {
9882 uint64_t newval = cct->_conf->osd_client_message_size_cap;
9883 Messenger::Policy pol = client_messenger->get_policy(entity_name_t::TYPE_CLIENT);
9884 if (pol.throttler_bytes && newval > 0) {
9885 pol.throttler_bytes->reset_max(newval);
// Re-parse the cluster-log ("clog") routing options from the current
// config and push them into the log client: which channels go to the
// monitors, to syslog, and to graylog, with what priorities.
9892 void OSD::update_log_config()
9894 map<string,string> log_to_monitors;
9895 map<string,string> log_to_syslog;
9896 map<string,string> log_channel;
9897 map<string,string> log_prio;
9898 map<string,string> log_to_graylog;
9899 map<string,string> log_to_graylog_host;
9900 map<string,string> log_to_graylog_port;
// Only apply the new routing if parsing succeeded.
9904 if (parse_log_client_options(cct, log_to_monitors, log_to_syslog,
9905 log_channel, log_prio, log_to_graylog,
9906 log_to_graylog_host, log_to_graylog_port,
9908 clog->update_config(log_to_monitors, log_to_syslog,
9909 log_channel, log_prio, log_to_graylog,
9910 log_to_graylog_host, log_to_graylog_port,
9912 derr << "log_to_monitors " << log_to_monitors << dendl;
// Sanity-check related config values and emit cluster-log warnings for
// inconsistent combinations.  Warn-only: nothing is corrected here.
9915 void OSD::check_config()
9917 // some sanity checks
// The osdmap cache must be large enough to cover how far a PG may be
// advanced in one go, plus slack.
9918 if (cct->_conf->osd_map_cache_size <= cct->_conf->osd_map_max_advance + 2) {
9919 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9920 << " is not > osd_map_max_advance ("
9921 << cct->_conf->osd_map_max_advance << ")";
// Likewise it must exceed how stale a PG's persisted epoch may be.
9923 if (cct->_conf->osd_map_cache_size <= (int)cct->_conf->osd_pg_epoch_persisted_max_stale + 2) {
9924 clog->warn() << "osd_map_cache_size (" << cct->_conf->osd_map_cache_size << ")"
9925 << " is not > osd_pg_epoch_persisted_max_stale ("
9926 << cct->_conf->osd_pg_epoch_persisted_max_stale << ")";
// Apply the configured I/O priority (class + level) to the disk thread
// pool.  No-op when the class is unset or the priority is negative;
// logs an error if the class string is not one of idle/be/rt.
9930 void OSD::set_disk_tp_priority()
9932 dout(10) << __func__
9933 << " class " << cct->_conf->osd_disk_thread_ioprio_class
9934 << " priority " << cct->_conf->osd_disk_thread_ioprio_priority
9936 if (cct->_conf->osd_disk_thread_ioprio_class.empty() ||
9937 cct->_conf->osd_disk_thread_ioprio_priority < 0)
// Translate the textual class into the ioprio class constant; a
// negative result signals an unrecognized class name.
9940 ceph_ioprio_string_to_class(cct->_conf->osd_disk_thread_ioprio_class);
9942 derr << __func__ << cpp_strerror(cls) << ": "
9943 << "osd_disk_thread_ioprio_class is " << cct->_conf->osd_disk_thread_ioprio_class
9944 << " but only the following values are allowed: idle, be or rt" << dendl;
9946 disk_tp.set_ioprio(cls, cct->_conf->osd_disk_thread_ioprio_priority);
9949 // --------------------------------
// Block until the objecter has fetched the latest osdmap from the
// monitors (used by the admin-socket command of the same name).
9951 void OSD::get_latest_osdmap()
9953 dout(10) << __func__ << " -- start" << dendl;
9956 service.objecter->wait_for_latest_osdmap(&cond);
9959 dout(10) << __func__ << " -- finish" << dendl;
9962 // --------------------------------
// Classify an incoming MOSDOp: walk its sub-ops and set the request's
// rmw flags (read / write / cache / pg-op / class read-write, plus
// promotion and cache-handling hints) based on the op codes and any
// object-class methods invoked.  Returns 0 on success, an error code
// (e.g. for an unknown class/method) otherwise.
9964 int OSD::init_op_flags(OpRequestRef& op)
9966 const MOSDOp *m = static_cast<const MOSDOp*>(op->get_req());
9967 vector<OSDOp>::const_iterator iter;
9969 // client flags have no bearing on whether an op is a read, write, etc.
9972 if (m->has_flag(CEPH_OSD_FLAG_RWORDERED)) {
9973 op->set_force_rwordered();
9976 // set bits based on op codes, called methods.
9977 for (iter = m->ops.begin(); iter != m->ops.end(); ++iter) {
9978 if ((iter->op.op == CEPH_OSD_OP_WATCH &&
9979 iter->op.watch.op == CEPH_OSD_WATCH_OP_PING)) {
9980 /* This a bit odd. PING isn't actually a write. It can't
9981 * result in an update to the object_info. PINGs also aren'ty
9982 * resent, so there's no reason to write out a log entry
9984 * However, we pipeline them behind writes, so let's force
9985 * the write_ordered flag.
9987 op->set_force_rwordered();
9989 if (ceph_osd_op_mode_modify(iter->op.op))
9992 if (ceph_osd_op_mode_read(iter->op.op))
9995 // set READ flag if there are src_oids
9996 if (iter->soid.oid.name.length())
9999 // set PGOP flag if there are PG ops
10000 if (ceph_osd_op_type_pg(iter->op.op))
10003 if (ceph_osd_op_mode_cache(iter->op.op))
10006 // check for ec base pool
10007 int64_t poolid = m->get_pg().pool();
10008 const pg_pool_t *pool = osdmap->get_pg_pool(poolid);
10009 if (pool && pool->is_tier()) {
10010 const pg_pool_t *base_pool = osdmap->get_pg_pool(pool->tier_of);
// An erasure-coded (require_rollback) base pool only supports a
// whitelist of ops; anything else must be rejected up front.
10011 if (base_pool && base_pool->require_rollback()) {
10012 if ((iter->op.op != CEPH_OSD_OP_READ) &&
10013 (iter->op.op != CEPH_OSD_OP_CHECKSUM) &&
10014 (iter->op.op != CEPH_OSD_OP_CMPEXT) &&
10015 (iter->op.op != CEPH_OSD_OP_STAT) &&
10016 (iter->op.op != CEPH_OSD_OP_ISDIRTY) &&
10017 (iter->op.op != CEPH_OSD_OP_UNDIRTY) &&
10018 (iter->op.op != CEPH_OSD_OP_GETXATTR) &&
10019 (iter->op.op != CEPH_OSD_OP_GETXATTRS) &&
10020 (iter->op.op != CEPH_OSD_OP_CMPXATTR) &&
10021 (iter->op.op != CEPH_OSD_OP_ASSERT_VER) &&
10022 (iter->op.op != CEPH_OSD_OP_LIST_WATCHERS) &&
10023 (iter->op.op != CEPH_OSD_OP_LIST_SNAPS) &&
10024 (iter->op.op != CEPH_OSD_OP_SETALLOCHINT) &&
10025 (iter->op.op != CEPH_OSD_OP_WRITEFULL) &&
10026 (iter->op.op != CEPH_OSD_OP_ROLLBACK) &&
10027 (iter->op.op != CEPH_OSD_OP_CREATE) &&
10028 (iter->op.op != CEPH_OSD_OP_DELETE) &&
10029 (iter->op.op != CEPH_OSD_OP_SETXATTR) &&
10030 (iter->op.op != CEPH_OSD_OP_RMXATTR) &&
10031 (iter->op.op != CEPH_OSD_OP_STARTSYNC) &&
10032 (iter->op.op != CEPH_OSD_OP_COPY_GET) &&
10033 (iter->op.op != CEPH_OSD_OP_COPY_FROM)) {
10039 switch (iter->op.op) {
10040 case CEPH_OSD_OP_CALL:
// Object-class call: pull class and method name out of the op's
// input data, then look the method up to learn its rd/wr flags.
10042 bufferlist::iterator bp = const_cast<bufferlist&>(iter->indata).begin();
10043 int is_write, is_read;
10044 string cname, mname;
10045 bp.copy(iter->op.cls.class_len, cname);
10046 bp.copy(iter->op.cls.method_len, mname);
10048 ClassHandler::ClassData *cls;
10049 int r = class_handler->open_class(cname, &cls);
10051 derr << "class " << cname << " open got " << cpp_strerror(r) << dendl;
10054 else if (r != -EPERM) // propagate permission errors
10058 int flags = cls->get_method_flags(mname.c_str());
10060 if (flags == -ENOENT)
10066 is_read = flags & CLS_METHOD_RD;
10067 is_write = flags & CLS_METHOD_WR;
10068 bool is_promote = flags & CLS_METHOD_PROMOTE;
10070 dout(10) << "class " << cname << " method " << mname << " "
10071 << "flags=" << (is_read ? "r" : "")
10072 << (is_write ? "w" : "")
10073 << (is_promote ? "p" : "")
10076 op->set_class_read();
10078 op->set_class_write();
10081 op->add_class(cname, is_read, is_write, cls->whitelisted);
10085 case CEPH_OSD_OP_WATCH:
10086 // force the read bit for watch since it is depends on previous
10087 // watch state (and may return early if the watch exists) or, in
10088 // the case of ping, is simply a read op.
10091 case CEPH_OSD_OP_NOTIFY:
10092 case CEPH_OSD_OP_NOTIFY_ACK:
10098 case CEPH_OSD_OP_DELETE:
10099 // if we get a delete with FAILOK we can skip handle cache. without
10100 // FAILOK we still need to promote (or do something smarter) to
10101 // determine whether to return ENOENT or 0.
10102 if (iter == m->ops.begin() &&
10103 iter->op.flags == CEPH_OSD_OP_FLAG_FAILOK) {
10104 op->set_skip_handle_cache();
10106 // skip promotion when proxying a delete op
10107 if (m->ops.size() == 1) {
10108 op->set_skip_promote();
10112 case CEPH_OSD_OP_CACHE_TRY_FLUSH:
10113 case CEPH_OSD_OP_CACHE_FLUSH:
10114 case CEPH_OSD_OP_CACHE_EVICT:
10115 // If try_flush/flush/evict is the only op, can skip handle cache.
10116 if (m->ops.size() == 1) {
10117 op->set_skip_handle_cache();
10121 case CEPH_OSD_OP_READ:
10122 case CEPH_OSD_OP_SYNC_READ:
10123 case CEPH_OSD_OP_SPARSE_READ:
10124 case CEPH_OSD_OP_CHECKSUM:
10125 case CEPH_OSD_OP_WRITEFULL:
// fadvise NOCACHE/DONTNEED on a single-op request means the
// client doesn't want the object promoted into a cache tier.
10126 if (m->ops.size() == 1 &&
10127 (iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_NOCACHE ||
10128 iter->op.flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED)) {
10129 op->set_skip_promote();
10133 // force promotion when pin an object in cache tier
10134 case CEPH_OSD_OP_CACHE_PIN:
// If no flag was derived at all, something is wrong with the request
// (handled in the elided lines below this check).
10143 if (op->rmw_flags == 0)
// Pull up to osd_peering_wq_batch_size PGs off the peering queue into
// `out`, skipping PGs currently being processed by another worker
// (tracked via in_use), and mark the batch as in use.
10149 void OSD::PeeringWQ::_dequeue(list<PG*> *out) {
10150 for (list<PG*>::iterator i = peering_queue.begin();
10151 i != peering_queue.end() &&
10152 out->size() < osd->cct->_conf->osd_peering_wq_batch_size;
// Leave PGs another thread is already working on in the queue.
10154 if (in_use.count(*i)) {
10157 out->push_back(*i);
10158 peering_queue.erase(i++);
10161 in_use.insert(out->begin(), out->end());
10165 // =============================================================
10167 #undef dout_context
10168 #define dout_context osd->cct
10170 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq "
// Requeue everything that was parked in pgid's slot waiting for the PG
// to appear (e.g. after creation or split), then wake one worker on the
// owning shard.  Bumps the slot's requeue_seq so in-flight workers can
// detect that they raced with this requeue.
10172 void OSD::ShardedOpWQ::wake_pg_waiters(spg_t pgid)
// PG-to-shard mapping is by hash of the pgid.
10174 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10175 auto sdata = shard_list[shard_index];
10176 bool queued = false;
10177 unsigned pushes_to_free = 0;
10179 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10180 auto p = sdata->pg_slots.find(pgid);
10181 if (p != sdata->pg_slots.end()) {
10182 dout(20) << __func__ << " " << pgid
10183 << " to_process " << p->second.to_process
10184 << " waiting_for_pg=" << (int)p->second.waiting_for_pg << dendl;
// Re-enqueue in reverse so _enqueue_front preserves original order.
10185 for (auto i = p->second.to_process.rbegin();
10186 i != p->second.to_process.rend();
10188 sdata->_enqueue_front(make_pair(pgid, *i), osd->op_prio_cutoff);
// NOTE(review): reserved pushes of the requeued items are also
// summed and released below — verify against the elided lines that
// both paths are not taken for the same items.
10190 for (auto& q : p->second.to_process) {
10191 pushes_to_free += q.get_reserved_pushes();
10193 p->second.to_process.clear();
10194 p->second.waiting_for_pg = false;
// Invalidate any snapshot of the slot a concurrent _process took.
10195 ++p->second.requeue_seq;
10199 if (pushes_to_free > 0) {
10200 osd->service.release_reserved_pushes(pushes_to_free);
// Wake a worker thread on this shard to pick up the requeued items.
10203 sdata->sdata_lock.Lock();
10204 sdata->sdata_cond.SignalOne();
10205 sdata->sdata_lock.Unlock();
// On each osdmap advance: record the new map on every shard, drop queued
// items for PG shards that no longer map to this OSD (they are stale),
// release their reserved pushes, and prune now-empty pg slots.
10209 void OSD::ShardedOpWQ::prune_pg_waiters(OSDMapRef osdmap, int whoami)
10211 unsigned pushes_to_free = 0;
10212 for (auto sdata : shard_list) {
10213 Mutex::Locker l(sdata->sdata_op_ordering_lock);
// This map is what _process consults for "should this PG exist here".
10214 sdata->waiting_for_pg_osdmap = osdmap;
10215 auto p = sdata->pg_slots.begin();
10216 while (p != sdata->pg_slots.end()) {
10217 ShardData::pg_slot& slot = p->second;
// Only touch slots that are idle (no worker currently on them).
10218 if (!slot.to_process.empty() && slot.num_running == 0) {
10219 if (osdmap->is_up_acting_osd_shard(p->first, whoami)) {
10220 dout(20) << __func__ << " " << p->first << " maps to us, keeping"
// PG no longer maps here: drop queued items whose epoch the new
// map already covers; newer items are kept for a later map.
10225 while (!slot.to_process.empty() &&
10226 slot.to_process.front().get_map_epoch() <= osdmap->get_epoch()) {
10227 auto& qi = slot.to_process.front();
10228 dout(20) << __func__ << " " << p->first
10230 << " epoch " << qi.get_map_epoch()
10231 << " <= " << osdmap->get_epoch()
10232 << ", stale, dropping" << dendl;
10233 pushes_to_free += qi.get_reserved_pushes();
10234 slot.to_process.pop_front();
10237 if (slot.to_process.empty() &&
10238 slot.num_running == 0 &&
10240 dout(20) << __func__ << " " << p->first << " empty, pruning" << dendl;
10241 p = sdata->pg_slots.erase(p);
10247 if (pushes_to_free > 0) {
10248 osd->service.release_reserved_pushes(pushes_to_free);
// Drop the cached PG pointer from pgid's slot (used when a PG is being
// removed).  Asserts the slot's pg is absent or marked deleting — a live
// PG must never be cleared this way.
10252 void OSD::ShardedOpWQ::clear_pg_pointer(spg_t pgid)
10254 uint32_t shard_index = pgid.hash_to_shard(shard_list.size());
10255 auto sdata = shard_list[shard_index];
10256 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10257 auto p = sdata->pg_slots.find(pgid);
10258 if (p != sdata->pg_slots.end()) {
10259 auto& slot = p->second;
10260 dout(20) << __func__ << " " << pgid << " pg " << slot.pg << dendl;
10261 assert(!slot.pg || slot.pg->deleting);
// Shutdown helper: wipe every shard's pg slot table and osdmap reference.
// Reserved pushes are deliberately not released — the OSD is going down.
10266 void OSD::ShardedOpWQ::clear_pg_slots()
10268 for (auto sdata : shard_list) {
10269 Mutex::Locker l(sdata->sdata_op_ordering_lock);
10270 sdata->pg_slots.clear();
10271 sdata->waiting_for_pg_osdmap.reset();
10272 // don't bother with reserved pushes; we are shutting down
10277 #define dout_prefix *_dout << "osd." << osd->whoami << " op_wq(" << shard_index << ") "
// Worker-thread main step for one shard: dequeue the next item, resolve
// its PG (waiting, dropping, or executing as appropriate), and run it.
//
// Concurrency is subtle here: between dropping sdata_op_ordering_lock to
// take the PG lock and re-acquiring it, wake_pg_waiters /
// prune_pg_waiters may have requeued or dropped this slot's items; the
// requeue_seq snapshot is used to detect that race.  Exact behavior of
// the elided lines (returns/continues at the gaps) should be verified
// against the full source.
10279 void OSD::ShardedOpWQ::_process(uint32_t thread_index, heartbeat_handle_d *hb)
// Threads are statically striped over shards.
10281 uint32_t shard_index = thread_index % num_shards;
10282 ShardData *sdata = shard_list[shard_index];
10283 assert(NULL != sdata);
10286 sdata->sdata_op_ordering_lock.Lock();
10287 if (sdata->pqueue->empty()) {
10288 dout(20) << __func__ << " empty q, waiting" << dendl;
10289 // optimistically sleep a moment; maybe another work item will come along.
// Reset the heartbeat so the watchdog doesn't flag this idle wait.
10290 osd->cct->get_heartbeat_map()->reset_timeout(hb,
10291 osd->cct->_conf->threadpool_default_timeout, 0);
10292 sdata->sdata_lock.Lock();
10293 sdata->sdata_op_ordering_lock.Unlock();
10294 sdata->sdata_cond.WaitInterval(sdata->sdata_lock,
10295 utime_t(osd->cct->_conf->threadpool_empty_queue_max_wait, 0));
10296 sdata->sdata_lock.Unlock();
10297 sdata->sdata_op_ordering_lock.Lock();
10298 if (sdata->pqueue->empty()) {
10299 sdata->sdata_op_ordering_lock.Unlock();
10303 pair<spg_t, PGQueueable> item = sdata->pqueue->dequeue();
10304 if (osd->is_stopping()) {
10305 sdata->sdata_op_ordering_lock.Unlock();
10306 return; // OSD shutdown, discard.
10309 uint64_t requeue_seq;
// Stage the item into the PG's ordering slot; note the current
// requeue_seq so we can tell later if the slot got requeued under us.
10311 auto& slot = sdata->pg_slots[item.first];
10312 dout(30) << __func__ << " " << item.first
10313 << " to_process " << slot.to_process
10314 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10315 slot.to_process.push_back(item.second);
10316 // note the requeue seq now...
10317 requeue_seq = slot.requeue_seq;
10318 if (slot.waiting_for_pg) {
10319 // save ourselves a bit of effort
10320 dout(20) << __func__ << " " << item.first << " item " << item.second
10321 << " queued, waiting_for_pg" << dendl;
10322 sdata->sdata_op_ordering_lock.Unlock();
10326 dout(20) << __func__ << " " << item.first << " item " << item.second
10327 << " queued" << dendl;
10328 ++slot.num_running;
10330 sdata->sdata_op_ordering_lock.Unlock();
10332 osd->service.maybe_inject_dispatch_delay();
10334 // [lookup +] lock pg (if we have it)
// Done WITHOUT holding the ordering lock — PG lock ranks above it.
10336 pg = osd->_lookup_lock_pg(item.first);
10341 osd->service.maybe_inject_dispatch_delay();
10343 boost::optional<PGQueueable> qi;
10345 // we don't use a Mutex::Locker here because of the
10346 // osd->service.release_reserved_pushes() call below
10347 sdata->sdata_op_ordering_lock.Lock();
10349 auto q = sdata->pg_slots.find(item.first);
10350 assert(q != sdata->pg_slots.end());
10351 auto& slot = q->second;
10352 --slot.num_running;
10354 if (slot.to_process.empty()) {
10355 // raced with wake_pg_waiters or prune_pg_waiters
10356 dout(20) << __func__ << " " << item.first << " nothing queued" << dendl;
10360 sdata->sdata_op_ordering_lock.Unlock();
// requeue_seq changed => wake_pg_waiters requeued the slot's items
// behind our back; our staged item will be handled by another pass.
10363 if (requeue_seq != slot.requeue_seq) {
10364 dout(20) << __func__ << " " << item.first
10365 << " requeue_seq " << slot.requeue_seq << " > our "
10366 << requeue_seq << ", we raced with wake_pg_waiters"
10371 sdata->sdata_op_ordering_lock.Unlock();
// Cache the PG pointer in the slot for future dequeues (unless the
// PG is being deleted).
10374 if (pg && !slot.pg && !pg->deleting) {
10375 dout(20) << __func__ << " " << item.first << " set pg to " << pg << dendl;
10378 dout(30) << __func__ << " " << item.first << " to_process " << slot.to_process
10379 << " waiting_for_pg=" << (int)slot.waiting_for_pg << dendl;
10381 // make sure we're not already waiting for this pg
10382 if (slot.waiting_for_pg) {
10383 dout(20) << __func__ << " " << item.first << " item " << item.second
10384 << " slot is waiting_for_pg" << dendl;
10388 sdata->sdata_op_ordering_lock.Unlock();
// Take the next item in FIFO order (not necessarily the one we
// staged above — ordering is per-PG).
10393 qi = slot.to_process.front();
10394 slot.to_process.pop_front();
10395 dout(20) << __func__ << " " << item.first << " item " << *qi
10396 << " pg " << pg << dendl;
10399 // should this pg shard exist on this osd in this (or a later) epoch?
10400 OSDMapRef osdmap = sdata->waiting_for_pg_osdmap;
10401 if (osdmap->is_up_acting_osd_shard(item.first, osd->whoami)) {
10402 dout(20) << __func__ << " " << item.first
10403 << " no pg, should exist, will wait" << " on " << *qi << dendl;
10404 slot.to_process.push_front(*qi);
10405 slot.waiting_for_pg = true;
10406 } else if (qi->get_map_epoch() > osdmap->get_epoch()) {
10407 dout(20) << __func__ << " " << item.first << " no pg, item epoch is "
10408 << qi->get_map_epoch() << " > " << osdmap->get_epoch()
10409 << ", will wait on " << *qi << dendl;
10410 slot.to_process.push_front(*qi);
10411 slot.waiting_for_pg = true;
// PG should not exist here per the current map: drop the item.
10413 dout(20) << __func__ << " " << item.first << " no pg, shouldn't exist,"
10414 << " dropping " << *qi << dendl;
10415 // share map with client?
10416 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10417 Session *session = static_cast<Session *>(
10418 (*_op)->get_req()->get_connection()->get_priv());
10420 osd->maybe_share_map(session, *_op, sdata->waiting_for_pg_osdmap);
// Dropped items must give back their recovery push reservations;
// done after unlocking to respect lock ordering.
10424 unsigned pushes_to_free = qi->get_reserved_pushes();
10425 if (pushes_to_free > 0) {
10426 sdata->sdata_op_ordering_lock.Unlock();
10427 osd->service.release_reserved_pushes(pushes_to_free);
10431 sdata->sdata_op_ordering_lock.Unlock();
10434 sdata->sdata_op_ordering_lock.Unlock();
10437 // osd_opwq_process marks the point at which an operation has been dequeued
10438 // and will begin to be handled by a worker thread.
10442 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10443 reqid = (*_op)->get_reqid();
10446 tracepoint(osd, opwq_process_start, reqid.name._type,
10447 reqid.name._num, reqid.tid, reqid.inc);
10450 lgeneric_subdout(osd->cct, osd, 30) << "dequeue status: ";
10451 Formatter *f = Formatter::create("json");
10452 f->open_object_section("q");
10454 f->close_section();
// Run the item (client op, peering evt, snap trim, scrub, recovery)
// against the PG we hold locked.
10459 ThreadPool::TPHandle tp_handle(osd->cct, hb, timeout_interval,
10461 qi->run(osd, pg, tp_handle);
10466 if (boost::optional<OpRequestRef> _op = qi->maybe_get_op()) {
10467 reqid = (*_op)->get_reqid();
10470 tracepoint(osd, opwq_process_finish, reqid.name._type,
10471 reqid.name._num, reqid.tid, reqid.inc);
// Enqueue an item on its PG's shard queue and wake one worker.  Items at
// or above op_prio_cutoff bypass fair queueing via the strict queue.
10477 void OSD::ShardedOpWQ::_enqueue(pair<spg_t, PGQueueable> item) {
10478 uint32_t shard_index =
10479 item.first.hash_to_shard(shard_list.size());
10481 ShardData* sdata = shard_list[shard_index];
10482 assert (NULL != sdata);
10483 unsigned priority = item.second.get_priority();
10484 unsigned cost = item.second.get_cost();
10485 sdata->sdata_op_ordering_lock.Lock();
10487 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
// High-priority items (e.g. peering traffic) go to the strict queue;
// everything else is scheduled by priority and cost.
10488 if (priority >= osd->op_prio_cutoff)
10489 sdata->pqueue->enqueue_strict(
10490 item.second.get_owner(), priority, item);
10492 sdata->pqueue->enqueue(
10493 item.second.get_owner(),
10494 priority, cost, item);
10495 sdata->sdata_op_ordering_lock.Unlock();
// Wake a worker on this shard.
10497 sdata->sdata_lock.Lock();
10498 sdata->sdata_cond.SignalOne();
10499 sdata->sdata_lock.Unlock();
// Requeue an item at the FRONT of its shard queue (used when work must
// be retried without losing its place), preserving per-PG ordering with
// respect to items a concurrent _process may have staged in the slot.
10503 void OSD::ShardedOpWQ::_enqueue_front(pair<spg_t, PGQueueable> item)
10505 uint32_t shard_index = item.first.hash_to_shard(shard_list.size());
10506 ShardData* sdata = shard_list[shard_index];
10507 assert (NULL != sdata);
10508 sdata->sdata_op_ordering_lock.Lock();
10509 auto p = sdata->pg_slots.find(item.first);
10510 if (p != sdata->pg_slots.end() && !p->second.to_process.empty()) {
10511 // we may be racing with _process, which has dequeued a new item
10512 // from pqueue, put it on to_process, and is now busy taking the
10513 // pg lock. ensure this old requeued item is ordered before any
10514 // such newer item in to_process.
// Swap trick: our item goes to the slot's front, and the slot's
// former last item is pushed to the pqueue front instead.
10515 p->second.to_process.push_front(item.second);
10516 item.second = p->second.to_process.back();
10517 p->second.to_process.pop_back();
10518 dout(20) << __func__ << " " << item.first
10519 << " " << p->second.to_process.front()
10520 << " shuffled w/ " << item.second << dendl;
10522 dout(20) << __func__ << " " << item.first << " " << item.second << dendl;
10524 sdata->_enqueue_front(item, osd->op_prio_cutoff);
10525 sdata->sdata_op_ordering_lock.Unlock();
// Wake a worker on this shard.
10526 sdata->sdata_lock.Lock();
10527 sdata->sdata_cond.SignalOne();
10528 sdata->sdata_lock.Unlock();
10532 namespace osd_cmds {
// Admin-socket "heap" command: forward a heap-profiler subcommand
// (start_profiler, dump, stats, ...) to tcmalloc.  Returns -EOPNOTSUPP
// when the process is not linked against tcmalloc.
10534 int heap(CephContext& cct, cmdmap_t& cmdmap, Formatter& f, std::ostream& os)
10536 if (!ceph_using_tcmalloc()) {
10537 os << "could not issue heap profiler command -- not using tcmalloc!";
10538 return -EOPNOTSUPP;
// Extract the "heapcmd" argument supplied by the admin socket.
10542 if (!cmd_getval(&cct, cmdmap, "heapcmd", cmd)) {
10543 os << "unable to get value for command \"" << cmd << "\"";
10547 std::vector<std::string> cmd_vec;
10548 get_str_vec(cmd, cmd_vec);
10550 ceph_heap_profiler_handle_command(cmd_vec, os);
10555 }} // namespace ceph::osd_cmds
10558 std::ostream& operator<<(std::ostream& out, const OSD::io_queue& q) {
10560 case OSD::io_queue::prioritized:
10561 out << "prioritized";
10563 case OSD::io_queue::weightedpriority:
10564 out << "weightedpriority";
10566 case OSD::io_queue::mclock_opclass:
10567 out << "mclock_opclass";
10569 case OSD::io_queue::mclock_client:
10570 out << "mclock_client";