1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #ifndef CEPH_PAXOSSERVICE_H
16 #define CEPH_PAXOSSERVICE_H
18 #include "include/Context.h"
21 #include "MonitorDBStore.h"
27 * A Paxos Service is an abstraction that easily allows one to obtain an
28 * association between a Monitor and a Paxos class, in order to implement any
33 * @defgroup PaxosService_h_class Paxos Service
38 * The Monitor with which this class is associated
42 * The Paxos instance with which this class is associated
46 * Our name. This will be associated with the class implementing us, and will
47 * be used mainly for store-related operations.
51 * If we are or have queued anything for proposal, this variable will be true
52 * until our proposal has been finished.
56 bool need_immediate_propose = false;
60 * Services implementing us used to depend on the Paxos version, back when
61 * each service would have a Paxos instance for itself. However, now we only
62 * have a single Paxos instance, shared by all the services. Each service now
63 * must keep its own version, if so they wish. This variable should be used
66 version_t service_version;
70 * Event callback responsible for proposing our pending value once a timer
73 Context *proposal_timer;
75 * If the implementation class has anything pending to be proposed to Paxos,
76 * then have_pending should be true; otherwise, false.
81 * health checks for this service
83 * Child must populate this during encode_pending() by calling encode_health().
85 health_check_map_t health_checks;
88 * format of our state in leveldb, 0 for default
90 version_t format_version;
93 const health_check_map_t& get_health_checks() {
98 * @defgroup PaxosService_h_callbacks Callback classes
102 * Retry dispatching a given service message
104 * This callback class is used when we had to wait for some condition to
105 * become true while we were dispatching it.
107 * For instance, if the message's version isn't readable, according to Paxos,
108 * then we must wait for it to become readable. So, we just queue an
109 * instance of this class onto the Paxos::wait_for_readable function, and
110 * we will retry the whole dispatch again once the callback is fired.
112 class C_RetryMessage : public C_MonOp {
115 C_RetryMessage(PaxosService *s, MonOpRequestRef op_) :
116 C_MonOp(op_), svc(s) { }
117 void _finish(int r) override {
118 if (r == -EAGAIN || r >= 0)
120 else if (r == -ECANCELED)
123 assert(0 == "bad C_RetryMessage return value");
132 * @param mn A Monitor instance
133 * @param p A Paxos instance
134 * @param name Our service's name.
136 PaxosService(Monitor *mn, Paxos *p, string name)
137 : mon(mn), paxos(p), service_name(name),
139 service_version(0), proposal_timer(0), have_pending(false),
141 last_committed_name("last_committed"),
142 first_committed_name("first_committed"),
143 full_prefix_name("full"), full_latest_name("latest"),
144 cached_first_committed(0), cached_last_committed(0)
148 virtual ~PaxosService() = default;  // still virtual; the body is compiler-generated instead of hand-written empty
151 * Get the service's name.
153 * @returns The service's name.
155 const string& get_service_name() const { return service_name; }  // const accessor: hands back a reference to the member (valid while this service lives) instead of copying the string on every call
158 * Get the store prefixes we utilize
160 virtual void get_store_prefixes(set<string>& s) {
161 s.insert(service_name);
164 // i implement and you ignore
166 * Informs this instance that it should consider itself restarted.
168 * This means that we will cancel our proposal_timer event, if any exists.
172 * Informs this instance that an election has finished.
174 * This means that we will invoke a PaxosService::discard_pending while
175 * setting have_pending to false (basically, ignore our pending state) and
176 * we will then make sure we obtain a new state.
178 * Our state shall be updated by PaxosService::_active if the Paxos is
179 * active; otherwise, we will wait for it to become active by adding a
180 * PaxosService::C_Active callback to it.
182 void election_finished();
184 * Informs this instance that it is supposed to shutdown.
186 * Basically, it will instruct Paxos to cancel all events/callbacks and then
187 * will cancel the proposal_timer event if any exists.
193 * Update our state by updating it from Paxos, and then creating a new
194 * pending state if need be.
196 * @remarks We only create a pending state when our Monitor is the Leader.
198 * @pre Paxos is active
199 * @post have_pending is true if our Monitor is the Leader and Paxos is
206 * Propose a new value through Paxos.
208 * This function should be called by the classes implementing
209 * PaxosService, in order to propose a new value through Paxos.
211 * @pre The implementation class implements the encode_pending function.
212 * @pre have_pending is true
213 * @pre Our monitor is the Leader
214 * @pre Paxos is active
215 * @post Cancel the proposal timer, if any
216 * @post have_pending is false
217 * @post propose pending value through Paxos
219 * @note This function depends on the implementation of encode_pending on
220 * the class that is implementing PaxosService
222 void propose_pending();
225 * Let others request us to propose.
227 * At the moment, this is just a wrapper to propose_pending() with an
228 * extra check for is_writeable(), but it's a good practice to dissociate
229 * requests for proposals from direct usage of propose_pending() for
230 * future use -- we might want to perform additional checks or put a
231 * request on hold, for instance.
233 void request_proposal() {
234 assert(is_writeable());
239 * Request service @p other to perform a proposal.
241 * We could simply use the function above, requesting @p other directly,
242 * but we might eventually want to do something to the request -- say,
243 * set a flag stating we're waiting on a cross-proposal to be finished.
245 void request_proposal(PaxosService *other) {
246 assert(other != NULL);
247 assert(other->is_writeable());
249 other->request_proposal();
253 * Dispatch a message by passing it to several different functions that are
254 * either implemented directly by this service, or that should be implemented
255 * by the class implementing this service.
258 * @returns 'true' on successful dispatch; 'false' otherwise.
260 bool dispatch(MonOpRequestRef op);
262 void refresh(bool *need_bootstrap);
266 * @defgroup PaxosService_h_override_funcs Functions that should be
269 * These functions should be overridden at will by the class implementing
274 * Create the initial state for your system.
276 * In some of ours the state is actually set up elsewhere so this does
279 virtual void create_initial() = 0;
282 * Query the Paxos system for the latest state and apply it if it's newer
283 * than the current Monitor state.
285 virtual void update_from_paxos(bool *need_bootstrap) = 0;
288 * Hook called after all services have refreshed their state from paxos
290 * This is useful for doing any update work that depends on other
291 * services having up-to-date state.
293 virtual void post_paxos_update() {}  // default is a no-op; override to react once the services have refreshed from paxos
298 * This is called on mon startup, after all of the PaxosService instances'
299 * update_from_paxos() methods have been called
301 virtual void init() {}  // default is a no-op; override for startup work that needs the refreshed state
304 * Create the pending state.
306 * @invariant This function is only called on a Leader.
307 * @remarks This created state is then modified by incoming messages.
308 * @remarks Called at startup and after every Paxos ratification round.
310 virtual void create_pending() = 0;
313 * Encode the pending state into a bufferlist for ratification and
314 * transmission as the next state.
316 * @invariant This function is only called on a Leader.
318 * @param t The transaction to hold all changes.
320 virtual void encode_pending(MonitorDBStore::TransactionRef t) = 0;
323 * Discard the pending state
325 * @invariant This function is only called on a Leader.
327 * @remarks This function is NOT overridden in any of our code, but it is
328 * called in PaxosService::election_finished if have_pending is
331 virtual void discard_pending() { }  // default is a no-op; override if the subclass has pending state to drop
334 * Look at the query; if the query can be handled without changing state,
337 * @param m A query message
338 * @returns 'true' if the query was handled (e.g., was a read that got
339 * answered, was a state change that has no effect); 'false'
342 virtual bool preprocess_query(MonOpRequestRef op) = 0;
345 * Apply the message to the pending state.
347 * @invariant This function is only called on a Leader.
349 * @param m An update message
350 * @returns 'true' if the update message was handled (e.g., a command that
351 * went through); 'false' otherwise.
353 virtual bool prepare_update(MonOpRequestRef op) = 0;
359 * Determine if the Paxos system should vote on pending, and if so how long
360 * it should wait to vote.
362 * @param[out] delay The wait time, used so we can limit the update traffic
364 * @returns 'true' if the Paxos system should propose; 'false' otherwise.
366 virtual bool should_propose(double &delay);
369 * force an immediate propose.
371 * This is meant to be called from prepare_update(op).
373 void force_immediate_propose() {
374 need_immediate_propose = true;
378 * @defgroup PaxosService_h_courtesy Courtesy functions
380 * Courtesy functions, in case the class implementing this service has
381 * anything it wants/needs to do at these times.
385 * This is called when the Paxos state goes to active.
387 * On the peon, this is after each election.
388 * On the leader, this is after each election, *and* after each completed
391 * @note This function may get called twice in certain recovery cases.
393 virtual void on_active() { }  // default is a no-op; overrides should tolerate being invoked more than once
396 * This is called when we are shutting down
398 virtual void on_shutdown() {}  // default is a no-op; override to release subclass resources at shutdown
401 * this is called when activating on the leader
403 * it should conditionally upgrade the on-disk format by proposing a transaction
405 virtual void upgrade_format() { }  // default is a no-op, i.e. the base class proposes no on-disk format upgrade
408 * this is called when we detect the store has just upgraded underneath us
410 virtual void on_upgrade() {}  // default is a no-op; override to react to a store upgrade
413 * Called when the Paxos system enters a Leader election.
415 * @remarks It's a courtesy method, in case the class implementing this
416 * service has anything it wants/needs to do at that time.
418 virtual void on_restart() { }  // default is a no-op; purely a notification hook for subclasses
426 virtual void tick() {}  // default is a no-op; presumably invoked periodically by the monitor -- confirm against the caller
429 * Get health information
431 * @param summary list of summary strings and associated severity
432 * @param detail optional list of detailed problem reports; may be NULL
434 virtual void get_health(list<pair<health_status_t,string> >& summary,  // caller-owned list of (severity, summary string) pairs
435 list<pair<health_status_t,string> > *detail,  // optional detailed reports; may be NULL
436 CephContext *cct) const { }  // default implementation is a no-op: it adds nothing to either list and ignores cct
438 void encode_health(const health_check_map_t& next,
439 MonitorDBStore::TransactionRef t) {
442 t->put("health", service_name, bl);
443 mon->log_health(next, health_checks, t);
449 * @defgroup PaxosService_h_store_keys Set of keys that are usually used on
450 * all the services implementing this
451 * class, and, being almost the only keys
452 * used, should be standardized to avoid
456 const string last_committed_name;
457 const string first_committed_name;
458 const string full_prefix_name;
459 const string full_latest_name;
465 * @defgroup PaxosService_h_version_cache Variables holding cached values
466 * for the most used versions (first
467 * and last committed); we only have
468 * to read them when the store is
469 * updated, so in-between updates we
470 * may very well use cached versions
471 * and avoid the overhead.
474 version_t cached_first_committed;
475 version_t cached_last_committed;
481 * Callback list to be used whenever we are running a proposal through
482 * Paxos. These callbacks will be awakened whenever the said proposal
485 list<Context*> waiting_for_finished_proposal;
490 * Check if we are proposing a value through Paxos
492 * @returns true if we are proposing; false otherwise.
494 bool is_proposing() {
499 * Check if we are in the Paxos ACTIVE state.
501 * @note This function is a wrapper for Paxos::is_active
503 * @returns true if in state ACTIVE; false otherwise.
508 (paxos->is_active() || paxos->is_updating() || paxos->is_writing());
512 * Check if we are readable.
514 * This mirrors the paxos check, except that we also verify that
516 * - the client hasn't seen the future relative to this PaxosService
517 * - this service isn't proposing.
518 * - we have committed our initial state (last_committed > 0)
520 * @param ver The version we want to check if is readable
521 * @returns true if it is readable; false otherwise
523 bool is_readable(version_t ver = 0) {
524 if (ver > get_last_committed() ||
525 !paxos->is_readable(0) ||
526 get_last_committed() == 0)
532 * Check if we are writeable.
534 * We consider ourselves to be writeable iff:
536 * - we are not proposing a new version;
537 * - we are ready to be written to -- i.e., we have a pending value.
538 * - paxos is (active or updating or writing or refresh)
540 * @returns true if writeable; false otherwise
542 bool is_writeable() {
543 return is_write_ready();
547 * Check if we are ready to be written to. This means we must have a
548 * pending value and be active.
550 * @returns true if we are ready to be written to; false otherwise.
552 bool is_write_ready() {
553 return is_active() && have_pending;
557 * Wait for a proposal to finish.
559 * Add a callback to be awakened whenever our current proposal finishes being
560 * proposed through Paxos.
562 * @param c The callback to be awakened once the proposal is finished.
564 void wait_for_finished_proposal(MonOpRequestRef op, Context *c) {
566 op->mark_event_string(service_name + ":wait_for_finished_proposal");
567 waiting_for_finished_proposal.push_back(c);
569 void wait_for_finished_proposal_ctx(Context *c) {
571 wait_for_finished_proposal(o, c);
575 * Wait for us to become active
577 * @param c The callback to be awakened once we become active.
579 void wait_for_active(MonOpRequestRef op, Context *c) {
581 op->mark_event_string(service_name + ":wait_for_active");
583 if (!is_proposing()) {
584 paxos->wait_for_active(op, c);
587 wait_for_finished_proposal(op, c);
589 void wait_for_active_ctx(Context *c) {
591 wait_for_active(o, c);
595 * Wait for us to become readable
597 * @param c The callback to be awakened once we become readable.
598 * @param ver The version we want to wait on.
600 void wait_for_readable(MonOpRequestRef op, Context *c, version_t ver = 0) {
601 /* This is somewhat of a hack. We only check whether a version is readable in
602 * PaxosService::dispatch(), but, nonetheless, we must make sure that if that
603 * is why we are not readable, then we must wait on PaxosService and not on
604 * Paxos; otherwise, we may assert on Paxos::wait_for_readable() if it
605 * happens to be readable at that specific point in time.
608 op->mark_event_string(service_name + ":wait_for_readable");
610 if (is_proposing() ||
611 ver > get_last_committed() ||
612 get_last_committed() == 0)
613 wait_for_finished_proposal(op, c);
616 op->mark_event_string(service_name + ":wait_for_readable/paxos");
618 paxos->wait_for_readable(op, c);
622 void wait_for_readable_ctx(Context *c, version_t ver = 0) {
623 MonOpRequestRef o; // will initialize the shared_ptr to NULL
624 wait_for_readable(o, c, ver);
628 * Wait for us to become writeable
630 * @param c The callback to be awakened once we become writeable.
632 void wait_for_writeable(MonOpRequestRef op, Context *c) {
634 op->mark_event_string(service_name + ":wait_for_writeable");
637 wait_for_finished_proposal(op, c);
638 else if (!is_write_ready())
639 wait_for_active(op, c);
641 paxos->wait_for_writeable(op, c);
643 void wait_for_writeable_ctx(Context *c) {
645 wait_for_writeable(o, c);
650 * @defgroup PaxosService_h_Trim Functions for trimming states
654 * trim service states if appropriate
656 * Called at same interval as tick()
661 * Auxiliary function to trim our state from version @p from to version
662 * @p to, not including; i.e., the interval [from, to[
664 * @param t The transaction to which we will add the trim operations.
665 * @param from the lower limit of the interval to be trimmed
666 * @param to the upper limit of the interval to be trimmed (not including)
668 void trim(MonitorDBStore::TransactionRef t, version_t from, version_t to);
671 * encode service-specific extra bits into trim transaction
673 * @param tx transaction
674 * @param first new first_committed value
676 virtual void encode_trim_extra(MonitorDBStore::TransactionRef tx,
680 * Get the version we should trim to.
682 * Should be overridden by the service if it wants to trim states.
684 * @returns the version we should trim to; if we return zero, it should be
685 * assumed that there's no version to trim to.
687 virtual version_t get_trim_to() {
695 * @defgroup PaxosService_h_Stash_Full
698 virtual bool should_stash_full();
700 * Encode a full version on @p t
702 * @note We force every service to implement this function, since we strongly
703 * desire the encoding of full versions.
704 * @note Services that do not trim their state, will be bound to only create
705 * one full version. Full version stashing is determined/controlled by
706 * trimming: we stash a version each time a trim is bound to erase the
707 * latest full version.
709 * @param t Transaction on which the full version shall be encoded.
711 virtual void encode_full(MonitorDBStore::TransactionRef t) = 0;
720 * @note This function is a wrapper for Paxos::cancel_events
722 void cancel_events() {
723 paxos->cancel_events();
727 * @defgroup PaxosService_h_store_funcs Back storage interface functions
731 * @defgroup PaxosService_h_store_modify Wrapper function interface to access
732 * the back store for modification
736 void put_first_committed(MonitorDBStore::TransactionRef t, version_t ver) {
737 t->put(get_service_name(), first_committed_name, ver);
740 * Set the last committed version to @p ver
742 * @param t A transaction to which we add this put operation
743 * @param ver The last committed version number being put
745 void put_last_committed(MonitorDBStore::TransactionRef t, version_t ver) {
746 t->put(get_service_name(), last_committed_name, ver);
748 /* We only need to do this once, and that is when we are about to make our
749 * first proposal. There are some services that rely on first_committed
750 * being set -- and it should! -- so we need to guarantee that it is,
751 * especially because the services do not do it themselves. They do
752 * rely on it, but they expect us to deal with it, and so we shall.
754 if (!get_first_committed())
755 put_first_committed(t, ver);
758 * Put the contents of @p bl into version @p ver
760 * @param t A transaction to which we will add this put operation
761 * @param ver The version to which we will add the value
762 * @param bl A bufferlist containing the version's value
764 void put_version(MonitorDBStore::TransactionRef t, version_t ver,
766 t->put(get_service_name(), ver, bl);
769 * Put the contents of @p bl into a full version key for this service, that
770 * will be created with @p ver in mind.
772 * @param t The transaction to which we will add this put operation
773 * @param ver A version number
774 * @param bl A bufferlist containing the version's value
776 void put_version_full(MonitorDBStore::TransactionRef t,
777 version_t ver, bufferlist& bl) {
778 string key = mon->store->combine_strings(full_prefix_name, ver);
779 t->put(get_service_name(), key, bl);
782 * Put the version number in @p ver into the key pointing to the latest full
783 * version of this service.
785 * @param t The transaction to which we will add this put operation
786 * @param ver A version number
788 void put_version_latest_full(MonitorDBStore::TransactionRef t, version_t ver) {
789 string key = mon->store->combine_strings(full_prefix_name, full_latest_name);
790 t->put(get_service_name(), key, ver);
793 * Put the contents of @p bl into the key @p key.
795 * @param t A transaction to which we will add this put operation
796 * @param key The key to which we will add the value
797 * @param bl A bufferlist containing the value
799 void put_value(MonitorDBStore::TransactionRef t,
800 const string& key, bufferlist& bl) {
801 t->put(get_service_name(), key, bl);
805 * Put integer value @p v into the key @p key.
807 * @param t A transaction to which we will add this put operation
808 * @param key The key to which we will add the value
809 * @param v An integer
811 void put_value(MonitorDBStore::TransactionRef t,
812 const string& key, version_t v) {
813 t->put(get_service_name(), key, v);
821 * @defgroup PaxosService_h_store_get Wrapper function interface to access
822 * the back store for reading purposes
827 * @defgroup PaxosService_h_version_cache Obtain cached versions for this
832 * Get the first committed version
834 * @returns Our first committed version (that is available)
836 version_t get_first_committed() const{
837 return cached_first_committed;
840 * Get the last committed version
842 * @returns Our last committed version
844 version_t get_last_committed() const{
845 return cached_last_committed;
853 * Get the contents of a given version @p ver
855 * @param ver The version being obtained
856 * @param bl The bufferlist to be populated
857 * @return 0 on success; <0 otherwise
859 virtual int get_version(version_t ver, bufferlist& bl) {
860 return mon->store->get(get_service_name(), ver, bl);
863 * Get the contents of a given full version of this service.
865 * @param ver A version number
866 * @param bl The bufferlist to be populated
867 * @returns 0 on success; <0 otherwise
869 virtual int get_version_full(version_t ver, bufferlist& bl) {
870 string key = mon->store->combine_strings(full_prefix_name, ver);
871 return mon->store->get(get_service_name(), key, bl);
874 * Get the latest full version number
876 * @returns A version number
878 version_t get_version_latest_full() {
879 string key = mon->store->combine_strings(full_prefix_name, full_latest_name);
880 return mon->store->get(get_service_name(), key);
884 * Get a value from a given key.
886 * @param[in] key The key
887 * @param[out] bl The bufferlist to be populated with the value
889 int get_value(const string& key, bufferlist& bl) {
890 return mon->store->get(get_service_name(), key, bl);
893 * Get an integer value from a given key.
895 * @param[in] key The key
897 version_t get_value(const string& key) {
898 return mon->store->get(get_service_name(), key);