initial code repo
[stor4nfv.git] / src / ceph / src / osd / PGBackend.h
diff --git a/src/ceph/src/osd/PGBackend.h b/src/ceph/src/osd/PGBackend.h
new file mode 100644 (file)
index 0000000..f244f4c
--- /dev/null
@@ -0,0 +1,602 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2013,2014 Inktank Storage, Inc.
+ * Copyright (C) 2013,2014 Cloudwatt <libre.licensing@cloudwatt.com>
+ *
+ * Author: Loic Dachary <loic@dachary.org>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef PGBACKEND_H
+#define PGBACKEND_H
+
+#include "osd_types.h"
+#include "common/WorkQueue.h"
+#include "include/Context.h"
+#include "os/ObjectStore.h"
+#include "common/LogClient.h"
+#include <string>
+#include "PGTransaction.h"
+
+namespace Scrub {
+  class Store;
+}
+struct shard_info_wrapper;
+struct inconsistent_obj_wrapper;
+
+//forward declaration
+class OSDMap;
+class PGLog;
+typedef ceph::shared_ptr<const OSDMap> OSDMapRef;
+
+ /**
+  * PGBackend
+  *
+  * PGBackend defines an interface for logic handling IO and
+  * replication on RADOS objects.  The PGBackend implementation
+  * is responsible for:
+  *
+  * 1) Handling client operations
+  * 2) Handling object recovery
+  * 3) Handling object access
+  * 4) Handling scrub, deep-scrub, repair
+  */
+ class PGBackend {
+ public:
+   CephContext* cct;
+ protected:
+   ObjectStore *store;
+   const coll_t coll;
+   ObjectStore::CollectionHandle &ch;
+ public:       
+   /**
+    * Provides interfaces for PGBackend callbacks
+    *
+    * The intention is that the parent calls into the PGBackend
+    * implementation holding a lock and that the callbacks are
+    * called under the same locks.
+    */
+   class Listener {
+   public:
+     /// Debugging
+     virtual DoutPrefixProvider *get_dpp() = 0;
+
+     /// Recovery
+
+     /**
+      * Called with the transaction recovering oid
+      */
+     virtual void on_local_recover(
+       const hobject_t &oid,
+       const ObjectRecoveryInfo &recovery_info,
+       ObjectContextRef obc,
+       bool is_delete,
+       ObjectStore::Transaction *t
+       ) = 0;
+
+     /**
+      * Called when transaction recovering oid is durable and
+      * applied on all replicas
+      */
+     virtual void on_global_recover(
+       const hobject_t &oid,
+       const object_stat_sum_t &stat_diff,
+       bool is_delete
+       ) = 0;
+
+     /**
+      * Called when peer is recovered
+      */
+     virtual void on_peer_recover(
+       pg_shard_t peer,
+       const hobject_t &oid,
+       const ObjectRecoveryInfo &recovery_info
+       ) = 0;
+
+     virtual void begin_peer_recover(
+       pg_shard_t peer,
+       const hobject_t oid) = 0;
+
+     virtual void failed_push(const list<pg_shard_t> &from, const hobject_t &soid) = 0;
+     virtual void primary_failed(const hobject_t &soid) = 0;
+     virtual bool primary_error(const hobject_t& soid, eversion_t v) = 0;
+     virtual void cancel_pull(const hobject_t &soid) = 0;
+
+     virtual void apply_stats(
+       const hobject_t &soid,
+       const object_stat_sum_t &delta_stats) = 0;
+
+     /**
+      * Called when a read on the primary fails when pushing
+      */
+     virtual void on_primary_error(
+       const hobject_t &oid,
+       eversion_t v
+       ) = 0;
+
+     virtual void remove_missing_object(const hobject_t &oid,
+                                       eversion_t v,
+                                       Context *on_complete) = 0;
+
+     /**
+      * Bless a context
+      *
+      * Wraps a context in whatever outer layers the parent usually
+      * uses to call into the PGBackend
+      */
+     virtual Context *bless_context(Context *c) = 0;
+     virtual GenContext<ThreadPool::TPHandle&> *bless_gencontext(
+       GenContext<ThreadPool::TPHandle&> *c) = 0;
+
+     virtual void send_message(int to_osd, Message *m) = 0;
+     virtual void queue_transaction(
+       ObjectStore::Transaction&& t,
+       OpRequestRef op = OpRequestRef()
+       ) = 0;
+     virtual void queue_transactions(
+       vector<ObjectStore::Transaction>& tls,
+       OpRequestRef op = OpRequestRef()
+       ) = 0;
+     virtual epoch_t get_epoch() const = 0;
+     virtual epoch_t get_interval_start_epoch() const = 0;
+     virtual epoch_t get_last_peering_reset_epoch() const = 0;
+
+     virtual const set<pg_shard_t> &get_actingbackfill_shards() const = 0;
+     virtual const set<pg_shard_t> &get_acting_shards() const = 0;
+     virtual const set<pg_shard_t> &get_backfill_shards() const = 0;
+
+     virtual std::string gen_dbg_prefix() const = 0;
+
+     virtual const map<hobject_t, set<pg_shard_t>> &get_missing_loc_shards()
+       const = 0;
+
+     virtual const pg_missing_tracker_t &get_local_missing() const = 0;
+     virtual const map<pg_shard_t, pg_missing_t> &get_shard_missing()
+       const = 0;
+     virtual boost::optional<const pg_missing_const_i &> maybe_get_shard_missing(
+       pg_shard_t peer) const {
+       if (peer == primary_shard()) {
+        return get_local_missing();
+       } else {
+        map<pg_shard_t, pg_missing_t>::const_iterator i =
+          get_shard_missing().find(peer);
+        if (i == get_shard_missing().end()) {
+          return boost::optional<const pg_missing_const_i &>();
+        } else {
+          return i->second;
+        }
+       }
+     }
+     virtual const pg_missing_const_i &get_shard_missing(pg_shard_t peer) const {
+       auto m = maybe_get_shard_missing(peer);
+       assert(m);
+       return *m;
+     }
+
+     virtual const map<pg_shard_t, pg_info_t> &get_shard_info() const = 0;
+     virtual const pg_info_t &get_shard_info(pg_shard_t peer) const {
+       if (peer == primary_shard()) {
+        return get_info();
+       } else {
+        map<pg_shard_t, pg_info_t>::const_iterator i =
+          get_shard_info().find(peer);
+        assert(i != get_shard_info().end());
+        return i->second;
+       }
+     }
+
+     virtual const PGLog &get_log() const = 0;
+     virtual bool pgb_is_primary() const = 0;
+     virtual OSDMapRef pgb_get_osdmap() const = 0;
+     virtual const pg_info_t &get_info() const = 0;
+     virtual const pg_pool_t &get_pool() const = 0;
+
+     virtual ObjectContextRef get_obc(
+       const hobject_t &hoid,
+       const map<string, bufferlist> &attrs) = 0;
+
+     virtual bool try_lock_for_read(
+       const hobject_t &hoid,
+       ObcLockManager &manager) = 0;
+
+     virtual void release_locks(ObcLockManager &manager) = 0;
+
+     virtual void op_applied(
+       const eversion_t &applied_version) = 0;
+
+     virtual bool should_send_op(
+       pg_shard_t peer,
+       const hobject_t &hoid) = 0;
+
+     virtual void log_operation(
+       const vector<pg_log_entry_t> &logv,
+       const boost::optional<pg_hit_set_history_t> &hset_history,
+       const eversion_t &trim_to,
+       const eversion_t &roll_forward_to,
+       bool transaction_applied,
+       ObjectStore::Transaction &t) = 0;
+
+     virtual void pgb_set_object_snap_mapping(
+       const hobject_t &soid,
+       const set<snapid_t> &snaps,
+       ObjectStore::Transaction *t) = 0;
+
+     virtual void pgb_clear_object_snap_mapping(
+       const hobject_t &soid,
+       ObjectStore::Transaction *t) = 0;
+
+     virtual void update_peer_last_complete_ondisk(
+       pg_shard_t fromosd,
+       eversion_t lcod) = 0;
+
+     virtual void update_last_complete_ondisk(
+       eversion_t lcod) = 0;
+
+     virtual void update_stats(
+       const pg_stat_t &stat) = 0;
+
+     virtual void schedule_recovery_work(
+       GenContext<ThreadPool::TPHandle&> *c) = 0;
+
+     virtual pg_shard_t whoami_shard() const = 0;
+     int whoami() const {
+       return whoami_shard().osd;
+     }
+     spg_t whoami_spg_t() const {
+       return get_info().pgid;
+     }
+
+     virtual spg_t primary_spg_t() const = 0;
+     virtual pg_shard_t primary_shard() const = 0;
+
+     virtual uint64_t min_peer_features() const = 0;
+
+     virtual hobject_t get_temp_recovery_object(const hobject_t& target,
+                                               eversion_t version) = 0;
+
+     virtual void send_message_osd_cluster(
+       int peer, Message *m, epoch_t from_epoch) = 0;
+     virtual void send_message_osd_cluster(
+       Message *m, Connection *con) = 0;
+     virtual void send_message_osd_cluster(
+       Message *m, const ConnectionRef& con) = 0;
+     virtual ConnectionRef get_con_osd_cluster(int peer, epoch_t from_epoch) = 0;
+     virtual entity_name_t get_cluster_msgr_name() = 0;
+
+     virtual PerfCounters *get_logger() = 0;
+
+     virtual ceph_tid_t get_tid() = 0;
+
+     virtual LogClientTemp clog_error() = 0;
+     virtual LogClientTemp clog_warn() = 0;
+
+     virtual bool check_failsafe_full(ostream &ss) = 0;
+
+     virtual bool check_osdmap_full(const set<pg_shard_t> &missing_on) = 0;
+
+     virtual ~Listener() {}
+   };
+   Listener *parent;
+   Listener *get_parent() const { return parent; }
+   PGBackend(CephContext* cct, Listener *l, ObjectStore *store, coll_t coll,
+            ObjectStore::CollectionHandle &ch) :
+     cct(cct),
+     store(store),
+     coll(coll),
+     ch(ch),
+     parent(l) {}
+   bool is_primary() const { return get_parent()->pgb_is_primary(); }
+   OSDMapRef get_osdmap() const { return get_parent()->pgb_get_osdmap(); }
+   const pg_info_t &get_info() { return get_parent()->get_info(); }
+
+   std::string gen_prefix() const {
+     return parent->gen_dbg_prefix();
+   }
+
+   /**
+    * RecoveryHandle
+    *
+    * We may want to recover multiple objects in the same set of
+    * messages.  RecoveryHandle is an interface for the opaque
+    * object used by the implementation to store the details of
+    * the pending recovery operations.
+    */
+   struct RecoveryHandle {
+     bool cache_dont_need;
+     map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > deletes;
+
+     RecoveryHandle(): cache_dont_need(false) {}
+     virtual ~RecoveryHandle() {}
+   };
+
+   /// Get a fresh recovery operation
+   virtual RecoveryHandle *open_recovery_op() = 0;
+
+   /// run_recovery_op: finish the operation represented by h
+   virtual void run_recovery_op(
+     RecoveryHandle *h,     ///< [in] op to finish
+     int priority           ///< [in] msg priority
+     ) = 0;
+
+   void recover_delete_object(const hobject_t &oid, eversion_t v,
+                             RecoveryHandle *h);
+   void send_recovery_deletes(int prio,
+                             const map<pg_shard_t, vector<pair<hobject_t, eversion_t> > > &deletes);
+
+   /**
+    * recover_object
+    *
+    * Triggers a recovery operation on the specified hobject_t
+    * onreadable must be called before onwriteable
+    *
+    * On each replica (primary included), get_parent()->on_not_missing()
+    * must be called when the transaction finalizing the recovery
+    * is queued.  Similarly, get_parent()->on_readable() must be called
+    * when the transaction is applied in the backing store.
+    *
+    * get_parent()->on_not_degraded() should be called on the primary
+    * when writes can resume on the object.
+    *
+    * obc may be NULL if the primary lacks the object.
+    *
+    * head may be NULL only if the head/snapdir is missing
+    *
+    * @param missing [in] set of info, missing pairs for queried nodes
+    * @param overlaps [in] mapping of object to file offset overlaps
+    */
+   virtual int recover_object(
+     const hobject_t &hoid, ///< [in] object to recover
+     eversion_t v,          ///< [in] version to recover
+     ObjectContextRef head,  ///< [in] context of the head/snapdir object
+     ObjectContextRef obc,  ///< [in] context of the object
+     RecoveryHandle *h      ///< [in,out] handle to attach recovery op to
+     ) = 0;
+
+   /**
+    * true if PGBackend can handle this message while inactive
+    *
+    * If it returns true, handle_message *must* also return true
+    */
+   virtual bool can_handle_while_inactive(OpRequestRef op) = 0;
+
+   /// gives PGBackend a crack at an incoming message
+   bool handle_message(
+     OpRequestRef op ///< [in] message received
+     ); ///< @return true if the message was handled
+
+   /// the variant of handle_message that is overridden by child classes
+   virtual bool _handle_message(OpRequestRef op) = 0;
+
+   virtual void check_recovery_sources(const OSDMapRef& osdmap) = 0;
+
+
+   /**
+    * clean up any temporary on-disk state due to a pg interval change
+    */
+   void on_change_cleanup(ObjectStore::Transaction *t);
+   /**
+    * implementation should clear itself, contexts blessed prior to on_change
+    * won't be called after on_change()
+    */
+   virtual void on_change() = 0;
+   virtual void clear_recovery_state() = 0;
+
+   virtual void on_flushed() = 0;
+
+   virtual IsPGRecoverablePredicate *get_is_recoverable_predicate() = 0;
+   virtual IsPGReadablePredicate *get_is_readable_predicate() = 0;
+
+   virtual void dump_recovery_info(Formatter *f) const = 0;
+
+ private:
+   set<hobject_t> temp_contents;
+ public:
+   // Track contents of temp collection, clear on reset
+   void add_temp_obj(const hobject_t &oid) {
+     temp_contents.insert(oid);
+   }
+   void add_temp_objs(const set<hobject_t> &oids) {
+     temp_contents.insert(oids.begin(), oids.end());
+   }
+   void clear_temp_obj(const hobject_t &oid) {
+     temp_contents.erase(oid);
+   }
+   void clear_temp_objs(const set<hobject_t> &oids) {
+     for (set<hobject_t>::const_iterator i = oids.begin();
+         i != oids.end();
+         ++i) {
+       temp_contents.erase(*i);
+     }
+   }
+
+   virtual ~PGBackend() {}
+
+   /// execute implementation specific transaction
+   virtual void submit_transaction(
+     const hobject_t &hoid,               ///< [in] object
+     const object_stat_sum_t &delta_stats,///< [in] stat change
+     const eversion_t &at_version,        ///< [in] version
+     PGTransactionUPtr &&t,               ///< [in] trans to execute (move)
+     const eversion_t &trim_to,           ///< [in] trim log to here
+     const eversion_t &roll_forward_to,  ///< [in] trim rollback info to here
+     const vector<pg_log_entry_t> &log_entries, ///< [in] log entries for t
+     /// [in] hitset history (if updated with this transaction)
+     boost::optional<pg_hit_set_history_t> &hset_history,
+     Context *on_local_applied_sync,      ///< [in] called when applied locally
+     Context *on_all_applied,             ///< [in] called when all acked
+     Context *on_all_commit,              ///< [in] called when all commit
+     ceph_tid_t tid,                      ///< [in] tid
+     osd_reqid_t reqid,                   ///< [in] reqid
+     OpRequestRef op                      ///< [in] op
+     ) = 0;
+
+   /// submit callback to be called in order with pending writes
+   virtual void call_write_ordered(std::function<void(void)> &&cb) = 0;
+
+   void try_stash(
+     const hobject_t &hoid,
+     version_t v,
+     ObjectStore::Transaction *t);
+
+   void rollback(
+     const pg_log_entry_t &entry,
+     ObjectStore::Transaction *t);
+
+   friend class LRBTrimmer;
+   void rollforward(
+     const pg_log_entry_t &entry,
+     ObjectStore::Transaction *t);
+
+   void trim(
+     const pg_log_entry_t &entry,
+     ObjectStore::Transaction *t);
+
+   void remove(
+     const hobject_t &hoid,
+     ObjectStore::Transaction *t);
+
+ protected:
+
+   void handle_recovery_delete(OpRequestRef op);
+   void handle_recovery_delete_reply(OpRequestRef op);
+
+   /// Reapply old attributes
+   void rollback_setattrs(
+     const hobject_t &hoid,
+     map<string, boost::optional<bufferlist> > &old_attrs,
+     ObjectStore::Transaction *t);
+
+   /// Truncate object to rollback append
+   virtual void rollback_append(
+     const hobject_t &hoid,
+     uint64_t old_size,
+     ObjectStore::Transaction *t);
+
+   /// Unstash object to rollback stash
+   void rollback_stash(
+     const hobject_t &hoid,
+     version_t old_version,
+     ObjectStore::Transaction *t);
+
+   /// Unstash object to rollback stash
+   void rollback_try_stash(
+     const hobject_t &hoid,
+     version_t old_version,
+     ObjectStore::Transaction *t);
+
+   /// Delete object to rollback create
+   void rollback_create(
+     const hobject_t &hoid,
+     ObjectStore::Transaction *t) {
+     remove(hoid, t);
+   }
+
+   /// Clone the extents back into place
+   void rollback_extents(
+     version_t gen,
+     const vector<pair<uint64_t, uint64_t> > &extents,
+     const hobject_t &hoid,
+     ObjectStore::Transaction *t);
+ public:
+
+   /// Trim object stashed at version
+   void trim_rollback_object(
+     const hobject_t &hoid,
+     version_t gen,
+     ObjectStore::Transaction *t);
+
+   /// List objects in collection
+   int objects_list_partial(
+     const hobject_t &begin,
+     int min,
+     int max,
+     vector<hobject_t> *ls,
+     hobject_t *next);
+
+   int objects_list_range(
+     const hobject_t &start,
+     const hobject_t &end,
+     snapid_t seq,
+     vector<hobject_t> *ls,
+     vector<ghobject_t> *gen_obs=0);
+
+   int objects_get_attr(
+     const hobject_t &hoid,
+     const string &attr,
+     bufferlist *out);
+
+   virtual int objects_get_attrs(
+     const hobject_t &hoid,
+     map<string, bufferlist> *out);
+
+   virtual int objects_read_sync(
+     const hobject_t &hoid,
+     uint64_t off,
+     uint64_t len,
+     uint32_t op_flags,
+     bufferlist *bl) = 0;
+
+   virtual void objects_read_async(
+     const hobject_t &hoid,
+     const list<pair<boost::tuple<uint64_t, uint64_t, uint32_t>,
+               pair<bufferlist*, Context*> > > &to_read,
+     Context *on_complete, bool fast_read = false) = 0;
+
+   virtual bool scrub_supported() = 0;
+   virtual bool auto_repair_supported() const = 0;
+   void be_scan_list(
+     ScrubMap &map, const vector<hobject_t> &ls, bool deep, uint32_t seed,
+     ThreadPool::TPHandle &handle);
+   bool be_compare_scrub_objects(
+     pg_shard_t auth_shard,
+     const ScrubMap::object &auth,
+     const object_info_t& auth_oi,
+     const ScrubMap::object &candidate,
+     shard_info_wrapper& shard_error,
+     inconsistent_obj_wrapper &result,
+     ostream &errorstream);
+   map<pg_shard_t, ScrubMap *>::const_iterator be_select_auth_object(
+     const hobject_t &obj,
+     const map<pg_shard_t,ScrubMap*> &maps,
+     object_info_t *auth_oi,
+     map<pg_shard_t, shard_info_wrapper> &shard_map,
+     inconsistent_obj_wrapper &object_error);
+   void be_compare_scrubmaps(
+     const map<pg_shard_t,ScrubMap*> &maps,
+     bool repair,
+     map<hobject_t, set<pg_shard_t>> &missing,
+     map<hobject_t, set<pg_shard_t>> &inconsistent,
+     map<hobject_t, list<pg_shard_t>> &authoritative,
+     map<hobject_t, pair<uint32_t,uint32_t>> &missing_digest,
+     int &shallow_errors, int &deep_errors,
+     Scrub::Store *store,
+     const spg_t& pgid,
+     const vector<int> &acting,
+     ostream &errorstream);
+   virtual uint64_t be_get_ondisk_size(
+     uint64_t logical_size) = 0;
+   virtual void be_deep_scrub(
+     const hobject_t &poid,
+     uint32_t seed,
+     ScrubMap::object &o,
+     ThreadPool::TPHandle &handle) = 0;
+
+   static PGBackend *build_pg_backend(
+     const pg_pool_t &pool,
+     const OSDMapRef curmap,
+     Listener *l,
+     coll_t coll,
+     ObjectStore::CollectionHandle &ch,
+     ObjectStore *store,
+     CephContext *cct);
+};
+
+#endif