X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=src%2Fceph%2Fsrc%2Fosd%2FPGLog.h;fp=src%2Fceph%2Fsrc%2Fosd%2FPGLog.h;h=5e7d10b2e409948b8afb672920005c852683a2fb;hb=812ff6ca9fcd3e629e49d4328905f33eee8ca3f5;hp=0000000000000000000000000000000000000000;hpb=15280273faafb77777eab341909a3f495cf248d9;p=stor4nfv.git diff --git a/src/ceph/src/osd/PGLog.h b/src/ceph/src/osd/PGLog.h new file mode 100644 index 0000000..5e7d10b --- /dev/null +++ b/src/ceph/src/osd/PGLog.h @@ -0,0 +1,1525 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * Copyright (C) 2013 Cloudwatt + * + * Author: Loic Dachary + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ +#pragma once + +// re-include our assert to clobber boost's +#include "include/assert.h" +#include "osd_types.h" +#include "os/ObjectStore.h" +#include +using namespace std; + +#define PGLOG_INDEXED_OBJECTS (1 << 0) +#define PGLOG_INDEXED_CALLER_OPS (1 << 1) +#define PGLOG_INDEXED_EXTRA_CALLER_OPS (1 << 2) +#define PGLOG_INDEXED_DUPS (1 << 3) +#define PGLOG_INDEXED_ALL (PGLOG_INDEXED_OBJECTS | \ + PGLOG_INDEXED_CALLER_OPS | \ + PGLOG_INDEXED_EXTRA_CALLER_OPS | \ + PGLOG_INDEXED_DUPS) + +class CephContext; + +struct PGLog : DoutPrefixProvider { + DoutPrefixProvider *prefix_provider; + string gen_prefix() const override { + return prefix_provider ? prefix_provider->gen_prefix() : ""; + } + unsigned get_subsys() const override { + return prefix_provider ? prefix_provider->get_subsys() : + (unsigned)ceph_subsys_osd; + } + CephContext *get_cct() const override { + return cct; + } + + ////////////////////////////// sub classes ////////////////////////////// + struct LogEntryHandler { + virtual void rollback( + const pg_log_entry_t &entry) = 0; + virtual void rollforward( + const pg_log_entry_t &entry) = 0; + virtual void trim( + const pg_log_entry_t &entry) = 0; + virtual void remove( + const hobject_t &hoid) = 0; + virtual void try_stash( + const hobject_t &hoid, + version_t v) = 0; + virtual ~LogEntryHandler() {} + }; + + /* Exceptions */ + class read_log_and_missing_error : public buffer::error { + public: + explicit read_log_and_missing_error(const char *what) { + snprintf(buf, sizeof(buf), "read_log_and_missing_error: %s", what); + } + const char *what() const throw () override { + return buf; + } + private: + char buf[512]; + }; + +public: + /** + * IndexLog - adds in-memory index of the log, by oid. + * plus some methods to manipulate it all. + */ + struct IndexedLog : public pg_log_t { + mutable ceph::unordered_map objects; // ptrs into log. be careful! + mutable ceph::unordered_map caller_ops; + mutable ceph::unordered_multimap extra_caller_ops; + mutable ceph::unordered_map dup_index; + + // recovery pointers + list::iterator complete_to; // not inclusive of referenced item + version_t last_requested = 0; // last object requested by primary + + // + private: + mutable __u16 indexed_data = 0; + /** + * rollback_info_trimmed_to_riter points to the first log entry <= + * rollback_info_trimmed_to + * + * It's a reverse_iterator because rend() is a natural representation for + * tail, and rbegin() works nicely for head. + */ + mempool::osd_pglog::list::reverse_iterator + rollback_info_trimmed_to_riter; + + template + void advance_can_rollback_to(eversion_t to, F &&f) { + if (to > can_rollback_to) + can_rollback_to = to; + + if (to > rollback_info_trimmed_to) + rollback_info_trimmed_to = to; + + while (rollback_info_trimmed_to_riter != log.rbegin()) { + --rollback_info_trimmed_to_riter; + if (rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) { + ++rollback_info_trimmed_to_riter; + break; + } + f(*rollback_info_trimmed_to_riter); + } + } + + void reset_rollback_info_trimmed_to_riter() { + rollback_info_trimmed_to_riter = log.rbegin(); + while (rollback_info_trimmed_to_riter != log.rend() && + rollback_info_trimmed_to_riter->version > rollback_info_trimmed_to) + ++rollback_info_trimmed_to_riter; + } + + // indexes objects, caller ops and extra caller ops + public: + IndexedLog() : + complete_to(log.end()), + last_requested(0), + indexed_data(0), + rollback_info_trimmed_to_riter(log.rbegin()) + { } + + template + IndexedLog(Args&&... args) : + pg_log_t(std::forward(args)...), + complete_to(log.end()), + last_requested(0), + indexed_data(0), + rollback_info_trimmed_to_riter(log.rbegin()) + { + reset_rollback_info_trimmed_to_riter(); + index(); + } + + IndexedLog(const IndexedLog &rhs) : + pg_log_t(rhs), + complete_to(log.end()), + last_requested(rhs.last_requested), + indexed_data(0), + rollback_info_trimmed_to_riter(log.rbegin()) + { + reset_rollback_info_trimmed_to_riter(); + index(rhs.indexed_data); + } + + IndexedLog &operator=(const IndexedLog &rhs) { + this->~IndexedLog(); + new (this) IndexedLog(rhs); + return *this; + } + + void trim_rollback_info_to(eversion_t to, LogEntryHandler *h) { + advance_can_rollback_to( + to, + [&](pg_log_entry_t &entry) { + h->trim(entry); + }); + } + void roll_forward_to(eversion_t to, LogEntryHandler *h) { + advance_can_rollback_to( + to, + [&](pg_log_entry_t &entry) { + h->rollforward(entry); + }); + } + + void skip_can_rollback_to_to_head() { + advance_can_rollback_to(head, [&](const pg_log_entry_t &entry) {}); + } + + mempool::osd_pglog::list rewind_from_head(eversion_t newhead) { + auto divergent = pg_log_t::rewind_from_head(newhead); + index(); + reset_rollback_info_trimmed_to_riter(); + return divergent; + } + + template + void scan_log_after( + const eversion_t &bound, ///< [in] scan entries > bound + T &&f) const { + auto iter = log.rbegin(); + while (iter != log.rend() && iter->version > bound) + ++iter; + + while (true) { + if (iter == log.rbegin()) + break; + f(*(--iter)); + } + } + + /****/ + void claim_log_and_clear_rollback_info(const pg_log_t& o) { + // we must have already trimmed the old entries + assert(rollback_info_trimmed_to == head); + assert(rollback_info_trimmed_to_riter == log.rbegin()); + + *this = IndexedLog(o); + + skip_can_rollback_to_to_head(); + index(); + } + + void split_out_child( + pg_t child_pgid, + unsigned split_bits, + IndexedLog *target); + + void zero() { + // we must have already trimmed the old entries + assert(rollback_info_trimmed_to == head); + assert(rollback_info_trimmed_to_riter == log.rbegin()); + + unindex(); + pg_log_t::clear(); + rollback_info_trimmed_to_riter = log.rbegin(); + reset_recovery_pointers(); + } + void clear() { + skip_can_rollback_to_to_head(); + zero(); + } + void reset_recovery_pointers() { + complete_to = log.end(); + last_requested = 0; + } + + bool logged_object(const hobject_t& oid) const { + if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) { + index_objects(); + } + return objects.count(oid); + } + + bool logged_req(const osd_reqid_t &r) const { + if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) { + index_caller_ops(); + } + if (!caller_ops.count(r)) { + if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) { + index_extra_caller_ops(); + } + return extra_caller_ops.count(r); + } + return true; + } + + bool get_request( + const osd_reqid_t &r, + eversion_t *version, + version_t *user_version, + int *return_code) const + { + assert(version); + assert(user_version); + assert(return_code); + ceph::unordered_map::const_iterator p; + if (!(indexed_data & PGLOG_INDEXED_CALLER_OPS)) { + index_caller_ops(); + } + p = caller_ops.find(r); + if (p != caller_ops.end()) { + *version = p->second->version; + *user_version = p->second->user_version; + *return_code = p->second->return_code; + return true; + } + + // warning: we will return *a* request for this reqid, but not + // necessarily the most recent. + if (!(indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS)) { + index_extra_caller_ops(); + } + p = extra_caller_ops.find(r); + if (p != extra_caller_ops.end()) { + for (auto i = p->second->extra_reqids.begin(); + i != p->second->extra_reqids.end(); + ++i) { + if (i->first == r) { + *version = p->second->version; + *user_version = i->second; + *return_code = p->second->return_code; + return true; + } + } + assert(0 == "in extra_caller_ops but not extra_reqids"); + } + + if (!(indexed_data & PGLOG_INDEXED_DUPS)) { + index_dups(); + } + auto q = dup_index.find(r); + if (q != dup_index.end()) { + *version = q->second->version; + *user_version = q->second->user_version; + *return_code = q->second->return_code; + return true; + } + + return false; + } + + /// get a (bounded) list of recent reqids for the given object + void get_object_reqids(const hobject_t& oid, unsigned max, + mempool::osd_pglog::vector > *pls) const { + // make sure object is present at least once before we do an + // O(n) search. + if (!(indexed_data & PGLOG_INDEXED_OBJECTS)) { + index_objects(); + } + if (objects.count(oid) == 0) + return; + for (list::const_reverse_iterator i = log.rbegin(); + i != log.rend(); + ++i) { + if (i->soid == oid) { + if (i->reqid_is_indexed()) + pls->push_back(make_pair(i->reqid, i->user_version)); + pls->insert(pls->end(), i->extra_reqids.begin(), i->extra_reqids.end()); + if (pls->size() >= max) { + if (pls->size() > max) { + pls->resize(max); + } + return; + } + } + } + } + + void index(__u16 to_index = PGLOG_INDEXED_ALL) const { + // if to_index is 0, no need to run any of this code, especially + // loop below; this can happen with copy constructor for + // IndexedLog (and indirectly through assignment operator) + if (!to_index) return; + + if (to_index & PGLOG_INDEXED_OBJECTS) + objects.clear(); + if (to_index & PGLOG_INDEXED_CALLER_OPS) + caller_ops.clear(); + if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) + extra_caller_ops.clear(); + if (to_index & PGLOG_INDEXED_DUPS) { + dup_index.clear(); + for (auto& i : dups) { + dup_index[i.reqid] = const_cast(&i); + } + } + + constexpr __u16 any_log_entry_index = + PGLOG_INDEXED_OBJECTS | + PGLOG_INDEXED_CALLER_OPS | + PGLOG_INDEXED_EXTRA_CALLER_OPS; + + if (to_index & any_log_entry_index) { + for (list::const_iterator i = log.begin(); + i != log.end(); + ++i) { + if (to_index & PGLOG_INDEXED_OBJECTS) { + if (i->object_is_indexed()) { + objects[i->soid] = const_cast(&(*i)); + } + } + + if (to_index & PGLOG_INDEXED_CALLER_OPS) { + if (i->reqid_is_indexed()) { + caller_ops[i->reqid] = const_cast(&(*i)); + } + } + + if (to_index & PGLOG_INDEXED_EXTRA_CALLER_OPS) { + for (auto j = i->extra_reqids.begin(); + j != i->extra_reqids.end(); + ++j) { + extra_caller_ops.insert( + make_pair(j->first, const_cast(&(*i)))); + } + } + } + } + + indexed_data |= to_index; + } + + void index_objects() const { + index(PGLOG_INDEXED_OBJECTS); + } + + void index_caller_ops() const { + index(PGLOG_INDEXED_CALLER_OPS); + } + + void index_extra_caller_ops() const { + index(PGLOG_INDEXED_EXTRA_CALLER_OPS); + } + + void index_dups() const { + index(PGLOG_INDEXED_DUPS); + } + + void index(pg_log_entry_t& e) { + if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) { + if (objects.count(e.soid) == 0 || + objects[e.soid]->version < e.version) + objects[e.soid] = &e; + } + if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { + // divergent merge_log indexes new before unindexing old + if (e.reqid_is_indexed()) { + caller_ops[e.reqid] = &e; + } + } + if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { + for (auto j = e.extra_reqids.begin(); + j != e.extra_reqids.end(); + ++j) { + extra_caller_ops.insert(make_pair(j->first, &e)); + } + } + } + + void unindex() { + objects.clear(); + caller_ops.clear(); + extra_caller_ops.clear(); + dup_index.clear(); + indexed_data = 0; + } + + void unindex(const pg_log_entry_t& e) { + // NOTE: this only works if we remove from the _tail_ of the log! + if (indexed_data & PGLOG_INDEXED_OBJECTS) { + if (objects.count(e.soid) && objects[e.soid]->version == e.version) + objects.erase(e.soid); + } + if (e.reqid_is_indexed()) { + if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { + // divergent merge_log indexes new before unindexing old + if (caller_ops.count(e.reqid) && caller_ops[e.reqid] == &e) + caller_ops.erase(e.reqid); + } + } + if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { + for (auto j = e.extra_reqids.begin(); + j != e.extra_reqids.end(); + ++j) { + for (ceph::unordered_multimap::iterator k = + extra_caller_ops.find(j->first); + k != extra_caller_ops.end() && k->first == j->first; + ++k) { + if (k->second == &e) { + extra_caller_ops.erase(k); + break; + } + } + } + } + } + + void index(pg_log_dup_t& e) { + if (indexed_data & PGLOG_INDEXED_DUPS) { + dup_index[e.reqid] = &e; + } + } + + void unindex(const pg_log_dup_t& e) { + if (indexed_data & PGLOG_INDEXED_DUPS) { + auto i = dup_index.find(e.reqid); + if (i != dup_index.end()) { + dup_index.erase(i); + } + } + } + + // actors + void add(const pg_log_entry_t& e, bool applied = true) { + if (!applied) { + assert(get_can_rollback_to() == head); + } + + // make sure our buffers don't pin bigger buffers + e.mod_desc.trim_bl(); + + // add to log + log.push_back(e); + + // riter previously pointed to the previous entry + if (rollback_info_trimmed_to_riter == log.rbegin()) + ++rollback_info_trimmed_to_riter; + + assert(e.version > head); + assert(head.version == 0 || e.version.version > head.version); + head = e.version; + + // to our index + if ((indexed_data & PGLOG_INDEXED_OBJECTS) && e.object_is_indexed()) { + objects[e.soid] = &(log.back()); + } + if (indexed_data & PGLOG_INDEXED_CALLER_OPS) { + if (e.reqid_is_indexed()) { + caller_ops[e.reqid] = &(log.back()); + } + } + + if (indexed_data & PGLOG_INDEXED_EXTRA_CALLER_OPS) { + for (auto j = e.extra_reqids.begin(); + j != e.extra_reqids.end(); + ++j) { + extra_caller_ops.insert(make_pair(j->first, &(log.back()))); + } + } + + if (!applied) { + skip_can_rollback_to_to_head(); + } + } // add + + void trim( + CephContext* cct, + eversion_t s, + set *trimmed, + set* trimmed_dups, + eversion_t *write_from_dups); + + ostream& print(ostream& out) const; + }; // IndexedLog + + +protected: + //////////////////// data members //////////////////// + + pg_missing_tracker_t missing; + IndexedLog log; + + eversion_t dirty_to; ///< must clear/writeout all keys <= dirty_to + eversion_t dirty_from; ///< must clear/writeout all keys >= dirty_from + eversion_t writeout_from; ///< must writout keys >= writeout_from + set trimmed; ///< must clear keys in trimmed + eversion_t dirty_to_dups; ///< must clear/writeout all dups <= dirty_to_dups + eversion_t dirty_from_dups; ///< must clear/writeout all dups >= dirty_from_dups + eversion_t write_from_dups; ///< must write keys >= write_from_dups + set trimmed_dups; ///< must clear keys in trimmed_dups + CephContext *cct; + bool pg_log_debug; + /// Log is clean on [dirty_to, dirty_from) + bool touched_log; + bool clear_divergent_priors; + bool rebuilt_missing_with_deletes = false; + + void mark_dirty_to(eversion_t to) { + if (to > dirty_to) + dirty_to = to; + } + void mark_dirty_from(eversion_t from) { + if (from < dirty_from) + dirty_from = from; + } + void mark_writeout_from(eversion_t from) { + if (from < writeout_from) + writeout_from = from; + } + void mark_dirty_to_dups(eversion_t to) { + if (to > dirty_to_dups) + dirty_to_dups = to; + } + void mark_dirty_from_dups(eversion_t from) { + if (from < dirty_from_dups) + dirty_from_dups = from; + } +public: + bool is_dirty() const { + return !touched_log || + (dirty_to != eversion_t()) || + (dirty_from != eversion_t::max()) || + (writeout_from != eversion_t::max()) || + !(trimmed.empty()) || + !missing.is_clean() || + !(trimmed_dups.empty()) || + (dirty_to_dups != eversion_t()) || + (dirty_from_dups != eversion_t::max()) || + (write_from_dups != eversion_t::max()) || + rebuilt_missing_with_deletes; + } + void mark_log_for_rewrite() { + mark_dirty_to(eversion_t::max()); + mark_dirty_from(eversion_t()); + mark_dirty_to_dups(eversion_t::max()); + mark_dirty_from_dups(eversion_t()); + touched_log = false; + } + bool get_rebuilt_missing_with_deletes() const { + return rebuilt_missing_with_deletes; + } +protected: + + /// DEBUG + set log_keys_debug; + static void clear_after(set *log_keys_debug, const string &lb) { + if (!log_keys_debug) + return; + for (set::iterator i = log_keys_debug->lower_bound(lb); + i != log_keys_debug->end(); + log_keys_debug->erase(i++)); + } + static void clear_up_to(set *log_keys_debug, const string &ub) { + if (!log_keys_debug) + return; + for (set::iterator i = log_keys_debug->begin(); + i != log_keys_debug->end() && *i < ub; + log_keys_debug->erase(i++)); + } + + void check(); + void undirty() { + dirty_to = eversion_t(); + dirty_from = eversion_t::max(); + touched_log = true; + trimmed.clear(); + trimmed_dups.clear(); + writeout_from = eversion_t::max(); + check(); + missing.flush(); + dirty_to_dups = eversion_t(); + dirty_from_dups = eversion_t::max(); + write_from_dups = eversion_t::max(); + } +public: + + // cppcheck-suppress noExplicitConstructor + PGLog(CephContext *cct, DoutPrefixProvider *dpp = nullptr) : + prefix_provider(dpp), + dirty_from(eversion_t::max()), + writeout_from(eversion_t::max()), + dirty_from_dups(eversion_t::max()), + write_from_dups(eversion_t::max()), + cct(cct), + pg_log_debug(!(cct && !(cct->_conf->osd_debug_pg_log_writeout))), + touched_log(false), + clear_divergent_priors(false) + { } + + void reset_backfill(); + + void clear(); + + //////////////////// get or set missing //////////////////// + + const pg_missing_tracker_t& get_missing() const { return missing; } + void revise_have(hobject_t oid, eversion_t have) { + missing.revise_have(oid, have); + } + + void missing_add(const hobject_t& oid, eversion_t need, eversion_t have) { + missing.add(oid, need, have, false); + } + + //////////////////// get or set log //////////////////// + + const IndexedLog &get_log() const { return log; } + + const eversion_t &get_tail() const { return log.tail; } + + void set_tail(eversion_t tail) { log.tail = tail; } + + const eversion_t &get_head() const { return log.head; } + + void set_head(eversion_t head) { log.head = head; } + + void set_last_requested(version_t last_requested) { + log.last_requested = last_requested; + } + + void index() { log.index(); } + + void unindex() { log.unindex(); } + + void add(const pg_log_entry_t& e, bool applied = true) { + mark_writeout_from(e.version); + log.add(e, applied); + } + + void reset_recovery_pointers() { log.reset_recovery_pointers(); } + + static void clear_info_log( + spg_t pgid, + ObjectStore::Transaction *t); + + void trim( + eversion_t trim_to, + pg_info_t &info); + + void roll_forward_to( + eversion_t roll_forward_to, + LogEntryHandler *h) { + log.roll_forward_to( + roll_forward_to, + h); + } + + eversion_t get_can_rollback_to() const { + return log.get_can_rollback_to(); + } + + void roll_forward(LogEntryHandler *h) { + roll_forward_to( + log.head, + h); + } + + //////////////////// get or set log & missing //////////////////// + + void reset_backfill_claim_log(const pg_log_t &o, LogEntryHandler *h) { + log.trim_rollback_info_to(log.head, h); + log.claim_log_and_clear_rollback_info(o); + missing.clear(); + mark_dirty_to(eversion_t::max()); + mark_dirty_to_dups(eversion_t::max()); + } + + void split_into( + pg_t child_pgid, + unsigned split_bits, + PGLog *opg_log) { + log.split_out_child(child_pgid, split_bits, &opg_log->log); + missing.split_into(child_pgid, split_bits, &(opg_log->missing)); + opg_log->mark_dirty_to(eversion_t::max()); + opg_log->mark_dirty_to_dups(eversion_t::max()); + mark_dirty_to(eversion_t::max()); + mark_dirty_to_dups(eversion_t::max()); + if (missing.may_include_deletes) + opg_log->rebuilt_missing_with_deletes = true; + } + + void recover_got(hobject_t oid, eversion_t v, pg_info_t &info) { + if (missing.is_missing(oid, v)) { + missing.got(oid, v); + + // raise last_complete? + if (missing.get_items().empty()) { + log.complete_to = log.log.end(); + info.last_complete = info.last_update; + } + while (log.complete_to != log.log.end()) { + if (missing.get_items().at( + missing.get_rmissing().begin()->second + ).need <= log.complete_to->version) + break; + if (info.last_complete < log.complete_to->version) + info.last_complete = log.complete_to->version; + ++log.complete_to; + } + } + + assert(log.get_can_rollback_to() >= v); + } + + void reset_complete_to(pg_info_t *info) { + log.complete_to = log.log.begin(); + while (!missing.get_items().empty() && log.complete_to->version < + missing.get_items().at( + missing.get_rmissing().begin()->second + ).need) { + assert(log.complete_to != log.log.end()); + ++log.complete_to; + } + assert(log.complete_to != log.log.end()); + if (log.complete_to == log.log.begin()) { + if (info) + info->last_complete = eversion_t(); + } else { + --log.complete_to; + if (info) + info->last_complete = log.complete_to->version; + ++log.complete_to; + } + } + + void activate_not_complete(pg_info_t &info) { + reset_complete_to(&info); + log.last_requested = 0; + } + + void proc_replica_log(pg_info_t &oinfo, + const pg_log_t &olog, + pg_missing_t& omissing, pg_shard_t from) const; + + void rebuild_missing_set_with_deletes(ObjectStore *store, + coll_t pg_coll, + const pg_info_t &info); + +protected: + static void split_by_object( + mempool::osd_pglog::list &entries, + map> *out_entries) { + while (!entries.empty()) { + auto &out_list = (*out_entries)[entries.front().soid]; + out_list.splice(out_list.end(), entries, entries.begin()); + } + } + + /** + * _merge_object_divergent_entries + * + * There are 5 distinct cases: + * 1) There is a more recent update: in this case we assume we adjusted the + * store and missing during merge_log + * 2) The first entry in the divergent sequence is a create. This might + * either be because the object is a clone or because prior_version is + * eversion_t(). In this case the object does not exist and we must + * adjust missing and the store to match. + * 3) We are currently missing the object. In this case, we adjust the + * missing to our prior_version taking care to add a divergent_prior + * if necessary + * 4) We can rollback all of the entries. In this case, we do so using + * the rollbacker and return -- the object does not go into missing. + * 5) We cannot rollback at least 1 of the entries. In this case, we + * clear the object out of the store and add a missing entry at + * prior_version taking care to add a divergent_prior if + * necessary. + */ + template + static void _merge_object_divergent_entries( + const IndexedLog &log, ///< [in] log to merge against + const hobject_t &hoid, ///< [in] object we are merging + const mempool::osd_pglog::list &orig_entries, ///< [in] entries for hoid to merge + const pg_info_t &info, ///< [in] info for merging entries + eversion_t olog_can_rollback_to, ///< [in] rollback boundary + missing_type &missing, ///< [in,out] missing to adjust, use + LogEntryHandler *rollbacker, ///< [in] optional rollbacker object + const DoutPrefixProvider *dpp ///< [in] logging provider + ) { + ldpp_dout(dpp, 20) << __func__ << ": merging hoid " << hoid + << " entries: " << orig_entries << dendl; + + if (hoid > info.last_backfill) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " after last_backfill" + << dendl; + return; + } + + // entries is non-empty + assert(!orig_entries.empty()); + // strip out and ignore ERROR entries + mempool::osd_pglog::list entries; + eversion_t last; + bool seen_non_error = false; + for (list::const_iterator i = orig_entries.begin(); + i != orig_entries.end(); + ++i) { + // all entries are on hoid + assert(i->soid == hoid); + // did not see error entries before this entry and this entry is not error + // then this entry is the first non error entry + bool first_non_error = ! seen_non_error && ! i->is_error(); + if (! i->is_error() ) { + // see a non error entry now + seen_non_error = true; + } + + // No need to check the first entry since it prior_version is unavailable + // in the list + // No need to check if the prior_version is the minimal version + // No need to check the first non-error entry since the leading error + // entries are not its prior version + if (i != orig_entries.begin() && i->prior_version != eversion_t() && + ! first_non_error) { + // in increasing order of version + assert(i->version > last); + // prior_version correct (unless it is an ERROR entry) + assert(i->prior_version == last || i->is_error()); + } + if (i->is_error()) { + ldpp_dout(dpp, 20) << __func__ << ": ignoring " << *i << dendl; + } else { + ldpp_dout(dpp, 20) << __func__ << ": keeping " << *i << dendl; + entries.push_back(*i); + last = i->version; + } + } + if (entries.empty()) { + ldpp_dout(dpp, 10) << __func__ << ": no non-ERROR entries" << dendl; + return; + } + + const eversion_t prior_version = entries.begin()->prior_version; + const eversion_t first_divergent_update = entries.begin()->version; + const eversion_t last_divergent_update = entries.rbegin()->version; + const bool object_not_in_store = + !missing.is_missing(hoid) && + entries.rbegin()->is_delete(); + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " prior_version: " << prior_version + << " first_divergent_update: " << first_divergent_update + << " last_divergent_update: " << last_divergent_update + << dendl; + + ceph::unordered_map::const_iterator objiter = + log.objects.find(hoid); + if (objiter != log.objects.end() && + objiter->second->version >= first_divergent_update) { + /// Case 1) + ldpp_dout(dpp, 10) << __func__ << ": more recent entry found: " + << *objiter->second << ", already merged" << dendl; + + assert(objiter->second->version > last_divergent_update); + + // ensure missing has been updated appropriately + if (objiter->second->is_update() || + (missing.may_include_deletes && objiter->second->is_delete())) { + assert(missing.is_missing(hoid) && + missing.get_items().at(hoid).need == objiter->second->version); + } else { + assert(!missing.is_missing(hoid)); + } + missing.revise_have(hoid, eversion_t()); + if (rollbacker) { + if (!object_not_in_store) { + rollbacker->remove(hoid); + } + for (auto &&i: entries) { + rollbacker->trim(i); + } + } + return; + } + + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + <<" has no more recent entries in log" << dendl; + if (prior_version == eversion_t() || entries.front().is_clone()) { + /// Case 2) + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " prior_version or op type indicates creation," + << " deleting" + << dendl; + if (missing.is_missing(hoid)) + missing.rm(missing.get_items().find(hoid)); + if (rollbacker) { + if (!object_not_in_store) { + rollbacker->remove(hoid); + } + for (auto &&i: entries) { + rollbacker->trim(i); + } + } + return; + } + + if (missing.is_missing(hoid)) { + /// Case 3) + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " missing, " << missing.get_items().at(hoid) + << " adjusting" << dendl; + + if (missing.get_items().at(hoid).have == prior_version) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " missing.have is prior_version " << prior_version + << " removing from missing" << dendl; + missing.rm(missing.get_items().find(hoid)); + } else { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " missing.have is " << missing.get_items().at(hoid).have + << ", adjusting" << dendl; + missing.revise_need(hoid, prior_version, false); + if (prior_version <= info.log_tail) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " prior_version " << prior_version + << " <= info.log_tail " + << info.log_tail << dendl; + } + } + if (rollbacker) { + for (auto &&i: entries) { + rollbacker->trim(i); + } + } + return; + } + + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " must be rolled back or recovered," + << " attempting to rollback" + << dendl; + bool can_rollback = true; + /// Distinguish between 4) and 5) + for (list::const_reverse_iterator i = entries.rbegin(); + i != entries.rend(); + ++i) { + if (!i->can_rollback() || i->version <= olog_can_rollback_to) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot rollback " + << *i << dendl; + can_rollback = false; + break; + } + } + + if (can_rollback) { + /// Case 4) + for (list::const_reverse_iterator i = entries.rbegin(); + i != entries.rend(); + ++i) { + assert(i->can_rollback() && i->version > olog_can_rollback_to); + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " rolling back " << *i << dendl; + if (rollbacker) + rollbacker->rollback(*i); + } + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " rolled back" << dendl; + return; + } else { + /// Case 5) + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid << " cannot roll back, " + << "removing and adding to missing" << dendl; + if (rollbacker) { + if (!object_not_in_store) + rollbacker->remove(hoid); + for (auto &&i: entries) { + rollbacker->trim(i); + } + } + missing.add(hoid, prior_version, eversion_t(), false); + if (prior_version <= info.log_tail) { + ldpp_dout(dpp, 10) << __func__ << ": hoid " << hoid + << " prior_version " << prior_version + << " <= info.log_tail " + << info.log_tail << dendl; + } + } + } + + /// Merge all entries using above + template + static void _merge_divergent_entries( + const IndexedLog &log, ///< [in] log to merge against + mempool::osd_pglog::list &entries, ///< [in] entries to merge + const pg_info_t &oinfo, ///< [in] info for merging entries + eversion_t olog_can_rollback_to, ///< [in] rollback boundary + missing_type &omissing, ///< [in,out] missing to adjust, use + LogEntryHandler *rollbacker, ///< [in] optional rollbacker object + const DoutPrefixProvider *dpp ///< [in] logging provider + ) { + map > split; + split_by_object(entries, &split); + for (map>::iterator i = split.begin(); + i != split.end(); + ++i) { + _merge_object_divergent_entries( + log, + i->first, + i->second, + oinfo, + olog_can_rollback_to, + omissing, + rollbacker, + dpp); + } + } + + /** + * Exists for use in TestPGLog for simply testing single divergent log + * cases + */ + void merge_old_entry( + ObjectStore::Transaction& t, + const pg_log_entry_t& oe, + const pg_info_t& info, + LogEntryHandler *rollbacker) { + mempool::osd_pglog::list entries; + entries.push_back(oe); + _merge_object_divergent_entries( + log, + oe.soid, + entries, + info, + log.get_can_rollback_to(), + missing, + rollbacker, + this); + } + + bool merge_log_dups(const pg_log_t& olog); + +public: + + void rewind_divergent_log(eversion_t newhead, + pg_info_t &info, + LogEntryHandler *rollbacker, + bool &dirty_info, + bool &dirty_big_info); + + void merge_log(pg_info_t &oinfo, + pg_log_t &olog, + pg_shard_t from, + pg_info_t &info, LogEntryHandler *rollbacker, + bool &dirty_info, bool &dirty_big_info); + + template + static bool append_log_entries_update_missing( + const hobject_t &last_backfill, + bool last_backfill_bitwise, + const mempool::osd_pglog::list &entries, + bool maintain_rollback, + IndexedLog *log, + missing_type &missing, + LogEntryHandler *rollbacker, + const DoutPrefixProvider *dpp) { + bool invalidate_stats = false; + if (log && !entries.empty()) { + assert(log->head < entries.begin()->version); + } + for (list::const_iterator p = entries.begin(); + p != entries.end(); + ++p) { + invalidate_stats = invalidate_stats || !p->is_error(); + if (log) { + ldpp_dout(dpp, 20) << "update missing, append " << *p << dendl; + log->add(*p); + } + if (p->soid <= last_backfill && + !p->is_error()) { + if (missing.may_include_deletes) { + missing.add_next_event(*p); + } else { + if (p->is_delete()) { + missing.rm(p->soid, p->version); + } else { + missing.add_next_event(*p); + } + if (rollbacker) { + // hack to match PG::mark_all_unfound_lost + if (maintain_rollback && p->is_lost_delete() && p->can_rollback()) { + rollbacker->try_stash(p->soid, p->version.version); + } else if (p->is_delete()) { + rollbacker->remove(p->soid); + } + } + } + } + } + return invalidate_stats; + } + bool append_new_log_entries( + const hobject_t &last_backfill, + bool last_backfill_bitwise, + const mempool::osd_pglog::list &entries, + LogEntryHandler *rollbacker) { + bool invalidate_stats = append_log_entries_update_missing( + last_backfill, + last_backfill_bitwise, + entries, + true, + &log, + missing, + rollbacker, + this); + if (!entries.empty()) { + mark_writeout_from(entries.begin()->version); + if (entries.begin()->is_lost_delete()) { + // hack: since lost deletes queue recovery directly, and don't + // go through activate_not_complete() again, our complete_to + // iterator may still point at log.end(). Reset it to point + // before these new lost_delete entries. This only occurs + // when lost+delete entries are initially added, which is + // always in a list of solely lost_delete entries, so it is + // sufficient to check whether the first entry is a + // lost_delete + reset_complete_to(nullptr); + } + } + return invalidate_stats; + } + + void write_log_and_missing( + ObjectStore::Transaction& t, + map *km, + const coll_t& coll, + const ghobject_t &log_oid, + bool require_rollback); + + static void write_log_and_missing_wo_missing( + ObjectStore::Transaction& t, + map* km, + pg_log_t &log, + const coll_t& coll, + const ghobject_t &log_oid, map &divergent_priors, + bool require_rollback); + + static void write_log_and_missing( + ObjectStore::Transaction& t, + map* km, + pg_log_t &log, + const coll_t& coll, + const ghobject_t &log_oid, + const pg_missing_tracker_t &missing, + bool require_rollback, + bool *rebuilt_missing_set_with_deletes); + + static void _write_log_and_missing_wo_missing( + ObjectStore::Transaction& t, + map* km, + pg_log_t &log, + const coll_t& coll, const ghobject_t &log_oid, + map &divergent_priors, + eversion_t dirty_to, + eversion_t dirty_from, + eversion_t writeout_from, + const set &trimmed, + const set &trimmed_dups, + bool dirty_divergent_priors, + bool touch_log, + bool require_rollback, + eversion_t dirty_to_dups, + eversion_t dirty_from_dups, + eversion_t write_from_dups, + set *log_keys_debug + ); + + static void _write_log_and_missing( + ObjectStore::Transaction& t, + map* km, + pg_log_t &log, + const coll_t& coll, const ghobject_t &log_oid, + eversion_t dirty_to, + eversion_t dirty_from, + eversion_t writeout_from, + const set &trimmed, + const set &trimmed_dups, + const pg_missing_tracker_t &missing, + bool touch_log, + bool require_rollback, + bool clear_divergent_priors, + eversion_t dirty_to_dups, + eversion_t dirty_from_dups, + eversion_t write_from_dups, + bool *rebuilt_missing_with_deletes, + set *log_keys_debug + ); + + void read_log_and_missing( + ObjectStore *store, + coll_t pg_coll, + coll_t log_coll, + ghobject_t log_oid, + const pg_info_t &info, + bool force_rebuild_missing, + ostringstream &oss, + bool tolerate_divergent_missing_log, + bool debug_verify_stored_missing = false + ) { + return read_log_and_missing( + store, pg_coll, log_coll, log_oid, info, + log, missing, force_rebuild_missing, oss, + tolerate_divergent_missing_log, + &clear_divergent_priors, + this, + (pg_log_debug ? &log_keys_debug : nullptr), + debug_verify_stored_missing); + } + + template + static void read_log_and_missing( + ObjectStore *store, + coll_t pg_coll, + coll_t log_coll, + ghobject_t log_oid, + const pg_info_t &info, + IndexedLog &log, + missing_type &missing, + bool force_rebuild_missing, + ostringstream &oss, + bool tolerate_divergent_missing_log, + bool *clear_divergent_priors = nullptr, + const DoutPrefixProvider *dpp = nullptr, + set *log_keys_debug = nullptr, + bool debug_verify_stored_missing = false + ) { + ldpp_dout(dpp, 20) << "read_log_and_missing coll " << pg_coll + << " log_oid " << log_oid << dendl; + + // legacy? + struct stat st; + int r = store->stat(log_coll, log_oid, &st); + assert(r == 0); + assert(st.st_size == 0); + + // will get overridden below if it had been recorded + eversion_t on_disk_can_rollback_to = info.last_update; + eversion_t on_disk_rollback_info_trimmed_to = eversion_t(); + ObjectMap::ObjectMapIterator p = store->get_omap_iterator(log_coll, log_oid); + map divergent_priors; + bool must_rebuild = force_rebuild_missing; + missing.may_include_deletes = false; + list entries; + list dups; + if (p) { + for (p->seek_to_first(); p->valid() ; p->next(false)) { + // non-log pgmeta_oid keys are prefixed with _; skip those + if (p->key()[0] == '_') + continue; + bufferlist bl = p->value();//Copy bufferlist before creating iterator + bufferlist::iterator bp = bl.begin(); + if (p->key() == "divergent_priors") { + ::decode(divergent_priors, bp); + ldpp_dout(dpp, 20) << "read_log_and_missing " << divergent_priors.size() + << " divergent_priors" << dendl; + must_rebuild = true; + debug_verify_stored_missing = false; + } else if (p->key() == "can_rollback_to") { + ::decode(on_disk_can_rollback_to, bp); + } else if (p->key() == "rollback_info_trimmed_to") { + ::decode(on_disk_rollback_info_trimmed_to, bp); + } else if (p->key() == "may_include_deletes_in_missing") { + missing.may_include_deletes = true; + } else if (p->key().substr(0, 7) == string("missing")) { + hobject_t oid; + pg_missing_item item; + ::decode(oid, bp); + ::decode(item, bp); + if (item.is_delete()) { + assert(missing.may_include_deletes); + } + missing.add(oid, item.need, item.have, item.is_delete()); + } else if (p->key().substr(0, 4) == string("dup_")) { + pg_log_dup_t dup; + ::decode(dup, bp); + if (!dups.empty()) { + assert(dups.back().version < dup.version); + } + dups.push_back(dup); + } else { + pg_log_entry_t e; + e.decode_with_checksum(bp); + ldpp_dout(dpp, 20) << "read_log_and_missing " << e << dendl; + if (!entries.empty()) { + pg_log_entry_t last_e(entries.back()); + assert(last_e.version.version < e.version.version); + assert(last_e.version.epoch <= e.version.epoch); + } + entries.push_back(e); + if (log_keys_debug) + log_keys_debug->insert(e.get_key_name()); + } + } + } + log = IndexedLog( + info.last_update, + info.log_tail, + on_disk_can_rollback_to, + on_disk_rollback_info_trimmed_to, + std::move(entries), + std::move(dups)); + + if (must_rebuild || debug_verify_stored_missing) { + // build missing + if (debug_verify_stored_missing || info.last_complete < info.last_update) { + ldpp_dout(dpp, 10) + << "read_log_and_missing checking for missing items over interval (" + << info.last_complete + << "," << info.last_update << "]" << dendl; + + set did; + set checked; + set skipped; + for (list::reverse_iterator i = log.log.rbegin(); + i != log.log.rend(); + ++i) { + if (!debug_verify_stored_missing && i->version <= info.last_complete) break; + if (i->soid > info.last_backfill) + continue; + if (i->is_error()) + continue; + if (did.count(i->soid)) continue; + did.insert(i->soid); + + if (!missing.may_include_deletes && i->is_delete()) + continue; + + bufferlist bv; + int r = store->getattr( + pg_coll, + ghobject_t(i->soid, ghobject_t::NO_GEN, info.pgid.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + if (oi.version < i->version) { + ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i + << " (have " << oi.version << ")" << dendl; + if (debug_verify_stored_missing) { + auto miter = missing.get_items().find(i->soid); + assert(miter != missing.get_items().end()); + assert(miter->second.need == i->version); + // the 'have' version is reset if an object is deleted, + // then created again + assert(miter->second.have == oi.version || miter->second.have == eversion_t()); + checked.insert(i->soid); + } else { + missing.add(i->soid, i->version, oi.version, i->is_delete()); + } + } + } else { + ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl; + if (debug_verify_stored_missing) { + auto miter = missing.get_items().find(i->soid); + if (i->is_delete()) { + assert(miter == missing.get_items().end() || + (miter->second.need == i->version && + miter->second.have == eversion_t())); + } else { + assert(miter != missing.get_items().end()); + assert(miter->second.need == i->version); + assert(miter->second.have == eversion_t()); + } + checked.insert(i->soid); + } else { + missing.add(i->soid, i->version, eversion_t(), i->is_delete()); + } + } + } + if (debug_verify_stored_missing) { + for (auto &&i: missing.get_items()) { + if (checked.count(i.first)) + continue; + if (i.first > info.last_backfill) { + ldpp_dout(dpp, -1) << __func__ << ": invalid missing set entry " + << "found before last_backfill: " + << i.first << " " << i.second + << " last_backfill = " << info.last_backfill + << dendl; + assert(0 == "invalid missing set entry found"); + } + bufferlist bv; + int r = store->getattr( + pg_coll, + ghobject_t(i.first, ghobject_t::NO_GEN, info.pgid.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + assert(oi.version == i.second.have); + } else { + assert(i.second.is_delete() || eversion_t() == i.second.have); + } + } + } else { + assert(must_rebuild); + for (map::reverse_iterator i = + divergent_priors.rbegin(); + i != divergent_priors.rend(); + ++i) { + if (i->first <= info.last_complete) break; + if (i->second > info.last_backfill) + continue; + if (did.count(i->second)) continue; + did.insert(i->second); + bufferlist bv; + int r = store->getattr( + pg_coll, + ghobject_t(i->second, ghobject_t::NO_GEN, info.pgid.shard), + OI_ATTR, + bv); + if (r >= 0) { + object_info_t oi(bv); + /** + * 1) we see this entry in the divergent priors mapping + * 2) we didn't see an entry for this object in the log + * + * From 1 & 2 we know that either the object does not exist + * or it is at the version specified in the divergent_priors + * map since the object would have been deleted atomically + * with the addition of the divergent_priors entry, an older + * version would not have been recovered, and a newer version + * would show up in the log above. + */ + /** + * Unfortunately the assessment above is incorrect because of + * http://tracker.ceph.com/issues/17916 (we were incorrectly + * not removing the divergent_priors set from disk state!), + * so let's check that. + */ + if (oi.version > i->first && tolerate_divergent_missing_log) { + ldpp_dout(dpp, 0) << "read_log divergent_priors entry (" << *i + << ") inconsistent with disk state (" << oi + << "), assuming it is tracker.ceph.com/issues/17916" + << dendl; + } else { + assert(oi.version == i->first); + } + } else { + ldpp_dout(dpp, 15) << "read_log_and_missing missing " << *i << dendl; + missing.add(i->second, i->first, eversion_t(), false); + } + } + } + if (clear_divergent_priors) + (*clear_divergent_priors) = true; + } + } + + if (!must_rebuild) { + if (clear_divergent_priors) + (*clear_divergent_priors) = false; + missing.flush(); + } + ldpp_dout(dpp, 10) << "read_log_and_missing done" << dendl; + } // static read_log_and_missing +}; // struct PGLog