X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=src%2Fceph%2Fsrc%2Fosd%2FExtentCache.h;fp=src%2Fceph%2Fsrc%2Fosd%2FExtentCache.h;h=0000000000000000000000000000000000000000;hb=7da45d65be36d36b880cc55c5036e96c24b53f00;hp=4354ba6c7e6960163f9f50bb94dbc882364a9ba6;hpb=691462d09d0987b47e112d6ee8740375df3c51b2;p=stor4nfv.git diff --git a/src/ceph/src/osd/ExtentCache.h b/src/ceph/src/osd/ExtentCache.h deleted file mode 100644 index 4354ba6..0000000 --- a/src/ceph/src/osd/ExtentCache.h +++ /dev/null @@ -1,491 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2016 Red Hat - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef EXTENT_CACHE_H -#define EXTENT_CACHE_H - -#include -#include -#include -#include -#include -#include -#include -#include "include/interval_set.h" -#include "common/interval_map.h" -#include "include/buffer.h" -#include "common/hobject.h" - -/** - ExtentCache - - The main purpose of this cache is to ensure that we can pipeline - overlapping partial overwrites. - - To that end we need to ensure that an extent pinned for an operation is - live until that operation completes. However, a particular extent - might be pinned by multiple operations (several pipelined writes - on the same object). - - 1) When we complete an operation, we only look at extents owned only - by that operation. - 2) Per-extent overhead is fixed size. - 2) Per-operation metadata is fixed size. - - This is simple enough to realize with two main structures: - - extent: contains a pointer to the pin owning it and intrusive list - pointers to other extents owned by the same pin - - pin_state: contains the list head for extents owned by it - - This works as long as we only need to remember one "owner" for - each extent. To make this work, we'll need to leverage some - invariants guaranteed by higher layers: - - 1) Writes on a particular object must be ordered - 2) A particular object will have outstanding reads or writes, but not - both (note that you can have a read while a write is committed, but - not applied). - - Our strategy therefore will be to have whichever in-progress op will - finish "last" be the owner of a particular extent. For now, we won't - cache reads, so 2) simply means that we can assume that reads and - recovery operations imply no unstable extents on the object in - question. - - Write: WaitRead -> WaitCommit -> Complete - - Invariant 1) above actually indicates that we can't have writes - bypassing the WaitRead state while there are writes waiting on - Reads. Thus, the set of operations pinning a particular extent - must always complete in order or arrival. - - This suggests that a particular extent may be in only the following - states: - - - 0) Empty (not in the map at all) - 1) Write Pending N - - Some write with reqid <= N is currently fetching the data for - this extent - - The extent must persist until Write reqid N completes - - All ops pinning this extent are writes in the WaitRead state of - the Write pipeline (there must be an in progress write, so no - reads can be in progress). - 2) Write Pinned N: - - This extent has data corresponding to some reqid M <= N - - The extent must persist until Write reqid N commits - - All ops pinning this extent are writes in some Write - state (all are possible). Reads are not possible - in this state (or the others) due to 2). - - All of the above suggests that there are 3 things users can - ask of the cache corresponding to the 3 Write pipelines - states. - */ - -/// If someone wants these types, but not ExtentCache, move to another file -struct bl_split_merge { - bufferlist split( - uint64_t offset, - uint64_t length, - bufferlist &bl) const { - bufferlist out; - out.substr_of(bl, offset, length); - return out; - } - bool can_merge(const bufferlist &left, const bufferlist &right) const { - return true; - } - bufferlist merge(bufferlist &&left, bufferlist &&right) const { - bufferlist bl; - bl.claim(left); - bl.claim_append(right); - return bl; - } - uint64_t length(const bufferlist &b) const { return b.length(); } -}; -using extent_set = interval_set; -using extent_map = interval_map; - -class ExtentCache { - struct object_extent_set; - struct pin_state; -private: - - struct extent { - object_extent_set *parent_extent_set = nullptr; - pin_state *parent_pin_state = nullptr; - boost::intrusive::set_member_hook<> extent_set_member; - boost::intrusive::list_member_hook<> pin_list_member; - - uint64_t offset; - uint64_t length; - boost::optional bl; - - uint64_t get_length() const { - return length; - } - - bool is_pending() const { - return bl == boost::none; - } - - bool pinned_by_write() const { - assert(parent_pin_state); - return parent_pin_state->is_write(); - } - - uint64_t pin_tid() const { - assert(parent_pin_state); - return parent_pin_state->tid; - } - - extent(uint64_t offset, bufferlist _bl) - : offset(offset), length(_bl.length()), bl(_bl) {} - - extent(uint64_t offset, uint64_t length) - : offset(offset), length(length) {} - - bool operator<(const extent &rhs) const { - return offset < rhs.offset; - } - private: - // can briefly violate the two link invariant, used in unlink() and move() - void _link_pin_state(pin_state &pin_state); - void _unlink_pin_state(); - public: - void unlink(); - void link(object_extent_set &parent_extent_set, pin_state &pin_state); - void move(pin_state &to); - }; - - struct object_extent_set : boost::intrusive::set_base_hook<> { - hobject_t oid; - object_extent_set(const hobject_t &oid) : oid(oid) {} - - using set_member_options = boost::intrusive::member_hook< - extent, - boost::intrusive::set_member_hook<>, - &extent::extent_set_member>; - using set = boost::intrusive::set; - set extent_set; - - bool operator<(const object_extent_set &rhs) const { - return oid < rhs.oid; - } - - struct uint_cmp { - bool operator()(uint64_t lhs, const extent &rhs) const { - return lhs < rhs.offset; - } - bool operator()(const extent &lhs, uint64_t rhs) const { - return lhs.offset < rhs; - } - }; - std::pair get_containing_range( - uint64_t offset, uint64_t length); - - void erase(uint64_t offset, uint64_t length); - - struct update_action { - enum type { - NONE, - UPDATE_PIN - }; - type action = NONE; - boost::optional bl; - }; - template - void traverse_update( - pin_state &pin, - uint64_t offset, - uint64_t length, - F &&f) { - auto range = get_containing_range(offset, length); - - if (range.first == range.second || range.first->offset > offset) { - uint64_t extlen = range.first == range.second ? - length : range.first->offset - offset; - - update_action action; - f(offset, extlen, nullptr, &action); - assert(!action.bl || action.bl->length() == extlen); - if (action.action == update_action::UPDATE_PIN) { - extent *ext = action.bl ? - new extent(offset, *action.bl) : - new extent(offset, extlen); - ext->link(*this, pin); - } else { - assert(!action.bl); - } - } - - for (auto p = range.first; p != range.second;) { - extent *ext = &*p; - ++p; - - uint64_t extoff = MAX(ext->offset, offset); - uint64_t extlen = MIN( - ext->length - (extoff - ext->offset), - offset + length - extoff); - - update_action action; - f(extoff, extlen, ext, &action); - assert(!action.bl || action.bl->length() == extlen); - extent *final_extent = nullptr; - if (action.action == update_action::NONE) { - final_extent = ext; - } else { - pin_state *ps = ext->parent_pin_state; - ext->unlink(); - if ((ext->offset < offset) && - (ext->offset + ext->get_length() > offset)) { - extent *head = nullptr; - if (ext->bl) { - bufferlist bl; - bl.substr_of( - *(ext->bl), - 0, - offset - ext->offset); - head = new extent(ext->offset, bl); - } else { - head = new extent( - ext->offset, offset - ext->offset); - } - head->link(*this, *ps); - } - if ((ext->offset + ext->length > offset + length) && - (offset + length > ext->offset)) { - uint64_t nlen = - (ext->offset + ext->get_length()) - (offset + length); - extent *tail = nullptr; - if (ext->bl) { - bufferlist bl; - bl.substr_of( - *(ext->bl), - ext->get_length() - nlen, - nlen); - tail = new extent(offset + length, bl); - } else { - tail = new extent(offset + length, nlen); - } - tail->link(*this, *ps); - } - if (action.action == update_action::UPDATE_PIN) { - if (ext->bl) { - bufferlist bl; - bl.substr_of( - *(ext->bl), - extoff - ext->offset, - extlen); - final_extent = new ExtentCache::extent( - extoff, - bl); - } else { - final_extent = new ExtentCache::extent( - extoff, extlen); - } - final_extent->link(*this, pin); - } - delete ext; - } - - if (action.bl) { - assert(final_extent); - assert(final_extent->length == action.bl->length()); - final_extent->bl = *(action.bl); - } - - uint64_t next_off = p == range.second ? - offset + length : p->offset; - if (extoff + extlen < next_off) { - uint64_t tailoff = extoff + extlen; - uint64_t taillen = next_off - tailoff; - - update_action action; - f(tailoff, taillen, nullptr, &action); - assert(!action.bl || action.bl->length() == taillen); - if (action.action == update_action::UPDATE_PIN) { - extent *ext = action.bl ? - new extent(tailoff, *action.bl) : - new extent(tailoff, taillen); - ext->link(*this, pin); - } else { - assert(!action.bl); - } - } - } - } - }; - struct Cmp { - bool operator()(const hobject_t &oid, const object_extent_set &rhs) const { - return oid < rhs.oid; - } - bool operator()(const object_extent_set &lhs, const hobject_t &oid) const { - return lhs.oid < oid; - } - }; - - object_extent_set &get_or_create(const hobject_t &oid); - object_extent_set *get_if_exists(const hobject_t &oid); - - void remove_and_destroy_if_empty(object_extent_set &set); - using cache_set = boost::intrusive::set; - cache_set per_object_caches; - - uint64_t next_write_tid = 1; - uint64_t next_read_tid = 1; - struct pin_state { - uint64_t tid = 0; - enum pin_type_t { - NONE, - WRITE, - }; - pin_type_t pin_type = NONE; - bool is_write() const { return pin_type == WRITE; } - - pin_state(const pin_state &other) = delete; - pin_state &operator=(const pin_state &other) = delete; - pin_state(pin_state &&other) = delete; - pin_state() = default; - - using list_member_options = boost::intrusive::member_hook< - extent, - boost::intrusive::list_member_hook<>, - &extent::pin_list_member>; - using list = boost::intrusive::list; - list pin_list; - ~pin_state() { - assert(pin_list.empty()); - assert(tid == 0); - assert(pin_type == NONE); - } - void _open(uint64_t in_tid, pin_type_t in_type) { - assert(pin_type == NONE); - assert(in_tid > 0); - tid = in_tid; - pin_type = in_type; - } - }; - - void release_pin(pin_state &p) { - for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) { - unique_ptr extent(&*iter); // we now own this - iter++; // unlink will invalidate - assert(extent->parent_extent_set); - auto &eset = *(extent->parent_extent_set); - extent->unlink(); - remove_and_destroy_if_empty(eset); - } - p.tid = 0; - p.pin_type = pin_state::NONE; - } - -public: - class write_pin : private pin_state { - friend class ExtentCache; - private: - void open(uint64_t in_tid) { - _open(in_tid, pin_state::WRITE); - } - public: - write_pin() : pin_state() {} - }; - - void open_write_pin(write_pin &pin) { - pin.open(next_write_tid++); - } - - /** - * Reserves extents required for rmw, and learn - * which need to be read - * - * Pins all extents in to_write. Returns subset of to_read not - * currently present in the cache. Caller must obtain those - * extents before calling get_remaining_extents_for_rmw. - * - * Transition table: - * - Empty -> Write Pending pin.reqid - * - Write Pending N -> Write Pending pin.reqid - * - Write Pinned N -> Write Pinned pin.reqid - * - * @param oid [in] object undergoing rmw - * @param pin [in,out] pin to use (obtained from create_write_pin) - * @param to_write [in] extents which will be written - * @param to_read [in] extents to read prior to write (must be subset - * of to_write) - * @return subset of to_read which isn't already present or pending - */ - extent_set reserve_extents_for_rmw( - const hobject_t &oid, - write_pin &pin, - const extent_set &to_write, - const extent_set &to_read); - - /** - * Gets extents required for rmw not returned from - * reserve_extents_for_rmw - * - * Requested extents (to_get) must be the set to_read \ the set - * returned from reserve_extents_for_rmw. No transition table, - * all extents at this point must be present and already pinned - * for this pin by reserve_extents_for_rmw. - * - * @param oid [in] object - * @param pin [in,out] pin associated with this IO - * @param to_get [in] extents to get (see above for restrictions) - * @return map of buffers from to_get - */ - extent_map get_remaining_extents_for_rmw( - const hobject_t &oid, - write_pin &pin, - const extent_set &to_get); - - /** - * Updates the cache to reflect the rmw write - * - * All presented extents must already have been specified in - * reserve_extents_for_rmw under to_write. - * - * Transition table: - * - Empty -> invalid, must call reserve_extents_for_rmw first - * - Write Pending N -> Write Pinned N, update buffer - * (assert N >= pin.reqid) - * - Write Pinned N -> Update buffer (assert N >= pin.reqid) - * - * @param oid [in] object - * @param pin [in,out] pin associated with this IO - * @param extents [in] map of buffers to update - * @return void - */ - void present_rmw_update( - const hobject_t &oid, - write_pin &pin, - const extent_map &extents); - - /** - * Release all buffers pinned by pin - */ - void release_write_pin( - write_pin &pin) { - release_pin(pin); - } - - ostream &print( - ostream &out) const; -}; - -ostream &operator<<(ostream &lhs, const ExtentCache &cache); - -#endif