+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2016 Red Hat
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef EXTENT_CACHE_H
-#define EXTENT_CACHE_H
-
-#include <map>
-#include <list>
-#include <vector>
-#include <utility>
-#include <boost/optional.hpp>
-#include <boost/intrusive/set.hpp>
-#include <boost/intrusive/list.hpp>
-#include "include/interval_set.h"
-#include "common/interval_map.h"
-#include "include/buffer.h"
-#include "common/hobject.h"
-
-/**
- ExtentCache
-
- The main purpose of this cache is to ensure that we can pipeline
- overlapping partial overwrites.
-
- To that end we need to ensure that an extent pinned for an operation is
- live until that operation completes. However, a particular extent
- might be pinned by multiple operations (several pipelined writes
- on the same object).
-
- 1) When we complete an operation, we only look at extents owned only
- by that operation.
- 2) Per-extent overhead is fixed size.
- 2) Per-operation metadata is fixed size.
-
- This is simple enough to realize with two main structures:
- - extent: contains a pointer to the pin owning it and intrusive list
- pointers to other extents owned by the same pin
- - pin_state: contains the list head for extents owned by it
-
- This works as long as we only need to remember one "owner" for
- each extent. To make this work, we'll need to leverage some
- invariants guaranteed by higher layers:
-
- 1) Writes on a particular object must be ordered
- 2) A particular object will have outstanding reads or writes, but not
- both (note that you can have a read while a write is committed, but
- not applied).
-
- Our strategy therefore will be to have whichever in-progress op will
- finish "last" be the owner of a particular extent. For now, we won't
- cache reads, so 2) simply means that we can assume that reads and
- recovery operations imply no unstable extents on the object in
- question.
-
- Write: WaitRead -> WaitCommit -> Complete
-
- Invariant 1) above actually indicates that we can't have writes
- bypassing the WaitRead state while there are writes waiting on
- Reads. Thus, the set of operations pinning a particular extent
- must always complete in order or arrival.
-
- This suggests that a particular extent may be in only the following
- states:
-
-
- 0) Empty (not in the map at all)
- 1) Write Pending N
- - Some write with reqid <= N is currently fetching the data for
- this extent
- - The extent must persist until Write reqid N completes
- - All ops pinning this extent are writes in the WaitRead state of
- the Write pipeline (there must be an in progress write, so no
- reads can be in progress).
- 2) Write Pinned N:
- - This extent has data corresponding to some reqid M <= N
- - The extent must persist until Write reqid N commits
- - All ops pinning this extent are writes in some Write
- state (all are possible). Reads are not possible
- in this state (or the others) due to 2).
-
- All of the above suggests that there are 3 things users can
- ask of the cache corresponding to the 3 Write pipelines
- states.
- */
-
-/// If someone wants these types, but not ExtentCache, move to another file
-struct bl_split_merge {
- bufferlist split(
- uint64_t offset,
- uint64_t length,
- bufferlist &bl) const {
- bufferlist out;
- out.substr_of(bl, offset, length);
- return out;
- }
- bool can_merge(const bufferlist &left, const bufferlist &right) const {
- return true;
- }
- bufferlist merge(bufferlist &&left, bufferlist &&right) const {
- bufferlist bl;
- bl.claim(left);
- bl.claim_append(right);
- return bl;
- }
- uint64_t length(const bufferlist &b) const { return b.length(); }
-};
-using extent_set = interval_set<uint64_t>;
-using extent_map = interval_map<uint64_t, bufferlist, bl_split_merge>;
-
-class ExtentCache {
- struct object_extent_set;
- struct pin_state;
-private:
-
- struct extent {
- object_extent_set *parent_extent_set = nullptr;
- pin_state *parent_pin_state = nullptr;
- boost::intrusive::set_member_hook<> extent_set_member;
- boost::intrusive::list_member_hook<> pin_list_member;
-
- uint64_t offset;
- uint64_t length;
- boost::optional<bufferlist> bl;
-
- uint64_t get_length() const {
- return length;
- }
-
- bool is_pending() const {
- return bl == boost::none;
- }
-
- bool pinned_by_write() const {
- assert(parent_pin_state);
- return parent_pin_state->is_write();
- }
-
- uint64_t pin_tid() const {
- assert(parent_pin_state);
- return parent_pin_state->tid;
- }
-
- extent(uint64_t offset, bufferlist _bl)
- : offset(offset), length(_bl.length()), bl(_bl) {}
-
- extent(uint64_t offset, uint64_t length)
- : offset(offset), length(length) {}
-
- bool operator<(const extent &rhs) const {
- return offset < rhs.offset;
- }
- private:
- // can briefly violate the two link invariant, used in unlink() and move()
- void _link_pin_state(pin_state &pin_state);
- void _unlink_pin_state();
- public:
- void unlink();
- void link(object_extent_set &parent_extent_set, pin_state &pin_state);
- void move(pin_state &to);
- };
-
- struct object_extent_set : boost::intrusive::set_base_hook<> {
- hobject_t oid;
- object_extent_set(const hobject_t &oid) : oid(oid) {}
-
- using set_member_options = boost::intrusive::member_hook<
- extent,
- boost::intrusive::set_member_hook<>,
- &extent::extent_set_member>;
- using set = boost::intrusive::set<extent, set_member_options>;
- set extent_set;
-
- bool operator<(const object_extent_set &rhs) const {
- return oid < rhs.oid;
- }
-
- struct uint_cmp {
- bool operator()(uint64_t lhs, const extent &rhs) const {
- return lhs < rhs.offset;
- }
- bool operator()(const extent &lhs, uint64_t rhs) const {
- return lhs.offset < rhs;
- }
- };
- std::pair<set::iterator, set::iterator> get_containing_range(
- uint64_t offset, uint64_t length);
-
- void erase(uint64_t offset, uint64_t length);
-
- struct update_action {
- enum type {
- NONE,
- UPDATE_PIN
- };
- type action = NONE;
- boost::optional<bufferlist> bl;
- };
- template <typename F>
- void traverse_update(
- pin_state &pin,
- uint64_t offset,
- uint64_t length,
- F &&f) {
- auto range = get_containing_range(offset, length);
-
- if (range.first == range.second || range.first->offset > offset) {
- uint64_t extlen = range.first == range.second ?
- length : range.first->offset - offset;
-
- update_action action;
- f(offset, extlen, nullptr, &action);
- assert(!action.bl || action.bl->length() == extlen);
- if (action.action == update_action::UPDATE_PIN) {
- extent *ext = action.bl ?
- new extent(offset, *action.bl) :
- new extent(offset, extlen);
- ext->link(*this, pin);
- } else {
- assert(!action.bl);
- }
- }
-
- for (auto p = range.first; p != range.second;) {
- extent *ext = &*p;
- ++p;
-
- uint64_t extoff = MAX(ext->offset, offset);
- uint64_t extlen = MIN(
- ext->length - (extoff - ext->offset),
- offset + length - extoff);
-
- update_action action;
- f(extoff, extlen, ext, &action);
- assert(!action.bl || action.bl->length() == extlen);
- extent *final_extent = nullptr;
- if (action.action == update_action::NONE) {
- final_extent = ext;
- } else {
- pin_state *ps = ext->parent_pin_state;
- ext->unlink();
- if ((ext->offset < offset) &&
- (ext->offset + ext->get_length() > offset)) {
- extent *head = nullptr;
- if (ext->bl) {
- bufferlist bl;
- bl.substr_of(
- *(ext->bl),
- 0,
- offset - ext->offset);
- head = new extent(ext->offset, bl);
- } else {
- head = new extent(
- ext->offset, offset - ext->offset);
- }
- head->link(*this, *ps);
- }
- if ((ext->offset + ext->length > offset + length) &&
- (offset + length > ext->offset)) {
- uint64_t nlen =
- (ext->offset + ext->get_length()) - (offset + length);
- extent *tail = nullptr;
- if (ext->bl) {
- bufferlist bl;
- bl.substr_of(
- *(ext->bl),
- ext->get_length() - nlen,
- nlen);
- tail = new extent(offset + length, bl);
- } else {
- tail = new extent(offset + length, nlen);
- }
- tail->link(*this, *ps);
- }
- if (action.action == update_action::UPDATE_PIN) {
- if (ext->bl) {
- bufferlist bl;
- bl.substr_of(
- *(ext->bl),
- extoff - ext->offset,
- extlen);
- final_extent = new ExtentCache::extent(
- extoff,
- bl);
- } else {
- final_extent = new ExtentCache::extent(
- extoff, extlen);
- }
- final_extent->link(*this, pin);
- }
- delete ext;
- }
-
- if (action.bl) {
- assert(final_extent);
- assert(final_extent->length == action.bl->length());
- final_extent->bl = *(action.bl);
- }
-
- uint64_t next_off = p == range.second ?
- offset + length : p->offset;
- if (extoff + extlen < next_off) {
- uint64_t tailoff = extoff + extlen;
- uint64_t taillen = next_off - tailoff;
-
- update_action action;
- f(tailoff, taillen, nullptr, &action);
- assert(!action.bl || action.bl->length() == taillen);
- if (action.action == update_action::UPDATE_PIN) {
- extent *ext = action.bl ?
- new extent(tailoff, *action.bl) :
- new extent(tailoff, taillen);
- ext->link(*this, pin);
- } else {
- assert(!action.bl);
- }
- }
- }
- }
- };
- struct Cmp {
- bool operator()(const hobject_t &oid, const object_extent_set &rhs) const {
- return oid < rhs.oid;
- }
- bool operator()(const object_extent_set &lhs, const hobject_t &oid) const {
- return lhs.oid < oid;
- }
- };
-
- object_extent_set &get_or_create(const hobject_t &oid);
- object_extent_set *get_if_exists(const hobject_t &oid);
-
- void remove_and_destroy_if_empty(object_extent_set &set);
- using cache_set = boost::intrusive::set<object_extent_set>;
- cache_set per_object_caches;
-
- uint64_t next_write_tid = 1;
- uint64_t next_read_tid = 1;
- struct pin_state {
- uint64_t tid = 0;
- enum pin_type_t {
- NONE,
- WRITE,
- };
- pin_type_t pin_type = NONE;
- bool is_write() const { return pin_type == WRITE; }
-
- pin_state(const pin_state &other) = delete;
- pin_state &operator=(const pin_state &other) = delete;
- pin_state(pin_state &&other) = delete;
- pin_state() = default;
-
- using list_member_options = boost::intrusive::member_hook<
- extent,
- boost::intrusive::list_member_hook<>,
- &extent::pin_list_member>;
- using list = boost::intrusive::list<extent, list_member_options>;
- list pin_list;
- ~pin_state() {
- assert(pin_list.empty());
- assert(tid == 0);
- assert(pin_type == NONE);
- }
- void _open(uint64_t in_tid, pin_type_t in_type) {
- assert(pin_type == NONE);
- assert(in_tid > 0);
- tid = in_tid;
- pin_type = in_type;
- }
- };
-
- void release_pin(pin_state &p) {
- for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) {
- unique_ptr<extent> extent(&*iter); // we now own this
- iter++; // unlink will invalidate
- assert(extent->parent_extent_set);
- auto &eset = *(extent->parent_extent_set);
- extent->unlink();
- remove_and_destroy_if_empty(eset);
- }
- p.tid = 0;
- p.pin_type = pin_state::NONE;
- }
-
-public:
- class write_pin : private pin_state {
- friend class ExtentCache;
- private:
- void open(uint64_t in_tid) {
- _open(in_tid, pin_state::WRITE);
- }
- public:
- write_pin() : pin_state() {}
- };
-
- void open_write_pin(write_pin &pin) {
- pin.open(next_write_tid++);
- }
-
- /**
- * Reserves extents required for rmw, and learn
- * which need to be read
- *
- * Pins all extents in to_write. Returns subset of to_read not
- * currently present in the cache. Caller must obtain those
- * extents before calling get_remaining_extents_for_rmw.
- *
- * Transition table:
- * - Empty -> Write Pending pin.reqid
- * - Write Pending N -> Write Pending pin.reqid
- * - Write Pinned N -> Write Pinned pin.reqid
- *
- * @param oid [in] object undergoing rmw
- * @param pin [in,out] pin to use (obtained from create_write_pin)
- * @param to_write [in] extents which will be written
- * @param to_read [in] extents to read prior to write (must be subset
- * of to_write)
- * @return subset of to_read which isn't already present or pending
- */
- extent_set reserve_extents_for_rmw(
- const hobject_t &oid,
- write_pin &pin,
- const extent_set &to_write,
- const extent_set &to_read);
-
- /**
- * Gets extents required for rmw not returned from
- * reserve_extents_for_rmw
- *
- * Requested extents (to_get) must be the set to_read \ the set
- * returned from reserve_extents_for_rmw. No transition table,
- * all extents at this point must be present and already pinned
- * for this pin by reserve_extents_for_rmw.
- *
- * @param oid [in] object
- * @param pin [in,out] pin associated with this IO
- * @param to_get [in] extents to get (see above for restrictions)
- * @return map of buffers from to_get
- */
- extent_map get_remaining_extents_for_rmw(
- const hobject_t &oid,
- write_pin &pin,
- const extent_set &to_get);
-
- /**
- * Updates the cache to reflect the rmw write
- *
- * All presented extents must already have been specified in
- * reserve_extents_for_rmw under to_write.
- *
- * Transition table:
- * - Empty -> invalid, must call reserve_extents_for_rmw first
- * - Write Pending N -> Write Pinned N, update buffer
- * (assert N >= pin.reqid)
- * - Write Pinned N -> Update buffer (assert N >= pin.reqid)
- *
- * @param oid [in] object
- * @param pin [in,out] pin associated with this IO
- * @param extents [in] map of buffers to update
- * @return void
- */
- void present_rmw_update(
- const hobject_t &oid,
- write_pin &pin,
- const extent_map &extents);
-
- /**
- * Release all buffers pinned by pin
- */
- void release_write_pin(
- write_pin &pin) {
- release_pin(pin);
- }
-
- ostream &print(
- ostream &out) const;
-};
-
-ostream &operator<<(ostream &lhs, const ExtentCache &cache);
-
-#endif