1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2016 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #ifndef EXTENT_CACHE_H
16 #define EXTENT_CACHE_H
22 #include <boost/optional.hpp>
23 #include <boost/intrusive/set.hpp>
24 #include <boost/intrusive/list.hpp>
25 #include "include/interval_set.h"
26 #include "common/interval_map.h"
27 #include "include/buffer.h"
28 #include "common/hobject.h"
33 The main purpose of this cache is to ensure that we can pipeline
34 overlapping partial overwrites.
36 To that end we need to ensure that an extent pinned for an operation is
37 live until that operation completes. However, a particular extent
38 might be pinned by multiple operations (several pipelined writes
41 1) When we complete an operation, we only look at extents owned only
43 2) Per-extent overhead is fixed size.
44 3) Per-operation metadata is fixed size.
46 This is simple enough to realize with two main structures:
47 - extent: contains a pointer to the pin owning it and intrusive list
48 pointers to other extents owned by the same pin
49 - pin_state: contains the list head for extents owned by it
51 This works as long as we only need to remember one "owner" for
52 each extent. To make this work, we'll need to leverage some
53 invariants guaranteed by higher layers:
55 1) Writes on a particular object must be ordered
56 2) A particular object will have outstanding reads or writes, but not
57 both (note that you can have a read while a write is committed, but
60 Our strategy therefore will be to have whichever in-progress op will
61 finish "last" be the owner of a particular extent. For now, we won't
62 cache reads, so 2) simply means that we can assume that reads and
63 recovery operations imply no unstable extents on the object in
66 Write: WaitRead -> WaitCommit -> Complete
68 Invariant 1) above actually indicates that we can't have writes
69 bypassing the WaitRead state while there are writes waiting on
70 Reads. Thus, the set of operations pinning a particular extent
71 must always complete in order of arrival.
73 This suggests that a particular extent may be in only the following
77 0) Empty (not in the map at all)
79 - Some write with reqid <= N is currently fetching the data for
81 - The extent must persist until Write reqid N completes
82 - All ops pinning this extent are writes in the WaitRead state of
83 the Write pipeline (there must be an in progress write, so no
84 reads can be in progress).
86 - This extent has data corresponding to some reqid M <= N
87 - The extent must persist until Write reqid N commits
88 - All ops pinning this extent are writes in some Write
89 state (all are possible). Reads are not possible
90 in this state (or the others) due to 2).
92 All of the above suggests that there are 3 things users can
93 ask of the cache corresponding to the 3 Write pipelines
97 /// If someone wants these types, but not ExtentCache, move to another file
98 struct bl_split_merge {
102 bufferlist &bl) const {
104 out.substr_of(bl, offset, length);
107 bool can_merge(const bufferlist &left, const bufferlist &right) const {
110 bufferlist merge(bufferlist &&left, bufferlist &&right) const {
113 bl.claim_append(right);
116 uint64_t length(const bufferlist &b) const { return b.length(); }
118 using extent_set = interval_set<uint64_t>;
119 using extent_map = interval_map<uint64_t, bufferlist, bl_split_merge>;
122 struct object_extent_set;
127 object_extent_set *parent_extent_set = nullptr;
128 pin_state *parent_pin_state = nullptr;
129 boost::intrusive::set_member_hook<> extent_set_member;
130 boost::intrusive::list_member_hook<> pin_list_member;
134 boost::optional<bufferlist> bl;
136 uint64_t get_length() const {
140 bool is_pending() const {
141 return bl == boost::none;
144 bool pinned_by_write() const {
145 assert(parent_pin_state);
146 return parent_pin_state->is_write();
149 uint64_t pin_tid() const {
150 assert(parent_pin_state);
151 return parent_pin_state->tid;
154 extent(uint64_t offset, bufferlist _bl)
155 : offset(offset), length(_bl.length()), bl(_bl) {}
157 extent(uint64_t offset, uint64_t length)
158 : offset(offset), length(length) {}
160 bool operator<(const extent &rhs) const {
161 return offset < rhs.offset;
164 // can briefly violate the two link invariant, used in unlink() and move()
165 void _link_pin_state(pin_state &pin_state);
166 void _unlink_pin_state();
169 void link(object_extent_set &parent_extent_set, pin_state &pin_state);
170 void move(pin_state &to);
173 struct object_extent_set : boost::intrusive::set_base_hook<> {
175 object_extent_set(const hobject_t &oid) : oid(oid) {}
177 using set_member_options = boost::intrusive::member_hook<
179 boost::intrusive::set_member_hook<>,
180 &extent::extent_set_member>;
181 using set = boost::intrusive::set<extent, set_member_options>;
184 bool operator<(const object_extent_set &rhs) const {
185 return oid < rhs.oid;
189 bool operator()(uint64_t lhs, const extent &rhs) const {
190 return lhs < rhs.offset;
192 bool operator()(const extent &lhs, uint64_t rhs) const {
193 return lhs.offset < rhs;
196 std::pair<set::iterator, set::iterator> get_containing_range(
197 uint64_t offset, uint64_t length);
199 void erase(uint64_t offset, uint64_t length);
201 struct update_action {
207 boost::optional<bufferlist> bl;
209 template <typename F>
210 void traverse_update(
215 auto range = get_containing_range(offset, length);
217 if (range.first == range.second || range.first->offset > offset) {
218 uint64_t extlen = range.first == range.second ?
219 length : range.first->offset - offset;
221 update_action action;
222 f(offset, extlen, nullptr, &action);
223 assert(!action.bl || action.bl->length() == extlen);
224 if (action.action == update_action::UPDATE_PIN) {
225 extent *ext = action.bl ?
226 new extent(offset, *action.bl) :
227 new extent(offset, extlen);
228 ext->link(*this, pin);
234 for (auto p = range.first; p != range.second;) {
238 uint64_t extoff = MAX(ext->offset, offset);
239 uint64_t extlen = MIN(
240 ext->length - (extoff - ext->offset),
241 offset + length - extoff);
243 update_action action;
244 f(extoff, extlen, ext, &action);
245 assert(!action.bl || action.bl->length() == extlen);
246 extent *final_extent = nullptr;
247 if (action.action == update_action::NONE) {
250 pin_state *ps = ext->parent_pin_state;
252 if ((ext->offset < offset) &&
253 (ext->offset + ext->get_length() > offset)) {
254 extent *head = nullptr;
260 offset - ext->offset);
261 head = new extent(ext->offset, bl);
264 ext->offset, offset - ext->offset);
266 head->link(*this, *ps);
268 if ((ext->offset + ext->length > offset + length) &&
269 (offset + length > ext->offset)) {
271 (ext->offset + ext->get_length()) - (offset + length);
272 extent *tail = nullptr;
277 ext->get_length() - nlen,
279 tail = new extent(offset + length, bl);
281 tail = new extent(offset + length, nlen);
283 tail->link(*this, *ps);
285 if (action.action == update_action::UPDATE_PIN) {
290 extoff - ext->offset,
292 final_extent = new ExtentCache::extent(
296 final_extent = new ExtentCache::extent(
299 final_extent->link(*this, pin);
305 assert(final_extent);
306 assert(final_extent->length == action.bl->length());
307 final_extent->bl = *(action.bl);
310 uint64_t next_off = p == range.second ?
311 offset + length : p->offset;
312 if (extoff + extlen < next_off) {
313 uint64_t tailoff = extoff + extlen;
314 uint64_t taillen = next_off - tailoff;
316 update_action action;
317 f(tailoff, taillen, nullptr, &action);
318 assert(!action.bl || action.bl->length() == taillen);
319 if (action.action == update_action::UPDATE_PIN) {
320 extent *ext = action.bl ?
321 new extent(tailoff, *action.bl) :
322 new extent(tailoff, taillen);
323 ext->link(*this, pin);
332 bool operator()(const hobject_t &oid, const object_extent_set &rhs) const {
333 return oid < rhs.oid;
335 bool operator()(const object_extent_set &lhs, const hobject_t &oid) const {
336 return lhs.oid < oid;
340 object_extent_set &get_or_create(const hobject_t &oid);
341 object_extent_set *get_if_exists(const hobject_t &oid);
343 void remove_and_destroy_if_empty(object_extent_set &set);
344 using cache_set = boost::intrusive::set<object_extent_set>;
345 cache_set per_object_caches;
347 uint64_t next_write_tid = 1;
348 uint64_t next_read_tid = 1;
355 pin_type_t pin_type = NONE;
356 bool is_write() const { return pin_type == WRITE; }
358 pin_state(const pin_state &other) = delete;
359 pin_state &operator=(const pin_state &other) = delete;
360 pin_state(pin_state &&other) = delete;
361 pin_state() = default;
363 using list_member_options = boost::intrusive::member_hook<
365 boost::intrusive::list_member_hook<>,
366 &extent::pin_list_member>;
367 using list = boost::intrusive::list<extent, list_member_options>;
370 assert(pin_list.empty());
372 assert(pin_type == NONE);
374 void _open(uint64_t in_tid, pin_type_t in_type) {
375 assert(pin_type == NONE);
382 void release_pin(pin_state &p) {
383 for (auto iter = p.pin_list.begin(); iter != p.pin_list.end(); ) {
384 unique_ptr<extent> extent(&*iter); // we now own this
385 iter++; // unlink will invalidate
386 assert(extent->parent_extent_set);
387 auto &eset = *(extent->parent_extent_set);
389 remove_and_destroy_if_empty(eset);
392 p.pin_type = pin_state::NONE;
396 class write_pin : private pin_state {
397 friend class ExtentCache;
399 void open(uint64_t in_tid) {
400 _open(in_tid, pin_state::WRITE);
403 write_pin() : pin_state() {}
406 void open_write_pin(write_pin &pin) {
407 pin.open(next_write_tid++);
411 * Reserves extents required for rmw, and learn
412 * which need to be read
414 * Pins all extents in to_write. Returns subset of to_read not
415 * currently present in the cache. Caller must obtain those
416 * extents before calling get_remaining_extents_for_rmw.
419 * - Empty -> Write Pending pin.reqid
420 * - Write Pending N -> Write Pending pin.reqid
421 * - Write Pinned N -> Write Pinned pin.reqid
423 * @param oid [in] object undergoing rmw
424 * @param pin [in,out] pin to use (obtained from create_write_pin)
425 * @param to_write [in] extents which will be written
426 * @param to_read [in] extents to read prior to write (must be subset
428 * @return subset of to_read which isn't already present or pending
430 extent_set reserve_extents_for_rmw(
431 const hobject_t &oid,
433 const extent_set &to_write,
434 const extent_set &to_read);
437 * Gets extents required for rmw not returned from
438 * reserve_extents_for_rmw
440 * Requested extents (to_get) must be the set to_read \ the set
441 * returned from reserve_extents_for_rmw. No transition table,
442 * all extents at this point must be present and already pinned
443 * for this pin by reserve_extents_for_rmw.
445 * @param oid [in] object
446 * @param pin [in,out] pin associated with this IO
447 * @param to_get [in] extents to get (see above for restrictions)
448 * @return map of buffers from to_get
450 extent_map get_remaining_extents_for_rmw(
451 const hobject_t &oid,
453 const extent_set &to_get);
456 * Updates the cache to reflect the rmw write
458 * All presented extents must already have been specified in
459 * reserve_extents_for_rmw under to_write.
462 * - Empty -> invalid, must call reserve_extents_for_rmw first
463 * - Write Pending N -> Write Pinned N, update buffer
464 * (assert N >= pin.reqid)
465 * - Write Pinned N -> Update buffer (assert N >= pin.reqid)
467 * @param oid [in] object
468 * @param pin [in,out] pin associated with this IO
469 * @param extents [in] map of buffers to update
472 void present_rmw_update(
473 const hobject_t &oid,
475 const extent_map &extents);
478 * Release all buffers pinned by pin
480 void release_write_pin(
489 ostream &operator<<(ostream &lhs, const ExtentCache &cache);