1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2016 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
14 #ifndef PGTRANSACTION_H
15 #define PGTRANSACTION_H
19 #include <boost/optional.hpp>
21 #include "common/hobject.h"
22 #include "osd/osd_types.h"
23 #include "osd/osd_internal_types.h"
24 #include "common/interval_map.h"
25 #include "common/inline_variant.h"
28 * This class represents transactions which can be submitted to
29 * a PGBackend. For expediency, there are some constraints on
30 * the operations submitted:
31 * 1) Rename sources may only be referenced prior to the rename
32 * operation to the destination.
33 * 2) The graph formed by edges of source->destination for clones
34 * (Create) and Renames must be acyclic.
35 * 3) clone_range sources must not be modified by the same
40 map<hobject_t, ObjectContextRef> obc_map;
42 class ObjectOperation {
52 hobject_t source; // must be temp object
55 using InitType = boost::variant<
61 InitType init_type = Init::None();
62 bool delete_first = false;
65 * is_none() && is_delete() indicates that we are deleting an
66 * object which already exists and not recreating it. delete_first means
67 * that the transaction logically removes the object.
69 * There are really 4 cases:
71 * 1) We are modifying an existing object (is_none() &&
73 * a) If it's an append, we just write into the log entry the old size
74 * b) If it's an actual overwrite, we save the old versions of the
75 * extents being overwritten and write those offsets into the log
77 * 2) We are removing and then recreating an object (!is_none() && is_delete())
79 * 3) We are removing an object (is_none() && is_delete()) -- stash
80 * 4) We are creating an object (!is_none() && !is_delete()) -- create (no
83 * Create, Clone, Rename are the three ways we can recreate it.
84 * ECBackend transaction planning needs this context
85 * to figure out how to perform the transaction.
87 bool deletes_first() const {
90 bool is_delete() const {
91 return boost::get<Init::None>(&init_type) != nullptr && delete_first;
93 bool is_none() const {
94 return boost::get<Init::None>(&init_type) != nullptr && !delete_first;
96 bool is_fresh_object() const {
97 return boost::get<Init::None>(&init_type) == nullptr;
99 bool is_rename() const {
100 return boost::get<Init::Rename>(&init_type) != nullptr;
102 bool has_source(hobject_t *source = nullptr) const {
105 [&](const Init::Clone &op) -> bool {
110 [&](const Init::Rename &op) -> bool {
115 [&](const Init::None &) -> bool { return false; },
116 [&](const Init::Create &) -> bool { return false; });
119 bool clear_omap = false;
125 * truncate is represented as a pair because in the event of
126 * multiple truncates within a single transaction we need to
127 * remember the lowest truncate and the final object size
128 * (the last truncate). We also adjust the buffers map
129 * to account for truncates overriding previous writes */
130 boost::optional<pair<uint64_t, uint64_t> > truncate = boost::none;
132 std::map<string, boost::optional<bufferlist> > attr_updates;
134 enum class OmapUpdateType {Remove, Insert};
135 std::vector<std::pair<OmapUpdateType, bufferlist> > omap_updates;
137 boost::optional<bufferlist> omap_header;
139 /// (old, new) -- only valid with no truncate or buffer updates
140 boost::optional<pair<set<snapid_t>, set<snapid_t> > > updated_snaps;
142 struct alloc_hint_t {
143 uint64_t expected_object_size;
144 uint64_t expected_write_size;
147 boost::optional<alloc_hint_t> alloc_hint;
149 struct BufferUpdate {
152 uint32_t fadvise_flags;
163 using BufferUpdateType = boost::variant<
166 BufferUpdate::CloneRange>;
170 BufferUpdateType split(
173 const BufferUpdateType &bu) const {
176 [&](const BufferUpdate::Write &w) -> BufferUpdateType {
178 bl.substr_of(w.buffer, offset, len);
179 return BufferUpdate::Write{bl, w.fadvise_flags};
181 [&](const BufferUpdate::Zero &) -> BufferUpdateType {
182 return BufferUpdate::Zero{len};
184 [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType {
185 return BufferUpdate::CloneRange{c.from, c.offset + offset, len};
189 const BufferUpdateType &left) const {
192 [&](const BufferUpdate::Write &w) -> uint64_t {
193 return w.buffer.length();
195 [&](const BufferUpdate::Zero &z) -> uint64_t {
198 [&](const BufferUpdate::CloneRange &c) -> uint64_t {
203 const BufferUpdateType &left,
204 const BufferUpdateType &right) const {
207 [&](const BufferUpdate::Write &w) -> bool {
208 auto r = boost::get<BufferUpdate::Write>(&right);
209 return r != nullptr && (w.fadvise_flags == r->fadvise_flags);
211 [&](const BufferUpdate::Zero &) -> bool {
212 auto r = boost::get<BufferUpdate::Zero>(&right);
215 [&](const BufferUpdate::CloneRange &c) -> bool {
219 BufferUpdateType merge(
220 BufferUpdateType &&left,
221 BufferUpdateType &&right) const {
224 [&](const BufferUpdate::Write &w) -> BufferUpdateType {
225 auto r = boost::get<BufferUpdate::Write>(&right);
226 assert(r && w.fadvise_flags == r->fadvise_flags);
227 bufferlist bl = w.buffer;
228 bl.append(r->buffer);
229 return BufferUpdate::Write{bl, w.fadvise_flags};
231 [&](const BufferUpdate::Zero &z) -> BufferUpdateType {
232 auto r = boost::get<BufferUpdate::Zero>(&right);
234 return BufferUpdate::Zero{z.len + r->len};
236 [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType {
237 assert(0 == "violates can_merge condition");
243 using buffer_update_type = interval_map<
244 uint64_t, BufferUpdateType, SplitMerger>;
245 buffer_update_type buffer_updates;
247 friend class PGTransaction;
249 map<hobject_t, ObjectOperation> op_map;
251 ObjectOperation &get_object_op_for_modify(const hobject_t &hoid) {
252 auto &op = op_map[hoid];
253 assert(!op.is_delete());
256 ObjectOperation &get_object_op(const hobject_t &hoid) {
261 ObjectContextRef obc) {
263 obc_map[obc->obs.oi.soid] = obc;
265 /// Sets up state for new object
267 const hobject_t &hoid
269 auto &op = op_map[hoid];
270 assert(op.is_none() || op.is_delete());
271 op.init_type = ObjectOperation::Init::Create();
274 /// Sets up state for target cloned from source
276 const hobject_t &target, ///< [in] obj to clone to
277 const hobject_t &source ///< [in] obj to clone from
279 auto &op = op_map[target];
280 assert(op.is_none() || op.is_delete());
281 op.init_type = ObjectOperation::Init::Clone{source};
284 /// Sets up state for target renamed from source
286 const hobject_t &target, ///< [in] source (must be a temp object)
287 const hobject_t &source ///< [in] to, must not exist, be non-temp
289 assert(source.is_temp());
290 assert(!target.is_temp());
291 auto &op = op_map[target];
292 assert(op.is_none() || op.is_delete());
294 bool del_first = op.is_delete();
295 auto iter = op_map.find(source);
296 if (iter != op_map.end()) {
299 op.delete_first = del_first;
302 op.init_type = ObjectOperation::Init::Rename{source};
305 /// Remove -- must not be called on rename target
307 const hobject_t &hoid ///< [in] obj to remove
309 auto &op = get_object_op_for_modify(hoid);
310 if (!op.is_fresh_object()) {
311 assert(!op.updated_snaps);
312 op = ObjectOperation();
313 op.delete_first = true;
315 assert(!op.is_rename());
316 op_map.erase(hoid); // make it a noop if it's a fresh object
321 const hobject_t &hoid, ///< [in] object for snaps
322 const set<snapid_t> &old_snaps,///< [in] old snaps value
323 const set<snapid_t> &new_snaps ///< [in] new snaps value
325 auto &op = get_object_op(hoid);
326 assert(!op.updated_snaps);
327 assert(op.buffer_updates.empty());
328 assert(!op.truncate);
329 op.updated_snaps = make_pair(
334 /// Clears, truncates
336 const hobject_t &hoid ///< [in] object to clear omap
338 auto &op = get_object_op_for_modify(hoid);
339 op.clear_omap = true;
340 op.omap_updates.clear();
341 op.omap_header = boost::none;
344 const hobject_t &hoid, ///< [in] object
345 uint64_t off ///< [in] offset to truncate to
347 auto &op = get_object_op_for_modify(hoid);
348 assert(!op.updated_snaps);
349 op.buffer_updates.erase(
351 std::numeric_limits<uint64_t>::max() - off);
352 if (!op.truncate || off < op.truncate->first) {
353 op.truncate = std::pair<uint64_t, uint64_t>(off, off);
355 op.truncate->second = off;
361 const hobject_t &hoid, ///< [in] object to write
362 map<string, bufferlist> &attrs ///< [in] attrs, may be cleared
364 auto &op = get_object_op_for_modify(hoid);
365 for (auto &&i: attrs) {
366 op.attr_updates[i.first] = i.second;
370 const hobject_t &hoid, ///< [in] object to write
371 const string &attrname, ///< [in] attr to write
372 bufferlist &bl ///< [in] val to write, may be claimed
374 auto &op = get_object_op_for_modify(hoid);
375 op.attr_updates[attrname] = bl;
378 const hobject_t &hoid, ///< [in] object to write
379 const string &attrname ///< [in] attr to remove
381 auto &op = get_object_op_for_modify(hoid);
382 op.attr_updates[attrname] = boost::none;
387 const hobject_t &hoid, ///< [in] object (must exist)
388 uint64_t expected_object_size, ///< [in]
389 uint64_t expected_write_size,
392 auto &op = get_object_op_for_modify(hoid);
393 op.alloc_hint = ObjectOperation::alloc_hint_t{
394 expected_object_size, expected_write_size, flags};
399 const hobject_t &hoid, ///< [in] object to write
400 uint64_t off, ///< [in] off at which to write
401 uint64_t len, ///< [in] len to write from bl
402 bufferlist &bl, ///< [in] bl to write will be claimed to len
403 uint32_t fadvise_flags = 0 ///< [in] fadvise hint
405 auto &op = get_object_op_for_modify(hoid);
406 assert(!op.updated_snaps);
408 assert(len == bl.length());
409 op.buffer_updates.insert(
412 ObjectOperation::BufferUpdate::Write{bl, fadvise_flags});
415 const hobject_t &from, ///< [in] from
416 const hobject_t &to, ///< [in] to
417 uint64_t fromoff, ///< [in] offset
418 uint64_t len, ///< [in] len
419 uint64_t tooff ///< [in] offset
421 auto &op = get_object_op_for_modify(to);
422 assert(!op.updated_snaps);
423 op.buffer_updates.insert(
426 ObjectOperation::BufferUpdate::CloneRange{from, fromoff, len});
429 const hobject_t &hoid, ///< [in] object
430 uint64_t off, ///< [in] offset to start zeroing at
431 uint64_t len ///< [in] amount to zero
433 auto &op = get_object_op_for_modify(hoid);
434 assert(!op.updated_snaps);
435 op.buffer_updates.insert(
438 ObjectOperation::BufferUpdate::Zero{len});
443 const hobject_t &hoid, ///< [in] object to write
444 bufferlist &keys_bl ///< [in] encoded map<string, bufferlist>
446 auto &op = get_object_op_for_modify(hoid);
447 op.omap_updates.emplace_back(
449 ObjectOperation::OmapUpdateType::Insert,
453 const hobject_t &hoid, ///< [in] object to write
454 map<string, bufferlist> &keys ///< [in] omap keys, may be cleared
458 omap_setkeys(hoid, bl);
461 const hobject_t &hoid, ///< [in] object to write
462 bufferlist &keys_bl ///< [in] encode set<string>
464 auto &op = get_object_op_for_modify(hoid);
465 op.omap_updates.emplace_back(
467 ObjectOperation::OmapUpdateType::Remove,
471 const hobject_t &hoid, ///< [in] object to write
472 set<string> &keys ///< [in] omap keys, may be cleared
476 omap_rmkeys(hoid, bl);
479 const hobject_t &hoid, ///< [in] object to write
480 bufferlist &header ///< [in] header
482 auto &op = get_object_op_for_modify(hoid);
483 op.omap_header = header;
487 return op_map.empty();
490 uint64_t get_bytes_written() const {
492 for (auto &&i: op_map) {
493 for (auto &&j: i.second.buffer_updates) {
501 const hobject_t &hoid ///< [in] obj to which we are doing nothing
503 get_object_op_for_modify(hoid);
506 /* Calls t() on all pair<hobject_t, ObjectOperation> & such that clone/rename
507 * sinks are always called before clone sources
509 * TODO: add a fast path for the single object case and possibly the single
510 * object clone from source case (make_writeable made a clone).
512 * This structure only requires that the source->sink graph be acyclic.
513 * This is much more general than is actually required by PrimaryLogPG.
514 * Only 4 flavors of multi-object transactions actually happen:
515 * 1) rename temp -> object for copyfrom
516 * 2) clone head -> clone, modify head for make_writeable on normal head write
517 * 3) clone clone -> head for rollback
520 * We can bypass the below logic for single object transactions trivially
521 * (including case 1 above since temp doesn't show up again).
522 * For 2-3, we could add something ad-hoc to ensure that they happen in the
523 * right order, but it actually seems easier to just do the graph construction.
525 template <typename T>
526 void safe_create_traverse(T &&t) {
527 map<hobject_t, list<hobject_t>> dgraph;
528 list<hobject_t> stack;
530 // Populate stack with roots, dgraph with edges
531 for (auto &&opair: op_map) {
533 if (opair.second.has_source(&source)) {
534 auto &l = dgraph[source];
535 if (l.empty() && !op_map.count(source)) {
536 /* Source oids not in op_map need to be added as roots
537 * (but only once!) */
538 stack.push_back(source);
540 l.push_back(opair.first);
542 stack.push_back(opair.first);
546 /* Why don't we need to worry about accessing the same node
547 * twice? dgraph nodes always have in-degree at most 1 because
548 * the inverse graph nodes (source->dest) can have out-degree
549 * at most 1 (only one possible source). We do a post-order
550 * depth-first traversal here to ensure we call f on children
553 while (!stack.empty()) {
554 hobject_t &cur = stack.front();
555 auto diter = dgraph.find(cur);
556 if (diter == dgraph.end()) {
557 /* Leaf: pop and call t() */
558 auto opiter = op_map.find(cur);
559 if (opiter != op_map.end())
563 /* Internal node: push children onto stack, remove edge,
564 * recurse. When this node is encountered again, it'll
566 assert(!diter->second.empty());
567 stack.splice(stack.begin(), diter->second);
573 using PGTransactionUPtr = std::unique_ptr<PGTransaction>;