src/ceph/src/osd/PGTransaction.h

   1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
   2 // vim: ts=8 sw=2 smarttab
   3 /*
   4  * Ceph - scalable distributed file system
   5  *
   6  * Copyright (C) 2016 Red Hat
   7  *
   8  * This is free software; you can redistribute it and/or
   9  * modify it under the terms of the GNU Lesser General Public
  10  * License version 2.1, as published by the Free Software
  11  * Foundation.  See file COPYING.
  12  *
  13  */
  14 #ifndef PGTRANSACTION_H
  15 #define PGTRANSACTION_H
  16
  17 #include <map>
  18 #include <memory>
  19 #include <boost/optional.hpp>
  20
  21 #include "common/hobject.h"
  22 #include "osd/osd_types.h"
  23 #include "osd/osd_internal_types.h"
  24 #include "common/interval_map.h"
  25 #include "common/inline_variant.h"
  26
  27 /**
  28  * This class represents transactions which can be submitted to
  29  * a PGBackend.  For expediency, there are some constraints on
  30  * the operations submitted:
  31  * 1) Rename sources may only be referenced prior to the rename
  32  *    operation to the destination.
  33  * 2) The graph formed by edges of source->destination for clones
  34  *    (Create) and Renames must be acyclic.
  35  * 3) clone_range sources must not be modified by the same
  36  *    transaction
  37  */
  38 class PGTransaction {
  39 public:
  40   map<hobject_t, ObjectContextRef> obc_map;
  41
  42   class ObjectOperation {
  43   public:
  44     struct Init
  45     {
  46       struct None {};
  47       struct Create {};
  48       struct Clone {
  49         hobject_t source;
  50       };
  51       struct Rename {
  52         hobject_t source; // must be temp object
  53       };
  54     };
  55     using InitType = boost::variant<
  56       Init::None,
  57       Init::Create,
  58       Init::Clone,
  59       Init::Rename>;
  60
  61     InitType init_type = Init::None();
  62     bool delete_first = false;
  63
  64     /**
  65      * is_none() && is_delete() indicates that we are deleting an
  66      * object which already exists and not recreating it. delete_first means
  67      * that the transaction logically removes the object.
  68
  69      * There are really 4 cases:
  70
  71      * 1) We are modifying an existing object (is_none() &&
  72      *    !is_delete())
  73      *    a) If it's an append, we just write into the log entry the old size
  74      *    b) If it's an actual overwrite, we save the old versions of the
  75      *       extents being overwritten and write those offsets into the log
  76      *       entry
  77      * 2) We are removing and then recreating an object (!is_none() && is_delete())
  78      *    -- stash
  79      * 3) We are removing an object (is_none() && is_delete()) -- stash
  80      * 4) We are creating an object (!is_none() && !is_delete()) -- create (no
  81      *    stash)
  82      *
  83      * Create, Clone, Rename are the three ways we can recreate it.
  84      * ECBackend transaction planning needs this context
  85      * to figure out how to perform the transaction.
  86      */
  87     bool deletes_first() const {
  88       return delete_first;
  89     }
  90     bool is_delete() const {
  91       return boost::get<Init::None>(&init_type) != nullptr && delete_first;
  92     }
  93     bool is_none() const {
  94       return boost::get<Init::None>(&init_type) != nullptr && !delete_first;
  95     }
  96     bool is_fresh_object() const {
  97       return boost::get<Init::None>(&init_type) == nullptr;
  98     }
  99     bool is_rename() const {
 100       return boost::get<Init::Rename>(&init_type) != nullptr;
 101     }
 102     bool has_source(hobject_t *source = nullptr) const {
 103       return match(
 104         init_type,
 105         [&](const Init::Clone &op) -> bool {
 106           if (source)
 107             *source = op.source;
 108           return true;
 109         },
 110         [&](const Init::Rename &op) -> bool {
 111           if (source)
 112             *source = op.source;
 113           return true;
 114         },
 115         [&](const Init::None &) -> bool { return false; },
 116         [&](const Init::Create &) -> bool { return false; });
 117     }
 118
 119     bool clear_omap = false;
 120
 121     /**
 122      * truncate
 123      * <lowest, last> ?
 124      *
 125      * truncate is represented as a pair because in the event of
 126      * multiple truncates within a single transaction we need to
 127      * remember the lowest truncate and the final object size
 128      * (the last truncate).  We also adjust the buffers map
 129      * to account for truncates overriding previous writes */
 130     boost::optional<pair<uint64_t, uint64_t> > truncate = boost::none;
 131
 132     std::map<string, boost::optional<bufferlist> > attr_updates;
 133
 134     enum class OmapUpdateType {Remove, Insert};
 135     std::vector<std::pair<OmapUpdateType, bufferlist> > omap_updates;
 136
 137     boost::optional<bufferlist> omap_header;
 138
 139     /// (old, new) -- only valid with no truncate or buffer updates
 140     boost::optional<pair<set<snapid_t>, set<snapid_t> > > updated_snaps;
 141
 142     struct alloc_hint_t {
 143       uint64_t expected_object_size;
 144       uint64_t expected_write_size;
 145       uint32_t flags;
 146     };
 147     boost::optional<alloc_hint_t> alloc_hint;
 148
 149     struct BufferUpdate {
 150       struct Write {
 151         bufferlist buffer;
 152         uint32_t fadvise_flags;
 153       };
 154       struct Zero {
 155         uint64_t len;
 156       };
 157       struct CloneRange {
 158         hobject_t from;
 159         uint64_t offset;
 160         uint64_t len;
 161       };
 162     };
 163     using BufferUpdateType = boost::variant<
 164       BufferUpdate::Write,
 165       BufferUpdate::Zero,
 166       BufferUpdate::CloneRange>;
 167
 168   private:
 169     struct SplitMerger {
 170       BufferUpdateType split(
 171         uint64_t offset,
 172         uint64_t len,
 173         const BufferUpdateType &bu) const {
 174         return match(
 175           bu,
 176           [&](const BufferUpdate::Write &w) -> BufferUpdateType {
 177             bufferlist bl;
 178             bl.substr_of(w.buffer, offset, len);
 179             return BufferUpdate::Write{bl, w.fadvise_flags};
 180           },
 181           [&](const BufferUpdate::Zero &) -> BufferUpdateType {
 182             return BufferUpdate::Zero{len};
 183           },
 184           [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType {
 185             return BufferUpdate::CloneRange{c.from, c.offset + offset, len};
 186           });
 187       }
 188       uint64_t length(
 189         const BufferUpdateType &left) const {
 190         return match(
 191           left,
 192           [&](const BufferUpdate::Write &w) -> uint64_t {
 193             return w.buffer.length();
 194           },
 195           [&](const BufferUpdate::Zero &z) -> uint64_t {
 196             return z.len;
 197           },
 198           [&](const BufferUpdate::CloneRange &c) -> uint64_t {
 199             return c.len;
 200           });
 201       }
 202       bool can_merge(
 203         const BufferUpdateType &left,
 204         const BufferUpdateType &right) const {
 205         return match(
 206           left,
 207           [&](const BufferUpdate::Write &w) -> bool {
 208             auto r = boost::get<BufferUpdate::Write>(&right);
 209             return r != nullptr && (w.fadvise_flags == r->fadvise_flags);
 210           },
 211           [&](const BufferUpdate::Zero &) -> bool {
 212             auto r = boost::get<BufferUpdate::Zero>(&right);
 213             return r != nullptr;
 214           },
 215           [&](const BufferUpdate::CloneRange &c) -> bool {
 216             return false;
 217           });
 218       }
 219       BufferUpdateType merge(
 220         BufferUpdateType &&left,
 221         BufferUpdateType &&right) const {
 222         return match(
 223           left,
 224           [&](const BufferUpdate::Write &w) -> BufferUpdateType {
 225             auto r = boost::get<BufferUpdate::Write>(&right);
 226             assert(r && w.fadvise_flags == r->fadvise_flags);
 227             bufferlist bl = w.buffer;
 228             bl.append(r->buffer);
 229             return BufferUpdate::Write{bl, w.fadvise_flags};
 230           },
 231           [&](const BufferUpdate::Zero &z) -> BufferUpdateType {
 232             auto r = boost::get<BufferUpdate::Zero>(&right);
 233             assert(r);
 234             return BufferUpdate::Zero{z.len + r->len};
 235           },
 236           [&](const BufferUpdate::CloneRange &c) -> BufferUpdateType {
 237             assert(0 == "violates can_merge condition");
 238             return left;
 239           });
 240       }
 241     };
 242   public:
 243     using buffer_update_type = interval_map<
 244       uint64_t, BufferUpdateType, SplitMerger>;
 245     buffer_update_type buffer_updates;
 246
 247     friend class PGTransaction;
 248   };
 249   map<hobject_t, ObjectOperation> op_map;
 250 private:
 251   ObjectOperation &get_object_op_for_modify(const hobject_t &hoid) {
 252     auto &op = op_map[hoid];
 253     assert(!op.is_delete());
 254     return op;
 255   }
 256   ObjectOperation &get_object_op(const hobject_t &hoid) {
 257     return op_map[hoid];
 258   }
 259 public:
 260   void add_obc(
 261     ObjectContextRef obc) {
 262     assert(obc);
 263     obc_map[obc->obs.oi.soid] = obc;
 264   }
 265   /// Sets up state for new object
 266   void create(
 267     const hobject_t &hoid
 268     ) {
 269     auto &op = op_map[hoid];
 270     assert(op.is_none() || op.is_delete());
 271     op.init_type = ObjectOperation::Init::Create();
 272   }
 273
 274   /// Sets up state for target cloned from source
 275   void clone(
 276     const hobject_t &target,       ///< [in] obj to clone to
 277     const hobject_t &source        ///< [in] obj to clone from
 278     ) {
 279     auto &op = op_map[target];
 280     assert(op.is_none() || op.is_delete());
 281     op.init_type = ObjectOperation::Init::Clone{source};
 282   }
 283
 284   /// Sets up state for target renamed from source
 285   void rename(
 286     const hobject_t &target,       ///< [in] source (must be a temp object)
 287     const hobject_t &source        ///< [in] to, must not exist, be non-temp
 288     ) {
 289     assert(source.is_temp());
 290     assert(!target.is_temp());
 291     auto &op = op_map[target];
 292     assert(op.is_none() || op.is_delete());
 293
 294     bool del_first = op.is_delete();
 295     auto iter = op_map.find(source);
 296     if (iter != op_map.end()) {
 297       op = iter->second;
 298       op_map.erase(iter);
 299       op.delete_first = del_first;
 300     }
 301
 302     op.init_type = ObjectOperation::Init::Rename{source};
 303   }
 304
 305   /// Remove -- must not be called on rename target
 306   void remove(
 307     const hobject_t &hoid          ///< [in] obj to remove
 308     ) {
 309     auto &op = get_object_op_for_modify(hoid);
 310     if (!op.is_fresh_object()) {
 311       assert(!op.updated_snaps);
 312       op = ObjectOperation();
 313       op.delete_first = true;
 314     } else {
 315       assert(!op.is_rename());
 316       op_map.erase(hoid); // make it a noop if it's a fresh object
 317     }
 318   }
 319
 320   void update_snaps(
 321     const hobject_t &hoid,         ///< [in] object for snaps
 322     const set<snapid_t> &old_snaps,///< [in] old snaps value
 323     const set<snapid_t> &new_snaps ///< [in] new snaps value
 324     ) {
 325     auto &op = get_object_op(hoid);
 326     assert(!op.updated_snaps);
 327     assert(op.buffer_updates.empty());
 328     assert(!op.truncate);
 329     op.updated_snaps = make_pair(
 330       old_snaps,
 331       new_snaps);
 332   }
 333
 334   /// Clears, truncates
 335   void omap_clear(
 336     const hobject_t &hoid          ///< [in] object to clear omap
 337     ) {
 338     auto &op = get_object_op_for_modify(hoid);
 339     op.clear_omap = true;
 340     op.omap_updates.clear();
 341     op.omap_header = boost::none;
 342   }
 343   void truncate(
 344     const hobject_t &hoid,         ///< [in] object
 345     uint64_t off                   ///< [in] offset to truncate to
 346     ) {
 347     auto &op = get_object_op_for_modify(hoid);
 348     assert(!op.updated_snaps);
 349     op.buffer_updates.erase(
 350       off,
 351       std::numeric_limits<uint64_t>::max() - off);
 352     if (!op.truncate || off < op.truncate->first) {
 353       op.truncate = std::pair<uint64_t, uint64_t>(off, off);
 354     } else {
 355       op.truncate->second = off;
 356     }
 357   }
 358
 359   /// Attr ops
 360   void setattrs(
 361     const hobject_t &hoid,         ///< [in] object to write
 362     map<string, bufferlist> &attrs ///< [in] attrs, may be cleared
 363     ) {
 364     auto &op = get_object_op_for_modify(hoid);
 365     for (auto &&i: attrs) {
 366       op.attr_updates[i.first] = i.second;
 367     }
 368   }
 369   void setattr(
 370     const hobject_t &hoid,         ///< [in] object to write
 371     const string &attrname,        ///< [in] attr to write
 372     bufferlist &bl                 ///< [in] val to write, may be claimed
 373     ) {
 374     auto &op = get_object_op_for_modify(hoid);
 375     op.attr_updates[attrname] = bl;
 376   }
 377   void rmattr(
 378     const hobject_t &hoid,         ///< [in] object to write
 379     const string &attrname         ///< [in] attr to remove
 380     ) {
 381     auto &op = get_object_op_for_modify(hoid);
 382     op.attr_updates[attrname] = boost::none;
 383   }
 384
 385   /// set alloc hint
 386   void set_alloc_hint(
 387     const hobject_t &hoid,         ///< [in] object (must exist)
 388     uint64_t expected_object_size, ///< [in]
 389     uint64_t expected_write_size,
 390     uint32_t flags
 391     ) {
 392     auto &op = get_object_op_for_modify(hoid);
 393     op.alloc_hint = ObjectOperation::alloc_hint_t{
 394       expected_object_size, expected_write_size, flags};
 395   }
 396
 397   /// Buffer updates
 398   void write(
 399     const hobject_t &hoid,         ///< [in] object to write
 400     uint64_t off,                  ///< [in] off at which to write
 401     uint64_t len,                  ///< [in] len to write from bl
 402     bufferlist &bl,                ///< [in] bl to write will be claimed to len
 403     uint32_t fadvise_flags = 0     ///< [in] fadvise hint
 404     ) {
 405     auto &op = get_object_op_for_modify(hoid);
 406     assert(!op.updated_snaps);
 407     assert(len > 0);
 408     assert(len == bl.length());
 409     op.buffer_updates.insert(
 410       off,
 411       len,
 412       ObjectOperation::BufferUpdate::Write{bl, fadvise_flags});
 413   }
 414   void clone_range(
 415     const hobject_t &from,         ///< [in] from
 416     const hobject_t &to,           ///< [in] to
 417     uint64_t fromoff,              ///< [in] offset
 418     uint64_t len,                  ///< [in] len
 419     uint64_t tooff                 ///< [in] offset
 420     ) {
 421     auto &op = get_object_op_for_modify(to);
 422     assert(!op.updated_snaps);
 423     op.buffer_updates.insert(
 424       tooff,
 425       len,
 426       ObjectOperation::BufferUpdate::CloneRange{from, fromoff, len});
 427   }
 428   void zero(
 429     const hobject_t &hoid,         ///< [in] object
 430     uint64_t off,                  ///< [in] offset to start zeroing at
 431     uint64_t len                   ///< [in] amount to zero
 432     ) {
 433     auto &op = get_object_op_for_modify(hoid);
 434     assert(!op.updated_snaps);
 435     op.buffer_updates.insert(
 436       off,
 437       len,
 438       ObjectOperation::BufferUpdate::Zero{len});
 439   }
 440
 441   /// Omap updates
 442   void omap_setkeys(
 443     const hobject_t &hoid,         ///< [in] object to write
 444     bufferlist &keys_bl            ///< [in] encoded map<string, bufferlist>
 445     ) {
 446     auto &op = get_object_op_for_modify(hoid);
 447     op.omap_updates.emplace_back(
 448       make_pair(
 449         ObjectOperation::OmapUpdateType::Insert,
 450         keys_bl));
 451   }
 452   void omap_setkeys(
 453     const hobject_t &hoid,         ///< [in] object to write
 454     map<string, bufferlist> &keys  ///< [in] omap keys, may be cleared
 455     ) {
 456     bufferlist bl;
 457     ::encode(keys, bl);
 458     omap_setkeys(hoid, bl);
 459   }
 460   void omap_rmkeys(
 461     const hobject_t &hoid,         ///< [in] object to write
 462     bufferlist &keys_bl            ///< [in] encode set<string>
 463     ) {
 464     auto &op = get_object_op_for_modify(hoid);
 465     op.omap_updates.emplace_back(
 466       make_pair(
 467         ObjectOperation::OmapUpdateType::Remove,
 468         keys_bl));
 469   }
 470   void omap_rmkeys(
 471     const hobject_t &hoid,         ///< [in] object to write
 472     set<string> &keys              ///< [in] omap keys, may be cleared
 473     ) {
 474     bufferlist bl;
 475     ::encode(keys, bl);
 476     omap_rmkeys(hoid, bl);
 477   }
 478   void omap_setheader(
 479     const hobject_t &hoid,         ///< [in] object to write
 480     bufferlist &header             ///< [in] header
 481     ) {
 482     auto &op = get_object_op_for_modify(hoid);
 483     op.omap_header = header;
 484   }
 485
 486   bool empty() const {
 487     return op_map.empty();
 488   }
 489
 490   uint64_t get_bytes_written() const {
 491     uint64_t ret = 0;
 492     for (auto &&i: op_map) {
 493       for (auto &&j: i.second.buffer_updates) {
 494         ret += j.get_len();
 495       }
 496     }
 497     return ret;
 498   }
 499
 500   void nop(
 501     const hobject_t &hoid ///< [in] obj to which we are doing nothing
 502     ) {
 503     get_object_op_for_modify(hoid);
 504   }
 505
 506   /* Calls t() on all pair<hobject_t, ObjectOperation> & such that clone/rename
 507    * sinks are always called before clone sources
 508    *
 509    * TODO: add a fast path for the single object case and possibly the single
 510    * object clone from source case (make_writeable made a clone).
 511    *
 512    * This structure only requires that the source->sink graph be acyclic.
 513    * This is much more general than is actually required by PrimaryLogPG.
 514    * Only 4 flavors of multi-object transactions actually happen:
 515    * 1) rename temp -> object for copyfrom
 516    * 2) clone head -> clone, modify head for make_writeable on normal head write
 517    * 3) clone clone -> head for rollback
 518    * 4) 2 + 3
 519    *
 520    * We can bypass the below logic for single object transactions trivially
 521    * (including case 1 above since temp doesn't show up again).
 522    * For 2-3, we could add something ad-hoc to ensure that they happen in the
 523    * right order, but it actually seems easier to just do the graph construction.
 524    */
 525   template <typename T>
 526   void safe_create_traverse(T &&t) {
 527     map<hobject_t, list<hobject_t>> dgraph;
 528     list<hobject_t> stack;
 529
 530     // Populate stack with roots, dgraph with edges
 531     for (auto &&opair: op_map) {
 532       hobject_t source;
 533       if (opair.second.has_source(&source)) {
 534         auto &l = dgraph[source];
 535         if (l.empty() && !op_map.count(source)) {
 536           /* Source oids not in op_map need to be added as roots
 537            * (but only once!) */
 538           stack.push_back(source);
 539         }
 540         l.push_back(opair.first);
 541       } else {
 542         stack.push_back(opair.first);
 543       }
 544     }
 545
 546     /* Why don't we need to worry about accessing the same node
 547      * twice?  dgraph nodes always have in-degree at most 1 because
 548      * the inverse graph nodes (source->dest) can have out-degree
 549      * at most 1 (only one possible source).  We do a post-order
 550      * depth-first traversal here to ensure we call f on children
 551      * before parents.
 552      */
 553     while (!stack.empty()) {
 554       hobject_t &cur = stack.front();
 555       auto diter = dgraph.find(cur);
 556       if (diter == dgraph.end()) {
 557         /* Leaf: pop and call t() */
 558         auto opiter = op_map.find(cur);
 559         if (opiter != op_map.end())
 560           t(*opiter);
 561         stack.pop_front();
 562       } else {
 563         /* Internal node: push children onto stack, remove edge,
 564          * recurse.  When this node is encountered again, it'll
 565          * be a leaf */
 566         assert(!diter->second.empty());
 567         stack.splice(stack.begin(), diter->second);
 568         dgraph.erase(diter);
 569       }
 570     }
 571   }
 572 };
 573 using PGTransactionUPtr = std::unique_ptr<PGTransaction>;
 574
 575 #endif