// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
#ifndef CEPH_OBJECTSTORE_H
#define CEPH_OBJECTSTORE_H

#include "include/Context.h"
#include "include/buffer.h"
#include "include/types.h"
#include "osd/osd_types.h"
#include "common/TrackedOp.h"
#include "common/WorkQueue.h"
#include "ObjectMap.h"

#include <errno.h>
#include <sys/stat.h>
#include <vector>
#include <map>

#if defined(DARWIN) || defined(__FreeBSD__) || defined(__sun)
#include <sys/statvfs.h>
#else
#include <sys/vfs.h>    /* or <sys/statfs.h> */
#endif /* DARWIN */

#define OPS_PER_PTR 32

class CephContext;

using std::vector;
using std::string;
using std::map;

namespace ceph {
  class Formatter;
}

/*
 * low-level interface to the local OSD file system
 */

class Logger;

static inline void encode(const map<string,bufferptr> *attrset, bufferlist &bl) {
  ::encode(*attrset, bl);
}

// this isn't the best place for these, but...
void decode_str_str_map_to_bl(bufferlist::iterator& p, bufferlist *out);
void decode_str_set_to_bl(bufferlist::iterator& p, bufferlist *out);

// Flag bits
typedef uint32_t osflagbits_t;
const int SKIP_JOURNAL_REPLAY = 1 << 0;
const int SKIP_MOUNT_OMAP = 1 << 1;

class ObjectStore {
protected:
  string path;

public:
  CephContext* cct;
  /**
   * create - create an ObjectStore instance.
   *
   * This is invoked once at initialization time.
   *
   * @param type type of store. This is a string from the configuration file.
   * @param data path (or other descriptor) for data
   * @param journal path (or other descriptor) for journal (optional)
   * @param flags which filestores should check if applicable
   */
  static ObjectStore *create(CephContext *cct,
                             const string& type,
                             const string& data,
                             const string& journal,
                             osflagbits_t flags = 0);

  /**
   * probe a block device to learn the uuid of the owning OSD
   *
   * @param cct cct
   * @param path path to device
   * @param fsid [out] osd uuid
   */
  static int probe_block_device_fsid(
    CephContext *cct,
    const string& path,
    uuid_d *fsid);

  /**
   * Fetch Object Store statistics.
   *
   * Currently only latency of write and apply times are measured.
   *
   * This appears to be called with nothing locked.
   */
  virtual objectstore_perf_stat_t get_cur_stats() = 0;

  /**
   * Fetch Object Store performance counters.
   *
   * This appears to be called with nothing locked.
   */
  virtual const PerfCounters* get_perf_counters() const = 0;

  /**
   * a sequencer orders transactions
   *
   * Any transactions queued under a given sequencer will be applied in
   * sequence.  Transactions queued under different sequencers may run
   * in parallel.
   *
   * Clients of ObjectStore create and maintain their own Sequencer objects.
   * When a list of transactions is queued, the caller specifies a
   * Sequencer to be used.
   */

  /**
   * ABC for Sequencer implementation, private to the ObjectStore derived class.
   * created in ...::queue_transaction(s)
   */
  struct Sequencer_impl : public RefCountedObject {
    CephContext* cct;

    // block until any previous transactions are visible.  specifically,
    // collection_list and collection_empty need to reflect prior operations.
    virtual void flush() = 0;

    // called when we are done with the impl.  the impl may have a different
    // (longer) lifecycle than the Sequencer.
    virtual void discard() {}

    /**
     * Async flush_commit
     *
     * There are two cases:
     * 1) sequencer is currently idle: the method returns true.  c is
     *    not touched.
* 2) sequencer is not idle: the method returns false and c is * called asyncronously with a value of 0 once all transactions * queued on this sequencer prior to the call have been applied * and committed. */ virtual bool flush_commit( Context *c ///< [in] context to call upon flush/commit ) = 0; ///< @return true if idle, false otherwise Sequencer_impl(CephContext* cct) : RefCountedObject(NULL, 0), cct(cct) {} ~Sequencer_impl() override {} }; typedef boost::intrusive_ptr Sequencer_implRef; /** * External (opaque) sequencer implementation */ struct Sequencer { string name; spg_t shard_hint; Sequencer_implRef p; explicit Sequencer(string n) : name(n), shard_hint(spg_t()), p(NULL) { } ~Sequencer() { if (p) p->discard(); // tell impl we are done with it } /// return a unique string identifier for this sequencer const string& get_name() const { return name; } /// wait for any queued transactions on this sequencer to apply void flush() { if (p) p->flush(); } /// @see Sequencer_impl::flush_commit() bool flush_commit(Context *c) { if (!p) { return true; } else { return p->flush_commit(c); } } }; struct CollectionImpl : public RefCountedObject { virtual const coll_t &get_cid() = 0; CollectionImpl() : RefCountedObject(NULL, 0) {} }; typedef boost::intrusive_ptr CollectionHandle; struct CompatCollectionHandle : public CollectionImpl { coll_t cid; explicit CompatCollectionHandle(coll_t c) : cid(c) {} const coll_t &get_cid() override { return cid; } }; /********************************* * * Object Contents and semantics * * All ObjectStore objects are identified as a named object * (ghobject_t and hobject_t) in a named collection (coll_t). * ObjectStore operations support the creation, mutation, deletion * and enumeration of objects within a collection. Enumeration is * in sorted key order (where keys are sorted by hash). Object names * are globally unique. * * Each object has four distinct parts: byte data, xattrs, omap_header * and omap entries. * * The data portion of an object is conceptually equivalent to a * file in a file system. Random and Partial access for both read * and write operations is required. The ability to have a sparse * implementation of the data portion of an object is beneficial for * some workloads, but not required. There is a system-wide limit on * the maximum size of an object, which is typically around 100 MB. * * Xattrs are equivalent to the extended attributes of file * systems. Xattrs are a set of key/value pairs. Sub-value access * is not required. It is possible to enumerate the set of xattrs in * key order. At the implementation level, xattrs are used * exclusively internal to Ceph and the implementer can expect the * total size of all of the xattrs on an object to be relatively * small, i.e., less than 64KB. Much of Ceph assumes that accessing * xattrs on temporally adjacent object accesses (recent past or * near future) is inexpensive. * * omap_header is a single blob of data. It can be read or written * in total. * * Omap entries are conceptually the same as xattrs * but in a different address space. In other words, you can have * the same key as an xattr and an omap entry and they have distinct * values. Enumeration of xattrs doesn't include omap entries and * vice versa. The size and access characteristics of omap entries * are very different from xattrs. In particular, the value portion * of an omap entry can be quite large (MBs). 
More importantly, the * interface must support efficient range queries on omap entries even * when there are a large numbers of entries. * *********************************/ /******************************* * * Collections * * A collection is simply a grouping of objects. Collections have * names (coll_t) and can be enumerated in order. Like an * individual object, a collection also has a set of xattrs. * * */ /********************************* * transaction * * A Transaction represents a sequence of primitive mutation * operations. * * Three events in the life of a Transaction result in * callbacks. Any Transaction can contain any number of callback * objects (Context) for any combination of the three classes of * callbacks: * * on_applied_sync, on_applied, and on_commit. * * The "on_applied" and "on_applied_sync" callbacks are invoked when * the modifications requested by the Transaction are visible to * subsequent ObjectStore operations, i.e., the results are * readable. The only conceptual difference between on_applied and * on_applied_sync is the specific thread and locking environment in * which the callbacks operate. "on_applied_sync" is called * directly by an ObjectStore execution thread. It is expected to * execute quickly and must not acquire any locks of the calling * environment. Conversely, "on_applied" is called from the separate * Finisher thread, meaning that it can contend for calling * environment locks. NB, on_applied and on_applied_sync are * sometimes called on_readable and on_readable_sync. * * The "on_commit" callback is also called from the Finisher thread * and indicates that all of the mutations have been durably * committed to stable storage (i.e., are now software/hardware * crashproof). * * At the implementation level, each mutation primitive (and its * associated data) can be serialized to a single buffer. That * serialization, however, does not copy any data, but (using the * bufferlist library) will reference the original buffers. This * implies that the buffer that contains the data being submitted * must remain stable until the on_commit callback completes. In * practice, bufferlist handles all of this for you and this * subtlety is only relevant if you are referencing an existing * buffer via buffer::raw_static. * * Some implementations of ObjectStore choose to implement their own * form of journaling that uses the serialized form of a * Transaction. This requires that the encode/decode logic properly * version itself and handle version upgrades that might change the * format of the encoded Transaction. This has already happened a * couple of times and the Transaction object contains some helper * variables that aid in this legacy decoding: * * sobject_encoding detects an older/simpler version of oid * present in pre-bobtail versions of ceph. use_pool_override * also detects a situation where the pool of an oid can be * override for legacy operations/buffers. For non-legacy * implementation of ObjectStore, neither of these fields is * relevant. * * * TRANSACTION ISOLATION * * Except as noted below, isolation is the responsibility of the * caller. In other words, if any storage element (storage element * == any of the four portions of an object as described above) is * altered by a transaction (including deletion), the caller * promises not to attempt to read that element while the * transaction is pending (here pending means from the time of * issuance until the "on_applied_sync" callback has been * received). 
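   *
   * For example (an illustrative sketch, not from the original text;
   * 'store', 'osr', 'cid' and 'oid' are assumed to exist), a caller
   * that truncates an object must not issue its own read of that
   * object until the transaction has been applied:
   *
   *   ObjectStore::Transaction t;
   *   t.truncate(cid, oid, 0);
   *   store->queue_transaction(osr, std::move(t), on_applied);
   *   // 'on_applied' is a hypothetical Context*; read oid from that
   *   // callback (or after Sequencer::flush()), never while the
   *   // transaction is still pending.
   *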
Violations of isolation need not be detected by * ObjectStore and there is no corresponding error mechanism for * reporting an isolation violation (crashing would be the * appropriate way to report an isolation violation if detected). * * Enumeration operations may violate transaction isolation as * described above when a storage element is being created or * deleted as part of a transaction. In this case, ObjectStore is * allowed to consider the enumeration operation to either precede * or follow the violating transaction element. In other words, the * presence/absence of the mutated element in the enumeration is * entirely at the discretion of ObjectStore. The arbitrary ordering * applies independently to each transaction element. For example, * if a transaction contains two mutating elements "create A" and * "delete B". And an enumeration operation is performed while this * transaction is pending. It is permissable for ObjectStore to * report any of the four possible combinations of the existence of * A and B. * */ class Transaction { public: enum { OP_NOP = 0, OP_TOUCH = 9, // cid, oid OP_WRITE = 10, // cid, oid, offset, len, bl OP_ZERO = 11, // cid, oid, offset, len OP_TRUNCATE = 12, // cid, oid, len OP_REMOVE = 13, // cid, oid OP_SETATTR = 14, // cid, oid, attrname, bl OP_SETATTRS = 15, // cid, oid, attrset OP_RMATTR = 16, // cid, oid, attrname OP_CLONE = 17, // cid, oid, newoid OP_CLONERANGE = 18, // cid, oid, newoid, offset, len OP_CLONERANGE2 = 30, // cid, oid, newoid, srcoff, len, dstoff OP_TRIMCACHE = 19, // cid, oid, offset, len **DEPRECATED** OP_MKCOLL = 20, // cid OP_RMCOLL = 21, // cid OP_COLL_ADD = 22, // cid, oldcid, oid OP_COLL_REMOVE = 23, // cid, oid OP_COLL_SETATTR = 24, // cid, attrname, bl OP_COLL_RMATTR = 25, // cid, attrname OP_COLL_SETATTRS = 26, // cid, attrset OP_COLL_MOVE = 8, // newcid, oldcid, oid OP_STARTSYNC = 27, // start a sync OP_RMATTRS = 28, // cid, oid OP_COLL_RENAME = 29, // cid, newcid OP_OMAP_CLEAR = 31, // cid OP_OMAP_SETKEYS = 32, // cid, attrset OP_OMAP_RMKEYS = 33, // cid, keyset OP_OMAP_SETHEADER = 34, // cid, header OP_SPLIT_COLLECTION = 35, // cid, bits, destination OP_SPLIT_COLLECTION2 = 36, /* cid, bits, destination doesn't create the destination */ OP_OMAP_RMKEYRANGE = 37, // cid, oid, firstkey, lastkey OP_COLL_MOVE_RENAME = 38, // oldcid, oldoid, newcid, newoid OP_SETALLOCHINT = 39, // cid, oid, object_size, write_size OP_COLL_HINT = 40, // cid, type, bl OP_TRY_RENAME = 41, // oldcid, oldoid, newoid OP_COLL_SET_BITS = 42, // cid, bits }; // Transaction hint type enum { COLL_HINT_EXPECTED_NUM_OBJECTS = 1, }; struct Op { __le32 op; __le32 cid; __le32 oid; __le64 off; __le64 len; __le32 dest_cid; __le32 dest_oid; //OP_CLONE, OP_CLONERANGE __le64 dest_off; //OP_CLONERANGE union { struct { __le32 hint_type; //OP_COLL_HINT }; struct { __le32 alloc_hint_flags; //OP_SETALLOCHINT }; }; __le64 expected_object_size; //OP_SETALLOCHINT __le64 expected_write_size; //OP_SETALLOCHINT __le32 split_bits; //OP_SPLIT_COLLECTION2,OP_COLL_SET_BITS, //OP_MKCOLL __le32 split_rem; //OP_SPLIT_COLLECTION2 } __attribute__ ((packed)) ; struct TransactionData { __le64 ops; __le32 largest_data_len; __le32 largest_data_off; __le32 largest_data_off_in_data_bl; __le32 fadvise_flags; TransactionData() noexcept : ops(0), largest_data_len(0), largest_data_off(0), largest_data_off_in_data_bl(0), fadvise_flags(0) { } // override default move operations to reset default values TransactionData(TransactionData&& other) noexcept : ops(other.ops), 
largest_data_len(other.largest_data_len), largest_data_off(other.largest_data_off), largest_data_off_in_data_bl(other.largest_data_off_in_data_bl), fadvise_flags(other.fadvise_flags) { other.ops = 0; other.largest_data_len = 0; other.largest_data_off = 0; other.largest_data_off_in_data_bl = 0; other.fadvise_flags = 0; } TransactionData& operator=(TransactionData&& other) noexcept { ops = other.ops; largest_data_len = other.largest_data_len; largest_data_off = other.largest_data_off; largest_data_off_in_data_bl = other.largest_data_off_in_data_bl; fadvise_flags = other.fadvise_flags; other.ops = 0; other.largest_data_len = 0; other.largest_data_off = 0; other.largest_data_off_in_data_bl = 0; other.fadvise_flags = 0; return *this; } TransactionData(const TransactionData& other) = default; TransactionData& operator=(const TransactionData& other) = default; void encode(bufferlist& bl) const { bl.append((char*)this, sizeof(TransactionData)); } void decode(bufferlist::iterator &bl) { bl.copy(sizeof(TransactionData), (char*)this); } } __attribute__ ((packed)) ; private: TransactionData data; void *osr {nullptr}; // NULL on replay map coll_index; map object_index; __le32 coll_id {0}; __le32 object_id {0}; bufferlist data_bl; bufferlist op_bl; bufferptr op_ptr; list on_applied; list on_commit; list on_applied_sync; public: Transaction() = default; explicit Transaction(bufferlist::iterator &dp) { decode(dp); } explicit Transaction(bufferlist &nbl) { bufferlist::iterator dp = nbl.begin(); decode(dp); } // override default move operations to reset default values Transaction(Transaction&& other) noexcept : data(std::move(other.data)), osr(other.osr), coll_index(std::move(other.coll_index)), object_index(std::move(other.object_index)), coll_id(other.coll_id), object_id(other.object_id), data_bl(std::move(other.data_bl)), op_bl(std::move(other.op_bl)), op_ptr(std::move(other.op_ptr)), on_applied(std::move(other.on_applied)), on_commit(std::move(other.on_commit)), on_applied_sync(std::move(other.on_applied_sync)) { other.osr = nullptr; other.coll_id = 0; other.object_id = 0; } Transaction& operator=(Transaction&& other) noexcept { data = std::move(other.data); osr = other.osr; coll_index = std::move(other.coll_index); object_index = std::move(other.object_index); coll_id = other.coll_id; object_id = other.object_id; data_bl = std::move(other.data_bl); op_bl = std::move(other.op_bl); op_ptr = std::move(other.op_ptr); on_applied = std::move(other.on_applied); on_commit = std::move(other.on_commit); on_applied_sync = std::move(other.on_applied_sync); other.osr = nullptr; other.coll_id = 0; other.object_id = 0; return *this; } Transaction(const Transaction& other) = default; Transaction& operator=(const Transaction& other) = default; /* Operations on callback contexts */ void register_on_applied(Context *c) { if (!c) return; on_applied.push_back(c); } void register_on_commit(Context *c) { if (!c) return; on_commit.push_back(c); } void register_on_applied_sync(Context *c) { if (!c) return; on_applied_sync.push_back(c); } void register_on_complete(Context *c) { if (!c) return; RunOnDeleteRef _complete (std::make_shared(c)); register_on_applied(new ContainerContext(_complete)); register_on_commit(new ContainerContext(_complete)); } static void collect_contexts( vector& t, Context **out_on_applied, Context **out_on_commit, Context **out_on_applied_sync) { assert(out_on_applied); assert(out_on_commit); assert(out_on_applied_sync); list on_applied, on_commit, on_applied_sync; for (vector::iterator i = t.begin(); i 
!= t.end(); ++i) { on_applied.splice(on_applied.end(), (*i).on_applied); on_commit.splice(on_commit.end(), (*i).on_commit); on_applied_sync.splice(on_applied_sync.end(), (*i).on_applied_sync); } *out_on_applied = C_Contexts::list_to_context(on_applied); *out_on_commit = C_Contexts::list_to_context(on_commit); *out_on_applied_sync = C_Contexts::list_to_context(on_applied_sync); } Context *get_on_applied() { return C_Contexts::list_to_context(on_applied); } Context *get_on_commit() { return C_Contexts::list_to_context(on_commit); } Context *get_on_applied_sync() { return C_Contexts::list_to_context(on_applied_sync); } void set_fadvise_flags(uint32_t flags) { data.fadvise_flags = flags; } void set_fadvise_flag(uint32_t flag) { data.fadvise_flags = data.fadvise_flags | flag; } uint32_t get_fadvise_flags() { return data.fadvise_flags; } void swap(Transaction& other) noexcept { std::swap(data, other.data); std::swap(on_applied, other.on_applied); std::swap(on_commit, other.on_commit); std::swap(on_applied_sync, other.on_applied_sync); std::swap(coll_index, other.coll_index); std::swap(object_index, other.object_index); std::swap(coll_id, other.coll_id); std::swap(object_id, other.object_id); op_bl.swap(other.op_bl); data_bl.swap(other.data_bl); } void _update_op(Op* op, vector<__le32> &cm, vector<__le32> &om) { switch (op->op) { case OP_NOP: case OP_STARTSYNC: break; case OP_TOUCH: case OP_REMOVE: case OP_SETATTR: case OP_SETATTRS: case OP_RMATTR: case OP_RMATTRS: case OP_COLL_REMOVE: case OP_OMAP_CLEAR: case OP_OMAP_SETKEYS: case OP_OMAP_RMKEYS: case OP_OMAP_RMKEYRANGE: case OP_OMAP_SETHEADER: case OP_WRITE: case OP_ZERO: case OP_TRUNCATE: case OP_SETALLOCHINT: assert(op->cid < cm.size()); assert(op->oid < om.size()); op->cid = cm[op->cid]; op->oid = om[op->oid]; break; case OP_CLONERANGE2: case OP_CLONE: assert(op->cid < cm.size()); assert(op->oid < om.size()); assert(op->dest_oid < om.size()); op->cid = cm[op->cid]; op->oid = om[op->oid]; op->dest_oid = om[op->dest_oid]; break; case OP_MKCOLL: case OP_RMCOLL: case OP_COLL_SETATTR: case OP_COLL_RMATTR: case OP_COLL_SETATTRS: case OP_COLL_HINT: case OP_COLL_SET_BITS: assert(op->cid < cm.size()); op->cid = cm[op->cid]; break; case OP_COLL_ADD: assert(op->cid < cm.size()); assert(op->oid < om.size()); assert(op->dest_cid < om.size()); op->cid = cm[op->cid]; op->dest_cid = cm[op->dest_cid]; op->oid = om[op->oid]; break; case OP_COLL_MOVE_RENAME: assert(op->cid < cm.size()); assert(op->oid < om.size()); assert(op->dest_cid < cm.size()); assert(op->dest_oid < om.size()); op->cid = cm[op->cid]; op->oid = om[op->oid]; op->dest_cid = cm[op->dest_cid]; op->dest_oid = om[op->dest_oid]; break; case OP_TRY_RENAME: assert(op->cid < cm.size()); assert(op->oid < om.size()); assert(op->dest_oid < om.size()); op->cid = cm[op->cid]; op->oid = om[op->oid]; op->dest_oid = om[op->dest_oid]; break; case OP_SPLIT_COLLECTION2: assert(op->cid < cm.size()); assert(op->dest_cid < cm.size()); op->cid = cm[op->cid]; op->dest_cid = cm[op->dest_cid]; break; default: assert(0 == "Unkown OP"); } } void _update_op_bl( bufferlist& bl, vector<__le32> &cm, vector<__le32> &om) { list list = bl.buffers(); std::list::iterator p; for(p = list.begin(); p != list.end(); ++p) { assert(p->length() % sizeof(Op) == 0); char* raw_p = p->c_str(); char* raw_end = raw_p + p->length(); while (raw_p < raw_end) { _update_op(reinterpret_cast(raw_p), cm, om); raw_p += sizeof(Op); } } } /// Append the operations of the parameter to this Transaction. 
Those operations are removed from the parameter Transaction void append(Transaction& other) { data.ops += other.data.ops; if (other.data.largest_data_len > data.largest_data_len) { data.largest_data_len = other.data.largest_data_len; data.largest_data_off = other.data.largest_data_off; data.largest_data_off_in_data_bl = data_bl.length() + other.data.largest_data_off_in_data_bl; } data.fadvise_flags |= other.data.fadvise_flags; on_applied.splice(on_applied.end(), other.on_applied); on_commit.splice(on_commit.end(), other.on_commit); on_applied_sync.splice(on_applied_sync.end(), other.on_applied_sync); //append coll_index & object_index vector<__le32> cm(other.coll_index.size()); map::iterator coll_index_p; for (coll_index_p = other.coll_index.begin(); coll_index_p != other.coll_index.end(); ++coll_index_p) { cm[coll_index_p->second] = _get_coll_id(coll_index_p->first); } vector<__le32> om(other.object_index.size()); map::iterator object_index_p; for (object_index_p = other.object_index.begin(); object_index_p != other.object_index.end(); ++object_index_p) { om[object_index_p->second] = _get_object_id(object_index_p->first); } //the other.op_bl SHOULD NOT be changes during append operation, //we use additional bufferlist to avoid this problem bufferptr other_op_bl_ptr(other.op_bl.length()); other.op_bl.copy(0, other.op_bl.length(), other_op_bl_ptr.c_str()); bufferlist other_op_bl; other_op_bl.append(other_op_bl_ptr); //update other_op_bl with cm & om //When the other is appended to current transaction, all coll_index and //object_index in other.op_buffer should be updated by new index of the //combined transaction _update_op_bl(other_op_bl, cm, om); //append op_bl op_bl.append(other_op_bl); //append data_bl data_bl.append(other.data_bl); } /** Inquires about the Transaction as a whole. */ /// How big is the encoded Transaction buffer? uint64_t get_encoded_bytes() { //layout: data_bl + op_bl + coll_index + object_index + data // coll_index size, object_index size and sizeof(transaction_data) // all here, so they may be computed at compile-time size_t final_size = sizeof(__u32) * 2 + sizeof(data); // coll_index second and object_index second final_size += (coll_index.size() + object_index.size()) * sizeof(__le32); // coll_index first for (auto p = coll_index.begin(); p != coll_index.end(); ++p) { final_size += p->first.encoded_size(); } // object_index first for (auto p = object_index.begin(); p != object_index.end(); ++p) { final_size += p->first.encoded_size(); } return data_bl.length() + op_bl.length() + final_size; } /// Retain old version for regression testing purposes uint64_t get_encoded_bytes_test() { //layout: data_bl + op_bl + coll_index + object_index + data bufferlist bl; ::encode(coll_index, bl); ::encode(object_index, bl); return data_bl.length() + op_bl.length() + bl.length() + sizeof(data); } uint64_t get_num_bytes() { return get_encoded_bytes(); } /// Size of largest data buffer to the "write" operation encountered so far uint32_t get_data_length() { return data.largest_data_len; } /// offset within the encoded buffer to the start of the largest data buffer that's encoded uint32_t get_data_offset() { if (data.largest_data_off_in_data_bl) { return data.largest_data_off_in_data_bl + sizeof(__u8) + // encode struct_v sizeof(__u8) + // encode compat_v sizeof(__u32) + // encode len sizeof(__u32); // data_bl len } return 0; // none } /// offset of buffer as aligned to destination within object. 
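    /*
     * Illustrative sketch (not part of the original header) of append()
     * above: the ops, callbacks and data of the argument are merged into
     * the receiving Transaction, so a backend sees one combined
     * transaction.  'cid', 'oid' and 'bl' are assumed to exist.
     *
     *   ObjectStore::Transaction a, b;
     *   a.touch(cid, oid);
     *   b.write(cid, oid, 0, bl.length(), bl);
     *   a.append(b);   // 'a' now carries b's ops and callbacks as well
     */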
int get_data_alignment() { if (!data.largest_data_len) return -1; return (0 - get_data_offset()) & ~CEPH_PAGE_MASK; } /// Is the Transaction empty (no operations) bool empty() { return !data.ops; } /// Number of operations in the transation int get_num_ops() { return data.ops; } void set_osr(void *s) { osr = s; } void *get_osr() { return osr; } /** * iterator * * Helper object to parse Transactions. * * ObjectStore instances use this object to step down the encoded * buffer decoding operation codes and parameters as we go. * */ class iterator { Transaction *t; uint64_t ops; char* op_buffer_p; bufferlist::iterator data_bl_p; public: vector colls; vector objects; private: explicit iterator(Transaction *t) : t(t), data_bl_p(t->data_bl.begin()), colls(t->coll_index.size()), objects(t->object_index.size()) { ops = t->data.ops; op_buffer_p = t->op_bl.get_contiguous(0, t->data.ops * sizeof(Op)); map::iterator coll_index_p; for (coll_index_p = t->coll_index.begin(); coll_index_p != t->coll_index.end(); ++coll_index_p) { colls[coll_index_p->second] = coll_index_p->first; } map::iterator object_index_p; for (object_index_p = t->object_index.begin(); object_index_p != t->object_index.end(); ++object_index_p) { objects[object_index_p->second] = object_index_p->first; } } friend class Transaction; public: bool have_op() { return ops > 0; } Op* decode_op() { assert(ops > 0); Op* op = reinterpret_cast(op_buffer_p); op_buffer_p += sizeof(Op); ops--; return op; } string decode_string() { string s; ::decode(s, data_bl_p); return s; } void decode_bp(bufferptr& bp) { ::decode(bp, data_bl_p); } void decode_bl(bufferlist& bl) { ::decode(bl, data_bl_p); } void decode_attrset(map& aset) { ::decode(aset, data_bl_p); } void decode_attrset(map& aset) { ::decode(aset, data_bl_p); } void decode_attrset_bl(bufferlist *pbl) { decode_str_str_map_to_bl(data_bl_p, pbl); } void decode_keyset(set &keys){ ::decode(keys, data_bl_p); } void decode_keyset_bl(bufferlist *pbl){ decode_str_set_to_bl(data_bl_p, pbl); } const ghobject_t &get_oid(__le32 oid_id) { assert(oid_id < objects.size()); return objects[oid_id]; } const coll_t &get_cid(__le32 cid_id) { assert(cid_id < colls.size()); return colls[cid_id]; } uint32_t get_fadvise_flags() const { return t->get_fadvise_flags(); } }; iterator begin() { return iterator(this); } private: void _build_actions_from_tbl(); /** * Helper functions to encode the various mutation elements of a * transaction. These are 1:1 with the operation codes (see * enumeration above). These routines ensure that the * encoder/creator of a transaction gets the right data in the * right place. Sadly, there's no corresponding version nor any * form of seat belts for the decoder. */ Op* _get_next_op() { if (op_ptr.length() == 0 || op_ptr.offset() >= op_ptr.length()) { op_ptr = bufferptr(sizeof(Op) * OPS_PER_PTR); } bufferptr ptr(op_ptr, 0, sizeof(Op)); op_bl.append(ptr); op_ptr.set_offset(op_ptr.offset() + sizeof(Op)); char* p = ptr.c_str(); memset(p, 0, sizeof(Op)); return reinterpret_cast(p); } __le32 _get_coll_id(const coll_t& coll) { map::iterator c = coll_index.find(coll); if (c != coll_index.end()) return c->second; __le32 index_id = coll_id++; coll_index[coll] = index_id; return index_id; } __le32 _get_object_id(const ghobject_t& oid) { map::iterator o = object_index.find(oid); if (o != object_index.end()) return o->second; __le32 index_id = object_id++; object_index[oid] = index_id; return index_id; } public: /// Commence a global file system sync operation. 
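    /*
     * Illustrative sketch (not part of the original header) of how a
     * backend implementation might consume a Transaction using the
     * iterator defined above.  Only OP_WRITE is shown; 'do_write' is a
     * hypothetical backend helper and error handling is omitted.
     *
     *   ObjectStore::Transaction::iterator i = t.begin();
     *   while (i.have_op()) {
     *     ObjectStore::Transaction::Op *op = i.decode_op();
     *     switch (op->op) {
     *     case ObjectStore::Transaction::OP_WRITE:
     *       {
     *         const coll_t &cid = i.get_cid(op->cid);
     *         const ghobject_t &oid = i.get_oid(op->oid);
     *         bufferlist bl;
     *         i.decode_bl(bl);
     *         do_write(cid, oid, op->off, op->len, bl, i.get_fadvise_flags());
     *       }
     *       break;
     *     // ... handle the remaining op codes ...
     *     }
     *   }
     */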
void start_sync() { Op* _op = _get_next_op(); _op->op = OP_STARTSYNC; data.ops++; } /// noop. 'nuf said void nop() { Op* _op = _get_next_op(); _op->op = OP_NOP; data.ops++; } /** * touch * * Ensure the existance of an object in a collection. Create an * empty object if necessary */ void touch(const coll_t& cid, const ghobject_t& oid) { Op* _op = _get_next_op(); _op->op = OP_TOUCH; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); data.ops++; } /** * Write data to an offset within an object. If the object is too * small, it is expanded as needed. It is possible to specify an * offset beyond the current end of an object and it will be * expanded as needed. Simple implementations of ObjectStore will * just zero the data between the old end of the object and the * newly provided data. More sophisticated implementations of * ObjectStore will omit the untouched data and store it as a * "hole" in the file. */ void write(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len, const bufferlist& write_data, uint32_t flags = 0) { uint32_t orig_len = data_bl.length(); Op* _op = _get_next_op(); _op->op = OP_WRITE; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); _op->off = off; _op->len = len; ::encode(write_data, data_bl); assert(len == write_data.length()); data.fadvise_flags = data.fadvise_flags | flags; if (write_data.length() > data.largest_data_len) { data.largest_data_len = write_data.length(); data.largest_data_off = off; data.largest_data_off_in_data_bl = orig_len + sizeof(__u32); // we are about to } data.ops++; } /** * zero out the indicated byte range within an object. Some * ObjectStore instances may optimize this to release the * underlying storage space. */ void zero(const coll_t& cid, const ghobject_t& oid, uint64_t off, uint64_t len) { Op* _op = _get_next_op(); _op->op = OP_ZERO; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); _op->off = off; _op->len = len; data.ops++; } /// Discard all data in the object beyond the specified size. void truncate(const coll_t& cid, const ghobject_t& oid, uint64_t off) { Op* _op = _get_next_op(); _op->op = OP_TRUNCATE; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); _op->off = off; data.ops++; } /// Remove an object. All four parts of the object are removed. 
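    /*
     * Illustrative sketch (not part of the original header) of the
     * data-path mutations above; 'cid', 'oid' and 'bl' are assumed to
     * exist.
     *
     *   ObjectStore::Transaction t;
     *   t.touch(cid, oid);                      // ensure the object exists
     *   t.write(cid, oid, 0, bl.length(), bl);  // write bl at offset 0
     *   t.zero(cid, oid, bl.length(), 4096);    // punch a 4 KiB hole after it
     *   t.truncate(cid, oid, bl.length());      // drop everything past bl again
     */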
void remove(const coll_t& cid, const ghobject_t& oid) { Op* _op = _get_next_op(); _op->op = OP_REMOVE; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); data.ops++; } /// Set an xattr of an object void setattr(const coll_t& cid, const ghobject_t& oid, const char* name, bufferlist& val) { string n(name); setattr(cid, oid, n, val); } /// Set an xattr of an object void setattr(const coll_t& cid, const ghobject_t& oid, const string& s, bufferlist& val) { Op* _op = _get_next_op(); _op->op = OP_SETATTR; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); ::encode(s, data_bl); ::encode(val, data_bl); data.ops++; } /// Set multiple xattrs of an object void setattrs(const coll_t& cid, const ghobject_t& oid, const map& attrset) { Op* _op = _get_next_op(); _op->op = OP_SETATTRS; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); ::encode(attrset, data_bl); data.ops++; } /// Set multiple xattrs of an object void setattrs(const coll_t& cid, const ghobject_t& oid, const map& attrset) { Op* _op = _get_next_op(); _op->op = OP_SETATTRS; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); ::encode(attrset, data_bl); data.ops++; } /// remove an xattr from an object void rmattr(const coll_t& cid, const ghobject_t& oid, const char *name) { string n(name); rmattr(cid, oid, n); } /// remove an xattr from an object void rmattr(const coll_t& cid, const ghobject_t& oid, const string& s) { Op* _op = _get_next_op(); _op->op = OP_RMATTR; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); ::encode(s, data_bl); data.ops++; } /// remove all xattrs from an object void rmattrs(const coll_t& cid, const ghobject_t& oid) { Op* _op = _get_next_op(); _op->op = OP_RMATTRS; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); data.ops++; } /** * Clone an object into another object. * * Low-cost (e.g., O(1)) cloning (if supported) is best, but * fallback to an O(n) copy is allowed. All four parts of the * object are cloned (data, xattrs, omap header, omap * entries). * * The destination named object may already exist, in * which case its previous contents are discarded. */ void clone(const coll_t& cid, const ghobject_t& oid, const ghobject_t& noid) { Op* _op = _get_next_op(); _op->op = OP_CLONE; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); _op->dest_oid = _get_object_id(noid); data.ops++; } /** * Clone a byte range from one object to another. * * The data portion of the destination object receives a copy of a * portion of the data from the source object. None of the other * three parts of an object is copied from the source. * * The destination object size may be extended to the dstoff + len. * * The source range *must* overlap with the source object data. If it does * not the result is undefined. */ void clone_range(const coll_t& cid, const ghobject_t& oid, const ghobject_t& noid, uint64_t srcoff, uint64_t srclen, uint64_t dstoff) { Op* _op = _get_next_op(); _op->op = OP_CLONERANGE2; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); _op->dest_oid = _get_object_id(noid); _op->off = srcoff; _op->len = srclen; _op->dest_off = dstoff; data.ops++; } /// Create the collection void create_collection(const coll_t& cid, int bits) { Op* _op = _get_next_op(); _op->op = OP_MKCOLL; _op->cid = _get_coll_id(cid); _op->split_bits = bits; data.ops++; } /** * Give the collection a hint. * * @param cid - collection id. * @param type - hint type. * @param hint - the hint payload, which contains the customized * data along with the hint type. 
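     *
     * For example (an illustrative sketch, not from the original text;
     * the exact payload layout is defined by the backend that consumes
     * the hint, and 'expected_num_objects' is an assumed variable):
     *
     *   bufferlist hint;
     *   ::encode(expected_num_objects, hint);
     *   t.collection_hint(cid, COLL_HINT_EXPECTED_NUM_OBJECTS, hint);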
*/ void collection_hint(const coll_t& cid, uint32_t type, const bufferlist& hint) { Op* _op = _get_next_op(); _op->op = OP_COLL_HINT; _op->cid = _get_coll_id(cid); _op->hint_type = type; ::encode(hint, data_bl); data.ops++; } /// remove the collection, the collection must be empty void remove_collection(const coll_t& cid) { Op* _op = _get_next_op(); _op->op = OP_RMCOLL; _op->cid = _get_coll_id(cid); data.ops++; } void collection_move(const coll_t& cid, coll_t oldcid, const ghobject_t& oid) __attribute__ ((deprecated)) { // NOTE: we encode this as a fixed combo of ADD + REMOVE. they // always appear together, so this is effectively a single MOVE. Op* _op = _get_next_op(); _op->op = OP_COLL_ADD; _op->cid = _get_coll_id(oldcid); _op->oid = _get_object_id(oid); _op->dest_cid = _get_coll_id(cid); data.ops++; _op = _get_next_op(); _op->op = OP_COLL_REMOVE; _op->cid = _get_coll_id(oldcid); _op->oid = _get_object_id(oid); data.ops++; } void collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, coll_t cid, const ghobject_t& oid) { Op* _op = _get_next_op(); _op->op = OP_COLL_MOVE_RENAME; _op->cid = _get_coll_id(oldcid); _op->oid = _get_object_id(oldoid); _op->dest_cid = _get_coll_id(cid); _op->dest_oid = _get_object_id(oid); data.ops++; } void try_rename(coll_t cid, const ghobject_t& oldoid, const ghobject_t& oid) { Op* _op = _get_next_op(); _op->op = OP_TRY_RENAME; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oldoid); _op->dest_oid = _get_object_id(oid); data.ops++; } /// Remove omap from oid void omap_clear( coll_t cid, ///< [in] Collection containing oid const ghobject_t &oid ///< [in] Object from which to remove omap ) { Op* _op = _get_next_op(); _op->op = OP_OMAP_CLEAR; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); data.ops++; } /// Set keys on oid omap. Replaces duplicate keys. void omap_setkeys( const coll_t& cid, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object to update const map &attrset ///< [in] Replacement keys and values ) { Op* _op = _get_next_op(); _op->op = OP_OMAP_SETKEYS; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); ::encode(attrset, data_bl); data.ops++; } /// Set keys on an oid omap (bufferlist variant). 
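    /*
     * Illustrative sketch (not part of the original header) of the omap
     * mutations above; 'cid' and 'oid' are assumed to exist and
     * "mykey"/"myvalue" are arbitrary example contents.
     *
     *   map<string, bufferlist> kv;
     *   kv["mykey"].append("myvalue");
     *   ObjectStore::Transaction t;
     *   t.touch(cid, oid);
     *   t.omap_setkeys(cid, oid, kv);
     */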
void omap_setkeys( coll_t cid, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object to update const bufferlist &attrset_bl ///< [in] Replacement keys and values ) { Op* _op = _get_next_op(); _op->op = OP_OMAP_SETKEYS; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); data_bl.append(attrset_bl); data.ops++; } /// Remove keys from oid omap void omap_rmkeys( coll_t cid, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object from which to remove the omap const set &keys ///< [in] Keys to clear ) { Op* _op = _get_next_op(); _op->op = OP_OMAP_RMKEYS; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); ::encode(keys, data_bl); data.ops++; } /// Remove keys from oid omap void omap_rmkeys( coll_t cid, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object from which to remove the omap const bufferlist &keys_bl ///< [in] Keys to clear ) { Op* _op = _get_next_op(); _op->op = OP_OMAP_RMKEYS; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); data_bl.append(keys_bl); data.ops++; } /// Remove key range from oid omap void omap_rmkeyrange( coll_t cid, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object from which to remove the omap keys const string& first, ///< [in] first key in range const string& last ///< [in] first key past range, range is [first,last) ) { Op* _op = _get_next_op(); _op->op = OP_OMAP_RMKEYRANGE; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); ::encode(first, data_bl); ::encode(last, data_bl); data.ops++; } /// Set omap header void omap_setheader( coll_t cid, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object const bufferlist &bl ///< [in] Header value ) { Op* _op = _get_next_op(); _op->op = OP_OMAP_SETHEADER; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); ::encode(bl, data_bl); data.ops++; } /// Split collection based on given prefixes, objects matching the specified bits/rem are /// moved to the new collection void split_collection( coll_t cid, uint32_t bits, uint32_t rem, coll_t destination) { Op* _op = _get_next_op(); _op->op = OP_SPLIT_COLLECTION2; _op->cid = _get_coll_id(cid); _op->dest_cid = _get_coll_id(destination); _op->split_bits = bits; _op->split_rem = rem; data.ops++; } void collection_set_bits( coll_t cid, int bits) { Op* _op = _get_next_op(); _op->op = OP_COLL_SET_BITS; _op->cid = _get_coll_id(cid); _op->split_bits = bits; data.ops++; } /// Set allocation hint for an object /// make 0 values(expected_object_size, expected_write_size) noops for all implementations void set_alloc_hint( coll_t cid, const ghobject_t &oid, uint64_t expected_object_size, uint64_t expected_write_size, uint32_t flags ) { Op* _op = _get_next_op(); _op->op = OP_SETALLOCHINT; _op->cid = _get_coll_id(cid); _op->oid = _get_object_id(oid); _op->expected_object_size = expected_object_size; _op->expected_write_size = expected_write_size; _op->alloc_hint_flags = flags; data.ops++; } void encode(bufferlist& bl) const { //layout: data_bl + op_bl + coll_index + object_index + data ENCODE_START(9, 9, bl); ::encode(data_bl, bl); ::encode(op_bl, bl); ::encode(coll_index, bl); ::encode(object_index, bl); data.encode(bl); ENCODE_FINISH(bl); } void decode(bufferlist::iterator &bl) { DECODE_START(9, bl); DECODE_OLDEST(9); ::decode(data_bl, bl); ::decode(op_bl, bl); ::decode(coll_index, bl); ::decode(object_index, bl); data.decode(bl); coll_id = coll_index.size(); object_id = object_index.size(); DECODE_FINISH(bl); } void 
dump(ceph::Formatter *f); static void generate_test_instances(list& o); }; // synchronous wrappers unsigned apply_transaction(Sequencer *osr, Transaction&& t, Context *ondisk=0) { vector tls; tls.push_back(std::move(t)); return apply_transactions(osr, tls, ondisk); } unsigned apply_transactions(Sequencer *osr, vector& tls, Context *ondisk=0); int queue_transaction(Sequencer *osr, Transaction&& t, Context *onreadable, Context *ondisk=0, Context *onreadable_sync=0, TrackedOpRef op = TrackedOpRef(), ThreadPool::TPHandle *handle = NULL) { vector tls; tls.push_back(std::move(t)); return queue_transactions(osr, tls, onreadable, ondisk, onreadable_sync, op, handle); } int queue_transactions(Sequencer *osr, vector& tls, Context *onreadable, Context *ondisk=0, Context *onreadable_sync=0, TrackedOpRef op = TrackedOpRef(), ThreadPool::TPHandle *handle = NULL) { assert(!tls.empty()); tls.back().register_on_applied(onreadable); tls.back().register_on_commit(ondisk); tls.back().register_on_applied_sync(onreadable_sync); return queue_transactions(osr, tls, op, handle); } virtual int queue_transactions( Sequencer *osr, vector& tls, TrackedOpRef op = TrackedOpRef(), ThreadPool::TPHandle *handle = NULL) = 0; int queue_transactions( Sequencer *osr, vector& tls, Context *onreadable, Context *oncommit, Context *onreadable_sync, Context *oncomplete, TrackedOpRef op); int queue_transaction( Sequencer *osr, Transaction&& t, Context *onreadable, Context *oncommit, Context *onreadable_sync, Context *oncomplete, TrackedOpRef op) { vector tls; tls.push_back(std::move(t)); return queue_transactions( osr, tls, onreadable, oncommit, onreadable_sync, oncomplete, op); } public: ObjectStore(CephContext* cct, const std::string& path_) : path(path_), cct(cct) {} virtual ~ObjectStore() {} // no copying explicit ObjectStore(const ObjectStore& o) = delete; const ObjectStore& operator=(const ObjectStore& o) = delete; // versioning virtual int upgrade() { return 0; } virtual void get_db_statistics(Formatter *f) { } virtual void generate_db_histogram(Formatter *f) { } virtual void flush_cache() { } virtual void dump_perf_counters(Formatter *f) {} virtual string get_type() = 0; // mgmt virtual bool test_mount_in_use() = 0; virtual int mount() = 0; virtual int umount() = 0; virtual int fsck(bool deep) { return -EOPNOTSUPP; } virtual int repair(bool deep) { return -EOPNOTSUPP; } virtual void set_cache_shards(unsigned num) { } /** * Returns 0 if the hobject is valid, -error otherwise * * Errors: * -ENAMETOOLONG: locator/namespace/name too large */ virtual int validate_hobject_key(const hobject_t &obj) const = 0; virtual unsigned get_max_attr_name_length() = 0; virtual int mkfs() = 0; // wipe virtual int mkjournal() = 0; // journal only virtual bool needs_journal() = 0; //< requires a journal virtual bool wants_journal() = 0; //< prefers a journal virtual bool allows_journal() = 0; //< allows a journal /** * is_rotational * * Check whether store is backed by a rotational (HDD) or non-rotational * (SSD) device. * * This must be usable *before* the store is mounted. * * @return true for HDD, false for SSD */ virtual bool is_rotational() { return true; } /** * is_journal_rotational * * Check whether journal is backed by a rotational (HDD) or non-rotational * (SSD) device. * * * @return true for HDD, false for SSD */ virtual bool is_journal_rotational() { return true; } virtual string get_default_device_class() { return is_rotational() ? 
"hdd" : "ssd"; } virtual bool can_sort_nibblewise() { return false; // assume a backend cannot, unless it says otherwise } virtual int statfs(struct store_statfs_t *buf) = 0; virtual void collect_metadata(map *pm) { } /** * write_meta - write a simple configuration key out-of-band * * Write a simple key/value pair for basic store configuration * (e.g., a uuid or magic number) to an unopened/unmounted store. * The default implementation writes this to a plaintext file in the * path. * * A newline is appended. * * @param key key name (e.g., "fsid") * @param value value (e.g., a uuid rendered as a string) * @returns 0 for success, or an error code */ virtual int write_meta(const std::string& key, const std::string& value); /** * read_meta - read a simple configuration key out-of-band * * Read a simple key value to an unopened/mounted store. * * Trailing whitespace is stripped off. * * @param key key name * @param value pointer to value string * @returns 0 for success, or an error code */ virtual int read_meta(const std::string& key, std::string *value); /** * get ideal max value for collection_list() * * default to some arbitrary values; the implementation will override. */ virtual int get_ideal_list_max() { return 64; } /** * get a collection handle * * Provide a trivial handle as a default to avoid converting legacy * implementations. */ virtual CollectionHandle open_collection(const coll_t &cid) { return new CompatCollectionHandle(cid); } /** * Synchronous read operations */ /** * exists -- Test for existance of object * * @param cid collection for object * @param oid oid of object * @returns true if object exists, false otherwise */ virtual bool exists(const coll_t& cid, const ghobject_t& oid) = 0; // useful? virtual bool exists(CollectionHandle& c, const ghobject_t& oid) { return exists(c->get_cid(), oid); } /** * set_collection_opts -- set pool options for a collectioninformation for an object * * @param cid collection * @param opts new collection options * @returns 0 on success, negative error code on failure. */ virtual int set_collection_opts( const coll_t& cid, const pool_opts_t& opts) = 0; /** * stat -- get information for an object * * @param cid collection for object * @param oid oid of object * @param st output information for the object * @param allow_eio if false, assert on -EIO operation failure * @returns 0 on success, negative error code on failure. */ virtual int stat( const coll_t& cid, const ghobject_t& oid, struct stat *st, bool allow_eio = false) = 0; // struct stat? virtual int stat( CollectionHandle &c, const ghobject_t& oid, struct stat *st, bool allow_eio = false) { return stat(c->get_cid(), oid, st, allow_eio); } /** * read -- read a byte range of data from an object * * Note: if reading from an offset past the end of the object, we * return 0 (not, say, -EINVAL). * * @param cid collection for object * @param oid oid of object * @param offset location offset of first byte to be read * @param len number of bytes to be read * @param bl output bufferlist * @param op_flags is CEPH_OSD_OP_FLAG_* * @param allow_eio if false, assert on -EIO operation failure * @returns number of bytes read on success, or negative error code on failure. 
*/ virtual int read( const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl, uint32_t op_flags = 0) = 0; virtual int read( CollectionHandle &c, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl, uint32_t op_flags = 0) { return read(c->get_cid(), oid, offset, len, bl, op_flags); } /** * fiemap -- get extent map of data of an object * * Returns an encoded map of the extents of an object's data portion * (map). * * A non-enlightened implementation is free to return the extent (offset, len) * as the sole extent. * * @param cid collection for object * @param oid oid of object * @param offset location offset of first byte to be read * @param len number of bytes to be read * @param bl output bufferlist for extent map information. * @returns 0 on success, negative error code on failure. */ virtual int fiemap(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) = 0; virtual int fiemap(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len, map& destmap) = 0; virtual int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, bufferlist& bl) { return fiemap(c->get_cid(), oid, offset, len, bl); } virtual int fiemap(CollectionHandle& c, const ghobject_t& oid, uint64_t offset, size_t len, map& destmap) { return fiemap(c->get_cid(), oid, offset, len, destmap); } /** * getattr -- get an xattr of an object * * @param cid collection for object * @param oid oid of object * @param name name of attr to read * @param value place to put output result. * @returns 0 on success, negative error code on failure. */ virtual int getattr(const coll_t& cid, const ghobject_t& oid, const char *name, bufferptr& value) = 0; virtual int getattr(CollectionHandle &c, const ghobject_t& oid, const char *name, bufferptr& value) { return getattr(c->get_cid(), oid, name, value); } /** * getattr -- get an xattr of an object * * @param cid collection for object * @param oid oid of object * @param name name of attr to read * @param value place to put output result. * @returns 0 on success, negative error code on failure. */ int getattr(const coll_t& cid, const ghobject_t& oid, const char *name, bufferlist& value) { bufferptr bp; int r = getattr(cid, oid, name, bp); if (bp.length()) value.push_back(bp); return r; } int getattr( coll_t cid, const ghobject_t& oid, const string& name, bufferlist& value) { bufferptr bp; int r = getattr(cid, oid, name.c_str(), bp); value.push_back(bp); return r; } int getattr( CollectionHandle &c, const ghobject_t& oid, const string& name, bufferlist& value) { bufferptr bp; int r = getattr(c, oid, name.c_str(), bp); value.push_back(bp); return r; } /** * getattrs -- get all of the xattrs of an object * * @param cid collection for object * @param oid oid of object * @param aset place to put output result. * @returns 0 on success, negative error code on failure. */ virtual int getattrs(const coll_t& cid, const ghobject_t& oid, map& aset) = 0; virtual int getattrs(CollectionHandle &c, const ghobject_t& oid, map& aset) { return getattrs(c->get_cid(), oid, aset); } /** * getattrs -- get all of the xattrs of an object * * @param cid collection for object * @param oid oid of object * @param aset place to put output result. * @returns 0 on success, negative error code on failure. 
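   *
   * For example (an illustrative sketch, not from the original text):
   *
   *   map<string, bufferlist> xattrs;
   *   int r = store->getattrs(cid, oid, xattrs);  // fills xattrs on success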
*/ int getattrs(const coll_t& cid, const ghobject_t& oid, map& aset) { map bmap; int r = getattrs(cid, oid, bmap); for (map::iterator i = bmap.begin(); i != bmap.end(); ++i) { aset[i->first].append(i->second); } return r; } int getattrs(CollectionHandle &c, const ghobject_t& oid, map& aset) { map bmap; int r = getattrs(c, oid, bmap); for (map::iterator i = bmap.begin(); i != bmap.end(); ++i) { aset[i->first].append(i->second); } return r; } // collections /** * list_collections -- get all of the collections known to this ObjectStore * * @param ls list of the collections in sorted order. * @returns 0 on success, negative error code on failure. */ virtual int list_collections(vector& ls) = 0; /** * does a collection exist? * * @param c collection * @returns true if it exists, false otherwise */ virtual bool collection_exists(const coll_t& c) = 0; /** * is a collection empty? * * @param c collection * @param empty true if the specified collection is empty, false otherwise * @returns 0 on success, negative error code on failure. */ virtual int collection_empty(const coll_t& c, bool *empty) = 0; /** * return the number of significant bits of the coll_t::pgid. * * This should return what the last create_collection or split_collection * set. A legacy backend may return -EAGAIN if the value is unavailable * (because we upgraded from an older version, e.g., FileStore). */ virtual int collection_bits(const coll_t& c) = 0; /** * list contents of a collection that fall in the range [start, end) and no more than a specified many result * * @param c collection * @param start list object that sort >= this value * @param end list objects that sort < this value * @param max return no more than this many results * @param seq return no objects with snap < seq * @param ls [out] result * @param next [out] next item sorts >= this value * @return zero on success, or negative error */ virtual int collection_list(const coll_t& c, const ghobject_t& start, const ghobject_t& end, int max, vector *ls, ghobject_t *next) = 0; virtual int collection_list(CollectionHandle &c, const ghobject_t& start, const ghobject_t& end, int max, vector *ls, ghobject_t *next) { return collection_list(c->get_cid(), start, end, max, ls, next); } /// OMAP /// Get omap contents virtual int omap_get( const coll_t& c, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap bufferlist *header, ///< [out] omap header map *out /// < [out] Key to value map ) = 0; virtual int omap_get( CollectionHandle &c, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap bufferlist *header, ///< [out] omap header map *out /// < [out] Key to value map ) { return omap_get(c->get_cid(), oid, header, out); } /// Get omap header virtual int omap_get_header( const coll_t& c, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap bufferlist *header, ///< [out] omap header bool allow_eio = false ///< [in] don't assert on eio ) = 0; virtual int omap_get_header( CollectionHandle &c, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap bufferlist *header, ///< [out] omap header bool allow_eio = false ///< [in] don't assert on eio ) { return omap_get_header(c->get_cid(), oid, header, allow_eio); } /// Get keys defined on oid virtual int omap_get_keys( const coll_t& c, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap set *keys ///< [out] Keys defined on oid ) = 0; virtual int 
omap_get_keys( CollectionHandle &c, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap set *keys ///< [out] Keys defined on oid ) { return omap_get_keys(c->get_cid(), oid, keys); } /// Get key values virtual int omap_get_values( const coll_t& c, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap const set &keys, ///< [in] Keys to get map *out ///< [out] Returned keys and values ) = 0; virtual int omap_get_values( CollectionHandle &c, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap const set &keys, ///< [in] Keys to get map *out ///< [out] Returned keys and values ) { return omap_get_values(c->get_cid(), oid, keys, out); } /// Filters keys into out which are defined on oid virtual int omap_check_keys( const coll_t& c, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap const set &keys, ///< [in] Keys to check set *out ///< [out] Subset of keys defined on oid ) = 0; virtual int omap_check_keys( CollectionHandle &c, ///< [in] Collection containing oid const ghobject_t &oid, ///< [in] Object containing omap const set &keys, ///< [in] Keys to check set *out ///< [out] Subset of keys defined on oid ) { return omap_check_keys(c->get_cid(), oid, keys, out); } /** * Returns an object map iterator * * Warning! The returned iterator is an implicit lock on filestore * operations in c. Do not use filestore methods on c while the returned * iterator is live. (Filling in a transaction is no problem). * * @return iterator, null on error */ virtual ObjectMap::ObjectMapIterator get_omap_iterator( const coll_t& c, ///< [in] collection const ghobject_t &oid ///< [in] object ) = 0; virtual ObjectMap::ObjectMapIterator get_omap_iterator( CollectionHandle &c, ///< [in] collection const ghobject_t &oid ///< [in] object ) { return get_omap_iterator(c->get_cid(), oid); } virtual int flush_journal() { return -EOPNOTSUPP; } virtual int dump_journal(ostream& out) { return -EOPNOTSUPP; } virtual int snapshot(const string& name) { return -EOPNOTSUPP; } /** * Set and get internal fsid for this instance. No external data is modified */ virtual void set_fsid(uuid_d u) = 0; virtual uuid_d get_fsid() = 0; /** * Estimates additional disk space used by the specified amount of objects and caused by file allocation granularity and metadata store * - num objects - total (including witeouts) object count to measure used space for. */ virtual uint64_t estimate_objects_overhead(uint64_t num_objects) = 0; // DEBUG virtual void inject_data_error(const ghobject_t &oid) {} virtual void inject_mdata_error(const ghobject_t &oid) {} virtual void compact() {} }; WRITE_CLASS_ENCODER(ObjectStore::Transaction) WRITE_CLASS_ENCODER(ObjectStore::Transaction::TransactionData) static inline void intrusive_ptr_add_ref(ObjectStore::Sequencer_impl *s) { s->get(); } static inline void intrusive_ptr_release(ObjectStore::Sequencer_impl *s) { s->put(); } ostream& operator<<(ostream& out, const ObjectStore::Sequencer& s); ostream& operator<<(ostream& out, const ObjectStore::Transaction& tx); #endif
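
/*
 * Illustrative end-to-end sketch (not part of the original header) of
 * driving an ObjectStore instance.  The store type and data/journal
 * paths are assumed values, and error handling/cleanup is omitted.
 *
 *   ObjectStore *store = ObjectStore::create(cct, "bluestore",
 *                                            "/var/lib/ceph/osd/ceph-0",
 *                                            "/var/lib/ceph/osd/ceph-0/journal");
 *   store->mkfs();
 *   store->mount();
 *
 *   ObjectStore::Sequencer osr("example");
 *   coll_t cid;                                  // default collection, for illustration
 *   ghobject_t oid(hobject_t(sobject_t("obj", CEPH_NOSNAP)));
 *   bufferlist bl;
 *   bl.append("hello");
 *
 *   ObjectStore::Transaction t;
 *   t.create_collection(cid, 0);
 *   t.touch(cid, oid);
 *   t.write(cid, oid, 0, bl.length(), bl);
 *   store->queue_transaction(&osr, std::move(t), nullptr);  // onreadable omitted
 *
 *   osr.flush();                                 // wait for the queued ops to apply
 *   store->umount();
 */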