1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 #ifndef LEVEL_DB_STORE_H
4 #define LEVEL_DB_STORE_H
6 #include "include/types.h"
7 #include "include/buffer_fwd.h"
8 #include "KeyValueDB.h"
12 #include "include/memory.h"
13 #include <boost/scoped_ptr.hpp>
14 #include "leveldb/db.h"
15 #include "leveldb/env.h"
16 #include "leveldb/write_batch.h"
17 #include "leveldb/slice.h"
18 #include "leveldb/cache.h"
19 #ifdef HAVE_LEVELDB_FILTER_POLICY
20 #include "leveldb/filter_policy.h"
24 #include "common/errno.h"
25 #include "common/dout.h"
26 #include "include/assert.h"
27 #include "common/Formatter.h"
28 #include "common/Cond.h"
30 #include "common/ceph_context.h"
32 // reinclude our assert to clobber the system one
33 # include "include/assert.h"
38 l_leveldb_first = 34300,
41 l_leveldb_get_latency,
42 l_leveldb_submit_latency,
43 l_leveldb_submit_sync_latency,
45 l_leveldb_compact_range,
46 l_leveldb_compact_queue_merge,
47 l_leveldb_compact_queue_len,
51 extern leveldb::Logger *create_leveldb_ceph_logger();
53 class CephLevelDBLogger;
56 * Uses LevelDB to implement the KeyValueDB interface
58 class LevelDBStore : public KeyValueDB {
61 CephLevelDBLogger *ceph_logger;
63 boost::scoped_ptr<leveldb::Cache> db_cache;
64 #ifdef HAVE_LEVELDB_FILTER_POLICY
65 boost::scoped_ptr<const leveldb::FilterPolicy> filterpolicy;
67 boost::scoped_ptr<leveldb::DB> db;
69 int do_open(ostream &out, bool create_if_missing);
71 // manage async compactions
72 Mutex compact_queue_lock;
73 Cond compact_queue_cond;
74 list< pair<string,string> > compact_queue;
75 bool compact_queue_stop;
76 class CompactThread : public Thread {
79 explicit CompactThread(LevelDBStore *d) : db(d) {}
80 void *entry() override {
81 db->compact_thread_entry();
84 friend class LevelDBStore;
87 void compact_thread_entry();
89 void compact_range(const string& start, const string& end) {
90 leveldb::Slice cstart(start);
91 leveldb::Slice cend(end);
92 db->CompactRange(&cstart, &cend);
94 void compact_range_async(const string& start, const string& end);
97 /// compact the underlying leveldb store
98 void compact() override;
100 /// compact db for all keys with a given prefix
101 void compact_prefix(const string& prefix) override {
102 compact_range(prefix, past_prefix(prefix));
104 void compact_prefix_async(const string& prefix) override {
105 compact_range_async(prefix, past_prefix(prefix));
107 void compact_range(const string& prefix,
108 const string& start, const string& end) override {
109 compact_range(combine_strings(prefix, start), combine_strings(prefix, end));
111 void compact_range_async(const string& prefix,
112 const string& start, const string& end) override {
113 compact_range_async(combine_strings(prefix, start),
114 combine_strings(prefix, end));
119 * options_t: Holds options which are minimally interpreted
120 * on initialization and then passed through to LevelDB.
121 * We transform a couple of these into actual LevelDB
122 * structures, but the rest are simply passed through unchanged. See
123 * leveldb/options.h for more precise details on each.
125 * Set them after constructing the LevelDBStore, but before calling
126 * open() or create_and_open().
129 uint64_t write_buffer_size; /// in-memory write buffer size
130 int max_open_files; /// maximum number of files LevelDB can open at once
131 uint64_t cache_size; /// size of extra decompressed cache to use
132 uint64_t block_size; /// user data per block
133 int bloom_size; /// number of bits per entry to put in a bloom filter
134 bool compression_enabled; /// whether to use libsnappy compression or not
136 // don't change these ones. No, seriously
137 int block_restart_interval;
138 bool error_if_exists;
139 bool paranoid_checks;
144 write_buffer_size(0), //< 0 means default
145 max_open_files(0), //< 0 means default
146 cache_size(0), //< 0 means no cache (default)
147 block_size(0), //< 0 means default
148 bloom_size(0), //< 0 means no bloom filter (default)
149 compression_enabled(true), //< set to false for no compression
150 block_restart_interval(0), //< 0 means default
151 error_if_exists(false), //< set to true if you want to check nonexistence
152 paranoid_checks(false) //< set to true if you want paranoid checks
156 LevelDBStore(CephContext *c, const string &path) :
162 #ifdef HAVE_LEVELDB_FILTER_POLICY
165 compact_queue_lock("LevelDBStore::compact_thread_lock"),
166 compact_queue_stop(false),
167 compact_thread(this),
171 ~LevelDBStore() override;
173 static int _test_init(const string& dir);
174 int init(string option_str="") override;
176 /// Opens underlying db
177 int open(ostream &out) override {
178 return do_open(out, false);
180 /// Creates underlying db if missing and opens it
181 int create_and_open(ostream &out) override {
182 return do_open(out, true);
185 void close() override;
187 PerfCounters *get_perf_counters() override
192 class LevelDBTransactionImpl : public KeyValueDB::TransactionImpl {
194 leveldb::WriteBatch bat;
196 explicit LevelDBTransactionImpl(LevelDBStore *db) : db(db) {}
198 const string &prefix,
200 const bufferlist &bl) override;
201 using KeyValueDB::TransactionImpl::set;
203 const string &prefix,
204 const string &k) override;
205 void rmkeys_by_prefix(
208 virtual void rm_range_keys(
209 const string &prefix,
211 const string &end) override;
213 using KeyValueDB::TransactionImpl::rmkey;
216 KeyValueDB::Transaction get_transaction() override {
217 return std::make_shared<LevelDBTransactionImpl>(this);
220 int submit_transaction(KeyValueDB::Transaction t) override;
221 int submit_transaction_sync(KeyValueDB::Transaction t) override;
223 const string &prefix,
224 const std::set<string> &key,
225 std::map<string, bufferlist> *out
228 int get(const string &prefix,
230 bufferlist *value) override;
232 using KeyValueDB::get;
234 class LevelDBWholeSpaceIteratorImpl :
235 public KeyValueDB::WholeSpaceIteratorImpl {
237 boost::scoped_ptr<leveldb::Iterator> dbiter;
239 explicit LevelDBWholeSpaceIteratorImpl(leveldb::Iterator *iter) :
241 ~LevelDBWholeSpaceIteratorImpl() override { }
243 int seek_to_first() override {
244 dbiter->SeekToFirst();
245 return dbiter->status().ok() ? 0 : -1;
247 int seek_to_first(const string &prefix) override {
248 leveldb::Slice slice_prefix(prefix);
249 dbiter->Seek(slice_prefix);
250 return dbiter->status().ok() ? 0 : -1;
252 int seek_to_last() override {
253 dbiter->SeekToLast();
254 return dbiter->status().ok() ? 0 : -1;
// Position the iterator relative to the end of `prefix`: seek to the bound
// returned by past_prefix(prefix) and, if that seek leaves the iterator
// invalid, fall back to the last entry of the database.
// Returns 0 when the iterator status is ok afterwards, -1 otherwise.
// NOTE(review): what is done when the seek lands on a valid entry is not
// visible in this view -- confirm against the full source.
256 int seek_to_last(const string &prefix) override {
257 string limit = past_prefix(prefix);
258 leveldb::Slice slice_limit(limit);
259 dbiter->Seek(slice_limit);
// Seeking to the bound found nothing valid: use the last entry instead.
261 if (!dbiter->Valid()) {
262 dbiter->SeekToLast();
266 return dbiter->status().ok() ? 0 : -1;
// Built on lower_bound(prefix, after): after that seek, the current raw
// (prefix, key) pair is compared against (prefix, after) to detect an exact
// match on the requested key.
// Returns 0 when the iterator status is ok afterwards, -1 otherwise.
// NOTE(review): the action taken on an exact match is not visible in this
// view; presumably the iterator is stepped past the matching entry -- confirm.
268 int upper_bound(const string &prefix, const string &after) override {
269 lower_bound(prefix, after);
271 pair<string,string> key = raw_key();
272 if (key.first == prefix && key.second == after)
275 return dbiter->status().ok() ? 0 : -1;
277 int lower_bound(const string &prefix, const string &to) override {
278 string bound = combine_strings(prefix, to);
279 leveldb::Slice slice_bound(bound);
280 dbiter->Seek(slice_bound);
281 return dbiter->status().ok() ? 0 : -1;
283 bool valid() override {
284 return dbiter->Valid();
286 int next() override {
289 return dbiter->status().ok() ? 0 : -1;
291 int prev() override {
294 return dbiter->status().ok() ? 0 : -1;
296 string key() override {
298 split_key(dbiter->key(), 0, &out_key);
301 pair<string,string> raw_key() override {
303 split_key(dbiter->key(), &prefix, &key);
304 return make_pair(prefix, key);
// Test whether the current raw key starts with `prefix` followed by a '\0'
// byte. The raw key must be strictly longer than the prefix and hold '\0'
// at index prefix.length(); only then are the leading prefix.length() bytes
// compared with memcmp().
// NOTE(review): the result when the length/'\0' check fails is not visible
// in this view; presumably false -- confirm.
306 bool raw_key_is_prefixed(const string &prefix) override {
307 leveldb::Slice key = dbiter->key();
308 if ((key.size() > prefix.length()) && (key[prefix.length()] == '\0')) {
309 return memcmp(key.data(), prefix.c_str(), prefix.length()) == 0;
314 bufferlist value() override {
315 return to_bufferlist(dbiter->value());
318 bufferptr value_as_ptr() override {
319 leveldb::Slice data = dbiter->value();
320 return bufferptr(data.data(), data.size());
323 int status() override {
324 return dbiter->status().ok() ? 0 : -1;
329 static string combine_strings(const string &prefix, const string &value);
330 static int split_key(leveldb::Slice in, string *prefix, string *key);
331 static bufferlist to_bufferlist(leveldb::Slice in);
332 static string past_prefix(const string &prefix) {
333 string limit = prefix;
// Estimate the on-disk footprint of the store by scanning the directory at
// `path` and stat()-ing every entry other than "." and "..".
// File sizes are accumulated into separate sst / log / misc counters:
// names without a '.' count as misc, extension "log" counts as log.
// The three counters and their sum are written to `extra` under the keys
// "sst", "log", "misc" and "total".
// A failure to open the directory is logged through lderr.
// NOTE(review): several lines of this function are not visible in this view
// (the handling after a failed opendir/stat, the condition selecting the sst
// bucket, release of store_dir, and the return statement) -- confirm those
// against the full source.
338 uint64_t get_estimated_size(map<string,uint64_t> &extra) override {
339 DIR *store_dir = opendir(path.c_str());
341 lderr(cct) << __func__ << " something happened opening the store: "
342 << cpp_strerror(errno) << dendl;
346 uint64_t total_size = 0;
347 uint64_t sst_size = 0;
348 uint64_t log_size = 0;
349 uint64_t misc_size = 0;
351 struct dirent *entry = NULL;
// Visit every directory entry; the "." and ".." entries are recognised here.
352 while ((entry = readdir(store_dir)) != NULL) {
353 string n(entry->d_name);
355 if (n == "." || n == "..")
358 string fpath = path + '/' + n;
360 int err = stat(fpath.c_str(), &s);
363 // we may race against leveldb while reading files; this should only
364 // happen when those files are being updated, data is being shuffled
365 // and files get removed, in which case there's not much of a problem
366 // as we'll get to them next time around.
// NOTE(review): stat() itself signals failure with -1 and errno; this
// comparison relies on err having been turned into a negative errno value
// on a line not visible here -- confirm.
367 if (err == -ENOENT) {
371 lderr(cct) << __func__ << " error obtaining stats for " << fpath
372 << ": " << cpp_strerror(err) << dendl;
// Classify by the text after the last '.'; names with no '.' are misc.
376 size_t pos = n.find_last_of('.');
377 if (pos == string::npos) {
378 misc_size += s.st_size;
382 string ext = n.substr(pos+1);
384 sst_size += s.st_size;
385 } else if (ext == "log") {
386 log_size += s.st_size;
388 misc_size += s.st_size;
392 total_size = sst_size + log_size + misc_size;
// Publish the per-category breakdown to the caller.
394 extra["sst"] = sst_size;
395 extra["log"] = log_size;
396 extra["misc"] = misc_size;
397 extra["total"] = total_size;
406 WholeSpaceIterator _get_iterator() override {
407 return std::make_shared<LevelDBWholeSpaceIteratorImpl>(
408 db->NewIterator(leveldb::ReadOptions()));