// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * In-memory crash non-safe keyvalue db * Author: Ramesh Chander, Ramesh.Chander@sandisk.com */ #include "include/compat.h" #include #include #include #include #include #include #include #include #include "common/perf_counters.h" #include "common/debug.h" #include "include/str_list.h" #include "include/str_map.h" #include "KeyValueDB.h" #include "MemDB.h" #include "include/assert.h" #include "common/debug.h" #include "common/errno.h" #include "include/compat.h" #define dout_context g_ceph_context #define dout_subsys ceph_subsys_memdb #undef dout_prefix #define dout_prefix *_dout << "memdb: " #define dtrace dout(30) #define dwarn dout(0) #define dinfo dout(0) static void split_key(const string& raw_key, string *prefix, string *key) { size_t pos = raw_key.find(KEY_DELIM, 0); assert(pos != std::string::npos); *prefix = raw_key.substr(0, pos); *key = raw_key.substr(pos + 1, raw_key.length()); } static string make_key(const string &prefix, const string &value) { string out = prefix; out.push_back(KEY_DELIM); out.append(value); return out; } void MemDB::_encode(mdb_iter_t iter, bufferlist &bl) { ::encode(iter->first, bl); ::encode(iter->second, bl); } std::string MemDB::_get_data_fn() { string fn = m_db_path + "/" + "MemDB.db"; return fn; } void MemDB::_save() { std::lock_guard l(m_lock); dout(10) << __func__ << " Saving MemDB to file: "<< _get_data_fn().c_str() << dendl; int mode = 0644; int fd = TEMP_FAILURE_RETRY(::open(_get_data_fn().c_str(), O_WRONLY|O_CREAT|O_TRUNC, mode)); if (fd < 0) { int err = errno; cerr << "write_file(" << _get_data_fn().c_str() << "): failed to open file: " << cpp_strerror(err) << std::endl; return; } bufferlist bl; mdb_iter_t iter = m_map.begin(); while (iter != m_map.end()) { dout(10) << __func__ << " Key:"<< iter->first << dendl; _encode(iter, bl); ++iter; } bl.write_fd(fd); VOID_TEMP_FAILURE_RETRY(::close(fd)); } int MemDB::_load() { std::lock_guard l(m_lock); dout(10) << __func__ << " Reading MemDB from file: "<< _get_data_fn().c_str() << dendl; /* * Open file and read it in single shot. */ int fd = TEMP_FAILURE_RETRY(::open(_get_data_fn().c_str(), O_RDONLY)); if (fd < 0) { int err = errno; cerr << "can't open " << _get_data_fn().c_str() << ": " << cpp_strerror(err) << std::endl; return -err; } struct stat st; memset(&st, 0, sizeof(st)); if (::fstat(fd, &st) < 0) { int err = errno; cerr << "can't stat file " << _get_data_fn().c_str() << ": " << cpp_strerror(err) << std::endl; VOID_TEMP_FAILURE_RETRY(::close(fd)); return -err; } ssize_t file_size = st.st_size; ssize_t bytes_done = 0; while (bytes_done < file_size) { string key; bufferptr datap; bytes_done += ::decode_file(fd, key); bytes_done += ::decode_file(fd, datap); dout(10) << __func__ << " Key:"<< key << dendl; m_map[key] = datap; m_total_bytes += datap.length(); } VOID_TEMP_FAILURE_RETRY(::close(fd)); return 0; } int MemDB::_init(bool create) { int r; dout(1) << __func__ << dendl; if (create) { r = ::mkdir(m_db_path.c_str(), 0700); if (r < 0) { r = -errno; if (r != -EEXIST) { derr << __func__ << " mkdir failed: " << cpp_strerror(r) << dendl; return r; } return 0; // ignore EEXIST } } else { r = _load(); } return r; } int MemDB::set_merge_operator( const string& prefix, std::shared_ptr mop) { merge_ops.push_back(std::make_pair(prefix, mop)); return 0; } int MemDB::do_open(ostream &out, bool create) { m_total_bytes = 0; m_allocated_bytes = 1; return _init(create); } MemDB::~MemDB() { close(); dout(10) << __func__ << " Destroying MemDB instance: "<< dendl; } void MemDB::close() { /* * Save whatever in memory btree. */ _save(); } int MemDB::submit_transaction(KeyValueDB::Transaction t) { MDBTransactionImpl* mt = static_cast(t.get()); dtrace << __func__ << " " << mt->get_ops().size() << dendl; for(auto& op : mt->get_ops()) { if(op.first == MDBTransactionImpl::WRITE) { ms_op_t set_op = op.second; _setkey(set_op); } else if (op.first == MDBTransactionImpl::MERGE) { ms_op_t merge_op = op.second; _merge(merge_op); } else { ms_op_t rm_op = op.second; assert(op.first == MDBTransactionImpl::DELETE); _rmkey(rm_op); } } return 0; } int MemDB::submit_transaction_sync(KeyValueDB::Transaction tsync) { dtrace << __func__ << " " << dendl; submit_transaction(tsync); return 0; } int MemDB::transaction_rollback(KeyValueDB::Transaction t) { MDBTransactionImpl* mt = static_cast(t.get()); mt->clear(); return 0; } void MemDB::MDBTransactionImpl::set( const string &prefix, const string &k, const bufferlist &to_set_bl) { dtrace << __func__ << " " << prefix << " " << k << dendl; ops.push_back(make_pair(WRITE, std::make_pair(std::make_pair(prefix, k), to_set_bl))); } void MemDB::MDBTransactionImpl::rmkey(const string &prefix, const string &k) { dtrace << __func__ << " " << prefix << " " << k << dendl; ops.push_back(make_pair(DELETE, std::make_pair(std::make_pair(prefix, k), bufferlist()))); } void MemDB::MDBTransactionImpl::rmkeys_by_prefix(const string &prefix) { KeyValueDB::Iterator it = m_db->get_iterator(prefix); for (it->seek_to_first(); it->valid(); it->next()) { rmkey(prefix, it->key()); } } void MemDB::MDBTransactionImpl::rm_range_keys(const string &prefix, const string &start, const string &end) { KeyValueDB::Iterator it = m_db->get_iterator(prefix); it->lower_bound(start); while (it->valid()) { if (it->key() >= end) { break; } rmkey(prefix, it->key()); it->next(); } } void MemDB::MDBTransactionImpl::merge( const std::string &prefix, const std::string &key, const bufferlist &value) { dtrace << __func__ << " " << prefix << " " << key << dendl; ops.push_back(make_pair(MERGE, make_pair(std::make_pair(prefix, key), value))); return; } int MemDB::_setkey(ms_op_t &op) { std::lock_guard l(m_lock); std::string key = make_key(op.first.first, op.first.second); bufferlist bl = op.second; m_total_bytes += bl.length(); bufferlist bl_old; if (_get(op.first.first, op.first.second, &bl_old)) { /* * delete and free existing key. */ assert(m_total_bytes >= bl_old.length()); m_total_bytes -= bl_old.length(); m_map.erase(key); } m_map[key] = bufferptr((char *) bl.c_str(), bl.length()); iterator_seq_no++; return 0; } int MemDB::_rmkey(ms_op_t &op) { std::lock_guard l(m_lock); std::string key = make_key(op.first.first, op.first.second); bufferlist bl_old; if (_get(op.first.first, op.first.second, &bl_old)) { assert(m_total_bytes >= bl_old.length()); m_total_bytes -= bl_old.length(); } iterator_seq_no++; /* * Erase will call the destructor for bufferptr. */ return m_map.erase(key); } std::shared_ptr MemDB::_find_merge_op(std::string prefix) { for (const auto& i : merge_ops) { if (i.first == prefix) { return i.second; } } dtrace << __func__ << " No merge op for " << prefix << dendl; return NULL; } int MemDB::_merge(ms_op_t &op) { std::lock_guard l(m_lock); std::string prefix = op.first.first; std::string key = make_key(op.first.first, op.first.second); bufferlist bl = op.second; int64_t bytes_adjusted = bl.length(); /* * find the operator for this prefix */ std::shared_ptr mop = _find_merge_op(prefix); assert(mop); /* * call the merge operator with value and non value */ bufferlist bl_old; if (_get(op.first.first, op.first.second, &bl_old) == false) { std::string new_val; /* * Merge non existent. */ mop->merge_nonexistent(bl.c_str(), bl.length(), &new_val); m_map[key] = bufferptr(new_val.c_str(), new_val.length()); } else { /* * Merge existing. */ std::string new_val; mop->merge(bl_old.c_str(), bl_old.length(), bl.c_str(), bl.length(), &new_val); m_map[key] = bufferptr(new_val.c_str(), new_val.length()); bytes_adjusted -= bl_old.length(); bl_old.clear(); } assert((int64_t)m_total_bytes + bytes_adjusted >= 0); m_total_bytes += bytes_adjusted; iterator_seq_no++; return 0; } /* * Caller take btree lock. */ bool MemDB::_get(const string &prefix, const string &k, bufferlist *out) { string key = make_key(prefix, k); mdb_iter_t iter = m_map.find(key); if (iter == m_map.end()) { return false; } out->push_back((m_map[key].clone())); return true; } bool MemDB::_get_locked(const string &prefix, const string &k, bufferlist *out) { std::lock_guard l(m_lock); return _get(prefix, k, out); } int MemDB::get(const string &prefix, const std::string& key, bufferlist *out) { if (_get_locked(prefix, key, out)) { return 0; } return -ENOENT; } int MemDB::get(const string &prefix, const std::set &keys, std::map *out) { for (const auto& i : keys) { bufferlist bl; if (_get_locked(prefix, i, &bl)) out->insert(make_pair(i, bl)); } return 0; } void MemDB::MDBWholeSpaceIteratorImpl::fill_current() { bufferlist bl; bl.append(m_iter->second.clone()); m_key_value = std::make_pair(m_iter->first, bl); } bool MemDB::MDBWholeSpaceIteratorImpl::valid() { if (m_key_value.first.empty()) { return false; } return true; } bool MemDB::MDBWholeSpaceIteratorImpl::iterator_validate() { if (this_seq_no != *global_seq_no) { auto key = m_key_value.first; assert(!key.empty()); bool restart_iter = false; if (!m_using_btree) { /* * Map is modified and marker key does not exists, * restart the iterator from next key. */ if (m_map_p->find(key) == m_map_p->end()) { restart_iter = true; } } else { restart_iter = true; } if (restart_iter) { m_iter = m_map_p->lower_bound(key); if (m_iter == m_map_p->end()) { return false; } } /* * This iter is valid now. */ this_seq_no = *global_seq_no; } return true; } void MemDB::MDBWholeSpaceIteratorImpl::free_last() { m_key_value.first.clear(); m_key_value.second.clear(); } string MemDB::MDBWholeSpaceIteratorImpl::key() { dtrace << __func__ << " " << m_key_value.first << dendl; string prefix, key; split_key(m_key_value.first, &prefix, &key); return key; } pair MemDB::MDBWholeSpaceIteratorImpl::raw_key() { string prefix, key; split_key(m_key_value.first, &prefix, &key); return make_pair(prefix, key); } bool MemDB::MDBWholeSpaceIteratorImpl::raw_key_is_prefixed( const string &prefix) { string p, k; split_key(m_key_value.first, &p, &k); return (p == prefix); } bufferlist MemDB::MDBWholeSpaceIteratorImpl::value() { dtrace << __func__ << " " << m_key_value << dendl; return m_key_value.second; } int MemDB::MDBWholeSpaceIteratorImpl::next() { std::lock_guard l(*m_map_lock_p); if (!iterator_validate()) { free_last(); return -1; } free_last(); ++m_iter; if (m_iter != m_map_p->end()) { fill_current(); return 0; } else { return -1; } } int MemDB::MDBWholeSpaceIteratorImpl:: prev() { std::lock_guard l(*m_map_lock_p); if (!iterator_validate()) { free_last(); return -1; } free_last(); if (m_iter != m_map_p->begin()) { --m_iter; fill_current(); return 0; } else { return -1; } } /* * First key >= to given key, if key is null then first key in btree. */ int MemDB::MDBWholeSpaceIteratorImpl::seek_to_first(const std::string &k) { std::lock_guard l(*m_map_lock_p); free_last(); if (k.empty()) { m_iter = m_map_p->begin(); } else { m_iter = m_map_p->lower_bound(k); } if (m_iter == m_map_p->end()) { return -1; } fill_current(); return 0; } int MemDB::MDBWholeSpaceIteratorImpl::seek_to_last(const std::string &k) { std::lock_guard l(*m_map_lock_p); free_last(); if (k.empty()) { m_iter = m_map_p->end(); --m_iter; } else { m_iter = m_map_p->lower_bound(k); } if (m_iter == m_map_p->end()) { return -1; } fill_current(); return 0; } MemDB::MDBWholeSpaceIteratorImpl::~MDBWholeSpaceIteratorImpl() { free_last(); } int MemDB::MDBWholeSpaceIteratorImpl::upper_bound(const std::string &prefix, const std::string &after) { std::lock_guard l(*m_map_lock_p); dtrace << "upper_bound " << prefix.c_str() << after.c_str() << dendl; string k = make_key(prefix, after); m_iter = m_map_p->upper_bound(k); if (m_iter != m_map_p->end()) { fill_current(); return 0; } return -1; } int MemDB::MDBWholeSpaceIteratorImpl::lower_bound(const std::string &prefix, const std::string &to) { std::lock_guard l(*m_map_lock_p); dtrace << "lower_bound " << prefix.c_str() << to.c_str() << dendl; string k = make_key(prefix, to); m_iter = m_map_p->lower_bound(k); if (m_iter != m_map_p->end()) { fill_current(); return 0; } return -1; }