1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 #include "BitmapFreelistManager.h"
5 #include "kv/KeyValueDB.h"
8 #include "common/debug.h"
10 #define dout_context cct
11 #define dout_subsys ceph_subsys_bluestore
13 #define dout_prefix *_dout << "freelist "
15 void make_offset_key(uint64_t offset, std::string *key)
18 _key_encode_u64(offset, key);
21 struct XorMergeOperator : public KeyValueDB::MergeOperator {
22 void merge_nonexistent(
23 const char *rdata, size_t rlen, std::string *new_value) override {
24 *new_value = std::string(rdata, rlen);
27 const char *ldata, size_t llen,
28 const char *rdata, size_t rlen,
29 std::string *new_value) override {
31 *new_value = std::string(ldata, llen);
32 for (size_t i = 0; i < rlen; ++i) {
33 (*new_value)[i] ^= rdata[i];
36 // We use each operator name and each prefix to construct the
37 // overall RocksDB operator name for consistency check at open time.
38 string name() const override {
43 void BitmapFreelistManager::setup_merge_operator(KeyValueDB *db, string prefix)
45 ceph::shared_ptr<XorMergeOperator> merge_op(new XorMergeOperator);
46 db->set_merge_operator(prefix, merge_op);
49 BitmapFreelistManager::BitmapFreelistManager(CephContext* cct,
53 : FreelistManager(cct),
54 meta_prefix(meta_prefix),
55 bitmap_prefix(bitmap_prefix),
61 int BitmapFreelistManager::create(uint64_t new_size, uint64_t min_alloc_size,
62 KeyValueDB::Transaction txn)
64 bytes_per_block = std::max(cct->_conf->bdev_block_size,
65 (int64_t)min_alloc_size);
66 assert(ISP2(bytes_per_block));
67 size = P2ALIGN(new_size, bytes_per_block);
68 blocks_per_key = cct->_conf->bluestore_freelist_blocks_per_key;
72 blocks = size / bytes_per_block;
73 if (blocks / blocks_per_key * blocks_per_key != blocks) {
74 blocks = (blocks / blocks_per_key + 1) * blocks_per_key;
75 dout(10) << __func__ << " rounding blocks up from 0x" << std::hex << size
76 << " to 0x" << (blocks * bytes_per_block)
77 << " (0x" << blocks << " blocks)" << std::dec << dendl;
78 // set past-eof blocks as allocated
79 _xor(size, blocks * bytes_per_block - size, txn);
82 << " size 0x" << std::hex << size
83 << " bytes_per_block 0x" << bytes_per_block
84 << " blocks 0x" << blocks
85 << " blocks_per_key 0x" << blocks_per_key
89 ::encode(bytes_per_block, bl);
90 txn->set(meta_prefix, "bytes_per_block", bl);
94 ::encode(blocks_per_key, bl);
95 txn->set(meta_prefix, "blocks_per_key", bl);
100 txn->set(meta_prefix, "blocks", bl);
105 txn->set(meta_prefix, "size", bl);
110 int BitmapFreelistManager::init(uint64_t dev_size)
112 dout(1) << __func__ << dendl;
114 KeyValueDB::Iterator it = kvdb->get_iterator(meta_prefix);
115 it->lower_bound(string());
118 while (it->valid()) {
119 string k = it->key();
120 if (k == "bytes_per_block") {
121 bufferlist bl = it->value();
122 bufferlist::iterator p = bl.begin();
123 ::decode(bytes_per_block, p);
124 dout(10) << __func__ << " bytes_per_block 0x" << std::hex
125 << bytes_per_block << std::dec << dendl;
126 } else if (k == "blocks") {
127 bufferlist bl = it->value();
128 bufferlist::iterator p = bl.begin();
130 dout(10) << __func__ << " blocks 0x" << std::hex << blocks << std::dec
132 } else if (k == "size") {
133 bufferlist bl = it->value();
134 bufferlist::iterator p = bl.begin();
136 dout(10) << __func__ << " size 0x" << std::hex << size << std::dec
138 } else if (k == "blocks_per_key") {
139 bufferlist bl = it->value();
140 bufferlist::iterator p = bl.begin();
141 ::decode(blocks_per_key, p);
142 dout(10) << __func__ << " blocks_per_key 0x" << std::hex << blocks_per_key
143 << std::dec << dendl;
145 derr << __func__ << " unrecognized meta " << k << dendl;
151 dout(10) << __func__ << std::hex
152 << " size 0x" << size
153 << " bytes_per_block 0x" << bytes_per_block
154 << " blocks 0x" << blocks
155 << " blocks_per_key 0x" << blocks_per_key
156 << std::dec << dendl;
159 // check for http://tracker.ceph.com/issues/21089 inconsistency
161 uint64_t new_size = P2ALIGN(dev_size, bytes_per_block);
162 if (new_size != size) {
163 uint64_t bad_size = new_size & ~bytes_per_block;
164 if (size == bad_size) {
165 derr << __func__ << " size is 0x" << std::hex << size << " should be 0x"
166 << new_size << " and appears to be due to #21089" << std::dec
169 uint64_t new_blocks = new_size / bytes_per_block;
170 if (new_blocks / blocks_per_key * blocks_per_key != new_blocks) {
171 new_blocks = (new_blocks / blocks_per_key + 1) *
175 KeyValueDB::Transaction t = kvdb->get_transaction();
178 ::encode(new_size, sizebl);
179 t->set(meta_prefix, "size", sizebl);
181 if (new_blocks != blocks) {
182 derr << "blocks is 0x" << std::hex << blocks << " should be 0x"
183 << new_blocks << std::dec << dendl;
185 ::encode(new_blocks, bl);
186 t->set(meta_prefix, "blocks", bl);
187 _xor(new_size, new_blocks * bytes_per_block - new_size, t);
189 derr << "blocks are ok" << dendl;
190 _xor(bad_size, bytes_per_block, t);
192 int r = kvdb->submit_transaction_sync(t);
196 derr << __func__ << " fixed inconsistency, size now 0x" << std::hex
197 << size << " blocks 0x" << blocks << std::dec << dendl;
204 void BitmapFreelistManager::_init_misc()
206 bufferptr z(blocks_per_key >> 3);
207 memset(z.c_str(), 0xff, z.length());
209 all_set_bl.append(z);
211 block_mask = ~(bytes_per_block - 1);
213 bytes_per_key = bytes_per_block * blocks_per_key;
214 key_mask = ~(bytes_per_key - 1);
215 dout(10) << __func__ << std::hex << " bytes_per_key 0x" << bytes_per_key
216 << ", key_mask 0x" << key_mask << std::dec
220 void BitmapFreelistManager::shutdown()
222 dout(1) << __func__ << dendl;
225 void BitmapFreelistManager::enumerate_reset()
227 std::lock_guard<std::mutex> l(lock);
228 enumerate_offset = 0;
229 enumerate_bl_pos = 0;
230 enumerate_bl.clear();
234 int get_next_clear_bit(bufferlist& bl, int start)
236 const char *p = bl.c_str();
237 int bits = bl.length() << 3;
238 while (start < bits) {
239 // byte = start / 8 (or start >> 3)
240 // bit = start % 8 (or start & 7)
241 unsigned char byte_mask = 1 << (start & 7);
242 if ((p[start >> 3] & byte_mask) == 0) {
247 return -1; // not found
250 int get_next_set_bit(bufferlist& bl, int start)
252 const char *p = bl.c_str();
253 int bits = bl.length() << 3;
254 while (start < bits) {
255 int which_byte = start / 8;
256 int which_bit = start % 8;
257 unsigned char byte_mask = 1 << which_bit;
258 if (p[which_byte] & byte_mask) {
263 return -1; // not found
266 bool BitmapFreelistManager::enumerate_next(uint64_t *offset, uint64_t *length)
268 std::lock_guard<std::mutex> l(lock);
270 // initial base case is a bit awkward
271 if (enumerate_offset == 0 && enumerate_bl_pos == 0) {
272 dout(10) << __func__ << " start" << dendl;
273 enumerate_p = kvdb->get_iterator(bitmap_prefix);
274 enumerate_p->lower_bound(string());
275 // we assert that the first block is always allocated; it's true,
276 // and it simplifies our lives a bit.
277 assert(enumerate_p->valid());
278 string k = enumerate_p->key();
279 const char *p = k.c_str();
280 _key_decode_u64(p, &enumerate_offset);
281 enumerate_bl = enumerate_p->value();
282 assert(enumerate_offset == 0);
283 assert(get_next_set_bit(enumerate_bl, 0) == 0);
286 if (enumerate_offset >= size) {
287 dout(10) << __func__ << " end" << dendl;
291 // skip set bits to find offset
293 enumerate_bl_pos = get_next_clear_bit(enumerate_bl, enumerate_bl_pos);
294 if (enumerate_bl_pos >= 0) {
295 *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
296 dout(30) << __func__ << " found clear bit, key 0x" << std::hex
297 << enumerate_offset << " bit 0x" << enumerate_bl_pos
298 << " offset 0x" << *offset
299 << std::dec << dendl;
302 dout(30) << " no more clear bits in 0x" << std::hex << enumerate_offset
303 << std::dec << dendl;
305 enumerate_bl.clear();
306 if (!enumerate_p->valid()) {
307 enumerate_offset += bytes_per_key;
308 enumerate_bl_pos = 0;
309 *offset = _get_offset(enumerate_offset, enumerate_bl_pos);
312 string k = enumerate_p->key();
313 const char *p = k.c_str();
314 uint64_t next = enumerate_offset + bytes_per_key;
315 _key_decode_u64(p, &enumerate_offset);
316 enumerate_bl = enumerate_p->value();
317 enumerate_bl_pos = 0;
318 if (enumerate_offset > next) {
319 dout(30) << " no key at 0x" << std::hex << next << ", got 0x"
320 << enumerate_offset << std::dec << dendl;
326 // skip clear bits to find the end
328 if (enumerate_p->valid()) {
330 enumerate_bl_pos = get_next_set_bit(enumerate_bl, enumerate_bl_pos);
331 if (enumerate_bl_pos >= 0) {
332 end = _get_offset(enumerate_offset, enumerate_bl_pos);
333 dout(30) << __func__ << " found set bit, key 0x" << std::hex
334 << enumerate_offset << " bit 0x" << enumerate_bl_pos
335 << " offset 0x" << end << std::dec
337 *length = end - *offset;
338 assert((*offset + *length) <= size);
339 dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
340 << std::dec << dendl;
343 dout(30) << " no more set bits in 0x" << std::hex << enumerate_offset
344 << std::dec << dendl;
346 enumerate_bl.clear();
347 enumerate_bl_pos = 0;
348 if (!enumerate_p->valid()) {
351 string k = enumerate_p->key();
352 const char *p = k.c_str();
353 _key_decode_u64(p, &enumerate_offset);
354 enumerate_bl = enumerate_p->value();
359 if (enumerate_offset < end) {
360 *length = end - *offset;
361 dout(10) << __func__ << std::hex << " 0x" << *offset << "~" << *length
362 << std::dec << dendl;
363 enumerate_offset = end;
364 enumerate_bl_pos = blocks_per_key;
365 assert((*offset + *length) <= size);
369 dout(10) << __func__ << " end" << dendl;
373 void BitmapFreelistManager::dump()
376 uint64_t offset, length;
377 while (enumerate_next(&offset, &length)) {
378 dout(20) << __func__ << " 0x" << std::hex << offset << "~" << length
379 << std::dec << dendl;
383 void BitmapFreelistManager::_verify_range(uint64_t offset, uint64_t length,
387 uint64_t first_key = offset & key_mask;
388 uint64_t last_key = (offset + length - 1) & key_mask;
389 if (first_key == last_key) {
391 make_offset_key(first_key, &k);
393 kvdb->get(bitmap_prefix, k, &bl);
394 if (bl.length() > 0) {
395 const char *p = bl.c_str();
396 unsigned s = (offset & ~key_mask) / bytes_per_block;
397 unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
398 for (unsigned i = s; i <= e; ++i) {
399 int has = !!(p[i >> 3] & (1ull << (i & 7)));
401 derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x"
402 << i << " has 0x" << has << " expected 0x" << val
403 << std::dec << dendl;
409 derr << __func__ << " key 0x" << std::hex << first_key
410 << " not present, expected 0x" << val << std::dec << dendl;
418 make_offset_key(first_key, &k);
420 kvdb->get(bitmap_prefix, k, &bl);
422 const char *p = bl.c_str();
423 unsigned s = (offset & ~key_mask) / bytes_per_block;
424 unsigned e = blocks_per_key;
425 for (unsigned i = s; i < e; ++i) {
426 int has = !!(p[i >> 3] & (1ull << (i & 7)));
428 derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x"
429 << i << " has 0x" << has << " expected 0x" << val << std::dec
436 derr << __func__ << " key 0x" << std::hex << first_key
437 << " not present, expected 0x" << val << std::dec << dendl;
441 first_key += bytes_per_key;
444 if (first_key < last_key) {
445 while (first_key < last_key) {
447 make_offset_key(first_key, &k);
449 kvdb->get(bitmap_prefix, k, &bl);
450 if (bl.length() > 0) {
451 const char *p = bl.c_str();
452 for (unsigned i = 0; i < blocks_per_key; ++i) {
453 int has = !!(p[i >> 3] & (1ull << (i & 7)));
455 derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x"
456 << i << " has 0x" << has << " expected 0x" << val
457 << std::dec << dendl;
463 derr << __func__ << " key 0x" << std::hex << first_key
464 << " not present, expected 0x" << val << std::dec << dendl;
468 first_key += bytes_per_key;
471 assert(first_key == last_key);
474 make_offset_key(first_key, &k);
476 kvdb->get(bitmap_prefix, k, &bl);
477 if (bl.length() > 0) {
478 const char *p = bl.c_str();
479 unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
480 for (unsigned i = 0; i < e; ++i) {
481 int has = !!(p[i >> 3] & (1ull << (i & 7)));
483 derr << __func__ << " key 0x" << std::hex << first_key << " bit 0x"
484 << i << " has 0x" << has << " expected 0x" << val << std::dec
491 derr << __func__ << " key 0x" << std::hex << first_key
492 << " not present, expected 0x" << val << std::dec << dendl;
499 derr << __func__ << " saw " << errors << " errors" << dendl;
500 assert(0 == "bitmap freelist errors");
504 void BitmapFreelistManager::allocate(
505 uint64_t offset, uint64_t length,
506 KeyValueDB::Transaction txn)
508 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
509 << std::dec << dendl;
510 if (cct->_conf->bluestore_debug_freelist)
511 _verify_range(offset, length, 0);
512 _xor(offset, length, txn);
515 void BitmapFreelistManager::release(
516 uint64_t offset, uint64_t length,
517 KeyValueDB::Transaction txn)
519 dout(10) << __func__ << " 0x" << std::hex << offset << "~" << length
520 << std::dec << dendl;
521 if (cct->_conf->bluestore_debug_freelist)
522 _verify_range(offset, length, 1);
523 _xor(offset, length, txn);
526 void BitmapFreelistManager::_xor(
527 uint64_t offset, uint64_t length,
528 KeyValueDB::Transaction txn)
530 // must be block aligned
531 assert((offset & block_mask) == offset);
532 assert((length & block_mask) == length);
534 uint64_t first_key = offset & key_mask;
535 uint64_t last_key = (offset + length - 1) & key_mask;
536 dout(20) << __func__ << " first_key 0x" << std::hex << first_key
537 << " last_key 0x" << last_key << std::dec << dendl;
539 if (first_key == last_key) {
540 bufferptr p(blocks_per_key >> 3);
542 unsigned s = (offset & ~key_mask) / bytes_per_block;
543 unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
544 for (unsigned i = s; i <= e; ++i) {
545 p[i >> 3] ^= 1ull << (i & 7);
548 make_offset_key(first_key, &k);
551 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
552 bl.hexdump(*_dout, false);
554 txn->merge(bitmap_prefix, k, bl);
558 bufferptr p(blocks_per_key >> 3);
560 unsigned s = (offset & ~key_mask) / bytes_per_block;
561 unsigned e = blocks_per_key;
562 for (unsigned i = s; i < e; ++i) {
563 p[i >> 3] ^= 1ull << (i & 7);
566 make_offset_key(first_key, &k);
569 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
570 bl.hexdump(*_dout, false);
572 txn->merge(bitmap_prefix, k, bl);
573 first_key += bytes_per_key;
576 while (first_key < last_key) {
578 make_offset_key(first_key, &k);
579 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec
581 all_set_bl.hexdump(*_dout, false);
583 txn->merge(bitmap_prefix, k, all_set_bl);
584 first_key += bytes_per_key;
586 assert(first_key == last_key);
588 bufferptr p(blocks_per_key >> 3);
590 unsigned e = ((offset + length - 1) & ~key_mask) / bytes_per_block;
591 for (unsigned i = 0; i <= e; ++i) {
592 p[i >> 3] ^= 1ull << (i & 7);
595 make_offset_key(first_key, &k);
598 dout(30) << __func__ << " 0x" << std::hex << first_key << std::dec << ": ";
599 bl.hexdump(*_dout, false);
601 txn->merge(bitmap_prefix, k, bl);