--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+
+// vim: ts=8 sw=2 smarttab
+/*
+ * Bitmap based in-memory allocator.
+ * Author: Ramesh Chander, Ramesh.Chander@sandisk.com
+ *
+ * BitMap Tree Design:
+ * Storage is divided into bitmap of blocks. Each bitmap has size of
+ * unsigned long. Group of bitmap creates a Zone. Zone is a unit where
+ * at a time single thread can be active as well as single biggest
+ * contiguous allocation that can be requested.
+ *
+ * Rest of the nodes are classified into three categories:
+ * root node or Allocator
+ * internal nodes or BitMapAreaIN
+ * final nodes that contains Zones called BitMapAreaLeaf
+ * This classification is according to their own implementation of some
+ * of the interfaces defined in BitMapArea.
+ */
+
+#include "BitAllocator.h"
+#include <assert.h>
+#include "bluestore_types.h"
+#include "common/debug.h"
+#include <math.h>
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_bluestore
+#undef dout_prefix
+#define dout_prefix *_dout << "bitalloc:"
+
+MEMPOOL_DEFINE_OBJECT_FACTORY(BitMapArea, BitMapArea, bluestore_alloc);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BitMapAreaIN, BitMapAreaIN, bluestore_alloc);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BitMapAreaLeaf, BitMapAreaLeaf, bluestore_alloc);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BitMapZone, BitMapZone, bluestore_alloc);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BmapEntry, BmapEntry, bluestore_alloc);
+MEMPOOL_DEFINE_OBJECT_FACTORY(BitAllocator, BitAllocator, bluestore_alloc); /*
+ * NOTE(review): the six invocations above presumably bind each allocator
+ * class to the bluestore_alloc mempool -- confirm against the macro
+ * definition, which is not visible in this file.
+ */
+
+int64_t BitMapAreaLeaf::count = 0; /* class-wide static; presumably bumped by incr_count() from BitMapAreaLeaf::init() -- confirm in header */
+int64_t BitMapZone::count = 0; /* class-wide static; presumably bumped by incr_count() from BitMapZone::init() -- confirm in header */
+int64_t BitMapZone::total_blocks = 0; /* shared by all zones: overwritten on every BitMapZone::init() call */
+
+
+
+/* Report the position currently stored in the iterator (m_cur_idx). */
+int64_t BmapEntityListIter::index()
+{
+  const int64_t position = m_cur_idx;
+  return position;
+}
+
+/*
+ * Build a bitmap word that starts either completely allocated (full) or
+ * completely free. The CephContext argument is accepted but not used.
+ */
+BmapEntry::BmapEntry(CephContext*, const bool full)
+{
+  m_bits = full ? BmapEntry::full_bmask() : BmapEntry::empty_bmask();
+}
+
+/* Nothing to release: no cleanup is performed here. */
+BmapEntry::~BmapEntry()
+{
+}
+
+/* Return true when the given bit is set in a snapshot of the word. */
+bool BmapEntry::check_bit(int bit)
+{
+  const auto snapshot = atomic_fetch();
+  return (snapshot & bit_mask(bit));
+}
+
+/*
+ * Return true when every one of the num_bits bits beginning at offset is
+ * set in this word.
+ */
+bool BmapEntry::is_allocated(int64_t offset, int64_t num_bits)
+{
+  const bmap_t wanted = BmapEntry::align_mask(num_bits) >> offset;
+  const bmap_t present = m_bits & wanted;
+  return present == wanted;
+}
+
+/* Mark a single bit as free. */
+void BmapEntry::clear_bit(int bit)
+{
+  const bmap_t target = bit_mask(bit);
+  m_bits &= ~target;
+}
+
+/* Mark num_bits bits starting at offset as free; zero bits is a no-op. */
+void BmapEntry::clear_bits(int offset, int num_bits)
+{
+  if (num_bits != 0) {
+    const bmap_t span = BmapEntry::align_mask(num_bits) >> offset;
+    m_bits &= ~span;
+  }
+}
+
+/* Mark num_bits bits starting at offset as used; zero bits is a no-op. */
+void BmapEntry::set_bits(int offset, int num_bits)
+{
+  if (num_bits != 0) {
+    const bmap_t span = BmapEntry::align_mask(num_bits) >> offset;
+    m_bits |= span;
+  }
+}
+
+/*
+ * Claim a bit: the bit is set unconditionally.
+ * Returns true if the bit was free before the call.
+ */
+bool BmapEntry::check_n_set_bit(int bit)
+{
+  const bmap_t target = bit_mask(bit);
+  const bool was_free = !(m_bits & target);
+  m_bits |= target;
+  return was_free;
+}
+
+/*
+ * Claim up to num_bits consecutive free bits beginning exactly at
+ * start_offset. Claiming stops at the first bit that was already set or
+ * at the end of the word.
+ *
+ * Returns the number of consecutive bits claimed (possibly 0).
+ */
+int BmapEntry::find_n_cont_bits(int start_offset, int64_t num_bits)
+{
+  if (num_bits == 0 || start_offset >= BmapEntry::size()) {
+    return 0;
+  }
+
+  int claimed = 0;
+  int pos = start_offset;
+  while (pos < BmapEntry::size() && claimed < num_bits &&
+         check_n_set_bit(pos)) {
+    ++claimed;
+    ++pos;
+  }
+
+  return claimed;
+}
+
+/*
+ * Starting at start_idx, locate the first free bit, claim it, and then
+ * claim up to max_bits - 1 further free bits directly following it.
+ *
+ * Outputs:
+ *   *free_bit - index of the first claimed bit (0 if none was claimed)
+ *   *end_idx  - index one past the last claimed bit, or the word size when
+ *               nothing could be claimed
+ * Returns the number of bits claimed.
+ */
+int BmapEntry::find_n_free_bits(int start_idx, int64_t max_bits,
+         int *free_bit, int *end_idx)
+{
+  *free_bit = 0;
+  alloc_assert(max_bits > 0);
+
+  /* A completely full word cannot satisfy anything; skip the scan. */
+  if (atomic_fetch() == BmapEntry::full_bmask()) {
+    *end_idx = BmapEntry::size();
+    return 0;
+  }
+
+  /* Serial scan for the first bit we manage to claim. */
+  int claimed = 0;
+  int pos = start_idx;
+  for (; pos < BmapEntry::size(); ++pos) {
+    if (check_n_set_bit(pos)) {
+      *free_bit = pos;
+      claimed = 1;
+      break;
+    }
+  }
+
+  /* Extend the run with the bits immediately after the first one. */
+  claimed += find_n_cont_bits(pos + 1, max_bits - 1);
+
+  *end_idx = pos + claimed;
+  return claimed;
+}
+
+/*
+ * Starting at bit_offset, claim the first run of free bits that either
+ * has the required length or reaches the right edge of the word. Runs
+ * that qualify for neither are released again before the search goes on.
+ *
+ * Outputs:
+ *   *start_offset - first bit of the run that was considered last
+ *   *scanned      - number of bit positions walked from bit_offset
+ * Returns the number of bits left claimed (0 when nothing qualified).
+ */
+int
+BmapEntry::find_first_set_bits(int64_t required_blocks,
+         int bit_offset, int *start_offset,
+         int64_t *scanned)
+{
+  int end_idx = 0;
+
+  *scanned = 0;
+
+  while (bit_offset < BmapEntry::size()) {
+    const int run = find_n_free_bits(bit_offset, required_blocks,
+                                     start_offset, &end_idx);
+    *scanned += end_idx - bit_offset;
+
+    /* Done when the run is long enough or touches the end of the word. */
+    if (run == required_blocks ||
+        (run + *start_offset == BmapEntry::size())) {
+      return run;
+    }
+
+    /* Too short and not at the edge: give it back and resume after it. */
+    clear_bits(*start_offset, run);
+    bit_offset = end_idx;
+  }
+
+  return 0;
+}
+
+/* Log this word in hex at debug level 0, prefixed by the caller's index. */
+void BmapEntry::dump_state(CephContext* const cct, const int& count)
+{
+  dout(0) << count << ":: 0x"
+          << std::hex << m_bits << std::dec
+          << dendl;
+}
+
+/*
+ * Zone related functions.
+ */
+/*
+ * Set up a zone of total_blocks blocks, all used (def == true) or all free.
+ * total_blocks must be a multiple of the bitmap word size. Note that the
+ * class-wide BitMapZone::total_blocks is overwritten here.
+ */
+void BitMapZone::init(CephContext* const cct,
+                      const int64_t zone_num,
+                      const int64_t total_blocks,
+                      const bool def)
+{
+  m_area_index = zone_num;
+  BitMapZone::total_blocks = total_blocks;
+  alloc_assert(size() > 0);
+
+  if (def) {
+    m_used_blocks = total_blocks;
+  } else {
+    m_used_blocks = 0;
+  }
+
+  const int64_t entries = total_blocks / BmapEntry::size();
+  alloc_assert(entries < std::numeric_limits<int16_t>::max());
+  alloc_assert(total_blocks < std::numeric_limits<int32_t>::max());
+  alloc_assert(!(total_blocks % BmapEntry::size()));
+
+  m_bmap_vec.resize(entries, BmapEntry(cct, def));
+  incr_count();
+}
+
+/*
+ * Atomically subtract num_blocks from the used counter.
+ * Returns the counter value from before the subtraction.
+ */
+int64_t BitMapZone::sub_used_blocks(int64_t num_blocks)
+{
+  const int32_t delta = static_cast<int32_t>(num_blocks);
+  return m_used_blocks.fetch_sub(delta);
+}
+
+/*
+ * Atomically add num_blocks to the used counter.
+ * Returns the counter value after the addition.
+ */
+int64_t BitMapZone::add_used_blocks(int64_t num_blocks)
+{
+  const int32_t delta = static_cast<int32_t>(num_blocks);
+  const auto before = m_used_blocks.fetch_add(delta);
+  return before + num_blocks;
+}
+
+/*
+ * Atomic read of the used-block counter.
+ * Intentionally hinted inline because of BitMapAreaLeaf::child_check_n_lock.
+ */
+inline int64_t BitMapZone::get_used_blocks()
+{
+  return m_used_blocks.load();
+}
+
+/* Not supported on a zone: calling this aborts. */
+bool BitMapZone::reserve_blocks(int64_t num_blocks)
+{
+  ceph_abort();
+  /* Only reached if ceph_abort() were to return. */
+  return false;
+}
+
+/* Not supported on a zone: calling this aborts. */
+void BitMapZone::unreserve(int64_t num_blocks, int64_t allocated)
+{
+  ceph_abort();
+}
+
+/* Not supported on a zone: calling this aborts. */
+int64_t BitMapZone::get_reserved_blocks()
+{
+  ceph_abort();
+  /* Only reached if ceph_abort() were to return. */
+  return 0;
+}
+
+/* Create a zone whose blocks all start out free. */
+BitMapZone::BitMapZone(CephContext* cct, int64_t total_blocks,
+                       int64_t zone_num)
+  : BitMapArea(cct)
+{
+  const bool start_allocated = false;
+  init(cct, zone_num, total_blocks, start_allocated);
+}
+
+/* Create a zone whose blocks start out all used (def) or all free. */
+BitMapZone::BitMapZone(CephContext* cct, int64_t total_blocks,
+                       int64_t zone_num, bool def)
+  : BitMapArea(cct)
+{
+  const bool start_allocated = def;
+  init(cct, zone_num, total_blocks, start_allocated);
+}
+
+/* A zone has nothing to do on shutdown. */
+void BitMapZone::shutdown()
+{
+  /* intentionally empty */
+}
+
+/* No explicit cleanup is performed here. */
+BitMapZone::~BitMapZone()
+{
+  /* intentionally empty */
+}
+
+/*
+ * Report whether every block of the zone is in use.
+ *
+ * The inline hint has been added intentionally because of the importance of
+ * this method for BitMapAreaLeaf::child_check_n_lock, and thus for the
+ * overall allocator's performance. Examination of disassemblies coming from
+ * GCC 5.4.0 showed that the compiler really needs that hint.
+ */
+inline bool BitMapZone::is_exhausted()
+{
+  /* get_used_blocks() reads the counter atomically, so no lock is taken. */
+  const int64_t used = BitMapZone::get_used_blocks();
+  return used == BitMapZone::size();
+}
+
+/*
+ * Return true when all num_blocks blocks starting at start_block (zone
+ * relative) are marked used. The range is checked one bitmap word at a time.
+ */
+bool BitMapZone::is_allocated(int64_t start_block, int64_t num_blocks)
+{
+  while (num_blocks) {
+    const int first_bit = start_block % BmapEntry::size();
+    BmapEntry* const entry = &m_bmap_vec[start_block / BmapEntry::size()];
+    /* Portion of the range that lives inside this word. */
+    const int64_t chunk = MIN(num_blocks, BmapEntry::size() - first_bit);
+
+    if (!entry->is_allocated(first_bit, chunk)) {
+      return false;
+    }
+
+    start_block += chunk;
+    num_blocks -= chunk;
+  }
+
+  return true;
+}
+
+/*
+ * Mark num_blocks blocks starting at start_block (zone relative) as used
+ * and add them to the used-block counter.
+ */
+void BitMapZone::set_blocks_used(int64_t start_block, int64_t num_blocks)
+{
+  int64_t remaining = num_blocks;
+
+  while (remaining) {
+    const int first_bit = start_block % BmapEntry::size();
+    BmapEntry* const entry = &m_bmap_vec[start_block / BmapEntry::size()];
+    /* Portion of the range that lives inside this word. */
+    const int64_t chunk = MIN(remaining, BmapEntry::size() - first_bit);
+
+    entry->set_bits(first_bit, chunk);
+
+    start_block += chunk;
+    remaining -= chunk;
+  }
+
+  add_used_blocks(num_blocks);
+}
+
+/*
+ * Clear the bits of num_blocks blocks starting at start_block (zone
+ * relative). Only the bitmap is touched; the used-block counter is left
+ * for the caller to adjust.
+ */
+void BitMapZone::free_blocks_int(int64_t start_block, int64_t num_blocks)
+{
+  if (num_blocks == 0) {
+    return;
+  }
+  alloc_dbg_assert(is_allocated(start_block, num_blocks));
+
+  int64_t cursor = start_block;
+  int64_t remaining = num_blocks;
+  while (remaining) {
+    const int first_bit = cursor % BmapEntry::size();
+    BmapEntry* const entry = &m_bmap_vec[cursor / BmapEntry::size()];
+    /* Portion of the range that lives inside this word. */
+    const int64_t chunk = MIN(remaining, BmapEntry::size() - first_bit);
+
+    entry->clear_bits(first_bit, chunk);
+
+    cursor += chunk;
+    remaining -= chunk;
+  }
+  alloc_dbg_assert(!is_allocated(start_block, num_blocks));
+}
+
+/* Acquire the zone lock, blocking until it is available. */
+void BitMapZone::lock_excl()
+{
+  m_lock.lock();
+}
+
+/* Try to acquire the zone lock without blocking; forwards m_lock.try_lock(). */
+bool BitMapZone::lock_excl_try()
+{
+  const bool acquired = m_lock.try_lock();
+  return acquired;
+}
+
+/* Release the zone lock. */
+void BitMapZone::unlock()
+{
+  m_lock.unlock();
+}
+
+/*
+ * Report whether the zone lock is currently held, by probing it with a
+ * non-blocking acquire.
+ *
+ * If the probe succeeds the lock was NOT held; it is released again right
+ * away so that the check never leaves the zone locked as a side effect
+ * (previously the successful probe kept the lock forever).
+ */
+bool BitMapZone::check_locked()
+{
+  if (lock_excl_try()) {
+    unlock();
+    return false;
+  }
+  return true;
+}
+
+/*
+ * Free num_blocks blocks starting at start_block (zone relative): clear
+ * the bitmap bits and then lower the used-block counter.
+ */
+void BitMapZone::free_blocks(int64_t start_block, int64_t num_blocks)
+{
+  free_blocks_int(start_block, num_blocks);
+  (void) sub_used_blocks(num_blocks);
+  alloc_assert(get_used_blocks() >= 0);
+}
+
+int64_t BitMapZone::alloc_blocks_dis(int64_t num_blocks, /* upper bound on the blocks to allocate */
+ int64_t min_alloc, /* length, in blocks, of every contiguous run handed out */
+ int64_t hint, /* zone-relative block index where the search starts */
+ int64_t zone_blk_off, /* offset added to zone-relative indices when reporting extents */
+ ExtentList *alloc_blocks) /* receives the allocated extents via add_extents() */
+{ /*
+ * Allocate blocks from this zone as a list of extents, walking the bitmap
+ * words from the one containing hint onwards. A run of free bits that
+ * ends at the right edge of a word is carried over and completed with the
+ * leading bits of the next word; a carried run that cannot be completed
+ * is released again. The caller must already hold the zone lock.
+ *
+ * Returns the number of blocks allocated; the same amount is added to the
+ * zone's used-block counter before returning.
+ */
+ int64_t bmap_idx = hint / BmapEntry::size();
+ int bit = hint % BmapEntry::size();
+ BmapEntry *bmap = NULL;
+ int64_t allocated = 0;
+ int64_t blk_off = 0; /* reported block offset of bit 0 of the current word */
+ int64_t alloc_cont = 0; /* bits claimed in the current step */
+ int64_t last_cont = 0; /* bits carried over from the end of the previous word */
+ int64_t last_running_ext = 0; /* reported start offset of the carried run */
+ int search_idx = bit; /* next bit to examine in the current word */
+ int64_t scanned = 0;
+ int start_off = 0;
+
+
+ alloc_assert(check_locked());
+
+ BitMapEntityIter <BmapEntry> iter = BitMapEntityIter<BmapEntry>(
+ &m_bmap_vec, bmap_idx);
+ bmap = iter.next();
+ if (!bmap) {
+ return 0;
+ }
+
+ while (allocated < num_blocks) {
+ blk_off = zone_blk_off + bmap_idx * bmap->size();
+ if (last_cont) {
+ /*
+ * We had bits free at end of last bitmap, try to complete required
+ * min alloc size using the leading bits of this bitmap.
+ */
+ alloc_cont = bmap->find_n_cont_bits(0, min_alloc - last_cont);
+ allocated += alloc_cont;
+ last_cont += alloc_cont;
+
+ if (!alloc_cont) { /* nothing free at the start of this word: drop the carried run */
+ if (last_cont) {
+ this->free_blocks_int(last_running_ext - zone_blk_off, last_cont);
+ }
+ allocated -= last_cont;
+ last_cont = 0;
+ } else if (last_cont / min_alloc) {
+ /*
+ * Got contiguous min_alloc_size across bitmaps.
+ */
+ alloc_blocks->add_extents(last_running_ext, last_cont);
+ last_cont = 0;
+ last_running_ext = 0;
+ }
+ search_idx = alloc_cont; /* continue right after the bits just consumed */
+ } else {
+ /*
+ * Try to allocate min_alloc_size bits from given bmap.
+ */
+ alloc_cont = bmap->find_first_set_bits(min_alloc, search_idx, &start_off, &scanned);
+ search_idx = search_idx + scanned;
+ allocated += alloc_cont;
+ if (alloc_cont / min_alloc) {
+ /*
+ * Got contiguous min_alloc_size within a bitmap.
+ */
+ alloc_blocks->add_extents(blk_off + start_off, min_alloc);
+ }
+
+ if (alloc_cont % min_alloc) {
+ /*
+ * Got some bits at end of bitmap, carry them to try match with
+ * start bits from next bitmap.
+ */
+ if (!last_cont) {
+ last_running_ext = blk_off + start_off;
+ }
+ last_cont += alloc_cont % min_alloc;
+ }
+ }
+
+
+ if (search_idx == BmapEntry::size()) { /* current word fully examined: advance to the next one */
+ search_idx = 0;
+ bmap_idx = iter.index();
+ if ((bmap = iter.next()) == NULL) {
+ if (last_cont) { /* no more words: a carried run can never be completed, release it */
+ this->free_blocks_int(last_running_ext - zone_blk_off, last_cont);
+ }
+ allocated -= last_cont;
+ break;
+ }
+ }
+ }
+
+ add_used_blocks(allocated);
+ return allocated;
+}
+
+
+
+/*
+ * Log every bitmap word of this zone between a start and an end marker,
+ * then increment the caller's zone counter.
+ */
+void BitMapZone::dump_state(CephContext* const cct, int& count)
+{
+  int word_no = 0;
+  BitMapEntityIter<BmapEntry> words(&m_bmap_vec, 0);
+
+  dout(0) << __func__ << " zone " << count << " dump start " << dendl;
+  for (BmapEntry *word = static_cast<BmapEntry *>(words.next());
+       word != NULL;
+       word = static_cast<BmapEntry *>(words.next())) {
+    word->dump_state(cct, word_no);
+    ++word_no;
+  }
+  dout(0) << __func__ << " zone " << count << " dump end " << dendl;
+  count++;
+}
+
+
+/*
+ * BitMapArea Leaf and non-Leaf functions.
+ */
+/* Read the configured zone size (bluestore_bitmapallocator_blocks_per_zone). */
+int64_t BitMapArea::get_zone_size(CephContext* cct)
+{
+  const int64_t blocks_per_zone =
+    cct->_conf->bluestore_bitmapallocator_blocks_per_zone;
+  return blocks_per_zone;
+}
+
+/* Read the configured span size (bluestore_bitmapallocator_span_size). */
+int64_t BitMapArea::get_span_size(CephContext* cct)
+{
+  const int64_t span =
+    cct->_conf->bluestore_bitmapallocator_span_size;
+  return span;
+}
+
+/*
+ * Compute the tree level for total_blocks: the smallest level >= 1 such
+ * that zone_size * span_size^level is not below total_blocks.
+ */
+int BitMapArea::get_level(CephContext* cct, int64_t total_blocks)
+{
+  const int64_t zone_blocks = get_zone_size(cct);
+  const int64_t span = get_span_size(cct);
+
+  int depth = 1;
+  for (int64_t covered = zone_blocks * span;
+       covered < total_blocks;
+       covered *= span) {
+    ++depth;
+  }
+  return depth;
+}
+
+/*
+ * Return zone_size * span_size^(level - 1), the value BitMapAreaIN::init
+ * uses as the block count of each child. level must be positive.
+ */
+int64_t BitMapArea::get_level_factor(CephContext* cct, int level)
+{
+  alloc_assert(level > 0);
+
+  int64_t factor = get_zone_size(cct);
+  const int64_t span = get_span_size(cct);
+
+  /* One multiplication per level above the first. */
+  for (int step = 1; step < level; ++step) {
+    factor *= span;
+  }
+
+  return factor;
+}
+
+/* Return the index this area was given at init time (m_area_index). */
+int64_t BitMapArea::get_index()
+{
+  const int64_t idx = m_area_index;
+  return idx;
+}
+
+/*
+ * BitMapArea Leaf and Internal
+ */
+/*
+ * Construct without initialising the area; the derived-class constructors
+ * in this file that use this overload call their own init afterwards.
+ */
+BitMapAreaIN::BitMapAreaIN(CephContext* cct)
+  : BitMapArea(cct)
+{
+}
+
+/*
+ * Initialise the bookkeeping fields shared by internal and leaf areas:
+ * index, size, tree level, and the used/reserved counters.
+ */
+void BitMapAreaIN::init_common(CephContext* const cct,
+                               const int64_t total_blocks,
+                               const int64_t area_idx,
+                               const bool def)
+{
+  m_area_index = area_idx;
+  m_total_blocks = total_blocks;
+  m_level = BitMapArea::get_level(cct, total_blocks);
+  m_reserved_blocks = 0;
+
+  /* def == true means the area starts out fully allocated. */
+  if (def) {
+    m_used_blocks = total_blocks;
+  } else {
+    m_used_blocks = 0;
+  }
+}
+
+/*
+ * Initialise an internal node and build its children.
+ *
+ * All children except the last get exactly one level-factor worth of
+ * blocks; the last child receives whatever is left. Children are leaves
+ * when this node is at level <= 2 (or, for the last child, when the
+ * remainder fits into level 1), otherwise further internal nodes.
+ */
+void BitMapAreaIN::init(CephContext* const cct,
+                        int64_t total_blocks,
+                        const int64_t area_idx,
+                        const bool def)
+{
+  alloc_assert(!(total_blocks % BmapEntry::size()));
+
+  init_common(cct, total_blocks, area_idx, def);
+  const int64_t level_factor = BitMapArea::get_level_factor(cct, m_level);
+
+  const int64_t num_child = (total_blocks + level_factor - 1) / level_factor;
+  alloc_assert(num_child < std::numeric_limits<int16_t>::max());
+
+  m_child_size_blocks = level_factor;
+
+  std::vector<BitMapArea*> children;
+  children.reserve(num_child);
+
+  /* Full-sized children first. */
+  int idx = 0;
+  while (idx < num_child - 1) {
+    BitMapArea *full_child = NULL;
+    if (m_level <= 2) {
+      full_child = new BitMapAreaLeaf(cct, m_child_size_blocks, idx, def);
+    } else {
+      full_child = new BitMapAreaIN(cct, m_child_size_blocks, idx, def);
+    }
+    children.push_back(full_child);
+    total_blocks -= m_child_size_blocks;
+    ++idx;
+  }
+
+  /* The last child takes the remaining blocks and may be smaller. */
+  BitMapArea *tail_child = NULL;
+  if (BitMapArea::get_level(cct, total_blocks) == 1) {
+    tail_child = new BitMapAreaLeaf(cct, total_blocks, idx, def);
+  } else {
+    tail_child = new BitMapAreaIN(cct, total_blocks, idx, def);
+  }
+  children.push_back(tail_child);
+
+  m_child_list = BitMapAreaList(std::move(children));
+}
+
+/* Create an internal node whose blocks all start out free. */
+BitMapAreaIN::BitMapAreaIN(CephContext* cct, int64_t total_blocks,
+                           int64_t area_idx)
+  : BitMapArea(cct)
+{
+  const bool start_allocated = false;
+  init(cct, total_blocks, area_idx, start_allocated);
+}
+
+/* Create an internal node whose blocks start all used (def) or all free. */
+BitMapAreaIN::BitMapAreaIN(CephContext* cct, int64_t total_blocks,
+                           int64_t area_idx, bool def)
+  : BitMapArea(cct)
+{
+  const bool start_allocated = def;
+  init(cct, total_blocks, area_idx, start_allocated);
+}
+
+/*
+ * No cleanup is done at this level.
+ * NOTE(review): children created in BitMapAreaIN::init() are not deleted
+ * here; the BitMapAreaLeaf and BitAllocator destructors delete their own
+ * children. Confirm who frees the children of a plain internal node.
+ */
+BitMapAreaIN::~BitMapAreaIN()
+{
+  /* intentionally empty */
+}
+
+/*
+ * Under the exclusive lock, overwrite the size and index with the
+ * sentinel values -1 and -2.
+ */
+void BitMapAreaIN::shutdown()
+{
+  lock_excl();
+  m_area_index = -2;
+  m_total_blocks = -1;
+  unlock();
+}
+
+/*
+ * Take the child's shared lock and check that it still has at least
+ * `required` free blocks.
+ *
+ * Returns true with the child left locked when it is usable; otherwise
+ * the lock is dropped again and false is returned.
+ */
+bool BitMapAreaIN::child_check_n_lock(BitMapArea *child, int64_t required)
+{
+  child->lock_shared();
+
+  bool usable = !child->is_exhausted();
+  if (usable) {
+    const int64_t used = child->get_used_blocks();
+    const int64_t total = child->size();
+    usable = !((total - used) < required);
+  }
+
+  if (!usable) {
+    child->unlock();
+  }
+  return usable;
+}
+
+/* Release the lock taken on a child by child_check_n_lock(). */
+void BitMapAreaIN::child_unlock(BitMapArea *child)
+{
+  child->unlock();
+}
+
+/* True when the used-block counter equals the area size. */
+bool BitMapAreaIN::is_exhausted()
+{
+  const int64_t used = get_used_blocks();
+  return used == size();
+}
+
+/*
+ * Add blks to the used-block counter under m_blocks_lock.
+ * Returns the counter value after the addition.
+ */
+int64_t BitMapAreaIN::add_used_blocks(int64_t blks)
+{
+  std::lock_guard<std::mutex> guard(m_blocks_lock);
+  return (m_used_blocks += blks);
+}
+
+/*
+ * Subtract num_blocks from the used-block counter under m_blocks_lock.
+ * Returns the counter value from before the subtraction.
+ */
+int64_t BitMapAreaIN::sub_used_blocks(int64_t num_blocks)
+{
+  std::lock_guard<std::mutex> guard(m_blocks_lock);
+
+  const int64_t before = m_used_blocks;
+  m_used_blocks = before - num_blocks;
+  alloc_assert(m_used_blocks >= 0);
+  return before;
+}
+
+/* Read the used-block counter under m_blocks_lock. */
+int64_t BitMapAreaIN::get_used_blocks()
+{
+  std::lock_guard<std::mutex> guard(m_blocks_lock);
+  const int64_t used = m_used_blocks;
+  return used;
+}
+
+/* Used blocks minus reserved blocks, read under m_blocks_lock. */
+int64_t BitMapAreaIN::get_used_blocks_adj()
+{
+  std::lock_guard<std::mutex> guard(m_blocks_lock);
+  const int64_t adjusted = m_used_blocks - m_reserved_blocks;
+  return adjusted;
+}
+
+/*
+ * Reserve num blocks if they still fit into the area. A successful
+ * reservation raises both the used and the reserved counters.
+ *
+ * Returns true when the reservation was made.
+ */
+bool BitMapAreaIN::reserve_blocks(int64_t num)
+{
+  std::lock_guard<std::mutex> guard(m_blocks_lock);
+
+  const bool fits = (m_used_blocks + num <= size());
+  if (fits) {
+    m_used_blocks += num;
+    m_reserved_blocks += num;
+  }
+  alloc_assert(m_used_blocks <= size());
+  return fits;
+}
+
+/*
+ * Settle a reservation of `needed` blocks of which `allocated` were
+ * really taken: the unused part is removed from the used counter and the
+ * whole reservation is removed from the reserved counter.
+ */
+void BitMapAreaIN::unreserve(int64_t needed, int64_t allocated)
+{
+  std::lock_guard<std::mutex> guard(m_blocks_lock);
+
+  const int64_t unused = needed - allocated;
+  m_used_blocks -= unused;
+  m_reserved_blocks -= needed;
+  alloc_assert(m_used_blocks >= 0);
+  alloc_assert(m_reserved_blocks >= 0);
+}
+/* Read the reserved-block counter under m_blocks_lock. */
+int64_t BitMapAreaIN::get_reserved_blocks()
+{
+  std::lock_guard<std::mutex> guard(m_blocks_lock);
+  const int64_t reserved = m_reserved_blocks;
+  return reserved;
+}
+
+/*
+ * Return true when all num_blocks blocks starting at start_block (area
+ * relative) are marked used. The range is split along child boundaries
+ * and each piece is checked by the child that owns it. An empty range
+ * counts as allocated.
+ */
+bool BitMapAreaIN::is_allocated(int64_t start_block, int64_t num_blocks)
+{
+  alloc_assert(start_block >= 0 &&
+               (start_block + num_blocks <= size()));
+
+  while (num_blocks) {
+    BitMapArea *owner = static_cast<BitMapArea *>(m_child_list.get_nth_item(
+                          start_block / m_child_size_blocks));
+
+    const int64_t offset_in_child = start_block % m_child_size_blocks;
+    const int64_t piece = MIN(m_child_size_blocks - offset_in_child,
+                              num_blocks);
+    if (!owner->is_allocated(offset_in_child, piece)) {
+      return false;
+    }
+    start_block += piece;
+    num_blocks -= piece;
+  }
+  return true;
+}
+
+/*
+ * Walk the children, starting with the one that contains hint, and let
+ * each usable child allocate towards num_blocks until the request is
+ * satisfied or the iterator runs out. `wrap` is passed to the child
+ * iterator. The hint only applies to the first child visited; later
+ * children are searched from their beginning.
+ *
+ * Returns the number of blocks allocated.
+ */
+int64_t BitMapAreaIN::alloc_blocks_dis_int_work(bool wrap, int64_t num_blocks, int64_t min_alloc,
+           int64_t hint, int64_t area_blk_off, ExtentList *block_list)
+{
+  int64_t got = 0;
+
+  BmapEntityListIter children(&m_child_list, hint / m_child_size_blocks, wrap);
+
+  for (BitMapArea *child = static_cast<BitMapArea *>(children.next());
+       child != NULL;
+       child = static_cast<BitMapArea *>(children.next())) {
+    /* Consume the hint: it is valid for the first visited child only. */
+    const int64_t child_hint = hint % m_child_size_blocks;
+    hint = 0;
+
+    if (!child_check_n_lock(child, 1)) {
+      continue;
+    }
+
+    const int64_t child_off =
+      child->get_index() * m_child_size_blocks + area_blk_off;
+    got += child->alloc_blocks_dis(num_blocks - got, min_alloc,
+                                   child_hint, child_off, block_list);
+    child_unlock(child);
+    if (got == num_blocks) {
+      break;
+    }
+  }
+
+  return got;
+}
+
+/* Internal-node allocation: a single pass over the children, no wrap. */
+int64_t BitMapAreaIN::alloc_blocks_dis_int(int64_t num_blocks, int64_t min_alloc,
+           int64_t hint, int64_t area_blk_off, ExtentList *block_list)
+{
+  const bool wrap = false;
+  return alloc_blocks_dis_int_work(wrap, num_blocks, min_alloc, hint,
+                                   area_blk_off, block_list);
+}
+
+/*
+ * Allocate up to num_blocks blocks from this area under its shared lock
+ * and account for them in the used-block counter.
+ *
+ * Returns the number of blocks allocated.
+ */
+int64_t BitMapAreaIN::alloc_blocks_dis(int64_t num_blocks, int64_t min_alloc,
+           int64_t hint, int64_t blk_off, ExtentList *block_list)
+{
+  lock_shared();
+  const int64_t got = alloc_blocks_dis_int(num_blocks, min_alloc, hint,
+                                           blk_off, block_list);
+  add_used_blocks(got);
+  unlock();
+
+  return got;
+}
+
+
+/*
+ * Mark num_blocks blocks starting at start_block (area relative) as used.
+ * The range is split along child boundaries and every piece is forwarded
+ * to the child that owns it; afterwards the blocks are added to this
+ * area's used-block counter. No locking is done here.
+ */
+void BitMapAreaIN::set_blocks_used_int(int64_t start_block, int64_t num_blocks)
+{
+  BitMapArea *child = NULL;
+  int64_t child_block_offset = 0;
+  int64_t falling_in_child = 0;
+  int64_t blks = num_blocks;
+  int64_t start_blk = start_block;
+
+  alloc_assert(start_block >= 0);
+
+  while (blks) {
+    child = static_cast<BitMapArea *>(m_child_list.get_nth_item(
+              start_blk / m_child_size_blocks));
+
+    /*
+     * Children are laid out at multiples of m_child_size_blocks, so the
+     * offset inside a child must use that stride, exactly as is_allocated()
+     * and free_blocks_int() do. Using child->size() here gave a wrong
+     * offset whenever the last child is smaller than the others.
+     */
+    child_block_offset = start_blk % m_child_size_blocks;
+    falling_in_child = MIN(m_child_size_blocks - child_block_offset,
+                           blks);
+    child->set_blocks_used(child_block_offset, falling_in_child);
+    start_blk += falling_in_child;
+    blks -= falling_in_child;
+  }
+
+  add_used_blocks(num_blocks);
+  alloc_dbg_assert(is_allocated(start_block, num_blocks));
+}
+
+/*
+ * Mark a block range as used while holding the shared lock.
+ * An empty range returns immediately without locking.
+ */
+void BitMapAreaIN::set_blocks_used(int64_t start_block, int64_t num_blocks)
+{
+  if (num_blocks != 0) {
+    lock_shared();
+    set_blocks_used_int(start_block, num_blocks);
+    unlock();
+  }
+}
+
+/*
+ * Free num_blocks blocks starting at start_block (area relative) by
+ * splitting the range along child boundaries and calling free_blocks()
+ * on each owning child. This area's own used counter is not touched.
+ */
+void BitMapAreaIN::free_blocks_int(int64_t start_block, int64_t num_blocks)
+{
+  alloc_assert(start_block >= 0 &&
+               (start_block + num_blocks) <= size());
+
+  while (num_blocks) {
+    BitMapArea *owner = static_cast<BitMapArea *>(m_child_list.get_nth_item(
+                          start_block / m_child_size_blocks));
+
+    const int64_t offset_in_child = start_block % m_child_size_blocks;
+    const int64_t piece = MIN(m_child_size_blocks - offset_in_child,
+                              num_blocks);
+
+    owner->free_blocks(offset_in_child, piece);
+    start_block += piece;
+    num_blocks -= piece;
+  }
+}
+/*
+ * Free a block range under the shared lock and lower the used-block
+ * counter. An empty range returns immediately without locking.
+ */
+void BitMapAreaIN::free_blocks(int64_t start_block, int64_t num_blocks)
+{
+  if (num_blocks != 0) {
+    lock_shared();
+    alloc_dbg_assert(is_allocated(start_block, num_blocks));
+
+    free_blocks_int(start_block, num_blocks);
+    (void) sub_used_blocks(num_blocks);
+
+    unlock();
+  }
+}
+
+/* Ask every child, in order, to dump its state; count is passed through. */
+void BitMapAreaIN::dump_state(CephContext* const cct, int& count)
+{
+  BmapEntityListIter children(&m_child_list, 0, false);
+
+  for (BitMapArea *child = static_cast<BitMapArea *>(children.next());
+       child != NULL;
+       child = static_cast<BitMapArea *>(children.next())) {
+    child->dump_state(cct, count);
+  }
+}
+
+/*
+ * BitMapArea Leaf
+ */
+/* Create a leaf area whose blocks all start out free. */
+BitMapAreaLeaf::BitMapAreaLeaf(CephContext* cct, int64_t total_blocks,
+                               int64_t area_idx)
+  : BitMapAreaIN(cct)
+{
+  const bool start_allocated = false;
+  init(cct, total_blocks, area_idx, start_allocated);
+}
+
+/* Create a leaf area whose blocks start all used (def) or all free. */
+BitMapAreaLeaf::BitMapAreaLeaf(CephContext* cct, int64_t total_blocks,
+                               int64_t area_idx, bool def)
+  : BitMapAreaIN(cct)
+{
+  const bool start_allocated = def;
+  init(cct, total_blocks, area_idx, start_allocated);
+}
+
+/*
+ * Initialise a leaf area and create its zones. The area must be at tree
+ * level 1; its blocks are divided evenly over as many zones as are needed
+ * to keep each zone within the configured zone size.
+ */
+void BitMapAreaLeaf::init(CephContext* const cct,
+                          const int64_t total_blocks,
+                          const int64_t area_idx,
+                          const bool def)
+{
+  alloc_assert(!(total_blocks % BmapEntry::size()));
+
+  init_common(cct, total_blocks, area_idx, def);
+  alloc_assert(m_level == 1);
+
+  int zone_size_block = get_zone_size(cct);
+  alloc_assert(zone_size_block > 0);
+
+  const int64_t zone_count =
+    (total_blocks + zone_size_block - 1) / zone_size_block;
+  alloc_assert(zone_count);
+  m_child_size_blocks = total_blocks / zone_count;
+
+  std::vector<BitMapArea*> zones;
+  zones.reserve(zone_count);
+  int zone_no = 0;
+  while (zone_no < zone_count) {
+    zones.emplace_back(new BitMapZone(cct, m_child_size_blocks, zone_no, def));
+    ++zone_no;
+  }
+
+  m_child_list = BitMapAreaList(std::move(zones));
+
+  BitMapAreaLeaf::incr_count();
+}
+
+/* Delete every child of this leaf while holding the exclusive lock. */
+BitMapAreaLeaf::~BitMapAreaLeaf()
+{
+  lock_excl();
+
+  const int64_t child_count = m_child_list.size();
+  for (int64_t n = 0; n < child_count; ++n) {
+    delete static_cast<BitMapArea *>(m_child_list.get_nth_item(n));
+  }
+
+  unlock();
+}
+
+/*
+ * Lock a zone for allocation if it can still provide `required` blocks.
+ * With lock == true the zone lock is waited for; otherwise only a
+ * non-blocking attempt is made and a busy zone is reported as unusable.
+ *
+ * Returns true with the zone left locked, or false with it unlocked.
+ *
+ * Intentionally hinted inline because of
+ * BitMapAreaLeaf::alloc_blocks_dis_int.
+ */
+inline bool BitMapAreaLeaf::child_check_n_lock(BitMapZone* const child,
+                                               const int64_t required,
+                                               const bool lock)
+{
+  /* The exhausted check can be performed without acquiring the lock. This
+   * is because 1) BitMapZone::is_exhausted() actually operates atomically
+   * and 2) it's followed by the exclusive, required-aware re-verification. */
+  if (child->BitMapZone::is_exhausted()) {
+    return false;
+  }
+
+  if (!lock) {
+    if (!child->lock_excl_try()) {
+      return false;
+    }
+  } else {
+    child->lock_excl();
+  }
+
+  /* Re-check under the lock, this time against the requested amount. */
+  const int64_t used = child->get_used_blocks();
+  const int64_t total = child->size();
+  const bool enough = !((total - used) < required);
+  if (!enough) {
+    child->unlock();
+  }
+  return enough;
+}
+
+/*
+ * Walk the zones, starting with the one that contains hint, and allocate
+ * from every zone that can be locked without waiting, until num_blocks
+ * are allocated or the zones run out. The hint only applies to the first
+ * zone visited.
+ *
+ * Returns the number of blocks allocated.
+ */
+int64_t BitMapAreaLeaf::alloc_blocks_dis_int(int64_t num_blocks, int64_t min_alloc,
+           int64_t hint, int64_t area_blk_off, ExtentList *block_list)
+{
+  int64_t got = 0;
+
+  BmapEntityListIter zones(&m_child_list, hint / m_child_size_blocks, false);
+
+  /* We're sure the only element type we aggregate is BitMapZone,
+   * so there is no business to go through vptr and thus prohibit
+   * compiler to inline the stuff. Consult BitMapAreaLeaf::init. */
+  for (BitMapZone *zone = static_cast<BitMapZone*>(zones.next());
+       zone != nullptr;
+       zone = static_cast<BitMapZone*>(zones.next())) {
+    /* Consume the hint: it is valid for the first visited zone only. */
+    const int64_t zone_hint = hint % m_child_size_blocks;
+    hint = 0;
+
+    if (!child_check_n_lock(zone, 1, false)) {
+      continue;
+    }
+
+    const int64_t zone_off =
+      zone->get_index() * m_child_size_blocks + area_blk_off;
+    got += zone->alloc_blocks_dis(num_blocks - got, min_alloc,
+                                  zone_hint, zone_off, block_list);
+    zone->unlock();
+    if (got == num_blocks) {
+      break;
+    }
+  }
+  return got;
+}
+
+/*
+ * Free num_blocks blocks starting at start_block (area relative). The
+ * range is split along zone boundaries and each owning zone is locked
+ * exclusively while its piece is freed.
+ */
+void BitMapAreaLeaf::free_blocks_int(int64_t start_block, int64_t num_blocks)
+{
+  alloc_assert(start_block >= 0 &&
+               (start_block + num_blocks) <= size());
+
+  while (num_blocks) {
+    BitMapArea *zone = static_cast<BitMapArea *>(m_child_list.get_nth_item(
+                         start_block / m_child_size_blocks));
+
+    const int64_t offset_in_zone = start_block % m_child_size_blocks;
+    const int64_t piece = MIN(m_child_size_blocks - offset_in_zone,
+                              num_blocks);
+
+    zone->lock_excl();
+    zone->free_blocks(offset_in_zone, piece);
+    zone->unlock();
+
+    start_block += piece;
+    num_blocks -= piece;
+  }
+}
+
+/*
+ * Main allocator functions.
+ */
+/* Create an allocator with all blocks free and statistics disabled. */
+BitAllocator::BitAllocator(CephContext* cct, int64_t total_blocks,
+                           int64_t zone_size_block, bmap_alloc_mode_t mode)
+  : BitMapAreaIN(cct),
+    cct(cct)
+{
+  const bool start_allocated = false;
+  const bool with_stats = false;
+  init_check(total_blocks, zone_size_block, mode, start_allocated, with_stats);
+}
+
+/*
+ * Create an allocator whose blocks start all used (def) or all free,
+ * with statistics disabled.
+ */
+BitAllocator::BitAllocator(CephContext* cct, int64_t total_blocks,
+                           int64_t zone_size_block, bmap_alloc_mode_t mode,
+                           bool def)
+  : BitMapAreaIN(cct),
+    cct(cct)
+{
+  const bool with_stats = false;
+  init_check(total_blocks, zone_size_block, mode, def, with_stats);
+}
+
+/*
+ * Create an allocator with an explicit initial state (def) and an
+ * explicit choice of whether statistics are collected (stats_on).
+ */
+BitAllocator::BitAllocator(CephContext* cct, int64_t total_blocks,
+                           int64_t zone_size_block, bmap_alloc_mode_t mode,
+                           bool def, bool stats_on)
+  : BitMapAreaIN(cct),
+    cct(cct)
+{
+  const bool with_stats = stats_on;
+  init_check(total_blocks, zone_size_block, mode, def, with_stats);
+}
+
+/*
+ * Validate the construction parameters and build the allocator tree.
+ *
+ * Aborts on an unknown mode, a non-positive total_blocks, or a zone size
+ * smaller than one bitmap word. The zone size is rounded down to a
+ * multiple of the word size and total_blocks is rounded up to a multiple
+ * of the zone size; when the allocator starts out free, the padding blocks
+ * added by that rounding are marked used so they are never handed out.
+ */
+void BitAllocator::init_check(int64_t total_blocks, int64_t zone_size_block,
+         bmap_alloc_mode_t mode, bool def, bool stats_on)
+{
+  int64_t unaligned_blocks = 0;
+
+  if (mode != SERIAL && mode != CONCURRENT) {
+    ceph_abort();
+  }
+
+  if (total_blocks <= 0) {
+    ceph_abort();
+  }
+
+  if (zone_size_block == 0 ||
+      zone_size_block < BmapEntry::size()) {
+    ceph_abort();
+  }
+
+  zone_size_block = (zone_size_block / BmapEntry::size()) *
+                    BmapEntry::size();
+
+  unaligned_blocks = total_blocks % zone_size_block;
+  m_extra_blocks = unaligned_blocks? zone_size_block - unaligned_blocks: 0;
+  total_blocks = ROUND_UP_TO(total_blocks, zone_size_block);
+
+  m_alloc_mode = mode;
+  m_is_stats_on = stats_on;
+  if (m_is_stats_on) {
+    m_stats = new BitAllocatorStats();
+  } else {
+    /* Keep the pointer well defined when statistics are disabled. */
+    m_stats = nullptr;
+  }
+
+  pthread_rwlock_init(&m_rw_lock, NULL);
+  init(cct, total_blocks, 0, def);
+  if (!def && unaligned_blocks) {
+    /*
+     * Mark extra padded blocks used from beginning.
+     */
+    set_blocks_used(total_blocks - m_extra_blocks, m_extra_blocks);
+  }
+}
+
+/* Take the allocator-wide rwlock for writing (exclusive). */
+void BitAllocator::lock_excl()
+{
+  (void) pthread_rwlock_wrlock(&m_rw_lock);
+}
+
+/* Take the allocator-wide rwlock for reading (shared). */
+void BitAllocator::lock_shared()
+{
+  (void) pthread_rwlock_rdlock(&m_rw_lock);
+}
+
+/*
+ * Try to take the allocator-wide rwlock for writing without blocking.
+ * Returns true when the lock was acquired.
+ */
+bool BitAllocator::try_lock()
+{
+  return pthread_rwlock_trywrlock(&m_rw_lock) == 0;
+}
+
+/* Release the allocator-wide rwlock (shared or exclusive). */
+void BitAllocator::unlock()
+{
+  (void) pthread_rwlock_unlock(&m_rw_lock);
+}
+
+/*
+ * Tear down the allocator: delete the root's children under the exclusive
+ * lock, release the statistics object if one was created, and destroy the
+ * rwlock.
+ */
+BitAllocator::~BitAllocator()
+{
+  lock_excl();
+
+  for (int64_t i = 0; i < m_child_list.size(); i++) {
+    auto child = static_cast<BitMapArea *>(m_child_list.get_nth_item(i));
+    delete child;
+  }
+
+  /*
+   * m_stats is only allocated by init_check() when statistics are enabled,
+   * so guard on the same flag. It was previously never freed.
+   */
+  if (m_is_stats_on) {
+    delete m_stats;
+    m_stats = nullptr;
+  }
+
+  unlock();
+  pthread_rwlock_destroy(&m_rw_lock);
+}
+
+/*
+ * Shutdown check: acquire the exclusive lock and the serial lock without
+ * blocking, assert that both attempts succeeded, then release both again.
+ */
+void
+BitAllocator::shutdown()
+{
+  const bool have_rw_lock = try_lock();
+  assert(have_rw_lock);
+
+  const bool have_serial_lock = try_serial_lock();
+  assert(have_serial_lock);
+
+  serial_unlock();
+  unlock();
+}
+
+/* Give back a reservation of `unused` blocks, none of which were allocated. */
+void BitAllocator::unreserve_blocks(int64_t unused)
+{
+  const int64_t none_allocated = 0;
+  unreserve(unused, none_allocated);
+}
+
+/* In SERIAL mode take the serial mutex; in any other mode do nothing. */
+void BitAllocator::serial_lock()
+{
+  if (m_alloc_mode != SERIAL) {
+    return;
+  }
+  m_serial_mutex.lock();
+}
+
+/* In SERIAL mode release the serial mutex; in any other mode do nothing. */
+void BitAllocator::serial_unlock()
+{
+  if (m_alloc_mode != SERIAL) {
+    return;
+  }
+  m_serial_mutex.unlock();
+}
+
+/*
+ * Try to take the serial mutex without blocking.
+ *
+ * Outside SERIAL mode there is no serial mutex to take, so the call
+ * trivially succeeds. In SERIAL mode the result of try_lock() is returned
+ * as is: a Lockable's try_lock() yields true on success. The earlier
+ * "== 0" comparison was a pthread-style leftover that inverted the result,
+ * reporting failure after acquiring the mutex and success without it.
+ */
+bool BitAllocator::try_serial_lock()
+{
+  if (m_alloc_mode != SERIAL) {
+    return true;
+  }
+  return m_serial_mutex.try_lock();
+}
+
+/*
+ * Take the child's shared lock and check that it still has at least
+ * `required` free blocks.
+ *
+ * Returns true with the child left locked when it is usable; otherwise
+ * the lock is dropped again and false is returned.
+ */
+bool BitAllocator::child_check_n_lock(BitMapArea *child, int64_t required)
+{
+  child->lock_shared();
+
+  bool usable = !child->is_exhausted();
+  if (usable) {
+    const int64_t used = child->get_used_blocks();
+    const int64_t total = child->size();
+    usable = !((total - used) < required);
+  }
+
+  if (!usable) {
+    child->unlock();
+  }
+  return usable;
+}
+
+/* Release the lock taken on a child by child_check_n_lock(). */
+void BitAllocator::child_unlock(BitMapArea *child)
+{
+  child->unlock();
+}
+
+/*
+ * Validate a discontiguous request size: it must be non-zero and must
+ * not exceed the allocator size.
+ */
+bool BitAllocator::check_input_dis(int64_t num_blocks)
+{
+  return !(num_blocks == 0 || num_blocks > size());
+}
+
+/*
+ * Validate a request size against the zone size: it must be non-zero and
+ * must not exceed the configured zone size.
+ */
+bool BitAllocator::check_input(int64_t num_blocks)
+{
+  return !(num_blocks == 0 || num_blocks > get_zone_size(cct));
+}
+
+/*
+ * Free num_blocks blocks starting at start_block. Statistics, when
+ * enabled, are updated before the shared lock is taken; the bitmap and the
+ * used-block counter are updated under that lock. An empty range is a
+ * no-op.
+ */
+void BitAllocator::free_blocks(int64_t start_block, int64_t num_blocks)
+{
+  if (num_blocks == 0) {
+    return;
+  }
+
+  alloc_assert(start_block + num_blocks <= size());
+
+  const bool record_stats = is_stats_on();
+  if (record_stats) {
+    m_stats->add_free_calls(1);
+    m_stats->add_freed(num_blocks);
+  }
+
+  lock_shared();
+  alloc_dbg_assert(is_allocated(start_block, num_blocks));
+  free_blocks_int(start_block, num_blocks);
+  (void) sub_used_blocks(num_blocks);
+  unlock();
+}
+
+
+/*
+ * Mark num_blocks blocks starting at start_block as used, holding the
+ * shared lock and, in SERIAL mode, the serial mutex. An empty range is a
+ * no-op.
+ */
+void BitAllocator::set_blocks_used(int64_t start_block, int64_t num_blocks)
+{
+  if (num_blocks != 0) {
+    alloc_assert(start_block + num_blocks <= size());
+
+    lock_shared();
+    serial_lock();
+    set_blocks_used_int(start_block, num_blocks);
+    serial_unlock();
+    unlock();
+  }
+}
+
+/*
+ * Allocate N discontiguous blocks.
+ *
+ * Root-level pass over the children; unlike the internal-node version this
+ * one asks the child iterator to wrap.
+ */
+int64_t BitAllocator::alloc_blocks_dis_int(int64_t num_blocks, int64_t min_alloc,
+           int64_t hint, int64_t area_blk_off, ExtentList *block_list)
+{
+  const bool wrap = true;
+  return alloc_blocks_dis_int_work(wrap, num_blocks, min_alloc, hint,
+                                   area_blk_off, block_list);
+}
+
+/*
+ * Allocate blocks that were already reserved: alloc_blocks_dis_work() is
+ * called with reserved == true and so skips its own reservation step.
+ */
+int64_t BitAllocator::alloc_blocks_dis_res(int64_t num_blocks, int64_t min_alloc,
+           int64_t hint, ExtentList *block_list)
+{
+  const bool already_reserved = true;
+  return alloc_blocks_dis_work(num_blocks, min_alloc, hint, block_list,
+                               already_reserved);
+}
+
+/*
+ * Allocate up to num_blocks blocks as a list of extents.
+ *
+ * One pass runs under the shared lock. If it does not satisfy the request,
+ * the locks are dropped and a second pass runs under the exclusive lock.
+ * Unless `reserved` is set, the blocks are reserved first and an
+ * unsuccessful reservation ends the call with nothing allocated. The
+ * reservation is settled against the allocated amount at the end.
+ *
+ * Returns the number of blocks allocated (0 for an invalid request size).
+ */
+int64_t BitAllocator::alloc_blocks_dis_work(int64_t num_blocks, int64_t min_alloc,
+           int64_t hint, ExtentList *block_list, bool reserved)
+{
+  if (!check_input_dis(num_blocks)) {
+    return 0;
+  }
+
+  int passes = 1;
+  int64_t got = 0;
+  /* This is the root, so extents are reported from offset 0. */
+  const int64_t root_off = 0;
+
+  if (is_stats_on()) {
+    m_stats->add_alloc_calls(1);
+    m_stats->add_allocated(num_blocks);
+  }
+
+  lock_shared();
+  serial_lock();
+
+  if (reserved || reserve_blocks(num_blocks)) {
+    if (is_stats_on()) {
+      m_stats->add_concurrent_scans(passes);
+    }
+
+    for (; passes && got < num_blocks; --passes) {
+      got += alloc_blocks_dis_int(num_blocks - got, min_alloc, hint + got,
+                                  root_off, block_list);
+    }
+
+    if (got < num_blocks) {
+      /*
+       * The concurrent pass came up short. Switch to the exclusive lock
+       * and retry so that whatever is still available is found for sure.
+       */
+      serial_unlock();
+      unlock();
+      lock_excl();
+      serial_lock();
+      got += alloc_blocks_dis_int(num_blocks - got, min_alloc, hint + got,
+                                  root_off, block_list);
+      if (is_stats_on()) {
+        m_stats->add_serial_scans(1);
+      }
+    }
+
+    unreserve(num_blocks, got);
+    alloc_dbg_assert(is_allocated_dis(block_list, got));
+  }
+
+  serial_unlock();
+  unlock();
+
+  return got;
+}
+
+/*
+ * Check that every extent in the list is marked allocated. Returns false
+ * at the first extent that is not. When all are allocated, the extent
+ * lengths are additionally asserted to add up to num_blocks.
+ */
+bool BitAllocator::is_allocated_dis(ExtentList *blocks, int64_t num_blocks)
+{
+  int64_t total_len = 0;
+  for (int64_t n = 0; n < blocks->get_extent_count(); ++n) {
+    auto extent = blocks->get_nth_extent(n);
+    total_len += extent.second;
+    if (!is_allocated(extent.first, extent.second)) {
+      return false;
+    }
+  }
+
+  alloc_assert(total_len == num_blocks);
+  return true;
+}
+
+/*
+ * Free every extent in block_list under the shared lock. The extent
+ * lengths must add up to num_blocks, which is then subtracted from the
+ * used-block counter.
+ */
+void BitAllocator::free_blocks_dis(int64_t num_blocks, ExtentList *block_list)
+{
+  lock_shared();
+  if (is_stats_on()) {
+    m_stats->add_free_calls(1);
+    m_stats->add_freed(num_blocks);
+  }
+
+  int64_t released = 0;
+  for (int64_t n = 0; n < block_list->get_extent_count(); ++n) {
+    auto extent = block_list->get_nth_extent(n);
+    free_blocks_int(extent.first, extent.second);
+    released += extent.second;
+  }
+
+  alloc_assert(num_blocks == released);
+  (void) sub_used_blocks(num_blocks);
+  alloc_assert(get_used_blocks() >= 0);
+  unlock();
+}
+
+/*
+ * Dump the whole tree through dump_state(), starting the zone counter at
+ * zero; the serial lock is held for the duration (a no-op outside SERIAL
+ * mode).
+ */
+void BitAllocator::dump()
+{
+  int zone_counter = 0;
+
+  serial_lock();
+  dump_state(cct, zone_counter);
+  serial_unlock();
+}