1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
5 *******************************************************************
9 * Author: Arash Partow - 2000 *
10 * URL: http://www.partow.net/programming/hashfunctions/index.html *
13 * Free use of the Open Bloom Filter Library is permitted under *
14 * the guidelines and in accordance with the most current version *
15 * of the Boost Software License, Version 1.0 *
16 * http://www.opensource.org/licenses/bsl1.0.html *
18 *******************************************************************
22 #ifndef COMMON_BLOOM_FILTER_HPP
23 #define COMMON_BLOOM_FILTER_HPP
27 #include "include/mempool.h"
28 #include "include/encoding.h"
30 static const std::size_t bits_per_char = 0x08; // 8 bits in 1 char(unsigned)
31 static const unsigned char bit_mask[bits_per_char] = {
42 MEMPOOL_DECLARE_FACTORY(unsigned char, byte, bloom_filter);
48 typedef unsigned int bloom_type;
49 typedef unsigned char cell_type;
51 unsigned char* bit_table_; ///< pointer to bit map
52 std::vector<bloom_type> salt_; ///< vector of salts
53 std::size_t salt_count_; ///< number of salts
54 std::size_t table_size_; ///< bit table size in bytes
55 std::size_t insert_count_; ///< insertion count
56 std::size_t target_element_count_; ///< target number of unique insertions
57 std::size_t random_seed_; ///< random seed
66 target_element_count_(0),
70 bloom_filter(const std::size_t& predicted_inserted_element_count,
71 const double& false_positive_probability,
72 const std::size_t& random_seed)
75 target_element_count_(predicted_inserted_element_count),
76 random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
78 assert(false_positive_probability > 0.0);
79 find_optimal_parameters(predicted_inserted_element_count, false_positive_probability,
80 &salt_count_, &table_size_);
84 bloom_filter(const std::size_t& salt_count,
85 std::size_t table_size,
86 const std::size_t& random_seed,
87 std::size_t target_element_count)
89 salt_count_(salt_count),
90 table_size_(table_size),
92 target_element_count_(target_element_count),
93 random_seed_((random_seed) ? random_seed : 0xA5A5A5A5)
99 generate_unique_salt();
101 bit_table_ = mempool::bloom_filter::alloc_byte.allocate(table_size_);
102 std::fill_n(bit_table_, table_size_, 0x00);
108 bloom_filter(const bloom_filter& filter)
111 this->operator=(filter);
114 bloom_filter& operator = (const bloom_filter& filter)
116 if (this != &filter) {
118 mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
120 salt_count_ = filter.salt_count_;
121 table_size_ = filter.table_size_;
122 insert_count_ = filter.insert_count_;
123 target_element_count_ = filter.target_element_count_;
124 random_seed_ = filter.random_seed_;
125 bit_table_ = mempool::bloom_filter::alloc_byte.allocate(table_size_);
126 std::copy(filter.bit_table_, filter.bit_table_ + table_size_, bit_table_);
127 salt_ = filter.salt_;
132 virtual ~bloom_filter()
134 mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
137 inline bool operator!() const
139 return (0 == table_size_);
145 std::fill_n(bit_table_, table_size_, 0x00);
150 * insert a u32 into the set
152 * NOTE: the internal hash is weak enough that consecutive inputs do
153 * not achieve the desired fpp. Well-mixed values should be used
154 * here (e.g., put rjhash(x) into the filter instead of just x).
156 * @param val integer value to insert
158 inline void insert(uint32_t val) {
160 std::size_t bit_index = 0;
162 for (std::size_t i = 0; i < salt_.size(); ++i)
164 compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
165 bit_table_[bit_index >> 3] |= bit_mask[bit];
170 inline void insert(const unsigned char* key_begin, const std::size_t& length)
173 std::size_t bit_index = 0;
175 for (std::size_t i = 0; i < salt_.size(); ++i)
177 compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
178 bit_table_[bit_index >> 3] |= bit_mask[bit];
184 inline void insert(const T& t)
186 // Note: T must be a C++ POD type.
187 insert(reinterpret_cast<const unsigned char*>(&t),sizeof(T));
190 inline void insert(const std::string& key)
192 insert(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
195 inline void insert(const char* data, const std::size_t& length)
197 insert(reinterpret_cast<const unsigned char*>(data),length);
200 template<typename InputIterator>
201 inline void insert(const InputIterator begin, const InputIterator end)
203 InputIterator itr = begin;
211 * check if a u32 is contained by set
213 * NOTE: the internal hash is weak enough that consecutive inputs do
214 * not achieve the desired fpp. Well-mixed values should be used
215 * here (e.g., put rjhash(x) into the filter instead of just x).
217 * @param val integer value to query
218 * @returns true if value is (probably) in the set, false if it definitely is not
220 inline virtual bool contains(uint32_t val) const
224 std::size_t bit_index = 0;
226 for (std::size_t i = 0; i < salt_.size(); ++i)
228 compute_indices(hash_ap(val,salt_[i]),bit_index,bit);
229 if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit])
237 inline virtual bool contains(const unsigned char* key_begin, const std::size_t length) const
241 std::size_t bit_index = 0;
243 for (std::size_t i = 0; i < salt_.size(); ++i)
245 compute_indices(hash_ap(key_begin,length,salt_[i]),bit_index,bit);
246 if ((bit_table_[bit_index >> 3] & bit_mask[bit]) != bit_mask[bit])
255 inline bool contains(const T& t) const
257 return contains(reinterpret_cast<const unsigned char*>(&t),static_cast<std::size_t>(sizeof(T)));
260 inline bool contains(const std::string& key) const
262 return contains(reinterpret_cast<const unsigned char*>(key.c_str()),key.size());
265 inline bool contains(const char* data, const std::size_t& length) const
267 return contains(reinterpret_cast<const unsigned char*>(data),length);
270 template<typename InputIterator>
271 inline InputIterator contains_all(const InputIterator begin, const InputIterator end) const
273 InputIterator itr = begin;
285 template<typename InputIterator>
286 inline InputIterator contains_none(const InputIterator begin, const InputIterator end) const
288 InputIterator itr = begin;
300 inline virtual std::size_t size() const
302 return table_size_ * bits_per_char;
305 inline std::size_t element_count() const
307 return insert_count_;
310 inline bool is_full() const
312 return insert_count_ >= target_element_count_;
316 * density of bits set. inconvenient units, but:
317 * .3 = ~50% target insertions
318 * .5 = 100% target insertions, "perfectly full"
319 * .75 = 200% target insertions
320 * 1.0 = all bits set... infinite insertions
322 inline double density() const
327 uint8_t *p = bit_table_;
328 size_t left = table_size_;
335 return (double)set / (double)(table_size_ << 3);
338 virtual inline double approx_unique_element_count() const {
339 // this is not a very good estimate; a better solution should have
340 // some asymptotic behavior as density() approaches 1.0.
341 return (double)target_element_count_ * 2.0 * density();
344 inline double effective_fpp() const
348 The effective false positive probability is calculated using the
349 designated table size and hash function count in conjunction with
350 the current number of inserted elements - not the user defined
351 predicated/expected number of inserted elements.
353 return std::pow(1.0 - std::exp(-1.0 * salt_.size() * insert_count_ / size()), 1.0 * salt_.size());
356 inline const cell_type* table() const
363 inline virtual void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const
365 bit_index = hash % (table_size_ << 3);
369 void generate_unique_salt()
373 A distinct hash function need not be implementation-wise
374 distinct. In the current implementation "seeding" a common
375 hash function with different values seems to be adequate.
377 const unsigned int predef_salt_count = 128;
378 static const bloom_type predef_salt[predef_salt_count] = {
379 0xAAAAAAAA, 0x55555555, 0x33333333, 0xCCCCCCCC,
380 0x66666666, 0x99999999, 0xB5B5B5B5, 0x4B4B4B4B,
381 0xAA55AA55, 0x55335533, 0x33CC33CC, 0xCC66CC66,
382 0x66996699, 0x99B599B5, 0xB54BB54B, 0x4BAA4BAA,
383 0xAA33AA33, 0x55CC55CC, 0x33663366, 0xCC99CC99,
384 0x66B566B5, 0x994B994B, 0xB5AAB5AA, 0xAAAAAA33,
385 0x555555CC, 0x33333366, 0xCCCCCC99, 0x666666B5,
386 0x9999994B, 0xB5B5B5AA, 0xFFFFFFFF, 0xFFFF0000,
387 0xB823D5EB, 0xC1191CDF, 0xF623AEB3, 0xDB58499F,
388 0xC8D42E70, 0xB173F616, 0xA91A5967, 0xDA427D63,
389 0xB1E8A2EA, 0xF6C0D155, 0x4909FEA3, 0xA68CC6A7,
390 0xC395E782, 0xA26057EB, 0x0CD5DA28, 0x467C5492,
391 0xF15E6982, 0x61C6FAD3, 0x9615E352, 0x6E9E355A,
392 0x689B563E, 0x0C9831A8, 0x6753C18B, 0xA622689B,
393 0x8CA63C47, 0x42CC2884, 0x8E89919B, 0x6EDBD7D3,
394 0x15B6796C, 0x1D6FDFE4, 0x63FF9092, 0xE7401432,
395 0xEFFE9412, 0xAEAEDF79, 0x9F245A31, 0x83C136FC,
396 0xC3DA4A8C, 0xA5112C8C, 0x5271F491, 0x9A948DAB,
397 0xCEE59A8D, 0xB5F525AB, 0x59D13217, 0x24E7C331,
398 0x697C2103, 0x84B0A460, 0x86156DA9, 0xAEF2AC68,
399 0x23243DA5, 0x3F649643, 0x5FA495A8, 0x67710DF8,
400 0x9A6C499E, 0xDCFB0227, 0x46A43433, 0x1832B07A,
401 0xC46AFF3C, 0xB9C8FFF0, 0xC9500467, 0x34431BDF,
402 0xB652432B, 0xE367F12B, 0x427F4C1B, 0x224C006E,
403 0x2E7E5A89, 0x96F99AA5, 0x0BEB452A, 0x2FD87C39,
404 0x74B2E1FB, 0x222EFD24, 0xF357F60C, 0x440FCB1E,
405 0x8BBE030F, 0x6704DC29, 0x1144D12F, 0x948B1355,
406 0x6D8FD7E9, 0x1C11A014, 0xADD1592F, 0xFB3C712E,
407 0xFC77642F, 0xF9C4CE8C, 0x31312FB9, 0x08B0DD79,
408 0x318FA6E7, 0xC040D23D, 0xC0589AA7, 0x0CA5C075,
409 0xF874B172, 0x0CF914D5, 0x784D3280, 0x4E8CFEBC,
410 0xC569F575, 0xCDB2A091, 0x2CC016B4, 0x5C5F4421
413 if (salt_count_ <= predef_salt_count)
415 std::copy(predef_salt,
416 predef_salt + salt_count_,
417 std::back_inserter(salt_));
418 for (unsigned int i = 0; i < salt_.size(); ++i)
422 This is done to integrate the user defined random seed,
423 so as to allow for the generation of unique bloom filter
426 salt_[i] = salt_[i] * salt_[(i + 3) % salt_.size()] + random_seed_;
431 std::copy(predef_salt,predef_salt + predef_salt_count,
432 std::back_inserter(salt_));
433 srand(static_cast<unsigned int>(random_seed_));
434 while (salt_.size() < salt_count_)
436 bloom_type current_salt = static_cast<bloom_type>(rand()) * static_cast<bloom_type>(rand());
437 if (0 == current_salt)
439 if (salt_.end() == std::find(salt_.begin(), salt_.end(), current_salt))
441 salt_.push_back(current_salt);
447 static void find_optimal_parameters(std::size_t target_insert_count,
449 std::size_t *salt_count,
450 std::size_t *table_size)
454 The following will attempt to find the number of hash functions
455 and minimum amount of storage bits required to construct a bloom
456 filter consistent with the user defined false positive probability
457 and estimated element insertion count.
460 double min_m = std::numeric_limits<double>::infinity();
466 double numerator = (- k * target_insert_count);
467 double denominator = std::log(1.0 - std::pow(target_fpp, 1.0 / k));
468 curr_m = numerator / denominator;
478 *salt_count = static_cast<std::size_t>(min_k);
479 size_t t = static_cast<std::size_t>(min_m);
480 t += (((t & 7) != 0) ? (bits_per_char - (t & 7)) : 0);
481 *table_size = t >> 3;
484 inline bloom_type hash_ap(uint32_t val, bloom_type hash) const
486 hash ^= (hash << 7) ^ ((val & 0xff000000) >> 24) * (hash >> 3);
487 hash ^= (~((hash << 11) + (((val & 0xff0000) >> 16) ^ (hash >> 5))));
488 hash ^= (hash << 7) ^ ((val & 0xff00) >> 8) * (hash >> 3);
489 hash ^= (~((hash << 11) + (((val & 0xff)) ^ (hash >> 5))));
493 inline bloom_type hash_ap(const unsigned char* begin, std::size_t remaining_length, bloom_type hash) const
495 const unsigned char* itr = begin;
497 while (remaining_length >= 4)
499 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
500 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
501 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
502 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
503 remaining_length -= 4;
506 while (remaining_length >= 2)
508 hash ^= (hash << 7) ^ (*itr++) * (hash >> 3);
509 hash ^= (~((hash << 11) + ((*itr++) ^ (hash >> 5))));
510 remaining_length -= 2;
513 if (remaining_length)
515 hash ^= (hash << 7) ^ (*itr) * (hash >> 3);
522 void encode(bufferlist& bl) const;
523 void decode(bufferlist::iterator& bl);
524 void dump(Formatter *f) const;
525 static void generate_test_instances(std::list<bloom_filter*>& ls);
527 WRITE_CLASS_ENCODER(bloom_filter)
530 class compressible_bloom_filter : public bloom_filter
534 compressible_bloom_filter() : bloom_filter() {}
536 compressible_bloom_filter(const std::size_t& predicted_element_count,
537 const double& false_positive_probability,
538 const std::size_t& random_seed)
539 : bloom_filter(predicted_element_count, false_positive_probability, random_seed)
541 size_list.push_back(table_size_);
544 compressible_bloom_filter(const std::size_t& salt_count,
545 std::size_t table_size,
546 const std::size_t& random_seed,
547 std::size_t target_count)
548 : bloom_filter(salt_count, table_size, random_seed, target_count)
550 size_list.push_back(table_size_);
553 inline std::size_t size() const override
555 return size_list.back() * bits_per_char;
558 inline bool compress(const double& target_ratio)
563 if ((0.0 >= target_ratio) || (target_ratio >= 1.0))
568 std::size_t original_table_size = size_list.back();
569 std::size_t new_table_size = static_cast<std::size_t>(size_list.back() * target_ratio);
571 if ((!new_table_size) || (new_table_size >= original_table_size))
576 cell_type* tmp = mempool::bloom_filter::alloc_byte.allocate(new_table_size);
577 std::copy(bit_table_, bit_table_ + (new_table_size), tmp);
578 cell_type* itr = bit_table_ + (new_table_size);
579 cell_type* end = bit_table_ + (original_table_size);
580 cell_type* itr_tmp = tmp;
581 cell_type* itr_end = tmp + (new_table_size);
584 *(itr_tmp++) |= (*itr++);
585 if (itr_tmp == itr_end)
589 mempool::bloom_filter::alloc_byte.deallocate(bit_table_, table_size_);
591 size_list.push_back(new_table_size);
592 table_size_ = new_table_size;
597 inline double approx_unique_element_count() const override {
598 // this is not a very good estimate; a better solution should have
599 // some asymptotic behavior as density() approaches 1.0.
601 // the compress() correction is also bad; it tends to under-estimate.
602 return (double)target_element_count_ * 2.0 * density() * (double)size_list.back() / (double)size_list.front();
607 inline void compute_indices(const bloom_type& hash, std::size_t& bit_index, std::size_t& bit) const override
610 for (std::size_t i = 0; i < size_list.size(); ++i)
612 bit_index %= size_list[i] << 3;
617 std::vector<std::size_t> size_list;
619 void encode(bufferlist& bl) const;
620 void decode(bufferlist::iterator& bl);
621 void dump(Formatter *f) const;
622 static void generate_test_instances(std::list<compressible_bloom_filter*>& ls);
624 WRITE_CLASS_ENCODER(compressible_bloom_filter)
631 If it can be guaranteed that bits_per_char will be of the form 2^n then
632 the following optimization can be used:
634 hash_table[bit_index >> n] |= bit_mask[bit_index & (bits_per_char - 1)];
637 For performance reasons where possible when allocating memory it should
638 be aligned (aligned_alloc) according to the architecture being used.