ceph: shared persistent read-only rbd cache
[stor4nfv.git] / src / ceph / 0007-librbd-cleanup-policy-based-promotion-eviction.patch
diff --git a/src/ceph/0007-librbd-cleanup-policy-based-promotion-eviction.patch b/src/ceph/0007-librbd-cleanup-policy-based-promotion-eviction.patch
new file mode 100644 (file)
index 0000000..010407b
--- /dev/null
@@ -0,0 +1,512 @@
+From dd4804fb05ad8aca51516b0112975cc91ef85a6b Mon Sep 17 00:00:00 2001
+From: Yuan Zhou <yuan.zhou@intel.com>
+Date: Wed, 8 Aug 2018 15:31:47 +0800
+Subject: [PATCH 07/10] librbd: cleanup policy based promotion/eviction
+
+Signed-off-by: Yuan Zhou <yuan.zhou@intel.com>
+---
+ src/common/options.cc                              |   4 +
+ .../rbd_cache/CacheControllerSocketClient.hpp      |   3 +-
+ src/tools/rbd_cache/ObjectCacheStore.cc            |  63 +++----
+ src/tools/rbd_cache/ObjectCacheStore.h             |  10 +-
+ src/tools/rbd_cache/Policy.hpp                     |  18 +-
+ src/tools/rbd_cache/SimplePolicy.hpp               | 188 +++++++++------------
+ 6 files changed, 141 insertions(+), 145 deletions(-)
+
+diff --git a/src/common/options.cc b/src/common/options.cc
+index 7839a31..b334c1e 100644
+--- a/src/common/options.cc
++++ b/src/common/options.cc
+@@ -6365,6 +6365,10 @@ static std::vector<Option> get_rbd_options() {
+     .set_default("/tmp")
+     .set_description("shared ssd caching data dir"),
++    Option("rbd_shared_cache_entries", Option::TYPE_INT, Option::LEVEL_ADVANCED)
++    .set_default(4096)
++    .set_description("shared ssd caching data entries"),
++
+     Option("rbd_non_blocking_aio", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
+     .set_default(true)
+     .set_description("process AIO ops from a dispatch thread to prevent blocking"),
+diff --git a/src/tools/rbd_cache/CacheControllerSocketClient.hpp b/src/tools/rbd_cache/CacheControllerSocketClient.hpp
+index 4e1f36c..56b79ce 100644
+--- a/src/tools/rbd_cache/CacheControllerSocketClient.hpp
++++ b/src/tools/rbd_cache/CacheControllerSocketClient.hpp
+@@ -90,7 +90,8 @@ public:
+         }
+     });
+     std::unique_lock<std::mutex> lk(m);
+-    cv.wait(lk);
++    //cv.wait(lk);
++    cv.wait_for(lk, std::chrono::milliseconds(100));
+     return 0;
+   }
+diff --git a/src/tools/rbd_cache/ObjectCacheStore.cc b/src/tools/rbd_cache/ObjectCacheStore.cc
+index b39fe66..99f90d6 100644
+--- a/src/tools/rbd_cache/ObjectCacheStore.cc
++++ b/src/tools/rbd_cache/ObjectCacheStore.cc
+@@ -15,7 +15,12 @@ namespace cache {
+ ObjectCacheStore::ObjectCacheStore(CephContext *cct, ContextWQ* work_queue)
+       : m_cct(cct), m_work_queue(work_queue),
+         m_rados(new librados::Rados()) {
+-  m_policy = new SimplePolicy(4096, 0.9); // TODO
++
++  uint64_t object_cache_entries =
++    cct->_conf.get_val<int64_t>("rbd_shared_cache_entries");
++
++  //TODO(): allow to set level
++  m_policy = new SimplePolicy(object_cache_entries, 0.5);
+ }
+ ObjectCacheStore::~ObjectCacheStore() {
+@@ -35,7 +40,16 @@ int ObjectCacheStore::init(bool reset) {
+     lderr(m_cct) << "fail to conect to cluster" << dendl;
+     return ret;
+   }
+-  //TODO(): check existing cache objects
++
++  std::string cache_path = m_cct->_conf.get_val<std::string>("rbd_shared_cache_path");
++  //TODO(): check and reuse existing cache objects
++  if(reset) {
++    std::string cmd = "exec rm -rf " + cache_path + "/rbd_cache*; exec mkdir -p " + cache_path;
++    //TODO(): to use std::filesystem
++    int r = system(cmd.c_str());
++  }
++
++  evict_thd = new std::thread([this]{this->evict_thread_body();});
+   return ret;
+ }
+@@ -58,10 +72,6 @@ int ObjectCacheStore::do_promote(std::string pool_name, std::string object_name)
+   
+   librados::IoCtx* ioctx = m_ioctxs[pool_name]; 
+-  //promoting: update metadata 
+-  m_policy->update_status(cache_file_name, PROMOTING);
+-  assert(PROMOTING == m_policy->get_status(cache_file_name));
+-
+   librados::bufferlist* read_buf = new librados::bufferlist();
+   int object_size = 4096*1024; //TODO(): read config from image metadata
+@@ -83,9 +93,9 @@ int ObjectCacheStore::do_promote(std::string pool_name, std::string object_name)
+   ret = cache_file.write_object_to_file(*read_buf, object_size);
+   
+   // update metadata
+-  assert(PROMOTING == m_policy->get_status(cache_file_name));
+-  m_policy->update_status(cache_file_name, PROMOTED);
+-  assert(PROMOTED == m_policy->get_status(cache_file_name));
++  assert(OBJ_CACHE_PROMOTING == m_policy->get_status(cache_file_name));
++  m_policy->update_status(cache_file_name, OBJ_CACHE_PROMOTED);
++  assert(OBJ_CACHE_PROMOTED == m_policy->get_status(cache_file_name));
+   return ret;
+@@ -97,18 +107,15 @@ int ObjectCacheStore::lookup_object(std::string pool_name, std::string object_na
+   std::string cache_file_name =  pool_name + object_name;
+-  // TODO lookup and return status;
+-
+   CACHESTATUS ret;
+   ret = m_policy->lookup_object(cache_file_name);
+   switch(ret) {
+-    case NONE:
++    case OBJ_CACHE_NONE:
+       return do_promote(pool_name, object_name);
+-    case PROMOTING:
+-      return -1;
+-    case PROMOTED:
++    case OBJ_CACHE_PROMOTED:
+       return 0;
++    case OBJ_CACHE_PROMOTING:
+     default:
+       return -1;
+   }
+@@ -117,26 +124,14 @@ int ObjectCacheStore::lookup_object(std::string pool_name, std::string object_na
+ void ObjectCacheStore::evict_thread_body() {
+   int ret;
+   while(m_evict_go) {
+-    std::string temp_cache_file;
+-
+-    ret = m_policy->evict_object(temp_cache_file);
+-    if(ret == 0) {
+-      continue;
+-    }
+-
+-    // TODO
+-    // delete temp_cache_file file.
+-
+-    assert(EVICTING == m_policy->get_status(temp_cache_file));
+-
+-    m_policy->update_status(temp_cache_file, EVICTED);
+-
+-    assert(NONE == m_policy->get_status(temp_cache_file));
++    ret = evict_objects();
+   }
+ }
+ int ObjectCacheStore::shutdown() {
++  m_evict_go = false;
++  evict_thd->join();
+   m_rados->shutdown();
+   return 0;
+ }
+@@ -165,5 +160,13 @@ int ObjectCacheStore::promote_object(librados::IoCtx* ioctx, std::string object_
+   
+ }
++int ObjectCacheStore::evict_objects() {
++  std::list<std::string> obj_list;
++  m_policy->get_evict_list(&obj_list);
++  for (auto& obj: obj_list) {
++    //do_evict(obj);
++  }
++}
++
+ } // namespace cache
+ } // namespace rbd
+diff --git a/src/tools/rbd_cache/ObjectCacheStore.h b/src/tools/rbd_cache/ObjectCacheStore.h
+index 5118a73..ba0e1f1 100644
+--- a/src/tools/rbd_cache/ObjectCacheStore.h
++++ b/src/tools/rbd_cache/ObjectCacheStore.h
+@@ -15,6 +15,7 @@
+ #include "librbd/cache/SharedPersistentObjectCacherFile.h"
+ #include "SimplePolicy.hpp"
++
+ using librados::Rados;
+ using librados::IoCtx;
+@@ -40,10 +41,9 @@ class ObjectCacheStore
+     int lock_cache(std::string vol_name);
+-    void evict_thread_body();
+-
+   private:
+-    int _evict_object();
++    void evict_thread_body();
++    int evict_objects();
+     int do_promote(std::string pool_name, std::string object_name);
+@@ -61,8 +61,8 @@ class ObjectCacheStore
+     librbd::cache::SyncFile *m_cache_file;
+     Policy* m_policy;
+-
+-    bool m_evict_go;
++    std::thread* evict_thd;
++    bool m_evict_go = false;
+ };
+ } // namespace rbd
+diff --git a/src/tools/rbd_cache/Policy.hpp b/src/tools/rbd_cache/Policy.hpp
+index 575c294..711e3bd 100644
+--- a/src/tools/rbd_cache/Policy.hpp
++++ b/src/tools/rbd_cache/Policy.hpp
+@@ -1,12 +1,16 @@
+ #ifndef RBD_CACHE_POLICY_HPP
+ #define RBD_CACHE_POLICY_HPP
++#include <list>
++#include <string>
++
++namespace rbd {
++namespace cache {
++
+ enum CACHESTATUS {
+-  NONE = 0,
+-  PROMOTING,
+-  PROMOTED,
+-  EVICTING,
+-  EVICTED,
++  OBJ_CACHE_NONE = 0,
++  OBJ_CACHE_PROMOTING,
++  OBJ_CACHE_PROMOTED,
+ };
+@@ -18,5 +22,9 @@ public:
+   virtual int evict_object(std::string&) = 0;
+   virtual void update_status(std::string, CACHESTATUS) = 0;
+   virtual CACHESTATUS get_status(std::string) = 0;
++  virtual void get_evict_list(std::list<std::string>* obj_list) = 0;
+ };
++
++} // namespace cache
++} // namespace rbd
+ #endif
+diff --git a/src/tools/rbd_cache/SimplePolicy.hpp b/src/tools/rbd_cache/SimplePolicy.hpp
+index a0d8de7..e785de1 100644
+--- a/src/tools/rbd_cache/SimplePolicy.hpp
++++ b/src/tools/rbd_cache/SimplePolicy.hpp
+@@ -3,138 +3,93 @@
+ #include "Policy.hpp"
+ #include "include/lru.h"
++#include "common/RWLock.h"
+ #include "common/Mutex.h"
+ #include <vector>
+ #include <unordered_map>
+ #include <string>
++namespace rbd {
++namespace cache {
++
++
+ class SimplePolicy : public Policy {
+ public:
+-  SimplePolicy(uint64_t block_num, float level)
+-    : m_level(level),
+-      m_lock("SimplePolicy"),
+-      m_entry_count(block_num)
++  SimplePolicy(uint64_t block_num, float watermark)
++    : m_watermark(watermark), m_entry_count(block_num),
++      m_cache_map_lock("rbd::cache::SimplePolicy::m_cache_map_lock"),
++      m_free_list_lock("rbd::cache::SimplePolicy::m_free_list_lock")
+   {
+-    Entry m_entries[m_entry_count];
+-
+-    for(auto &entry : m_entries) {
+-      m_free_lru.lru_insert_bot(&entry);
++    for(uint64_t i = 0; i < m_entry_count; i++) {
++      m_free_list.push_back(new Entry());
+     }
++
+   }
+-  ~SimplePolicy() {}
++  ~SimplePolicy() {
++    for(uint64_t i = 0; i < m_entry_count; i++) {
++      Entry* entry = reinterpret_cast<Entry*>(m_free_list.front());
++      delete entry;
++      m_free_list.pop_front();
++    }
++  }
+   CACHESTATUS lookup_object(std::string cache_file_name) {
+-    Mutex::Locker locker(m_lock);
+-    auto entry_it = m_oid_to_entry.find(cache_file_name);
+-    if(entry_it == m_oid_to_entry.end()) {
+-      return NONE;
++    //TODO(): check race condition
++    RWLock::WLocker wlocker(m_cache_map_lock);
++
++    auto entry_it = m_cache_map.find(cache_file_name);
++    if(entry_it == m_cache_map.end()) {
++      Mutex::Locker locker(m_free_list_lock);
++      Entry* entry = reinterpret_cast<Entry*>(m_free_list.front());
++      assert(entry != nullptr);
++      m_free_list.pop_front();
++      entry->status = OBJ_CACHE_PROMOTING;
++
++      m_cache_map[cache_file_name] = entry;
++
++      return OBJ_CACHE_NONE;
+     }
+     Entry* entry = entry_it->second;
+-    LRU* lru;
+-    if(entry->status == PROMOTED) {
+-      lru = &m_promoted_lru;
+-    } else {
+-      lru = &m_handing_lru;
++    if(entry->status == OBJ_CACHE_PROMOTED) {
++      // touch it
++      m_promoted_lru.lru_touch(entry);
+     }
+-    // touch it
+-    lru->lru_remove(entry);
+-    lru->lru_insert_top(entry);
+-
+     return entry->status;
+   }
+   int evict_object(std::string& out_cache_file_name) {
+-    Mutex::Locker locker(m_lock);
+-
+-    // still have enough free space, don't need to evict lru.
+-    uint64_t temp_current_size = m_oid_to_entry.size();
+-    float temp_current_evict_level = temp_current_size / m_entry_count;
+-    if(temp_current_evict_level < m_level) {
+-      return 0;
+-    }
+-
+-    // when all entries are USING, PROMOTING or EVICTING, just busy waiting.
+-    if(m_promoted_lru.lru_get_size() == 0) {
+-      return 0;
+-    }
+-
+-    assert(m_promoted_lru.lru_get_size() != 0);
+-
+-    // evict one item from promoted lru
+-    Entry *entry = reinterpret_cast<Entry*>(m_promoted_lru.lru_get_next_expire());
+-    assert(entry != nullptr);
+-
+-    assert(entry->status == PROMOTED);
+-
+-    out_cache_file_name = entry->cache_file_name;
+-    entry->status = EVICTING;
+-
+-    m_promoted_lru.lru_remove(entry);
+-    m_handing_lru.lru_insert_top(entry);
++    RWLock::WLocker locker(m_cache_map_lock);
+     return 1;
+   }
+   // TODO(): simplify the logic
+-  void update_status(std::string _file_name, CACHESTATUS _status) {
+-    Mutex::Locker locker(m_lock);
++  void update_status(std::string file_name, CACHESTATUS new_status) {
++    RWLock::WLocker locker(m_cache_map_lock);
+     Entry* entry;
+-    auto entry_it = m_oid_to_entry.find(_file_name);
++    auto entry_it = m_cache_map.find(file_name);
+     // just check.
+-    if(_status == PROMOTING) {
+-      assert(m_oid_to_entry.find(_file_name) == m_oid_to_entry.end());
++    if(new_status == OBJ_CACHE_PROMOTING) {
++      assert(entry_it == m_cache_map.end());
+     }
+-    // miss this object.
+-    if(entry_it == m_oid_to_entry.end() && _status == PROMOTING) {
+-      entry = reinterpret_cast<Entry*>(m_free_lru.lru_get_next_expire());
+-      if(entry == nullptr) {
+-        assert(0); // namely evict thread have some problems.
+-      }
+-
+-      entry->status = PROMOTING;
+-
+-      m_oid_to_entry[_file_name] = entry;
+-      m_free_lru.lru_remove(entry);
+-      m_handing_lru.lru_insert_top(entry);
+-
+-      return;
+-    }
+-
+-    assert(entry_it != m_oid_to_entry.end());
++    assert(entry_it != m_cache_map.end());
+     entry = entry_it->second;
+-    // promoting action have been finished, so update it.
+-    if(entry->status == PROMOTING && _status== PROMOTED) {
+-      m_handing_lru.lru_remove(entry);
++    // promoting is done, so update it.
++    if(entry->status == OBJ_CACHE_PROMOTING && new_status== OBJ_CACHE_PROMOTED) {
+       m_promoted_lru.lru_insert_top(entry);
+-      entry->status = PROMOTED;
+-      return;
+-    }
+-
+-    // will delete this cache file
+-    if(entry->status == PROMOTED && _status == EVICTING) {
+-      m_promoted_lru.lru_remove(entry);
+-      m_handing_lru.lru_insert_top(entry);
+-      entry->status = EVICTING;
+-      return;
+-    }
+-
+-
+-    if(_status == EVICTED) {
+-      m_oid_to_entry.erase(entry_it);
+-      m_handing_lru.lru_remove(entry);
+-      m_free_lru.lru_insert_bot(entry);
++      entry->status = new_status;
+       return;
+     }
+@@ -142,39 +97,64 @@ public:
+   }
+   // get entry status
+-  CACHESTATUS get_status(std::string _file_name) {
+-    Mutex::Locker locker(m_lock);
+-    auto entry_it = m_oid_to_entry.find(_file_name);
+-    if(entry_it == m_oid_to_entry.end()) {
+-      return NONE;
++  CACHESTATUS get_status(std::string file_name) {
++    RWLock::RLocker locker(m_cache_map_lock);
++    auto entry_it = m_cache_map.find(file_name);
++    if(entry_it == m_cache_map.end()) {
++      return OBJ_CACHE_NONE;
+     }
+     return entry_it->second->status;
+   }
++  void get_evict_list(std::list<std::string>* obj_list) {
++    RWLock::WLocker locker(m_cache_map_lock);
++    // check free ratio, pop entries from LRU
++    if (m_free_list.size() / m_entry_count < m_watermark) {
++      int evict_num = 10; //TODO(): make this configurable
++      for(int i = 0; i < evict_num; i++) {
++        Entry* entry = reinterpret_cast<Entry*>(m_promoted_lru.lru_expire());
++        if (entry == nullptr) {
++        continue;
++        }
++        std::string file_name = entry->cache_file_name;
++        obj_list->push_back(file_name);
++
++        auto entry_it = m_cache_map.find(file_name);
++        m_cache_map.erase(entry_it);
++
++        //mark this entry as free
++      entry->status = OBJ_CACHE_NONE;
++        Mutex::Locker locker(m_free_list_lock);
++        m_free_list.push_back(entry);
++      }
++   }
++  }
+ private:
+   class Entry : public LRUObject {
+     public:
+       CACHESTATUS status;
+-      Entry() : status(NONE){}
++      Entry() : status(OBJ_CACHE_NONE){}
+       std::string cache_file_name;
+       void encode(bufferlist &bl){}
+       void decode(bufferlist::iterator &it){}
+   };
+-  std::unordered_map<std::string, Entry*> m_oid_to_entry;
++  float m_watermark;
++  uint64_t m_entry_count;
+-  LRU m_free_lru;
+-  LRU m_handing_lru; // include promoting status or evicting status
+-  LRU m_promoted_lru; // include promoted, using status.
++  std::unordered_map<std::string, Entry*> m_cache_map;
++  RWLock m_cache_map_lock;
+-  mutable Mutex m_lock;
++  std::deque<Entry*> m_free_list;
++  Mutex m_free_list_lock;
+-  float m_level;
+-  uint64_t m_entry_count;
++  LRU m_promoted_lru; // include promoted, using status.
+ };
++} // namespace cache
++} // namespace rbd
+ #endif
+-- 
+2.7.4
+