X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=src%2Fceph%2Fsrc%2Fos%2Fbluestore%2FBlueFS.h;fp=src%2Fceph%2Fsrc%2Fos%2Fbluestore%2FBlueFS.h;h=36b41b451bbe5a66168e35794ee0033dd5c498f4;hb=812ff6ca9fcd3e629e49d4328905f33eee8ca3f5;hp=0000000000000000000000000000000000000000;hpb=15280273faafb77777eab341909a3f495cf248d9;p=stor4nfv.git diff --git a/src/ceph/src/os/bluestore/BlueFS.h b/src/ceph/src/os/bluestore/BlueFS.h new file mode 100644 index 0000000..36b41b4 --- /dev/null +++ b/src/ceph/src/os/bluestore/BlueFS.h @@ -0,0 +1,442 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +#ifndef CEPH_OS_BLUESTORE_BLUEFS_H +#define CEPH_OS_BLUESTORE_BLUEFS_H + +#include +#include + +#include "bluefs_types.h" +#include "common/RefCountedObj.h" +#include "BlockDevice.h" + +#include "boost/intrusive/list.hpp" +#include + +class PerfCounters; + +class Allocator; + +enum { + l_bluefs_first = 732600, + l_bluefs_gift_bytes, + l_bluefs_reclaim_bytes, + l_bluefs_db_total_bytes, + l_bluefs_db_used_bytes, + l_bluefs_wal_total_bytes, + l_bluefs_wal_used_bytes, + l_bluefs_slow_total_bytes, + l_bluefs_slow_used_bytes, + l_bluefs_num_files, + l_bluefs_log_bytes, + l_bluefs_log_compactions, + l_bluefs_logged_bytes, + l_bluefs_files_written_wal, + l_bluefs_files_written_sst, + l_bluefs_bytes_written_wal, + l_bluefs_bytes_written_sst, + l_bluefs_last, +}; + +class BlueFS { +public: + CephContext* cct; + static constexpr unsigned MAX_BDEV = 3; + static constexpr unsigned BDEV_WAL = 0; + static constexpr unsigned BDEV_DB = 1; + static constexpr unsigned BDEV_SLOW = 2; + + enum { + WRITER_UNKNOWN, + WRITER_WAL, + WRITER_SST, + }; + + struct File : public RefCountedObject { + MEMPOOL_CLASS_HELPERS(); + + bluefs_fnode_t fnode; + int refs; + uint64_t dirty_seq; + bool locked; + bool deleted; + boost::intrusive::list_member_hook<> dirty_item; + + std::atomic_int num_readers, num_writers; + std::atomic_int num_reading; + + File() + : RefCountedObject(NULL, 0), + refs(0), + dirty_seq(0), + locked(false), + deleted(false), + num_readers(0), + num_writers(0), + num_reading(0) + {} + ~File() override { + assert(num_readers.load() == 0); + assert(num_writers.load() == 0); + assert(num_reading.load() == 0); + assert(!locked); + } + + friend void intrusive_ptr_add_ref(File *f) { + f->get(); + } + friend void intrusive_ptr_release(File *f) { + f->put(); + } + }; + typedef boost::intrusive_ptr FileRef; + + typedef boost::intrusive::list< + File, + boost::intrusive::member_hook< + File, + boost::intrusive::list_member_hook<>, + &File::dirty_item> > dirty_file_list_t; + + struct Dir : public RefCountedObject { + MEMPOOL_CLASS_HELPERS(); + + mempool::bluefs::map file_map; + + Dir() : RefCountedObject(NULL, 0) {} + + friend void intrusive_ptr_add_ref(Dir *d) { + d->get(); + } + friend void intrusive_ptr_release(Dir *d) { + d->put(); + } + }; + typedef boost::intrusive_ptr DirRef; + + struct FileWriter { + MEMPOOL_CLASS_HELPERS(); + + FileRef file; + uint64_t pos; ///< start offset for buffer + bufferlist buffer; ///< new data to write (at end of file) + bufferlist tail_block; ///< existing partial block at end of file, if any + bufferlist::page_aligned_appender buffer_appender; //< for const char* only + int writer_type = 0; ///< WRITER_* + + std::mutex lock; + std::array iocv; ///< for each bdev + + FileWriter(FileRef f) + : file(f), + pos(0), + buffer_appender(buffer.get_page_aligned_appender( + g_conf->bluefs_alloc_size / CEPH_PAGE_SIZE)) { + ++file->num_writers; + iocv.fill(nullptr); + } + // NOTE: caller must call BlueFS::close_writer() + ~FileWriter() { + --file->num_writers; + } + + // note: BlueRocksEnv uses this append exclusively, so it's safe + // to use buffer_appender exclusively here (e.g., it's notion of + // offset will remain accurate). + void append(const char *buf, size_t len) { + buffer_appender.append(buf, len); + } + + // note: used internally only, for ino 1 or 0. + void append(bufferlist& bl) { + buffer.claim_append(bl); + } + + uint64_t get_effective_write_pos() { + buffer_appender.flush(); + return pos + buffer.length(); + } + }; + + struct FileReaderBuffer { + MEMPOOL_CLASS_HELPERS(); + + uint64_t bl_off; ///< prefetch buffer logical offset + bufferlist bl; ///< prefetch buffer + uint64_t pos; ///< current logical offset + uint64_t max_prefetch; ///< max allowed prefetch + + explicit FileReaderBuffer(uint64_t mpf) + : bl_off(0), + pos(0), + max_prefetch(mpf) {} + + uint64_t get_buf_end() { + return bl_off + bl.length(); + } + uint64_t get_buf_remaining(uint64_t p) { + if (p >= bl_off && p < bl_off + bl.length()) + return bl_off + bl.length() - p; + return 0; + } + + void skip(size_t n) { + pos += n; + } + void seek(uint64_t offset) { + pos = offset; + } + }; + + struct FileReader { + MEMPOOL_CLASS_HELPERS(); + + FileRef file; + FileReaderBuffer buf; + bool random; + bool ignore_eof; ///< used when reading our log file + + FileReader(FileRef f, uint64_t mpf, bool rand, bool ie) + : file(f), + buf(mpf), + random(rand), + ignore_eof(ie) { + ++file->num_readers; + } + ~FileReader() { + --file->num_readers; + } + }; + + struct FileLock { + MEMPOOL_CLASS_HELPERS(); + + FileRef file; + explicit FileLock(FileRef f) : file(f) {} + }; + +private: + std::mutex lock; + + PerfCounters *logger = nullptr; + + // cache + mempool::bluefs::map dir_map; ///< dirname -> Dir + mempool::bluefs::unordered_map file_map; ///< ino -> File + + // map of dirty files, files of same dirty_seq are grouped into list. + map dirty_files; + + bluefs_super_t super; ///< latest superblock (as last written) + uint64_t ino_last = 0; ///< last assigned ino (this one is in use) + uint64_t log_seq = 0; ///< last used log seq (by current pending log_t) + uint64_t log_seq_stable = 0; ///< last stable/synced log seq + FileWriter *log_writer = 0; ///< writer for the log + bluefs_transaction_t log_t; ///< pending, unwritten log transaction + bool log_flushing = false; ///< true while flushing the log + std::condition_variable log_cond; + + uint64_t new_log_jump_to = 0; + uint64_t old_log_jump_to = 0; + FileRef new_log = nullptr; + FileWriter *new_log_writer = nullptr; + + /* + * There are up to 3 block devices: + * + * BDEV_DB db/ - the primary db device + * BDEV_WAL db.wal/ - a small, fast device, specifically for the WAL + * BDEV_SLOW db.slow/ - a big, slow device, to spill over to as BDEV_DB fills + */ + vector bdev; ///< block devices we can use + vector ioc; ///< IOContexts for bdevs + vector > block_all; ///< extents in bdev we own + vector block_total; ///< sum of block_all + vector alloc; ///< allocators for bdevs + vector> pending_release; ///< extents to release + + void _init_logger(); + void _shutdown_logger(); + void _update_logger_stats(); + + void _init_alloc(); + void _stop_alloc(); + + void _pad_bl(bufferlist& bl); ///< pad bufferlist to block size w/ zeros + + FileRef _get_file(uint64_t ino); + void _drop_link(FileRef f); + + int _allocate(uint8_t bdev, uint64_t len, + mempool::bluefs::vector *ev); + int _flush_range(FileWriter *h, uint64_t offset, uint64_t length); + int _flush(FileWriter *h, bool force); + int _fsync(FileWriter *h, std::unique_lock& l); + + void _claim_completed_aios(FileWriter *h, list *ls); + void wait_for_aio(FileWriter *h); // safe to call without a lock + + int _flush_and_sync_log(std::unique_lock& l, + uint64_t want_seq = 0, + uint64_t jump_to = 0); + uint64_t _estimate_log_size(); + bool _should_compact_log(); + void _compact_log_dump_metadata(bluefs_transaction_t *t); + void _compact_log_sync(); + void _compact_log_async(std::unique_lock& l); + + //void _aio_finish(void *priv); + + void _flush_bdev_safely(FileWriter *h); + void flush_bdev(); // this is safe to call without a lock + + int _preallocate(FileRef f, uint64_t off, uint64_t len); + int _truncate(FileWriter *h, uint64_t off); + + int _read( + FileReader *h, ///< [in] read from here + FileReaderBuffer *buf, ///< [in] reader state + uint64_t offset, ///< [in] offset + size_t len, ///< [in] this many bytes + bufferlist *outbl, ///< [out] optional: reference the result here + char *out); ///< [out] optional: or copy it here + int _read_random( + FileReader *h, ///< [in] read from here + uint64_t offset, ///< [in] offset + size_t len, ///< [in] this many bytes + char *out); ///< [out] optional: or copy it here + + void _invalidate_cache(FileRef f, uint64_t offset, uint64_t length); + + int _open_super(); + int _write_super(); + int _replay(bool noop); ///< replay journal + + FileWriter *_create_writer(FileRef f); + void _close_writer(FileWriter *h); + + // always put the super in the second 4k block. FIXME should this be + // block size independent? + unsigned get_super_offset() { + return 4096; + } + unsigned get_super_length() { + return 4096; + } + +public: + BlueFS(CephContext* cct); + ~BlueFS(); + + // the super is always stored on bdev 0 + int mkfs(uuid_d osd_uuid); + int mount(); + void umount(); + + void collect_metadata(map *pm); + int fsck(); + + uint64_t get_fs_usage(); + uint64_t get_total(unsigned id); + uint64_t get_free(unsigned id); + void get_usage(vector> *usage); // [ ...] + void dump_perf_counters(Formatter *f); + + void dump_block_extents(ostream& out); + + /// get current extents that we own for given block device + int get_block_extents(unsigned id, interval_set *extents); + + int open_for_write( + const string& dir, + const string& file, + FileWriter **h, + bool overwrite); + + int open_for_read( + const string& dir, + const string& file, + FileReader **h, + bool random = false); + + void close_writer(FileWriter *h) { + std::lock_guard l(lock); + _close_writer(h); + } + + int rename(const string& old_dir, const string& old_file, + const string& new_dir, const string& new_file); + + int readdir(const string& dirname, vector *ls); + + int unlink(const string& dirname, const string& filename); + int mkdir(const string& dirname); + int rmdir(const string& dirname); + bool wal_is_rotational(); + + bool dir_exists(const string& dirname); + int stat(const string& dirname, const string& filename, + uint64_t *size, utime_t *mtime); + + int lock_file(const string& dirname, const string& filename, FileLock **p); + int unlock_file(FileLock *l); + + void flush_log(); + void compact_log(); + + /// sync any uncommitted state to disk + void sync_metadata(); + + int add_block_device(unsigned bdev, const string& path); + bool bdev_support_label(unsigned id); + uint64_t get_block_device_size(unsigned bdev); + + /// gift more block space + void add_block_extent(unsigned bdev, uint64_t offset, uint64_t len); + + /// reclaim block space + int reclaim_blocks(unsigned bdev, uint64_t want, + AllocExtentVector *extents); + + void flush(FileWriter *h) { + std::lock_guard l(lock); + _flush(h, false); + } + void flush_range(FileWriter *h, uint64_t offset, uint64_t length) { + std::lock_guard l(lock); + _flush_range(h, offset, length); + } + int fsync(FileWriter *h) { + std::unique_lock l(lock); + return _fsync(h, l); + } + int read(FileReader *h, FileReaderBuffer *buf, uint64_t offset, size_t len, + bufferlist *outbl, char *out) { + // no need to hold the global lock here; we only touch h and + // h->file, and read vs write or delete is already protected (via + // atomics and asserts). + return _read(h, buf, offset, len, outbl, out); + } + int read_random(FileReader *h, uint64_t offset, size_t len, + char *out) { + // no need to hold the global lock here; we only touch h and + // h->file, and read vs write or delete is already protected (via + // atomics and asserts). + return _read_random(h, offset, len, out); + } + void invalidate_cache(FileRef f, uint64_t offset, uint64_t len) { + std::lock_guard l(lock); + _invalidate_cache(f, offset, len); + } + int preallocate(FileRef f, uint64_t offset, uint64_t len) { + std::lock_guard l(lock); + return _preallocate(f, offset, len); + } + int truncate(FileWriter *h, uint64_t offset) { + std::lock_guard l(lock); + return _truncate(h, offset); + } + +}; + +#endif