diff --git a/src/ceph/src/os/filestore/FileStore.cc b/src/ceph/src/os/filestore/FileStore.cc
new file mode 100644
index 0000000..caac76e
--- /dev/null
+++ b/src/ceph/src/os/filestore/FileStore.cc
@@ -0,0 +1,6027 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil
+ * Copyright (c) 2015 Hewlett-Packard Development Company, L.P.
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "include/compat.h"
+#include "include/int_types.h"
+#include "boost/tuple/tuple.hpp"
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#if defined(__linux__)
+#include 
+#endif
+
+#include 
+#include 
+
+#include "include/linux_fiemap.h"
+
+#include "common/xattr.h"
+#include "chain_xattr.h"
+
+#if defined(DARWIN) || defined(__FreeBSD__)
+#include 
+#include 
+#endif // DARWIN
+
+
+#include 
+#include 
+
+#include "FileStore.h"
+#include "GenericFileStoreBackend.h"
+#include "BtrfsFileStoreBackend.h"
+#include "XfsFileStoreBackend.h"
+#include "ZFSFileStoreBackend.h"
+#include "common/BackTrace.h"
+#include "include/types.h"
+#include "FileJournal.h"
+
+#include "osd/osd_types.h"
+#include "include/color.h"
+#include "include/buffer.h"
+
+#include "common/Timer.h"
+#include "common/debug.h"
+#include "common/errno.h"
+#include "common/run_cmd.h"
+#include "common/safe_io.h"
+#include "common/perf_counters.h"
+#include "common/sync_filesystem.h"
+#include "common/fd.h"
+#include "HashIndex.h"
+#include "DBObjectMap.h"
+#include "kv/KeyValueDB.h"
+
+#include "common/ceph_crypto.h"
+using ceph::crypto::SHA1;
+
+#include "include/assert.h"
+
+#include "common/config.h"
+#include "common/blkdev.h"
+
+#ifdef WITH_LTTNG
+#define TRACEPOINT_DEFINE
+#define TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#include "tracing/objectstore.h"
+#undef TRACEPOINT_PROBE_DYNAMIC_LINKAGE
+#undef TRACEPOINT_DEFINE
+#else
+#define tracepoint(...)
+#endif
+
+#define dout_context cct
+#define dout_subsys ceph_subsys_filestore
+#undef dout_prefix
+#define dout_prefix *_dout << "filestore(" << basedir << ") "
+
+#define COMMIT_SNAP_ITEM "snap_%llu"
+#define CLUSTER_SNAP_ITEM "clustersnap_%s"
+
+#define REPLAY_GUARD_XATTR "user.cephos.seq"
+#define GLOBAL_REPLAY_GUARD_XATTR "user.cephos.gseq"
+
+// The XATTR_SPILL_OUT_NAME xattr records whether an object's xattrs spill
+// over into DBObjectMap: if XATTR_SPILL_OUT_NAME exists in the file xattrs
+// and its value says "no", there are no xattrs in DBObjectMap.
+#define XATTR_SPILL_OUT_NAME "user.cephos.spill_out"
+#define XATTR_NO_SPILL_OUT "0"
+#define XATTR_SPILL_OUT "1"
+#define __FUNC__ __func__ << "(" << __LINE__ << ")"
+
+// Initial features in new superblock.
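+// (The initial compat set is empty; the features this FileStore supports are
+// added in get_fs_supported_compat_set() below.)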
+static CompatSet get_fs_initial_compat_set() { + CompatSet::FeatureSet ceph_osd_feature_compat; + CompatSet::FeatureSet ceph_osd_feature_ro_compat; + CompatSet::FeatureSet ceph_osd_feature_incompat; + return CompatSet(ceph_osd_feature_compat, ceph_osd_feature_ro_compat, + ceph_osd_feature_incompat); +} + +//Features are added here that this FileStore supports. +static CompatSet get_fs_supported_compat_set() { + CompatSet compat = get_fs_initial_compat_set(); + //Any features here can be set in code, but not in initial superblock + compat.incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS); + return compat; +} + +int FileStore::validate_hobject_key(const hobject_t &obj) const +{ + unsigned len = LFNIndex::get_max_escaped_name_len(obj); + return len > m_filestore_max_xattr_value_size ? -ENAMETOOLONG : 0; +} + +int FileStore::get_block_device_fsid(CephContext* cct, const string& path, + uuid_d *fsid) +{ + // make sure we don't try to use aio or direct_io (and get annoying + // error messages from failing to do so); performance implications + // should be irrelevant for this use + FileJournal j(cct, *fsid, 0, 0, path.c_str(), false, false); + return j.peek_fsid(*fsid); +} + +void FileStore::FSPerfTracker::update_from_perfcounters( + PerfCounters &logger) +{ + os_commit_latency.consume_next( + logger.get_tavg_ms( + l_filestore_journal_latency)); + os_apply_latency.consume_next( + logger.get_tavg_ms( + l_filestore_apply_latency)); +} + + +ostream& operator<<(ostream& out, const FileStore::OpSequencer& s) +{ + return out << *s.parent; +} + +int FileStore::get_cdir(const coll_t& cid, char *s, int len) +{ + const string &cid_str(cid.to_str()); + return snprintf(s, len, "%s/current/%s", basedir.c_str(), cid_str.c_str()); +} + +int FileStore::get_index(const coll_t& cid, Index *index) +{ + int r = index_manager.get_index(cid, basedir, index); + assert(!m_filestore_fail_eio || r != -EIO); + return r; +} + +int FileStore::init_index(const coll_t& cid) +{ + char path[PATH_MAX]; + get_cdir(cid, path, sizeof(path)); + int r = index_manager.init_index(cid, path, target_version); + assert(!m_filestore_fail_eio || r != -EIO); + return r; +} + +int FileStore::lfn_find(const ghobject_t& oid, const Index& index, IndexedPath *path) +{ + IndexedPath path2; + if (!path) + path = &path2; + int r, exist; + assert(NULL != index.index); + r = (index.index)->lookup(oid, path, &exist); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + if (!exist) + return -ENOENT; + return 0; +} + +int FileStore::lfn_truncate(const coll_t& cid, const ghobject_t& oid, off_t length) +{ + FDRef fd; + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) + return r; + r = ::ftruncate(**fd, length); + if (r < 0) + r = -errno; + if (r >= 0 && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_truncate(**fd, length); + assert(rc >= 0); + } + lfn_close(fd); + assert(!m_filestore_fail_eio || r != -EIO); + return r; +} + +int FileStore::lfn_stat(const coll_t& cid, const ghobject_t& oid, struct stat *buf) +{ + IndexedPath path; + Index index; + int r = get_index(cid, &index); + if (r < 0) + return r; + + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + + r = lfn_find(oid, index, &path); + if (r < 0) + return r; + r = ::stat(path->path(), buf); + if (r < 0) + r = -errno; + return r; +} + +int FileStore::lfn_open(const coll_t& cid, + const ghobject_t& oid, + bool create, + FDRef *outfd, + Index *index) +{ + assert(outfd); + int r = 0; + bool need_lock = true; + int flags = O_RDWR; + + 
if (create) + flags |= O_CREAT; + if (cct->_conf->filestore_odsync_write) { + flags |= O_DSYNC; + } + + Index index2; + if (!index) { + index = &index2; + } + if (!((*index).index)) { + r = get_index(cid, index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + return r; + } + } else { + need_lock = false; + } + + int fd, exist; + assert(NULL != (*index).index); + if (need_lock) { + ((*index).index)->access_lock.get_write(); + } + if (!replaying) { + *outfd = fdcache.lookup(oid); + if (*outfd) { + if (need_lock) { + ((*index).index)->access_lock.put_write(); + } + return 0; + } + } + + + IndexedPath path2; + IndexedPath *path = &path2; + + r = (*index)->lookup(oid, path, &exist); + if (r < 0) { + derr << "could not find " << oid << " in index: " + << cpp_strerror(-r) << dendl; + goto fail; + } + + r = ::open((*path)->path(), flags, 0644); + if (r < 0) { + r = -errno; + dout(10) << "error opening file " << (*path)->path() << " with flags=" + << flags << ": " << cpp_strerror(-r) << dendl; + goto fail; + } + fd = r; + if (create && (!exist)) { + r = (*index)->created(oid, (*path)->path()); + if (r < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << "error creating " << oid << " (" << (*path)->path() + << ") in index: " << cpp_strerror(-r) << dendl; + goto fail; + } + r = chain_fsetxattr( + fd, XATTR_SPILL_OUT_NAME, + XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT)); + if (r < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << "error setting spillout xattr for oid " << oid << " (" << (*path)->path() + << "):" << cpp_strerror(-r) << dendl; + goto fail; + } + } + + if (!replaying) { + bool existed; + *outfd = fdcache.add(oid, fd, &existed); + if (existed) { + TEMP_FAILURE_RETRY(::close(fd)); + } + } else { + *outfd = std::make_shared(fd); + } + + if (need_lock) { + ((*index).index)->access_lock.put_write(); + } + + return 0; + + fail: + + if (need_lock) { + ((*index).index)->access_lock.put_write(); + } + + assert(!m_filestore_fail_eio || r != -EIO); + return r; +} + +void FileStore::lfn_close(FDRef fd) +{ +} + +int FileStore::lfn_link(const coll_t& c, const coll_t& newcid, const ghobject_t& o, const ghobject_t& newoid) +{ + Index index_new, index_old; + IndexedPath path_new, path_old; + int exist; + int r; + bool index_same = false; + if (c < newcid) { + r = get_index(newcid, &index_new); + if (r < 0) + return r; + r = get_index(c, &index_old); + if (r < 0) + return r; + } else if (c == newcid) { + r = get_index(c, &index_old); + if (r < 0) + return r; + index_new = index_old; + index_same = true; + } else { + r = get_index(c, &index_old); + if (r < 0) + return r; + r = get_index(newcid, &index_new); + if (r < 0) + return r; + } + + assert(NULL != index_old.index); + assert(NULL != index_new.index); + + if (!index_same) { + + RWLock::RLocker l1((index_old.index)->access_lock); + + r = index_old->lookup(o, &path_old, &exist); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + if (!exist) + return -ENOENT; + + RWLock::WLocker l2((index_new.index)->access_lock); + + r = index_new->lookup(newoid, &path_new, &exist); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + if (exist) + return -EEXIST; + + dout(25) << __FUNC__ << ": path_old: " << path_old << dendl; + dout(25) << __FUNC__ << ": path_new: " << path_new << dendl; + r = ::link(path_old->path(), path_new->path()); + if (r < 0) + return -errno; + + r = index_new->created(newoid, path_new->path()); + if (r < 0) { + assert(!m_filestore_fail_eio 
|| r != -EIO); + return r; + } + } else { + RWLock::WLocker l1((index_old.index)->access_lock); + + r = index_old->lookup(o, &path_old, &exist); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + if (!exist) + return -ENOENT; + + r = index_new->lookup(newoid, &path_new, &exist); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + if (exist) + return -EEXIST; + + dout(25) << __FUNC__ << ": path_old: " << path_old << dendl; + dout(25) << __FUNC__ << ": path_new: " << path_new << dendl; + r = ::link(path_old->path(), path_new->path()); + if (r < 0) + return -errno; + + // make sure old fd for unlinked/overwritten file is gone + fdcache.clear(newoid); + + r = index_new->created(newoid, path_new->path()); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + } + return 0; +} + +int FileStore::lfn_unlink(const coll_t& cid, const ghobject_t& o, + const SequencerPosition &spos, + bool force_clear_omap) +{ + Index index; + int r = get_index(cid, &index); + if (r < 0) { + dout(25) << __FUNC__ << ": get_index failed " << cpp_strerror(r) << dendl; + return r; + } + + assert(NULL != index.index); + RWLock::WLocker l((index.index)->access_lock); + + { + IndexedPath path; + int hardlink; + r = index->lookup(o, &path, &hardlink); + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + + if (!force_clear_omap) { + if (hardlink == 0 || hardlink == 1) { + force_clear_omap = true; + } + } + if (force_clear_omap) { + dout(20) << __FUNC__ << ": clearing omap on " << o + << " in cid " << cid << dendl; + r = object_map->clear(o, &spos); + if (r < 0 && r != -ENOENT) { + dout(25) << __FUNC__ << ": omap clear failed " << cpp_strerror(r) << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + if (cct->_conf->filestore_debug_inject_read_err) { + debug_obj_on_delete(o); + } + if (!m_disable_wbthrottle) { + wbthrottle.clear_object(o); // should be only non-cache ref + } + fdcache.clear(o); + } else { + /* Ensure that replay of this op doesn't result in the object_map + * going away. 
+ */ + if (!backend->can_checkpoint()) + object_map->sync(&o, &spos); + } + if (hardlink == 0) { + if (!m_disable_wbthrottle) { + wbthrottle.clear_object(o); // should be only non-cache ref + } + return 0; + } + } + r = index->unlink(o); + if (r < 0) { + dout(25) << __FUNC__ << ": index unlink failed " << cpp_strerror(r) << dendl; + return r; + } + return 0; +} + +FileStore::FileStore(CephContext* cct, const std::string &base, + const std::string &jdev, osflagbits_t flags, + const char *name, bool do_update) : + JournalingObjectStore(cct, base), + internal_name(name), + basedir(base), journalpath(jdev), + generic_flags(flags), + blk_size(0), + fsid_fd(-1), op_fd(-1), + basedir_fd(-1), current_fd(-1), + backend(NULL), + index_manager(cct, do_update), + lock("FileStore::lock"), + force_sync(false), + sync_entry_timeo_lock("FileStore::sync_entry_timeo_lock"), + timer(cct, sync_entry_timeo_lock), + stop(false), sync_thread(this), + fdcache(cct), + wbthrottle(cct), + next_osr_id(0), + m_disable_wbthrottle(cct->_conf->filestore_odsync_write || + !cct->_conf->filestore_wbthrottle_enable), + throttle_ops(cct, "filestore_ops", cct->_conf->filestore_caller_concurrency), + throttle_bytes(cct, "filestore_bytes", cct->_conf->filestore_caller_concurrency), + m_ondisk_finisher_num(cct->_conf->filestore_ondisk_finisher_threads), + m_apply_finisher_num(cct->_conf->filestore_apply_finisher_threads), + op_tp(cct, "FileStore::op_tp", "tp_fstore_op", cct->_conf->filestore_op_threads, "filestore_op_threads"), + op_wq(this, cct->_conf->filestore_op_thread_timeout, + cct->_conf->filestore_op_thread_suicide_timeout, &op_tp), + logger(NULL), + trace_endpoint("0.0.0.0", 0, "FileStore"), + read_error_lock("FileStore::read_error_lock"), + m_filestore_commit_timeout(cct->_conf->filestore_commit_timeout), + m_filestore_journal_parallel(cct->_conf->filestore_journal_parallel ), + m_filestore_journal_trailing(cct->_conf->filestore_journal_trailing), + m_filestore_journal_writeahead(cct->_conf->filestore_journal_writeahead), + m_filestore_fiemap_threshold(cct->_conf->filestore_fiemap_threshold), + m_filestore_max_sync_interval(cct->_conf->filestore_max_sync_interval), + m_filestore_min_sync_interval(cct->_conf->filestore_min_sync_interval), + m_filestore_fail_eio(cct->_conf->filestore_fail_eio), + m_filestore_fadvise(cct->_conf->filestore_fadvise), + do_update(do_update), + m_journal_dio(cct->_conf->journal_dio), + m_journal_aio(cct->_conf->journal_aio), + m_journal_force_aio(cct->_conf->journal_force_aio), + m_osd_rollback_to_cluster_snap(cct->_conf->osd_rollback_to_cluster_snap), + m_osd_use_stale_snap(cct->_conf->osd_use_stale_snap), + m_filestore_do_dump(false), + m_filestore_dump_fmt(true), + m_filestore_sloppy_crc(cct->_conf->filestore_sloppy_crc), + m_filestore_sloppy_crc_block_size(cct->_conf->filestore_sloppy_crc_block_size), + m_filestore_max_alloc_hint_size(cct->_conf->filestore_max_alloc_hint_size), + m_fs_type(0), + m_filestore_max_inline_xattr_size(0), + m_filestore_max_inline_xattrs(0), + m_filestore_max_xattr_value_size(0) +{ + m_filestore_kill_at = cct->_conf->filestore_kill_at; + for (int i = 0; i < m_ondisk_finisher_num; ++i) { + ostringstream oss; + oss << "filestore-ondisk-" << i; + Finisher *f = new Finisher(cct, oss.str(), "fn_odsk_fstore"); + ondisk_finishers.push_back(f); + } + for (int i = 0; i < m_apply_finisher_num; ++i) { + ostringstream oss; + oss << "filestore-apply-" << i; + Finisher *f = new Finisher(cct, oss.str(), "fn_appl_fstore"); + apply_finishers.push_back(f); + } + + ostringstream 
oss; + oss << basedir << "/current"; + current_fn = oss.str(); + + ostringstream sss; + sss << basedir << "/current/commit_op_seq"; + current_op_seq_fn = sss.str(); + + ostringstream omss; + if (cct->_conf->filestore_omap_backend_path != "") { + omap_dir = cct->_conf->filestore_omap_backend_path; + } else { + omss << basedir << "/current/omap"; + omap_dir = omss.str(); + } + + // initialize logger + PerfCountersBuilder plb(cct, internal_name, l_filestore_first, l_filestore_last); + + plb.add_u64(l_filestore_journal_queue_ops, "journal_queue_ops", "Operations in journal queue"); + plb.add_u64(l_filestore_journal_ops, "journal_ops", "Active journal entries to be applied"); + plb.add_u64(l_filestore_journal_queue_bytes, "journal_queue_bytes", "Size of journal queue"); + plb.add_u64(l_filestore_journal_bytes, "journal_bytes", "Active journal operation size to be applied"); + plb.add_time_avg(l_filestore_journal_latency, "journal_latency", "Average journal queue completing latency"); + plb.add_u64_counter(l_filestore_journal_wr, "journal_wr", "Journal write IOs"); + plb.add_u64_avg(l_filestore_journal_wr_bytes, "journal_wr_bytes", "Journal data written"); + plb.add_u64(l_filestore_op_queue_max_ops, "op_queue_max_ops", "Max operations in writing to FS queue"); + plb.add_u64(l_filestore_op_queue_ops, "op_queue_ops", "Operations in writing to FS queue"); + plb.add_u64_counter(l_filestore_ops, "ops", "Operations written to store"); + plb.add_u64(l_filestore_op_queue_max_bytes, "op_queue_max_bytes", "Max data in writing to FS queue"); + plb.add_u64(l_filestore_op_queue_bytes, "op_queue_bytes", "Size of writing to FS queue"); + plb.add_u64_counter(l_filestore_bytes, "bytes", "Data written to store"); + plb.add_time_avg(l_filestore_apply_latency, "apply_latency", "Apply latency"); + plb.add_u64(l_filestore_committing, "committing", "Is currently committing"); + + plb.add_u64_counter(l_filestore_commitcycle, "commitcycle", "Commit cycles"); + plb.add_time_avg(l_filestore_commitcycle_interval, "commitcycle_interval", "Average interval between commits"); + plb.add_time_avg(l_filestore_commitcycle_latency, "commitcycle_latency", "Average latency of commit"); + plb.add_u64_counter(l_filestore_journal_full, "journal_full", "Journal writes while full"); + plb.add_time_avg(l_filestore_queue_transaction_latency_avg, "queue_transaction_latency_avg", "Store operation queue latency"); + plb.add_time(l_filestore_sync_pause_max_lat, "sync_pause_max_latency", "Max latency of op_wq pause before syncfs"); + + logger = plb.create_perf_counters(); + + cct->get_perfcounters_collection()->add(logger); + cct->_conf->add_observer(this); + + superblock.compat_features = get_fs_initial_compat_set(); +} + +FileStore::~FileStore() +{ + for (vector::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + delete *it; + *it = NULL; + } + for (vector::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + delete *it; + *it = NULL; + } + cct->_conf->remove_observer(this); + cct->get_perfcounters_collection()->remove(logger); + + if (journal) + journal->logger = NULL; + delete logger; + + if (m_filestore_do_dump) { + dump_stop(); + } +} + +static void get_attrname(const char *name, char *buf, int len) +{ + snprintf(buf, len, "user.ceph.%s", name); +} + +bool parse_attrname(char **name) +{ + if (strncmp(*name, "user.ceph.", 10) == 0) { + *name += 10; + return true; + } + return false; +} + +void FileStore::collect_metadata(map *pm) +{ + char partition_path[PATH_MAX]; + char 
dev_node[PATH_MAX]; + int rc = 0; + + (*pm)["filestore_backend"] = backend->get_name(); + ostringstream ss; + ss << "0x" << std::hex << m_fs_type << std::dec; + (*pm)["filestore_f_type"] = ss.str(); + + if (cct->_conf->filestore_collect_device_partition_information) { + rc = get_device_by_fd(fsid_fd, partition_path, dev_node, PATH_MAX); + } else { + rc = -EINVAL; + } + + switch (rc) { + case -EOPNOTSUPP: + case -EINVAL: + (*pm)["backend_filestore_partition_path"] = "unknown"; + (*pm)["backend_filestore_dev_node"] = "unknown"; + break; + case -ENODEV: + (*pm)["backend_filestore_partition_path"] = string(partition_path); + (*pm)["backend_filestore_dev_node"] = "unknown"; + break; + default: + (*pm)["backend_filestore_partition_path"] = string(partition_path); + (*pm)["backend_filestore_dev_node"] = string(dev_node); + } +} + +int FileStore::statfs(struct store_statfs_t *buf0) +{ + struct statfs buf; + buf0->reset(); + if (::statfs(basedir.c_str(), &buf) < 0) { + int r = -errno; + assert(!m_filestore_fail_eio || r != -EIO); + assert(r != -ENOENT); + return r; + } + buf0->total = buf.f_blocks * buf.f_bsize; + buf0->available = buf.f_bavail * buf.f_bsize; + // Adjust for writes pending in the journal + if (journal) { + uint64_t estimate = journal->get_journal_size_estimate(); + if (buf0->available > estimate) + buf0->available -= estimate; + else + buf0->available = 0; + } + return 0; +} + + +void FileStore::new_journal() +{ + if (journalpath.length()) { + dout(10) << "open_journal at " << journalpath << dendl; + journal = new FileJournal(cct, fsid, &finisher, &sync_cond, + journalpath.c_str(), + m_journal_dio, m_journal_aio, + m_journal_force_aio); + if (journal) + journal->logger = logger; + } + return; +} + +int FileStore::dump_journal(ostream& out) +{ + int r; + + if (!journalpath.length()) + return -EINVAL; + + FileJournal *journal = new FileJournal(cct, fsid, &finisher, &sync_cond, journalpath.c_str(), m_journal_dio); + r = journal->dump(out); + delete journal; + return r; +} + +FileStoreBackend *FileStoreBackend::create(long f_type, FileStore *fs) +{ + switch (f_type) { +#if defined(__linux__) + case BTRFS_SUPER_MAGIC: + return new BtrfsFileStoreBackend(fs); +# ifdef HAVE_LIBXFS + case XFS_SUPER_MAGIC: + return new XfsFileStoreBackend(fs); +# endif +#endif +#ifdef HAVE_LIBZFS + case ZFS_SUPER_MAGIC: + return new ZFSFileStoreBackend(fs); +#endif + default: + return new GenericFileStoreBackend(fs); + } +} + +void FileStore::create_backend(long f_type) +{ + m_fs_type = f_type; + + assert(backend == NULL); + backend = FileStoreBackend::create(f_type, this); + + dout(0) << "backend " << backend->get_name() + << " (magic 0x" << std::hex << f_type << std::dec << ")" + << dendl; + + switch (f_type) { +#if defined(__linux__) + case BTRFS_SUPER_MAGIC: + if (!m_disable_wbthrottle){ + wbthrottle.set_fs(WBThrottle::BTRFS); + } + break; + + case XFS_SUPER_MAGIC: + // wbthrottle is constructed with fs(WBThrottle::XFS) + break; +#endif + } + + set_xattr_limits_via_conf(); +} + +int FileStore::mkfs() +{ + int ret = 0; + char fsid_fn[PATH_MAX]; + char fsid_str[40]; + uuid_d old_fsid; + uuid_d old_omap_fsid; + + dout(1) << "mkfs in " << basedir << dendl; + basedir_fd = ::open(basedir.c_str(), O_RDONLY); + if (basedir_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open base dir " << basedir << ": " << cpp_strerror(ret) << dendl; + return ret; + } + + // open+lock fsid + snprintf(fsid_fn, sizeof(fsid_fn), "%s/fsid", basedir.c_str()); + fsid_fd = ::open(fsid_fn, O_RDWR|O_CREAT, 0644); + if 
(fsid_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open " << fsid_fn << ": " << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + if (lock_fsid() < 0) { + ret = -EBUSY; + goto close_fsid_fd; + } + + if (read_fsid(fsid_fd, &old_fsid) < 0 || old_fsid.is_zero()) { + if (fsid.is_zero()) { + fsid.generate_random(); + dout(1) << __FUNC__ << ": generated fsid " << fsid << dendl; + } else { + dout(1) << __FUNC__ << ": using provided fsid " << fsid << dendl; + } + + fsid.print(fsid_str); + strcat(fsid_str, "\n"); + ret = ::ftruncate(fsid_fd, 0); + if (ret < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to truncate fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + ret = safe_write(fsid_fd, fsid_str, strlen(fsid_str)); + if (ret < 0) { + derr << __FUNC__ << ": failed to write fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + if (::fsync(fsid_fd) < 0) { + ret = -errno; + derr << __FUNC__ << ": close failed: can't write fsid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + dout(10) << __FUNC__ << ": fsid is " << fsid << dendl; + } else { + if (!fsid.is_zero() && fsid != old_fsid) { + derr << __FUNC__ << ": on-disk fsid " << old_fsid << " != provided " << fsid << dendl; + ret = -EINVAL; + goto close_fsid_fd; + } + fsid = old_fsid; + dout(1) << __FUNC__ << ": fsid is already set to " << fsid << dendl; + } + + // version stamp + ret = write_version_stamp(); + if (ret < 0) { + derr << __FUNC__ << ": write_version_stamp() failed: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + // superblock + superblock.omap_backend = cct->_conf->filestore_omap_backend; + ret = write_superblock(); + if (ret < 0) { + derr << __FUNC__ << ": write_superblock() failed: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + struct statfs basefs; + ret = ::fstatfs(basedir_fd, &basefs); + if (ret < 0) { + ret = -errno; + derr << __FUNC__ << ": cannot fstatfs basedir " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + +#if defined(__linux__) + if (basefs.f_type == BTRFS_SUPER_MAGIC && + !g_ceph_context->check_experimental_feature_enabled("btrfs")) { + derr << __FUNC__ << ": deprecated btrfs support is not enabled" << dendl; + goto close_fsid_fd; + } +#endif + + create_backend(basefs.f_type); + + ret = backend->create_current(); + if (ret < 0) { + derr << __FUNC__ << ": failed to create current/ " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + // write initial op_seq + { + uint64_t initial_seq = 0; + int fd = read_op_seq(&initial_seq); + if (fd < 0) { + ret = fd; + derr << __FUNC__ << ": failed to create " << current_op_seq_fn << ": " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + if (initial_seq == 0) { + ret = write_op_seq(fd, 1); + if (ret < 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << __FUNC__ << ": failed to write to " << current_op_seq_fn << ": " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + if (backend->can_checkpoint()) { + // create snap_1 too + current_fd = ::open(current_fn.c_str(), O_RDONLY); + assert(current_fd >= 0); + char s[NAME_MAX]; + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, 1ull); + ret = backend->create_checkpoint(s, NULL); + VOID_TEMP_FAILURE_RETRY(::close(current_fd)); + if (ret < 0 && ret != -EEXIST) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + derr << __FUNC__ << ": failed to create snap_1: " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + } + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + ret = 
KeyValueDB::test_init(superblock.omap_backend, omap_dir); + if (ret < 0) { + derr << __FUNC__ << ": failed to create " << cct->_conf->filestore_omap_backend << dendl; + goto close_fsid_fd; + } + // create fsid under omap + // open+lock fsid + int omap_fsid_fd; + char omap_fsid_fn[PATH_MAX]; + snprintf(omap_fsid_fn, sizeof(omap_fsid_fn), "%s/osd_uuid", omap_dir.c_str()); + omap_fsid_fd = ::open(omap_fsid_fn, O_RDWR|O_CREAT, 0644); + if (omap_fsid_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open " << omap_fsid_fn << ": " << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } + + if (read_fsid(omap_fsid_fd, &old_omap_fsid) < 0 || old_omap_fsid.is_zero()) { + assert(!fsid.is_zero()); + fsid.print(fsid_str); + strcat(fsid_str, "\n"); + ret = ::ftruncate(omap_fsid_fd, 0); + if (ret < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to truncate fsid: " + << cpp_strerror(ret) << dendl; + goto close_omap_fsid_fd; + } + ret = safe_write(omap_fsid_fd, fsid_str, strlen(fsid_str)); + if (ret < 0) { + derr << __FUNC__ << ": failed to write fsid: " + << cpp_strerror(ret) << dendl; + goto close_omap_fsid_fd; + } + dout(10) << __FUNC__ << ": write success, fsid:" << fsid_str << ", ret:" << ret << dendl; + if (::fsync(omap_fsid_fd) < 0) { + ret = -errno; + derr << __FUNC__ << ": close failed: can't write fsid: " + << cpp_strerror(ret) << dendl; + goto close_omap_fsid_fd; + } + dout(10) << "mkfs omap fsid is " << fsid << dendl; + } else { + if (fsid != old_omap_fsid) { + derr << __FUNC__ << ": " << omap_fsid_fn + << " has existed omap fsid " << old_omap_fsid + << " != expected osd fsid " << fsid + << dendl; + ret = -EINVAL; + goto close_omap_fsid_fd; + } + dout(1) << __FUNC__ << ": omap fsid is already set to " << fsid << dendl; + } + + dout(1) << cct->_conf->filestore_omap_backend << " db exists/created" << dendl; + + // journal? + ret = mkjournal(); + if (ret) + goto close_omap_fsid_fd; + + ret = write_meta("type", "filestore"); + if (ret) + goto close_omap_fsid_fd; + + dout(1) << "mkfs done in " << basedir << dendl; + ret = 0; + + close_omap_fsid_fd: + VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd)); + close_fsid_fd: + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + close_basedir_fd: + VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); + delete backend; + backend = NULL; + return ret; +} + +int FileStore::mkjournal() +{ + // read fsid + int ret; + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); + int fd = ::open(fn, O_RDONLY, 0644); + if (fd < 0) { + int err = errno; + derr << __FUNC__ << ": open error: " << cpp_strerror(err) << dendl; + return -err; + } + ret = read_fsid(fd, &fsid); + if (ret < 0) { + derr << __FUNC__ << ": read error: " << cpp_strerror(ret) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return ret; + } + VOID_TEMP_FAILURE_RETRY(::close(fd)); + + ret = 0; + + new_journal(); + if (journal) { + ret = journal->check(); + if (ret < 0) { + ret = journal->create(); + if (ret) + derr << __FUNC__ << ": error creating journal on " << journalpath + << ": " << cpp_strerror(ret) << dendl; + else + dout(0) << __FUNC__ << ": created journal on " << journalpath << dendl; + } + delete journal; + journal = 0; + } + return ret; +} + +int FileStore::read_fsid(int fd, uuid_d *uuid) +{ + char fsid_str[40]; + memset(fsid_str, 0, sizeof(fsid_str)); + int ret = safe_read(fd, fsid_str, sizeof(fsid_str)); + if (ret < 0) + return ret; + if (ret == 8) { + // old 64-bit fsid... mirror it. 
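+    // (the legacy 8-byte value is copied into both the low and high halves
+    // of the 16-byte uuid)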
+ *(uint64_t*)&uuid->bytes()[0] = *(uint64_t*)fsid_str; + *(uint64_t*)&uuid->bytes()[8] = *(uint64_t*)fsid_str; + return 0; + } + + if (ret > 36) + fsid_str[36] = 0; + else + fsid_str[ret] = 0; + if (!uuid->parse(fsid_str)) + return -EINVAL; + return 0; +} + +int FileStore::lock_fsid() +{ + struct flock l; + memset(&l, 0, sizeof(l)); + l.l_type = F_WRLCK; + l.l_whence = SEEK_SET; + l.l_start = 0; + l.l_len = 0; + int r = ::fcntl(fsid_fd, F_SETLK, &l); + if (r < 0) { + int err = errno; + dout(0) << __FUNC__ << ": failed to lock " << basedir << "/fsid, is another ceph-osd still running? " + << cpp_strerror(err) << dendl; + return -err; + } + return 0; +} + +bool FileStore::test_mount_in_use() +{ + dout(5) << __FUNC__ << ": basedir " << basedir << " journal " << journalpath << dendl; + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/fsid", basedir.c_str()); + + // verify fs isn't in use + + fsid_fd = ::open(fn, O_RDWR, 0644); + if (fsid_fd < 0) + return 0; // no fsid, ok. + bool inuse = lock_fsid() < 0; + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + return inuse; +} + +bool FileStore::is_rotational() +{ + bool rotational; + if (backend) { + rotational = backend->is_rotational(); + } else { + int fd = ::open(basedir.c_str(), O_RDONLY); + if (fd < 0) + return true; + struct statfs st; + int r = ::fstatfs(fd, &st); + ::close(fd); + if (r < 0) { + return true; + } + create_backend(st.f_type); + rotational = backend->is_rotational(); + delete backend; + backend = NULL; + } + dout(10) << __func__ << " " << (int)rotational << dendl; + return rotational; +} + +bool FileStore::is_journal_rotational() +{ + bool journal_rotational; + if (backend) { + journal_rotational = backend->is_journal_rotational(); + } else { + int fd = ::open(journalpath.c_str(), O_RDONLY); + if (fd < 0) + return true; + struct statfs st; + int r = ::fstatfs(fd, &st); + ::close(fd); + if (r < 0) { + return true; + } + create_backend(st.f_type); + journal_rotational = backend->is_journal_rotational(); + delete backend; + backend = NULL; + } + dout(10) << __func__ << " " << (int)journal_rotational << dendl; + return journal_rotational; +} + +int FileStore::_detect_fs() +{ + struct statfs st; + int r = ::fstatfs(basedir_fd, &st); + if (r < 0) + return -errno; + + blk_size = st.f_bsize; + +#if defined(__linux__) + if (st.f_type == BTRFS_SUPER_MAGIC && + !g_ceph_context->check_experimental_feature_enabled("btrfs")) { + derr <<__FUNC__ << ": deprecated btrfs support is not enabled" << dendl; + return -EPERM; + } +#endif + + create_backend(st.f_type); + + r = backend->detect_features(); + if (r < 0) { + derr << __FUNC__ << ": detect_features error: " << cpp_strerror(r) << dendl; + return r; + } + + // test xattrs + char fn[PATH_MAX]; + int x = rand(); + int y = x+1; + snprintf(fn, sizeof(fn), "%s/xattr_test", basedir.c_str()); + int tmpfd = ::open(fn, O_CREAT|O_WRONLY|O_TRUNC, 0700); + if (tmpfd < 0) { + int ret = -errno; + derr << __FUNC__ << ": unable to create " << fn << ": " << cpp_strerror(ret) << dendl; + return ret; + } + + int ret = chain_fsetxattr(tmpfd, "user.test", &x, sizeof(x)); + if (ret >= 0) + ret = chain_fgetxattr(tmpfd, "user.test", &y, sizeof(y)); + if ((ret < 0) || (x != y)) { + derr << "Extended attributes don't appear to work. "; + if (ret) + *_dout << "Got error " + cpp_strerror(ret) + ". "; + *_dout << "If you are using ext3 or ext4, be sure to mount the underlying " + << "file system with the 'user_xattr' option." 
<< dendl; + ::unlink(fn); + VOID_TEMP_FAILURE_RETRY(::close(tmpfd)); + return -ENOTSUP; + } + + char buf[1000]; + memset(buf, 0, sizeof(buf)); // shut up valgrind + chain_fsetxattr(tmpfd, "user.test", &buf, sizeof(buf)); + chain_fsetxattr(tmpfd, "user.test2", &buf, sizeof(buf)); + chain_fsetxattr(tmpfd, "user.test3", &buf, sizeof(buf)); + chain_fsetxattr(tmpfd, "user.test4", &buf, sizeof(buf)); + ret = chain_fsetxattr(tmpfd, "user.test5", &buf, sizeof(buf)); + if (ret == -ENOSPC) { + dout(0) << "limited size xattrs" << dendl; + } + chain_fremovexattr(tmpfd, "user.test"); + chain_fremovexattr(tmpfd, "user.test2"); + chain_fremovexattr(tmpfd, "user.test3"); + chain_fremovexattr(tmpfd, "user.test4"); + chain_fremovexattr(tmpfd, "user.test5"); + + ::unlink(fn); + VOID_TEMP_FAILURE_RETRY(::close(tmpfd)); + + return 0; +} + +int FileStore::_sanity_check_fs() +{ + // sanity check(s) + + if (((int)m_filestore_journal_writeahead + + (int)m_filestore_journal_parallel + + (int)m_filestore_journal_trailing) > 1) { + dout(0) << "mount ERROR: more than one of filestore journal {writeahead,parallel,trailing} enabled" << dendl; + cerr << TEXT_RED + << " ** WARNING: more than one of 'filestore journal {writeahead,parallel,trailing}'\n" + << " is enabled in ceph.conf. You must choose a single journal mode." + << TEXT_NORMAL << std::endl; + return -EINVAL; + } + + if (!backend->can_checkpoint()) { + if (!journal || !m_filestore_journal_writeahead) { + dout(0) << "mount WARNING: no btrfs, and no journal in writeahead mode; data may be lost" << dendl; + cerr << TEXT_RED + << " ** WARNING: no btrfs AND (no journal OR journal not in writeahead mode)\n" + << " For non-btrfs volumes, a writeahead journal is required to\n" + << " maintain on-disk consistency in the event of a crash. Your conf\n" + << " should include something like:\n" + << " osd journal = /path/to/journal_device_or_file\n" + << " filestore journal writeahead = true\n" + << TEXT_NORMAL; + } + } + + if (!journal) { + dout(0) << "mount WARNING: no journal" << dendl; + cerr << TEXT_YELLOW + << " ** WARNING: No osd journal is configured: write latency may be high.\n" + << " If you will not be using an osd journal, write latency may be\n" + << " relatively high. 
It can be reduced somewhat by lowering\n" + << " filestore_max_sync_interval, but lower values mean lower write\n" + << " throughput, especially with spinning disks.\n" + << TEXT_NORMAL; + } + + return 0; +} + +int FileStore::write_superblock() +{ + bufferlist bl; + ::encode(superblock, bl); + return safe_write_file(basedir.c_str(), "superblock", + bl.c_str(), bl.length()); +} + +int FileStore::read_superblock() +{ + bufferptr bp(PATH_MAX); + int ret = safe_read_file(basedir.c_str(), "superblock", + bp.c_str(), bp.length()); + if (ret < 0) { + if (ret == -ENOENT) { + // If the file doesn't exist write initial CompatSet + return write_superblock(); + } + return ret; + } + + bufferlist bl; + bl.push_back(std::move(bp)); + bufferlist::iterator i = bl.begin(); + ::decode(superblock, i); + return 0; +} + +int FileStore::update_version_stamp() +{ + return write_version_stamp(); +} + +int FileStore::version_stamp_is_valid(uint32_t *version) +{ + bufferptr bp(PATH_MAX); + int ret = safe_read_file(basedir.c_str(), "store_version", + bp.c_str(), bp.length()); + if (ret < 0) { + return ret; + } + bufferlist bl; + bl.push_back(std::move(bp)); + bufferlist::iterator i = bl.begin(); + ::decode(*version, i); + dout(10) << __FUNC__ << ": was " << *version << " vs target " + << target_version << dendl; + if (*version == target_version) + return 1; + else + return 0; +} + +int FileStore::write_version_stamp() +{ + dout(1) << __FUNC__ << ": " << target_version << dendl; + bufferlist bl; + ::encode(target_version, bl); + + return safe_write_file(basedir.c_str(), "store_version", + bl.c_str(), bl.length()); +} + +int FileStore::upgrade() +{ + dout(1) << __FUNC__ << dendl; + uint32_t version; + int r = version_stamp_is_valid(&version); + + if (r == -ENOENT) { + derr << "The store_version file doesn't exist." << dendl; + return -EINVAL; + } + if (r < 0) + return r; + if (r == 1) + return 0; + + if (version < 3) { + derr << "ObjectStore is old at version " << version << ". Please upgrade to firefly v0.80.x, convert your store, and then upgrade." << dendl; + return -EINVAL; + } + + // nothing necessary in FileStore for v3 -> v4 upgrade; we just need to + // open up DBObjectMap with the do_upgrade flag, which we already did. 
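+  // all that remains here is to bump the on-disk store_version stamp to
+  // target_version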
+ update_version_stamp(); + return 0; +} + +int FileStore::read_op_seq(uint64_t *seq) +{ + int op_fd = ::open(current_op_seq_fn.c_str(), O_CREAT|O_RDWR, 0644); + if (op_fd < 0) { + int r = -errno; + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + char s[40]; + memset(s, 0, sizeof(s)); + int ret = safe_read(op_fd, s, sizeof(s) - 1); + if (ret < 0) { + derr << __FUNC__ << ": error reading " << current_op_seq_fn << ": " << cpp_strerror(ret) << dendl; + VOID_TEMP_FAILURE_RETRY(::close(op_fd)); + assert(!m_filestore_fail_eio || ret != -EIO); + return ret; + } + *seq = atoll(s); + return op_fd; +} + +int FileStore::write_op_seq(int fd, uint64_t seq) +{ + char s[30]; + snprintf(s, sizeof(s), "%" PRId64 "\n", seq); + int ret = TEMP_FAILURE_RETRY(::pwrite(fd, s, strlen(s), 0)); + if (ret < 0) { + ret = -errno; + assert(!m_filestore_fail_eio || ret != -EIO); + } + return ret; +} + +int FileStore::mount() +{ + int ret; + char buf[PATH_MAX]; + uint64_t initial_op_seq; + uuid_d omap_fsid; + set cluster_snaps; + CompatSet supported_compat_set = get_fs_supported_compat_set(); + + dout(5) << "basedir " << basedir << " journal " << journalpath << dendl; + + ret = set_throttle_params(); + if (ret != 0) + goto done; + + // make sure global base dir exists + if (::access(basedir.c_str(), R_OK | W_OK)) { + ret = -errno; + derr << __FUNC__ << ": unable to access basedir '" << basedir << "': " + << cpp_strerror(ret) << dendl; + goto done; + } + + // get fsid + snprintf(buf, sizeof(buf), "%s/fsid", basedir.c_str()); + fsid_fd = ::open(buf, O_RDWR, 0644); + if (fsid_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": error opening '" << buf << "': " + << cpp_strerror(ret) << dendl; + goto done; + } + + ret = read_fsid(fsid_fd, &fsid); + if (ret < 0) { + derr << __FUNC__ << ": error reading fsid_fd: " << cpp_strerror(ret) + << dendl; + goto close_fsid_fd; + } + + if (lock_fsid() < 0) { + derr << __FUNC__ << ": lock_fsid failed" << dendl; + ret = -EBUSY; + goto close_fsid_fd; + } + + dout(10) << "mount fsid is " << fsid << dendl; + + + uint32_t version_stamp; + ret = version_stamp_is_valid(&version_stamp); + if (ret < 0) { + derr << __FUNC__ << ": error in version_stamp_is_valid: " + << cpp_strerror(ret) << dendl; + goto close_fsid_fd; + } else if (ret == 0) { + if (do_update || (int)version_stamp < cct->_conf->filestore_update_to) { + derr << __FUNC__ << ": stale version stamp detected: " + << version_stamp + << ". Proceeding, do_update " + << "is set, performing disk format upgrade." + << dendl; + do_update = true; + } else { + ret = -EINVAL; + derr << __FUNC__ << ": stale version stamp " << version_stamp + << ". Please run the FileStore update script before starting the " + << "OSD, or set filestore_update_to to " << target_version + << " (currently " << cct->_conf->filestore_update_to << ")" + << dendl; + goto close_fsid_fd; + } + } + + ret = read_superblock(); + if (ret < 0) { + goto close_fsid_fd; + } + + // Check if this FileStore supports all the necessary features to mount + if (supported_compat_set.compare(superblock.compat_features) == -1) { + derr << __FUNC__ << ": Incompatible features set " + << superblock.compat_features << dendl; + ret = -EINVAL; + goto close_fsid_fd; + } + + // open some dir handles + basedir_fd = ::open(basedir.c_str(), O_RDONLY); + if (basedir_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to open " << basedir << ": " + << cpp_strerror(ret) << dendl; + basedir_fd = -1; + goto close_fsid_fd; + } + + // test for btrfs, xattrs, etc. 
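+  // _detect_fs() also creates the filesystem backend (btrfs/xfs/zfs or generic)
+  // and verifies that xattrs work on the underlying filesystem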
+ ret = _detect_fs(); + if (ret < 0) { + derr << __FUNC__ << ": error in _detect_fs: " + << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + { + list ls; + ret = backend->list_checkpoints(ls); + if (ret < 0) { + derr << __FUNC__ << ": error in _list_snaps: "<< cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + long long unsigned c, prev = 0; + char clustersnap[NAME_MAX]; + for (list::iterator it = ls.begin(); it != ls.end(); ++it) { + if (sscanf(it->c_str(), COMMIT_SNAP_ITEM, &c) == 1) { + assert(c > prev); + prev = c; + snaps.push_back(c); + } else if (sscanf(it->c_str(), CLUSTER_SNAP_ITEM, clustersnap) == 1) + cluster_snaps.insert(*it); + } + } + + if (m_osd_rollback_to_cluster_snap.length() && + cluster_snaps.count(m_osd_rollback_to_cluster_snap) == 0) { + derr << "rollback to cluster snapshot '" << m_osd_rollback_to_cluster_snap << "': not found" << dendl; + ret = -ENOENT; + goto close_basedir_fd; + } + + char nosnapfn[200]; + snprintf(nosnapfn, sizeof(nosnapfn), "%s/nosnap", current_fn.c_str()); + + if (backend->can_checkpoint()) { + if (snaps.empty()) { + dout(0) << __FUNC__ << ": WARNING: no consistent snaps found, store may be in inconsistent state" << dendl; + } else { + char s[NAME_MAX]; + uint64_t curr_seq = 0; + + if (m_osd_rollback_to_cluster_snap.length()) { + derr << TEXT_RED + << " ** NOTE: rolling back to cluster snapshot " << m_osd_rollback_to_cluster_snap << " **" + << TEXT_NORMAL + << dendl; + assert(cluster_snaps.count(m_osd_rollback_to_cluster_snap)); + snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, m_osd_rollback_to_cluster_snap.c_str()); + } else { + { + int fd = read_op_seq(&curr_seq); + if (fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fd)); + } + } + if (curr_seq) + dout(10) << " current/ seq was " << curr_seq << dendl; + else + dout(10) << " current/ missing entirely (unusual, but okay)" << dendl; + + uint64_t cp = snaps.back(); + dout(10) << " most recent snap from " << snaps << " is " << cp << dendl; + + // if current/ is marked as non-snapshotted, refuse to roll + // back (without clear direction) to avoid throwing out new + // data. + struct stat st; + if (::stat(nosnapfn, &st) == 0) { + if (!m_osd_use_stale_snap) { + derr << "ERROR: " << nosnapfn << " exists, not rolling back to avoid losing new data" << dendl; + derr << "Force rollback to old snapshotted version with 'osd use stale snap = true'" << dendl; + derr << "config option for --osd-use-stale-snap startup argument." << dendl; + ret = -ENOTSUP; + goto close_basedir_fd; + } + derr << "WARNING: user forced start with data sequence mismatch: current was " << curr_seq + << ", newest snap is " << cp << dendl; + cerr << TEXT_YELLOW + << " ** WARNING: forcing the use of stale snapshot data **" + << TEXT_NORMAL << std::endl; + } + + dout(10) << __FUNC__ << ": rolling back to consistent snap " << cp << dendl; + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp); + } + + // drop current? 
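+      // roll current/ back to the snapshot chosen above (either the requested
+      // cluster snapshot or the most recent commit snapshot)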
+ ret = backend->rollback_to(s); + if (ret) { + derr << __FUNC__ << ": error rolling back to " << s << ": " + << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + } + } + initial_op_seq = 0; + + current_fd = ::open(current_fn.c_str(), O_RDONLY); + if (current_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": error opening: " << current_fn << ": " << cpp_strerror(ret) << dendl; + goto close_basedir_fd; + } + + assert(current_fd >= 0); + + op_fd = read_op_seq(&initial_op_seq); + if (op_fd < 0) { + ret = op_fd; + derr << __FUNC__ << ": read_op_seq failed" << dendl; + goto close_current_fd; + } + + dout(5) << "mount op_seq is " << initial_op_seq << dendl; + if (initial_op_seq == 0) { + derr << "mount initial op seq is 0; something is wrong" << dendl; + ret = -EINVAL; + goto close_current_fd; + } + + if (!backend->can_checkpoint()) { + // mark current/ as non-snapshotted so that we don't rollback away + // from it. + int r = ::creat(nosnapfn, 0644); + if (r < 0) { + ret = -errno; + derr << __FUNC__ << ": failed to create current/nosnap" << dendl; + goto close_current_fd; + } + VOID_TEMP_FAILURE_RETRY(::close(r)); + } else { + // clear nosnap marker, if present. + ::unlink(nosnapfn); + } + + // check fsid with omap + // get omap fsid + int omap_fsid_fd; + char omap_fsid_buf[PATH_MAX]; + struct ::stat omap_fsid_stat; + snprintf(omap_fsid_buf, sizeof(omap_fsid_buf), "%s/osd_uuid", omap_dir.c_str()); + // if osd_uuid not exists, assume as this omap matchs corresponding osd + if (::stat(omap_fsid_buf, &omap_fsid_stat) != 0){ + dout(10) << __FUNC__ << ": osd_uuid not found under omap, " + << "assume as matched." + << dendl; + }else{ + // if osd_uuid exists, compares osd_uuid with fsid + omap_fsid_fd = ::open(omap_fsid_buf, O_RDONLY, 0644); + if (omap_fsid_fd < 0) { + ret = -errno; + derr << __FUNC__ << ": error opening '" << omap_fsid_buf << "': " + << cpp_strerror(ret) + << dendl; + goto close_current_fd; + } + ret = read_fsid(omap_fsid_fd, &omap_fsid); + VOID_TEMP_FAILURE_RETRY(::close(omap_fsid_fd)); + omap_fsid_fd = -1; // defensive + if (ret < 0) { + derr << __FUNC__ << ": error reading omap_fsid_fd" + << ", omap_fsid = " << omap_fsid + << cpp_strerror(ret) + << dendl; + goto close_current_fd; + } + if (fsid != omap_fsid) { + derr << __FUNC__ << ": " << omap_fsid_buf + << " has existed omap fsid " << omap_fsid + << " != expected osd fsid " << fsid + << dendl; + ret = -EINVAL; + goto close_current_fd; + } + } + + dout(0) << "start omap initiation" << dendl; + if (!(generic_flags & SKIP_MOUNT_OMAP)) { + KeyValueDB * omap_store = KeyValueDB::create(cct, + superblock.omap_backend, + omap_dir); + if (omap_store == NULL) + { + derr << __FUNC__ << ": Error creating " << superblock.omap_backend << dendl; + ret = -1; + goto close_current_fd; + } + + if (superblock.omap_backend == "rocksdb") + ret = omap_store->init(cct->_conf->filestore_rocksdb_options); + else + ret = omap_store->init(); + + if (ret < 0) { + derr << __FUNC__ << ": Error initializing omap_store: " << cpp_strerror(ret) << dendl; + goto close_current_fd; + } + + stringstream err; + if (omap_store->create_and_open(err)) { + delete omap_store; + derr << __FUNC__ << ": Error initializing " << superblock.omap_backend + << " : " << err.str() << dendl; + ret = -1; + goto close_current_fd; + } + + DBObjectMap *dbomap = new DBObjectMap(cct, omap_store); + ret = dbomap->init(do_update); + if (ret < 0) { + delete dbomap; + derr << __FUNC__ << ": Error initializing DBObjectMap: " << ret << dendl; + goto close_current_fd; + } + stringstream 
err2; + + if (cct->_conf->filestore_debug_omap_check && !dbomap->check(err2)) { + derr << err2.str() << dendl; + delete dbomap; + ret = -EINVAL; + goto close_current_fd; + } + object_map.reset(dbomap); + } + + // journal + new_journal(); + + // select journal mode? + if (journal) { + if (!m_filestore_journal_writeahead && + !m_filestore_journal_parallel && + !m_filestore_journal_trailing) { + if (!backend->can_checkpoint()) { + m_filestore_journal_writeahead = true; + dout(0) << __FUNC__ << ": enabling WRITEAHEAD journal mode: checkpoint is not enabled" << dendl; + } else { + m_filestore_journal_parallel = true; + dout(0) << __FUNC__ << ": enabling PARALLEL journal mode: fs, checkpoint is enabled" << dendl; + } + } else { + if (m_filestore_journal_writeahead) + dout(0) << __FUNC__ << ": WRITEAHEAD journal mode explicitly enabled in conf" << dendl; + if (m_filestore_journal_parallel) + dout(0) << __FUNC__ << ": PARALLEL journal mode explicitly enabled in conf" << dendl; + if (m_filestore_journal_trailing) + dout(0) << __FUNC__ << ": TRAILING journal mode explicitly enabled in conf" << dendl; + } + if (m_filestore_journal_writeahead) + journal->set_wait_on_full(true); + } else { + dout(0) << __FUNC__ << ": no journal" << dendl; + } + + ret = _sanity_check_fs(); + if (ret) { + derr << __FUNC__ << ": _sanity_check_fs failed with error " + << ret << dendl; + goto close_current_fd; + } + + // Cleanup possibly invalid collections + { + vector collections; + ret = list_collections(collections, true); + if (ret < 0) { + derr << "Error " << ret << " while listing collections" << dendl; + goto close_current_fd; + } + for (vector::iterator i = collections.begin(); + i != collections.end(); + ++i) { + Index index; + ret = get_index(*i, &index); + if (ret < 0) { + derr << "Unable to mount index " << *i + << " with error: " << ret << dendl; + goto close_current_fd; + } + assert(NULL != index.index); + RWLock::WLocker l((index.index)->access_lock); + + index->cleanup(); + } + } + if (!m_disable_wbthrottle) { + wbthrottle.start(); + } else { + dout(0) << __FUNC__ << ": INFO: WbThrottle is disabled" << dendl; + if (cct->_conf->filestore_odsync_write) { + dout(0) << __FUNC__ << ": INFO: O_DSYNC write is enabled" << dendl; + } + } + sync_thread.create("filestore_sync"); + + if (!(generic_flags & SKIP_JOURNAL_REPLAY)) { + ret = journal_replay(initial_op_seq); + if (ret < 0) { + derr << __FUNC__ << ": failed to open journal " << journalpath << ": " << cpp_strerror(ret) << dendl; + if (ret == -ENOTTY) { + derr << "maybe journal is not pointing to a block device and its size " + << "wasn't configured?" << dendl; + } + + goto stop_sync; + } + } + + { + stringstream err2; + if (cct->_conf->filestore_debug_omap_check && !object_map->check(err2)) { + derr << err2.str() << dendl; + ret = -EINVAL; + goto stop_sync; + } + } + + init_temp_collections(); + + journal_start(); + + op_tp.start(); + for (vector::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + (*it)->start(); + } + for (vector::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + (*it)->start(); + } + + timer.init(); + + // upgrade? + if (cct->_conf->filestore_update_to >= (int)get_target_version()) { + int err = upgrade(); + if (err < 0) { + derr << "error converting store" << dendl; + umount(); + return err; + } + } + + // all okay. 
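+  // mount succeeded; the error paths below unwind in the reverse order of setup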
+ return 0; + +stop_sync: + // stop sync thread + lock.Lock(); + stop = true; + sync_cond.Signal(); + lock.Unlock(); + sync_thread.join(); + if (!m_disable_wbthrottle) { + wbthrottle.stop(); + } +close_current_fd: + VOID_TEMP_FAILURE_RETRY(::close(current_fd)); + current_fd = -1; +close_basedir_fd: + VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); + basedir_fd = -1; +close_fsid_fd: + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; +done: + assert(!m_filestore_fail_eio || ret != -EIO); + delete backend; + backend = NULL; + object_map.reset(); + return ret; +} + +void FileStore::init_temp_collections() +{ + dout(10) << __FUNC__ << dendl; + vector ls; + int r = list_collections(ls, true); + assert(r >= 0); + + dout(20) << " ls " << ls << dendl; + + SequencerPosition spos; + + set temps; + for (vector::iterator p = ls.begin(); p != ls.end(); ++p) + if (p->is_temp()) + temps.insert(*p); + dout(20) << " temps " << temps << dendl; + + for (vector::iterator p = ls.begin(); p != ls.end(); ++p) { + if (p->is_temp()) + continue; + if (p->is_meta()) + continue; + coll_t temp = p->get_temp(); + if (temps.count(temp)) { + temps.erase(temp); + } else { + dout(10) << __FUNC__ << ": creating " << temp << dendl; + r = _create_collection(temp, 0, spos); + assert(r == 0); + } + } + + for (set::iterator p = temps.begin(); p != temps.end(); ++p) { + dout(10) << __FUNC__ << ": removing stray " << *p << dendl; + r = _collection_remove_recursive(*p, spos); + assert(r == 0); + } +} + +int FileStore::umount() +{ + dout(5) << __FUNC__ << ": " << basedir << dendl; + + flush(); + sync(); + do_force_sync(); + + lock.Lock(); + stop = true; + sync_cond.Signal(); + lock.Unlock(); + sync_thread.join(); + if (!m_disable_wbthrottle){ + wbthrottle.stop(); + } + op_tp.stop(); + + journal_stop(); + if (!(generic_flags & SKIP_JOURNAL_REPLAY)) + journal_write_close(); + + for (vector::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + (*it)->stop(); + } + for (vector::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + (*it)->stop(); + } + + if (fsid_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(fsid_fd)); + fsid_fd = -1; + } + if (op_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(op_fd)); + op_fd = -1; + } + if (current_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(current_fd)); + current_fd = -1; + } + if (basedir_fd >= 0) { + VOID_TEMP_FAILURE_RETRY(::close(basedir_fd)); + basedir_fd = -1; + } + + force_sync = false; + + delete backend; + backend = NULL; + + object_map.reset(); + + { + Mutex::Locker l(sync_entry_timeo_lock); + timer.shutdown(); + } + + // nothing + return 0; +} + + + + +/// ----------------------------- + +FileStore::Op *FileStore::build_op(vector& tls, + Context *onreadable, + Context *onreadable_sync, + TrackedOpRef osd_op) +{ + uint64_t bytes = 0, ops = 0; + for (vector::iterator p = tls.begin(); + p != tls.end(); + ++p) { + bytes += (*p).get_num_bytes(); + ops += (*p).get_num_ops(); + } + + Op *o = new Op; + o->start = ceph_clock_now(); + o->tls = std::move(tls); + o->onreadable = onreadable; + o->onreadable_sync = onreadable_sync; + o->ops = ops; + o->bytes = bytes; + o->osd_op = osd_op; + return o; +} + + + +void FileStore::queue_op(OpSequencer *osr, Op *o) +{ + // queue op on sequencer, then queue sequencer for the threadpool, + // so that regardless of which order the threads pick up the + // sequencer, the op order will be preserved. 
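+  // (note: the op and byte throttles were already reserved in
+  // queue_transactions() before queue_op() is called)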
+ + osr->queue(o); + o->trace.event("queued"); + + logger->inc(l_filestore_ops); + logger->inc(l_filestore_bytes, o->bytes); + + dout(5) << __FUNC__ << ": " << o << " seq " << o->op + << " " << *osr + << " " << o->bytes << " bytes" + << " (queue has " << throttle_ops.get_current() << " ops and " << throttle_bytes.get_current() << " bytes)" + << dendl; + op_wq.queue(osr); +} + +void FileStore::op_queue_reserve_throttle(Op *o) +{ + throttle_ops.get(); + throttle_bytes.get(o->bytes); + + logger->set(l_filestore_op_queue_ops, throttle_ops.get_current()); + logger->set(l_filestore_op_queue_bytes, throttle_bytes.get_current()); +} + +void FileStore::op_queue_release_throttle(Op *o) +{ + throttle_ops.put(); + throttle_bytes.put(o->bytes); + logger->set(l_filestore_op_queue_ops, throttle_ops.get_current()); + logger->set(l_filestore_op_queue_bytes, throttle_bytes.get_current()); +} + +void FileStore::_do_op(OpSequencer *osr, ThreadPool::TPHandle &handle) +{ + if (!m_disable_wbthrottle) { + wbthrottle.throttle(); + } + // inject a stall? + if (cct->_conf->filestore_inject_stall) { + int orig = cct->_conf->filestore_inject_stall; + dout(5) << __FUNC__ << ": filestore_inject_stall " << orig << ", sleeping" << dendl; + sleep(orig); + cct->_conf->set_val("filestore_inject_stall", "0"); + dout(5) << __FUNC__ << ": done stalling" << dendl; + } + + osr->apply_lock.Lock(); + Op *o = osr->peek_queue(); + o->trace.event("op_apply_start"); + apply_manager.op_apply_start(o->op); + dout(5) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " start" << dendl; + o->trace.event("_do_transactions start"); + int r = _do_transactions(o->tls, o->op, &handle); + o->trace.event("op_apply_finish"); + apply_manager.op_apply_finish(o->op); + dout(10) << __FUNC__ << ": " << o << " seq " << o->op << " r = " << r + << ", finisher " << o->onreadable << " " << o->onreadable_sync << dendl; + + o->tls.clear(); + +} + +void FileStore::_finish_op(OpSequencer *osr) +{ + list to_queue; + Op *o = osr->dequeue(&to_queue); + + utime_t lat = ceph_clock_now(); + lat -= o->start; + + dout(10) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << "/" << osr->parent << " lat " << lat << dendl; + osr->apply_lock.Unlock(); // locked in _do_op + o->trace.event("_finish_op"); + + // called with tp lock held + op_queue_release_throttle(o); + + logger->tinc(l_filestore_apply_latency, lat); + + if (o->onreadable_sync) { + o->onreadable_sync->complete(0); + } + if (o->onreadable) { + apply_finishers[osr->id % m_apply_finisher_num]->queue(o->onreadable); + } + if (!to_queue.empty()) { + apply_finishers[osr->id % m_apply_finisher_num]->queue(to_queue); + } + delete o; +} + + +struct C_JournaledAhead : public Context { + FileStore *fs; + FileStore::OpSequencer *osr; + FileStore::Op *o; + Context *ondisk; + + C_JournaledAhead(FileStore *f, FileStore::OpSequencer *os, FileStore::Op *o, Context *ondisk): + fs(f), osr(os), o(o), ondisk(ondisk) { } + void finish(int r) override { + fs->_journaled_ahead(osr, o, ondisk); + } +}; + +int FileStore::queue_transactions(Sequencer *posr, vector& tls, + TrackedOpRef osd_op, + ThreadPool::TPHandle *handle) +{ + Context *onreadable; + Context *ondisk; + Context *onreadable_sync; + ObjectStore::Transaction::collect_contexts( + tls, &onreadable, &ondisk, &onreadable_sync); + + if (cct->_conf->objectstore_blackhole) { + dout(0) << __FUNC__ << ": objectstore_blackhole = TRUE, dropping transaction" + << dendl; + delete ondisk; + delete onreadable; + delete onreadable_sync; + 
return 0; + } + + utime_t start = ceph_clock_now(); + // set up the sequencer + OpSequencer *osr; + assert(posr); + if (posr->p) { + osr = static_cast(posr->p.get()); + dout(5) << __FUNC__ << ": existing " << osr << " " << *osr << dendl; + } else { + osr = new OpSequencer(cct, ++next_osr_id); + osr->set_cct(cct); + osr->parent = posr; + posr->p = osr; + dout(5) << __FUNC__ << ": new " << osr << " " << *osr << dendl; + } + + // used to include osr information in tracepoints during transaction apply + for (vector::iterator i = tls.begin(); i != tls.end(); ++i) { + (*i).set_osr(osr); + } + + ZTracer::Trace trace; + if (osd_op && osd_op->pg_trace) { + osd_op->store_trace.init("filestore op", &trace_endpoint, &osd_op->pg_trace); + trace = osd_op->store_trace; + } + + if (journal && journal->is_writeable() && !m_filestore_journal_trailing) { + Op *o = build_op(tls, onreadable, onreadable_sync, osd_op); + + //prepare and encode transactions data out of lock + bufferlist tbl; + int orig_len = journal->prepare_entry(o->tls, &tbl); + + if (handle) + handle->suspend_tp_timeout(); + + op_queue_reserve_throttle(o); + journal->reserve_throttle_and_backoff(tbl.length()); + + if (handle) + handle->reset_tp_timeout(); + + uint64_t op_num = submit_manager.op_submit_start(); + o->op = op_num; + trace.keyval("opnum", op_num); + + if (m_filestore_do_dump) + dump_transactions(o->tls, o->op, osr); + + if (m_filestore_journal_parallel) { + dout(5) << __FUNC__ << ": (parallel) " << o->op << " " << o->tls << dendl; + + trace.keyval("journal mode", "parallel"); + trace.event("journal started"); + _op_journal_transactions(tbl, orig_len, o->op, ondisk, osd_op); + + // queue inside submit_manager op submission lock + queue_op(osr, o); + trace.event("op queued"); + } else if (m_filestore_journal_writeahead) { + dout(5) << __FUNC__ << ": (writeahead) " << o->op << " " << o->tls << dendl; + + osr->queue_journal(o->op); + + trace.keyval("journal mode", "writeahead"); + trace.event("journal started"); + _op_journal_transactions(tbl, orig_len, o->op, + new C_JournaledAhead(this, osr, o, ondisk), + osd_op); + } else { + ceph_abort(); + } + submit_manager.op_submit_finish(op_num); + utime_t end = ceph_clock_now(); + logger->tinc(l_filestore_queue_transaction_latency_avg, end - start); + return 0; + } + + if (!journal) { + Op *o = build_op(tls, onreadable, onreadable_sync, osd_op); + dout(5) << __FUNC__ << ": (no journal) " << o << " " << tls << dendl; + + if (handle) + handle->suspend_tp_timeout(); + + op_queue_reserve_throttle(o); + + if (handle) + handle->reset_tp_timeout(); + + uint64_t op_num = submit_manager.op_submit_start(); + o->op = op_num; + + if (m_filestore_do_dump) + dump_transactions(o->tls, o->op, osr); + + queue_op(osr, o); + trace.keyval("opnum", op_num); + trace.keyval("journal mode", "none"); + trace.event("op queued"); + + if (ondisk) + apply_manager.add_waiter(op_num, ondisk); + submit_manager.op_submit_finish(op_num); + utime_t end = ceph_clock_now(); + logger->tinc(l_filestore_queue_transaction_latency_avg, end - start); + return 0; + } + + assert(journal); + //prepare and encode transactions data out of lock + bufferlist tbl; + int orig_len = -1; + if (journal->is_writeable()) { + orig_len = journal->prepare_entry(tls, &tbl); + } + uint64_t op = submit_manager.op_submit_start(); + dout(5) << __FUNC__ << ": (trailing journal) " << op << " " << tls << dendl; + + if (m_filestore_do_dump) + dump_transactions(tls, op, osr); + + trace.event("op_apply_start"); + trace.keyval("opnum", op); + 
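queue_transactions() above chooses between three journaling strategies: parallel, where the journal write and the apply race and durability is signalled from the journal side; write-ahead, where C_JournaledAhead defers the apply until the journal entry is durable; and the trailing path that continues just below, which applies first and journals afterwards. A compact restatement of the first two orderings, with std::function callbacks standing in for the journal and the op queue (all names here are illustrative):

#include <functional>

enum class JournalMode { parallel, writeahead };

// do_journal(on_durable): asynchronously journal the entry and invoke
// on_durable once it is safely on disk.
// do_queue_apply(): hand the op to the apply queue.
void submit(JournalMode mode,
            const std::function<void(std::function<void()>)>& do_journal,
            const std::function<void()>& do_queue_apply)
{
  switch (mode) {
  case JournalMode::parallel:
    // Journal and apply proceed concurrently; the op may become readable
    // before it is durable.
    do_journal([] { /* on-disk completion fires here */ });
    do_queue_apply();
    break;
  case JournalMode::writeahead:
    // The apply is queued only from the journal completion, so nothing
    // becomes readable before it is durable.
    do_journal([do_queue_apply] { do_queue_apply(); });
    break;
  }
}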
trace.keyval("journal mode", "trailing"); + apply_manager.op_apply_start(op); + trace.event("do_transactions"); + int r = do_transactions(tls, op); + + if (r >= 0) { + trace.event("journal started"); + _op_journal_transactions(tbl, orig_len, op, ondisk, osd_op); + } else { + delete ondisk; + } + + // start on_readable finisher after we queue journal item, as on_readable callback + // is allowed to delete the Transaction + if (onreadable_sync) { + onreadable_sync->complete(r); + } + apply_finishers[osr->id % m_apply_finisher_num]->queue(onreadable, r); + + submit_manager.op_submit_finish(op); + trace.event("op_apply_finish"); + apply_manager.op_apply_finish(op); + + utime_t end = ceph_clock_now(); + logger->tinc(l_filestore_queue_transaction_latency_avg, end - start); + return r; +} + +void FileStore::_journaled_ahead(OpSequencer *osr, Op *o, Context *ondisk) +{ + dout(5) << __FUNC__ << ": " << o << " seq " << o->op << " " << *osr << " " << o->tls << dendl; + + o->trace.event("writeahead journal finished"); + + // this should queue in order because the journal does it's completions in order. + queue_op(osr, o); + + list to_queue; + osr->dequeue_journal(&to_queue); + + // do ondisk completions async, to prevent any onreadable_sync completions + // getting blocked behind an ondisk completion. + if (ondisk) { + dout(10) << " queueing ondisk " << ondisk << dendl; + ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(ondisk); + } + if (!to_queue.empty()) { + ondisk_finishers[osr->id % m_ondisk_finisher_num]->queue(to_queue); + } +} + +int FileStore::_do_transactions( + vector &tls, + uint64_t op_seq, + ThreadPool::TPHandle *handle) +{ + int trans_num = 0; + + for (vector::iterator p = tls.begin(); + p != tls.end(); + ++p, trans_num++) { + _do_transaction(*p, op_seq, trans_num, handle); + if (handle) + handle->reset_tp_timeout(); + } + + return 0; +} + +void FileStore::_set_global_replay_guard(const coll_t& cid, + const SequencerPosition &spos) +{ + if (backend->can_checkpoint()) + return; + + // sync all previous operations on this sequencer + int ret = object_map->sync(); + if (ret < 0) { + derr << __FUNC__ << ": omap sync error " << cpp_strerror(ret) << dendl; + assert(0 == "_set_global_replay_guard failed"); + } + ret = sync_filesystem(basedir_fd); + if (ret < 0) { + derr << __FUNC__ << ": sync_filesystem error " << cpp_strerror(ret) << dendl; + assert(0 == "_set_global_replay_guard failed"); + } + + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + int err = errno; + derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl; + assert(0 == "_set_global_replay_guard failed"); + } + + _inject_failure(); + + // then record that we did it + bufferlist v; + ::encode(spos, v); + int r = chain_fsetxattr( + fd, GLOBAL_REPLAY_GUARD_XATTR, v.c_str(), v.length()); + if (r < 0) { + derr << __FUNC__ << ": fsetxattr " << GLOBAL_REPLAY_GUARD_XATTR + << " got " << cpp_strerror(r) << dendl; + assert(0 == "fsetxattr failed"); + } + + // and make sure our xattr is durable. 
+ ::fsync(fd); + + _inject_failure(); + + VOID_TEMP_FAILURE_RETRY(::close(fd)); + dout(10) << __FUNC__ << ": " << spos << " done" << dendl; +} + +int FileStore::_check_global_replay_guard(const coll_t& cid, + const SequencerPosition& spos) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + dout(10) << __FUNC__ << ": " << cid << " dne" << dendl; + return 1; // if collection does not exist, there is no guard, and we can replay. + } + + char buf[100]; + int r = chain_fgetxattr(fd, GLOBAL_REPLAY_GUARD_XATTR, buf, sizeof(buf)); + if (r < 0) { + dout(20) << __FUNC__ << ": no xattr" << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return 1; // no xattr + } + bufferlist bl; + bl.append(buf, r); + + SequencerPosition opos; + bufferlist::iterator p = bl.begin(); + ::decode(opos, p); + + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return spos >= opos ? 1 : -1; +} + + +void FileStore::_set_replay_guard(const coll_t& cid, + const SequencerPosition &spos, + bool in_progress=false) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + int err = errno; + derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl; + assert(0 == "_set_replay_guard failed"); + } + _set_replay_guard(fd, spos, 0, in_progress); + VOID_TEMP_FAILURE_RETRY(::close(fd)); +} + + +void FileStore::_set_replay_guard(int fd, + const SequencerPosition& spos, + const ghobject_t *hoid, + bool in_progress) +{ + if (backend->can_checkpoint()) + return; + + dout(10) << __FUNC__ << ": " << spos << (in_progress ? " START" : "") << dendl; + + _inject_failure(); + + // first make sure the previous operation commits + ::fsync(fd); + + if (!in_progress) { + // sync object_map too. even if this object has a header or keys, + // it have had them in the past and then removed them, so always + // sync. + object_map->sync(hoid, &spos); + } + + _inject_failure(); + + // then record that we did it + bufferlist v(40); + ::encode(spos, v); + ::encode(in_progress, v); + int r = chain_fsetxattr( + fd, REPLAY_GUARD_XATTR, v.c_str(), v.length()); + if (r < 0) { + derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl; + assert(0 == "fsetxattr failed"); + } + + // and make sure our xattr is durable. + ::fsync(fd); + + _inject_failure(); + + dout(10) << __FUNC__ << ": " << spos << " done" << dendl; +} + +void FileStore::_close_replay_guard(const coll_t& cid, + const SequencerPosition &spos) +{ + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + int err = errno; + derr << __FUNC__ << ": " << cid << " error " << cpp_strerror(err) << dendl; + assert(0 == "_close_replay_guard failed"); + } + _close_replay_guard(fd, spos); + VOID_TEMP_FAILURE_RETRY(::close(fd)); +} + +void FileStore::_close_replay_guard(int fd, const SequencerPosition& spos, + const ghobject_t *hoid) +{ + if (backend->can_checkpoint()) + return; + + dout(10) << __FUNC__ << ": " << spos << dendl; + + _inject_failure(); + + // sync object_map too. even if this object has a header or keys, + // it have had them in the past and then removed them, so always + // sync. 
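The guard checks in this area boil down to a three-way verdict: 1 means the stored guard is older so the op should be replayed, 0 means the positions match but the guarded work was still in progress so the replay is conditional, and -1 means the guard is at or past this position so the replay must be skipped. Restated with a plain struct (not Ceph's SequencerPosition, and ignoring the xattr decoding):

#include <cstdint>
#include <tuple>

struct Pos {
  uint64_t seq; uint32_t trans; uint32_t op;
  bool operator<(const Pos& o) const {
    return std::tie(seq, trans, op) < std::tie(o.seq, o.trans, o.op);
  }
  bool operator==(const Pos& o) const {
    return seq == o.seq && trans == o.trans && op == o.op;
  }
};

// stored: position recorded in the guard xattr
// current: position of the op we are about to replay
// in_progress: was the guard written before the guarded work completed?
// Returns 1 = replay, 0 = conditional replay, -1 = skip.
int check_guard(const Pos& stored, const Pos& current, bool in_progress)
{
  if (current < stored)
    return -1;                          // guard is newer: already applied
  if (stored == current)
    return in_progress ? 0 : -1;        // same position: depends on in_progress
  return 1;                             // guard is older: safe to replay
}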
+ object_map->sync(hoid, &spos); + + // then record that we are done with this operation + bufferlist v(40); + ::encode(spos, v); + bool in_progress = false; + ::encode(in_progress, v); + int r = chain_fsetxattr( + fd, REPLAY_GUARD_XATTR, v.c_str(), v.length()); + if (r < 0) { + derr << "fsetxattr " << REPLAY_GUARD_XATTR << " got " << cpp_strerror(r) << dendl; + assert(0 == "fsetxattr failed"); + } + + // and make sure our xattr is durable. + ::fsync(fd); + + _inject_failure(); + + dout(10) << __FUNC__ << ": " << spos << " done" << dendl; +} + +int FileStore::_check_replay_guard(const coll_t& cid, const ghobject_t &oid, + const SequencerPosition& spos) +{ + if (!replaying || backend->can_checkpoint()) + return 1; + + int r = _check_global_replay_guard(cid, spos); + if (r < 0) + return r; + + FDRef fd; + r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + dout(10) << __FUNC__ << ": " << cid << " " << oid << " dne" << dendl; + return 1; // if file does not exist, there is no guard, and we can replay. + } + int ret = _check_replay_guard(**fd, spos); + lfn_close(fd); + return ret; +} + +int FileStore::_check_replay_guard(const coll_t& cid, const SequencerPosition& spos) +{ + if (!replaying || backend->can_checkpoint()) + return 1; + + char fn[PATH_MAX]; + get_cdir(cid, fn, sizeof(fn)); + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + dout(10) << __FUNC__ << ": " << cid << " dne" << dendl; + return 1; // if collection does not exist, there is no guard, and we can replay. + } + int ret = _check_replay_guard(fd, spos); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + return ret; +} + +int FileStore::_check_replay_guard(int fd, const SequencerPosition& spos) +{ + if (!replaying || backend->can_checkpoint()) + return 1; + + char buf[100]; + int r = chain_fgetxattr(fd, REPLAY_GUARD_XATTR, buf, sizeof(buf)); + if (r < 0) { + dout(20) << __FUNC__ << ": no xattr" << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return 1; // no xattr + } + bufferlist bl; + bl.append(buf, r); + + SequencerPosition opos; + bufferlist::iterator p = bl.begin(); + ::decode(opos, p); + bool in_progress = false; + if (!p.end()) // older journals don't have this + ::decode(in_progress, p); + if (opos > spos) { + dout(10) << __FUNC__ << ": object has " << opos << " > current pos " << spos + << ", now or in future, SKIPPING REPLAY" << dendl; + return -1; + } else if (opos == spos) { + if (in_progress) { + dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos + << ", in_progress=true, CONDITIONAL REPLAY" << dendl; + return 0; + } else { + dout(10) << __FUNC__ << ": object has " << opos << " == current pos " << spos + << ", in_progress=false, SKIPPING REPLAY" << dendl; + return -1; + } + } else { + dout(10) << __FUNC__ << ": object has " << opos << " < current pos " << spos + << ", in past, will replay" << dendl; + return 1; + } +} + +void FileStore::_do_transaction( + Transaction& t, uint64_t op_seq, int trans_num, + ThreadPool::TPHandle *handle) +{ + dout(10) << __FUNC__ << ": on " << &t << dendl; + +#ifdef WITH_LTTNG + const char *osr_name = t.get_osr() ? 
static_cast(t.get_osr())->get_name().c_str() : ""; +#endif + + Transaction::iterator i = t.begin(); + + SequencerPosition spos(op_seq, trans_num, 0); + while (i.have_op()) { + if (handle) + handle->reset_tp_timeout(); + + Transaction::Op *op = i.decode_op(); + int r = 0; + + _inject_failure(); + + switch (op->op) { + case Transaction::OP_NOP: + break; + case Transaction::OP_TOUCH: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, touch_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _touch(cid, oid); + tracepoint(objectstore, touch_exit, r); + } + break; + + case Transaction::OP_WRITE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + uint64_t off = op->off; + uint64_t len = op->len; + uint32_t fadvise_flags = i.get_fadvise_flags(); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, write_enter, osr_name, off, len); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _write(cid, oid, off, len, bl, fadvise_flags); + tracepoint(objectstore, write_exit, r); + } + break; + + case Transaction::OP_ZERO: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + uint64_t off = op->off; + uint64_t len = op->len; + tracepoint(objectstore, zero_enter, osr_name, off, len); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _zero(cid, oid, off, len); + tracepoint(objectstore, zero_exit, r); + } + break; + + case Transaction::OP_TRIMCACHE: + { + // deprecated, no-op + } + break; + + case Transaction::OP_TRUNCATE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + uint64_t off = op->off; + tracepoint(objectstore, truncate_enter, osr_name, off); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _truncate(cid, oid, off); + tracepoint(objectstore, truncate_exit, r); + } + break; + + case Transaction::OP_REMOVE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, remove_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _remove(cid, oid, spos); + tracepoint(objectstore, remove_exit, r); + } + break; + + case Transaction::OP_SETATTR: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? 
+ _cid : _cid.get_temp(); + string name = i.decode_string(); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, setattr_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) { + map to_set; + to_set[name] = bufferptr(bl.c_str(), bl.length()); + r = _setattrs(cid, oid, to_set, spos); + if (r == -ENOSPC) + dout(0) << " ENOSPC on setxattr on " << cid << "/" << oid + << " name " << name << " size " << bl.length() << dendl; + } + tracepoint(objectstore, setattr_exit, r); + } + break; + + case Transaction::OP_SETATTRS: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + map aset; + i.decode_attrset(aset); + tracepoint(objectstore, setattrs_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _setattrs(cid, oid, aset, spos); + tracepoint(objectstore, setattrs_exit, r); + if (r == -ENOSPC) + dout(0) << " ENOSPC on setxattrs on " << cid << "/" << oid << dendl; + } + break; + + case Transaction::OP_RMATTR: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + string name = i.decode_string(); + tracepoint(objectstore, rmattr_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _rmattr(cid, oid, name.c_str(), spos); + tracepoint(objectstore, rmattr_exit, r); + } + break; + + case Transaction::OP_RMATTRS: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, rmattrs_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _rmattrs(cid, oid, spos); + tracepoint(objectstore, rmattrs_exit, r); + } + break; + + case Transaction::OP_CLONE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + const ghobject_t &noid = i.get_oid(op->dest_oid); + tracepoint(objectstore, clone_enter, osr_name); + r = _clone(cid, oid, noid, spos); + tracepoint(objectstore, clone_exit, r); + } + break; + + case Transaction::OP_CLONERANGE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const ghobject_t &noid = i.get_oid(op->dest_oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ? + _cid : _cid.get_temp(); + uint64_t off = op->off; + uint64_t len = op->len; + tracepoint(objectstore, clone_range_enter, osr_name, len); + r = _clone_range(cid, oid, ncid, noid, off, len, off, spos); + tracepoint(objectstore, clone_range_exit, r); + } + break; + + case Transaction::OP_CLONERANGE2: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const ghobject_t &noid = i.get_oid(op->dest_oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + const coll_t &ncid = !_need_temp_object_collection(_cid, noid) ? 
+ _cid : _cid.get_temp(); + uint64_t srcoff = op->off; + uint64_t len = op->len; + uint64_t dstoff = op->dest_off; + tracepoint(objectstore, clone_range2_enter, osr_name, len); + r = _clone_range(cid, oid, ncid, noid, srcoff, len, dstoff, spos); + tracepoint(objectstore, clone_range2_exit, r); + } + break; + + case Transaction::OP_MKCOLL: + { + const coll_t &cid = i.get_cid(op->cid); + tracepoint(objectstore, mkcoll_enter, osr_name); + if (_check_replay_guard(cid, spos) > 0) + r = _create_collection(cid, op->split_bits, spos); + tracepoint(objectstore, mkcoll_exit, r); + } + break; + + case Transaction::OP_COLL_SET_BITS: + { + const coll_t &cid = i.get_cid(op->cid); + int bits = op->split_bits; + r = _collection_set_bits(cid, bits); + } + break; + + case Transaction::OP_COLL_HINT: + { + const coll_t &cid = i.get_cid(op->cid); + uint32_t type = op->hint_type; + bufferlist hint; + i.decode_bl(hint); + bufferlist::iterator hiter = hint.begin(); + if (type == Transaction::COLL_HINT_EXPECTED_NUM_OBJECTS) { + uint32_t pg_num; + uint64_t num_objs; + ::decode(pg_num, hiter); + ::decode(num_objs, hiter); + if (_check_replay_guard(cid, spos) > 0) { + r = _collection_hint_expected_num_objs(cid, pg_num, num_objs, spos); + } + } else { + // Ignore the hint + dout(10) << "Unrecognized collection hint type: " << type << dendl; + } + } + break; + + case Transaction::OP_RMCOLL: + { + const coll_t &cid = i.get_cid(op->cid); + tracepoint(objectstore, rmcoll_enter, osr_name); + if (_check_replay_guard(cid, spos) > 0) + r = _destroy_collection(cid); + tracepoint(objectstore, rmcoll_exit, r); + } + break; + + case Transaction::OP_COLL_ADD: + { + const coll_t &ocid = i.get_cid(op->cid); + const coll_t &ncid = i.get_cid(op->dest_cid); + const ghobject_t &oid = i.get_oid(op->oid); + + assert(oid.hobj.pool >= -1); + + // always followed by OP_COLL_REMOVE + Transaction::Op *op2 = i.decode_op(); + const coll_t &ocid2 = i.get_cid(op2->cid); + const ghobject_t &oid2 = i.get_oid(op2->oid); + assert(op2->op == Transaction::OP_COLL_REMOVE); + assert(ocid2 == ocid); + assert(oid2 == oid); + + tracepoint(objectstore, coll_add_enter); + r = _collection_add(ncid, ocid, oid, spos); + tracepoint(objectstore, coll_add_exit, r); + spos.op++; + if (r < 0) + break; + tracepoint(objectstore, coll_remove_enter, osr_name); + if (_check_replay_guard(ocid, oid, spos) > 0) + r = _remove(ocid, oid, spos); + tracepoint(objectstore, coll_remove_exit, r); + } + break; + + case Transaction::OP_COLL_MOVE: + { + // WARNING: this is deprecated and buggy; only here to replay old journals. + const coll_t &ocid = i.get_cid(op->cid); + const coll_t &ncid = i.get_cid(op->dest_cid); + const ghobject_t &oid = i.get_oid(op->oid); + tracepoint(objectstore, coll_move_enter); + r = _collection_add(ocid, ncid, oid, spos); + if (r == 0 && + (_check_replay_guard(ocid, oid, spos) > 0)) + r = _remove(ocid, oid, spos); + tracepoint(objectstore, coll_move_exit, r); + } + break; + + case Transaction::OP_COLL_MOVE_RENAME: + { + const coll_t &_oldcid = i.get_cid(op->cid); + const ghobject_t &oldoid = i.get_oid(op->oid); + const coll_t &_newcid = i.get_cid(op->dest_cid); + const ghobject_t &newoid = i.get_oid(op->dest_oid); + const coll_t &oldcid = !_need_temp_object_collection(_oldcid, oldoid) ? + _oldcid : _oldcid.get_temp(); + const coll_t &newcid = !_need_temp_object_collection(_newcid, newoid) ? 
+ _oldcid : _newcid.get_temp(); + tracepoint(objectstore, coll_move_rename_enter); + r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos); + tracepoint(objectstore, coll_move_rename_exit, r); + } + break; + + case Transaction::OP_TRY_RENAME: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oldoid = i.get_oid(op->oid); + const ghobject_t &newoid = i.get_oid(op->dest_oid); + const coll_t &oldcid = !_need_temp_object_collection(_cid, oldoid) ? + _cid : _cid.get_temp(); + const coll_t &newcid = !_need_temp_object_collection(_cid, newoid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, coll_try_rename_enter); + r = _collection_move_rename(oldcid, oldoid, newcid, newoid, spos, true); + tracepoint(objectstore, coll_try_rename_exit, r); + } + break; + + case Transaction::OP_COLL_SETATTR: + case Transaction::OP_COLL_RMATTR: + assert(0 == "collection attr methods no longer implemented"); + break; + + case Transaction::OP_STARTSYNC: + tracepoint(objectstore, startsync_enter, osr_name); + _start_sync(); + tracepoint(objectstore, startsync_exit); + break; + + case Transaction::OP_COLL_RENAME: + { + r = -EOPNOTSUPP; + } + break; + + case Transaction::OP_OMAP_CLEAR: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + tracepoint(objectstore, omap_clear_enter, osr_name); + r = _omap_clear(cid, oid, spos); + tracepoint(objectstore, omap_clear_exit, r); + } + break; + case Transaction::OP_OMAP_SETKEYS: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + map aset; + i.decode_attrset(aset); + tracepoint(objectstore, omap_setkeys_enter, osr_name); + r = _omap_setkeys(cid, oid, aset, spos); + tracepoint(objectstore, omap_setkeys_exit, r); + } + break; + case Transaction::OP_OMAP_RMKEYS: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + set keys; + i.decode_keyset(keys); + tracepoint(objectstore, omap_rmkeys_enter, osr_name); + r = _omap_rmkeys(cid, oid, keys, spos); + tracepoint(objectstore, omap_rmkeys_exit, r); + } + break; + case Transaction::OP_OMAP_RMKEYRANGE: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + string first, last; + first = i.decode_string(); + last = i.decode_string(); + tracepoint(objectstore, omap_rmkeyrange_enter, osr_name); + r = _omap_rmkeyrange(cid, oid, first, last, spos); + tracepoint(objectstore, omap_rmkeyrange_exit, r); + } + break; + case Transaction::OP_OMAP_SETHEADER: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? 
+ _cid : _cid.get_temp(); + bufferlist bl; + i.decode_bl(bl); + tracepoint(objectstore, omap_setheader_enter, osr_name); + r = _omap_setheader(cid, oid, bl, spos); + tracepoint(objectstore, omap_setheader_exit, r); + } + break; + case Transaction::OP_SPLIT_COLLECTION: + { + assert(0 == "not legacy journal; upgrade to firefly first"); + } + break; + case Transaction::OP_SPLIT_COLLECTION2: + { + coll_t cid = i.get_cid(op->cid); + uint32_t bits = op->split_bits; + uint32_t rem = op->split_rem; + coll_t dest = i.get_cid(op->dest_cid); + tracepoint(objectstore, split_coll2_enter, osr_name); + r = _split_collection(cid, bits, rem, dest, spos); + tracepoint(objectstore, split_coll2_exit, r); + } + break; + + case Transaction::OP_SETALLOCHINT: + { + const coll_t &_cid = i.get_cid(op->cid); + const ghobject_t &oid = i.get_oid(op->oid); + const coll_t &cid = !_need_temp_object_collection(_cid, oid) ? + _cid : _cid.get_temp(); + uint64_t expected_object_size = op->expected_object_size; + uint64_t expected_write_size = op->expected_write_size; + tracepoint(objectstore, setallochint_enter, osr_name); + if (_check_replay_guard(cid, oid, spos) > 0) + r = _set_alloc_hint(cid, oid, expected_object_size, + expected_write_size); + tracepoint(objectstore, setallochint_exit, r); + } + break; + + default: + derr << "bad op " << op->op << dendl; + ceph_abort(); + } + + if (r < 0) { + bool ok = false; + + if (r == -ENOENT && !(op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2 || + op->op == Transaction::OP_COLL_ADD || + op->op == Transaction::OP_SETATTR || + op->op == Transaction::OP_SETATTRS || + op->op == Transaction::OP_RMATTR || + op->op == Transaction::OP_OMAP_SETKEYS || + op->op == Transaction::OP_OMAP_RMKEYS || + op->op == Transaction::OP_OMAP_RMKEYRANGE || + op->op == Transaction::OP_OMAP_SETHEADER)) + // -ENOENT is normally okay + // ...including on a replayed OP_RMCOLL with checkpoint mode + ok = true; + if (r == -ENODATA) + ok = true; + + if (op->op == Transaction::OP_SETALLOCHINT) + // Either EOPNOTSUPP or EINVAL most probably. EINVAL in most + // cases means invalid hint size (e.g. too big, not a multiple + // of block size, etc) or, at least on xfs, an attempt to set + // or change it when the file is not empty. However, + // OP_SETALLOCHINT is advisory, so ignore all errors. + ok = true; + + if (replaying && !backend->can_checkpoint()) { + if (r == -EEXIST && op->op == Transaction::OP_MKCOLL) { + dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; + ok = true; + } + if (r == -EEXIST && op->op == Transaction::OP_COLL_ADD) { + dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; + ok = true; + } + if (r == -EEXIST && op->op == Transaction::OP_COLL_MOVE) { + dout(10) << "tolerating EEXIST during journal replay since checkpoint is not enabled" << dendl; + ok = true; + } + if (r == -ERANGE) { + dout(10) << "tolerating ERANGE on replay" << dendl; + ok = true; + } + if (r == -ENOENT) { + dout(10) << "tolerating ENOENT on replay" << dendl; + ok = true; + } + } + + if (!ok) { + const char *msg = "unexpected error code"; + + if (r == -ENOENT && (op->op == Transaction::OP_CLONERANGE || + op->op == Transaction::OP_CLONE || + op->op == Transaction::OP_CLONERANGE2)) { + msg = "ENOENT on clone suggests osd bug"; + } else if (r == -ENOSPC) { + // For now, if we hit _any_ ENOSPC, crash, before we do any damage + // by partially applying transactions. 
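To summarise the tolerance rules above: ENODATA is always accepted, SETALLOCHINT failures are ignored because the hint is advisory, ENOENT is accepted except for ops where a missing source or attribute indicates real trouble, and replay without checkpoint support additionally forgives EEXIST, ERANGE and ENOENT; anything else, notably ENOSPC, is fatal before a partial apply can do damage. A boiled-down classifier with the same shape (the OpKind enum and its grouping are illustrative, not the Transaction op codes):

#include <cerrno>

enum class OpKind { clone, clone_range, coll_add, setattr, omap_write,
                    mkcoll, coll_move, set_alloc_hint, other };

// Mirrors the spirit of the checks above: is this errno tolerable for
// this op, possibly more leniently during journal replay?
bool error_is_tolerable(int err, OpKind op, bool replaying, bool can_checkpoint)
{
  if (err == -ENODATA)
    return true;
  if (op == OpKind::set_alloc_hint)
    return true;                         // allocation hints are advisory
  if (err == -ENOENT &&
      op != OpKind::clone && op != OpKind::clone_range &&
      op != OpKind::coll_add && op != OpKind::setattr &&
      op != OpKind::omap_write)
    return true;                         // object already gone: usually fine
  if (replaying && !can_checkpoint) {
    if (err == -EEXIST &&
        (op == OpKind::mkcoll || op == OpKind::coll_add ||
         op == OpKind::coll_move))
      return true;
    if (err == -ERANGE || err == -ENOENT)
      return true;                       // replay can legitimately re-see these
  }
  return false;                          // e.g. -ENOSPC: treat as fatal
}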
+ msg = "ENOSPC from disk filesystem, misconfigured cluster"; + } else if (r == -ENOTEMPTY) { + msg = "ENOTEMPTY suggests garbage data in osd data dir"; + } else if (r == -EPERM) { + msg = "EPERM suggests file(s) in osd data dir not owned by ceph user, or leveldb corruption"; + } + + derr << " error " << cpp_strerror(r) << " not handled on operation " << op + << " (" << spos << ", or op " << spos.op << ", counting from 0)" << dendl; + dout(0) << msg << dendl; + dout(0) << " transaction dump:\n"; + JSONFormatter f(true); + f.open_object_section("transaction"); + t.dump(&f); + f.close_section(); + f.flush(*_dout); + *_dout << dendl; + + if (r == -EMFILE) { + dump_open_fds(cct); + } + + assert(0 == "unexpected error"); + } + } + + spos.op++; + } + + _inject_failure(); +} + + /*********************************************/ + + + +// -------------------- +// objects + +bool FileStore::exists(const coll_t& _cid, const ghobject_t& oid) +{ + tracepoint(objectstore, exists_enter, _cid.c_str()); + const coll_t& cid = !_need_temp_object_collection(_cid, oid) ? _cid : _cid.get_temp(); + struct stat st; + bool retval = stat(cid, oid, &st) == 0; + tracepoint(objectstore, exists_exit, retval); + return retval; +} + +int FileStore::stat( + const coll_t& _cid, const ghobject_t& oid, struct stat *st, bool allow_eio) +{ + tracepoint(objectstore, stat_enter, _cid.c_str()); + const coll_t& cid = !_need_temp_object_collection(_cid, oid) ? _cid : _cid.get_temp(); + int r = lfn_stat(cid, oid, st); + assert(allow_eio || !m_filestore_fail_eio || r != -EIO); + if (r < 0) { + dout(10) << __FUNC__ << ": " << cid << "/" << oid + << " = " << r << dendl; + } else { + dout(10) << __FUNC__ << ": " << cid << "/" << oid + << " = " << r + << " (size " << st->st_size << ")" << dendl; + } + if (cct->_conf->filestore_debug_inject_read_err && + debug_mdata_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, stat_exit, r); + return r; + } +} + +int FileStore::set_collection_opts( + const coll_t& cid, + const pool_opts_t& opts) +{ + return -EOPNOTSUPP; +} + +int FileStore::read( + const coll_t& _cid, + const ghobject_t& oid, + uint64_t offset, + size_t len, + bufferlist& bl, + uint32_t op_flags) +{ + int got; + tracepoint(objectstore, read_enter, _cid.c_str(), offset, len); + const coll_t& cid = !_need_temp_object_collection(_cid, oid) ? 
_cid : _cid.get_temp(); + + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl; + + FDRef fd; + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") open error: " + << cpp_strerror(r) << dendl; + return r; + } + + if (offset == 0 && len == 0) { + struct stat st; + memset(&st, 0, sizeof(struct stat)); + int r = ::fstat(**fd, &st); + assert(r == 0); + len = st.st_size; + } + +#ifdef HAVE_POSIX_FADVISE + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_RANDOM) + posix_fadvise(**fd, offset, len, POSIX_FADV_RANDOM); + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL) + posix_fadvise(**fd, offset, len, POSIX_FADV_SEQUENTIAL); +#endif + + bufferptr bptr(len); // prealloc space for entire read + got = safe_pread(**fd, bptr.c_str(), len, offset); + if (got < 0) { + dout(10) << __FUNC__ << ": (" << cid << "/" << oid << ") pread error: " << cpp_strerror(got) << dendl; + lfn_close(fd); + return got; + } + bptr.set_length(got); // properly size the buffer + bl.clear(); + bl.push_back(std::move(bptr)); // put it in the target bufferlist + +#ifdef HAVE_POSIX_FADVISE + if (op_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) + posix_fadvise(**fd, offset, len, POSIX_FADV_DONTNEED); + if (op_flags & (CEPH_OSD_OP_FLAG_FADVISE_RANDOM | CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL)) + posix_fadvise(**fd, offset, len, POSIX_FADV_NORMAL); +#endif + + if (m_filestore_sloppy_crc && (!replaying || backend->can_checkpoint())) { + ostringstream ss; + int errors = backend->_crc_verify_read(**fd, offset, got, bl, &ss); + if (errors != 0) { + dout(0) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" + << got << " ... BAD CRC:\n" << ss.str() << dendl; + assert(0 == "bad crc on read"); + } + } + + lfn_close(fd); + + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" + << got << "/" << len << dendl; + if (cct->_conf->filestore_debug_inject_read_err && + debug_data_eio(oid)) { + return -EIO; + } else if (cct->_conf->filestore_debug_random_read_err && + (rand() % (int)(cct->_conf->filestore_debug_random_read_err * 100.0)) == 0) { + dout(0) << __func__ << ": inject random EIO" << dendl; + return -EIO; + } else { + tracepoint(objectstore, read_exit, got); + return got; + } +} + +int FileStore::_do_fiemap(int fd, uint64_t offset, size_t len, + map *m) +{ + uint64_t i; + struct fiemap_extent *extent = NULL; + struct fiemap *fiemap = NULL; + int r = 0; + +more: + r = backend->do_fiemap(fd, offset, len, &fiemap); + if (r < 0) + return r; + + if (fiemap->fm_mapped_extents == 0) { + free(fiemap); + return r; + } + + extent = &fiemap->fm_extents[0]; + + /* start where we were asked to start */ + if (extent->fe_logical < offset) { + extent->fe_length -= offset - extent->fe_logical; + extent->fe_logical = offset; + } + + i = 0; + + struct fiemap_extent *last = nullptr; + while (i < fiemap->fm_mapped_extents) { + struct fiemap_extent *next = extent + 1; + + dout(10) << __FUNC__ << ": fm_mapped_extents=" << fiemap->fm_mapped_extents + << " fe_logical=" << extent->fe_logical << " fe_length=" << extent->fe_length << dendl; + + /* try to merge extents */ + while ((i < fiemap->fm_mapped_extents - 1) && + (extent->fe_logical + extent->fe_length == next->fe_logical)) { + next->fe_length += extent->fe_length; + next->fe_logical = extent->fe_logical; + extent = next; + next = extent + 1; + i++; + } + + if (extent->fe_logical + extent->fe_length > offset + len) + extent->fe_length = offset + len - extent->fe_logical; + 
(*m)[extent->fe_logical] = extent->fe_length; + i++; + last = extent++; + } + uint64_t xoffset = last->fe_logical + last->fe_length - offset; + offset = last->fe_logical + last->fe_length; + len -= xoffset; + const bool is_last = (last->fe_flags & FIEMAP_EXTENT_LAST) || (len == 0); + free(fiemap); + if (!is_last) { + goto more; + } + + return r; +} + +int FileStore::_do_seek_hole_data(int fd, uint64_t offset, size_t len, + map *m) +{ +#if defined(__linux__) && defined(SEEK_HOLE) && defined(SEEK_DATA) + off_t hole_pos, data_pos; + int r = 0; + + // If lseek fails with errno setting to be ENXIO, this means the current + // file offset is beyond the end of the file. + off_t start = offset; + while(start < (off_t)(offset + len)) { + data_pos = lseek(fd, start, SEEK_DATA); + if (data_pos < 0) { + if (errno == ENXIO) + break; + else { + r = -errno; + dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl; + return r; + } + } else if (data_pos > (off_t)(offset + len)) { + break; + } + + hole_pos = lseek(fd, data_pos, SEEK_HOLE); + if (hole_pos < 0) { + if (errno == ENXIO) { + break; + } else { + r = -errno; + dout(10) << "failed to lseek: " << cpp_strerror(r) << dendl; + return r; + } + } + + if (hole_pos >= (off_t)(offset + len)) { + (*m)[data_pos] = offset + len - data_pos; + break; + } + (*m)[data_pos] = hole_pos - data_pos; + start = hole_pos; + } + + return r; +#else + (*m)[offset] = len; + return 0; +#endif +} + +int FileStore::fiemap(const coll_t& _cid, const ghobject_t& oid, + uint64_t offset, size_t len, + bufferlist& bl) +{ + map exomap; + int r = fiemap(_cid, oid, offset, len, exomap); + if (r >= 0) { + ::encode(exomap, bl); + } + return r; +} + +int FileStore::fiemap(const coll_t& _cid, const ghobject_t& oid, + uint64_t offset, size_t len, + map& destmap) +{ + tracepoint(objectstore, fiemap_enter, _cid.c_str(), offset, len); + const coll_t& cid = !_need_temp_object_collection(_cid, oid) ? 
_cid : _cid.get_temp(); + destmap.clear(); + + if ((!backend->has_seek_data_hole() && !backend->has_fiemap()) || + len <= (size_t)m_filestore_fiemap_threshold) { + destmap[offset] = len; + return 0; + } + + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl; + + FDRef fd; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + dout(10) << "read couldn't open " << cid << "/" << oid << ": " << cpp_strerror(r) << dendl; + goto done; + } + + if (backend->has_seek_data_hole()) { + dout(15) << "seek_data/seek_hole " << cid << "/" << oid << " " << offset << "~" << len << dendl; + r = _do_seek_hole_data(**fd, offset, len, &destmap); + } else if (backend->has_fiemap()) { + dout(15) << "fiemap ioctl" << cid << "/" << oid << " " << offset << "~" << len << dendl; + r = _do_fiemap(**fd, offset, len, &destmap); + } + + lfn_close(fd); + +done: + + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << " num_extents=" << destmap.size() << " " << destmap << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + tracepoint(objectstore, fiemap_exit, r); + return r; +} + +int FileStore::_remove(const coll_t& cid, const ghobject_t& oid, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl; + int r = lfn_unlink(cid, oid, spos); + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + return r; +} + +int FileStore::_truncate(const coll_t& cid, const ghobject_t& oid, uint64_t size) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << dendl; + int r = lfn_truncate(cid, oid, size); + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " size " << size << " = " << r << dendl; + return r; +} + + +int FileStore::_touch(const coll_t& cid, const ghobject_t& oid) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl; + + FDRef fd; + int r = lfn_open(cid, oid, true, &fd); + if (r < 0) { + return r; + } else { + lfn_close(fd); + } + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + return r; +} + +int FileStore::_write(const coll_t& cid, const ghobject_t& oid, + uint64_t offset, size_t len, + const bufferlist& bl, uint32_t fadvise_flags) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl; + int r; + + FDRef fd; + r = lfn_open(cid, oid, true, &fd); + if (r < 0) { + dout(0) << __FUNC__ << ": couldn't open " << cid << "/" + << oid << ": " + << cpp_strerror(r) << dendl; + goto out; + } + + // write + r = bl.write_fd(**fd, offset); + if (r < 0) { + derr << __FUNC__ << ": write_fd on " << cid << "/" << oid + << " error: " << cpp_strerror(r) << dendl; + lfn_close(fd); + goto out; + } + r = bl.length(); + + if (r >= 0 && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_write(**fd, offset, len, bl); + assert(rc >= 0); + } + + if (replaying || m_disable_wbthrottle) { + if (fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED) { +#ifdef HAVE_POSIX_FADVISE + posix_fadvise(**fd, 0, 0, POSIX_FADV_DONTNEED); +#endif + } + } else { + wbthrottle.queue_wb(fd, oid, offset, len, + fadvise_flags & CEPH_OSD_OP_FLAG_FADVISE_DONTNEED); + } + + lfn_close(fd); + + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << r << dendl; + return r; +} + +int FileStore::_zero(const coll_t& cid, const ghobject_t& oid, uint64_t offset, size_t len) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << dendl; + int ret = 
0; + + if (cct->_conf->filestore_punch_hole) { +#ifdef CEPH_HAVE_FALLOCATE +# if !defined(DARWIN) && !defined(__FreeBSD__) +# ifdef FALLOC_FL_KEEP_SIZE + // first try to punch a hole. + FDRef fd; + ret = lfn_open(cid, oid, false, &fd); + if (ret < 0) { + goto out; + } + + struct stat st; + ret = ::fstat(**fd, &st); + if (ret < 0) { + ret = -errno; + lfn_close(fd); + goto out; + } + + // first try fallocate + ret = fallocate(**fd, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + offset, len); + if (ret < 0) { + ret = -errno; + } else { + // ensure we extent file size, if needed + if (offset + len > (uint64_t)st.st_size) { + ret = ::ftruncate(**fd, offset + len); + if (ret < 0) { + ret = -errno; + lfn_close(fd); + goto out; + } + } + } + lfn_close(fd); + + if (ret >= 0 && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_zero(**fd, offset, len); + assert(rc >= 0); + } + + if (ret == 0) + goto out; // yay! + if (ret != -EOPNOTSUPP) + goto out; // some other error +# endif +# endif +#endif + } + + // lame, kernel is old and doesn't support it. + // write zeros.. yuck! + dout(20) << __FUNC__ << ": falling back to writing zeros" << dendl; + { + bufferlist bl; + bl.append_zero(len); + ret = _write(cid, oid, offset, len, bl); + } + +#ifdef CEPH_HAVE_FALLOCATE +# if !defined(DARWIN) && !defined(__FreeBSD__) +# ifdef FALLOC_FL_KEEP_SIZE + out: +# endif +# endif +#endif + dout(20) << __FUNC__ << ": " << cid << "/" << oid << " " << offset << "~" << len << " = " << ret << dendl; + return ret; +} + +int FileStore::_clone(const coll_t& cid, const ghobject_t& oldoid, const ghobject_t& newoid, + const SequencerPosition& spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << dendl; + + if (_check_replay_guard(cid, newoid, spos) < 0) + return 0; + + int r; + FDRef o, n; + { + Index index; + r = lfn_open(cid, oldoid, false, &o, &index); + if (r < 0) { + goto out2; + } + assert(NULL != (index.index)); + RWLock::WLocker l((index.index)->access_lock); + + r = lfn_open(cid, newoid, true, &n, &index); + if (r < 0) { + goto out; + } + r = ::ftruncate(**n, 0); + if (r < 0) { + r = -errno; + goto out3; + } + struct stat st; + r = ::fstat(**o, &st); + if (r < 0) { + r = -errno; + goto out3; + } + + r = _do_clone_range(**o, **n, 0, st.st_size, 0); + if (r < 0) { + goto out3; + } + + dout(20) << "objectmap clone" << dendl; + r = object_map->clone(oldoid, newoid, &spos); + if (r < 0 && r != -ENOENT) + goto out3; + } + + { + char buf[2]; + map aset; + r = _fgetattrs(**o, aset); + if (r < 0) + goto out3; + + r = chain_fgetxattr(**o, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { + r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT, + sizeof(XATTR_NO_SPILL_OUT)); + } else { + r = chain_fsetxattr(**n, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT, + sizeof(XATTR_SPILL_OUT)); + } + if (r < 0) + goto out3; + + r = _fsetattrs(**n, aset); + if (r < 0) + goto out3; + } + + // clone is non-idempotent; record our work. 
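Earlier in this hunk, _zero() first tries to punch a hole with FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, extends the file with ftruncate() when the zeroed range runs past EOF, and only writes literal zeros when the filesystem cannot punch holes. The same pattern in isolation (Linux-only; error handling and the sloppy-CRC update are omitted, and zero_range is my name for it):

#include <algorithm>
#include <cerrno>
#include <cstdint>
#include <fcntl.h>         // fallocate (glibc)
#include <linux/falloc.h>  // FALLOC_FL_PUNCH_HOLE, FALLOC_FL_KEEP_SIZE
#include <sys/stat.h>
#include <unistd.h>
#include <vector>

// Zero [off, off+len) on fd: punch a hole if supported, otherwise write
// zeros. Returns 0 on success, -errno on failure.
int zero_range(int fd, uint64_t off, uint64_t len)
{
  struct stat st;
  if (::fstat(fd, &st) < 0)
    return -errno;

  if (::fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len) == 0) {
    // KEEP_SIZE never grows the file, so extend it ourselves if the
    // zeroed range ends past the current EOF, as the code above does.
    if (off + len > (uint64_t)st.st_size && ::ftruncate(fd, off + len) < 0)
      return -errno;
    return 0;
  }
  if (errno != EOPNOTSUPP)
    return -errno;

  // Filesystem (or kernel) cannot punch holes: fall back to writing zeros.
  std::vector<char> zeros(64 * 1024, 0);
  uint64_t done = 0;
  while (done < len) {
    size_t n = std::min<uint64_t>(zeros.size(), len - done);
    ssize_t w = ::pwrite(fd, zeros.data(), n, off + done);
    if (w < 0) {
      if (errno == EINTR)
        continue;
      return -errno;
    }
    done += w;
  }
  return 0;
}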
+ _set_replay_guard(**n, spos, &newoid); + + out3: + lfn_close(n); + out: + lfn_close(o); + out2: + dout(10) << __FUNC__ << ": " << cid << "/" << oldoid << " -> " << cid << "/" << newoid << " = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return r; +} + +int FileStore::_do_clone_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) +{ + dout(20) << __FUNC__ << ": copy " << srcoff << "~" << len << " to " << dstoff << dendl; + return backend->clone_range(from, to, srcoff, len, dstoff); +} + +int FileStore::_do_sparse_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff) +{ + dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl; + int r = 0; + map exomap; + // fiemap doesn't allow zero length + if (len == 0) + return 0; + + if (backend->has_seek_data_hole()) { + dout(15) << "seek_data/seek_hole " << from << " " << srcoff << "~" << len << dendl; + r = _do_seek_hole_data(from, srcoff, len, &exomap); + } else if (backend->has_fiemap()) { + dout(15) << "fiemap ioctl" << from << " " << srcoff << "~" << len << dendl; + r = _do_fiemap(from, srcoff, len, &exomap); + } + + + int64_t written = 0; + if (r < 0) + goto out; + + for (map::iterator miter = exomap.begin(); miter != exomap.end(); ++miter) { + uint64_t it_off = miter->first - srcoff + dstoff; + r = _do_copy_range(from, to, miter->first, miter->second, it_off, true); + if (r < 0) { + derr << __FUNC__ << ": copy error at " << miter->first << "~" << miter->second + << " to " << it_off << ", " << cpp_strerror(r) << dendl; + break; + } + written += miter->second; + } + + if (r >= 0) { + if (m_filestore_sloppy_crc) { + int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff); + assert(rc >= 0); + } + struct stat st; + r = ::fstat(to, &st); + if (r < 0) { + r = -errno; + derr << __FUNC__ << ": fstat error at " << to << " " << cpp_strerror(r) << dendl; + goto out; + } + if (st.st_size < (int)(dstoff + len)) { + r = ::ftruncate(to, dstoff + len); + if (r < 0) { + r = -errno; + derr << __FUNC__ << ": ftruncate error at " << dstoff+len << " " << cpp_strerror(r) << dendl; + goto out; + } + } + r = written; + } + + out: + dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; + return r; +} + +int FileStore::_do_copy_range(int from, int to, uint64_t srcoff, uint64_t len, uint64_t dstoff, bool skip_sloppycrc) +{ + dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << dendl; + int r = 0; + loff_t pos = srcoff; + loff_t end = srcoff + len; + int buflen = 4096 * 16; //limit by pipe max size.see fcntl + +#ifdef CEPH_HAVE_SPLICE + if (backend->has_splice()) { + int pipefd[2]; + if (pipe(pipefd) < 0) { + r = -errno; + derr << " pipe " << " got " << cpp_strerror(r) << dendl; + return r; + } + + loff_t dstpos = dstoff; + while (pos < end) { + int l = MIN(end-pos, buflen); + r = safe_splice(from, &pos, pipefd[1], NULL, l, SPLICE_F_NONBLOCK); + dout(10) << " safe_splice read from " << pos << "~" << l << " got " << r << dendl; + if (r < 0) { + derr << __FUNC__ << ": safe_splice read error at " << pos << "~" << len + << ", " << cpp_strerror(r) << dendl; + break; + } + if (r == 0) { + // hrm, bad source range, wtf. 
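The splice branch above pumps data from the source fd into a pipe and from the pipe into the destination, so the bytes never pass through a userspace buffer. A trimmed-down version of that loop using raw splice(2) rather than Ceph's safe_splice(), with the per-chunk size cap and SPLICE_F_NONBLOCK left out:

#include <cerrno>
#include <cstdint>
#include <fcntl.h>      // splice (glibc, Linux)
#include <sys/types.h>  // loff_t
#include <unistd.h>

// Copy len bytes from src at srcoff to dst at dstoff through a pipe.
// Returns 0 on success, -errno on failure, -ERANGE on a short source.
int splice_copy(int src, int dst, uint64_t srcoff, uint64_t len, uint64_t dstoff)
{
  int pipefd[2];
  if (::pipe(pipefd) < 0)
    return -errno;

  loff_t in = srcoff, out = dstoff;
  uint64_t left = len;
  int r = 0;
  while (left > 0 && r == 0) {
    ssize_t n = ::splice(src, &in, pipefd[1], nullptr, left, 0);
    if (n < 0) { if (errno == EINTR) continue; r = -errno; break; }
    if (n == 0) { r = -ERANGE; break; }        // source ended early
    ssize_t pending = n;
    while (pending > 0) {                      // drain the pipe completely
      ssize_t m = ::splice(pipefd[0], nullptr, dst, &out, pending, 0);
      if (m < 0) { if (errno == EINTR) continue; r = -errno; break; }
      if (m == 0) { r = -EIO; break; }
      pending -= m;
    }
    left -= n;
  }
  ::close(pipefd[0]);
  ::close(pipefd[1]);
  return r;
}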
+ r = -ERANGE; + derr << __FUNC__ << ": got short read result at " << pos + << " of fd " << from << " len " << len << dendl; + break; + } + + r = safe_splice(pipefd[0], NULL, to, &dstpos, r, 0); + dout(10) << " safe_splice write to " << to << " len " << r + << " got " << r << dendl; + if (r < 0) { + derr << __FUNC__ << ": write error at " << pos << "~" + << r << ", " << cpp_strerror(r) << dendl; + break; + } + } + close(pipefd[0]); + close(pipefd[1]); + } else +#endif + { + int64_t actual; + + actual = ::lseek64(from, srcoff, SEEK_SET); + if (actual != (int64_t)srcoff) { + if (actual < 0) + r = -errno; + else + r = -EINVAL; + derr << "lseek64 to " << srcoff << " got " << cpp_strerror(r) << dendl; + return r; + } + actual = ::lseek64(to, dstoff, SEEK_SET); + if (actual != (int64_t)dstoff) { + if (actual < 0) + r = -errno; + else + r = -EINVAL; + derr << "lseek64 to " << dstoff << " got " << cpp_strerror(r) << dendl; + return r; + } + + char buf[buflen]; + while (pos < end) { + int l = MIN(end-pos, buflen); + r = ::read(from, buf, l); + dout(25) << " read from " << pos << "~" << l << " got " << r << dendl; + if (r < 0) { + if (errno == EINTR) { + continue; + } else { + r = -errno; + derr << __FUNC__ << ": read error at " << pos << "~" << len + << ", " << cpp_strerror(r) << dendl; + break; + } + } + if (r == 0) { + // hrm, bad source range, wtf. + r = -ERANGE; + derr << __FUNC__ << ": got short read result at " << pos + << " of fd " << from << " len " << len << dendl; + break; + } + int op = 0; + while (op < r) { + int r2 = safe_write(to, buf+op, r-op); + dout(25) << " write to " << to << " len " << (r-op) + << " got " << r2 << dendl; + if (r2 < 0) { + r = r2; + derr << __FUNC__ << ": write error at " << pos << "~" + << r-op << ", " << cpp_strerror(r) << dendl; + + break; + } + op += (r-op); + } + if (r < 0) + break; + pos += r; + } + } + + if (r < 0 && replaying) { + assert(r == -ERANGE); + derr << __FUNC__ << ": short source tolerated because we are replaying" << dendl; + r = pos - from;; + } + assert(replaying || pos == end); + if (r >= 0 && !skip_sloppycrc && m_filestore_sloppy_crc) { + int rc = backend->_crc_update_clone_range(from, to, srcoff, len, dstoff); + assert(rc >= 0); + } + dout(20) << __FUNC__ << ": " << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; + return r; +} + +int FileStore::_clone_range(const coll_t& oldcid, const ghobject_t& oldoid, const coll_t& newcid, const ghobject_t& newoid, + uint64_t srcoff, uint64_t len, uint64_t dstoff, + const SequencerPosition& spos) +{ + dout(15) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " " << srcoff << "~" << len << " to " << dstoff << dendl; + + if (_check_replay_guard(newcid, newoid, spos) < 0) + return 0; + + int r; + FDRef o, n; + r = lfn_open(oldcid, oldoid, false, &o); + if (r < 0) { + goto out2; + } + r = lfn_open(newcid, newoid, true, &n); + if (r < 0) { + goto out; + } + r = _do_clone_range(**o, **n, srcoff, len, dstoff); + if (r < 0) { + goto out3; + } + + // clone is non-idempotent; record our work. 
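The non-splice fallback above reads into a stack buffer and then loops until the whole chunk has been written, retrying EINTR and completing short writes (that is what the safe_write() inner loop is doing). The same idea as a standalone helper built on plain write():

#include <cerrno>
#include <cstddef>
#include <unistd.h>

// Write all len bytes of buf to fd, retrying short writes and EINTR.
// Returns 0 on success, -errno on failure.
int write_all(int fd, const char* buf, size_t len)
{
  size_t off = 0;
  while (off < len) {
    ssize_t w = ::write(fd, buf + off, len - off);
    if (w < 0) {
      if (errno == EINTR)
        continue;
      return -errno;
    }
    off += static_cast<size_t>(w);
  }
  return 0;
}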
+ _set_replay_guard(**n, spos, &newoid); + + out3: + lfn_close(n); + out: + lfn_close(o); + out2: + dout(10) << __FUNC__ << ": " << oldcid << "/" << oldoid << " -> " << newcid << "/" << newoid << " " + << srcoff << "~" << len << " to " << dstoff << " = " << r << dendl; + return r; +} + +class SyncEntryTimeout : public Context { +public: + CephContext* cct; + explicit SyncEntryTimeout(CephContext* cct, int commit_timeo) + : cct(cct), m_commit_timeo(commit_timeo) + { + } + + void finish(int r) override { + BackTrace *bt = new BackTrace(1); + generic_dout(-1) << "FileStore: sync_entry timed out after " + << m_commit_timeo << " seconds.\n"; + bt->print(*_dout); + *_dout << dendl; + delete bt; + ceph_abort(); + } +private: + int m_commit_timeo; +}; + +void FileStore::sync_entry() +{ + lock.Lock(); + while (!stop) { + utime_t max_interval; + max_interval.set_from_double(m_filestore_max_sync_interval); + utime_t min_interval; + min_interval.set_from_double(m_filestore_min_sync_interval); + + utime_t startwait = ceph_clock_now(); + if (!force_sync) { + dout(20) << __FUNC__ << ": waiting for max_interval " << max_interval << dendl; + sync_cond.WaitInterval(lock, max_interval); + } else { + dout(20) << __FUNC__ << ": not waiting, force_sync set" << dendl; + } + + if (force_sync) { + dout(20) << __FUNC__ << ": force_sync set" << dendl; + force_sync = false; + } else if (stop) { + dout(20) << __FUNC__ << ": stop set" << dendl; + break; + } else { + // wait for at least the min interval + utime_t woke = ceph_clock_now(); + woke -= startwait; + dout(20) << __FUNC__ << ": woke after " << woke << dendl; + if (woke < min_interval) { + utime_t t = min_interval; + t -= woke; + dout(20) << __FUNC__ << ": waiting for another " << t + << " to reach min interval " << min_interval << dendl; + sync_cond.WaitInterval(lock, t); + } + } + + list fin; + again: + fin.swap(sync_waiters); + lock.Unlock(); + + op_tp.pause(); + if (apply_manager.commit_start()) { + utime_t start = ceph_clock_now(); + uint64_t cp = apply_manager.get_committing_seq(); + + sync_entry_timeo_lock.Lock(); + SyncEntryTimeout *sync_entry_timeo = + new SyncEntryTimeout(cct, m_filestore_commit_timeout); + if (!timer.add_event_after(m_filestore_commit_timeout, + sync_entry_timeo)) { + sync_entry_timeo = nullptr; + } + sync_entry_timeo_lock.Unlock(); + + logger->set(l_filestore_committing, 1); + + dout(15) << __FUNC__ << ": committing " << cp << dendl; + stringstream errstream; + if (cct->_conf->filestore_debug_omap_check && !object_map->check(errstream)) { + derr << errstream.str() << dendl; + ceph_abort(); + } + + if (backend->can_checkpoint()) { + int err = write_op_seq(op_fd, cp); + if (err < 0) { + derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl; + assert(0 == "error during write_op_seq"); + } + + char s[NAME_MAX]; + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)cp); + uint64_t cid = 0; + err = backend->create_checkpoint(s, &cid); + if (err < 0) { + int err = errno; + derr << "snap create '" << s << "' got error " << err << dendl; + assert(err == 0); + } + + snaps.push_back(cp); + apply_manager.commit_started(); + op_tp.unpause(); + + if (cid > 0) { + dout(20) << " waiting for checkpoint " << cid << " to complete" << dendl; + err = backend->sync_checkpoint(cid); + if (err < 0) { + derr << "ioctl WAIT_SYNC got " << cpp_strerror(err) << dendl; + assert(0 == "wait_sync got error"); + } + dout(20) << " done waiting for checkpoint " << cid << " to complete" << dendl; + } + } else { + apply_manager.commit_started(); + 
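The pacing at the top of sync_entry() above waits on the sync condition for at most max_interval, wakes early for a forced sync or shutdown, and otherwise tops the wait up to min_interval before starting a commit cycle. One iteration of that wait, written against std::condition_variable instead of Ceph's Mutex/Cond (the function and parameter names are mine):

#include <chrono>
#include <condition_variable>
#include <mutex>

// One pass of the sync thread's pacing logic; the commit cycle itself
// would run where the trailing comment is.
void sync_wait_once(std::mutex& m, std::condition_variable& cv,
                    bool& force_sync, bool& stop,
                    std::chrono::milliseconds min_interval,
                    std::chrono::milliseconds max_interval)
{
  std::unique_lock<std::mutex> l(m);
  auto start = std::chrono::steady_clock::now();

  if (!force_sync)
    cv.wait_for(l, max_interval, [&] { return force_sync || stop; });

  if (force_sync) {
    force_sync = false;                        // consume the forced sync
  } else if (!stop) {
    auto waited = std::chrono::steady_clock::now() - start;
    if (waited < min_interval)
      cv.wait_for(l, min_interval - waited);   // enforce the minimum interval
  }
  // ... commit cycle (checkpoint or syncfs path) would go here ...
}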
op_tp.unpause(); + + int err = object_map->sync(); + if (err < 0) { + derr << "object_map sync got " << cpp_strerror(err) << dendl; + assert(0 == "object_map sync returned error"); + } + + err = backend->syncfs(); + if (err < 0) { + derr << "syncfs got " << cpp_strerror(err) << dendl; + assert(0 == "syncfs returned error"); + } + + err = write_op_seq(op_fd, cp); + if (err < 0) { + derr << "Error during write_op_seq: " << cpp_strerror(err) << dendl; + assert(0 == "error during write_op_seq"); + } + err = ::fsync(op_fd); + if (err < 0) { + derr << "Error during fsync of op_seq: " << cpp_strerror(err) << dendl; + assert(0 == "error during fsync of op_seq"); + } + } + + utime_t done = ceph_clock_now(); + utime_t lat = done - start; + utime_t dur = done - startwait; + dout(10) << __FUNC__ << ": commit took " << lat << ", interval was " << dur << dendl; + utime_t max_pause_lat = logger->tget(l_filestore_sync_pause_max_lat); + if (max_pause_lat < dur - lat) { + logger->tinc(l_filestore_sync_pause_max_lat, dur - lat); + } + + logger->inc(l_filestore_commitcycle); + logger->tinc(l_filestore_commitcycle_latency, lat); + logger->tinc(l_filestore_commitcycle_interval, dur); + + apply_manager.commit_finish(); + if (!m_disable_wbthrottle) { + wbthrottle.clear(); + } + + logger->set(l_filestore_committing, 0); + + // remove old snaps? + if (backend->can_checkpoint()) { + char s[NAME_MAX]; + while (snaps.size() > 2) { + snprintf(s, sizeof(s), COMMIT_SNAP_ITEM, (long long unsigned)snaps.front()); + snaps.pop_front(); + dout(10) << "removing snap '" << s << "'" << dendl; + int r = backend->destroy_checkpoint(s); + if (r) { + int err = errno; + derr << "unable to destroy snap '" << s << "' got " << cpp_strerror(err) << dendl; + } + } + } + + dout(15) << __FUNC__ << ": committed to op_seq " << cp << dendl; + + if (sync_entry_timeo) { + Mutex::Locker lock(sync_entry_timeo_lock); + timer.cancel_event(sync_entry_timeo); + } + } else { + op_tp.unpause(); + } + + lock.Lock(); + finish_contexts(cct, fin, 0); + fin.clear(); + if (!sync_waiters.empty()) { + dout(10) << __FUNC__ << ": more waiters, committing again" << dendl; + goto again; + } + if (!stop && journal && journal->should_commit_now()) { + dout(10) << __FUNC__ << ": journal says we should commit again (probably is/was full)" << dendl; + goto again; + } + } + stop = false; + lock.Unlock(); +} + +void FileStore::_start_sync() +{ + if (!journal) { // don't do a big sync if the journal is on + dout(10) << __FUNC__ << dendl; + sync_cond.Signal(); + } else { + dout(10) << __FUNC__ << ": - NOOP (journal is on)" << dendl; + } +} + +void FileStore::do_force_sync() +{ + dout(10) << __FUNC__ << dendl; + Mutex::Locker l(lock); + force_sync = true; + sync_cond.Signal(); +} + +void FileStore::start_sync(Context *onsafe) +{ + Mutex::Locker l(lock); + sync_waiters.push_back(onsafe); + sync_cond.Signal(); + force_sync = true; + dout(10) << __FUNC__ << dendl; +} + +void FileStore::sync() +{ + Mutex l("FileStore::sync"); + Cond c; + bool done; + C_SafeCond *fin = new C_SafeCond(&l, &c, &done); + + start_sync(fin); + + l.Lock(); + while (!done) { + dout(10) << "sync waiting" << dendl; + c.Wait(l); + } + l.Unlock(); + dout(10) << "sync done" << dendl; +} + +void FileStore::_flush_op_queue() +{ + dout(10) << __FUNC__ << ": draining op tp" << dendl; + op_wq.drain(); + dout(10) << __FUNC__ << ": waiting for apply finisher" << dendl; + for (vector::iterator it = apply_finishers.begin(); it != apply_finishers.end(); ++it) { + (*it)->wait_for_empty(); + } +} + +/* + * flush - make 
every queued write readable + */ +void FileStore::flush() +{ + dout(10) << __FUNC__ << dendl; + + if (cct->_conf->filestore_blackhole) { + // wait forever + Mutex lock("FileStore::flush::lock"); + Cond cond; + lock.Lock(); + while (true) + cond.Wait(lock); + ceph_abort(); + } + + if (m_filestore_journal_writeahead) { + if (journal) + journal->flush(); + dout(10) << __FUNC__ << ": draining ondisk finisher" << dendl; + for (vector::iterator it = ondisk_finishers.begin(); it != ondisk_finishers.end(); ++it) { + (*it)->wait_for_empty(); + } + } + + _flush_op_queue(); + dout(10) << __FUNC__ << ": complete" << dendl; +} + +/* + * sync_and_flush - make every queued write readable AND committed to disk + */ +void FileStore::sync_and_flush() +{ + dout(10) << __FUNC__ << dendl; + + if (m_filestore_journal_writeahead) { + if (journal) + journal->flush(); + _flush_op_queue(); + } else { + // includes m_filestore_journal_parallel + _flush_op_queue(); + sync(); + } + dout(10) << __FUNC__ << ": done" << dendl; +} + +int FileStore::flush_journal() +{ + dout(10) << __FUNC__ << dendl; + sync_and_flush(); + sync(); + return 0; +} + +int FileStore::snapshot(const string& name) +{ + dout(10) << __FUNC__ << ": " << name << dendl; + sync_and_flush(); + + if (!backend->can_checkpoint()) { + dout(0) << __FUNC__ << ": " << name << " failed, not supported" << dendl; + return -EOPNOTSUPP; + } + + char s[NAME_MAX]; + snprintf(s, sizeof(s), CLUSTER_SNAP_ITEM, name.c_str()); + + int r = backend->create_checkpoint(s, NULL); + if (r) { + derr << __FUNC__ << ": " << name << " failed: " << cpp_strerror(r) << dendl; + } + + return r; +} + +// ------------------------------- +// attributes + +int FileStore::_fgetattr(int fd, const char *name, bufferptr& bp) +{ + char val[CHAIN_XATTR_MAX_BLOCK_LEN]; + int l = chain_fgetxattr(fd, name, val, sizeof(val)); + if (l >= 0) { + bp = buffer::create(l); + memcpy(bp.c_str(), val, l); + } else if (l == -ERANGE) { + l = chain_fgetxattr(fd, name, 0, 0); + if (l > 0) { + bp = buffer::create(l); + l = chain_fgetxattr(fd, name, bp.c_str(), l); + } + } + assert(!m_filestore_fail_eio || l != -EIO); + return l; +} + +int FileStore::_fgetattrs(int fd, map& aset) +{ + // get attr list + char names1[100]; + int len = chain_flistxattr(fd, names1, sizeof(names1)-1); + char *names2 = 0; + char *name = 0; + if (len == -ERANGE) { + len = chain_flistxattr(fd, 0, 0); + if (len < 0) { + assert(!m_filestore_fail_eio || len != -EIO); + return len; + } + dout(10) << " -ERANGE, len is " << len << dendl; + names2 = new char[len+1]; + len = chain_flistxattr(fd, names2, len); + dout(10) << " -ERANGE, got " << len << dendl; + if (len < 0) { + assert(!m_filestore_fail_eio || len != -EIO); + delete[] names2; + return len; + } + name = names2; + } else if (len < 0) { + assert(!m_filestore_fail_eio || len != -EIO); + return len; + } else { + name = names1; + } + name[len] = 0; + + char *end = name + len; + while (name < end) { + char *attrname = name; + if (parse_attrname(&name)) { + if (*name) { + dout(20) << __FUNC__ << ": " << fd << " getting '" << name << "'" << dendl; + int r = _fgetattr(fd, attrname, aset[name]); + if (r < 0) { + delete[] names2; + return r; + } + } + } + name += strlen(name) + 1; + } + + delete[] names2; + return 0; +} + +int FileStore::_fsetattrs(int fd, map &aset) +{ + for (map::iterator p = aset.begin(); + p != aset.end(); + ++p) { + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); + const char *val; + if (p->second.length()) + val = 
p->second.c_str(); + else + val = ""; + // ??? Why do we skip setting all the other attrs if one fails? + int r = chain_fsetxattr(fd, n, val, p->second.length()); + if (r < 0) { + derr << __FUNC__ << ": chain_setxattr returned " << r << dendl; + return r; + } + } + return 0; +} + +// debug EIO injection +void FileStore::inject_data_error(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + dout(10) << __FUNC__ << ": init error on " << oid << dendl; + data_error_set.insert(oid); +} +void FileStore::inject_mdata_error(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + dout(10) << __FUNC__ << ": init error on " << oid << dendl; + mdata_error_set.insert(oid); +} + +void FileStore::debug_obj_on_delete(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + dout(10) << __FUNC__ << ": clear error on " << oid << dendl; + data_error_set.erase(oid); + mdata_error_set.erase(oid); +} +bool FileStore::debug_data_eio(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + if (data_error_set.count(oid)) { + dout(10) << __FUNC__ << ": inject error on " << oid << dendl; + return true; + } else { + return false; + } +} +bool FileStore::debug_mdata_eio(const ghobject_t &oid) { + Mutex::Locker l(read_error_lock); + if (mdata_error_set.count(oid)) { + dout(10) << __FUNC__ << ": inject error on " << oid << dendl; + return true; + } else { + return false; + } +} + + +// objects + +int FileStore::getattr(const coll_t& _cid, const ghobject_t& oid, const char *name, bufferptr &bp) +{ + tracepoint(objectstore, getattr_enter, _cid.c_str()); + const coll_t& cid = !_need_temp_object_collection(_cid, oid) ? _cid : _cid.get_temp(); + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl; + FDRef fd; + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); + r = _fgetattr(**fd, n, bp); + lfn_close(fd); + if (r == -ENODATA) { + map got; + set to_get; + to_get.insert(string(name)); + Index index; + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + goto out; + } + r = object_map->get_xattrs(oid, to_get, &got); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": get_xattrs err r =" << r << dendl; + goto out; + } + if (got.empty()) { + dout(10) << __FUNC__ << ": got.size() is 0" << dendl; + return -ENODATA; + } + bp = bufferptr(got.begin()->second.c_str(), + got.begin()->second.length()); + r = bp.length(); + } + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + if (cct->_conf->filestore_debug_inject_read_err && + debug_mdata_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, getattr_exit, r); + return r < 0 ? r : 0; + } +} + +int FileStore::getattrs(const coll_t& _cid, const ghobject_t& oid, map& aset) +{ + tracepoint(objectstore, getattrs_enter, _cid.c_str()); + const coll_t& cid = !_need_temp_object_collection(_cid, oid) ? 
_cid : _cid.get_temp(); + set omap_attrs; + map omap_aset; + Index index; + dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl; + FDRef fd; + bool spill_out = true; + char buf[2]; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) + spill_out = false; + + r = _fgetattrs(**fd, aset); + lfn_close(fd); + fd = FDRef(); // defensive + if (r < 0) { + goto out; + } + + if (!spill_out) { + dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl; + goto out; + } + + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + goto out; + } + { + r = object_map->get_all_xattrs(oid, &omap_attrs); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl; + goto out; + } + + r = object_map->get_xattrs(oid, omap_attrs, &omap_aset); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl; + goto out; + } + if (r == -ENOENT) + r = 0; + } + assert(omap_attrs.size() == omap_aset.size()); + for (map::iterator i = omap_aset.begin(); + i != omap_aset.end(); + ++i) { + string key(i->first); + aset.insert(make_pair(key, + bufferptr(i->second.c_str(), i->second.length()))); + } + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + + if (cct->_conf->filestore_debug_inject_read_err && + debug_mdata_eio(oid)) { + return -EIO; + } else { + tracepoint(objectstore, getattrs_exit, r); + return r; + } +} + +int FileStore::_setattrs(const coll_t& cid, const ghobject_t& oid, map& aset, + const SequencerPosition &spos) +{ + map omap_set; + set omap_remove; + map inline_set; + map inline_to_set; + FDRef fd; + int spill_out = -1; + bool incomplete_inline = false; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + char buf[2]; + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) + spill_out = 0; + else + spill_out = 1; + + r = _fgetattrs(**fd, inline_set); + incomplete_inline = (r == -E2BIG); + assert(!m_filestore_fail_eio || r != -EIO); + dout(15) << __FUNC__ << ": " << cid << "/" << oid + << (incomplete_inline ? 
" (incomplete_inline, forcing omap)" : "") + << dendl; + + for (map::iterator p = aset.begin(); + p != aset.end(); + ++p) { + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); + + if (incomplete_inline) { + chain_fremovexattr(**fd, n); // ignore any error + omap_set[p->first].push_back(p->second); + continue; + } + + if (p->second.length() > m_filestore_max_inline_xattr_size) { + if (inline_set.count(p->first)) { + inline_set.erase(p->first); + r = chain_fremovexattr(**fd, n); + if (r < 0) + goto out_close; + } + omap_set[p->first].push_back(p->second); + continue; + } + + if (!inline_set.count(p->first) && + inline_set.size() >= m_filestore_max_inline_xattrs) { + omap_set[p->first].push_back(p->second); + continue; + } + omap_remove.insert(p->first); + inline_set.insert(*p); + + inline_to_set.insert(*p); + } + + if (spill_out != 1 && !omap_set.empty()) { + chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_SPILL_OUT, + sizeof(XATTR_SPILL_OUT)); + } + + r = _fsetattrs(**fd, inline_to_set); + if (r < 0) + goto out_close; + + if (spill_out && !omap_remove.empty()) { + r = object_map->remove_xattrs(oid, omap_remove, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not remove_xattrs r = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + goto out_close; + } else { + r = 0; // don't confuse the debug output + } + } + + if (!omap_set.empty()) { + r = object_map->set_xattrs(oid, omap_set, &spos); + if (r < 0) { + dout(10) << __FUNC__ << ": could not set_xattrs r = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + goto out_close; + } + } + out_close: + lfn_close(fd); + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + return r; +} + + +int FileStore::_rmattr(const coll_t& cid, const ghobject_t& oid, const char *name, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "'" << dendl; + FDRef fd; + bool spill_out = true; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + char buf[2]; + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { + spill_out = false; + } + + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(name, n, CHAIN_XATTR_MAX_NAME_LEN); + r = chain_fremovexattr(**fd, n); + if (r == -ENODATA && spill_out) { + Index index; + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + goto out_close; + } + set to_remove; + to_remove.insert(string(name)); + r = object_map->remove_xattrs(oid, to_remove, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not remove_xattrs index r = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + goto out_close; + } + } + out_close: + lfn_close(fd); + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " '" << name << "' = " << r << dendl; + return r; +} + +int FileStore::_rmattrs(const coll_t& cid, const ghobject_t& oid, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << dendl; + + map aset; + FDRef fd; + set omap_attrs; + Index index; + bool spill_out = true; + + int r = lfn_open(cid, oid, false, &fd); + if (r < 0) { + goto out; + } + + char buf[2]; + r = chain_fgetxattr(**fd, XATTR_SPILL_OUT_NAME, buf, sizeof(buf)); + if (r >= 0 && !strncmp(buf, XATTR_NO_SPILL_OUT, sizeof(XATTR_NO_SPILL_OUT))) { + spill_out = 
false; + } + + r = _fgetattrs(**fd, aset); + if (r >= 0) { + for (map::iterator p = aset.begin(); p != aset.end(); ++p) { + char n[CHAIN_XATTR_MAX_NAME_LEN]; + get_attrname(p->first.c_str(), n, CHAIN_XATTR_MAX_NAME_LEN); + r = chain_fremovexattr(**fd, n); + if (r < 0) { + dout(10) << __FUNC__ << ": could not remove xattr r = " << r << dendl; + goto out_close; + } + } + } + + if (!spill_out) { + dout(10) << __FUNC__ << ": no xattr exists in object_map r = " << r << dendl; + goto out_close; + } + + r = get_index(cid, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": could not get index r = " << r << dendl; + goto out_close; + } + { + r = object_map->get_all_xattrs(oid, &omap_attrs); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not get omap_attrs r = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + goto out_close; + } + r = object_map->remove_xattrs(oid, omap_attrs, &spos); + if (r < 0 && r != -ENOENT) { + dout(10) << __FUNC__ << ": could not remove omap_attrs r = " << r << dendl; + goto out_close; + } + if (r == -ENOENT) + r = 0; + chain_fsetxattr(**fd, XATTR_SPILL_OUT_NAME, XATTR_NO_SPILL_OUT, + sizeof(XATTR_NO_SPILL_OUT)); + } + + out_close: + lfn_close(fd); + out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " = " << r << dendl; + return r; +} + + + + +int FileStore::_collection_remove_recursive(const coll_t &cid, + const SequencerPosition &spos) +{ + struct stat st; + int r = collection_stat(cid, &st); + if (r < 0) { + if (r == -ENOENT) + return 0; + return r; + } + + vector objects; + ghobject_t max; + while (!max.is_max()) { + r = collection_list(cid, max, ghobject_t::get_max(), + 300, &objects, &max); + if (r < 0) + return r; + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + assert(_check_replay_guard(cid, *i, spos)); + r = _remove(cid, *i, spos); + if (r < 0) + return r; + } + objects.clear(); + } + return _destroy_collection(cid); +} + +// -------------------------- +// collections + +int FileStore::list_collections(vector& ls) +{ + return list_collections(ls, false); +} + +int FileStore::list_collections(vector& ls, bool include_temp) +{ + tracepoint(objectstore, list_collections_enter); + dout(10) << __FUNC__ << dendl; + + char fn[PATH_MAX]; + snprintf(fn, sizeof(fn), "%s/current", basedir.c_str()); + + int r = 0; + DIR *dir = ::opendir(fn); + if (!dir) { + r = -errno; + derr << "tried opening directory " << fn << ": " << cpp_strerror(-r) << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + + struct dirent *de = nullptr; + while ((de = ::readdir(dir))) { + if (de->d_type == DT_UNKNOWN) { + // d_type not supported (non-ext[234], btrfs), must stat + struct stat sb; + char filename[PATH_MAX]; + snprintf(filename, sizeof(filename), "%s/%s", fn, de->d_name); + + r = ::stat(filename, &sb); + if (r < 0) { + r = -errno; + derr << "stat on " << filename << ": " << cpp_strerror(-r) << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + break; + } + if (!S_ISDIR(sb.st_mode)) { + continue; + } + } else if (de->d_type != DT_DIR) { + continue; + } + if (strcmp(de->d_name, "omap") == 0) { + continue; + } + if (de->d_name[0] == '.' && + (de->d_name[1] == '\0' || + (de->d_name[1] == '.' 
&& + de->d_name[2] == '\0'))) + continue; + coll_t cid; + if (!cid.parse(de->d_name)) { + derr << "ignoring invalid collection '" << de->d_name << "'" << dendl; + continue; + } + if (!cid.is_temp() || include_temp) + ls.push_back(cid); + } + + if (r > 0) { + derr << "trying readdir " << fn << ": " << cpp_strerror(r) << dendl; + r = -r; + } + + ::closedir(dir); + assert(!m_filestore_fail_eio || r != -EIO); + tracepoint(objectstore, list_collections_exit, r); + return r; +} + +int FileStore::collection_stat(const coll_t& c, struct stat *st) +{ + tracepoint(objectstore, collection_stat_enter, c.c_str()); + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << __FUNC__ << ": " << fn << dendl; + int r = ::stat(fn, st); + if (r < 0) + r = -errno; + dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + tracepoint(objectstore, collection_stat_exit, r); + return r; +} + +bool FileStore::collection_exists(const coll_t& c) +{ + tracepoint(objectstore, collection_exists_enter, c.c_str()); + struct stat st; + bool ret = collection_stat(c, &st) == 0; + tracepoint(objectstore, collection_exists_exit, ret); + return ret; +} + +int FileStore::collection_empty(const coll_t& c, bool *empty) +{ + tracepoint(objectstore, collection_empty_enter, c.c_str()); + dout(15) << __FUNC__ << ": " << c << dendl; + Index index; + int r = get_index(c, &index); + if (r < 0) { + derr << __FUNC__ << ": get_index returned: " << cpp_strerror(r) + << dendl; + return r; + } + + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + + vector ls; + r = index->collection_list_partial(ghobject_t(), ghobject_t::get_max(), + 1, &ls, NULL); + if (r < 0) { + derr << __FUNC__ << ": collection_list_partial returned: " + << cpp_strerror(r) << dendl; + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + *empty = ls.empty(); + tracepoint(objectstore, collection_empty_exit, *empty); + return 0; +} + +int FileStore::_collection_set_bits(const coll_t& c, int bits) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(10) << __FUNC__ << ": " << fn << " " << bits << dendl; + char n[PATH_MAX]; + int r; + int32_t v = bits; + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + r = -errno; + goto out; + } + get_attrname("bits", n, PATH_MAX); + r = chain_fsetxattr(fd, n, (char*)&v, sizeof(v)); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + out: + dout(10) << __FUNC__ << ": " << fn << " " << bits << " = " << r << dendl; + return r; +} + +int FileStore::collection_bits(const coll_t& c) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << __FUNC__ << ": " << fn << dendl; + int r; + char n[PATH_MAX]; + int32_t bits; + int fd = ::open(fn, O_RDONLY); + if (fd < 0) { + bits = r = -errno; + goto out; + } + get_attrname("bits", n, PATH_MAX); + r = chain_fgetxattr(fd, n, (char*)&bits, sizeof(bits)); + VOID_TEMP_FAILURE_RETRY(::close(fd)); + if (r < 0) { + bits = r; + goto out; + } + out: + dout(10) << __FUNC__ << ": " << fn << " = " << bits << dendl; + return bits; +} + +int FileStore::collection_list(const coll_t& c, + const ghobject_t& orig_start, + const ghobject_t& end, + int max, + vector *ls, ghobject_t *next) +{ + ghobject_t start = orig_start; + if (start.is_max()) + return 0; + + ghobject_t temp_next; + if (!next) + next = &temp_next; + // figure out the pool id. we need this in order to generate a + // meaningful 'next' value. 
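+ // (descriptive note, mirroring the derivation below: temp collections map to + // pool = -2 - pgid.pool(), pg collections to pgid.pool(), and the meta + // collection to -1; the shard is taken from the pgid.)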
+ int64_t pool = -1; + shard_id_t shard; + { + spg_t pgid; + if (c.is_temp(&pgid)) { + pool = -2 - pgid.pool(); + shard = pgid.shard; + } else if (c.is_pg(&pgid)) { + pool = pgid.pool(); + shard = pgid.shard; + } else if (c.is_meta()) { + pool = -1; + shard = shard_id_t::NO_SHARD; + } else { + // hrm, the caller is test code! we should get kill it off. for now, + // tolerate it. + pool = 0; + shard = shard_id_t::NO_SHARD; + } + dout(20) << __FUNC__ << ": pool is " << pool << " shard is " << shard + << " pgid " << pgid << dendl; + } + ghobject_t sep; + sep.hobj.pool = -1; + sep.set_shard(shard); + if (!c.is_temp() && !c.is_meta()) { + if (start < sep) { + dout(10) << __FUNC__ << ": first checking temp pool" << dendl; + coll_t temp = c.get_temp(); + int r = collection_list(temp, start, end, max, ls, next); + if (r < 0) + return r; + if (*next != ghobject_t::get_max()) + return r; + start = sep; + dout(10) << __FUNC__ << ": fall through to non-temp collection, start " + << start << dendl; + } else { + dout(10) << __FUNC__ << ": start " << start << " >= sep " << sep << dendl; + } + } + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + + r = index->collection_list_partial(start, end, max, ls, next); + + if (r < 0) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + dout(20) << "objects: " << *ls << dendl; + + // HashIndex doesn't know the pool when constructing a 'next' value + if (next && !next->is_max()) { + next->hobj.pool = pool; + next->set_shard(shard); + dout(20) << " next " << *next << dendl; + } + + return 0; +} + +int FileStore::omap_get(const coll_t& _c, const ghobject_t &hoid, + bufferlist *header, + map *out) +{ + tracepoint(objectstore, omap_get_enter, _c.c_str()); + const coll_t& c = !_need_temp_object_collection(_c, hoid) ? _c : _c.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->get(hoid, header, out); + if (r < 0 && r != -ENOENT) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + tracepoint(objectstore, omap_get_exit, 0); + return 0; +} + +int FileStore::omap_get_header( + const coll_t& _c, + const ghobject_t &hoid, + bufferlist *bl, + bool allow_eio) +{ + tracepoint(objectstore, omap_get_header_enter, _c.c_str()); + const coll_t& c = !_need_temp_object_collection(_c, hoid) ? _c : _c.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->get_header(hoid, bl); + if (r < 0 && r != -ENOENT) { + assert(allow_eio || !m_filestore_fail_eio || r != -EIO); + return r; + } + tracepoint(objectstore, omap_get_header_exit, 0); + return 0; +} + +int FileStore::omap_get_keys(const coll_t& _c, const ghobject_t &hoid, set *keys) +{ + tracepoint(objectstore, omap_get_keys_enter, _c.c_str()); + const coll_t& c = !_need_temp_object_collection(_c, hoid) ? 
_c : _c.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->get_keys(hoid, keys); + if (r < 0 && r != -ENOENT) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + tracepoint(objectstore, omap_get_keys_exit, 0); + return 0; +} + +int FileStore::omap_get_values(const coll_t& _c, const ghobject_t &hoid, + const set &keys, + map *out) +{ + tracepoint(objectstore, omap_get_values_enter, _c.c_str()); + const coll_t& c = !_need_temp_object_collection(_c, hoid) ? _c : _c.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + Index index; + const char *where = "()"; + int r = get_index(c, &index); + if (r < 0) { + where = " (get_index)"; + goto out; + } + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) { + where = " (lfn_find)"; + goto out; + } + } + r = object_map->get_values(hoid, keys, out); + if (r < 0 && r != -ENOENT) { + assert(!m_filestore_fail_eio || r != -EIO); + where = " (get_values)"; + goto out; + } + r = 0; + out: + tracepoint(objectstore, omap_get_values_exit, r); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << " = " << r + << where << dendl; + return r; +} + +int FileStore::omap_check_keys(const coll_t& _c, const ghobject_t &hoid, + const set &keys, + set *out) +{ + tracepoint(objectstore, omap_check_keys_enter, _c.c_str()); + const coll_t& c = !_need_temp_object_collection(_c, hoid) ? _c : _c.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + + Index index; + int r = get_index(c, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->check_keys(hoid, keys, out); + if (r < 0 && r != -ENOENT) { + assert(!m_filestore_fail_eio || r != -EIO); + return r; + } + tracepoint(objectstore, omap_check_keys_exit, 0); + return 0; +} + +ObjectMap::ObjectMapIterator FileStore::get_omap_iterator(const coll_t& _c, + const ghobject_t &hoid) +{ + tracepoint(objectstore, get_omap_iterator, _c.c_str()); + const coll_t& c = !_need_temp_object_collection(_c, hoid) ? 
_c : _c.get_temp(); + dout(15) << __FUNC__ << ": " << c << "/" << hoid << dendl; + Index index; + int r = get_index(c, &index); + if (r < 0) { + dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 " + << "(get_index failed with " << cpp_strerror(r) << ")" << dendl; + return ObjectMap::ObjectMapIterator(); + } + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) { + dout(10) << __FUNC__ << ": " << c << "/" << hoid << " = 0 " + << "(lfn_find failed with " << cpp_strerror(r) << ")" << dendl; + return ObjectMap::ObjectMapIterator(); + } + } + return object_map->get_iterator(hoid); +} + +int FileStore::_collection_hint_expected_num_objs(const coll_t& c, uint32_t pg_num, + uint64_t expected_num_objs, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": collection: " << c << " pg number: " + << pg_num << " expected number of objects: " << expected_num_objs << dendl; + + bool empty; + int ret = collection_empty(c, &empty); + if (ret < 0) + return ret; + if (!empty && !replaying) { + dout(0) << "Failed to give an expected number of objects hint to collection : " + << c << ", only empty collection can take such type of hint. " << dendl; + return 0; + } + + Index index; + ret = get_index(c, &index); + if (ret < 0) + return ret; + // Pre-hash the collection + ret = index->pre_hash_collection(pg_num, expected_num_objs); + dout(10) << "pre_hash_collection " << c << " = " << ret << dendl; + if (ret < 0) + return ret; + _set_replay_guard(c, spos); + + return 0; +} + +int FileStore::_create_collection( + const coll_t& c, + int bits, + const SequencerPosition &spos) +{ + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << __FUNC__ << ": " << fn << dendl; + int r = ::mkdir(fn, 0755); + if (r < 0) + r = -errno; + if (r == -EEXIST && replaying) + r = 0; + dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl; + + if (r < 0) + return r; + r = init_index(c); + if (r < 0) + return r; + r = _collection_set_bits(c, bits); + if (r < 0) + return r; + // create parallel temp collection, too + if (!c.is_meta() && !c.is_temp()) { + coll_t temp = c.get_temp(); + r = _create_collection(temp, 0, spos); + if (r < 0) + return r; + } + + _set_replay_guard(c, spos); + return 0; +} + +int FileStore::_destroy_collection(const coll_t& c) +{ + int r = 0; + char fn[PATH_MAX]; + get_cdir(c, fn, sizeof(fn)); + dout(15) << __FUNC__ << ": " << fn << dendl; + { + Index from; + r = get_index(c, &from); + if (r < 0) + goto out; + assert(NULL != from.index); + RWLock::WLocker l((from.index)->access_lock); + + r = from->prep_delete(); + if (r < 0) + goto out; + } + r = ::rmdir(fn); + if (r < 0) { + r = -errno; + goto out; + } + + out: + // destroy parallel temp collection, too + if (!c.is_meta() && !c.is_temp()) { + coll_t temp = c.get_temp(); + int r2 = _destroy_collection(temp); + if (r2 < 0) { + r = r2; + goto out_final; + } + } + + out_final: + dout(10) << __FUNC__ << ": " << fn << " = " << r << dendl; + return r; +} + + +int FileStore::_collection_add(const coll_t& c, const coll_t& oldcid, const ghobject_t& o, + const SequencerPosition& spos) +{ + dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << dendl; + + int dstcmp = _check_replay_guard(c, o, spos); + if (dstcmp < 0) + return 0; + + // check the src name too; it might have a newer guard, and we don't + // want to clobber it + int srccmp = _check_replay_guard(oldcid, o, spos); + if (srccmp < 0) + return 0; + + // open guard on object so we don't 
any previous operations on the + // new name that will modify the source inode. + FDRef fd; + int r = lfn_open(oldcid, o, 0, &fd); + if (r < 0) { + // the source collection/object does not exist. If we are replaying, we + // should be safe, so just return 0 and move on. + assert(replaying); + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " + << oldcid << "/" << o << " (dne, continue replay) " << dendl; + return 0; + } + if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress" + _set_replay_guard(**fd, spos, &o, true); + } + + r = lfn_link(oldcid, c, o, o); + if (replaying && !backend->can_checkpoint() && + r == -EEXIST) // crashed between link() and set_replay_guard() + r = 0; + + _inject_failure(); + + // close guard on object so we don't do this again + if (r == 0) { + _close_replay_guard(**fd, spos); + } + lfn_close(fd); + + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << o << " = " << r << dendl; + return r; +} + +int FileStore::_collection_move_rename(const coll_t& oldcid, const ghobject_t& oldoid, + coll_t c, const ghobject_t& o, + const SequencerPosition& spos, + bool allow_enoent) +{ + dout(15) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid << dendl; + int r = 0; + int dstcmp, srccmp; + + if (replaying) { + /* If the destination collection doesn't exist during replay, + * we need to delete the src object and continue on + */ + if (!collection_exists(c)) + goto out_rm_src; + } + + dstcmp = _check_replay_guard(c, o, spos); + if (dstcmp < 0) + goto out_rm_src; + + // check the src name too; it might have a newer guard, and we don't + // want to clobber it + srccmp = _check_replay_guard(oldcid, oldoid, spos); + if (srccmp < 0) + return 0; + + { + // open guard on object so we don't any previous operations on the + // new name that will modify the source inode. + FDRef fd; + r = lfn_open(oldcid, oldoid, 0, &fd); + if (r < 0) { + // the source collection/object does not exist. If we are replaying, we + // should be safe, so just return 0 and move on. + if (replaying) { + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " + << oldcid << "/" << oldoid << " (dne, continue replay) " << dendl; + } else if (allow_enoent) { + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " + << oldcid << "/" << oldoid << " (dne, ignoring enoent)" + << dendl; + } else { + assert(0 == "ERROR: source must exist"); + } + + if (!replaying) { + return 0; + } + if (allow_enoent && dstcmp > 0) { // if dstcmp == 0, try_rename was started. 
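+ // (still replaying here: with allow_enoent set and no in-progress guard on + // the destination, the missing source is tolerated and this becomes a no-op)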
+ return 0; + } + + r = 0; // don't know if object_map was cloned + } else { + if (dstcmp > 0) { // if dstcmp == 0 the guard already says "in-progress" + _set_replay_guard(**fd, spos, &o, true); + } + + r = lfn_link(oldcid, c, oldoid, o); + if (replaying && !backend->can_checkpoint() && + r == -EEXIST) // crashed between link() and set_replay_guard() + r = 0; + + lfn_close(fd); + fd = FDRef(); + + _inject_failure(); + } + + if (r == 0) { + // the name changed; link the omap content + r = object_map->rename(oldoid, o, &spos); + if (r == -ENOENT) + r = 0; + } + + _inject_failure(); + + if (r == 0) + r = lfn_unlink(oldcid, oldoid, spos, true); + + if (r == 0) + r = lfn_open(c, o, 0, &fd); + + // close guard on object so we don't do this again + if (r == 0) { + _close_replay_guard(**fd, spos, &o); + lfn_close(fd); + } + } + + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid + << " = " << r << dendl; + return r; + + out_rm_src: + // remove source + if (_check_replay_guard(oldcid, oldoid, spos) > 0) { + r = lfn_unlink(oldcid, oldoid, spos, true); + } + + dout(10) << __FUNC__ << ": " << c << "/" << o << " from " << oldcid << "/" << oldoid + << " = " << r << dendl; + return r; +} + +void FileStore::_inject_failure() +{ + if (m_filestore_kill_at) { + int final = --m_filestore_kill_at; + dout(5) << __FUNC__ << ": " << (final+1) << " -> " << final << dendl; + if (final == 0) { + derr << __FUNC__ << ": KILLING" << dendl; + cct->_log->flush(); + _exit(1); + } + } +} + +int FileStore::_omap_clear(const coll_t& cid, const ghobject_t &hoid, + const SequencerPosition &spos) { + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + r = object_map->clear_keys_header(hoid, &spos); + if (r < 0 && r != -ENOENT) + return r; + return 0; +} + +int FileStore::_omap_setkeys(const coll_t& cid, const ghobject_t &hoid, + const map &aset, + const SequencerPosition &spos) { + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl; + Index index; + int r; + //treat pgmeta as a logical object, skip to check exist + if (hoid.is_pgmeta()) + goto skip; + + r = get_index(cid, &index); + if (r < 0) { + dout(20) << __FUNC__ << ": get_index got " << cpp_strerror(r) << dendl; + return r; + } + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) { + dout(20) << __FUNC__ << ": lfn_find got " << cpp_strerror(r) << dendl; + return r; + } + } +skip: + if (g_conf->subsys.should_gather(ceph_subsys_filestore, 20)) { + for (auto& p : aset) { + dout(20) << __FUNC__ << ": set " << p.first << dendl; + } + } + r = object_map->set_keys(hoid, aset, &spos); + dout(20) << __FUNC__ << ": " << cid << "/" << hoid << " = " << r << dendl; + return r; +} + +int FileStore::_omap_rmkeys(const coll_t& cid, const ghobject_t &hoid, + const set &keys, + const SequencerPosition &spos) { + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl; + Index index; + int r; + //treat pgmeta as a logical object, skip to check exist + if (hoid.is_pgmeta()) + goto skip; + + r = get_index(cid, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } +skip: + r = object_map->rm_keys(hoid, keys, &spos); + if (r < 0 && r 
!= -ENOENT) + return r; + return 0; +} + +int FileStore::_omap_rmkeyrange(const coll_t& cid, const ghobject_t &hoid, + const string& first, const string& last, + const SequencerPosition &spos) { + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << " [" << first << "," << last << "]" << dendl; + set keys; + { + ObjectMap::ObjectMapIterator iter = get_omap_iterator(cid, hoid); + if (!iter) + return -ENOENT; + for (iter->lower_bound(first); iter->valid() && iter->key() < last; + iter->next()) { + keys.insert(iter->key()); + } + } + return _omap_rmkeys(cid, hoid, keys, spos); +} + +int FileStore::_omap_setheader(const coll_t& cid, const ghobject_t &hoid, + const bufferlist &bl, + const SequencerPosition &spos) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << hoid << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) + return r; + { + assert(NULL != index.index); + RWLock::RLocker l((index.index)->access_lock); + r = lfn_find(hoid, index); + if (r < 0) + return r; + } + return object_map->set_header(hoid, bl, &spos); +} + +int FileStore::_split_collection(const coll_t& cid, + uint32_t bits, + uint32_t rem, + coll_t dest, + const SequencerPosition &spos) +{ + int r; + { + dout(15) << __FUNC__ << ": " << cid << " bits: " << bits << dendl; + if (!collection_exists(cid)) { + dout(2) << __FUNC__ << ": " << cid << " DNE" << dendl; + assert(replaying); + return 0; + } + if (!collection_exists(dest)) { + dout(2) << __FUNC__ << ": " << dest << " DNE" << dendl; + assert(replaying); + return 0; + } + + int dstcmp = _check_replay_guard(dest, spos); + if (dstcmp < 0) + return 0; + + int srccmp = _check_replay_guard(cid, spos); + if (srccmp < 0) + return 0; + + _set_global_replay_guard(cid, spos); + _set_replay_guard(cid, spos, true); + _set_replay_guard(dest, spos, true); + + Index from; + r = get_index(cid, &from); + + Index to; + if (!r) + r = get_index(dest, &to); + + if (!r) { + assert(NULL != from.index); + RWLock::WLocker l1((from.index)->access_lock); + + assert(NULL != to.index); + RWLock::WLocker l2((to.index)->access_lock); + + r = from->split(rem, bits, to.index); + } + + _close_replay_guard(cid, spos); + _close_replay_guard(dest, spos); + } + _collection_set_bits(cid, bits); + if (!r && cct->_conf->filestore_debug_verify_split) { + vector objects; + ghobject_t next; + while (1) { + collection_list( + cid, + next, ghobject_t::get_max(), + get_ideal_list_max(), + &objects, + &next); + if (objects.empty()) + break; + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + dout(20) << __FUNC__ << ": " << *i << " still in source " + << cid << dendl; + assert(!i->match(bits, rem)); + } + objects.clear(); + } + next = ghobject_t(); + while (1) { + collection_list( + dest, + next, ghobject_t::get_max(), + get_ideal_list_max(), + &objects, + &next); + if (objects.empty()) + break; + for (vector::iterator i = objects.begin(); + i != objects.end(); + ++i) { + dout(20) << __FUNC__ << ": " << *i << " now in dest " + << *i << dendl; + assert(i->match(bits, rem)); + } + objects.clear(); + } + } + return r; +} + +int FileStore::_set_alloc_hint(const coll_t& cid, const ghobject_t& oid, + uint64_t expected_object_size, + uint64_t expected_write_size) +{ + dout(15) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << dendl; + + FDRef fd; + int ret = 0; + + if (expected_object_size == 0 || expected_write_size == 0) + goto out; + + ret = lfn_open(cid, oid, false, &fd); + if (ret < 0) + goto out; + + 
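+ // cap the hint at filestore_max_alloc_hint_size and hand it to the backend + // (on XFS this typically ends up as an extent size hint on the file)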
{ + // TODO: a more elaborate hint calculation + uint64_t hint = MIN(expected_write_size, m_filestore_max_alloc_hint_size); + + ret = backend->set_alloc_hint(**fd, hint); + dout(20) << __FUNC__ << ": hint " << hint << " ret " << ret << dendl; + } + + lfn_close(fd); +out: + dout(10) << __FUNC__ << ": " << cid << "/" << oid << " object_size " << expected_object_size << " write_size " << expected_write_size << " = " << ret << dendl; + assert(!m_filestore_fail_eio || ret != -EIO); + return ret; +} + +const char** FileStore::get_tracked_conf_keys() const +{ + static const char* KEYS[] = { + "filestore_max_inline_xattr_size", + "filestore_max_inline_xattr_size_xfs", + "filestore_max_inline_xattr_size_btrfs", + "filestore_max_inline_xattr_size_other", + "filestore_max_inline_xattrs", + "filestore_max_inline_xattrs_xfs", + "filestore_max_inline_xattrs_btrfs", + "filestore_max_inline_xattrs_other", + "filestore_max_xattr_value_size", + "filestore_max_xattr_value_size_xfs", + "filestore_max_xattr_value_size_btrfs", + "filestore_max_xattr_value_size_other", + "filestore_min_sync_interval", + "filestore_max_sync_interval", + "filestore_queue_max_ops", + "filestore_queue_max_bytes", + "filestore_expected_throughput_bytes", + "filestore_expected_throughput_ops", + "filestore_queue_low_threshhold", + "filestore_queue_high_threshhold", + "filestore_queue_high_delay_multiple", + "filestore_queue_max_delay_multiple", + "filestore_commit_timeout", + "filestore_dump_file", + "filestore_kill_at", + "filestore_fail_eio", + "filestore_fadvise", + "filestore_sloppy_crc", + "filestore_sloppy_crc_block_size", + "filestore_max_alloc_hint_size", + NULL + }; + return KEYS; +} + +void FileStore::handle_conf_change(const struct md_config_t *conf, + const std::set &changed) +{ + if (changed.count("filestore_max_inline_xattr_size") || + changed.count("filestore_max_inline_xattr_size_xfs") || + changed.count("filestore_max_inline_xattr_size_btrfs") || + changed.count("filestore_max_inline_xattr_size_other") || + changed.count("filestore_max_inline_xattrs") || + changed.count("filestore_max_inline_xattrs_xfs") || + changed.count("filestore_max_inline_xattrs_btrfs") || + changed.count("filestore_max_inline_xattrs_other") || + changed.count("filestore_max_xattr_value_size") || + changed.count("filestore_max_xattr_value_size_xfs") || + changed.count("filestore_max_xattr_value_size_btrfs") || + changed.count("filestore_max_xattr_value_size_other")) { + if (backend) { + Mutex::Locker l(lock); + set_xattr_limits_via_conf(); + } + } + + if (changed.count("filestore_queue_max_bytes") || + changed.count("filestore_queue_max_ops") || + changed.count("filestore_expected_throughput_bytes") || + changed.count("filestore_expected_throughput_ops") || + changed.count("filestore_queue_low_threshhold") || + changed.count("filestore_queue_high_threshhold") || + changed.count("filestore_queue_high_delay_multiple") || + changed.count("filestore_queue_max_delay_multiple")) { + Mutex::Locker l(lock); + set_throttle_params(); + } + + if (changed.count("filestore_min_sync_interval") || + changed.count("filestore_max_sync_interval") || + changed.count("filestore_kill_at") || + changed.count("filestore_fail_eio") || + changed.count("filestore_sloppy_crc") || + changed.count("filestore_sloppy_crc_block_size") || + changed.count("filestore_max_alloc_hint_size") || + changed.count("filestore_fadvise")) { + Mutex::Locker l(lock); + m_filestore_min_sync_interval = conf->filestore_min_sync_interval; + m_filestore_max_sync_interval = 
conf->filestore_max_sync_interval; + m_filestore_kill_at = conf->filestore_kill_at; + m_filestore_fail_eio = conf->filestore_fail_eio; + m_filestore_fadvise = conf->filestore_fadvise; + m_filestore_sloppy_crc = conf->filestore_sloppy_crc; + m_filestore_sloppy_crc_block_size = conf->filestore_sloppy_crc_block_size; + m_filestore_max_alloc_hint_size = conf->filestore_max_alloc_hint_size; + } + if (changed.count("filestore_commit_timeout")) { + Mutex::Locker l(sync_entry_timeo_lock); + m_filestore_commit_timeout = conf->filestore_commit_timeout; + } + if (changed.count("filestore_dump_file")) { + if (conf->filestore_dump_file.length() && + conf->filestore_dump_file != "-") { + dump_start(conf->filestore_dump_file); + } else { + dump_stop(); + } + } +} + +int FileStore::set_throttle_params() +{ + stringstream ss; + bool valid = throttle_bytes.set_params( + cct->_conf->filestore_queue_low_threshhold, + cct->_conf->filestore_queue_high_threshhold, + cct->_conf->filestore_expected_throughput_bytes, + cct->_conf->filestore_queue_high_delay_multiple, + cct->_conf->filestore_queue_max_delay_multiple, + cct->_conf->filestore_queue_max_bytes, + &ss); + + valid &= throttle_ops.set_params( + cct->_conf->filestore_queue_low_threshhold, + cct->_conf->filestore_queue_high_threshhold, + cct->_conf->filestore_expected_throughput_ops, + cct->_conf->filestore_queue_high_delay_multiple, + cct->_conf->filestore_queue_max_delay_multiple, + cct->_conf->filestore_queue_max_ops, + &ss); + + logger->set(l_filestore_op_queue_max_ops, throttle_ops.get_max()); + logger->set(l_filestore_op_queue_max_bytes, throttle_bytes.get_max()); + + if (!valid) { + derr << "tried to set invalid params: " + << ss.str() + << dendl; + } + return valid ? 0 : -EINVAL; +} + +void FileStore::dump_start(const std::string& file) +{ + dout(10) << __FUNC__ << ": " << file << dendl; + if (m_filestore_do_dump) { + dump_stop(); + } + m_filestore_dump_fmt.reset(); + m_filestore_dump_fmt.open_array_section("dump"); + m_filestore_dump.open(file.c_str()); + m_filestore_do_dump = true; +} + +void FileStore::dump_stop() +{ + dout(10) << __FUNC__ << dendl; + m_filestore_do_dump = false; + if (m_filestore_dump.is_open()) { + m_filestore_dump_fmt.close_section(); + m_filestore_dump_fmt.flush(m_filestore_dump); + m_filestore_dump.flush(); + m_filestore_dump.close(); + } +} + +void FileStore::dump_transactions(vector& ls, uint64_t seq, OpSequencer *osr) +{ + m_filestore_dump_fmt.open_array_section("transactions"); + unsigned trans_num = 0; + for (vector::iterator i = ls.begin(); i != ls.end(); ++i, ++trans_num) { + m_filestore_dump_fmt.open_object_section("transaction"); + m_filestore_dump_fmt.dump_string("osr", osr->get_name()); + m_filestore_dump_fmt.dump_unsigned("seq", seq); + m_filestore_dump_fmt.dump_unsigned("trans_num", trans_num); + (*i).dump(&m_filestore_dump_fmt); + m_filestore_dump_fmt.close_section(); + } + m_filestore_dump_fmt.close_section(); + m_filestore_dump_fmt.flush(m_filestore_dump); + m_filestore_dump.flush(); +} + +void FileStore::set_xattr_limits_via_conf() +{ + uint32_t fs_xattr_size; + uint32_t fs_xattrs; + uint32_t fs_xattr_max_value_size; + + switch (m_fs_type) { +#if defined(__linux__) + case XFS_SUPER_MAGIC: + fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_xfs; + fs_xattrs = cct->_conf->filestore_max_inline_xattrs_xfs; + fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_xfs; + break; + case BTRFS_SUPER_MAGIC: + fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_btrfs; + fs_xattrs = 
cct->_conf->filestore_max_inline_xattrs_btrfs; + fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_btrfs; + break; +#endif + default: + fs_xattr_size = cct->_conf->filestore_max_inline_xattr_size_other; + fs_xattrs = cct->_conf->filestore_max_inline_xattrs_other; + fs_xattr_max_value_size = cct->_conf->filestore_max_xattr_value_size_other; + break; + } + + // Use override value if set + if (cct->_conf->filestore_max_inline_xattr_size) + m_filestore_max_inline_xattr_size = cct->_conf->filestore_max_inline_xattr_size; + else + m_filestore_max_inline_xattr_size = fs_xattr_size; + + // Use override value if set + if (cct->_conf->filestore_max_inline_xattrs) + m_filestore_max_inline_xattrs = cct->_conf->filestore_max_inline_xattrs; + else + m_filestore_max_inline_xattrs = fs_xattrs; + + // Use override value if set + if (cct->_conf->filestore_max_xattr_value_size) + m_filestore_max_xattr_value_size = cct->_conf->filestore_max_xattr_value_size; + else + m_filestore_max_xattr_value_size = fs_xattr_max_value_size; + + if (m_filestore_max_xattr_value_size < cct->_conf->osd_max_object_name_len) { + derr << "WARNING: max attr value size (" + << m_filestore_max_xattr_value_size + << ") is smaller than osd_max_object_name_len (" + << cct->_conf->osd_max_object_name_len + << "). Your backend filesystem appears to not support attrs large " + << "enough to handle the configured max rados name size. You may get " + << "unexpected ENAMETOOLONG errors on rados operations or buggy " + << "behavior" + << dendl; + } +} + +uint64_t FileStore::estimate_objects_overhead(uint64_t num_objects) +{ + uint64_t res = num_objects * blk_size / 2; //assumes that each object uses ( in average ) additional 1/2 block due to FS allocation granularity. + return res; +} + +int FileStore::apply_layout_settings(const coll_t &cid) +{ + dout(20) << __FUNC__ << ": " << cid << dendl; + Index index; + int r = get_index(cid, &index); + if (r < 0) { + dout(10) << "Error getting index for " << cid << ": " << cpp_strerror(r) + << dendl; + return r; + } + + return index->apply_layout_settings(); +} + + +// -- FSSuperblock -- + +void FSSuperblock::encode(bufferlist &bl) const +{ + ENCODE_START(2, 1, bl); + compat_features.encode(bl); + ::encode(omap_backend, bl); + ENCODE_FINISH(bl); +} + +void FSSuperblock::decode(bufferlist::iterator &bl) +{ + DECODE_START(2, bl); + compat_features.decode(bl); + if (struct_v >= 2) + ::decode(omap_backend, bl); + else + omap_backend = "leveldb"; + DECODE_FINISH(bl); +} + +void FSSuperblock::dump(Formatter *f) const +{ + f->open_object_section("compat"); + compat_features.dump(f); + f->dump_string("omap_backend", omap_backend); + f->close_section(); +} + +void FSSuperblock::generate_test_instances(list& o) +{ + FSSuperblock z; + o.push_back(new FSSuperblock(z)); + CompatSet::FeatureSet feature_compat; + CompatSet::FeatureSet feature_ro_compat; + CompatSet::FeatureSet feature_incompat; + feature_incompat.insert(CEPH_FS_FEATURE_INCOMPAT_SHARDS); + z.compat_features = CompatSet(feature_compat, feature_ro_compat, + feature_incompat); + o.push_back(new FSSuperblock(z)); + z.omap_backend = "rocksdb"; + o.push_back(new FSSuperblock(z)); +}