X-Git-Url: https://gerrit.opnfv.org/gerrit/gitweb?a=blobdiff_plain;f=src%2Fceph%2Fsrc%2Fmds%2FMDCache.h;fp=src%2Fceph%2Fsrc%2Fmds%2FMDCache.h;h=0000000000000000000000000000000000000000;hb=7da45d65be36d36b880cc55c5036e96c24b53f00;hp=61a170bf6fd57b60baae3a613b89c0a9770afdce;hpb=691462d09d0987b47e112d6ee8740375df3c51b2;p=stor4nfv.git diff --git a/src/ceph/src/mds/MDCache.h b/src/ceph/src/mds/MDCache.h deleted file mode 100644 index 61a170b..0000000 --- a/src/ceph/src/mds/MDCache.h +++ /dev/null @@ -1,1227 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - - - -#ifndef CEPH_MDCACHE_H -#define CEPH_MDCACHE_H - -#include "include/types.h" -#include "include/filepath.h" -#include "include/elist.h" - -#include "osdc/Filer.h" -#include "CInode.h" -#include "CDentry.h" -#include "CDir.h" -#include "include/Context.h" -#include "events/EMetaBlob.h" -#include "RecoveryQueue.h" -#include "StrayManager.h" -#include "MDSContext.h" -#include "MDSMap.h" -#include "Mutation.h" - -#include "messages/MClientRequest.h" -#include "messages/MMDSSlaveRequest.h" - -class PerfCounters; - -class MDSRank; -class Session; -class Migrator; - -class Message; -class Session; - -class MMDSResolve; -class MMDSResolveAck; -class MMDSCacheRejoin; -class MDiscover; -class MDiscoverReply; -class MCacheExpire; -class MDirUpdate; -class MDentryLink; -class MDentryUnlink; -class MLock; -struct MMDSFindIno; -struct MMDSFindInoReply; -struct MMDSOpenIno; -struct MMDSOpenInoReply; - -class Message; -class MClientRequest; -class MMDSSlaveRequest; -struct MClientSnap; - -class MMDSFragmentNotify; - -class ESubtreeMap; - -enum { - l_mdc_first = 3000, - // How many inodes currently in stray dentries - l_mdc_num_strays, - // How many stray dentries are currently delayed for purge due to refs - l_mdc_num_strays_delayed, - // How many stray dentries are currently being enqueued for purge - l_mdc_num_strays_enqueuing, - - // How many dentries have ever been added to stray dir - l_mdc_strays_created, - // How many dentries have been passed on to PurgeQueue - l_mdc_strays_enqueued, - // How many strays have been reintegrated? - l_mdc_strays_reintegrated, - // How many strays have been migrated? - l_mdc_strays_migrated, - - // How many inode sizes currently being recovered - l_mdc_num_recovering_processing, - // How many inodes currently waiting to have size recovered - l_mdc_num_recovering_enqueued, - // How many inodes waiting with elevated priority for recovery - l_mdc_num_recovering_prioritized, - // How many inodes ever started size recovery - l_mdc_recovery_started, - // How many inodes ever completed size recovery - l_mdc_recovery_completed, - - l_mdss_ireq_enqueue_scrub, - l_mdss_ireq_exportdir, - l_mdss_ireq_flush, - l_mdss_ireq_fragmentdir, - l_mdss_ireq_fragstats, - l_mdss_ireq_inodestats, - - l_mdc_last, -}; - - -// flags for predirty_journal_parents() -static const int PREDIRTY_PRIMARY = 1; // primary dn, adjust nested accounting -static const int PREDIRTY_DIR = 2; // update parent dir mtime/size -static const int PREDIRTY_SHALLOW = 4; // only go to immediate parent (for easier rollback) - -class MDCache { - public: - // my master - MDSRank *mds; - - // -- my cache -- - LRU lru; // dentry lru for expiring items from cache - LRU bottom_lru; // dentries that should be trimmed ASAP - protected: - ceph::unordered_map inode_map; // map of inodes by ino - CInode *root; // root inode - CInode *myin; // .ceph/mds%d dir - - bool readonly; - void set_readonly() { readonly = true; } - - CInode *strays[NUM_STRAY]; // my stray dir - int stray_index; - - CInode *get_stray() { - return strays[stray_index]; - } - - set base_inodes; - - std::unique_ptr logger; - - Filer filer; - - bool exceeded_size_limit; - -public: - static uint64_t cache_limit_inodes(void) { - return g_conf->get_val("mds_cache_size"); - } - static uint64_t cache_limit_memory(void) { - return g_conf->get_val("mds_cache_memory_limit"); - } - static double cache_reservation(void) { - return g_conf->get_val("mds_cache_reservation"); - } - static double cache_mid(void) { - return g_conf->get_val("mds_cache_mid"); - } - static double cache_health_threshold(void) { - return g_conf->get_val("mds_health_cache_threshold"); - } - double cache_toofull_ratio(void) const { - uint64_t inode_limit = cache_limit_inodes(); - double inode_reserve = inode_limit*(1.0-cache_reservation()); - double memory_reserve = cache_limit_memory()*(1.0-cache_reservation()); - return fmax(0.0, fmax((cache_size()-memory_reserve)/memory_reserve, inode_limit == 0 ? 0.0 : (CInode::count()-inode_reserve)/inode_reserve)); - } - bool cache_toofull(void) const { - return cache_toofull_ratio() > 0.0; - } - uint64_t cache_size(void) const { - return mempool::get_pool(mempool::mds_co::id).allocated_bytes(); - } - bool cache_overfull(void) const { - uint64_t inode_limit = cache_limit_inodes(); - return (inode_limit > 0 && CInode::count() > inode_limit*cache_health_threshold()) || (cache_size() > cache_limit_memory()*cache_health_threshold()); - } - - void advance_stray() { - stray_index = (stray_index+1)%NUM_STRAY; - } - - void activate_stray_manager(); - - /** - * Call this when you know that a CDentry is ready to be passed - * on to StrayManager (i.e. this is a stray you've just created) - */ - void notify_stray(CDentry *dn) { - assert(dn->get_dir()->get_inode()->is_stray()); - stray_manager.eval_stray(dn); - } - - void maybe_eval_stray(CInode *in, bool delay=false); - void clear_dirty_bits_for_stray(CInode* diri); - - bool is_readonly() { return readonly; } - void force_readonly(); - - DecayRate decayrate; - - int num_inodes_with_caps; - - unsigned max_dir_commit_size; - - static file_layout_t gen_default_file_layout(const MDSMap &mdsmap); - static file_layout_t gen_default_log_layout(const MDSMap &mdsmap); - - file_layout_t default_file_layout; - file_layout_t default_log_layout; - - void register_perfcounters(); - - // -- client leases -- -public: - static const int client_lease_pools = 3; - float client_lease_durations[client_lease_pools]; -protected: - xlist client_leases[client_lease_pools]; -public: - void touch_client_lease(ClientLease *r, int pool, utime_t ttl) { - client_leases[pool].push_back(&r->item_lease); - r->ttl = ttl; - } - - void notify_stray_removed() - { - stray_manager.notify_stray_removed(); - } - - void notify_stray_created() - { - stray_manager.notify_stray_created(); - } - - void eval_remote(CDentry *dn) - { - stray_manager.eval_remote(dn); - } - - // -- client caps -- - uint64_t last_cap_id; - - - - // -- discover -- - struct discover_info_t { - ceph_tid_t tid; - mds_rank_t mds; - inodeno_t ino; - frag_t frag; - snapid_t snap; - filepath want_path; - CInode *basei; - bool want_base_dir; - bool want_xlocked; - - discover_info_t() : - tid(0), mds(-1), snap(CEPH_NOSNAP), basei(NULL), - want_base_dir(false), want_xlocked(false) {} - ~discover_info_t() { - if (basei) - basei->put(MDSCacheObject::PIN_DISCOVERBASE); - } - void pin_base(CInode *b) { - basei = b; - basei->get(MDSCacheObject::PIN_DISCOVERBASE); - } - }; - - map discovers; - ceph_tid_t discover_last_tid; - - void _send_discover(discover_info_t& dis); - discover_info_t& _create_discover(mds_rank_t mds) { - ceph_tid_t t = ++discover_last_tid; - discover_info_t& d = discovers[t]; - d.tid = t; - d.mds = mds; - return d; - } - - // waiters - map > > waiting_for_base_ino; - - void discover_base_ino(inodeno_t want_ino, MDSInternalContextBase *onfinish, mds_rank_t from=MDS_RANK_NONE); - void discover_dir_frag(CInode *base, frag_t approx_fg, MDSInternalContextBase *onfinish, - mds_rank_t from=MDS_RANK_NONE); - void discover_path(CInode *base, snapid_t snap, filepath want_path, MDSInternalContextBase *onfinish, - bool want_xlocked=false, mds_rank_t from=MDS_RANK_NONE); - void discover_path(CDir *base, snapid_t snap, filepath want_path, MDSInternalContextBase *onfinish, - bool want_xlocked=false); - void kick_discovers(mds_rank_t who); // after a failure. - - - // -- subtrees -- -protected: - /* subtree keys and each tree's non-recursive nested subtrees (the "bounds") */ - map > subtrees; - map > > projected_subtree_renames; // renamed ino -> target dir - - // adjust subtree auth specification - // dir->dir_auth - // imports/exports/nested_exports - // join/split subtrees as appropriate -public: - bool is_subtrees() { return !subtrees.empty(); } - void list_subtrees(list& ls); - void adjust_subtree_auth(CDir *root, mds_authority_t auth); - void adjust_subtree_auth(CDir *root, mds_rank_t a, mds_rank_t b=CDIR_AUTH_UNKNOWN) { - adjust_subtree_auth(root, mds_authority_t(a,b)); - } - void adjust_bounded_subtree_auth(CDir *dir, set& bounds, mds_authority_t auth); - void adjust_bounded_subtree_auth(CDir *dir, set& bounds, mds_rank_t a) { - adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN)); - } - void adjust_bounded_subtree_auth(CDir *dir, vector& bounds, mds_authority_t auth); - void adjust_bounded_subtree_auth(CDir *dir, vector& bounds, mds_rank_t a) { - adjust_bounded_subtree_auth(dir, bounds, mds_authority_t(a, CDIR_AUTH_UNKNOWN)); - } - void map_dirfrag_set(list& dfs, set& result); - void try_subtree_merge(CDir *root); - void try_subtree_merge_at(CDir *root, set *to_eval); - void subtree_merge_writebehind_finish(CInode *in, MutationRef& mut); - void eval_subtree_root(CInode *diri); - CDir *get_subtree_root(CDir *dir); - CDir *get_projected_subtree_root(CDir *dir); - bool is_leaf_subtree(CDir *dir) { - assert(subtrees.count(dir)); - return subtrees[dir].empty(); - } - void remove_subtree(CDir *dir); - bool is_subtree(CDir *root) { - return subtrees.count(root); - } - void get_subtree_bounds(CDir *root, set& bounds); - void get_wouldbe_subtree_bounds(CDir *root, set& bounds); - void verify_subtree_bounds(CDir *root, const set& bounds); - void verify_subtree_bounds(CDir *root, const list& bounds); - - void project_subtree_rename(CInode *diri, CDir *olddir, CDir *newdir); - void adjust_subtree_after_rename(CInode *diri, CDir *olddir, bool pop); - - void get_auth_subtrees(set& s); - void get_fullauth_subtrees(set& s); - - int num_subtrees(); - int num_subtrees_fullauth(); - int num_subtrees_fullnonauth(); - - -protected: - // delayed cache expire - map > delayed_expire; // subtree root -> expire msg - - - // -- requests -- - ceph::unordered_map active_requests; - -public: - int get_num_client_requests(); - - MDRequestRef request_start(MClientRequest *req); - MDRequestRef request_start_slave(metareqid_t rid, __u32 attempt, Message *m); - MDRequestRef request_start_internal(int op); - bool have_request(metareqid_t rid) { - return active_requests.count(rid); - } - MDRequestRef request_get(metareqid_t rid); - void request_pin_ref(MDRequestRef& r, CInode *ref, vector& trace); - void request_finish(MDRequestRef& mdr); - void request_forward(MDRequestRef& mdr, mds_rank_t mds, int port=0); - void dispatch_request(MDRequestRef& mdr); - void request_drop_foreign_locks(MDRequestRef& mdr); - void request_drop_non_rdlocks(MDRequestRef& r); - void request_drop_locks(MDRequestRef& r); - void request_cleanup(MDRequestRef& r); - - void request_kill(MDRequestRef& r); // called when session closes - - // journal/snap helpers - CInode *pick_inode_snap(CInode *in, snapid_t follows); - CInode *cow_inode(CInode *in, snapid_t last); - void journal_cow_dentry(MutationImpl *mut, EMetaBlob *metablob, CDentry *dn, - snapid_t follows=CEPH_NOSNAP, - CInode **pcow_inode=0, CDentry::linkage_t *dnl=0); - void journal_cow_inode(MutationRef& mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP, - CInode **pcow_inode=0); - void journal_dirty_inode(MutationImpl *mut, EMetaBlob *metablob, CInode *in, snapid_t follows=CEPH_NOSNAP); - - void project_rstat_inode_to_frag(CInode *cur, CDir *parent, snapid_t first, - int linkunlink, SnapRealm *prealm); - void _project_rstat_inode_to_frag(inode_t& inode, snapid_t ofirst, snapid_t last, - CDir *parent, int linkunlink, bool update_inode); - void project_rstat_frag_to_inode(nest_info_t& rstat, nest_info_t& accounted_rstat, - snapid_t ofirst, snapid_t last, - CInode *pin, bool cow_head); - void broadcast_quota_to_client(CInode *in); - void predirty_journal_parents(MutationRef mut, EMetaBlob *blob, - CInode *in, CDir *parent, - int flags, int linkunlink=0, - snapid_t follows=CEPH_NOSNAP); - - // slaves - void add_uncommitted_master(metareqid_t reqid, LogSegment *ls, set &slaves, bool safe=false) { - uncommitted_masters[reqid].ls = ls; - uncommitted_masters[reqid].slaves = slaves; - uncommitted_masters[reqid].safe = safe; - } - void wait_for_uncommitted_master(metareqid_t reqid, MDSInternalContextBase *c) { - uncommitted_masters[reqid].waiters.push_back(c); - } - bool have_uncommitted_master(metareqid_t reqid, mds_rank_t from) { - auto p = uncommitted_masters.find(reqid); - return p != uncommitted_masters.end() && p->second.slaves.count(from) > 0; - } - void log_master_commit(metareqid_t reqid); - void logged_master_update(metareqid_t reqid); - void _logged_master_commit(metareqid_t reqid); - void committed_master_slave(metareqid_t r, mds_rank_t from); - void finish_committed_masters(); - - void _logged_slave_commit(mds_rank_t from, metareqid_t reqid); - - // -- recovery -- -protected: - set recovery_set; - -public: - void set_recovery_set(set& s); - void handle_mds_failure(mds_rank_t who); - void handle_mds_recovery(mds_rank_t who); - -protected: - // [resolve] - // from EImportStart w/o EImportFinish during journal replay - map > my_ambiguous_imports; - // from MMDSResolves - map > > other_ambiguous_imports; - - map > uncommitted_slave_updates; // slave: for replay. - map uncommitted_slave_rename_olddir; // slave: preserve the non-auth dir until seeing commit. - map uncommitted_slave_unlink; // slave: preserve the unlinked inode until seeing commit. - - // track master requests whose slaves haven't acknowledged commit - struct umaster { - set slaves; - LogSegment *ls; - list waiters; - bool safe; - bool committing; - bool recovering; - umaster() : ls(NULL), safe(false), committing(false), recovering(false) {} - }; - map uncommitted_masters; // master: req -> slave set - - set pending_masters; - map > ambiguous_slave_updates; - - friend class ESlaveUpdate; - friend class ECommitted; - - bool resolves_pending; - set resolve_gather; // nodes i need resolves from - set resolve_ack_gather; // nodes i need a resolve_ack from - map need_resolve_rollback; // rollbacks i'm writing to the journal - map delayed_resolve; - - void handle_resolve(MMDSResolve *m); - void handle_resolve_ack(MMDSResolveAck *m); - void process_delayed_resolve(); - void discard_delayed_resolve(mds_rank_t who); - void maybe_resolve_finish(); - void disambiguate_my_imports(); - void disambiguate_other_imports(); - void trim_unlinked_inodes(); - void add_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master, MDSlaveUpdate*); - void finish_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master); - MDSlaveUpdate* get_uncommitted_slave_update(metareqid_t reqid, mds_rank_t master); -public: - void recalc_auth_bits(bool replay); - void remove_inode_recursive(CInode *in); - - bool is_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) { - auto p = ambiguous_slave_updates.find(master); - return p != ambiguous_slave_updates.end() && p->second.count(reqid); - } - void add_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) { - ambiguous_slave_updates[master].insert(reqid); - } - void remove_ambiguous_slave_update(metareqid_t reqid, mds_rank_t master) { - auto p = ambiguous_slave_updates.find(master); - auto q = p->second.find(reqid); - assert(q != p->second.end()); - p->second.erase(q); - if (p->second.empty()) - ambiguous_slave_updates.erase(p); - } - - void add_rollback(metareqid_t reqid, mds_rank_t master) { - need_resolve_rollback[reqid] = master; - } - void finish_rollback(metareqid_t reqid); - - // ambiguous imports - void add_ambiguous_import(dirfrag_t base, const vector& bounds); - void add_ambiguous_import(CDir *base, const set& bounds); - bool have_ambiguous_import(dirfrag_t base) { - return my_ambiguous_imports.count(base); - } - void get_ambiguous_import_bounds(dirfrag_t base, vector& bounds) { - assert(my_ambiguous_imports.count(base)); - bounds = my_ambiguous_imports[base]; - } - void cancel_ambiguous_import(CDir *); - void finish_ambiguous_import(dirfrag_t dirino); - void resolve_start(MDSInternalContext *resolve_done_); - void send_resolves(); - void send_slave_resolves(); - void send_subtree_resolves(); - void maybe_send_pending_resolves() { - if (resolves_pending) - send_subtree_resolves(); - } - - void _move_subtree_map_bound(dirfrag_t df, dirfrag_t oldparent, dirfrag_t newparent, - map >& subtrees); - ESubtreeMap *create_subtree_map(); - - - void clean_open_file_lists(); - -protected: - // [rejoin] - bool rejoins_pending; - set rejoin_gather; // nodes from whom i need a rejoin - set rejoin_sent; // nodes i sent a rejoin to - set rejoin_ack_sent; // nodes i sent a rejoin to - set rejoin_ack_gather; // nodes from whom i need a rejoin ack - map > > rejoin_imported_caps; - map > > rejoin_slave_exports; - map rejoin_client_map; - - map > cap_exports; // ino -> client -> capex - map cap_export_targets; // ino -> auth mds - - map > > cap_imports; // ino -> client -> frommds -> capex - set cap_imports_missing; - map > cap_reconnect_waiters; - int cap_imports_num_opening; - - set rejoin_undef_inodes; - set rejoin_potential_updated_scatterlocks; - set rejoin_undef_dirfrags; - map > rejoin_unlinked_inodes; - - vector rejoin_recover_q, rejoin_check_q; - list rejoin_eval_locks; - list rejoin_waiters; - - void rejoin_walk(CDir *dir, MMDSCacheRejoin *rejoin); - void handle_cache_rejoin(MMDSCacheRejoin *m); - void handle_cache_rejoin_weak(MMDSCacheRejoin *m); - CInode* rejoin_invent_inode(inodeno_t ino, snapid_t last); - CDir* rejoin_invent_dirfrag(dirfrag_t df); - void handle_cache_rejoin_strong(MMDSCacheRejoin *m); - void rejoin_scour_survivor_replicas(mds_rank_t from, MMDSCacheRejoin *ack, - set& acked_inodes, - set& gather_locks); - void handle_cache_rejoin_ack(MMDSCacheRejoin *m); - void rejoin_send_acks(); - void rejoin_trim_undef_inodes(); - void maybe_send_pending_rejoins() { - if (rejoins_pending) - rejoin_send_rejoins(); - } - std::unique_ptr rejoin_done; - std::unique_ptr resolve_done; -public: - void rejoin_start(MDSInternalContext *rejoin_done_); - void rejoin_gather_finish(); - void rejoin_send_rejoins(); - void rejoin_export_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, - int target=-1) { - cap_exports[ino][client] = icr; - cap_export_targets[ino] = target; - } - void rejoin_recovered_caps(inodeno_t ino, client_t client, const cap_reconnect_t& icr, - mds_rank_t frommds=MDS_RANK_NONE) { - cap_imports[ino][client][frommds] = icr; - } - const cap_reconnect_t *get_replay_cap_reconnect(inodeno_t ino, client_t client) { - if (cap_imports.count(ino) && - cap_imports[ino].count(client) && - cap_imports[ino][client].count(MDS_RANK_NONE)) { - return &cap_imports[ino][client][MDS_RANK_NONE]; - } - return NULL; - } - void remove_replay_cap_reconnect(inodeno_t ino, client_t client) { - assert(cap_imports[ino].size() == 1); - assert(cap_imports[ino][client].size() == 1); - cap_imports.erase(ino); - } - void wait_replay_cap_reconnect(inodeno_t ino, MDSInternalContextBase *c) { - cap_reconnect_waiters[ino].push_back(c); - } - - // [reconnect/rejoin caps] - struct reconnected_cap_info_t { - inodeno_t realm_ino; - snapid_t snap_follows; - int dirty_caps; - reconnected_cap_info_t() : - realm_ino(0), snap_follows(0), dirty_caps(0) {} - }; - map > reconnected_caps; // inode -> client -> snap_follows,realmino - map > reconnected_snaprealms; // realmino -> client -> realmseq - - void add_reconnected_cap(client_t client, inodeno_t ino, const cap_reconnect_t& icr) { - reconnected_cap_info_t &info = reconnected_caps[ino][client]; - info.realm_ino = inodeno_t(icr.capinfo.snaprealm); - info.snap_follows = icr.snap_follows; - } - void set_reconnected_dirty_caps(client_t client, inodeno_t ino, int dirty) { - reconnected_cap_info_t &info = reconnected_caps[ino][client]; - info.dirty_caps |= dirty; - } - void add_reconnected_snaprealm(client_t client, inodeno_t ino, snapid_t seq) { - reconnected_snaprealms[ino][client] = seq; - } - - friend class C_MDC_RejoinOpenInoFinish; - friend class C_MDC_RejoinSessionsOpened; - void rejoin_open_ino_finish(inodeno_t ino, int ret); - void rejoin_open_sessions_finish(map client_map, - map& sseqmap); - bool process_imported_caps(); - void choose_lock_states_and_reconnect_caps(); - void prepare_realm_split(SnapRealm *realm, client_t client, inodeno_t ino, - map& splits); - void do_realm_invalidate_and_update_notify(CInode *in, int snapop, bool nosend=false); - void send_snaps(map& splits); - Capability* rejoin_import_cap(CInode *in, client_t client, const cap_reconnect_t& icr, mds_rank_t frommds); - void finish_snaprealm_reconnect(client_t client, SnapRealm *realm, snapid_t seq); - void try_reconnect_cap(CInode *in, Session *session); - void export_remaining_imported_caps(); - - // cap imports. delayed snap parent opens. - // realm inode -> client -> cap inodes needing to split to this realm - map > missing_snap_parents; - map > delayed_imported_caps; - - void do_cap_import(Session *session, CInode *in, Capability *cap, - uint64_t p_cap_id, ceph_seq_t p_seq, ceph_seq_t p_mseq, - int peer, int p_flags); - void do_delayed_cap_imports(); - void rebuild_need_snapflush(CInode *head_in, SnapRealm *realm, client_t client, - snapid_t snap_follows); - void check_realm_past_parents(SnapRealm *realm, bool reconnect); - void open_snap_parents(); - - bool open_undef_inodes_dirfrags(); - void opened_undef_inode(CInode *in); - void opened_undef_dirfrag(CDir *dir) { - rejoin_undef_dirfrags.erase(dir); - } - - void reissue_all_caps(); - - - friend class Locker; - friend class Migrator; - friend class MDBalancer; - - // StrayManager needs to be able to remove_inode() from us - // when it is done purging - friend class StrayManager; - - // File size recovery -private: - RecoveryQueue recovery_queue; - void identify_files_to_recover(); -public: - void start_files_to_recover(); - void do_file_recover(); - void queue_file_recover(CInode *in); - void _queued_file_recover_cow(CInode *in, MutationRef& mut); - - // subsystems - std::unique_ptr migrator; - - public: - explicit MDCache(MDSRank *m, PurgeQueue &purge_queue_); - ~MDCache(); - - // debug - void log_stat(); - - // root inode - CInode *get_root() { return root; } - CInode *get_myin() { return myin; } - - size_t get_cache_size() { return lru.lru_get_size(); } - - // trimming - bool trim(uint64_t count=0); -private: - void trim_lru(uint64_t count, map& expiremap); - bool trim_dentry(CDentry *dn, map& expiremap); - void trim_dirfrag(CDir *dir, CDir *con, - map& expiremap); - bool trim_inode(CDentry *dn, CInode *in, CDir *con, - map& expiremap); - void send_expire_messages(map& expiremap); - void trim_non_auth(); // trim out trimmable non-auth items -public: - bool trim_non_auth_subtree(CDir *directory); - void standby_trim_segment(LogSegment *ls); - void try_trim_non_auth_subtree(CDir *dir); - bool can_trim_non_auth_dirfrag(CDir *dir) { - return my_ambiguous_imports.count((dir)->dirfrag()) == 0 && - uncommitted_slave_rename_olddir.count(dir->inode) == 0; - } - - /** - * For all unreferenced inodes, dirs, dentries below an inode, compose - * expiry messages. This is used when giving up all replicas of entities - * for an MDS peer in the 'stopping' state, such that the peer can - * empty its cache and finish shutting down. - * - * We have to make sure we're only expiring un-referenced items to - * avoid interfering with ongoing stray-movement (we can't distinguish - * between the "moving my strays" and "waiting for my cache to empty" - * phases within 'stopping') - * - * @return false if we completed cleanly, true if caller should stop - * expiring because we hit something with refs. - */ - bool expire_recursive( - CInode *in, - std::map& expiremap); - - void trim_client_leases(); - void check_memory_usage(); - - utime_t last_recall_state; - - // shutdown -private: - set shutdown_exported_strays; -public: - void shutdown_start(); - void shutdown_check(); - bool shutdown_pass(); - bool shutdown_export_strays(); - bool shutdown(); // clear cache (ie at shutodwn) - - bool did_shutdown_log_cap; - - // inode_map - bool have_inode(vinodeno_t vino) { - return inode_map.count(vino) ? true:false; - } - bool have_inode(inodeno_t ino, snapid_t snap=CEPH_NOSNAP) { - return have_inode(vinodeno_t(ino, snap)); - } - CInode* get_inode(vinodeno_t vino) { - if (have_inode(vino)) - return inode_map[vino]; - return NULL; - } - CInode* get_inode(inodeno_t ino, snapid_t s=CEPH_NOSNAP) { - return get_inode(vinodeno_t(ino, s)); - } - - CDir* get_dirfrag(dirfrag_t df) { - CInode *in = get_inode(df.ino); - if (!in) - return NULL; - return in->get_dirfrag(df.frag); - } - CDir* get_dirfrag(inodeno_t ino, const string& dn) { - CInode *in = get_inode(ino); - if (!in) - return NULL; - frag_t fg = in->pick_dirfrag(dn); - return in->get_dirfrag(fg); - } - CDir* get_force_dirfrag(dirfrag_t df, bool replay) { - CInode *diri = get_inode(df.ino); - if (!diri) - return NULL; - CDir *dir = force_dir_fragment(diri, df.frag, replay); - if (!dir) - dir = diri->get_dirfrag(df.frag); - return dir; - } - - MDSCacheObject *get_object(MDSCacheObjectInfo &info); - - - - public: - void add_inode(CInode *in); - - void remove_inode(CInode *in); - protected: - void touch_inode(CInode *in) { - if (in->get_parent_dn()) - touch_dentry(in->get_projected_parent_dn()); - } -public: - void touch_dentry(CDentry *dn) { - if (dn->state_test(CDentry::STATE_BOTTOMLRU)) { - bottom_lru.lru_midtouch(dn); - } else { - if (dn->is_auth()) - lru.lru_touch(dn); - else - lru.lru_midtouch(dn); - } - } - void touch_dentry_bottom(CDentry *dn) { - if (dn->state_test(CDentry::STATE_BOTTOMLRU)) - return; - lru.lru_bottouch(dn); - } -protected: - - void inode_remove_replica(CInode *in, mds_rank_t rep, bool rejoin, - set& gather_locks); - void dentry_remove_replica(CDentry *dn, mds_rank_t rep, set& gather_locks); - - void rename_file(CDentry *srcdn, CDentry *destdn); - - public: - // truncate - void truncate_inode(CInode *in, LogSegment *ls); - void _truncate_inode(CInode *in, LogSegment *ls); - void truncate_inode_finish(CInode *in, LogSegment *ls); - void truncate_inode_logged(CInode *in, MutationRef& mut); - - void add_recovered_truncate(CInode *in, LogSegment *ls); - void remove_recovered_truncate(CInode *in, LogSegment *ls); - void start_recovered_truncates(); - - - public: - CDir *get_auth_container(CDir *in); - CDir *get_export_container(CDir *dir); - void find_nested_exports(CDir *dir, set& s); - void find_nested_exports_under(CDir *import, CDir *dir, set& s); - - -private: - bool opening_root, open; - list waiting_for_open; - -public: - void init_layouts(); - void create_unlinked_system_inode(CInode *in, inodeno_t ino, - int mode) const; - CInode *create_system_inode(inodeno_t ino, int mode); - CInode *create_root_inode(); - - void create_empty_hierarchy(MDSGather *gather); - void create_mydir_hierarchy(MDSGather *gather); - - bool is_open() { return open; } - void wait_for_open(MDSInternalContextBase *c) { - waiting_for_open.push_back(c); - } - - void open_root_inode(MDSInternalContextBase *c); - void open_root(); - void open_mydir_inode(MDSInternalContextBase *c); - void populate_mydir(); - - void _create_system_file(CDir *dir, const char *name, CInode *in, MDSInternalContextBase *fin); - void _create_system_file_finish(MutationRef& mut, CDentry *dn, - version_t dpv, MDSInternalContextBase *fin); - - void open_foreign_mdsdir(inodeno_t ino, MDSInternalContextBase *c); - CDir *get_stray_dir(CInode *in); - CDentry *get_or_create_stray_dentry(CInode *in); - - MDSInternalContextBase *_get_waiter(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin); - - /** - * Find the given dentry (and whether it exists or not), its ancestors, - * and get them all into memory and usable on this MDS. This function - * makes a best-effort attempt to load everything; if it needs to - * go away and do something then it will put the request on a waitlist. - * It prefers the mdr, then the req, then the fin. (At least one of these - * must be non-null.) - * - * At least one of the params mdr, req, and fin must be non-null. - * - * @param mdr The MDRequest associated with the path. Can be null. - * @param req The Message associated with the path. Can be null. - * @param fin The Context associated with the path. Can be null. - * @param path The path to traverse to. - * @param pdnvec Data return parameter -- on success, contains a - * vector of dentries. On failure, is either empty or contains the - * full trace of traversable dentries. - * @param pin Data return parameter -- if successful, points to the inode - * associated with filepath. If unsuccessful, is null. - * @param onfail Specifies different lookup failure behaviors. If set to - * MDS_TRAVERSE_DISCOVERXLOCK, path_traverse will succeed on null - * dentries (instead of returning -ENOENT). If set to - * MDS_TRAVERSE_FORWARD, it will forward the request to the auth - * MDS if that becomes appropriate (ie, if it doesn't know the contents - * of a directory). If set to MDS_TRAVERSE_DISCOVER, it - * will attempt to look up the path from a different MDS (and bring them - * into its cache as replicas). - * - * @returns 0 on success, 1 on "not done yet", 2 on "forwarding", -errno otherwise. - * If it returns 1, the requester associated with this call has been placed - * on the appropriate waitlist, and it should unwind itself and back out. - * If it returns 2 the request has been forwarded, and again the requester - * should unwind itself and back out. - */ - int path_traverse(MDRequestRef& mdr, Message *req, MDSInternalContextBase *fin, const filepath& path, - vector *pdnvec, CInode **pin, int onfail); - - CInode *cache_traverse(const filepath& path); - - void open_remote_dirfrag(CInode *diri, frag_t fg, MDSInternalContextBase *fin); - CInode *get_dentry_inode(CDentry *dn, MDRequestRef& mdr, bool projected=false); - - bool parallel_fetch(map& pathmap, set& missing); - bool parallel_fetch_traverse_dir(inodeno_t ino, filepath& path, - set& fetch_queue, set& missing, - C_GatherBuilder &gather_bld); - - void open_remote_dentry(CDentry *dn, bool projected, MDSInternalContextBase *fin, - bool want_xlocked=false); - void _open_remote_dentry_finish(CDentry *dn, inodeno_t ino, MDSInternalContextBase *fin, - bool want_xlocked, int r); - - void make_trace(vector& trace, CInode *in); - -protected: - struct open_ino_info_t { - vector ancestors; - set checked; - mds_rank_t checking; - mds_rank_t auth_hint; - bool check_peers; - bool fetch_backtrace; - bool discover; - bool want_replica; - bool want_xlocked; - version_t tid; - int64_t pool; - int last_err; - list waiters; - open_ino_info_t() : checking(MDS_RANK_NONE), auth_hint(MDS_RANK_NONE), - check_peers(true), fetch_backtrace(true), discover(false), - want_replica(false), want_xlocked(false), tid(0), pool(-1), - last_err(0) {} - }; - ceph_tid_t open_ino_last_tid; - map opening_inodes; - - void _open_ino_backtrace_fetched(inodeno_t ino, bufferlist& bl, int err); - void _open_ino_parent_opened(inodeno_t ino, int ret); - void _open_ino_traverse_dir(inodeno_t ino, open_ino_info_t& info, int err); - void _open_ino_fetch_dir(inodeno_t ino, MMDSOpenIno *m, CDir *dir, bool parent); - int open_ino_traverse_dir(inodeno_t ino, MMDSOpenIno *m, - vector& ancestors, - bool discover, bool want_xlocked, mds_rank_t *hint); - void open_ino_finish(inodeno_t ino, open_ino_info_t& info, int err); - void do_open_ino(inodeno_t ino, open_ino_info_t& info, int err); - void do_open_ino_peer(inodeno_t ino, open_ino_info_t& info); - void handle_open_ino(MMDSOpenIno *m, int err=0); - void handle_open_ino_reply(MMDSOpenInoReply *m); - friend class C_IO_MDC_OpenInoBacktraceFetched; - friend struct C_MDC_OpenInoTraverseDir; - friend struct C_MDC_OpenInoParentOpened; - -public: - void kick_open_ino_peers(mds_rank_t who); - void open_ino(inodeno_t ino, int64_t pool, MDSInternalContextBase *fin, - bool want_replica=true, bool want_xlocked=false); - - // -- find_ino_peer -- - struct find_ino_peer_info_t { - inodeno_t ino; - ceph_tid_t tid; - MDSInternalContextBase *fin; - mds_rank_t hint; - mds_rank_t checking; - set checked; - - find_ino_peer_info_t() : tid(0), fin(NULL), hint(MDS_RANK_NONE), checking(MDS_RANK_NONE) {} - }; - - map find_ino_peer; - ceph_tid_t find_ino_peer_last_tid; - - void find_ino_peers(inodeno_t ino, MDSInternalContextBase *c, mds_rank_t hint=MDS_RANK_NONE); - void _do_find_ino_peer(find_ino_peer_info_t& fip); - void handle_find_ino(MMDSFindIno *m); - void handle_find_ino_reply(MMDSFindInoReply *m); - void kick_find_ino_peers(mds_rank_t who); - - // -- snaprealms -- -public: - void snaprealm_create(MDRequestRef& mdr, CInode *in); - void _snaprealm_create_finish(MDRequestRef& mdr, MutationRef& mut, CInode *in); - - // -- stray -- -public: - void fetch_backtrace(inodeno_t ino, int64_t pool, bufferlist& bl, Context *fin); - uint64_t get_num_strays() const { return stray_manager.get_num_strays(); } - -protected: - void scan_stray_dir(dirfrag_t next=dirfrag_t()); - StrayManager stray_manager; - friend struct C_MDC_RetryScanStray; - friend class C_IO_MDC_FetchedBacktrace; - - // == messages == - public: - void dispatch(Message *m); - - protected: - // -- replicas -- - void handle_discover(MDiscover *dis); - void handle_discover_reply(MDiscoverReply *m); - friend class C_MDC_Join; - -public: - void replicate_dir(CDir *dir, mds_rank_t to, bufferlist& bl) { - dirfrag_t df = dir->dirfrag(); - ::encode(df, bl); - dir->encode_replica(to, bl); - } - void replicate_dentry(CDentry *dn, mds_rank_t to, bufferlist& bl) { - ::encode(dn->name, bl); - ::encode(dn->last, bl); - dn->encode_replica(to, bl); - } - void replicate_inode(CInode *in, mds_rank_t to, bufferlist& bl, - uint64_t features) { - ::encode(in->inode.ino, bl); // bleh, minor assymetry here - ::encode(in->last, bl); - in->encode_replica(to, bl, features); - } - - CDir* add_replica_dir(bufferlist::iterator& p, CInode *diri, mds_rank_t from, list& finished); - CDentry *add_replica_dentry(bufferlist::iterator& p, CDir *dir, list& finished); - CInode *add_replica_inode(bufferlist::iterator& p, CDentry *dn, list& finished); - - void replicate_stray(CDentry *straydn, mds_rank_t who, bufferlist& bl); - CDentry *add_replica_stray(bufferlist &bl, mds_rank_t from); - - // -- namespace -- -public: - void send_dentry_link(CDentry *dn, MDRequestRef& mdr); - void send_dentry_unlink(CDentry *dn, CDentry *straydn, MDRequestRef& mdr); -protected: - void handle_dentry_link(MDentryLink *m); - void handle_dentry_unlink(MDentryUnlink *m); - - - // -- fragmenting -- -private: - struct ufragment { - int bits; - bool committed; - LogSegment *ls; - list waiters; - list old_frags; - bufferlist rollback; - ufragment() : bits(0), committed(false), ls(NULL) {} - }; - map uncommitted_fragments; - - struct fragment_info_t { - int bits; - list dirs; - list resultfrags; - MDRequestRef mdr; - // for deadlock detection - bool all_frozen; - utime_t last_cum_auth_pins_change; - int last_cum_auth_pins; - int num_remote_waiters; // number of remote authpin waiters - fragment_info_t() : bits(0), all_frozen(false), last_cum_auth_pins(0), num_remote_waiters(0) {} - bool is_fragmenting() { return !resultfrags.empty(); } - }; - map fragments; - - void adjust_dir_fragments(CInode *diri, frag_t basefrag, int bits, - list& frags, list& waiters, bool replay); - void adjust_dir_fragments(CInode *diri, - list& srcfrags, - frag_t basefrag, int bits, - list& resultfrags, - list& waiters, - bool replay); - CDir *force_dir_fragment(CInode *diri, frag_t fg, bool replay=true); - void get_force_dirfrag_bound_set(vector& dfs, set& bounds); - - bool can_fragment(CInode *diri, list& dirs); - void fragment_freeze_dirs(list& dirs); - void fragment_mark_and_complete(MDRequestRef& mdr); - void fragment_frozen(MDRequestRef& mdr, int r); - void fragment_unmark_unfreeze_dirs(list& dirs); - void dispatch_fragment_dir(MDRequestRef& mdr); - void _fragment_logged(MDRequestRef& mdr); - void _fragment_stored(MDRequestRef& mdr); - void _fragment_committed(dirfrag_t f, list& resultfrags); - void _fragment_finish(dirfrag_t f, list& resultfrags); - - friend class EFragment; - friend class C_MDC_FragmentFrozen; - friend class C_MDC_FragmentMarking; - friend class C_MDC_FragmentPrep; - friend class C_MDC_FragmentStore; - friend class C_MDC_FragmentCommit; - friend class C_IO_MDC_FragmentFinish; - - void handle_fragment_notify(MMDSFragmentNotify *m); - - void add_uncommitted_fragment(dirfrag_t basedirfrag, int bits, list& old_frag, - LogSegment *ls, bufferlist *rollback=NULL); - void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op); - void rollback_uncommitted_fragment(dirfrag_t basedirfrag, list& old_frags); -public: - void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSInternalContextBase *c) { - assert(uncommitted_fragments.count(dirfrag)); - uncommitted_fragments[dirfrag].waiters.push_back(c); - } - void split_dir(CDir *dir, int byn); - void merge_dir(CInode *diri, frag_t fg); - void rollback_uncommitted_fragments(); - - void find_stale_fragment_freeze(); - void fragment_freeze_inc_num_waiters(CDir *dir); - bool fragment_are_all_frozen(CDir *dir); - int get_num_fragmenting_dirs() { return fragments.size(); } - - // -- updates -- - //int send_inode_updates(CInode *in); - //void handle_inode_update(MInodeUpdate *m); - - int send_dir_updates(CDir *in, bool bcast=false); - void handle_dir_update(MDirUpdate *m); - - // -- cache expiration -- - void handle_cache_expire(MCacheExpire *m); - void process_delayed_expire(CDir *dir); - void discard_delayed_expire(CDir *dir); - -protected: - int dump_cache(const char *fn, Formatter *f, - const std::string& dump_root = "", - int depth = -1); -public: - int dump_cache() { return dump_cache(NULL, NULL); } - int dump_cache(const std::string &filename); - int dump_cache(Formatter *f); - int dump_cache(const std::string& dump_root, int depth, Formatter *f); - - int cache_status(Formatter *f); - - void dump_resolve_status(Formatter *f) const; - void dump_rejoin_status(Formatter *f) const; - - // == crap fns == - public: - void show_cache(); - void show_subtrees(int dbl=10); - - CInode *hack_pick_random_inode() { - assert(!inode_map.empty()); - int n = rand() % inode_map.size(); - ceph::unordered_map::iterator p = inode_map.begin(); - while (n--) ++p; - return p->second; - } - -protected: - void flush_dentry_work(MDRequestRef& mdr); - /** - * Resolve path to a dentry and pass it onto the ScrubStack. - * - * TODO: return enough information to the original mdr formatter - * and completion that they can subsequeuntly check the progress of - * this scrub (we won't block them on a whole scrub as it can take a very - * long time) - */ - void enqueue_scrub_work(MDRequestRef& mdr); - void repair_inode_stats_work(MDRequestRef& mdr); - void repair_dirfrag_stats_work(MDRequestRef& mdr); - friend class C_MDC_RepairDirfragStats; -public: - void flush_dentry(const string& path, Context *fin); - /** - * Create and start an OP_ENQUEUE_SCRUB - */ - void enqueue_scrub(const string& path, const std::string &tag, - bool force, bool recursive, bool repair, - Formatter *f, Context *fin); - void repair_inode_stats(CInode *diri); - void repair_dirfrag_stats(CDir *dir); - -public: - /* Because exports may fail, this set lets us keep track of inodes that need exporting. */ - std::set export_pin_queue; -}; - -class C_MDS_RetryRequest : public MDSInternalContext { - MDCache *cache; - MDRequestRef mdr; - public: - C_MDS_RetryRequest(MDCache *c, MDRequestRef& r); - void finish(int r) override; -}; - -#endif