1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
15 #ifndef CEPH_MDS_EMETABLOB_H
16 #define CEPH_MDS_EMETABLOB_H
20 #include "../CInode.h"
22 #include "../CDentry.h"
23 #include "../LogSegment.h"
25 #include "include/interval_set.h"
33 * a bunch of metadata in the journal
38 * - make sure you adjust the inode.version for any modified inode you
39 * journal. CDir and CDentry maintain a projected_version, but CInode
40 * doesn't, since the journaled inode usually has to be modified
41 * manually anyway (to delay the change in the MDS's cache until after
50 /* fullbit - a regular dentry + inode
52 * We encode this one a bit weirdly, just because (also, it's marginally faster
53 * on multiple encodes, which I think can happen):
54 * Encode a bufferlist on struct creation with all data members, without a struct_v.
55 * When encode is called, encode struct_v and then append the bufferlist.
56 * Decode straight into the appropriate variables.
58 * So, if you add members, encode them in the constructor and then change
59 * the struct_v in the encode function!
// Bit flags stored in the fullbit `state` mask. Each one is tested by the
// matching accessor declared further down: is_dirty(), is_dirty_parent(),
// is_dirty_pool() and need_snapflush().
62 static const int STATE_DIRTY = (1<<0);
63 static const int STATE_DIRTYPARENT = (1<<1);
64 static const int STATE_DIRTYPOOL = (1<<2);
65 static const int STATE_NEED_SNAPFLUSH = (1<<3);
// Container type used for the old_inodes member, keyed by snapid_t.
66 typedef compact_map<snapid_t, old_inode_t> old_inodes_t;
// Dentry snapshot bounds; print() renders them as "[dnfirst,dnlast]".
68 snapid_t dnfirst, dnlast;
// NOTE(review): the trailing comment on the next line appears truncated.
70 inode_t inode; // if it's not
71 fragtree_t dirfragtree;
// Extended attributes, keyed by attribute name.
72 map<string,bufferptr> xattrs;
77 old_inodes_t old_inodes;
// Copy constructor and copy assignment are declared here; no definition is
// visible in this view.
// NOTE(review): presumably declared-but-undefined to forbid copying fullbits
// (they are held through ceph::shared_ptr elsewhere in this file) - confirm.
79 fullbit(const fullbit& o);
80 const fullbit& operator=(const fullbit& o);
// Construct a fullbit from explicit values.
// The visible initializer list copies d, df, dl, v, i, xa, os and st into
// dn, dnfirst, dnlast, dnv, inode, xattrs, oldest_snap and state.
// oi is optional (defaults to NULL).
// NOTE(review): dft, sym, sbl and oi are not consumed by the initializer list
// shown here; the constructor body is not visible in this view and presumably
// handles them - confirm.
82 fullbit(const string& d, snapid_t df, snapid_t dl,
83 version_t v, const inode_t& i, const fragtree_t &dft,
84 const map<string,bufferptr> &xa, const string& sym,
85 snapid_t os, const bufferlist &sbl, __u8 st,
86 const old_inodes_t *oi = NULL) :
87 dn(d), dnfirst(df), dnlast(dl), dnv(v), inode(i), xattrs(xa),
88 oldest_snap(os), state(st)
// Constructor taking a bufferlist iterator; marked explicit so an iterator is
// never implicitly converted into a fullbit.
// NOTE(review): the body is not visible here - presumably it calls decode(p)
// as the remotebit/nullbit iterator constructors do; confirm.
98 explicit fullbit(bufferlist::iterator &p) {
// Serialization / debugging interface of fullbit. These are declarations
// only; no definitions are visible in this view.
// encode() takes a feature mask in addition to the destination bufferlist.
104 void encode(bufferlist& bl, uint64_t features) const;
105 void decode(bufferlist::iterator &bl);
106 void dump(Formatter *f) const;
107 static void generate_test_instances(list<EMetaBlob::fullbit*>& ls);
// NOTE(review): presumably applies this journaled record to the in-memory
// inode `in` - inferred from the name only; confirm against the definition.
109 void update_inode(MDSRank *mds, CInode *in);
110 bool is_dirty() const { return (state & STATE_DIRTY); }
111 bool is_dirty_parent() const { return (state & STATE_DIRTYPARENT); }
112 bool is_dirty_pool() const { return (state & STATE_DIRTYPOOL); }
113 bool need_snapflush() const { return (state & STATE_NEED_SNAPFLUSH); }
// Write a one-line description of this fullbit to `out`: dentry name,
// "[dnfirst,dnlast]", dentry version, inode number and the raw state mask,
// terminated with std::endl (which also flushes the stream).
115 void print(ostream& out) const {
116 out << " fullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
117 << " inode " << inode.ino
118 << " state=" << state << std::endl;
// Build a textual form of the state flags, joining the flag names with '+'.
// marked_already records whether an earlier name has been appended, so that
// later names know whether they need a leading '+'.
// NOTE(review): several lines of this function (including some guarding
// conditions and the return) are not visible in this view.
120 string state_string() const {
122 bool marked_already = false;
124 state_string.append("dirty");
125 marked_already = true;
127 if (is_dirty_parent()) {
128 state_string.append(marked_already ? "+dirty_parent" : "dirty_parent");
// "+dirty_pool" is always appended with a leading '+'; the condition guarding
// this append is on a line not visible here.
130 state_string.append("+dirty_pool");
135 WRITE_CLASS_ENCODER_FEATURES(fullbit)
137 /* remotebit - a dentry + remote inode link (i.e. just an ino)
141 snapid_t dnfirst, dnlast;
144 unsigned char d_type;
147 remotebit(const string& d, snapid_t df, snapid_t dl, version_t v, inodeno_t i, unsigned char dt, bool dr) :
148 dn(d), dnfirst(df), dnlast(dl), dnv(v), ino(i), d_type(dt), dirty(dr) { }
149 explicit remotebit(bufferlist::iterator &p) { decode(p); }
150 remotebit(): dnfirst(0), dnlast(0), dnv(0), ino(0),
151 d_type('\0'), dirty(false) {}
// Serialization interface of remotebit (declarations only; no definitions are
// visible in this view). Unlike fullbit::encode, no feature mask is taken.
153 void encode(bufferlist& bl) const;
154 void decode(bufferlist::iterator &bl);
// Write a one-line description of this remotebit to `out`, starting with the
// dentry name, "[dnfirst,dnlast]" and dentry version and ending with the
// dirty flag and std::endl.
// NOTE(review): one line of the output chain is not visible in this view.
155 void print(ostream& out) const {
156 out << " remotebit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
158 << " dirty=" << dirty << std::endl;
// Formatter dump and test-instance factory for remotebit (declarations only;
// no definitions are visible in this view).
160 void dump(Formatter *f) const;
161 static void generate_test_instances(list<remotebit*>& ls);
163 WRITE_CLASS_ENCODER(remotebit)
166 * nullbit - a null dentry
170 snapid_t dnfirst, dnlast;
174 nullbit(const string& d, snapid_t df, snapid_t dl, version_t v, bool dr) :
175 dn(d), dnfirst(df), dnlast(dl), dnv(v), dirty(dr) { }
176 explicit nullbit(bufferlist::iterator &p) { decode(p); }
177 nullbit(): dnfirst(0), dnlast(0), dnv(0), dirty(false) {}
// Serialization, Formatter dump and test-instance factory for nullbit
// (declarations only; no definitions are visible in this view).
179 void encode(bufferlist& bl) const;
180 void decode(bufferlist::iterator &bl);
181 void dump(Formatter *f) const;
182 static void generate_test_instances(list<nullbit*>& ls);
183 void print(ostream& out) {
184 out << " nullbit dn " << dn << " [" << dnfirst << "," << dnlast << "] dnv " << dnv
185 << " dirty=" << dirty << std::endl;
188 WRITE_CLASS_ENCODER(nullbit)
191 /* dirlump - contains metadata for any dir we have contents for.
// Bit flags stored in the dirlump `state` mask; each has an is_*/mark_* pair
// further down. Note that the numbering starts at bit 1 - bit 0 is not
// assigned by any constant visible here.
195 static const int STATE_COMPLETE = (1<<1);
196 static const int STATE_DIRTY = (1<<2); // dirty due to THIS journal item, that is!
197 static const int STATE_NEW = (1<<3); // new directory
198 static const int STATE_IMPORTING = (1<<4); // importing directory
199 static const int STATE_DIRTYDFT = (1<<5); // dirty dirfragtree
// Counts printed by print() as "num nfull/nremote/nnull".
204 __u32 nfull, nremote, nnull;
// The members below are mutable because the const helpers _encode_bits() and
// _decode_bits() read and write them.
// dnbl: buffer that _encode_bits() encodes the three lists into and that
// _decode_bits() reads from.
207 mutable bufferlist dnbl;
// dn_decoded: _encode_bits() does nothing when this is false and
// _decode_bits() does nothing when it is true; the default constructor sets
// it to true.
208 mutable bool dn_decoded;
209 mutable list<ceph::shared_ptr<fullbit> > dfull;
210 mutable list<remotebit> dremote;
211 mutable list<nullbit> dnull;
214 dirlump() : state(0), nfull(0), nremote(0), nnull(0), dn_decoded(true) { }
216 bool is_complete() const { return state & STATE_COMPLETE; }
217 void mark_complete() { state |= STATE_COMPLETE; }
218 bool is_dirty() const { return state & STATE_DIRTY; }
219 void mark_dirty() { state |= STATE_DIRTY; }
220 bool is_new() const { return state & STATE_NEW; }
221 void mark_new() { state |= STATE_NEW; }
222 bool is_importing() { return state & STATE_IMPORTING; }
223 void mark_importing() { state |= STATE_IMPORTING; }
224 bool is_dirty_dft() { return state & STATE_DIRTYDFT; }
225 void mark_dirty_dft() { state |= STATE_DIRTYDFT; }
227 const list<ceph::shared_ptr<fullbit> > &get_dfull() const { return dfull; }
228 const list<remotebit> &get_dremote() const { return dremote; }
229 const list<nullbit> &get_dnull() const { return dnull; }
231 void add_dnull(nullbit const &n) { dnull.push_back(n); };
232 void add_dfull(ceph::shared_ptr<fullbit> const &p) { dfull.push_back(p); };
233 void add_dremote(remotebit const &r) { dremote.push_back(r); };
// Print this dirlump to `out`: a header with the dirfrag, fnode version, raw
// state mask and the nfull/nremote/nnull counts, followed by loops over the
// full, remote and null entry lists.
// NOTE(review): the loop bodies and a few other lines are not visible in this
// view; presumably each loop calls print(out) on the entry - confirm.
235 void print(dirfrag_t dirfrag, ostream& out) {
236 out << "dirlump " << dirfrag << " v " << fnode.version
237 << " state " << state
238 << " num " << nfull << "/" << nremote << "/" << nnull
241 for (list<ceph::shared_ptr<fullbit> >::iterator p = dfull.begin(); p != dfull.end(); ++p)
243 for (list<remotebit>::iterator p = dremote.begin(); p != dremote.end(); ++p)
245 for (list<nullbit>::iterator p = dnull.begin(); p != dnull.end(); ++p)
// Build a textual form of the state flags by appending "complete", "dirty"
// and "new", joined with '+'. marked_already tracks whether an earlier name
// was appended, so later names know whether to add the leading '+'.
// NOTE(review): the conditions guarding each append and the return statement
// are on lines not visible in this view.
249 string state_string() const {
251 bool marked_already = false;
253 state_string.append("complete");
254 marked_already = true;
257 state_string.append(marked_already ? "+dirty" : "dirty");
258 marked_already = true;
261 state_string.append(marked_already ? "+new" : "new");
266 // if this changes, update the versioning in encode for it!
// Encode the three entry lists into dnbl, in the order dfull, dremote, dnull.
// Only dfull is encoded with the feature mask. Does nothing when dn_decoded
// is false. Declared const; it can still write dnbl because dnbl is mutable.
267 void _encode_bits(uint64_t features) const {
268 if (!dn_decoded) return;
269 ::encode(dfull, dnbl, features);
270 ::encode(dremote, dnbl);
271 ::encode(dnull, dnbl);
// Decode entry lists from dnbl, reading from the start of the buffer.
// Does nothing when dn_decoded is already true.
// NOTE(review): only the dremote decode is visible here; presumably dfull and
// dnull are decoded and dn_decoded is set on lines outside this view, in the
// same order _encode_bits() writes them - confirm.
273 void _decode_bits() const {
274 if (dn_decoded) return;
275 bufferlist::iterator p = dnbl.begin();
277 ::decode(dremote, p);
// Serialization, Formatter dump and test-instance factory for dirlump
// (declarations only; no definitions are visible in this view).
// encode() takes a feature mask, like fullbit::encode().
282 void encode(bufferlist& bl, uint64_t features) const;
283 void decode(bufferlist::iterator &bl);
284 void dump(Formatter *f) const;
285 static void generate_test_instances(list<dirlump*>& ls);
287 WRITE_CLASS_ENCODER_FEATURES(dirlump)
289 // my lumps. preserve the order we added them in a list.
290 list<dirfrag_t> lump_order;
291 map<dirfrag_t, dirlump> lump_map;
292 list<ceph::shared_ptr<fullbit> > roots;
294 list<pair<__u8,version_t> > table_tids; // tableclient transactions
296 inodeno_t opened_ino;
298 inodeno_t renamed_dirino;
299 list<frag_t> renamed_dir_frags;
302 // ino (pre)allocation. may involve both inotable AND session state.
303 version_t inotablev, sessionmapv;
304 inodeno_t allocated_ino; // inotable
305 interval_set<inodeno_t> preallocated_inos; // inotable + session
306 inodeno_t used_preallocated_ino; // session
307 entity_name_t client_name; // session
309 // inodes i've truncated
310 list<inodeno_t> truncate_start; // start truncate
311 map<inodeno_t, log_segment_seq_t> truncate_finish; // finished truncate (started in segment blah)
314 vector<inodeno_t> destroyed_inodes;
318 list<pair<metareqid_t,uint64_t> > client_reqs;
319 list<pair<metareqid_t,uint64_t> > client_flushes;
// Serialization of the whole blob plus three query helpers that fill a
// caller-supplied container (inode numbers, paths, dentry names per dirfrag).
// Declarations only; no definitions are visible in this view.
322 void encode(bufferlist& bl, uint64_t features) const;
323 void decode(bufferlist::iterator& bl);
324 void get_inodes(std::set<inodeno_t> &inodes) const;
325 void get_paths(std::vector<std::string> &paths) const;
326 void get_dentries(std::map<dirfrag_t, std::set<std::string> > &dentries) const;
327 entity_name_t get_client_name() const {return client_name;}
329 void dump(Formatter *f) const;
330 static void generate_test_instances(list<EMetaBlob*>& ls);
332 uint64_t last_subtree_map;
335 // for replay, in certain cases
336 //LogSegment *_segment;
338 explicit EMetaBlob(MDLog *mdl = 0); // defined in journal.cc
// Print every dirlump to `out`, walking lump_order so the output follows the
// order in which the lumps were added. Non-const: lump_map is indexed with
// operator[].
341 void print(ostream& out) {
342 for (list<dirfrag_t>::iterator p = lump_order.begin();
343 p != lump_order.end();
345 lump_map[*p].print(*p, out);
// Append the pair (r, tid) to client_reqs; tid defaults to 0.
349 void add_client_req(metareqid_t r, uint64_t tid=0) {
350 client_reqs.push_back(pair<metareqid_t,uint64_t>(r, tid));
// Append the pair (r, tid) to client_flushes; tid defaults to 0.
352 void add_client_flush(metareqid_t r, uint64_t tid=0) {
353 client_flushes.push_back(pair<metareqid_t,uint64_t>(r, tid));
// Append the pair (table, tid) to table_tids.
// Note: `table` is an int but the stored pair holds a __u8, so the value is
// narrowed when the pair is constructed.
356 void add_table_transaction(int table, version_t tid) {
357 table_tids.push_back(pair<__u8, version_t>(table, tid));
// NOTE(review): the body is not visible in this view; presumably it records
// `ino` in the opened_ino member - confirm.
360 void add_opened_ino(inodeno_t ino) {
// Record inode-allocation details on this blob: copies alloc, used_prealloc,
// prealloc and client into allocated_ino, used_preallocated_ino,
// preallocated_inos and client_name.
// NOTE(review): sv and iv are not used on the visible lines; presumably they
// are stored in sessionmapv / inotablev on lines outside this view - confirm.
// NOTE(review): prealloc is taken by non-const reference but is only copied
// from on the visible lines.
365 void set_ino_alloc(inodeno_t alloc,
366 inodeno_t used_prealloc,
367 interval_set<inodeno_t>& prealloc,
368 entity_name_t client,
369 version_t sv, version_t iv) {
370 allocated_ino = alloc;
371 used_preallocated_ino = used_prealloc;
372 preallocated_inos = prealloc;
373 client_name = client;
// Append `ino` to the truncate_start list.
378 void add_truncate_start(inodeno_t ino) {
379 truncate_start.push_back(ino);
// Set truncate_finish[ino] to segoff, inserting the entry or overwriting an
// existing one for the same ino.
381 void add_truncate_finish(inodeno_t ino, uint64_t segoff) {
382 truncate_finish[ino] = segoff;
// Declaration only; no definition is visible in this view.
// NOTE(review): presumably remaps the values stored in truncate_finish using
// old_to_new and reports whether anything changed - inferred from the
// signature only; confirm against the definition.
385 bool rewrite_truncate_finish(MDSRank const *mds, std::map<uint64_t, uint64_t> const &old_to_new);
// Append `ino` to the destroyed_inodes vector.
387 void add_destroyed_inode(inodeno_t ino) {
388 destroyed_inodes.push_back(ino);
// Convenience overload: obtain the dirlump for dn's directory through
// add_dir(dir, /*dirty=*/false) and forward to the dirlump overload.
391 void add_null_dentry(CDentry *dn, bool dirty) {
392 add_null_dentry(add_dir(dn->get_dir(), false), dn, dirty);
// Add a nullbit for `dn` to `lump`, built from the dentry's name and
// projected version among other values.
// NOTE(review): several lines, including the remaining nullbit constructor
// arguments, are not visible in this view.
394 void add_null_dentry(dirlump& lump, CDentry *dn, bool dirty) {
397 lump.add_dnull(nullbit(dn->get_name(),
399 dn->get_projected_version(),
// Convenience overload: obtain the dirlump for dn's directory through
// add_dir(dir, /*dirty=*/false) and forward to the dirlump overload with
// rino = 0 and rdt = 0.
403 void add_remote_dentry(CDentry *dn, bool dirty) {
404 add_remote_dentry(add_dir(dn->get_dir(), false), dn, dirty, 0, 0);
// Convenience overload with an explicit remote ino and d_type: obtain the
// dirlump for dn's directory through add_dir(dir, /*dirty=*/false) and
// forward to the dirlump overload.
// Note: rdt is an int here but the dirlump overload takes unsigned char, so
// the value is narrowed at the call.
406 void add_remote_dentry(CDentry *dn, bool dirty, inodeno_t rino, int rdt) {
407 add_remote_dentry(add_dir(dn->get_dir(), false), dn, dirty, rino, rdt);
// Add a remotebit for `dn` to `lump`. rino and rdt default to 0; the visible
// lines can replace them with the remote ino and d_type taken from the
// dentry's projected linkage.
// NOTE(review): the condition guarding that replacement (presumably
// "rino == 0") and the remaining remotebit constructor arguments are on lines
// not visible in this view - confirm.
409 void add_remote_dentry(dirlump& lump, CDentry *dn, bool dirty,
410 inodeno_t rino=0, unsigned char rdt=0) {
412 rino = dn->get_projected_linkage()->get_remote_ino();
413 rdt = dn->get_projected_linkage()->get_remote_d_type();
416 lump.add_dremote(remotebit(dn->get_name(),
418 dn->get_projected_version(),
423 // journal a primary dentry together with its to-be-journaled inode (returns nothing)
// Convenience overload: translate the four boolean flags into the
// corresponding fullbit::STATE_* bits, obtain the dirlump for dn's directory
// through add_dir(dir, /*dirty=*/false), and forward to the dirlump overload.
// dirty_parent, dirty_pool and need_snapflush default to false.
// NOTE(review): the declaration of the local `state` is on a line not visible
// in this view.
424 void add_primary_dentry(CDentry *dn, CInode *in, bool dirty,
425 bool dirty_parent=false, bool dirty_pool=false,
426 bool need_snapflush=false) {
428 if (dirty) state |= fullbit::STATE_DIRTY;
429 if (dirty_parent) state |= fullbit::STATE_DIRTYPARENT;
430 if (dirty_pool) state |= fullbit::STATE_DIRTYPOOL;
431 if (need_snapflush) state |= fullbit::STATE_NEED_SNAPFLUSH;
432 add_primary_dentry(add_dir(dn->get_dir(), false), dn, in, state);
// Add a fullbit for the primary dentry `dn` and inode `in` to `lump`.
// `state` is a mask of fullbit::STATE_* bits.
// NOTE(review): several lines are not visible in this view, including the
// condition under which `in` is replaced below (presumably "if (!in)"), the
// use of `sr`, and the declaration/filling of `snapbl` - confirm.
434 void add_primary_dentry(dirlump& lump, CDentry *dn, CInode *in, __u8 state) {
// Take the inode from the dentry's projected linkage.
436 in = dn->get_projected_linkage()->get_inode();
438 // make note of where this inode was last journaled
439 in->last_journaled = event_seq;
440 //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
442 const inode_t *pi = in->get_projected_inode();
// A dirty inode whose projected inode reports an updated backtrace is also
// flagged STATE_DIRTYPARENT.
443 if ((state & fullbit::STATE_DIRTY) && pi->is_backtrace_updated())
444 state |= fullbit::STATE_DIRTYPARENT;
447 const sr_t *sr = in->get_projected_srnode();
// The fullbit is built from projected values (dentry version, inode, xattrs)
// and handed to the lump through a shared_ptr.
452 lump.add_dfull(ceph::shared_ptr<fullbit>(new fullbit(dn->get_name(),
454 dn->get_projected_version(),
455 *pi, in->dirfragtree,
456 *in->get_projected_xattrs(),
458 in->oldest_snap, snapbl,
459 state, &in->old_inodes)));
462 // convenience: primary or remote? figure it out.
// Add `dn` without the caller having to know its linkage kind: obtain the
// dirlump via add_dir(dir, /*dirty=*/false) and forward to the dirlump
// overload with dirty_parent and dirty_pool both false.
463 void add_dentry(CDentry *dn, bool dirty) {
464 dirlump& lump = add_dir(dn->get_dir(), false);
465 add_dentry(lump, dn, dirty, false, false);
// Add `dn` using its current (non-projected) dirty state: for a primary
// linkage the dirty_parent / dirty_pool flags are read from the linked inode,
// otherwise both stay false; the dentry's own is_dirty() supplies `dirty`.
// The dirlump comes from add_dir(dir, /*dirty=*/false).
467 void add_import_dentry(CDentry *dn) {
468 bool dirty_parent = false;
469 bool dirty_pool = false;
470 if (dn->get_linkage()->is_primary()) {
471 dirty_parent = dn->get_linkage()->get_inode()->is_dirty_parent();
472 dirty_pool = dn->get_linkage()->get_inode()->is_dirty_pool();
474 dirlump& lump = add_dir(dn->get_dir(), false);
475 add_dentry(lump, dn, dn->is_dirty(), dirty_parent, dirty_pool);
// Dispatch on the dentry's projected linkage: remote -> add_remote_dentry,
// null -> add_null_dentry, otherwise it must be primary (asserted) ->
// add_primary_dentry with a null inode pointer.
// NOTE(review): `lump` is not used on any visible line; each call below uses
// an overload that looks the dirlump up again via add_dir().
// NOTE(review): some lines are not visible in this view, presumably the
// early returns after the remote/null cases.
477 void add_dentry(dirlump& lump, CDentry *dn, bool dirty, bool dirty_parent, bool dirty_pool) {
479 if (dn->get_projected_linkage()->is_remote()) {
480 add_remote_dentry(dn, dirty);
482 } else if (dn->get_projected_linkage()->is_null()) {
483 add_null_dentry(dn, dirty);
486 assert(dn->get_projected_linkage()->is_primary());
487 add_primary_dentry(dn, 0, dirty, dirty_parent, dirty_pool);
// Record inode `in` in the `roots` list as a fullbit.
// pi, pdft and px are optional overrides; when null they fall back to the
// inode's projected inode, its dirfragtree and its projected xattrs.
// NOTE(review): several lines are not visible in this view, including the
// handling of psnapbl, the declaration of `snapbl` and `empty`, what is done
// when an existing root with the same ino is found, and the last fullbit
// constructor argument.
490 void add_root(bool dirty, CInode *in, const inode_t *pi=0, fragtree_t *pdft=0, bufferlist *psnapbl=0,
491 map<string,bufferptr> *px=0) {
// Remember the event sequence at which this inode was last journaled.
492 in->last_journaled = event_seq;
493 //cout << "journaling " << in->inode.ino << " at " << my_offset << std::endl;
495 if (!pi) pi = in->get_projected_inode();
496 if (!pdft) pdft = &in->dirfragtree;
497 if (!px) px = in->get_projected_xattrs();
503 in->encode_snap_blob(snapbl);
// Look for an existing root entry carrying the same inode number.
505 for (list<ceph::shared_ptr<fullbit> >::iterator p = roots.begin(); p != roots.end(); ++p) {
506 if ((*p)->inode.ino == in->ino()) {
// The new root has dentry version 0 and a state of either STATE_DIRTY or 0.
513 roots.push_back(ceph::shared_ptr<fullbit>(new fullbit(empty, in->first, in->last, 0, *pi,
514 *pdft, *px, in->symlink,
515 in->oldest_snap, snapbl,
516 dirty ? fullbit::STATE_DIRTY : 0,
// Convenience overload taking a CDir: forwards the directory's dirfrag,
// projected fnode and projected version to the dirfrag_t overload.
// `complete` defaults to false.
// NOTE(review): the remaining call arguments are on a line not visible in
// this view (presumably dirty and complete).
520 dirlump& add_dir(CDir *dir, bool dirty, bool complete=false) {
521 return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
// Add (or update) the dirlump for a newly created directory: forwards to the
// dirfrag_t overload of add_dir with dirty, complete and isnew all true.
524 dirlump& add_new_dir(CDir *dir) {
525 return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
526 true, true, true); // dirty AND complete AND new
// Add (or update) the dirlump for an imported directory: dirty, complete and
// dirty_dft are taken from the CDir's current state, isnew is false and
// importing is true.
528 dirlump& add_import_dir(CDir *dir) {
529 // dirty=false would be okay in some cases
530 return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
531 dir->is_dirty(), dir->is_complete(), false, true, dir->is_dirty_dft());
// Add (or update) the dirlump for a fragmented directory: passes the caller's
// dirty and dirtydft flags, with complete, isnew and importing all false.
533 dirlump& add_fragmented_dir(CDir *dir, bool dirty, bool dirtydft) {
534 return add_dir(dir->dirfrag(), dir->get_projected_fnode(), dir->get_projected_version(),
535 dirty, false, false, false, dirtydft);
// Core add_dir: find or create the dirlump for dirfrag `df` and update it.
// A dirfrag seen for the first time is appended to lump_order, so that list
// preserves insertion order. The lump's fnode.version is set to pv and each
// true flag sets the matching state bit; flags are only ever set here, never
// cleared.
// NOTE(review): some lines are not visible in this view - presumably the copy
// of *pf into l.fnode and the final "return l" - confirm.
537 dirlump& add_dir(dirfrag_t df, const fnode_t *pf, version_t pv, bool dirty,
538 bool complete=false, bool isnew=false,
539 bool importing=false, bool dirty_dft=false) {
540 if (lump_map.count(df) == 0)
541 lump_order.push_back(df);
// operator[] default-constructs the dirlump when df is not yet present.
543 dirlump& l = lump_map[df];
545 l.fnode.version = pv;
546 if (complete) l.mark_complete();
547 if (dirty) l.mark_dirty();
548 if (isnew) l.mark_new();
549 if (importing) l.mark_importing();
550 if (dirty_dft) l.mark_dirty_dft();
// Mode values accepted by add_dir_context(); TO_AUTH_SUBTREE_ROOT is the
// default argument.
554 static const int TO_AUTH_SUBTREE_ROOT = 0; // default.
555 static const int TO_ROOT = 1;
// Declaration only; no definition is visible in this view.
557 void add_dir_context(CDir *dir, int mode = TO_AUTH_SUBTREE_ROOT);
// Emptiness test (the enclosing function signature is on a line not visible
// in this view). True only when roots, lump_order, table_tids,
// truncate_start, truncate_finish, destroyed_inodes and client_reqs are all
// empty and opened_ino, inotablev and sessionmapv are all zero.
// NOTE(review): client_flushes, allocated_ino, preallocated_inos,
// used_preallocated_ino and renamed_dirino are not consulted here - confirm
// that is intended.
560 return roots.empty() && lump_order.empty() && table_tids.empty() &&
561 truncate_start.empty() && truncate_finish.empty() &&
562 destroyed_inodes.empty() && client_reqs.empty() &&
563 opened_ino == 0 && inotablev == 0 && sessionmapv == 0;
// Const summary printer: writes a short overview of the blob to `out` - the
// first dirfrag in lump_order and the number of lumps, table_tids when
// present, and inode-allocation details when allocated_ino or
// preallocated_inos is set.
// NOTE(review): a few lines (opening/closing text and at least one condition
// inside the allocation section) are not visible in this view.
566 void print(ostream& out) const {
568 if (!lump_order.empty())
569 out << " " << lump_order.front() << ", " << lump_map.size() << " dirs";
570 if (!table_tids.empty())
571 out << " table_tids=" << table_tids;
572 if (allocated_ino || preallocated_inos.size()) {
574 out << " alloc_ino=" << allocated_ino;
575 if (preallocated_inos.size())
576 out << " prealloc_ino=" << preallocated_inos;
577 if (used_preallocated_ino)
578 out << " used_prealloc_ino=" << used_preallocated_ino;
579 out << " v" << inotablev;
// Declarations only; no definitions are visible in this view.
// replay() takes an optional MDSlaveUpdate that defaults to NULL.
584 void update_segment(LogSegment *ls);
585 void replay(MDSRank *mds, LogSegment *ls, MDSlaveUpdate *su=NULL);
// Encoder macro invocations for EMetaBlob and its nested record types.
// The _FEATURES variant is used for the types whose encode() takes a feature
// mask (EMetaBlob, fullbit, dirlump); the plain variant for remotebit and
// nullbit, whose encode() takes only the bufferlist.
// NOTE(review): the macros are defined outside this file; presumably they
// generate free encode()/decode() wrappers - confirm.
587 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob)
588 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::fullbit)
589 WRITE_CLASS_ENCODER(EMetaBlob::remotebit)
590 WRITE_CLASS_ENCODER(EMetaBlob::nullbit)
591 WRITE_CLASS_ENCODER_FEATURES(EMetaBlob::dirlump)
// Stream insertion for a const EMetaBlob.
// NOTE(review): the body is not visible in this view; presumably it calls the
// const print(out) overload and returns `out` - confirm.
593 inline ostream& operator<<(ostream& out, const EMetaBlob& t) {