1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
24 #include "compact_map.h"
26 #include "ceph_frag.h"
27 #include "include/assert.h"
31 * the goal here is to use a binary split strategy to partition a namespace.
32 * frag_t represents a particular fragment. bits() tells you the size of the
33 * fragment, and value() it's name. this is roughly analogous to an ip address
36 * fragtree_t represents an entire namespace and it's partition. it essentially
37 * tells you where fragments are split into other fragments, and by how much
38 * (i.e. by how many bits, resulting in a power of 2 number of child fragments).
40 * this vaguely resembles a btree, in that when a fragment becomes large or small
41 * we can split or merge, except that there is no guarantee of being balanced.
43 * presumably we are partitioning the output of a (perhaps specialized) hash
50 * description of an individual fragment. that is, a particular piece
51 * of the overall namespace.
53 * this is conceptually analogous to an ip address and netmask.
55 * a value v falls "within" fragment f iff (v & f.mask()) == f.value().
57 * we write it as v/b, where v is a value and b is the number of bits.
58 * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that,
59 * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on.
61 * this makes the right most bit of v the "most significant", which is the
62 * opposite of what we usually see.
67 * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial)
68 * iteration efficient (see, e.g., try_assimilate_children()
69 * - rework frag_t so that we mask the left-most (most significant) bits instead of
70 * the right-most (least significant) bits. just because it's more intutive, and
71 * matches the network/netmask concept.
74 typedef uint32_t _frag_t;
78 * encoding is dictated by frag_* functions in ceph_fs.h. use those
79 * helpers _exclusively_.
84 frag_t() : _enc(0) { }
85 frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { }
86 frag_t(_frag_t e) : _enc(e) { }
89 void from_unsigned(unsigned e) { _enc = e; }
92 unsigned value() const { return ceph_frag_value(_enc); }
93 unsigned bits() const { return ceph_frag_bits(_enc); }
94 unsigned mask() const { return ceph_frag_mask(_enc); }
95 unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); }
97 operator _frag_t() const { return _enc; }
100 bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); }
101 bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); }
102 bool is_root() const { return bits() == 0; }
103 frag_t parent() const {
105 return frag_t(ceph_frag_parent(_enc));
109 frag_t make_child(int i, int nb) const {
111 return frag_t(ceph_frag_make_child(_enc, nb, i));
113 void split(int nb, std::list<frag_t>& fragments) const {
115 unsigned nway = 1 << nb;
116 for (unsigned i=0; i<nway; i++)
117 fragments.push_back(make_child(i, nb));
121 frag_t left_child() const { return frag_t(ceph_frag_left_child(_enc)); }
122 frag_t right_child() const { return frag_t(ceph_frag_right_child(_enc)); }
124 bool is_left() const { return ceph_frag_is_left_child(_enc); }
125 bool is_right() const { return ceph_frag_is_right_child(_enc); }
126 frag_t get_sibling() const {
128 return frag_t(ceph_frag_sibling(_enc));
132 bool is_leftmost() const { return ceph_frag_is_leftmost(_enc); }
133 bool is_rightmost() const { return ceph_frag_is_rightmost(_enc); }
134 frag_t next() const {
135 assert(!is_rightmost());
136 return frag_t(ceph_frag_next(_enc));
140 bool parse(const char *s) {
142 int r = sscanf(s, "%x/%d", &pvalue, &pbits);
144 *this = frag_t(pvalue, pbits);
151 inline std::ostream& operator<<(std::ostream& out, const frag_t& hb)
153 //out << std::hex << hb.value() << std::dec << "/" << hb.bits() << '=';
154 unsigned num = hb.bits();
156 unsigned val = hb.value();
157 for (unsigned bit = 23; num; num--, bit--)
158 out << ((val & (1<<bit)) ? '1':'0');
163 inline void encode(frag_t f, bufferlist& bl) { encode_raw(f._enc, bl); }
164 inline void decode(frag_t &f, bufferlist::iterator& p) {
173 * fragtree_t -- partition an entire namespace into one or more frag_t's.
177 // frag_t f is split by b bits.
178 // if child frag_t does not appear, it is not split.
180 compact_map<frag_t,int32_t> _splits;
185 void swap(fragtree_t& other) {
186 _splits.swap(other._splits);
195 return _splits.empty();
197 int get_split(const frag_t hb) const {
198 compact_map<frag_t,int32_t>::const_iterator p = _splits.find(hb);
199 if (p == _splits.end())
206 bool is_leaf(frag_t x) const {
207 std::list<frag_t> ls;
208 get_leaves_under(x, ls);
209 //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl;
218 * get_leaves -- list all leaves
220 void get_leaves(std::list<frag_t>& ls) const {
221 return get_leaves_under_split(frag_t(), ls);
225 * get_leaves_under_split -- list all leaves under a known split point (or root)
227 void get_leaves_under_split(frag_t under, std::list<frag_t>& ls) const {
233 int nb = get_split(t);
235 t.split(nb, q); // queue up children
237 ls.push_front(t); // not spit, it's a leaf.
242 * get_branch -- get branch point at OR above frag @a x
243 * - may be @a x itself, if @a x is a split
244 * - may be root (frag_t())
246 frag_t get_branch(frag_t x) const {
248 if (x == frag_t()) return x; // root
249 if (get_split(x)) return x; // found it!
255 * get_branch_above -- get a branch point above frag @a x
256 * - may be root (frag_t())
257 * - may NOT be @a x, even if @a x is a split.
259 frag_t get_branch_above(frag_t x) const {
261 if (x == frag_t()) return x; // root
263 if (get_split(x)) return x; // found it!
269 * get_branch_or_leaf -- get branch or leaf point parent for frag @a x
270 * - may be @a x itself, if @a x is a split or leaf
271 * - may be root (frag_t())
273 frag_t get_branch_or_leaf(frag_t x) const {
274 frag_t branch = get_branch(x);
275 int nb = get_split(branch);
276 if (nb > 0 && // if branch is a split, and
277 branch.bits() + nb <= x.bits()) // one of the children is or contains x
278 return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf)
284 * get_leaves_under(x, ls) -- search for any leaves fully contained by x
286 void get_leaves_under(frag_t x, std::list<frag_t>& ls) const {
288 q.push_back(get_branch_or_leaf(x));
290 frag_t t = q.front();
292 if (t.bits() >= x.bits() && // if t is more specific than x, and
293 !x.contains(t)) // x does not contain t,
294 continue; // then skip
295 int nb = get_split(t);
297 t.split(nb, q); // queue up children
298 else if (x.contains(t))
299 ls.push_back(t); // not spit, it's a leaf.
304 * contains(fg) -- does fragtree contain the specific frag @a x
306 bool contains(frag_t x) const {
308 q.push_back(get_branch(x));
310 frag_t t = q.front();
312 if (t.bits() >= x.bits() && // if t is more specific than x, and
313 !x.contains(t)) // x does not contain t,
314 continue; // then skip
315 int nb = get_split(t);
317 if (t == x) return false; // it's split.
318 t.split(nb, q); // queue up children
320 if (t == x) return true; // it's there.
327 * operator[] -- map a (hash?) value to a frag
329 frag_t operator[](unsigned v) const {
332 assert(t.contains(v));
333 int nb = get_split(t);
336 if (nb == 0) return t; // done.
338 // pick appropriate child fragment.
339 unsigned nway = 1 << nb;
341 for (i=0; i<nway; i++) {
342 frag_t n = t.make_child(i, nb);
355 void split(frag_t x, int b, bool simplify=true) {
360 try_assimilate_children(get_branch_above(x));
362 void merge(frag_t x, int b, bool simplify=true) {
364 assert(_splits[x] == b);
368 try_assimilate_children(get_branch_above(x));
372 * if all of a given split's children are identically split,
373 * then the children can be assimilated.
375 void try_assimilate_children(frag_t x) {
376 int nb = get_split(x);
378 std::list<frag_t> children;
379 x.split(nb, children);
381 for (std::list<frag_t>::iterator p = children.begin();
384 int cb = get_split(*p);
385 if (!cb) return; // nope.
386 if (childbits && cb != childbits) return; // not the same
389 // all children are split with childbits!
390 for (std::list<frag_t>::iterator p = children.begin();
394 _splits[x] += childbits;
397 bool force_to_leaf(CephContext *cct, frag_t x) {
401 lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl;
403 frag_t parent = get_branch_or_leaf(x);
404 assert(parent.bits() <= x.bits());
405 lgeneric_dout(cct, 10) << "parent is " << parent << dendl;
407 // do we need to split from parent to x?
408 if (parent.bits() < x.bits()) {
409 int spread = x.bits() - parent.bits();
410 int nb = get_split(parent);
411 lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl;
413 // easy: split parent (a leaf) by the difference
414 lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl;
415 split(parent, spread);
421 // add an intermediary split
422 merge(parent, nb, false);
423 split(parent, spread, false);
425 std::list<frag_t> subs;
426 parent.split(spread, subs);
427 for (std::list<frag_t>::iterator p = subs.begin();
430 lgeneric_dout(cct, 10) << "splitting intermediate " << *p << " by " << (nb-spread) << dendl;
431 split(*p, nb - spread, false);
435 // x is now a leaf or split.
436 // hoover up any children.
440 frag_t t = q.front();
442 int nb = get_split(t);
444 lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl;
445 merge(t, nb, false); // merge this point, and
446 t.split(nb, q); // queue up children
450 lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl;
456 void encode(bufferlist& bl) const {
457 ::encode(_splits, bl);
459 void decode(bufferlist::iterator& p) {
460 ::decode(_splits, p);
462 void encode_nohead(bufferlist& bl) const {
463 for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
466 ::encode(p->first, bl);
467 ::encode(p->second, bl);
470 void decode_nohead(int n, bufferlist::iterator& p) {
475 ::decode(_splits[f], p);
479 void print(std::ostream& out) {
480 out << "fragtree_t(";
482 q.push_back(frag_t());
484 frag_t t = q.front();
489 for (unsigned i=0; i<t.bits(); i++) out << ' ';
491 int nb = get_split(t);
493 out << t << " %" << nb;
494 t.split(nb, q); // queue up children
502 void dump(Formatter *f) const {
503 f->open_array_section("splits");
504 for (compact_map<frag_t,int32_t>::const_iterator p = _splits.begin();
507 f->open_object_section("split");
508 std::ostringstream frag_str;
509 frag_str << p->first;
510 f->dump_string("frag", frag_str.str());
511 f->dump_int("children", p->second);
512 f->close_section(); // split
514 f->close_section(); // splits
517 WRITE_CLASS_ENCODER(fragtree_t)
519 inline bool operator==(const fragtree_t& l, const fragtree_t& r) {
520 return l._splits == r._splits;
522 inline bool operator!=(const fragtree_t& l, const fragtree_t& r) {
523 return l._splits != r._splits;
526 inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft)
528 out << "fragtree_t(";
530 for (compact_map<frag_t,int32_t>::const_iterator p = ft._splits.begin();
531 p != ft._splits.end();
533 if (p != ft._splits.begin())
535 out << p->first << "^" << p->second;
542 * fragset_t -- a set of fragments
545 std::set<frag_t> _set;
548 const std::set<frag_t> &get() const { return _set; }
549 std::set<frag_t>::iterator begin() { return _set.begin(); }
550 std::set<frag_t>::iterator end() { return _set.end(); }
552 bool empty() const { return _set.empty(); }
554 bool contains(frag_t f) const {
556 if (_set.count(f)) return true;
557 if (f.bits() == 0) return false;
562 void insert(frag_t f) {
570 std::set<frag_t>::iterator p = _set.begin();
571 while (p != _set.end()) {
573 _set.count(p->get_sibling())) {
574 _set.erase(p->get_sibling());
575 _set.insert(p->parent());
588 inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs)
590 return out << "fragset_t(" << fs.get() << ")";