// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- // vim: ts=8 sw=2 smarttab /* * Ceph - scalable distributed file system * * Copyright (C) 2004-2006 Sage Weil * * This is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License version 2.1, as published by the Free Software * Foundation. See file COPYING. * */ #ifndef CEPH_FRAG_H #define CEPH_FRAG_H #include #include #include #include #include "buffer.h" #include "compact_map.h" #include "ceph_frag.h" #include "include/assert.h" /* * * the goal here is to use a binary split strategy to partition a namespace. * frag_t represents a particular fragment. bits() tells you the size of the * fragment, and value() it's name. this is roughly analogous to an ip address * and netmask. * * fragtree_t represents an entire namespace and it's partition. it essentially * tells you where fragments are split into other fragments, and by how much * (i.e. by how many bits, resulting in a power of 2 number of child fragments). * * this vaguely resembles a btree, in that when a fragment becomes large or small * we can split or merge, except that there is no guarantee of being balanced. * * presumably we are partitioning the output of a (perhaps specialized) hash * function. */ /** * frag_t * * description of an individual fragment. that is, a particular piece * of the overall namespace. * * this is conceptually analogous to an ip address and netmask. * * a value v falls "within" fragment f iff (v & f.mask()) == f.value(). * * we write it as v/b, where v is a value and b is the number of bits. * 0/0 (bits==0) corresponds to the entire namespace. if we bisect that, * we get 0/1 and 1/1. quartering gives us 0/2, 1/2, 2/2, 3/2. and so on. * * this makes the right most bit of v the "most significant", which is the * opposite of what we usually see. */ /* * TODO: * - get_first_child(), next_sibling(int parent_bits) to make (possibly partial) * iteration efficient (see, e.g., try_assimilate_children() * - rework frag_t so that we mask the left-most (most significant) bits instead of * the right-most (least significant) bits. just because it's more intutive, and * matches the network/netmask concept. */ typedef uint32_t _frag_t; class frag_t { /* * encoding is dictated by frag_* functions in ceph_fs.h. use those * helpers _exclusively_. */ public: _frag_t _enc; frag_t() : _enc(0) { } frag_t(unsigned v, unsigned b) : _enc(ceph_frag_make(b, v)) { } frag_t(_frag_t e) : _enc(e) { } // constructors void from_unsigned(unsigned e) { _enc = e; } // accessors unsigned value() const { return ceph_frag_value(_enc); } unsigned bits() const { return ceph_frag_bits(_enc); } unsigned mask() const { return ceph_frag_mask(_enc); } unsigned mask_shift() const { return ceph_frag_mask_shift(_enc); } operator _frag_t() const { return _enc; } // tests bool contains(unsigned v) const { return ceph_frag_contains_value(_enc, v); } bool contains(frag_t sub) const { return ceph_frag_contains_frag(_enc, sub._enc); } bool is_root() const { return bits() == 0; } frag_t parent() const { assert(bits() > 0); return frag_t(ceph_frag_parent(_enc)); } // splitting frag_t make_child(int i, int nb) const { assert(i < (1<& fragments) const { assert(nb > 0); unsigned nway = 1 << nb; for (unsigned i=0; i: // frag_t f is split by b bits. // if child frag_t does not appear, it is not split. public: compact_map _splits; public: // ------------- // basics void swap(fragtree_t& other) { _splits.swap(other._splits); } void clear() { _splits.clear(); } // ------------- // accessors bool empty() const { return _splits.empty(); } int get_split(const frag_t hb) const { compact_map::const_iterator p = _splits.find(hb); if (p == _splits.end()) return 0; else return p->second; } bool is_leaf(frag_t x) const { std::list ls; get_leaves_under(x, ls); //generic_dout(10) << "is_leaf(" << x << ") -> " << ls << dendl; if (!ls.empty() && ls.front() == x && ls.size() == 1) return true; return false; } /** * get_leaves -- list all leaves */ void get_leaves(std::list& ls) const { return get_leaves_under_split(frag_t(), ls); } /** * get_leaves_under_split -- list all leaves under a known split point (or root) */ void get_leaves_under_split(frag_t under, std::list& ls) const { std::list q; q.push_back(under); while (!q.empty()) { frag_t t = q.back(); q.pop_back(); int nb = get_split(t); if (nb) t.split(nb, q); // queue up children else ls.push_front(t); // not spit, it's a leaf. } } /** * get_branch -- get branch point at OR above frag @a x * - may be @a x itself, if @a x is a split * - may be root (frag_t()) */ frag_t get_branch(frag_t x) const { while (1) { if (x == frag_t()) return x; // root if (get_split(x)) return x; // found it! x = x.parent(); } } /** * get_branch_above -- get a branch point above frag @a x * - may be root (frag_t()) * - may NOT be @a x, even if @a x is a split. */ frag_t get_branch_above(frag_t x) const { while (1) { if (x == frag_t()) return x; // root x = x.parent(); if (get_split(x)) return x; // found it! } } /** * get_branch_or_leaf -- get branch or leaf point parent for frag @a x * - may be @a x itself, if @a x is a split or leaf * - may be root (frag_t()) */ frag_t get_branch_or_leaf(frag_t x) const { frag_t branch = get_branch(x); int nb = get_split(branch); if (nb > 0 && // if branch is a split, and branch.bits() + nb <= x.bits()) // one of the children is or contains x return frag_t(x.value(), branch.bits()+nb); // then return that child (it's a leaf) else return branch; } /** * get_leaves_under(x, ls) -- search for any leaves fully contained by x */ void get_leaves_under(frag_t x, std::list& ls) const { std::list q; q.push_back(get_branch_or_leaf(x)); while (!q.empty()) { frag_t t = q.front(); q.pop_front(); if (t.bits() >= x.bits() && // if t is more specific than x, and !x.contains(t)) // x does not contain t, continue; // then skip int nb = get_split(t); if (nb) t.split(nb, q); // queue up children else if (x.contains(t)) ls.push_back(t); // not spit, it's a leaf. } } /** * contains(fg) -- does fragtree contain the specific frag @a x */ bool contains(frag_t x) const { std::list q; q.push_back(get_branch(x)); while (!q.empty()) { frag_t t = q.front(); q.pop_front(); if (t.bits() >= x.bits() && // if t is more specific than x, and !x.contains(t)) // x does not contain t, continue; // then skip int nb = get_split(t); if (nb) { if (t == x) return false; // it's split. t.split(nb, q); // queue up children } else { if (t == x) return true; // it's there. } } return false; } /** * operator[] -- map a (hash?) value to a frag */ frag_t operator[](unsigned v) const { frag_t t; while (1) { assert(t.contains(v)); int nb = get_split(t); // is this a leaf? if (nb == 0) return t; // done. // pick appropriate child fragment. unsigned nway = 1 << nb; unsigned i; for (i=0; i children; x.split(nb, children); int childbits = 0; for (std::list::iterator p = children.begin(); p != children.end(); ++p) { int cb = get_split(*p); if (!cb) return; // nope. if (childbits && cb != childbits) return; // not the same childbits = cb; } // all children are split with childbits! for (std::list::iterator p = children.begin(); p != children.end(); ++p) _splits.erase(*p); _splits[x] += childbits; } bool force_to_leaf(CephContext *cct, frag_t x) { if (is_leaf(x)) return false; lgeneric_dout(cct, 10) << "force_to_leaf " << x << " on " << _splits << dendl; frag_t parent = get_branch_or_leaf(x); assert(parent.bits() <= x.bits()); lgeneric_dout(cct, 10) << "parent is " << parent << dendl; // do we need to split from parent to x? if (parent.bits() < x.bits()) { int spread = x.bits() - parent.bits(); int nb = get_split(parent); lgeneric_dout(cct, 10) << "spread " << spread << ", parent splits by " << nb << dendl; if (nb == 0) { // easy: split parent (a leaf) by the difference lgeneric_dout(cct, 10) << "splitting parent " << parent << " by spread " << spread << dendl; split(parent, spread); assert(is_leaf(x)); return true; } assert(nb > spread); // add an intermediary split merge(parent, nb, false); split(parent, spread, false); std::list subs; parent.split(spread, subs); for (std::list::iterator p = subs.begin(); p != subs.end(); ++p) { lgeneric_dout(cct, 10) << "splitting intermediate " << *p << " by " << (nb-spread) << dendl; split(*p, nb - spread, false); } } // x is now a leaf or split. // hoover up any children. std::list q; q.push_back(x); while (!q.empty()) { frag_t t = q.front(); q.pop_front(); int nb = get_split(t); if (nb) { lgeneric_dout(cct, 10) << "merging child " << t << " by " << nb << dendl; merge(t, nb, false); // merge this point, and t.split(nb, q); // queue up children } } lgeneric_dout(cct, 10) << "force_to_leaf done" << dendl; assert(is_leaf(x)); return true; } // encoding void encode(bufferlist& bl) const { ::encode(_splits, bl); } void decode(bufferlist::iterator& p) { ::decode(_splits, p); } void encode_nohead(bufferlist& bl) const { for (compact_map::const_iterator p = _splits.begin(); p != _splits.end(); ++p) { ::encode(p->first, bl); ::encode(p->second, bl); } } void decode_nohead(int n, bufferlist::iterator& p) { _splits.clear(); while (n-- > 0) { frag_t f; ::decode(f, p); ::decode(_splits[f], p); } } void print(std::ostream& out) { out << "fragtree_t("; std::list q; q.push_back(frag_t()); while (!q.empty()) { frag_t t = q.front(); q.pop_front(); // newline + indent? if (t.bits()) { out << std::endl; for (unsigned i=0; iopen_array_section("splits"); for (compact_map::const_iterator p = _splits.begin(); p != _splits.end(); ++p) { f->open_object_section("split"); std::ostringstream frag_str; frag_str << p->first; f->dump_string("frag", frag_str.str()); f->dump_int("children", p->second); f->close_section(); // split } f->close_section(); // splits } }; WRITE_CLASS_ENCODER(fragtree_t) inline bool operator==(const fragtree_t& l, const fragtree_t& r) { return l._splits == r._splits; } inline bool operator!=(const fragtree_t& l, const fragtree_t& r) { return l._splits != r._splits; } inline std::ostream& operator<<(std::ostream& out, const fragtree_t& ft) { out << "fragtree_t("; for (compact_map::const_iterator p = ft._splits.begin(); p != ft._splits.end(); ++p) { if (p != ft._splits.begin()) out << " "; out << p->first << "^" << p->second; } return out << ")"; } /** * fragset_t -- a set of fragments */ class fragset_t { std::set _set; public: const std::set &get() const { return _set; } std::set::iterator begin() { return _set.begin(); } std::set::iterator end() { return _set.end(); } bool empty() const { return _set.empty(); } bool contains(frag_t f) const { while (1) { if (_set.count(f)) return true; if (f.bits() == 0) return false; f = f.parent(); } } void insert(frag_t f) { _set.insert(f); simplify(); } void simplify() { while (1) { bool clean = true; std::set::iterator p = _set.begin(); while (p != _set.end()) { if (!p->is_root() && _set.count(p->get_sibling())) { _set.erase(p->get_sibling()); _set.insert(p->parent()); _set.erase(p++); clean = false; } else { p++; } } if (clean) break; } } }; inline std::ostream& operator<<(std::ostream& out, const fragset_t& fs) { return out << "fragset_t(" << fs.get() << ")"; } #endif