+++ /dev/null
-#ifndef CEPH_CRUSH_CRUSH_H
-#define CEPH_CRUSH_CRUSH_H
-
-#ifdef __KERNEL__
-# include <linux/types.h>
-#else
-# include "crush_compat.h"
-#endif
-
-/*
- * CRUSH is a pseudo-random data distribution algorithm that
- * efficiently distributes input values (typically, data objects)
- * across a heterogeneous, structured storage cluster.
- *
- * The algorithm was originally described in detail in this paper
- * (although the algorithm has evolved somewhat since then):
- *
- * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
- *
- * LGPL2
- */
-
-
-#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
-
-#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
-#define CRUSH_MAX_RULESET (1<<8) /* max crush ruleset number */
-#define CRUSH_MAX_RULES CRUSH_MAX_RULESET /* should be the same as max rulesets */
-
-#define CRUSH_MAX_DEVICE_WEIGHT (100u * 0x10000u)
-#define CRUSH_MAX_BUCKET_WEIGHT (65535u * 0x10000u)
-
-#define CRUSH_ITEM_UNDEF 0x7ffffffe /* undefined result (internal use only) */
-/** @ingroup API
- * The equivalent of NULL for an item, i.e. the absence of an item.
- */
-#define CRUSH_ITEM_NONE 0x7fffffff
-
-/*
- * CRUSH uses user-defined "rules" to describe how inputs should be
- * mapped to devices. A rule consists of sequence of steps to perform
- * to generate the set of output devices.
- */
-struct crush_rule_step {
- __u32 op;
- __s32 arg1;
- __s32 arg2;
-};
-
-/** @ingroup API
- */
-enum crush_opcodes {
- /*! do nothing
- */
- CRUSH_RULE_NOOP = 0,
- CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
- CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
- /* arg2 = type */
- CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
- CRUSH_RULE_EMIT = 4, /* no args */
- CRUSH_RULE_CHOOSELEAF_FIRSTN = 6,
- CRUSH_RULE_CHOOSELEAF_INDEP = 7,
-
- CRUSH_RULE_SET_CHOOSE_TRIES = 8, /* override choose_total_tries */
- CRUSH_RULE_SET_CHOOSELEAF_TRIES = 9, /* override chooseleaf_descend_once */
- CRUSH_RULE_SET_CHOOSE_LOCAL_TRIES = 10,
- CRUSH_RULE_SET_CHOOSE_LOCAL_FALLBACK_TRIES = 11,
- CRUSH_RULE_SET_CHOOSELEAF_VARY_R = 12,
- CRUSH_RULE_SET_CHOOSELEAF_STABLE = 13
-};
-
-/*
- * for specifying choose num (arg1) relative to the max parameter
- * passed to do_rule
- */
-#define CRUSH_CHOOSE_N 0
-#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
-
-/*
- * The rule mask is used to describe what the rule is intended for.
- * Given a ruleset and size of output set, we search through the
- * rule list for a matching rule_mask.
- */
-struct crush_rule_mask {
- __u8 ruleset;
- __u8 type;
- __u8 min_size;
- __u8 max_size;
-};
-
-struct crush_rule {
- __u32 len;
- struct crush_rule_mask mask;
- struct crush_rule_step steps[0];
-};
-
-#define crush_rule_size(len) (sizeof(struct crush_rule) + \
- (len)*sizeof(struct crush_rule_step))
-
-
-
-/*
- * A bucket is a named container of other items (either devices or
- * other buckets).
- */
-
-/** @ingroup API
- *
- * Items within a bucket are chosen with crush_do_rule() using one of
- * three algorithms representing a tradeoff between performance and
- * reorganization efficiency. If you are unsure of which bucket type
- * to use, we recommend using ::CRUSH_BUCKET_STRAW2.
- *
- * The table summarizes how the speed of each option measures up
- * against mapping stability when items are added or removed.
- *
- * Bucket Alg Speed Additions Removals
- * ------------------------------------------------
- * uniform O(1) poor poor
- * list O(n) optimal poor
- * straw2 O(n) optimal optimal
- */
-enum crush_algorithm {
- /*!
- * Devices are rarely added individually in a large system.
- * Instead, new storage is typically deployed in blocks of identical
- * devices, often as an additional shelf in a server rack or perhaps
- * an entire cabinet. Devices reaching their end of life are often
- * similarly decommissioned as a set (individual failures aside),
- * making it natural to treat them as a unit. CRUSH uniform buckets
- * are used to represent an identical set of devices in such
- * circumstances. The key advantage in doing so is performance
- * related: CRUSH can map replicas into uniform buckets in constant
- * time. In cases where the uniformity restrictions are not
- * appropriate, other bucket types can be used. If the size of a
- * uniform bucket changes, there is a complete reshuffling of data
- * between devices, much like conventional hash-based distribution
- * strategies.
- */
- CRUSH_BUCKET_UNIFORM = 1,
- /*!
- * List buckets structure their contents as a linked list, and
- * can contain items with arbitrary weights. To place a
- * replica, CRUSH begins at the head of the list with the most
- * recently added item and compares its weight to the sum of
- * all remaining items' weights. Depending on the value of
- * hash( x , r , item), either the current item is chosen with
- * the appropriate probability, or the process continues
- * recursively down the list. This is a natural and intuitive
- * choice for an expanding cluster: either an object is
- * relocated to the newest device with some appropriate
- * probability, or it remains on the older devices as before.
- * The result is optimal data migration when items are added
- * to the bucket. Items removed from the middle or tail of the
- * list, however, can result in a significant amount of
- * unnecessary movement, making list buckets most suitable for
- * circumstances in which they never (or very rarely) shrink.
- */
- CRUSH_BUCKET_LIST = 2,
- /*! @cond INTERNAL */
- CRUSH_BUCKET_TREE = 3,
- CRUSH_BUCKET_STRAW = 4,
- /*! @endcond */
- /*!
- * List and tree buckets are structured such that a limited
- * number of hash values need to be calculated and compared to
- * weights in order to select a bucket item. In doing so,
- * they divide and conquer in a way that either gives certain
- * items precedence (e. g., those at the beginning of a list)
- * or obviates the need to consider entire subtrees of items
- * at all. That improves the performance of the replica
- * placement process, but can also introduce suboptimal
- * reorganization behavior when the contents of a bucket
- * change due an addition, removal, or re-weighting of an
- * item.
- *
- * The straw2 bucket type allows all items to fairly "compete"
- * against each other for replica placement through a process
- * analogous to a draw of straws. To place a replica, a straw
- * of random length is drawn for each item in the bucket. The
- * item with the longest straw wins. The length of each straw
- * is initially a value in a fixed range. Each straw length
- * is scaled by a factor based on the item's weight so that
- * heavily weighted items are more likely to win the draw.
- * Although this process is almost twice as slow (on average)
- * than a list bucket and even slower than a tree bucket
- * (which scales logarithmically), straw2 buckets result in
- * optimal data movement between nested items when modified.
- */
- CRUSH_BUCKET_STRAW2 = 5,
-};
-extern const char *crush_bucket_alg_name(int alg);
-
-/*
- * although tree was a legacy algorithm, it has been buggy, so
- * exclude it.
- */
-#define CRUSH_LEGACY_ALLOWED_BUCKET_ALGS ( \
- (1 << CRUSH_BUCKET_UNIFORM) | \
- (1 << CRUSH_BUCKET_LIST) | \
- (1 << CRUSH_BUCKET_STRAW))
-
-/** @ingroup API
- *
- * A bucket contains __size__ __items__ which are either positive
- * numbers or negative numbers that reference other buckets and is
- * uniquely identified with __id__ which is a negative number. The
- * __weight__ of a bucket is the cumulative weight of all its
- * children. A bucket is assigned a ::crush_algorithm that is used by
- * crush_do_rule() to draw an item depending on its weight. A bucket
- * can be assigned a strictly positive (> 0) __type__ defined by the
- * caller. The __type__ can be used by crush_do_rule(), when it is
- * given as an argument of a rule step.
- *
- * A pointer to crush_bucket can safely be cast into the following
- * structure, depending on the value of __alg__:
- *
- * - __alg__ == ::CRUSH_BUCKET_UNIFORM cast to crush_bucket_uniform
- * - __alg__ == ::CRUSH_BUCKET_LIST cast to crush_bucket_list
- * - __alg__ == ::CRUSH_BUCKET_STRAW2 cast to crush_bucket_straw2
- *
- * The weight of each item depends on the algorithm and the
- * information about it is available in the corresponding structure
- * (crush_bucket_uniform, crush_bucket_list or crush_bucket_straw2).
- *
- * See crush_map for more information on how __id__ is used
- * to reference the bucket.
- */
-struct crush_bucket {
- __s32 id; /*!< bucket identifier, < 0 and unique within a crush_map */
- __u16 type; /*!< > 0 bucket type, defined by the caller */
- __u8 alg; /*!< the item selection ::crush_algorithm */
- /*! @cond INTERNAL */
- __u8 hash; /* which hash function to use, CRUSH_HASH_* */
- /*! @endcond */
- __u32 weight; /*!< 16.16 fixed point cumulated children weight */
- __u32 size; /*!< size of the __items__ array */
- __s32 *items; /*!< array of children: < 0 are buckets, >= 0 items */
-};
-
-/** @ingroup API
- *
- * Replacement weights for each item in a bucket. The size of the
- * array must be exactly the size of the straw2 bucket, just as the
- * item_weights array.
- *
- */
-struct crush_weight_set {
- __u32 *weights; /*!< 16.16 fixed point weights in the same order as items */
- __u32 size; /*!< size of the __weights__ array */
-};
-
-/** @ingroup API
- *
- * Replacement weights and ids for a given straw2 bucket, for
- * placement purposes.
- *
- * When crush_do_rule() chooses the Nth item from a straw2 bucket, the
- * replacement weights found at __weight_set[N]__ are used instead of
- * the weights from __item_weights__. If __N__ is greater than
- * __weight_set_size__, the weights found at __weight_set_size-1__ are
- * used instead. For instance if __weight_set__ is:
- *
- * [ [ 0x10000, 0x20000 ], // position 0
- * [ 0x20000, 0x40000 ] ] // position 1
- *
- * choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
- * choosing the 1th item will use position 1 weights [ 0x20000, 0x40000 ]
- * choosing the 2th item will use position 1 weights [ 0x20000, 0x40000 ]
- * etc.
- *
- */
-struct crush_choose_arg {
- __s32 *ids; /*!< values to use instead of items */
- __u32 ids_size; /*!< size of the __ids__ array */
- struct crush_weight_set *weight_set; /*!< weight replacements for a given position */
- __u32 weight_set_size; /*!< size of the __weight_set__ array */
-};
-
-/** @ingroup API
- *
- * Replacement weights and ids for each bucket in the crushmap. The
- * __size__ of the __args__ array must be exactly the same as the
- * __map->max_buckets__.
- *
- * The __crush_choose_arg__ at index N will be used when choosing
- * an item from the bucket __map->buckets[N]__ bucket, provided it
- * is a straw2 bucket.
- *
- */
-struct crush_choose_arg_map {
- struct crush_choose_arg *args; /*!< replacement for each bucket in the crushmap */
- __u32 size; /*!< size of the __args__ array */
-};
-
-/** @ingroup API
- * The weight of each item in the bucket when
- * __h.alg__ == ::CRUSH_BUCKET_UNIFORM.
- */
-struct crush_bucket_uniform {
- struct crush_bucket h; /*!< generic bucket information */
- __u32 item_weight; /*!< 16.16 fixed point weight for each item */
-};
-
-/** @ingroup API
- * The weight of each item in the bucket when
- * __h.alg__ == ::CRUSH_BUCKET_LIST.
- *
- * The weight of __h.items[i]__ is __item_weights[i]__ for i in
- * [0,__h.size__[. The __sum_weight__[i] is the sum of the __item_weights[j]__
- * for j in [0,i[.
- *
- */
-struct crush_bucket_list {
- struct crush_bucket h; /*!< generic bucket information */
- __u32 *item_weights; /*!< 16.16 fixed point weight for each item */
- __u32 *sum_weights; /*!< 16.16 fixed point sum of the weights */
-};
-
-struct crush_bucket_tree {
- struct crush_bucket h; /* note: h.size is _tree_ size, not number of
- actual items */
- __u8 num_nodes;
- __u32 *node_weights;
-};
-
-struct crush_bucket_straw {
- struct crush_bucket h;
- __u32 *item_weights; /* 16-bit fixed point */
- __u32 *straws; /* 16-bit fixed point */
-};
-
-/** @ingroup API
- * The weight of each item in the bucket when
- * __h.alg__ == ::CRUSH_BUCKET_STRAW2.
- *
- * The weight of __h.items[i]__ is __item_weights[i]__ for i in
- * [0,__h.size__[.
- */
-struct crush_bucket_straw2 {
- struct crush_bucket h; /*!< generic bucket information */
- __u32 *item_weights; /*!< 16.16 fixed point weight for each item */
-};
-
-
-
-/** @ingroup API
- *
- * A crush map define a hierarchy of crush_bucket that end with leaves
- * (buckets and leaves are called items) and a set of crush_rule to
- * map an integer to items with the crush_do_rule() function.
- *
- */
-struct crush_map {
- /*! An array of crush_bucket pointers of size __max_buckets__.
- * An element of the array may be NULL if the bucket was removed with
- * crush_remove_bucket(). The buckets must be added with crush_add_bucket().
- * The bucket found at __buckets[i]__ must have a crush_bucket.id == -1-i.
- */
- struct crush_bucket **buckets;
- /*! An array of crush_rule pointers of size __max_rules__.
- * An element of the array may be NULL if the rule was removed (there is
- * no API to do so but there may be one in the future). The rules must be added
- * with crush_add_rule().
- */
- struct crush_rule **rules;
- __s32 max_buckets; /*!< the size of __buckets__ */
- __u32 max_rules; /*!< the size of __rules__ */
- /*! The value of the highest item stored in the crush_map + 1
- */
- __s32 max_devices;
-
- /*! Backward compatibility tunable. It implements a bad solution
- * and must always be set to 0 except for backward compatibility
- * purposes
- */
- __u32 choose_local_tries;
- /*! Backward compatibility tunable. It implements a bad solution
- * and must always be set to 0 except for backward compatibility
- * purposes
- */
- __u32 choose_local_fallback_tries;
- /*! Tunable. The default value when the CHOOSE_TRIES or
- * CHOOSELEAF_TRIES steps are omitted in a rule. See the
- * documentation for crush_rule_set_step() for more
- * information
- */
- __u32 choose_total_tries;
- /*! Backward compatibility tunable. It should always be set
- * to 1 except for backward compatibility. Implemented in 2012
- * it was generalized late 2013 and is mostly unused except
- * in one border case, reason why it must be set to 1.
- *
- * Attempt chooseleaf inner descent once for firstn mode; on
- * reject retry outer descent. Note that this does *not*
- * apply to a collision: in that case we will retry as we
- * used to.
- */
- __u32 chooseleaf_descend_once;
- /*! Backward compatibility tunable. It is a fix for bad
- * mappings implemented in 2014 at
- * https://github.com/ceph/ceph/pull/1185. It should always
- * be set to 1 except for backward compatibility.
- *
- * If non-zero, feed r into chooseleaf, bit-shifted right by
- * (r-1) bits. a value of 1 is best for new clusters. for
- * legacy clusters that want to limit reshuffling, a value of
- * 3 or 4 will make the mappings line up a bit better with
- * previous mappings.
- */
- __u8 chooseleaf_vary_r;
-
- /*! Backward compatibility tunable. It is an improvement that
- * avoids unnecessary mapping changes, implemented at
- * https://github.com/ceph/ceph/pull/6572 and explained in
- * this post: "chooseleaf may cause some unnecessary pg
- * migrations" in October 2015
- * https://www.mail-archive.com/ceph-devel@vger.kernel.org/msg26075.html
- * It should always be set to 1 except for backward compatibility.
- */
- __u8 chooseleaf_stable;
-
- /*! @cond INTERNAL */
- /* This value is calculated after decode or construction by
- the builder. It is exposed here (rather than having a
- 'build CRUSH working space' function) so that callers can
- reserve a static buffer, allocate space on the stack, or
- otherwise avoid calling into the heap allocator if they
- want to. The size of the working space depends on the map,
- while the size of the scratch vector passed to the mapper
- depends on the size of the desired result set.
-
- Nothing stops the caller from allocating both in one swell
- foop and passing in two points, though. */
- size_t working_size;
-
-#ifndef __KERNEL__
- /*! @endcond */
- /*! Backward compatibility tunable. It is a fix for the straw
- * scaler values for the straw algorithm which is deprecated
- * (straw2 replaces it) implemented at
- * https://github.com/ceph/ceph/pull/3057. It should always
- * be set to 1 except for backward compatibility.
- *
- */
- __u8 straw_calc_version;
-
- /*! @cond INTERNAL */
- /*
- * allowed bucket algs is a bitmask, here the bit positions
- * are CRUSH_BUCKET_*. note that these are *bits* and
- * CRUSH_BUCKET_* values are not, so we need to or together (1
- * << CRUSH_BUCKET_WHATEVER). The 0th bit is not used to
- * minimize confusion (bucket type values start at 1).
- */
- __u32 allowed_bucket_algs;
-
- __u32 *choose_tries;
-#endif
- /*! @endcond */
-};
-
-
-/* crush.c */
-/** @ingroup API
- *
- * Return the 16.16 fixed point weight of the item at __pos__ (zero
- * based index) within the bucket __b__. If __pos__ is negative or
- * greater or equal to the number of items in the bucket, return 0.
- *
- * @param b the bucket containing items
- * @param pos the zero based index of the item
- *
- * @returns the 16.16 fixed point item weight
- */
-extern int crush_get_bucket_item_weight(const struct crush_bucket *b, int pos);
-extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
-extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
-extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
-extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
-extern void crush_destroy_bucket_straw2(struct crush_bucket_straw2 *b);
-/** @ingroup API
- *
- * Deallocate a bucket created via crush_add_bucket().
- *
- * @param b the bucket to deallocate
- */
-extern void crush_destroy_bucket(struct crush_bucket *b);
-/** @ingroup API
- *
- * Deallocate a rule created via crush_add_rule().
- *
- * @param r the rule to deallocate
- */
-extern void crush_destroy_rule(struct crush_rule *r);
-/** @ingroup API
- *
- * Deallocate the __map__, previously allocated with crush_create.
- *
- * @param map the crush map
- */
-extern void crush_destroy(struct crush_map *map);
-
-static inline int crush_calc_tree_node(int i)
-{
- return ((i+1) << 1)-1;
-}
-
-static inline const char *crush_alg_name(int alg)
-{
- switch (alg) {
- case CRUSH_BUCKET_UNIFORM:
- return "uniform";
- case CRUSH_BUCKET_LIST:
- return "list";
- case CRUSH_BUCKET_TREE:
- return "tree";
- case CRUSH_BUCKET_STRAW:
- return "straw";
- case CRUSH_BUCKET_STRAW2:
- return "straw2";
- default:
- return "unknown";
- }
-}
-
-/* ---------------------------------------------------------------------
- Private
- --------------------------------------------------------------------- */
-
-/* These data structures are private to the CRUSH implementation. They
- are exposed in this header file because builder needs their
- definitions to calculate the total working size.
-
- Moving this out of the crush map allow us to treat the CRUSH map as
- immutable within the mapper and removes the requirement for a CRUSH
- map lock. */
-
-struct crush_work_bucket {
- __u32 perm_x; /* @x for which *perm is defined */
- __u32 perm_n; /* num elements of *perm that are permuted/defined */
- __u32 *perm; /* Permutation of the bucket's items */
-};
-
-struct crush_work {
- struct crush_work_bucket **work; /* Per-bucket working store */
-};
-
-#endif