Add the rt linux 4.1.3-rt3 as base
[kvmfornfv.git] / kernel / fs / reiserfs / fix_node.c
1 /*
2  * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
3  */
4
5 #include <linux/time.h>
6 #include <linux/slab.h>
7 #include <linux/string.h>
8 #include "reiserfs.h"
9 #include <linux/buffer_head.h>
10
11 /*
12  * To make any changes in the tree we find a node that contains item
13  * to be changed/deleted or position in the node we insert a new item
14  * to. We call this node S. To do balancing we need to decide what we
15  * will shift to left/right neighbor, or to a new node, where new item
16  * will be etc. To make this analysis simpler we build virtual
17  * node. Virtual node is an array of items, that will replace items of
18  * node S. (For instance if we are going to delete an item, virtual
19  * node does not contain it). Virtual node keeps information about
20  * item sizes and types, mergeability of first and last items, sizes
21  * of all entries in directory item. We use this array of items when
22  * calculating what we can shift to neighbors and how many nodes we
23  * have to have if we do not any shiftings, if we shift to left/right
24  * neighbor or to both.
25  */
26
27 /*
28  * Takes item number in virtual node, returns number of item
29  * that it has in source buffer
30  */
31 static inline int old_item_num(int new_num, int affected_item_num, int mode)
32 {
33         if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
34                 return new_num;
35
36         if (mode == M_INSERT) {
37
38                 RFALSE(new_num == 0,
39                        "vs-8005: for INSERT mode and item number of inserted item");
40
41                 return new_num - 1;
42         }
43
44         RFALSE(mode != M_DELETE,
45                "vs-8010: old_item_num: mode must be M_DELETE (mode = \'%c\'",
46                mode);
47         /* delete mode */
48         return new_num + 1;
49 }
50
51 static void create_virtual_node(struct tree_balance *tb, int h)
52 {
53         struct item_head *ih;
54         struct virtual_node *vn = tb->tb_vn;
55         int new_num;
56         struct buffer_head *Sh; /* this comes from tb->S[h] */
57
58         Sh = PATH_H_PBUFFER(tb->tb_path, h);
59
60         /* size of changed node */
61         vn->vn_size =
62             MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];
63
64         /* for internal nodes array if virtual items is not created */
65         if (h) {
66                 vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
67                 return;
68         }
69
70         /* number of items in virtual node  */
71         vn->vn_nr_item =
72             B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
73             ((vn->vn_mode == M_DELETE) ? 1 : 0);
74
75         /* first virtual item */
76         vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
77         memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
78         vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);
79
80         /* first item in the node */
81         ih = item_head(Sh, 0);
82
83         /* define the mergeability for 0-th item (if it is not being deleted) */
84         if (op_is_left_mergeable(&ih->ih_key, Sh->b_size)
85             && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
86                 vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;
87
88         /*
89          * go through all items that remain in the virtual
90          * node (except for the new (inserted) one)
91          */
92         for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
93                 int j;
94                 struct virtual_item *vi = vn->vn_vi + new_num;
95                 int is_affected =
96                     ((new_num != vn->vn_affected_item_num) ? 0 : 1);
97
98                 if (is_affected && vn->vn_mode == M_INSERT)
99                         continue;
100
101                 /* get item number in source node */
102                 j = old_item_num(new_num, vn->vn_affected_item_num,
103                                  vn->vn_mode);
104
105                 vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
106                 vi->vi_ih = ih + j;
107                 vi->vi_item = ih_item_body(Sh, ih + j);
108                 vi->vi_uarea = vn->vn_free_ptr;
109
110                 /*
111                  * FIXME: there is no check that item operation did not
112                  * consume too much memory
113                  */
114                 vn->vn_free_ptr +=
115                     op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
116                 if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
117                         reiserfs_panic(tb->tb_sb, "vs-8030",
118                                        "virtual node space consumed");
119
120                 if (!is_affected)
121                         /* this is not being changed */
122                         continue;
123
124                 if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
125                         vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
126                         /* pointer to data which is going to be pasted */
127                         vi->vi_new_data = vn->vn_data;
128                 }
129         }
130
131         /* virtual inserted item is not defined yet */
132         if (vn->vn_mode == M_INSERT) {
133                 struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;
134
135                 RFALSE(vn->vn_ins_ih == NULL,
136                        "vs-8040: item header of inserted item is not specified");
137                 vi->vi_item_len = tb->insert_size[0];
138                 vi->vi_ih = vn->vn_ins_ih;
139                 vi->vi_item = vn->vn_data;
140                 vi->vi_uarea = vn->vn_free_ptr;
141
142                 op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
143                              tb->insert_size[0]);
144         }
145
146         /*
147          * set right merge flag we take right delimiting key and
148          * check whether it is a mergeable item
149          */
150         if (tb->CFR[0]) {
151                 struct reiserfs_key *key;
152
153                 key = internal_key(tb->CFR[0], tb->rkey[0]);
154                 if (op_is_left_mergeable(key, Sh->b_size)
155                     && (vn->vn_mode != M_DELETE
156                         || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
157                         vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
158                             VI_TYPE_RIGHT_MERGEABLE;
159
160 #ifdef CONFIG_REISERFS_CHECK
161                 if (op_is_left_mergeable(key, Sh->b_size) &&
162                     !(vn->vn_mode != M_DELETE
163                       || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
164                         /*
165                          * we delete last item and it could be merged
166                          * with right neighbor's first item
167                          */
168                         if (!
169                             (B_NR_ITEMS(Sh) == 1
170                              && is_direntry_le_ih(item_head(Sh, 0))
171                              && ih_entry_count(item_head(Sh, 0)) == 1)) {
172                                 /*
173                                  * node contains more than 1 item, or item
174                                  * is not directory item, or this item
175                                  * contains more than 1 entry
176                                  */
177                                 print_block(Sh, 0, -1, -1);
178                                 reiserfs_panic(tb->tb_sb, "vs-8045",
179                                                "rdkey %k, affected item==%d "
180                                                "(mode==%c) Must be %c",
181                                                key, vn->vn_affected_item_num,
182                                                vn->vn_mode, M_DELETE);
183                         }
184                 }
185 #endif
186
187         }
188 }
189
190 /*
191  * Using virtual node check, how many items can be
192  * shifted to left neighbor
193  */
194 static void check_left(struct tree_balance *tb, int h, int cur_free)
195 {
196         int i;
197         struct virtual_node *vn = tb->tb_vn;
198         struct virtual_item *vi;
199         int d_size, ih_size;
200
201         RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);
202
203         /* internal level */
204         if (h > 0) {
205                 tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
206                 return;
207         }
208
209         /* leaf level */
210
211         if (!cur_free || !vn->vn_nr_item) {
212                 /* no free space or nothing to move */
213                 tb->lnum[h] = 0;
214                 tb->lbytes = -1;
215                 return;
216         }
217
218         RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
219                "vs-8055: parent does not exist or invalid");
220
221         vi = vn->vn_vi;
222         if ((unsigned int)cur_free >=
223             (vn->vn_size -
224              ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
225                 /* all contents of S[0] fits into L[0] */
226
227                 RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
228                        "vs-8055: invalid mode or balance condition failed");
229
230                 tb->lnum[0] = vn->vn_nr_item;
231                 tb->lbytes = -1;
232                 return;
233         }
234
235         d_size = 0, ih_size = IH_SIZE;
236
237         /* first item may be merge with last item in left neighbor */
238         if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
239                 d_size = -((int)IH_SIZE), ih_size = 0;
240
241         tb->lnum[0] = 0;
242         for (i = 0; i < vn->vn_nr_item;
243              i++, ih_size = IH_SIZE, d_size = 0, vi++) {
244                 d_size += vi->vi_item_len;
245                 if (cur_free >= d_size) {
246                         /* the item can be shifted entirely */
247                         cur_free -= d_size;
248                         tb->lnum[0]++;
249                         continue;
250                 }
251
252                 /* the item cannot be shifted entirely, try to split it */
253                 /*
254                  * check whether L[0] can hold ih and at least one byte
255                  * of the item body
256                  */
257
258                 /* cannot shift even a part of the current item */
259                 if (cur_free <= ih_size) {
260                         tb->lbytes = -1;
261                         return;
262                 }
263                 cur_free -= ih_size;
264
265                 tb->lbytes = op_check_left(vi, cur_free, 0, 0);
266                 if (tb->lbytes != -1)
267                         /* count partially shifted item */
268                         tb->lnum[0]++;
269
270                 break;
271         }
272
273         return;
274 }
275
276 /*
277  * Using virtual node check, how many items can be
278  * shifted to right neighbor
279  */
280 static void check_right(struct tree_balance *tb, int h, int cur_free)
281 {
282         int i;
283         struct virtual_node *vn = tb->tb_vn;
284         struct virtual_item *vi;
285         int d_size, ih_size;
286
287         RFALSE(cur_free < 0, "vs-8070: cur_free < 0");
288
289         /* internal level */
290         if (h > 0) {
291                 tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
292                 return;
293         }
294
295         /* leaf level */
296
297         if (!cur_free || !vn->vn_nr_item) {
298                 /* no free space  */
299                 tb->rnum[h] = 0;
300                 tb->rbytes = -1;
301                 return;
302         }
303
304         RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
305                "vs-8075: parent does not exist or invalid");
306
307         vi = vn->vn_vi + vn->vn_nr_item - 1;
308         if ((unsigned int)cur_free >=
309             (vn->vn_size -
310              ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
311                 /* all contents of S[0] fits into R[0] */
312
313                 RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
314                        "vs-8080: invalid mode or balance condition failed");
315
316                 tb->rnum[h] = vn->vn_nr_item;
317                 tb->rbytes = -1;
318                 return;
319         }
320
321         d_size = 0, ih_size = IH_SIZE;
322
323         /* last item may be merge with first item in right neighbor */
324         if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
325                 d_size = -(int)IH_SIZE, ih_size = 0;
326
327         tb->rnum[0] = 0;
328         for (i = vn->vn_nr_item - 1; i >= 0;
329              i--, d_size = 0, ih_size = IH_SIZE, vi--) {
330                 d_size += vi->vi_item_len;
331                 if (cur_free >= d_size) {
332                         /* the item can be shifted entirely */
333                         cur_free -= d_size;
334                         tb->rnum[0]++;
335                         continue;
336                 }
337
338                 /*
339                  * check whether R[0] can hold ih and at least one
340                  * byte of the item body
341                  */
342
343                 /* cannot shift even a part of the current item */
344                 if (cur_free <= ih_size) {
345                         tb->rbytes = -1;
346                         return;
347                 }
348
349                 /*
350                  * R[0] can hold the header of the item and at least
351                  * one byte of its body
352                  */
353                 cur_free -= ih_size;    /* cur_free is still > 0 */
354
355                 tb->rbytes = op_check_right(vi, cur_free);
356                 if (tb->rbytes != -1)
357                         /* count partially shifted item */
358                         tb->rnum[0]++;
359
360                 break;
361         }
362
363         return;
364 }
365
366 /*
367  * from - number of items, which are shifted to left neighbor entirely
368  * to - number of item, which are shifted to right neighbor entirely
369  * from_bytes - number of bytes of boundary item (or directory entries)
370  *              which are shifted to left neighbor
371  * to_bytes - number of bytes of boundary item (or directory entries)
372  *            which are shifted to right neighbor
373  */
374 static int get_num_ver(int mode, struct tree_balance *tb, int h,
375                        int from, int from_bytes,
376                        int to, int to_bytes, short *snum012, int flow)
377 {
378         int i;
379         int cur_free;
380         int units;
381         struct virtual_node *vn = tb->tb_vn;
382         int total_node_size, max_node_size, current_item_size;
383         int needed_nodes;
384
385         /* position of item we start filling node from */
386         int start_item;
387
388         /* position of item we finish filling node by */
389         int end_item;
390
391         /*
392          * number of first bytes (entries for directory) of start_item-th item
393          * we do not include into node that is being filled
394          */
395         int start_bytes;
396
397         /*
398          * number of last bytes (entries for directory) of end_item-th item
399          * we do node include into node that is being filled
400          */
401         int end_bytes;
402
403         /*
404          * these are positions in virtual item of items, that are split
405          * between S[0] and S1new and S1new and S2new
406          */
407         int split_item_positions[2];
408
409         split_item_positions[0] = -1;
410         split_item_positions[1] = -1;
411
412         /*
413          * We only create additional nodes if we are in insert or paste mode
414          * or we are in replace mode at the internal level. If h is 0 and
415          * the mode is M_REPLACE then in fix_nodes we change the mode to
416          * paste or insert before we get here in the code.
417          */
418         RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
419                "vs-8100: insert_size < 0 in overflow");
420
421         max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));
422
423         /*
424          * snum012 [0-2] - number of items, that lay
425          * to S[0], first new node and second new node
426          */
427         snum012[3] = -1;        /* s1bytes */
428         snum012[4] = -1;        /* s2bytes */
429
430         /* internal level */
431         if (h > 0) {
432                 i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
433                 if (i == max_node_size)
434                         return 1;
435                 return (i / max_node_size + 1);
436         }
437
438         /* leaf level */
439         needed_nodes = 1;
440         total_node_size = 0;
441         cur_free = max_node_size;
442
443         /* start from 'from'-th item */
444         start_item = from;
445         /* skip its first 'start_bytes' units */
446         start_bytes = ((from_bytes != -1) ? from_bytes : 0);
447
448         /* last included item is the 'end_item'-th one */
449         end_item = vn->vn_nr_item - to - 1;
450         /* do not count last 'end_bytes' units of 'end_item'-th item */
451         end_bytes = (to_bytes != -1) ? to_bytes : 0;
452
453         /*
454          * go through all item beginning from the start_item-th item
455          * and ending by the end_item-th item. Do not count first
456          * 'start_bytes' units of 'start_item'-th item and last
457          * 'end_bytes' of 'end_item'-th item
458          */
459         for (i = start_item; i <= end_item; i++) {
460                 struct virtual_item *vi = vn->vn_vi + i;
461                 int skip_from_end = ((i == end_item) ? end_bytes : 0);
462
463                 RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");
464
465                 /* get size of current item */
466                 current_item_size = vi->vi_item_len;
467
468                 /*
469                  * do not take in calculation head part (from_bytes)
470                  * of from-th item
471                  */
472                 current_item_size -=
473                     op_part_size(vi, 0 /*from start */ , start_bytes);
474
475                 /* do not take in calculation tail part of last item */
476                 current_item_size -=
477                     op_part_size(vi, 1 /*from end */ , skip_from_end);
478
479                 /* if item fits into current node entierly */
480                 if (total_node_size + current_item_size <= max_node_size) {
481                         snum012[needed_nodes - 1]++;
482                         total_node_size += current_item_size;
483                         start_bytes = 0;
484                         continue;
485                 }
486
487                 /*
488                  * virtual item length is longer, than max size of item in
489                  * a node. It is impossible for direct item
490                  */
491                 if (current_item_size > max_node_size) {
492                         RFALSE(is_direct_le_ih(vi->vi_ih),
493                                "vs-8110: "
494                                "direct item length is %d. It can not be longer than %d",
495                                current_item_size, max_node_size);
496                         /* we will try to split it */
497                         flow = 1;
498                 }
499
500                 /* as we do not split items, take new node and continue */
501                 if (!flow) {
502                         needed_nodes++;
503                         i--;
504                         total_node_size = 0;
505                         continue;
506                 }
507
508                 /*
509                  * calculate number of item units which fit into node being
510                  * filled
511                  */
512                 {
513                         int free_space;
514
515                         free_space = max_node_size - total_node_size - IH_SIZE;
516                         units =
517                             op_check_left(vi, free_space, start_bytes,
518                                           skip_from_end);
519                         /*
520                          * nothing fits into current node, take new
521                          * node and continue
522                          */
523                         if (units == -1) {
524                                 needed_nodes++, i--, total_node_size = 0;
525                                 continue;
526                         }
527                 }
528
529                 /* something fits into the current node */
530                 start_bytes += units;
531                 snum012[needed_nodes - 1 + 3] = units;
532
533                 if (needed_nodes > 2)
534                         reiserfs_warning(tb->tb_sb, "vs-8111",
535                                          "split_item_position is out of range");
536                 snum012[needed_nodes - 1]++;
537                 split_item_positions[needed_nodes - 1] = i;
538                 needed_nodes++;
539                 /* continue from the same item with start_bytes != -1 */
540                 start_item = i;
541                 i--;
542                 total_node_size = 0;
543         }
544
545         /*
546          * sum012[4] (if it is not -1) contains number of units of which
547          * are to be in S1new, snum012[3] - to be in S0. They are supposed
548          * to be S1bytes and S2bytes correspondingly, so recalculate
549          */
550         if (snum012[4] > 0) {
551                 int split_item_num;
552                 int bytes_to_r, bytes_to_l;
553                 int bytes_to_S1new;
554
555                 split_item_num = split_item_positions[1];
556                 bytes_to_l =
557                     ((from == split_item_num
558                       && from_bytes != -1) ? from_bytes : 0);
559                 bytes_to_r =
560                     ((end_item == split_item_num
561                       && end_bytes != -1) ? end_bytes : 0);
562                 bytes_to_S1new =
563                     ((split_item_positions[0] ==
564                       split_item_positions[1]) ? snum012[3] : 0);
565
566                 /* s2bytes */
567                 snum012[4] =
568                     op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
569                     bytes_to_r - bytes_to_l - bytes_to_S1new;
570
571                 if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
572                     vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
573                         reiserfs_warning(tb->tb_sb, "vs-8115",
574                                          "not directory or indirect item");
575         }
576
577         /* now we know S2bytes, calculate S1bytes */
578         if (snum012[3] > 0) {
579                 int split_item_num;
580                 int bytes_to_r, bytes_to_l;
581                 int bytes_to_S2new;
582
583                 split_item_num = split_item_positions[0];
584                 bytes_to_l =
585                     ((from == split_item_num
586                       && from_bytes != -1) ? from_bytes : 0);
587                 bytes_to_r =
588                     ((end_item == split_item_num
589                       && end_bytes != -1) ? end_bytes : 0);
590                 bytes_to_S2new =
591                     ((split_item_positions[0] == split_item_positions[1]
592                       && snum012[4] != -1) ? snum012[4] : 0);
593
594                 /* s1bytes */
595                 snum012[3] =
596                     op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
597                     bytes_to_r - bytes_to_l - bytes_to_S2new;
598         }
599
600         return needed_nodes;
601 }
602
603
604 /*
605  * Set parameters for balancing.
606  * Performs write of results of analysis of balancing into structure tb,
607  * where it will later be used by the functions that actually do the balancing.
608  * Parameters:
609  *      tb      tree_balance structure;
610  *      h       current level of the node;
611  *      lnum    number of items from S[h] that must be shifted to L[h];
612  *      rnum    number of items from S[h] that must be shifted to R[h];
613  *      blk_num number of blocks that S[h] will be splitted into;
614  *      s012    number of items that fall into splitted nodes.
615  *      lbytes  number of bytes which flow to the left neighbor from the
616  *              item that is not not shifted entirely
617  *      rbytes  number of bytes which flow to the right neighbor from the
618  *              item that is not not shifted entirely
619  *      s1bytes number of bytes which flow to the first  new node when
620  *              S[0] splits (this number is contained in s012 array)
621  */
622
623 static void set_parameters(struct tree_balance *tb, int h, int lnum,
624                            int rnum, int blk_num, short *s012, int lb, int rb)
625 {
626
627         tb->lnum[h] = lnum;
628         tb->rnum[h] = rnum;
629         tb->blknum[h] = blk_num;
630
631         /* only for leaf level */
632         if (h == 0) {
633                 if (s012 != NULL) {
634                         tb->s0num = *s012++;
635                         tb->snum[0] = *s012++;
636                         tb->snum[1] = *s012++;
637                         tb->sbytes[0] = *s012++;
638                         tb->sbytes[1] = *s012;
639                 }
640                 tb->lbytes = lb;
641                 tb->rbytes = rb;
642         }
643         PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
644         PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);
645
646         PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
647         PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
648 }
649
650 /*
651  * check if node disappears if we shift tb->lnum[0] items to left
652  * neighbor and tb->rnum[0] to the right one.
653  */
654 static int is_leaf_removable(struct tree_balance *tb)
655 {
656         struct virtual_node *vn = tb->tb_vn;
657         int to_left, to_right;
658         int size;
659         int remain_items;
660
661         /*
662          * number of items that will be shifted to left (right) neighbor
663          * entirely
664          */
665         to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
666         to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
667         remain_items = vn->vn_nr_item;
668
669         /* how many items remain in S[0] after shiftings to neighbors */
670         remain_items -= (to_left + to_right);
671
672         /* all content of node can be shifted to neighbors */
673         if (remain_items < 1) {
674                 set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
675                                NULL, -1, -1);
676                 return 1;
677         }
678
679         /* S[0] is not removable */
680         if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
681                 return 0;
682
683         /* check whether we can divide 1 remaining item between neighbors */
684
685         /* get size of remaining item (in item units) */
686         size = op_unit_num(&vn->vn_vi[to_left]);
687
688         if (tb->lbytes + tb->rbytes >= size) {
689                 set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
690                                tb->lbytes, -1);
691                 return 1;
692         }
693
694         return 0;
695 }
696
697 /* check whether L, S, R can be joined in one node */
698 static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
699 {
700         struct virtual_node *vn = tb->tb_vn;
701         int ih_size;
702         struct buffer_head *S0;
703
704         S0 = PATH_H_PBUFFER(tb->tb_path, 0);
705
706         ih_size = 0;
707         if (vn->vn_nr_item) {
708                 if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
709                         ih_size += IH_SIZE;
710
711                 if (vn->vn_vi[vn->vn_nr_item - 1].
712                     vi_type & VI_TYPE_RIGHT_MERGEABLE)
713                         ih_size += IH_SIZE;
714         } else {
715                 /* there was only one item and it will be deleted */
716                 struct item_head *ih;
717
718                 RFALSE(B_NR_ITEMS(S0) != 1,
719                        "vs-8125: item number must be 1: it is %d",
720                        B_NR_ITEMS(S0));
721
722                 ih = item_head(S0, 0);
723                 if (tb->CFR[0]
724                     && !comp_short_le_keys(&ih->ih_key,
725                                            internal_key(tb->CFR[0],
726                                                           tb->rkey[0])))
727                         /*
728                          * Directory must be in correct state here: that is
729                          * somewhere at the left side should exist first
730                          * directory item. But the item being deleted can
731                          * not be that first one because its right neighbor
732                          * is item of the same directory. (But first item
733                          * always gets deleted in last turn). So, neighbors
734                          * of deleted item can be merged, so we can save
735                          * ih_size
736                          */
737                         if (is_direntry_le_ih(ih)) {
738                                 ih_size = IH_SIZE;
739
740                                 /*
741                                  * we might check that left neighbor exists
742                                  * and is of the same directory
743                                  */
744                                 RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
745                                        "vs-8130: first directory item can not be removed until directory is not empty");
746                         }
747
748         }
749
750         if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
751                 set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
752                 PROC_INFO_INC(tb->tb_sb, leaves_removable);
753                 return 1;
754         }
755         return 0;
756
757 }
758
759 /* when we do not split item, lnum and rnum are numbers of entire items */
760 #define SET_PAR_SHIFT_LEFT \
761 if (h)\
762 {\
763    int to_l;\
764    \
765    to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
766               (MAX_NR_KEY(Sh) + 1 - lpar);\
767               \
768               set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
769 }\
770 else \
771 {\
772    if (lset==LEFT_SHIFT_FLOW)\
773      set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
774                      tb->lbytes, -1);\
775    else\
776      set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
777                      -1, -1);\
778 }
779
780 #define SET_PAR_SHIFT_RIGHT \
781 if (h)\
782 {\
783    int to_r;\
784    \
785    to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
786    \
787    set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
788 }\
789 else \
790 {\
791    if (rset==RIGHT_SHIFT_FLOW)\
792      set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
793                   -1, tb->rbytes);\
794    else\
795      set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
796                   -1, -1);\
797 }
798
799 static void free_buffers_in_tb(struct tree_balance *tb)
800 {
801         int i;
802
803         pathrelse(tb->tb_path);
804
805         for (i = 0; i < MAX_HEIGHT; i++) {
806                 brelse(tb->L[i]);
807                 brelse(tb->R[i]);
808                 brelse(tb->FL[i]);
809                 brelse(tb->FR[i]);
810                 brelse(tb->CFL[i]);
811                 brelse(tb->CFR[i]);
812
813                 tb->L[i] = NULL;
814                 tb->R[i] = NULL;
815                 tb->FL[i] = NULL;
816                 tb->FR[i] = NULL;
817                 tb->CFL[i] = NULL;
818                 tb->CFR[i] = NULL;
819         }
820 }
821
822 /*
823  * Get new buffers for storing new nodes that are created while balancing.
824  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
825  *              CARRY_ON - schedule didn't occur while the function worked;
826  *              NO_DISK_SPACE - no disk space.
827  */
828 /* The function is NOT SCHEDULE-SAFE! */
829 static int get_empty_nodes(struct tree_balance *tb, int h)
830 {
831         struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h);
832         b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
833         int counter, number_of_freeblk;
834         int  amount_needed;     /* number of needed empty blocks */
835         int  retval = CARRY_ON;
836         struct super_block *sb = tb->tb_sb;
837
838         /*
839          * number_of_freeblk is the number of empty blocks which have been
840          * acquired for use by the balancing algorithm minus the number of
841          * empty blocks used in the previous levels of the analysis,
842          * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule
843          * occurs after empty blocks are acquired, and the balancing analysis
844          * is then restarted, amount_needed is the number needed by this
845          * level (h) of the balancing analysis.
846          *
847          * Note that for systems with many processes writing, it would be
848          * more layout optimal to calculate the total number needed by all
849          * levels and then to run reiserfs_new_blocks to get all of them at
850          * once.
851          */
852
853         /*
854          * Initiate number_of_freeblk to the amount acquired prior to the
855          * restart of the analysis or 0 if not restarted, then subtract the
856          * amount needed by all of the levels of the tree below h.
857          */
858         /* blknum includes S[h], so we subtract 1 in this calculation */
859         for (counter = 0, number_of_freeblk = tb->cur_blknum;
860              counter < h; counter++)
861                 number_of_freeblk -=
862                     (tb->blknum[counter]) ? (tb->blknum[counter] -
863                                                    1) : 0;
864
865         /* Allocate missing empty blocks. */
866         /* if Sh == 0  then we are getting a new root */
867         amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
868         /*
869          * Amount_needed = the amount that we need more than the
870          * amount that we have.
871          */
872         if (amount_needed > number_of_freeblk)
873                 amount_needed -= number_of_freeblk;
874         else    /* If we have enough already then there is nothing to do. */
875                 return CARRY_ON;
876
877         /*
878          * No need to check quota - is not allocated for blocks used
879          * for formatted nodes
880          */
881         if (reiserfs_new_form_blocknrs(tb, blocknrs,
882                                        amount_needed) == NO_DISK_SPACE)
883                 return NO_DISK_SPACE;
884
885         /* for each blocknumber we just got, get a buffer and stick it on FEB */
886         for (blocknr = blocknrs, counter = 0;
887              counter < amount_needed; blocknr++, counter++) {
888
889                 RFALSE(!*blocknr,
890                        "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");
891
892                 new_bh = sb_getblk(sb, *blocknr);
893                 RFALSE(buffer_dirty(new_bh) ||
894                        buffer_journaled(new_bh) ||
895                        buffer_journal_dirty(new_bh),
896                        "PAP-8140: journaled or dirty buffer %b for the new block",
897                        new_bh);
898
899                 /* Put empty buffers into the array. */
900                 RFALSE(tb->FEB[tb->cur_blknum],
901                        "PAP-8141: busy slot for new buffer");
902
903                 set_buffer_journal_new(new_bh);
904                 tb->FEB[tb->cur_blknum++] = new_bh;
905         }
906
907         if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
908                 retval = REPEAT_SEARCH;
909
910         return retval;
911 }
912
913 /*
914  * Get free space of the left neighbor, which is stored in the parent
915  * node of the left neighbor.
916  */
917 static int get_lfree(struct tree_balance *tb, int h)
918 {
919         struct buffer_head *l, *f;
920         int order;
921
922         if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
923             (l = tb->FL[h]) == NULL)
924                 return 0;
925
926         if (f == l)
927                 order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
928         else {
929                 order = B_NR_ITEMS(l);
930                 f = l;
931         }
932
933         return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
934 }
935
936 /*
937  * Get free space of the right neighbor,
938  * which is stored in the parent node of the right neighbor.
939  */
940 static int get_rfree(struct tree_balance *tb, int h)
941 {
942         struct buffer_head *r, *f;
943         int order;
944
945         if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
946             (r = tb->FR[h]) == NULL)
947                 return 0;
948
949         if (f == r)
950                 order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
951         else {
952                 order = 0;
953                 f = r;
954         }
955
956         return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
957
958 }
959
960 /* Check whether left neighbor is in memory. */
961 static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
962 {
963         struct buffer_head *father, *left;
964         struct super_block *sb = tb->tb_sb;
965         b_blocknr_t left_neighbor_blocknr;
966         int left_neighbor_position;
967
968         /* Father of the left neighbor does not exist. */
969         if (!tb->FL[h])
970                 return 0;
971
972         /* Calculate father of the node to be balanced. */
973         father = PATH_H_PBUFFER(tb->tb_path, h + 1);
974
975         RFALSE(!father ||
976                !B_IS_IN_TREE(father) ||
977                !B_IS_IN_TREE(tb->FL[h]) ||
978                !buffer_uptodate(father) ||
979                !buffer_uptodate(tb->FL[h]),
980                "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
981                father, tb->FL[h]);
982
983         /*
984          * Get position of the pointer to the left neighbor
985          * into the left father.
986          */
987         left_neighbor_position = (father == tb->FL[h]) ?
988             tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
989         /* Get left neighbor block number. */
990         left_neighbor_blocknr =
991             B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
992         /* Look for the left neighbor in the cache. */
993         if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {
994
995                 RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
996                        "vs-8170: left neighbor (%b %z) is not in the tree",
997                        left, left);
998                 put_bh(left);
999                 return 1;
1000         }
1001
1002         return 0;
1003 }
1004
1005 #define LEFT_PARENTS  'l'
1006 #define RIGHT_PARENTS 'r'
1007
1008 static void decrement_key(struct cpu_key *key)
1009 {
1010         /* call item specific function for this key */
1011         item_ops[cpu_key_k_type(key)]->decrement_key(key);
1012 }
1013
1014 /*
1015  * Calculate far left/right parent of the left/right neighbor of the
1016  * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor
1017  * of the parent F[h].
1018  * Calculate left/right common parent of the current node and L[h]/R[h].
1019  * Calculate left/right delimiting key position.
1020  * Returns:     PATH_INCORRECT    - path in the tree is not correct
1021  *              SCHEDULE_OCCURRED - schedule occurred while the function worked
1022  *              CARRY_ON          - schedule didn't occur while the function
1023  *                                  worked
1024  */
1025 static int get_far_parent(struct tree_balance *tb,
1026                           int h,
1027                           struct buffer_head **pfather,
1028                           struct buffer_head **pcom_father, char c_lr_par)
1029 {
1030         struct buffer_head *parent;
1031         INITIALIZE_PATH(s_path_to_neighbor_father);
1032         struct treepath *path = tb->tb_path;
1033         struct cpu_key s_lr_father_key;
1034         int counter,
1035             position = INT_MAX,
1036             first_last_position = 0,
1037             path_offset = PATH_H_PATH_OFFSET(path, h);
1038
1039         /*
1040          * Starting from F[h] go upwards in the tree, and look for the common
1041          * ancestor of F[h], and its neighbor l/r, that should be obtained.
1042          */
1043
1044         counter = path_offset;
1045
1046         RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
1047                "PAP-8180: invalid path length");
1048
1049         for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
1050                 /*
1051                  * Check whether parent of the current buffer in the path
1052                  * is really parent in the tree.
1053                  */
1054                 if (!B_IS_IN_TREE
1055                     (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
1056                         return REPEAT_SEARCH;
1057
1058                 /* Check whether position in the parent is correct. */
1059                 if ((position =
1060                      PATH_OFFSET_POSITION(path,
1061                                           counter - 1)) >
1062                     B_NR_ITEMS(parent))
1063                         return REPEAT_SEARCH;
1064
1065                 /*
1066                  * Check whether parent at the path really points
1067                  * to the child.
1068                  */
1069                 if (B_N_CHILD_NUM(parent, position) !=
1070                     PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
1071                         return REPEAT_SEARCH;
1072
1073                 /*
1074                  * Return delimiting key if position in the parent is not
1075                  * equal to first/last one.
1076                  */
1077                 if (c_lr_par == RIGHT_PARENTS)
1078                         first_last_position = B_NR_ITEMS(parent);
1079                 if (position != first_last_position) {
1080                         *pcom_father = parent;
1081                         get_bh(*pcom_father);
1082                         /*(*pcom_father = parent)->b_count++; */
1083                         break;
1084                 }
1085         }
1086
1087         /* if we are in the root of the tree, then there is no common father */
1088         if (counter == FIRST_PATH_ELEMENT_OFFSET) {
1089                 /*
1090                  * Check whether first buffer in the path is the
1091                  * root of the tree.
1092                  */
1093                 if (PATH_OFFSET_PBUFFER
1094                     (tb->tb_path,
1095                      FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
1096                     SB_ROOT_BLOCK(tb->tb_sb)) {
1097                         *pfather = *pcom_father = NULL;
1098                         return CARRY_ON;
1099                 }
1100                 return REPEAT_SEARCH;
1101         }
1102
1103         RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
1104                "PAP-8185: (%b %z) level too small",
1105                *pcom_father, *pcom_father);
1106
1107         /* Check whether the common parent is locked. */
1108
1109         if (buffer_locked(*pcom_father)) {
1110
1111                 /* Release the write lock while the buffer is busy */
1112                 int depth = reiserfs_write_unlock_nested(tb->tb_sb);
1113                 __wait_on_buffer(*pcom_father);
1114                 reiserfs_write_lock_nested(tb->tb_sb, depth);
1115                 if (FILESYSTEM_CHANGED_TB(tb)) {
1116                         brelse(*pcom_father);
1117                         return REPEAT_SEARCH;
1118                 }
1119         }
1120
1121         /*
1122          * So, we got common parent of the current node and its
1123          * left/right neighbor.  Now we are getting the parent of the
1124          * left/right neighbor.
1125          */
1126
1127         /* Form key to get parent of the left/right neighbor. */
1128         le_key2cpu_key(&s_lr_father_key,
1129                        internal_key(*pcom_father,
1130                                       (c_lr_par ==
1131                                        LEFT_PARENTS) ? (tb->lkey[h - 1] =
1132                                                         position -
1133                                                         1) : (tb->rkey[h -
1134                                                                            1] =
1135                                                               position)));
1136
1137         if (c_lr_par == LEFT_PARENTS)
1138                 decrement_key(&s_lr_father_key);
1139
1140         if (search_by_key
1141             (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
1142              h + 1) == IO_ERROR)
1143                 /* path is released */
1144                 return IO_ERROR;
1145
1146         if (FILESYSTEM_CHANGED_TB(tb)) {
1147                 pathrelse(&s_path_to_neighbor_father);
1148                 brelse(*pcom_father);
1149                 return REPEAT_SEARCH;
1150         }
1151
1152         *pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);
1153
1154         RFALSE(B_LEVEL(*pfather) != h + 1,
1155                "PAP-8190: (%b %z) level too small", *pfather, *pfather);
1156         RFALSE(s_path_to_neighbor_father.path_length <
1157                FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");
1158
1159         s_path_to_neighbor_father.path_length--;
1160         pathrelse(&s_path_to_neighbor_father);
1161         return CARRY_ON;
1162 }
1163
1164 /*
1165  * Get parents of neighbors of node in the path(S[path_offset]) and
1166  * common parents of S[path_offset] and L[path_offset]/R[path_offset]:
1167  * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset],
1168  * CFR[path_offset].
1169  * Calculate numbers of left and right delimiting keys position:
1170  * lkey[path_offset], rkey[path_offset].
1171  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked
1172  *              CARRY_ON - schedule didn't occur while the function worked
1173  */
1174 static int get_parents(struct tree_balance *tb, int h)
1175 {
1176         struct treepath *path = tb->tb_path;
1177         int position,
1178             ret,
1179             path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
1180         struct buffer_head *curf, *curcf;
1181
1182         /* Current node is the root of the tree or will be root of the tree */
1183         if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
1184                 /*
1185                  * The root can not have parents.
1186                  * Release nodes which previously were obtained as
1187                  * parents of the current node neighbors.
1188                  */
1189                 brelse(tb->FL[h]);
1190                 brelse(tb->CFL[h]);
1191                 brelse(tb->FR[h]);
1192                 brelse(tb->CFR[h]);
1193                 tb->FL[h]  = NULL;
1194                 tb->CFL[h] = NULL;
1195                 tb->FR[h]  = NULL;
1196                 tb->CFR[h] = NULL;
1197                 return CARRY_ON;
1198         }
1199
1200         /* Get parent FL[path_offset] of L[path_offset]. */
1201         position = PATH_OFFSET_POSITION(path, path_offset - 1);
1202         if (position) {
1203                 /* Current node is not the first child of its parent. */
1204                 curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1205                 curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1206                 get_bh(curf);
1207                 get_bh(curf);
1208                 tb->lkey[h] = position - 1;
1209         } else {
1210                 /*
1211                  * Calculate current parent of L[path_offset], which is the
1212                  * left neighbor of the current node.  Calculate current
1213                  * common parent of L[path_offset] and the current node.
1214                  * Note that CFL[path_offset] not equal FL[path_offset] and
1215                  * CFL[path_offset] not equal F[path_offset].
1216                  * Calculate lkey[path_offset].
1217                  */
1218                 if ((ret = get_far_parent(tb, h + 1, &curf,
1219                                                   &curcf,
1220                                                   LEFT_PARENTS)) != CARRY_ON)
1221                         return ret;
1222         }
1223
1224         brelse(tb->FL[h]);
1225         tb->FL[h] = curf;       /* New initialization of FL[h]. */
1226         brelse(tb->CFL[h]);
1227         tb->CFL[h] = curcf;     /* New initialization of CFL[h]. */
1228
1229         RFALSE((curf && !B_IS_IN_TREE(curf)) ||
1230                (curcf && !B_IS_IN_TREE(curcf)),
1231                "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);
1232
1233         /* Get parent FR[h] of R[h]. */
1234
1235         /* Current node is the last child of F[h]. FR[h] != F[h]. */
1236         if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
1237                 /*
1238                  * Calculate current parent of R[h], which is the right
1239                  * neighbor of F[h].  Calculate current common parent of
1240                  * R[h] and current node. Note that CFR[h] not equal
1241                  * FR[path_offset] and CFR[h] not equal F[h].
1242                  */
1243                 if ((ret =
1244                      get_far_parent(tb, h + 1, &curf, &curcf,
1245                                     RIGHT_PARENTS)) != CARRY_ON)
1246                         return ret;
1247         } else {
1248                 /* Current node is not the last child of its parent F[h]. */
1249                 curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1250                 curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
1251                 get_bh(curf);
1252                 get_bh(curf);
1253                 tb->rkey[h] = position;
1254         }
1255
1256         brelse(tb->FR[h]);
1257         /* New initialization of FR[path_offset]. */
1258         tb->FR[h] = curf;
1259
1260         brelse(tb->CFR[h]);
1261         /* New initialization of CFR[path_offset]. */
1262         tb->CFR[h] = curcf;
1263
1264         RFALSE((curf && !B_IS_IN_TREE(curf)) ||
1265                (curcf && !B_IS_IN_TREE(curcf)),
1266                "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);
1267
1268         return CARRY_ON;
1269 }
1270
1271 /*
1272  * it is possible to remove node as result of shiftings to
1273  * neighbors even when we insert or paste item.
1274  */
1275 static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
1276                                       struct tree_balance *tb, int h)
1277 {
1278         struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
1279         int levbytes = tb->insert_size[h];
1280         struct item_head *ih;
1281         struct reiserfs_key *r_key = NULL;
1282
1283         ih = item_head(Sh, 0);
1284         if (tb->CFR[h])
1285                 r_key = internal_key(tb->CFR[h], tb->rkey[h]);
1286
1287         if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
1288             /* shifting may merge items which might save space */
1289             -
1290             ((!h
1291               && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0)
1292             -
1293             ((!h && r_key
1294               && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
1295             + ((h) ? KEY_SIZE : 0)) {
1296                 /* node can not be removed */
1297                 if (sfree >= levbytes) {
1298                         /* new item fits into node S[h] without any shifting */
1299                         if (!h)
1300                                 tb->s0num =
1301                                     B_NR_ITEMS(Sh) +
1302                                     ((mode == M_INSERT) ? 1 : 0);
1303                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1304                         return NO_BALANCING_NEEDED;
1305                 }
1306         }
1307         PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
1308         return !NO_BALANCING_NEEDED;
1309 }
1310
1311 /*
1312  * Check whether current node S[h] is balanced when increasing its size by
1313  * Inserting or Pasting.
1314  * Calculate parameters for balancing for current level h.
1315  * Parameters:
1316  *      tb      tree_balance structure;
1317  *      h       current level of the node;
1318  *      inum    item number in S[h];
1319  *      mode    i - insert, p - paste;
1320  * Returns:     1 - schedule occurred;
1321  *              0 - balancing for higher levels needed;
1322  *             -1 - no balancing for higher levels needed;
1323  *             -2 - no disk space.
1324  */
1325 /* ip means Inserting or Pasting */
1326 static int ip_check_balance(struct tree_balance *tb, int h)
1327 {
1328         struct virtual_node *vn = tb->tb_vn;
1329         /*
1330          * Number of bytes that must be inserted into (value is negative
1331          * if bytes are deleted) buffer which contains node being balanced.
1332          * The mnemonic is that the attempted change in node space used
1333          * level is levbytes bytes.
1334          */
1335         int levbytes;
1336         int ret;
1337
1338         int lfree, sfree, rfree /* free space in L, S and R */ ;
1339
1340         /*
1341          * nver is short for number of vertixes, and lnver is the number if
1342          * we shift to the left, rnver is the number if we shift to the
1343          * right, and lrnver is the number if we shift in both directions.
1344          * The goal is to minimize first the number of vertixes, and second,
1345          * the number of vertixes whose contents are changed by shifting,
1346          * and third the number of uncached vertixes whose contents are
1347          * changed by shifting and must be read from disk.
1348          */
1349         int nver, lnver, rnver, lrnver;
1350
1351         /*
1352          * used at leaf level only, S0 = S[0] is the node being balanced,
1353          * sInum [ I = 0,1,2 ] is the number of items that will
1354          * remain in node SI after balancing.  S1 and S2 are new
1355          * nodes that might be created.
1356          */
1357
1358         /*
1359          * we perform 8 calls to get_num_ver().  For each call we
1360          * calculate five parameters.  where 4th parameter is s1bytes
1361          * and 5th - s2bytes
1362          *
1363          * s0num, s1num, s2num for 8 cases
1364          * 0,1 - do not shift and do not shift but bottle
1365          * 2   - shift only whole item to left
1366          * 3   - shift to left and bottle as much as possible
1367          * 4,5 - shift to right (whole items and as much as possible
1368          * 6,7 - shift to both directions (whole items and as much as possible)
1369          */
1370         short snum012[40] = { 0, };
1371
1372         /* Sh is the node whose balance is currently being checked */
1373         struct buffer_head *Sh;
1374
1375         Sh = PATH_H_PBUFFER(tb->tb_path, h);
1376         levbytes = tb->insert_size[h];
1377
1378         /* Calculate balance parameters for creating new root. */
1379         if (!Sh) {
1380                 if (!h)
1381                         reiserfs_panic(tb->tb_sb, "vs-8210",
1382                                        "S[0] can not be 0");
1383                 switch (ret = get_empty_nodes(tb, h)) {
1384                 /* no balancing for higher levels needed */
1385                 case CARRY_ON:
1386                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1387                         return NO_BALANCING_NEEDED;
1388
1389                 case NO_DISK_SPACE:
1390                 case REPEAT_SEARCH:
1391                         return ret;
1392                 default:
1393                         reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
1394                                        "return value of get_empty_nodes");
1395                 }
1396         }
1397
1398         /* get parents of S[h] neighbors. */
1399         ret = get_parents(tb, h);
1400         if (ret != CARRY_ON)
1401                 return ret;
1402
1403         sfree = B_FREE_SPACE(Sh);
1404
1405         /* get free space of neighbors */
1406         rfree = get_rfree(tb, h);
1407         lfree = get_lfree(tb, h);
1408
1409         /* and new item fits into node S[h] without any shifting */
1410         if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
1411             NO_BALANCING_NEEDED)
1412                 return NO_BALANCING_NEEDED;
1413
1414         create_virtual_node(tb, h);
1415
1416         /*
1417          * determine maximal number of items we can shift to the left
1418          * neighbor (in tb structure) and the maximal number of bytes
1419          * that can flow to the left neighbor from the left most liquid
1420          * item that cannot be shifted from S[0] entirely (returned value)
1421          */
1422         check_left(tb, h, lfree);
1423
1424         /*
1425          * determine maximal number of items we can shift to the right
1426          * neighbor (in tb structure) and the maximal number of bytes
1427          * that can flow to the right neighbor from the right most liquid
1428          * item that cannot be shifted from S[0] entirely (returned value)
1429          */
1430         check_right(tb, h, rfree);
1431
1432         /*
1433          * all contents of internal node S[h] can be moved into its
1434          * neighbors, S[h] will be removed after balancing
1435          */
1436         if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
1437                 int to_r;
1438
1439                 /*
1440                  * Since we are working on internal nodes, and our internal
1441                  * nodes have fixed size entries, then we can balance by the
1442                  * number of items rather than the space they consume.  In this
1443                  * routine we set the left node equal to the right node,
1444                  * allowing a difference of less than or equal to 1 child
1445                  * pointer.
1446                  */
1447                 to_r =
1448                     ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1449                      vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1450                                                 tb->rnum[h]);
1451                 set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1452                                -1, -1);
1453                 return CARRY_ON;
1454         }
1455
1456         /*
1457          * this checks balance condition, that any two neighboring nodes
1458          * can not fit in one node
1459          */
1460         RFALSE(h &&
1461                (tb->lnum[h] >= vn->vn_nr_item + 1 ||
1462                 tb->rnum[h] >= vn->vn_nr_item + 1),
1463                "vs-8220: tree is not balanced on internal level");
1464         RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
1465                       (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
1466                "vs-8225: tree is not balanced on leaf level");
1467
1468         /*
1469          * all contents of S[0] can be moved into its neighbors
1470          * S[0] will be removed after balancing.
1471          */
1472         if (!h && is_leaf_removable(tb))
1473                 return CARRY_ON;
1474
1475         /*
1476          * why do we perform this check here rather than earlier??
1477          * Answer: we can win 1 node in some cases above. Moreover we
1478          * checked it above, when we checked, that S[0] is not removable
1479          * in principle
1480          */
1481
1482          /* new item fits into node S[h] without any shifting */
1483         if (sfree >= levbytes) {
1484                 if (!h)
1485                         tb->s0num = vn->vn_nr_item;
1486                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1487                 return NO_BALANCING_NEEDED;
1488         }
1489
1490         {
1491                 int lpar, rpar, nset, lset, rset, lrset;
1492                 /* regular overflowing of the node */
1493
1494                 /*
1495                  * get_num_ver works in 2 modes (FLOW & NO_FLOW)
1496                  * lpar, rpar - number of items we can shift to left/right
1497                  *              neighbor (including splitting item)
1498                  * nset, lset, rset, lrset - shows, whether flowing items
1499                  *                           give better packing
1500                  */
1501 #define FLOW 1
1502 #define NO_FLOW 0               /* do not any splitting */
1503
1504                 /* we choose one of the following */
1505 #define NOTHING_SHIFT_NO_FLOW   0
1506 #define NOTHING_SHIFT_FLOW      5
1507 #define LEFT_SHIFT_NO_FLOW      10
1508 #define LEFT_SHIFT_FLOW         15
1509 #define RIGHT_SHIFT_NO_FLOW     20
1510 #define RIGHT_SHIFT_FLOW        25
1511 #define LR_SHIFT_NO_FLOW        30
1512 #define LR_SHIFT_FLOW           35
1513
1514                 lpar = tb->lnum[h];
1515                 rpar = tb->rnum[h];
1516
1517                 /*
1518                  * calculate number of blocks S[h] must be split into when
1519                  * nothing is shifted to the neighbors, as well as number of
1520                  * items in each part of the split node (s012 numbers),
1521                  * and number of bytes (s1bytes) of the shared drop which
1522                  * flow to S1 if any
1523                  */
1524                 nset = NOTHING_SHIFT_NO_FLOW;
1525                 nver = get_num_ver(vn->vn_mode, tb, h,
1526                                    0, -1, h ? vn->vn_nr_item : 0, -1,
1527                                    snum012, NO_FLOW);
1528
1529                 if (!h) {
1530                         int nver1;
1531
1532                         /*
1533                          * note, that in this case we try to bottle
1534                          * between S[0] and S1 (S1 - the first new node)
1535                          */
1536                         nver1 = get_num_ver(vn->vn_mode, tb, h,
1537                                             0, -1, 0, -1,
1538                                             snum012 + NOTHING_SHIFT_FLOW, FLOW);
1539                         if (nver > nver1)
1540                                 nset = NOTHING_SHIFT_FLOW, nver = nver1;
1541                 }
1542
1543                 /*
1544                  * calculate number of blocks S[h] must be split into when
1545                  * l_shift_num first items and l_shift_bytes of the right
1546                  * most liquid item to be shifted are shifted to the left
1547                  * neighbor, as well as number of items in each part of the
1548                  * splitted node (s012 numbers), and number of bytes
1549                  * (s1bytes) of the shared drop which flow to S1 if any
1550                  */
1551                 lset = LEFT_SHIFT_NO_FLOW;
1552                 lnver = get_num_ver(vn->vn_mode, tb, h,
1553                                     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1554                                     -1, h ? vn->vn_nr_item : 0, -1,
1555                                     snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
1556                 if (!h) {
1557                         int lnver1;
1558
1559                         lnver1 = get_num_ver(vn->vn_mode, tb, h,
1560                                              lpar -
1561                                              ((tb->lbytes != -1) ? 1 : 0),
1562                                              tb->lbytes, 0, -1,
1563                                              snum012 + LEFT_SHIFT_FLOW, FLOW);
1564                         if (lnver > lnver1)
1565                                 lset = LEFT_SHIFT_FLOW, lnver = lnver1;
1566                 }
1567
1568                 /*
1569                  * calculate number of blocks S[h] must be split into when
1570                  * r_shift_num first items and r_shift_bytes of the left most
1571                  * liquid item to be shifted are shifted to the right neighbor,
1572                  * as well as number of items in each part of the splitted
1573                  * node (s012 numbers), and number of bytes (s1bytes) of the
1574                  * shared drop which flow to S1 if any
1575                  */
1576                 rset = RIGHT_SHIFT_NO_FLOW;
1577                 rnver = get_num_ver(vn->vn_mode, tb, h,
1578                                     0, -1,
1579                                     h ? (vn->vn_nr_item - rpar) : (rpar -
1580                                                                    ((tb->
1581                                                                      rbytes !=
1582                                                                      -1) ? 1 :
1583                                                                     0)), -1,
1584                                     snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
1585                 if (!h) {
1586                         int rnver1;
1587
1588                         rnver1 = get_num_ver(vn->vn_mode, tb, h,
1589                                              0, -1,
1590                                              (rpar -
1591                                               ((tb->rbytes != -1) ? 1 : 0)),
1592                                              tb->rbytes,
1593                                              snum012 + RIGHT_SHIFT_FLOW, FLOW);
1594
1595                         if (rnver > rnver1)
1596                                 rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
1597                 }
1598
1599                 /*
1600                  * calculate number of blocks S[h] must be split into when
1601                  * items are shifted in both directions, as well as number
1602                  * of items in each part of the splitted node (s012 numbers),
1603                  * and number of bytes (s1bytes) of the shared drop which
1604                  * flow to S1 if any
1605                  */
1606                 lrset = LR_SHIFT_NO_FLOW;
1607                 lrnver = get_num_ver(vn->vn_mode, tb, h,
1608                                      lpar - ((h || tb->lbytes == -1) ? 0 : 1),
1609                                      -1,
1610                                      h ? (vn->vn_nr_item - rpar) : (rpar -
1611                                                                     ((tb->
1612                                                                       rbytes !=
1613                                                                       -1) ? 1 :
1614                                                                      0)), -1,
1615                                      snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
1616                 if (!h) {
1617                         int lrnver1;
1618
1619                         lrnver1 = get_num_ver(vn->vn_mode, tb, h,
1620                                               lpar -
1621                                               ((tb->lbytes != -1) ? 1 : 0),
1622                                               tb->lbytes,
1623                                               (rpar -
1624                                                ((tb->rbytes != -1) ? 1 : 0)),
1625                                               tb->rbytes,
1626                                               snum012 + LR_SHIFT_FLOW, FLOW);
1627                         if (lrnver > lrnver1)
1628                                 lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
1629                 }
1630
1631                 /*
1632                  * Our general shifting strategy is:
1633                  * 1) to minimized number of new nodes;
1634                  * 2) to minimized number of neighbors involved in shifting;
1635                  * 3) to minimized number of disk reads;
1636                  */
1637
1638                 /* we can win TWO or ONE nodes by shifting in both directions */
1639                 if (lrnver < lnver && lrnver < rnver) {
1640                         RFALSE(h &&
1641                                (tb->lnum[h] != 1 ||
1642                                 tb->rnum[h] != 1 ||
1643                                 lrnver != 1 || rnver != 2 || lnver != 2
1644                                 || h != 1), "vs-8230: bad h");
1645                         if (lrset == LR_SHIFT_FLOW)
1646                                 set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
1647                                                lrnver, snum012 + lrset,
1648                                                tb->lbytes, tb->rbytes);
1649                         else
1650                                 set_parameters(tb, h,
1651                                                tb->lnum[h] -
1652                                                ((tb->lbytes == -1) ? 0 : 1),
1653                                                tb->rnum[h] -
1654                                                ((tb->rbytes == -1) ? 0 : 1),
1655                                                lrnver, snum012 + lrset, -1, -1);
1656
1657                         return CARRY_ON;
1658                 }
1659
1660                 /*
1661                  * if shifting doesn't lead to better packing
1662                  * then don't shift
1663                  */
1664                 if (nver == lrnver) {
1665                         set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
1666                                        -1);
1667                         return CARRY_ON;
1668                 }
1669
1670                 /*
1671                  * now we know that for better packing shifting in only one
1672                  * direction either to the left or to the right is required
1673                  */
1674
1675                 /*
1676                  * if shifting to the left is better than
1677                  * shifting to the right
1678                  */
1679                 if (lnver < rnver) {
1680                         SET_PAR_SHIFT_LEFT;
1681                         return CARRY_ON;
1682                 }
1683
1684                 /*
1685                  * if shifting to the right is better than
1686                  * shifting to the left
1687                  */
1688                 if (lnver > rnver) {
1689                         SET_PAR_SHIFT_RIGHT;
1690                         return CARRY_ON;
1691                 }
1692
1693                 /*
1694                  * now shifting in either direction gives the same number
1695                  * of nodes and we can make use of the cached neighbors
1696                  */
1697                 if (is_left_neighbor_in_cache(tb, h)) {
1698                         SET_PAR_SHIFT_LEFT;
1699                         return CARRY_ON;
1700                 }
1701
1702                 /*
1703                  * shift to the right independently on whether the
1704                  * right neighbor in cache or not
1705                  */
1706                 SET_PAR_SHIFT_RIGHT;
1707                 return CARRY_ON;
1708         }
1709 }
1710
1711 /*
1712  * Check whether current node S[h] is balanced when Decreasing its size by
1713  * Deleting or Cutting for INTERNAL node of S+tree.
1714  * Calculate parameters for balancing for current level h.
1715  * Parameters:
1716  *      tb      tree_balance structure;
1717  *      h       current level of the node;
1718  *      inum    item number in S[h];
1719  *      mode    i - insert, p - paste;
1720  * Returns:     1 - schedule occurred;
1721  *              0 - balancing for higher levels needed;
1722  *             -1 - no balancing for higher levels needed;
1723  *             -2 - no disk space.
1724  *
1725  * Note: Items of internal nodes have fixed size, so the balance condition for
1726  * the internal part of S+tree is as for the B-trees.
1727  */
1728 static int dc_check_balance_internal(struct tree_balance *tb, int h)
1729 {
1730         struct virtual_node *vn = tb->tb_vn;
1731
1732         /*
1733          * Sh is the node whose balance is currently being checked,
1734          * and Fh is its father.
1735          */
1736         struct buffer_head *Sh, *Fh;
1737         int maxsize, ret;
1738         int lfree, rfree /* free space in L and R */ ;
1739
1740         Sh = PATH_H_PBUFFER(tb->tb_path, h);
1741         Fh = PATH_H_PPARENT(tb->tb_path, h);
1742
1743         maxsize = MAX_CHILD_SIZE(Sh);
1744
1745         /*
1746          * using tb->insert_size[h], which is negative in this case,
1747          * create_virtual_node calculates:
1748          * new_nr_item = number of items node would have if operation is
1749          * performed without balancing (new_nr_item);
1750          */
1751         create_virtual_node(tb, h);
1752
1753         if (!Fh) {              /* S[h] is the root. */
1754                 /* no balancing for higher levels needed */
1755                 if (vn->vn_nr_item > 0) {
1756                         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1757                         return NO_BALANCING_NEEDED;
1758                 }
1759                 /*
1760                  * new_nr_item == 0.
1761                  * Current root will be deleted resulting in
1762                  * decrementing the tree height.
1763                  */
1764                 set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
1765                 return CARRY_ON;
1766         }
1767
1768         if ((ret = get_parents(tb, h)) != CARRY_ON)
1769                 return ret;
1770
1771         /* get free space of neighbors */
1772         rfree = get_rfree(tb, h);
1773         lfree = get_lfree(tb, h);
1774
1775         /* determine maximal number of items we can fit into neighbors */
1776         check_left(tb, h, lfree);
1777         check_right(tb, h, rfree);
1778
1779         /*
1780          * Balance condition for the internal node is valid.
1781          * In this case we balance only if it leads to better packing.
1782          */
1783         if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {
1784                 /*
1785                  * Here we join S[h] with one of its neighbors,
1786                  * which is impossible with greater values of new_nr_item.
1787                  */
1788                 if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {
1789                         /* All contents of S[h] can be moved to L[h]. */
1790                         if (tb->lnum[h] >= vn->vn_nr_item + 1) {
1791                                 int n;
1792                                 int order_L;
1793
1794                                 order_L =
1795                                     ((n =
1796                                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1797                                                           h)) ==
1798                                      0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1799                                 n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
1800                                     (DC_SIZE + KEY_SIZE);
1801                                 set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
1802                                                -1);
1803                                 return CARRY_ON;
1804                         }
1805
1806                         /* All contents of S[h] can be moved to R[h]. */
1807                         if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1808                                 int n;
1809                                 int order_R;
1810
1811                                 order_R =
1812                                     ((n =
1813                                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1814                                                           h)) ==
1815                                      B_NR_ITEMS(Fh)) ? 0 : n + 1;
1816                                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
1817                                     (DC_SIZE + KEY_SIZE);
1818                                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
1819                                                -1);
1820                                 return CARRY_ON;
1821                         }
1822                 }
1823
1824                 /*
1825                  * All contents of S[h] can be moved to the neighbors
1826                  * (L[h] & R[h]).
1827                  */
1828                 if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1829                         int to_r;
1830
1831                         to_r =
1832                             ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
1833                              tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
1834                             (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
1835                         set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
1836                                        0, NULL, -1, -1);
1837                         return CARRY_ON;
1838                 }
1839
1840                 /* Balancing does not lead to better packing. */
1841                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1842                 return NO_BALANCING_NEEDED;
1843         }
1844
1845         /*
1846          * Current node contain insufficient number of items.
1847          * Balancing is required.
1848          */
1849         /* Check whether we can merge S[h] with left neighbor. */
1850         if (tb->lnum[h] >= vn->vn_nr_item + 1)
1851                 if (is_left_neighbor_in_cache(tb, h)
1852                     || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
1853                         int n;
1854                         int order_L;
1855
1856                         order_L =
1857                             ((n =
1858                               PATH_H_B_ITEM_ORDER(tb->tb_path,
1859                                                   h)) ==
1860                              0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
1861                         n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
1862                                                                       KEY_SIZE);
1863                         set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
1864                         return CARRY_ON;
1865                 }
1866
1867         /* Check whether we can merge S[h] with right neighbor. */
1868         if (tb->rnum[h] >= vn->vn_nr_item + 1) {
1869                 int n;
1870                 int order_R;
1871
1872                 order_R =
1873                     ((n =
1874                       PATH_H_B_ITEM_ORDER(tb->tb_path,
1875                                           h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
1876                 n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
1877                                                               KEY_SIZE);
1878                 set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
1879                 return CARRY_ON;
1880         }
1881
1882         /* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
1883         if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
1884                 int to_r;
1885
1886                 to_r =
1887                     ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
1888                      vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
1889                                                 tb->rnum[h]);
1890                 set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
1891                                -1, -1);
1892                 return CARRY_ON;
1893         }
1894
1895         /* For internal nodes try to borrow item from a neighbor */
1896         RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");
1897
1898         /* Borrow one or two items from caching neighbor */
1899         if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
1900                 int from_l;
1901
1902                 from_l =
1903                     (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
1904                      1) / 2 - (vn->vn_nr_item + 1);
1905                 set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
1906                 return CARRY_ON;
1907         }
1908
1909         set_parameters(tb, h, 0,
1910                        -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
1911                           1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
1912         return CARRY_ON;
1913 }
1914
1915 /*
1916  * Check whether current node S[h] is balanced when Decreasing its size by
1917  * Deleting or Truncating for LEAF node of S+tree.
1918  * Calculate parameters for balancing for current level h.
1919  * Parameters:
1920  *      tb      tree_balance structure;
1921  *      h       current level of the node;
1922  *      inum    item number in S[h];
1923  *      mode    i - insert, p - paste;
1924  * Returns:     1 - schedule occurred;
1925  *              0 - balancing for higher levels needed;
1926  *             -1 - no balancing for higher levels needed;
1927  *             -2 - no disk space.
1928  */
1929 static int dc_check_balance_leaf(struct tree_balance *tb, int h)
1930 {
1931         struct virtual_node *vn = tb->tb_vn;
1932
1933         /*
1934          * Number of bytes that must be deleted from
1935          * (value is negative if bytes are deleted) buffer which
1936          * contains node being balanced.  The mnemonic is that the
1937          * attempted change in node space used level is levbytes bytes.
1938          */
1939         int levbytes;
1940
1941         /* the maximal item size */
1942         int maxsize, ret;
1943
1944         /*
1945          * S0 is the node whose balance is currently being checked,
1946          * and F0 is its father.
1947          */
1948         struct buffer_head *S0, *F0;
1949         int lfree, rfree /* free space in L and R */ ;
1950
1951         S0 = PATH_H_PBUFFER(tb->tb_path, 0);
1952         F0 = PATH_H_PPARENT(tb->tb_path, 0);
1953
1954         levbytes = tb->insert_size[h];
1955
1956         maxsize = MAX_CHILD_SIZE(S0);   /* maximal possible size of an item */
1957
1958         if (!F0) {              /* S[0] is the root now. */
1959
1960                 RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
1961                        "vs-8240: attempt to create empty buffer tree");
1962
1963                 set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
1964                 return NO_BALANCING_NEEDED;
1965         }
1966
1967         if ((ret = get_parents(tb, h)) != CARRY_ON)
1968                 return ret;
1969
1970         /* get free space of neighbors */
1971         rfree = get_rfree(tb, h);
1972         lfree = get_lfree(tb, h);
1973
1974         create_virtual_node(tb, h);
1975
1976         /* if 3 leaves can be merge to one, set parameters and return */
1977         if (are_leaves_removable(tb, lfree, rfree))
1978                 return CARRY_ON;
1979
1980         /*
1981          * determine maximal number of items we can shift to the left/right
1982          * neighbor and the maximal number of bytes that can flow to the
1983          * left/right neighbor from the left/right most liquid item that
1984          * cannot be shifted from S[0] entirely
1985          */
1986         check_left(tb, h, lfree);
1987         check_right(tb, h, rfree);
1988
1989         /* check whether we can merge S with left neighbor. */
1990         if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
1991                 if (is_left_neighbor_in_cache(tb, h) || ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) < vn->vn_nr_item) ||      /* S can not be merged with R */
1992                     !tb->FR[h]) {
1993
1994                         RFALSE(!tb->FL[h],
1995                                "vs-8245: dc_check_balance_leaf: FL[h] must exist");
1996
1997                         /* set parameter to merge S[0] with its left neighbor */
1998                         set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
1999                         return CARRY_ON;
2000                 }
2001
2002         /* check whether we can merge S[0] with right neighbor. */
2003         if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
2004                 set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
2005                 return CARRY_ON;
2006         }
2007
2008         /*
2009          * All contents of S[0] can be moved to the neighbors (L[0] & R[0]).
2010          * Set parameters and return
2011          */
2012         if (is_leaf_removable(tb))
2013                 return CARRY_ON;
2014
2015         /* Balancing is not required. */
2016         tb->s0num = vn->vn_nr_item;
2017         set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
2018         return NO_BALANCING_NEEDED;
2019 }
2020
2021 /*
2022  * Check whether current node S[h] is balanced when Decreasing its size by
2023  * Deleting or Cutting.
2024  * Calculate parameters for balancing for current level h.
2025  * Parameters:
2026  *      tb      tree_balance structure;
2027  *      h       current level of the node;
2028  *      inum    item number in S[h];
2029  *      mode    d - delete, c - cut.
2030  * Returns:     1 - schedule occurred;
2031  *              0 - balancing for higher levels needed;
2032  *             -1 - no balancing for higher levels needed;
2033  *             -2 - no disk space.
2034  */
2035 static int dc_check_balance(struct tree_balance *tb, int h)
2036 {
2037         RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
2038                "vs-8250: S is not initialized");
2039
2040         if (h)
2041                 return dc_check_balance_internal(tb, h);
2042         else
2043                 return dc_check_balance_leaf(tb, h);
2044 }
2045
2046 /*
2047  * Check whether current node S[h] is balanced.
2048  * Calculate parameters for balancing for current level h.
2049  * Parameters:
2050  *
2051  *      tb      tree_balance structure:
2052  *
2053  *              tb is a large structure that must be read about in the header
2054  *              file at the same time as this procedure if the reader is
2055  *              to successfully understand this procedure
2056  *
2057  *      h       current level of the node;
2058  *      inum    item number in S[h];
2059  *      mode    i - insert, p - paste, d - delete, c - cut.
2060  * Returns:     1 - schedule occurred;
2061  *              0 - balancing for higher levels needed;
2062  *             -1 - no balancing for higher levels needed;
2063  *             -2 - no disk space.
2064  */
2065 static int check_balance(int mode,
2066                          struct tree_balance *tb,
2067                          int h,
2068                          int inum,
2069                          int pos_in_item,
2070                          struct item_head *ins_ih, const void *data)
2071 {
2072         struct virtual_node *vn;
2073
2074         vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
2075         vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
2076         vn->vn_mode = mode;
2077         vn->vn_affected_item_num = inum;
2078         vn->vn_pos_in_item = pos_in_item;
2079         vn->vn_ins_ih = ins_ih;
2080         vn->vn_data = data;
2081
2082         RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
2083                "vs-8255: ins_ih can not be 0 in insert mode");
2084
2085         /* Calculate balance parameters when size of node is increasing. */
2086         if (tb->insert_size[h] > 0)
2087                 return ip_check_balance(tb, h);
2088
2089         /* Calculate balance parameters when  size of node is decreasing. */
2090         return dc_check_balance(tb, h);
2091 }
2092
2093 /* Check whether parent at the path is the really parent of the current node.*/
2094 static int get_direct_parent(struct tree_balance *tb, int h)
2095 {
2096         struct buffer_head *bh;
2097         struct treepath *path = tb->tb_path;
2098         int position,
2099             path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
2100
2101         /* We are in the root or in the new root. */
2102         if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
2103
2104                 RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
2105                        "PAP-8260: invalid offset in the path");
2106
2107                 if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
2108                     b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
2109                         /* Root is not changed. */
2110                         PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
2111                         PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
2112                         return CARRY_ON;
2113                 }
2114                 /* Root is changed and we must recalculate the path. */
2115                 return REPEAT_SEARCH;
2116         }
2117
2118         /* Parent in the path is not in the tree. */
2119         if (!B_IS_IN_TREE
2120             (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
2121                 return REPEAT_SEARCH;
2122
2123         if ((position =
2124              PATH_OFFSET_POSITION(path,
2125                                   path_offset - 1)) > B_NR_ITEMS(bh))
2126                 return REPEAT_SEARCH;
2127
2128         /* Parent in the path is not parent of the current node in the tree. */
2129         if (B_N_CHILD_NUM(bh, position) !=
2130             PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
2131                 return REPEAT_SEARCH;
2132
2133         if (buffer_locked(bh)) {
2134                 int depth = reiserfs_write_unlock_nested(tb->tb_sb);
2135                 __wait_on_buffer(bh);
2136                 reiserfs_write_lock_nested(tb->tb_sb, depth);
2137                 if (FILESYSTEM_CHANGED_TB(tb))
2138                         return REPEAT_SEARCH;
2139         }
2140
2141         /*
2142          * Parent in the path is unlocked and really parent
2143          * of the current node.
2144          */
2145         return CARRY_ON;
2146 }
2147
2148 /*
2149  * Using lnum[h] and rnum[h] we should determine what neighbors
2150  * of S[h] we
2151  * need in order to balance S[h], and get them if necessary.
2152  * Returns:     SCHEDULE_OCCURRED - schedule occurred while the function worked;
2153  *              CARRY_ON - schedule didn't occur while the function worked;
2154  */
2155 static int get_neighbors(struct tree_balance *tb, int h)
2156 {
2157         int child_position,
2158             path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
2159         unsigned long son_number;
2160         struct super_block *sb = tb->tb_sb;
2161         struct buffer_head *bh;
2162         int depth;
2163
2164         PROC_INFO_INC(sb, get_neighbors[h]);
2165
2166         if (tb->lnum[h]) {
2167                 /* We need left neighbor to balance S[h]. */
2168                 PROC_INFO_INC(sb, need_l_neighbor[h]);
2169                 bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
2170
2171                 RFALSE(bh == tb->FL[h] &&
2172                        !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
2173                        "PAP-8270: invalid position in the parent");
2174
2175                 child_position =
2176                     (bh ==
2177                      tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
2178                                                                        FL[h]);
2179                 son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
2180                 depth = reiserfs_write_unlock_nested(tb->tb_sb);
2181                 bh = sb_bread(sb, son_number);
2182                 reiserfs_write_lock_nested(tb->tb_sb, depth);
2183                 if (!bh)
2184                         return IO_ERROR;
2185                 if (FILESYSTEM_CHANGED_TB(tb)) {
2186                         brelse(bh);
2187                         PROC_INFO_INC(sb, get_neighbors_restart[h]);
2188                         return REPEAT_SEARCH;
2189                 }
2190
2191                 RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
2192                        child_position > B_NR_ITEMS(tb->FL[h]) ||
2193                        B_N_CHILD_NUM(tb->FL[h], child_position) !=
2194                        bh->b_blocknr, "PAP-8275: invalid parent");
2195                 RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
2196                 RFALSE(!h &&
2197                        B_FREE_SPACE(bh) !=
2198                        MAX_CHILD_SIZE(bh) -
2199                        dc_size(B_N_CHILD(tb->FL[0], child_position)),
2200                        "PAP-8290: invalid child size of left neighbor");
2201
2202                 brelse(tb->L[h]);
2203                 tb->L[h] = bh;
2204         }
2205
2206         /* We need right neighbor to balance S[path_offset]. */
2207         if (tb->rnum[h]) {
2208                 PROC_INFO_INC(sb, need_r_neighbor[h]);
2209                 bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);
2210
2211                 RFALSE(bh == tb->FR[h] &&
2212                        PATH_OFFSET_POSITION(tb->tb_path,
2213                                             path_offset) >=
2214                        B_NR_ITEMS(bh),
2215                        "PAP-8295: invalid position in the parent");
2216
2217                 child_position =
2218                     (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
2219                 son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
2220                 depth = reiserfs_write_unlock_nested(tb->tb_sb);
2221                 bh = sb_bread(sb, son_number);
2222                 reiserfs_write_lock_nested(tb->tb_sb, depth);
2223                 if (!bh)
2224                         return IO_ERROR;
2225                 if (FILESYSTEM_CHANGED_TB(tb)) {
2226                         brelse(bh);
2227                         PROC_INFO_INC(sb, get_neighbors_restart[h]);
2228                         return REPEAT_SEARCH;
2229                 }
2230                 brelse(tb->R[h]);
2231                 tb->R[h] = bh;
2232
2233                 RFALSE(!h
2234                        && B_FREE_SPACE(bh) !=
2235                        MAX_CHILD_SIZE(bh) -
2236                        dc_size(B_N_CHILD(tb->FR[0], child_position)),
2237                        "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
2238                        B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
2239                        dc_size(B_N_CHILD(tb->FR[0], child_position)));
2240
2241         }
2242         return CARRY_ON;
2243 }
2244
2245 static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
2246 {
2247         int max_num_of_items;
2248         int max_num_of_entries;
2249         unsigned long blocksize = sb->s_blocksize;
2250
2251 #define MIN_NAME_LEN 1
2252
2253         max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
2254         max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
2255             (DEH_SIZE + MIN_NAME_LEN);
2256
2257         return sizeof(struct virtual_node) +
2258             max(max_num_of_items * sizeof(struct virtual_item),
2259                 sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
2260                 (max_num_of_entries - 1) * sizeof(__u16));
2261 }
2262
2263 /*
2264  * maybe we should fail balancing we are going to perform when kmalloc
2265  * fails several times. But now it will loop until kmalloc gets
2266  * required memory
2267  */
2268 static int get_mem_for_virtual_node(struct tree_balance *tb)
2269 {
2270         int check_fs = 0;
2271         int size;
2272         char *buf;
2273
2274         size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));
2275
2276         /* we have to allocate more memory for virtual node */
2277         if (size > tb->vn_buf_size) {
2278                 if (tb->vn_buf) {
2279                         /* free memory allocated before */
2280                         kfree(tb->vn_buf);
2281                         /* this is not needed if kfree is atomic */
2282                         check_fs = 1;
2283                 }
2284
2285                 /* virtual node requires now more memory */
2286                 tb->vn_buf_size = size;
2287
2288                 /* get memory for virtual item */
2289                 buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
2290                 if (!buf) {
2291                         /*
2292                          * getting memory with GFP_KERNEL priority may involve
2293                          * balancing now (due to indirect_to_direct conversion
2294                          * on dcache shrinking). So, release path and collected
2295                          * resources here
2296                          */
2297                         free_buffers_in_tb(tb);
2298                         buf = kmalloc(size, GFP_NOFS);
2299                         if (!buf) {
2300                                 tb->vn_buf_size = 0;
2301                         }
2302                         tb->vn_buf = buf;
2303                         schedule();
2304                         return REPEAT_SEARCH;
2305                 }
2306
2307                 tb->vn_buf = buf;
2308         }
2309
2310         if (check_fs && FILESYSTEM_CHANGED_TB(tb))
2311                 return REPEAT_SEARCH;
2312
2313         return CARRY_ON;
2314 }
2315
2316 #ifdef CONFIG_REISERFS_CHECK
2317 static void tb_buffer_sanity_check(struct super_block *sb,
2318                                    struct buffer_head *bh,
2319                                    const char *descr, int level)
2320 {
2321         if (bh) {
2322                 if (atomic_read(&(bh->b_count)) <= 0)
2323
2324                         reiserfs_panic(sb, "jmacd-1", "negative or zero "
2325                                        "reference counter for buffer %s[%d] "
2326                                        "(%b)", descr, level, bh);
2327
2328                 if (!buffer_uptodate(bh))
2329                         reiserfs_panic(sb, "jmacd-2", "buffer is not up "
2330                                        "to date %s[%d] (%b)",
2331                                        descr, level, bh);
2332
2333                 if (!B_IS_IN_TREE(bh))
2334                         reiserfs_panic(sb, "jmacd-3", "buffer is not "
2335                                        "in tree %s[%d] (%b)",
2336                                        descr, level, bh);
2337
2338                 if (bh->b_bdev != sb->s_bdev)
2339                         reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
2340                                        "device %s[%d] (%b)",
2341                                        descr, level, bh);
2342
2343                 if (bh->b_size != sb->s_blocksize)
2344                         reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
2345                                        "blocksize %s[%d] (%b)",
2346                                        descr, level, bh);
2347
2348                 if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
2349                         reiserfs_panic(sb, "jmacd-6", "buffer block "
2350                                        "number too high %s[%d] (%b)",
2351                                        descr, level, bh);
2352         }
2353 }
2354 #else
2355 static void tb_buffer_sanity_check(struct super_block *sb,
2356                                    struct buffer_head *bh,
2357                                    const char *descr, int level)
2358 {;
2359 }
2360 #endif
2361
2362 static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
2363 {
2364         return reiserfs_prepare_for_journal(s, bh, 0);
2365 }
2366
2367 static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
2368 {
2369         struct buffer_head *locked;
2370 #ifdef CONFIG_REISERFS_CHECK
2371         int repeat_counter = 0;
2372 #endif
2373         int i;
2374
2375         do {
2376
2377                 locked = NULL;
2378
2379                 for (i = tb->tb_path->path_length;
2380                      !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
2381                         if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
2382                                 /*
2383                                  * if I understand correctly, we can only
2384                                  * be sure the last buffer in the path is
2385                                  * in the tree --clm
2386                                  */
2387 #ifdef CONFIG_REISERFS_CHECK
2388                                 if (PATH_PLAST_BUFFER(tb->tb_path) ==
2389                                     PATH_OFFSET_PBUFFER(tb->tb_path, i))
2390                                         tb_buffer_sanity_check(tb->tb_sb,
2391                                                                PATH_OFFSET_PBUFFER
2392                                                                (tb->tb_path,
2393                                                                 i), "S",
2394                                                                tb->tb_path->
2395                                                                path_length - i);
2396 #endif
2397                                 if (!clear_all_dirty_bits(tb->tb_sb,
2398                                                           PATH_OFFSET_PBUFFER
2399                                                           (tb->tb_path,
2400                                                            i))) {
2401                                         locked =
2402                                             PATH_OFFSET_PBUFFER(tb->tb_path,
2403                                                                 i);
2404                                 }
2405                         }
2406                 }
2407
2408                 for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
2409                      i++) {
2410
2411                         if (tb->lnum[i]) {
2412
2413                                 if (tb->L[i]) {
2414                                         tb_buffer_sanity_check(tb->tb_sb,
2415                                                                tb->L[i],
2416                                                                "L", i);
2417                                         if (!clear_all_dirty_bits
2418                                             (tb->tb_sb, tb->L[i]))
2419                                                 locked = tb->L[i];
2420                                 }
2421
2422                                 if (!locked && tb->FL[i]) {
2423                                         tb_buffer_sanity_check(tb->tb_sb,
2424                                                                tb->FL[i],
2425                                                                "FL", i);
2426                                         if (!clear_all_dirty_bits
2427                                             (tb->tb_sb, tb->FL[i]))
2428                                                 locked = tb->FL[i];
2429                                 }
2430
2431                                 if (!locked && tb->CFL[i]) {
2432                                         tb_buffer_sanity_check(tb->tb_sb,
2433                                                                tb->CFL[i],
2434                                                                "CFL", i);
2435                                         if (!clear_all_dirty_bits
2436                                             (tb->tb_sb, tb->CFL[i]))
2437                                                 locked = tb->CFL[i];
2438                                 }
2439
2440                         }
2441
2442                         if (!locked && (tb->rnum[i])) {
2443
2444                                 if (tb->R[i]) {
2445                                         tb_buffer_sanity_check(tb->tb_sb,
2446                                                                tb->R[i],
2447                                                                "R", i);
2448                                         if (!clear_all_dirty_bits
2449                                             (tb->tb_sb, tb->R[i]))
2450                                                 locked = tb->R[i];
2451                                 }
2452
2453                                 if (!locked && tb->FR[i]) {
2454                                         tb_buffer_sanity_check(tb->tb_sb,
2455                                                                tb->FR[i],
2456                                                                "FR", i);
2457                                         if (!clear_all_dirty_bits
2458                                             (tb->tb_sb, tb->FR[i]))
2459                                                 locked = tb->FR[i];
2460                                 }
2461
2462                                 if (!locked && tb->CFR[i]) {
2463                                         tb_buffer_sanity_check(tb->tb_sb,
2464                                                                tb->CFR[i],
2465                                                                "CFR", i);
2466                                         if (!clear_all_dirty_bits
2467                                             (tb->tb_sb, tb->CFR[i]))
2468                                                 locked = tb->CFR[i];
2469                                 }
2470                         }
2471                 }
2472
2473                 /*
2474                  * as far as I can tell, this is not required.  The FEB list
2475                  * seems to be full of newly allocated nodes, which will
2476                  * never be locked, dirty, or anything else.
2477                  * To be safe, I'm putting in the checks and waits in.
2478                  * For the moment, they are needed to keep the code in
2479                  * journal.c from complaining about the buffer.
2480                  * That code is inside CONFIG_REISERFS_CHECK as well.  --clm
2481                  */
2482                 for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
2483                         if (tb->FEB[i]) {
2484                                 if (!clear_all_dirty_bits
2485                                     (tb->tb_sb, tb->FEB[i]))
2486                                         locked = tb->FEB[i];
2487                         }
2488                 }
2489
2490                 if (locked) {
2491                         int depth;
2492 #ifdef CONFIG_REISERFS_CHECK
2493                         repeat_counter++;
2494                         if ((repeat_counter % 10000) == 0) {
2495                                 reiserfs_warning(tb->tb_sb, "reiserfs-8200",
2496                                                  "too many iterations waiting "
2497                                                  "for buffer to unlock "
2498                                                  "(%b)", locked);
2499
2500                                 /* Don't loop forever.  Try to recover from possible error. */
2501
2502                                 return (FILESYSTEM_CHANGED_TB(tb)) ?
2503                                     REPEAT_SEARCH : CARRY_ON;
2504                         }
2505 #endif
2506                         depth = reiserfs_write_unlock_nested(tb->tb_sb);
2507                         __wait_on_buffer(locked);
2508                         reiserfs_write_lock_nested(tb->tb_sb, depth);
2509                         if (FILESYSTEM_CHANGED_TB(tb))
2510                                 return REPEAT_SEARCH;
2511                 }
2512
2513         } while (locked);
2514
2515         return CARRY_ON;
2516 }
2517
2518 /*
2519  * Prepare for balancing, that is
2520  *      get all necessary parents, and neighbors;
2521  *      analyze what and where should be moved;
2522  *      get sufficient number of new nodes;
2523  * Balancing will start only after all resources will be collected at a time.
2524  *
2525  * When ported to SMP kernels, only at the last moment after all needed nodes
2526  * are collected in cache, will the resources be locked using the usual
2527  * textbook ordered lock acquisition algorithms.  Note that ensuring that
2528  * this code neither write locks what it does not need to write lock nor locks
2529  * out of order will be a pain in the butt that could have been avoided.
2530  * Grumble grumble. -Hans
2531  *
2532  * fix is meant in the sense of render unchanging
2533  *
2534  * Latency might be improved by first gathering a list of what buffers
2535  * are needed and then getting as many of them in parallel as possible? -Hans
2536  *
2537  * Parameters:
2538  *      op_mode i - insert, d - delete, c - cut (truncate), p - paste (append)
2539  *      tb      tree_balance structure;
2540  *      inum    item number in S[h];
2541  *      pos_in_item - comment this if you can
2542  *      ins_ih  item head of item being inserted
2543  *      data    inserted item or data to be pasted
2544  * Returns:     1 - schedule occurred while the function worked;
2545  *              0 - schedule didn't occur while the function worked;
2546  *             -1 - if no_disk_space
2547  */
2548
2549 int fix_nodes(int op_mode, struct tree_balance *tb,
2550               struct item_head *ins_ih, const void *data)
2551 {
2552         int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
2553         int pos_in_item;
2554
2555         /*
2556          * we set wait_tb_buffers_run when we have to restore any dirty
2557          * bits cleared during wait_tb_buffers_run
2558          */
2559         int wait_tb_buffers_run = 0;
2560         struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);
2561
2562         ++REISERFS_SB(tb->tb_sb)->s_fix_nodes;
2563
2564         pos_in_item = tb->tb_path->pos_in_item;
2565
2566         tb->fs_gen = get_generation(tb->tb_sb);
2567
2568         /*
2569          * we prepare and log the super here so it will already be in the
2570          * transaction when do_balance needs to change it.
2571          * This way do_balance won't have to schedule when trying to prepare
2572          * the super for logging
2573          */
2574         reiserfs_prepare_for_journal(tb->tb_sb,
2575                                      SB_BUFFER_WITH_SB(tb->tb_sb), 1);
2576         journal_mark_dirty(tb->transaction_handle,
2577                            SB_BUFFER_WITH_SB(tb->tb_sb));
2578         if (FILESYSTEM_CHANGED_TB(tb))
2579                 return REPEAT_SEARCH;
2580
2581         /* if it possible in indirect_to_direct conversion */
2582         if (buffer_locked(tbS0)) {
2583                 int depth = reiserfs_write_unlock_nested(tb->tb_sb);
2584                 __wait_on_buffer(tbS0);
2585                 reiserfs_write_lock_nested(tb->tb_sb, depth);
2586                 if (FILESYSTEM_CHANGED_TB(tb))
2587                         return REPEAT_SEARCH;
2588         }
2589 #ifdef CONFIG_REISERFS_CHECK
2590         if (REISERFS_SB(tb->tb_sb)->cur_tb) {
2591                 print_cur_tb("fix_nodes");
2592                 reiserfs_panic(tb->tb_sb, "PAP-8305",
2593                                "there is pending do_balance");
2594         }
2595
2596         if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
2597                 reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
2598                                "not uptodate at the beginning of fix_nodes "
2599                                "or not in tree (mode %c)",
2600                                tbS0, tbS0, op_mode);
2601
2602         /* Check parameters. */
2603         switch (op_mode) {
2604         case M_INSERT:
2605                 if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
2606                         reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
2607                                        "item number %d (in S0 - %d) in case "
2608                                        "of insert", item_num,
2609                                        B_NR_ITEMS(tbS0));
2610                 break;
2611         case M_PASTE:
2612         case M_DELETE:
2613         case M_CUT:
2614                 if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
2615                         print_block(tbS0, 0, -1, -1);
2616                         reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
2617                                        "item number(%d); mode = %c "
2618                                        "insert_size = %d",
2619                                        item_num, op_mode,
2620                                        tb->insert_size[0]);
2621                 }
2622                 break;
2623         default:
2624                 reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
2625                                "of operation");
2626         }
2627 #endif
2628
2629         if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
2630                 /* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */
2631                 return REPEAT_SEARCH;
2632
2633         /* Starting from the leaf level; for all levels h of the tree. */
2634         for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
2635                 ret = get_direct_parent(tb, h);
2636                 if (ret != CARRY_ON)
2637                         goto repeat;
2638
2639                 ret = check_balance(op_mode, tb, h, item_num,
2640                                     pos_in_item, ins_ih, data);
2641                 if (ret != CARRY_ON) {
2642                         if (ret == NO_BALANCING_NEEDED) {
2643                                 /* No balancing for higher levels needed. */
2644                                 ret = get_neighbors(tb, h);
2645                                 if (ret != CARRY_ON)
2646                                         goto repeat;
2647                                 if (h != MAX_HEIGHT - 1)
2648                                         tb->insert_size[h + 1] = 0;
2649                                 /*
2650                                  * ok, analysis and resource gathering
2651                                  * are complete
2652                                  */
2653                                 break;
2654                         }
2655                         goto repeat;
2656                 }
2657
2658                 ret = get_neighbors(tb, h);
2659                 if (ret != CARRY_ON)
2660                         goto repeat;
2661
2662                 /*
2663                  * No disk space, or schedule occurred and analysis may be
2664                  * invalid and needs to be redone.
2665                  */
2666                 ret = get_empty_nodes(tb, h);
2667                 if (ret != CARRY_ON)
2668                         goto repeat;
2669
2670                 /*
2671                  * We have a positive insert size but no nodes exist on this
2672                  * level, this means that we are creating a new root.
2673                  */
2674                 if (!PATH_H_PBUFFER(tb->tb_path, h)) {
2675
2676                         RFALSE(tb->blknum[h] != 1,
2677                                "PAP-8350: creating new empty root");
2678
2679                         if (h < MAX_HEIGHT - 1)
2680                                 tb->insert_size[h + 1] = 0;
2681                 } else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
2682                         /*
2683                          * The tree needs to be grown, so this node S[h]
2684                          * which is the root node is split into two nodes,
2685                          * and a new node (S[h+1]) will be created to
2686                          * become the root node.
2687                          */
2688                         if (tb->blknum[h] > 1) {
2689
2690                                 RFALSE(h == MAX_HEIGHT - 1,
2691                                        "PAP-8355: attempt to create too high of a tree");
2692
2693                                 tb->insert_size[h + 1] =
2694                                     (DC_SIZE +
2695                                      KEY_SIZE) * (tb->blknum[h] - 1) +
2696                                     DC_SIZE;
2697                         } else if (h < MAX_HEIGHT - 1)
2698                                 tb->insert_size[h + 1] = 0;
2699                 } else
2700                         tb->insert_size[h + 1] =
2701                             (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
2702         }
2703
2704         ret = wait_tb_buffers_until_unlocked(tb);
2705         if (ret == CARRY_ON) {
2706                 if (FILESYSTEM_CHANGED_TB(tb)) {
2707                         wait_tb_buffers_run = 1;
2708                         ret = REPEAT_SEARCH;
2709                         goto repeat;
2710                 } else {
2711                         return CARRY_ON;
2712                 }
2713         } else {
2714                 wait_tb_buffers_run = 1;
2715                 goto repeat;
2716         }
2717
2718 repeat:
2719         /*
2720          * fix_nodes was unable to perform its calculation due to
2721          * filesystem got changed under us, lack of free disk space or i/o
2722          * failure. If the first is the case - the search will be
2723          * repeated. For now - free all resources acquired so far except
2724          * for the new allocated nodes
2725          */
2726         {
2727                 int i;
2728
2729                 /* Release path buffers. */
2730                 if (wait_tb_buffers_run) {
2731                         pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2732                 } else {
2733                         pathrelse(tb->tb_path);
2734                 }
2735                 /* brelse all resources collected for balancing */
2736                 for (i = 0; i < MAX_HEIGHT; i++) {
2737                         if (wait_tb_buffers_run) {
2738                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2739                                                                  tb->L[i]);
2740                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2741                                                                  tb->R[i]);
2742                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2743                                                                  tb->FL[i]);
2744                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2745                                                                  tb->FR[i]);
2746                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2747                                                                  tb->
2748                                                                  CFL[i]);
2749                                 reiserfs_restore_prepared_buffer(tb->tb_sb,
2750                                                                  tb->
2751                                                                  CFR[i]);
2752                         }
2753
2754                         brelse(tb->L[i]);
2755                         brelse(tb->R[i]);
2756                         brelse(tb->FL[i]);
2757                         brelse(tb->FR[i]);
2758                         brelse(tb->CFL[i]);
2759                         brelse(tb->CFR[i]);
2760
2761                         tb->L[i] = NULL;
2762                         tb->R[i] = NULL;
2763                         tb->FL[i] = NULL;
2764                         tb->FR[i] = NULL;
2765                         tb->CFL[i] = NULL;
2766                         tb->CFR[i] = NULL;
2767                 }
2768
2769                 if (wait_tb_buffers_run) {
2770                         for (i = 0; i < MAX_FEB_SIZE; i++) {
2771                                 if (tb->FEB[i])
2772                                         reiserfs_restore_prepared_buffer
2773                                             (tb->tb_sb, tb->FEB[i]);
2774                         }
2775                 }
2776                 return ret;
2777         }
2778
2779 }
2780
2781 void unfix_nodes(struct tree_balance *tb)
2782 {
2783         int i;
2784
2785         /* Release path buffers. */
2786         pathrelse_and_restore(tb->tb_sb, tb->tb_path);
2787
2788         /* brelse all resources collected for balancing */
2789         for (i = 0; i < MAX_HEIGHT; i++) {
2790                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
2791                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
2792                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
2793                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
2794                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
2795                 reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);
2796
2797                 brelse(tb->L[i]);
2798                 brelse(tb->R[i]);
2799                 brelse(tb->FL[i]);
2800                 brelse(tb->FR[i]);
2801                 brelse(tb->CFL[i]);
2802                 brelse(tb->CFR[i]);
2803         }
2804
2805         /* deal with list of allocated (used and unused) nodes */
2806         for (i = 0; i < MAX_FEB_SIZE; i++) {
2807                 if (tb->FEB[i]) {
2808                         b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
2809                         /*
2810                          * de-allocated block which was not used by
2811                          * balancing and bforget about buffer for it
2812                          */
2813                         brelse(tb->FEB[i]);
2814                         reiserfs_free_block(tb->transaction_handle, NULL,
2815                                             blocknr, 0);
2816                 }
2817                 if (tb->used[i]) {
2818                         /* release used as new nodes including a new root */
2819                         brelse(tb->used[i]);
2820                 }
2821         }
2822
2823         kfree(tb->vn_buf);
2824
2825 }