Add the rt linux 4.1.3-rt3 as base
[kvmfornfv.git] / kernel / fs / btrfs / ctree.c
1 /*
2  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include <linux/slab.h>
21 #include <linux/rbtree.h>
22 #include "ctree.h"
23 #include "disk-io.h"
24 #include "transaction.h"
25 #include "print-tree.h"
26 #include "locking.h"
27
/*
 * Forward declarations for the split/balance helpers defined later in this
 * file; they are mutually used by the public entry points above them.
 */
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *ins_key,
		      struct btrfs_path *path, int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst_buf,
			      struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot);
static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				 struct extent_buffer *eb);
44
45 struct btrfs_path *btrfs_alloc_path(void)
46 {
47         struct btrfs_path *path;
48         path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
49         return path;
50 }
51
52 /*
53  * set all locked nodes in the path to blocking locks.  This should
54  * be done before scheduling
55  */
56 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
57 {
58         int i;
59         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
60                 if (!p->nodes[i] || !p->locks[i])
61                         continue;
62                 btrfs_set_lock_blocking_rw(p->nodes[i], p->locks[i]);
63                 if (p->locks[i] == BTRFS_READ_LOCK)
64                         p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
65                 else if (p->locks[i] == BTRFS_WRITE_LOCK)
66                         p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
67         }
68 }
69
70 /*
71  * reset all the locked nodes in the path to spinning locks.
72  *
73  * held is used to keep lockdep happy, when lockdep is enabled
74  * we set held to a blocking lock before we go around and
75  * retake all the spinlocks in the path.  You can safely use NULL
76  * for held
77  */
noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
					struct extent_buffer *held, int held_rw)
{
	int i;

	if (held) {
		/*
		 * Make 'held' blocking first (keeps lockdep happy, see the
		 * comment above) and remember the blocking variant of its
		 * lock type so we can clear it again at the end.
		 */
		btrfs_set_lock_blocking_rw(held, held_rw);
		if (held_rw == BTRFS_WRITE_LOCK)
			held_rw = BTRFS_WRITE_LOCK_BLOCKING;
		else if (held_rw == BTRFS_READ_LOCK)
			held_rw = BTRFS_READ_LOCK_BLOCKING;
	}
	btrfs_set_path_blocking(p);

	/* retake spinning locks from the root level downwards */
	for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
		if (p->nodes[i] && p->locks[i]) {
			btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
			if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
				p->locks[i] = BTRFS_WRITE_LOCK;
			else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
				p->locks[i] = BTRFS_READ_LOCK;
		}
	}

	if (held)
		btrfs_clear_lock_blocking_rw(held, held_rw);
}
105
106 /* this also releases the path */
107 void btrfs_free_path(struct btrfs_path *p)
108 {
109         if (!p)
110                 return;
111         btrfs_release_path(p);
112         kmem_cache_free(btrfs_path_cachep, p);
113 }
114
115 /*
116  * path release drops references on the extent buffers in the path
117  * and it drops any locks held by this path
118  *
119  * It is safe to call this on paths that have no locks or extent buffers held.
120  */
121 noinline void btrfs_release_path(struct btrfs_path *p)
122 {
123         int i;
124
125         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
126                 p->slots[i] = 0;
127                 if (!p->nodes[i])
128                         continue;
129                 if (p->locks[i]) {
130                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
131                         p->locks[i] = 0;
132                 }
133                 free_extent_buffer(p->nodes[i]);
134                 p->nodes[i] = NULL;
135         }
136 }
137
138 /*
139  * safely gets a reference on the root node of a tree.  A lock
140  * is not taken, so a concurrent writer may put a different node
141  * at the root of the tree.  See btrfs_lock_root_node for the
142  * looping required.
143  *
144  * The extent buffer returned by this has a reference taken, so
145  * it won't disappear.  It may stop being the root of the tree
146  * at any time because there are no locks held.
147  */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was cow'ed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		/*
		 * refs hit zero: the buffer is being freed.  Wait out the
		 * grace period and reload root->node.
		 */
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
171
172 /* loop around taking references on and locking the root node of the
173  * tree until you end up with a lock on the root.  A locked buffer
174  * is returned, with a reference held.
175  */
176 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
177 {
178         struct extent_buffer *eb;
179
180         while (1) {
181                 eb = btrfs_root_node(root);
182                 btrfs_tree_lock(eb);
183                 if (eb == root->node)
184                         break;
185                 btrfs_tree_unlock(eb);
186                 free_extent_buffer(eb);
187         }
188         return eb;
189 }
190
191 /* loop around taking references on and locking the root node of the
192  * tree until you end up with a lock on the root.  A locked buffer
193  * is returned, with a reference held.
194  */
195 static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
196 {
197         struct extent_buffer *eb;
198
199         while (1) {
200                 eb = btrfs_root_node(root);
201                 btrfs_tree_read_lock(eb);
202                 if (eb == root->node)
203                         break;
204                 btrfs_tree_read_unlock(eb);
205                 free_extent_buffer(eb);
206         }
207         return eb;
208 }
209
210 /* cowonly root (everything not a reference counted cow subvolume), just get
211  * put onto a simple dirty list.  transaction.c walks this to make sure they
212  * get properly updated on disk.
213  */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
	/*
	 * Unlocked fast path: already marked dirty, or this root's
	 * dirtiness is not tracked at all.
	 */
	if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
	    !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
		return;

	spin_lock(&root->fs_info->trans_lock);
	/* re-check under the lock; only the winner moves the list entry */
	if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
		/* Want the extent tree to be the last on the list */
		if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
			list_move_tail(&root->dirty_list,
				       &root->fs_info->dirty_cowonly_roots);
		else
			list_move(&root->dirty_list,
				  &root->fs_info->dirty_cowonly_roots);
	}
	spin_unlock(&root->fs_info->trans_lock);
}
232
233 /*
234  * used by snapshot creation to make a copy of a root for a tree with
235  * a given objectid.  The buffer with the new root node is returned in
236  * cow_ret, and this func returns zero on success or a negative error code.
237  */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/* ref-counted roots may only be copied inside the running transaction */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	/* first key of @buf is handed to the block allocator as a hint */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* raw copy, then rewrite the header fields that must differ */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* reloc trees take the full-backref flavour of the ref bump */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);

	if (ret)
		return ret;

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
291
/*
 * Operation types recorded in the tree modification log.  The
 * _WHILE_FREEING variant is used for removals logged when an entire node
 * is freed or replaced (see tree_mod_log_free_eb/tree_mod_log_insert_root),
 * _WHILE_MOVING for removals overwritten by an intra-node key move (see
 * tree_mod_log_insert_move).
 */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
301
/* payload for MOD_LOG_MOVE_KEYS: destination slot and number of keys moved */
struct tree_mod_move {
	int dst_slot;
	int nr_items;
};

/* payload for MOD_LOG_ROOT_REPLACE: identity of the replaced root node */
struct tree_mod_root {
	u64 logical;
	u8 level;
};

/* one record in the tree modification log rb-tree, keyed by (index, seq) */
struct tree_mod_elem {
	struct rb_node node;
	u64 index;		/* shifted logical */
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct tree_mod_move move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
334
/*
 * Trivial wrappers around fs_info->tree_mod_log_lock.  Searches take the
 * read side; insertions into and removals from the tree_mod_log rb-tree
 * take the write side.
 */
static inline void tree_mod_log_read_lock(struct btrfs_fs_info *fs_info)
{
	read_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_read_unlock(struct btrfs_fs_info *fs_info)
{
	read_unlock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_lock(struct btrfs_fs_info *fs_info)
{
	write_lock(&fs_info->tree_mod_log_lock);
}

static inline void tree_mod_log_write_unlock(struct btrfs_fs_info *fs_info)
{
	write_unlock(&fs_info->tree_mod_log_lock);
}
354
355 /*
356  * Pull a new tree mod seq number for our operation.
357  */
/* atomically bump and return the fs-wide tree mod sequence counter */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
362
363 /*
364  * This adds a new blocker to the tree mod log's blocker list if the @elem
365  * passed does not already have a sequence number set. So when a caller expects
366  * to record tree modifications, it should ensure to set elem->seq to zero
367  * before calling btrfs_get_tree_mod_seq.
368  * Returns a fresh, unused tree log modification sequence number, even if no new
369  * blocker was added.
370  */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct seq_list *elem)
{
	/* lock order: tree_mod_log_write_lock, then tree_mod_seq_lock */
	tree_mod_log_write_lock(fs_info);
	spin_lock(&fs_info->tree_mod_seq_lock);
	/* register as a blocker only once; elem->seq != 0 means already listed */
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);
	tree_mod_log_write_unlock(fs_info);

	return elem->seq;
}
385
/*
 * Drop the blocker registered by btrfs_get_tree_mod_seq and prune every log
 * entry that is no longer protected by a remaining blocker, i.e. whose seq
 * is not above the smallest seq still on the tree_mod_seq_list.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* never registered (seq 0): nothing to drop */
	if (!seq_putting)
		return;

	spin_lock(&fs_info->tree_mod_seq_lock);
	list_del(&elem->list);
	elem->seq = 0;

	/* find the smallest seq among the remaining blockers */
	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				spin_unlock(&fs_info->tree_mod_seq_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}
	spin_unlock(&fs_info->tree_mod_seq_lock);

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tree_mod_log_write_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = container_of(node, struct tree_mod_elem, node);
		if (tm->seq > min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	tree_mod_log_write_unlock(fs_info);
}
435
436 /*
437  * key order of the log:
438  *       index -> sequence
439  *
440  * the index is the shifted logical of the *new* root node for root replace
441  * operations, or the shifted logical of the affected block for all other
442  * operations.
443  *
444  * Note: must be called with write lock (tree_mod_log_write_lock).
445  */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	BUG_ON(!tm);

	/* the seq is assigned here, under the write lock, to keep ordering */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	/* descend to the insertion point, ordered by (index, seq) */
	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = container_of(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->index < tm->index)
			new = &((*new)->rb_left);
		else if (cur->index > tm->index)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
479
480 /*
481  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
482  * returns zero with the tree_mod_log_lock acquired. The caller must hold
483  * this until all tree mod log insertions are recorded in the rb tree and then
484  * call tree_mod_log_write_unlock() to release.
485  */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				    struct extent_buffer *eb) {
	smp_mb();
	/* no blockers registered: logging can be skipped */
	if (list_empty(&(fs_info)->tree_mod_seq_list))
		return 1;
	/* leaves are never logged */
	if (eb && btrfs_header_level(eb) == 0)
		return 1;

	tree_mod_log_write_lock(fs_info);
	/* re-check under the write lock; drop it again if logging is moot */
	if (list_empty(&(fs_info)->tree_mod_seq_list)) {
		tree_mod_log_write_unlock(fs_info);
		return 1;
	}

	/* NOTE: returning 0 leaves tree_mod_log_write_lock held (see above) */
	return 0;
}
502
503 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
504 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
505                                     struct extent_buffer *eb)
506 {
507         smp_mb();
508         if (list_empty(&(fs_info)->tree_mod_seq_list))
509                 return 0;
510         if (eb && btrfs_header_level(eb) == 0)
511                 return 0;
512
513         return 1;
514 }
515
/*
 * Allocate and fill a log element describing the single-slot operation @op
 * on node @eb.  Returns NULL on allocation failure; the element is not yet
 * linked into the log rb-tree (RB_CLEAR_NODE marks it unlinked).
 */
static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
		    enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm)
		return NULL;

	/* rb-tree key: the block's logical address in page-shifted units */
	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	/* for KEY_ADD there is no old key/blockptr to remember */
	if (op != MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}
538
/*
 * Record a single-slot modification of @eb in the tree mod log.  Returns 0
 * when logging is unnecessary (or the need disappeared while we allocated),
 * -ENOMEM on allocation failure, otherwise the rb-tree insertion result.
 */
static noinline int
tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
			struct extent_buffer *eb, int slot,
			enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	/* allocate before taking the log lock */
	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* returns 1 without the lock held: logging became unnecessary */
	if (tree_mod_dont_log(fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	/* tree_mod_log_write_lock is held from tree_mod_dont_log here */
	ret = __tree_mod_log_insert(fs_info, tm);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		kfree(tm);

	return ret;
}
566
/*
 * Log a move of @nr_items keys within node @eb from @src_slot to @dst_slot.
 * Keys overwritten by a move towards the front are logged individually as
 * REMOVE_WHILE_MOVING records before the MOVE_KEYS record itself.
 * Returns 0 on success or when logging is unnecessary, negative errno on
 * failure (all partially-inserted records are unwound).
 */
static noinline int
tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *eb, int dst_slot, int src_slot,
			 int nr_items, gfp_t flags)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;		/* set once tree_mod_dont_log left us locked */

	if (!tree_mod_need_log(fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->index = eb->start >> PAGE_CACHE_SHIFT;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	/* pre-allocate the removal records before taking the log lock */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(fs_info, tm);
	if (ret)
		goto free_tms;
	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;
free_tms:
	/* unlink anything already inserted, then free everything */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
641
/*
 * Insert a pre-built array of @nritems removal records, highest slot first.
 * On failure the records already inserted are unlinked again.  The caller
 * must hold tree_mod_log_write_lock.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* unwind the insertions done so far (slots > i) */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
662
/*
 * Log the replacement of @old_root by @new_root.  When @log_removal is set
 * and the old root is a node (level > 0), removal records for all of its
 * child pointers are logged first, then the ROOT_REPLACE record, which is
 * keyed on the NEW root's logical address.
 */
static noinline int
tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
			 struct extent_buffer *old_root,
			 struct extent_buffer *new_root, gfp_t flags,
			 int log_removal)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* pre-allocate everything before taking the log lock */
	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  flags);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), flags);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	/* tree_mod_log_write_lock is held from tree_mod_dont_log here */
	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
733
/*
 * Search the mod log for records of the block at @start with seq >= @min_seq.
 * With @smallest set, return the record with the lowest qualifying seq,
 * otherwise the one with the highest.  Returns NULL if nothing matches.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;
	u64 index = start >> PAGE_CACHE_SHIFT;

	tree_mod_log_read_lock(fs_info);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = container_of(node, struct tree_mod_elem, node);
		if (cur->index < index) {
			node = node->rb_left;
		} else if (cur->index > index) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* exact match on min_seq: cannot do better */
			found = cur;
			break;
		}
	}
	tree_mod_log_read_unlock(fs_info);

	return found;
}
776
777 /*
778  * this returns the element from the log with the smallest time sequence
779  * value that's in the log (the oldest log item). any element with a time
780  * sequence lower than min_seq will be ignored.
781  */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	/* smallest == 1: pick the oldest qualifying record */
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
788
789 /*
790  * this returns the element from the log with the largest time sequence
791  * value that's in the log (the most recent log item). any element with
792  * a time sequence lower than min_seq will be ignored.
793  */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	/* smallest == 0: pick the most recent qualifying record */
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
799
/*
 * Log that @nr_items node pointers were copied from @src (from @src_offset)
 * into @dst (at @dst_offset): one KEY_REMOVE record per source slot plus one
 * KEY_ADD record per destination slot.  Returns 0 on success or when logging
 * is unnecessary; on failure everything inserted so far is unwound.
 */
static noinline int
tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;		/* set once tree_mod_dont_log left us locked */

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* leaf-to-leaf copies are not logged */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* one array, adds in the first half, removes in the second */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	/* remove-then-add per slot, so replay sees a consistent order */
	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return 0;

free_tms:
	/* unlink anything already inserted, then free everything */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		tree_mod_log_write_unlock(fs_info);
	kfree(tm_list);

	return ret;
}
870
871 static inline void
872 tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
873                      int dst_offset, int src_offset, int nr_items)
874 {
875         int ret;
876         ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset,
877                                        nr_items, GFP_NOFS);
878         BUG_ON(ret < 0);
879 }
880
881 static noinline void
882 tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
883                           struct extent_buffer *eb, int slot, int atomic)
884 {
885         int ret;
886
887         ret = tree_mod_log_insert_key(fs_info, eb, slot,
888                                         MOD_LOG_KEY_REPLACE,
889                                         atomic ? GFP_ATOMIC : GFP_NOFS);
890         BUG_ON(ret < 0);
891 }
892
/*
 * Record the freeing of an entire tree node in the tree mod log: every
 * slot of @eb is logged as MOD_LOG_KEY_REMOVE_WHILE_FREEING so a later
 * time_seq rewind can reconstruct the node's contents.
 *
 * Returns 0 on success or when no logging is needed, -ENOMEM on
 * allocation failure.
 */
static noinline int
tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	/* level 0 (leaf) blocks are not logged, only nodes */
	if (btrfs_header_level(eb) == 0)
		return 0;

	/* cheap check before doing any allocations */
	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	/*
	 * Allocate all log elements up front so the insertion phase below
	 * cannot fail with -ENOMEM while the log lock is held.
	 */
	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/*
	 * tree_mod_dont_log() returns nonzero when logging isn't needed;
	 * when it returns 0 it leaves the log write lock held (we release
	 * it with tree_mod_log_write_unlock() below).
	 */
	if (tree_mod_dont_log(fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	tree_mod_log_write_unlock(fs_info);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	/*
	 * NOTE(review): this assumes __tree_mod_log_free_eb() removed any
	 * partially inserted elements from the log tree on failure, so a
	 * plain kfree() of every element is safe here — verify against its
	 * definition (outside this view).
	 */
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
939
940 static noinline void
941 tree_mod_log_set_root_pointer(struct btrfs_root *root,
942                               struct extent_buffer *new_root_node,
943                               int log_removal)
944 {
945         int ret;
946         ret = tree_mod_log_insert_root(root->fs_info, root->node,
947                                        new_root_node, GFP_NOFS, log_removal);
948         BUG_ON(ret < 0);
949 }
950
/*
 * check if the tree block can be shared by multiple trees
 *
 * Returns 1 if @buf may be referenced by more than one tree, 0 if it is
 * known to be exclusive to @root.
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
			      struct extent_buffer *buf)
{
	/*
	 * Tree blocks not in reference counted trees and tree roots
	 * are never shared. If a block was allocated after the last
	 * snapshot and the block was not allocated by tree relocation,
	 * we know the block is not shared.
	 */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    buf != root->node && buf != root->commit_root &&
	    (btrfs_header_generation(buf) <=
	     btrfs_root_last_snapshot(&root->root_item) ||
	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
		return 1;
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
	/* pre-mixed-backref blocks can't prove exclusivity; assume shared */
	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
	    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
		return 1;
#endif
	return 0;
}
976
/*
 * Fix up extent backrefs when @buf is about to be replaced by its COW
 * copy @cow.  On success, *last_ref is set to 1 when @buf lost its last
 * tree reference (the caller may then free the block).
 *
 * Returns 0 on success or a negative errno; backref inconsistencies that
 * can only be -ENOMEM from ref updates hit BUG_ON.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
				       struct btrfs_root *root,
				       struct extent_buffer *buf,
				       struct extent_buffer *cow,
				       int *last_ref)
{
	u64 refs;
	u64 owner;
	u64 flags;
	u64 new_flags = 0;
	int ret;

	/*
	 * Backrefs update rules:
	 *
	 * Always use full backrefs for extent pointers in tree blocks
	 * allocated by tree relocation.
	 *
	 * If a shared tree block is no longer referenced by its owner
	 * tree (btrfs_header_owner(buf) == root->root_key.objectid),
	 * use full backrefs for extent pointers in tree block.
	 *
	 * If a tree block is being relocated
	 * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
	 * use full backrefs for extent pointers in tree block.
	 * The reason for this is some operations (such as drop tree)
	 * are only allowed for blocks that use full backrefs.
	 */

	if (btrfs_block_can_be_shared(root, buf)) {
		/* possibly shared: read real refcount/flags from extent tree */
		ret = btrfs_lookup_extent_info(trans, root, buf->start,
					       btrfs_header_level(buf), 1,
					       &refs, &flags);
		if (ret)
			return ret;
		if (refs == 0) {
			/* a referenced block must have refs: corruption */
			ret = -EROFS;
			btrfs_std_error(root->fs_info, ret);
			return ret;
		}
	} else {
		refs = 1;
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
		else
			flags = 0;
	}

	owner = btrfs_header_owner(buf);
	BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
	       !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

	if (refs > 1) {
		/* @buf remains shared after the COW */
		if ((owner == root->root_key.objectid ||
		     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
		    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
			/* convert the old block to full backrefs */
			ret = btrfs_inc_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID) {
				ret = btrfs_dec_ref(trans, root, buf, 0);
				BUG_ON(ret); /* -ENOMEM */
				ret = btrfs_inc_ref(trans, root, cow, 1);
				BUG_ON(ret); /* -ENOMEM */
			}
			new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
		} else {

			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
		}
		if (new_flags != 0) {
			int level = btrfs_header_level(buf);

			ret = btrfs_set_disk_extent_flags(trans, root,
							  buf->start,
							  buf->len,
							  new_flags, level, 0);
			if (ret)
				return ret;
		}
	} else {
		/* last tree reference: move refs over to @cow and drop @buf */
		if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
			if (root->root_key.objectid ==
			    BTRFS_TREE_RELOC_OBJECTID)
				ret = btrfs_inc_ref(trans, root, cow, 1);
			else
				ret = btrfs_inc_ref(trans, root, cow, 0);
			BUG_ON(ret); /* -ENOMEM */
			ret = btrfs_dec_ref(trans, root, buf, 1);
			BUG_ON(ret); /* -ENOMEM */
		}
		clean_tree_block(trans, root->fs_info, buf);
		*last_ref = 1;
	}
	return 0;
}
1080
/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 *
 * Returns 0 on success or a negative errno; @buf must be locked by the
 * caller (asserted below).
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
			     struct btrfs_root *root,
			     struct extent_buffer *buf,
			     struct extent_buffer *parent, int parent_slot,
			     struct extent_buffer **cow_ret,
			     u64 search_start, u64 empty_size)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *cow;
	int level, ret;
	int last_ref = 0;
	int unlock_orig = 0;
	u64 parent_start;

	/* if the caller wants the result in place of @buf, unlock it at end */
	if (*cow_ret == buf)
		unlock_orig = 1;

	btrfs_assert_tree_locked(buf);

	/* ref-counted trees may only be COWed in the running transaction */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	level = btrfs_header_level(buf);

	/* first key of the block, passed to the allocator as a hint */
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	/* only the reloc tree hands a parent hint to the allocator */
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
		if (parent)
			parent_start = parent->start;
		else
			parent_start = 0;
	} else
		parent_start = 0;

	cow = btrfs_alloc_tree_block(trans, root, parent_start,
			root->root_key.objectid, &disk_key, level,
			search_start, empty_size);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* cow is set to blocking by btrfs_init_new_buffer */

	/* clone @buf's contents, then rewrite the header for the copy */
	copy_extent_buffer(cow, buf, 0, 0, cow->len);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, root->root_key.objectid);

	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	/* move/convert extent backrefs from @buf to @cow */
	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}

	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
		ret = btrfs_reloc_cow_block(trans, root, buf, cow);
		if (ret)
			return ret;
	}

	if (buf == root->node) {
		/* COWing the root node: swing root->node over to the copy */
		WARN_ON(parent && parent != buf);
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
		    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
			parent_start = buf->start;
		else
			parent_start = 0;

		extent_buffer_get(cow);
		tree_mod_log_set_root_pointer(root, cow, 1);
		rcu_assign_pointer(root->node, cow);

		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
		free_extent_buffer(buf);
		add_root_to_dirty_list(root);
	} else {
		/* interior block: repoint the parent's slot at the copy */
		if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
			parent_start = parent->start;
		else
			parent_start = 0;

		WARN_ON(trans->transid != btrfs_header_generation(parent));
		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
		btrfs_set_node_blockptr(parent, parent_slot,
					cow->start);
		btrfs_set_node_ptr_generation(parent, parent_slot,
					      trans->transid);
		btrfs_mark_buffer_dirty(parent);
		if (last_ref) {
			/* log the whole node before it is freed */
			ret = tree_mod_log_free_eb(root->fs_info, buf);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				return ret;
			}
		}
		btrfs_free_tree_block(trans, root, buf, parent_start,
				      last_ref);
	}
	if (unlock_orig)
		btrfs_tree_unlock(buf);
	free_extent_buffer_stale(buf);
	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
1213
/*
 * Find the oldest logged predecessor of the given root, ignoring entries
 * older than time_seq.
 *
 * Returns the tree mod log element describing the oldest relevant state
 * (possibly a MOD_LOG_ROOT_REPLACE entry pointing at an older root), or
 * NULL when nothing was logged for this root at @time_seq.
 */
static struct tree_mod_elem *
__tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
			   struct extent_buffer *eb_root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	int looped = 0;

	/* time_seq == 0 means "current state", nothing to look up */
	if (!time_seq)
		return NULL;

	/*
	 * the very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). this has the index of the *new*
	 * root, making it the very first operation that's logged for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * if there are no tree operation for the oldest root, we simply
		 * return it. this should only happen if that (old) root is at
		 * level 0.
		 */
		if (!tm)
			break;

		/*
		 * if there's an operation that's not a root replacement, we
		 * found the oldest version of our root. normally, we'll find a
		 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != MOD_LOG_ROOT_REPLACE)
			break;

		/* keep walking back through the chain of replaced roots */
		found = tm;
		root_logical = tm->old_root.logical;
		looped = 1;
	}

	/* if there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}
1267
/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewinded (until we reach something older than
 * time_seq).
 *
 * @n tracks the item count valid at @time_seq: replaying a REMOVE
 * re-inserts an item (n++), replaying an ADD removes one (n--).  The
 * corrected count is written back to the header at the end.
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
		      u64 time_seq, struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);

	n = btrfs_header_nritems(eb);
	tree_mod_log_read_lock(fs_info);
	while (tm && tm->seq >= time_seq) {
		/*
		 * all the operations are recorded with the operator used for
		 * the modification. as we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			/* Fallthrough */
		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case MOD_LOG_KEY_REMOVE:
			/* undo a removal: restore the logged key pointer */
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			break;
		case MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case MOD_LOG_KEY_ADD:
			/* if a move operation is needed it's in the log */
			n--;
			break;
		case MOD_LOG_MOVE_KEYS:
			/* undo a move by copying in the opposite direction */
			o_dst = btrfs_node_key_ptr_offset(tm->slot);
			o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			break;
		case MOD_LOG_ROOT_REPLACE:
			/*
			 * this operation is special. for roots, this must be
			 * handled explicitly before rewinding.
			 * for non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A gets empty and
			 * B is promoted to the new root. in the mod log, we'll
			 * have a root-replace operation for B, a tree block
			 * that is no root. we simply ignore that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = container_of(next, struct tree_mod_elem, node);
		/* stop once we leave this block's run of log entries */
		if (tm->index != first_tm->index)
			break;
	}
	tree_mod_log_read_unlock(fs_info);
	btrfs_set_header_nritems(eb, n);
}
1343
/*
 * Called with eb read locked. If the buffer cannot be rewinded, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 *
 * Returns NULL on allocation failure (with @eb unlocked and released).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
		    struct extent_buffer *eb, u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	/* time_seq == 0 means the caller wants the current state */
	if (!time_seq)
		return eb;

	/* leaves are never logged, only nodes */
	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	btrfs_set_path_blocking(path);
	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);

	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		/*
		 * the block was freed in the meantime: rebuild it from
		 * scratch on a dummy buffer, the rewind below replays all
		 * its former items from the log
		 */
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		/* block still live: rewind on a private clone */
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock_blocking(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	/* hand the lock/reference over from @eb to @eb_rewin */
	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
	btrfs_tree_read_unlock_blocking(eb);
	free_extent_buffer(eb);

	extent_buffer_get(eb_rewin);
	btrfs_tree_read_lock(eb_rewin);
	__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

	return eb_rewin;
}
1405
1406 /*
1407  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1408  * value. If there are no changes, the current root->root_node is returned. If
1409  * anything changed in between, there's a fresh buffer allocated on which the
1410  * rewind operations are done. In any case, the returned buffer is read locked.
1411  * Returns NULL on error (with no locks held).
1412  */
1413 static inline struct extent_buffer *
1414 get_old_root(struct btrfs_root *root, u64 time_seq)
1415 {
1416         struct tree_mod_elem *tm;
1417         struct extent_buffer *eb = NULL;
1418         struct extent_buffer *eb_root;
1419         struct extent_buffer *old;
1420         struct tree_mod_root *old_root = NULL;
1421         u64 old_generation = 0;
1422         u64 logical;
1423
1424         eb_root = btrfs_read_lock_root_node(root);
1425         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1426         if (!tm)
1427                 return eb_root;
1428
1429         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1430                 old_root = &tm->old_root;
1431                 old_generation = tm->generation;
1432                 logical = old_root->logical;
1433         } else {
1434                 logical = eb_root->start;
1435         }
1436
1437         tm = tree_mod_log_search(root->fs_info, logical, time_seq);
1438         if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1439                 btrfs_tree_read_unlock(eb_root);
1440                 free_extent_buffer(eb_root);
1441                 old = read_tree_block(root, logical, 0);
1442                 if (WARN_ON(!old || !extent_buffer_uptodate(old))) {
1443                         free_extent_buffer(old);
1444                         btrfs_warn(root->fs_info,
1445                                 "failed to read tree block %llu from get_old_root", logical);
1446                 } else {
1447                         eb = btrfs_clone_extent_buffer(old);
1448                         free_extent_buffer(old);
1449                 }
1450         } else if (old_root) {
1451                 btrfs_tree_read_unlock(eb_root);
1452                 free_extent_buffer(eb_root);
1453                 eb = alloc_dummy_extent_buffer(root->fs_info, logical);
1454         } else {
1455                 btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1456                 eb = btrfs_clone_extent_buffer(eb_root);
1457                 btrfs_tree_read_unlock_blocking(eb_root);
1458                 free_extent_buffer(eb_root);
1459         }
1460
1461         if (!eb)
1462                 return NULL;
1463         extent_buffer_get(eb);
1464         btrfs_tree_read_lock(eb);
1465         if (old_root) {
1466                 btrfs_set_header_bytenr(eb, eb->start);
1467                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1468                 btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
1469                 btrfs_set_header_level(eb, old_root->level);
1470                 btrfs_set_header_generation(eb, old_generation);
1471         }
1472         if (tm)
1473                 __tree_mod_log_rewind(root->fs_info, eb, time_seq, tm);
1474         else
1475                 WARN_ON(btrfs_header_level(eb) != 0);
1476         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(root));
1477
1478         return eb;
1479 }
1480
1481 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1482 {
1483         struct tree_mod_elem *tm;
1484         int level;
1485         struct extent_buffer *eb_root = btrfs_root_node(root);
1486
1487         tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq);
1488         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1489                 level = tm->old_root.level;
1490         } else {
1491                 level = btrfs_header_level(eb_root);
1492         }
1493         free_extent_buffer(eb_root);
1494
1495         return level;
1496 }
1497
1498 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1499                                    struct btrfs_root *root,
1500                                    struct extent_buffer *buf)
1501 {
1502         if (btrfs_test_is_dummy_root(root))
1503                 return 0;
1504
1505         /* ensure we can see the force_cow */
1506         smp_rmb();
1507
1508         /*
1509          * We do not need to cow a block if
1510          * 1) this block is not created or changed in this transaction;
1511          * 2) this block does not belong to TREE_RELOC tree;
1512          * 3) the root is not forced COW.
1513          *
1514          * What is forced COW:
1515          *    when we create snapshot during commiting the transaction,
1516          *    after we've finished coping src root, we must COW the shared
1517          *    block to ensure the metadata consistency.
1518          */
1519         if (btrfs_header_generation(buf) == trans->transid &&
1520             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1521             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1522               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1523             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1524                 return 0;
1525         return 1;
1526 }
1527
/*
 * cows a single block, see __btrfs_cow_block for the real work.
 * This version of it has extra checks so that a block isn't cow'd more than
 * once per transaction, as long as it hasn't been written yet
 *
 * Returns 0 on success (with *cow_ret pointing at the usable copy, which
 * may be @buf itself if no COW was needed) or a negative errno.
 */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
		    struct btrfs_root *root, struct extent_buffer *buf,
		    struct extent_buffer *parent, int parent_slot,
		    struct extent_buffer **cow_ret)
{
	u64 search_start;
	int ret;

	/* sanity: COW is only valid inside the running transaction */
	if (trans->transaction != root->fs_info->running_transaction)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid,
		       root->fs_info->running_transaction->transid);

	if (trans->transid != root->fs_info->generation)
		WARN(1, KERN_CRIT "trans %llu running %llu\n",
		       trans->transid, root->fs_info->generation);

	if (!should_cow_block(trans, root, buf)) {
		*cow_ret = buf;
		return 0;
	}

	/* allocation hint: round down to the enclosing 1GiB boundary */
	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);

	if (parent)
		btrfs_set_lock_blocking(parent);
	btrfs_set_lock_blocking(buf);

	ret = __btrfs_cow_block(trans, root, buf, parent,
				 parent_slot, cow_ret, search_start, 0);

	trace_btrfs_cow_block(root, buf, *cow_ret);

	return ret;
}
1568
1569 /*
1570  * helper function for defrag to decide if two blocks pointed to by a
1571  * node are actually close by
1572  */
1573 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1574 {
1575         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1576                 return 1;
1577         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1578                 return 1;
1579         return 0;
1580 }
1581
1582 /*
1583  * compare two keys in a memcmp fashion
1584  */
1585 static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
1586 {
1587         struct btrfs_key k1;
1588
1589         btrfs_disk_key_to_cpu(&k1, disk);
1590
1591         return btrfs_comp_cpu_keys(&k1, k2);
1592 }
1593
1594 /*
1595  * same as comp_keys only with two btrfs_key's
1596  */
1597 int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
1598 {
1599         if (k1->objectid > k2->objectid)
1600                 return 1;
1601         if (k1->objectid < k2->objectid)
1602                 return -1;
1603         if (k1->type > k2->type)
1604                 return 1;
1605         if (k1->type < k2->type)
1606                 return -1;
1607         if (k1->offset > k2->offset)
1608                 return 1;
1609         if (k1->offset < k2->offset)
1610                 return -1;
1611         return 0;
1612 }
1613
1614 /*
1615  * this is used by the defrag code to go through all the
1616  * leaves pointed to by a node and reallocate them so that
1617  * disk order is close to key order
1618  */
1619 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1620                        struct btrfs_root *root, struct extent_buffer *parent,
1621                        int start_slot, u64 *last_ret,
1622                        struct btrfs_key *progress)
1623 {
1624         struct extent_buffer *cur;
1625         u64 blocknr;
1626         u64 gen;
1627         u64 search_start = *last_ret;
1628         u64 last_block = 0;
1629         u64 other;
1630         u32 parent_nritems;
1631         int end_slot;
1632         int i;
1633         int err = 0;
1634         int parent_level;
1635         int uptodate;
1636         u32 blocksize;
1637         int progress_passed = 0;
1638         struct btrfs_disk_key disk_key;
1639
1640         parent_level = btrfs_header_level(parent);
1641
1642         /* defrag must run inside the currently running transaction */
1643         WARN_ON(trans->transaction != root->fs_info->running_transaction);
1644         WARN_ON(trans->transid != root->fs_info->generation);
1645
1646         parent_nritems = btrfs_header_nritems(parent);
1647         blocksize = root->nodesize;
1648         end_slot = parent_nritems - 1;
1649
1650         /* nothing to relocate in a node with zero or one pointer */
1651         if (parent_nritems <= 1)
1652                 return 0;
1653
1654         btrfs_set_lock_blocking(parent);
1655
1656         for (i = start_slot; i <= end_slot; i++) {
1657                 int close = 1;
1658
1659                 /* skip slots whose keys are below the resume point */
1660                 btrfs_node_key(parent, &disk_key, i);
1661                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1662                         continue;
1663
1664                 progress_passed = 1;
1665                 blocknr = btrfs_node_blockptr(parent, i);
1666                 gen = btrfs_node_ptr_generation(parent, i);
1667                 if (last_block == 0)
1668                         last_block = blocknr;
1669
1670                 /*
1671                  * leave the block alone if it is already near either of
1672                  * its neighbours on disk
1673                  */
1674                 if (i > 0) {
1675                         other = btrfs_node_blockptr(parent, i - 1);
1676                         close = close_blocks(blocknr, other, blocksize);
1677                 }
1678                 if (!close && i < end_slot) {
1679                         other = btrfs_node_blockptr(parent, i + 1);
1680                         close = close_blocks(blocknr, other, blocksize);
1681                 }
1682                 if (close) {
1683                         last_block = blocknr;
1684                         continue;
1685                 }
1686
1687                 /* get the child buffer, reading it from disk if needed */
1688                 cur = btrfs_find_tree_block(root->fs_info, blocknr);
1689                 if (cur)
1690                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1691                 else
1692                         uptodate = 0;
1693                 if (!cur || !uptodate) {
1694                         if (!cur) {
1695                                 cur = read_tree_block(root, blocknr, gen);
1696                                 if (!cur || !extent_buffer_uptodate(cur)) {
1697                                         free_extent_buffer(cur);
1698                                         return -EIO;
1699                                 }
1700                         } else if (!uptodate) {
1701                                 err = btrfs_read_buffer(cur, gen);
1702                                 if (err) {
1703                                         free_extent_buffer(cur);
1704                                         return err;
1705                                 }
1706                         }
1707                 }
1708                 if (search_start == 0)
1709                         search_start = last_block;
1710
1711                 /*
1712                  * COW the child near search_start; the hint window shrinks
1713                  * as we approach the end of the node
1714                  */
1715                 btrfs_tree_lock(cur);
1716                 btrfs_set_lock_blocking(cur);
1717                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1718                                         &cur, search_start,
1719                                         min(16 * blocksize,
1720                                             (end_slot - i) * blocksize));
1721                 if (err) {
1722                         btrfs_tree_unlock(cur);
1723                         free_extent_buffer(cur);
1724                         break;
1725                 }
1726                 /* remember where the new copy landed for the next pass */
1727                 search_start = cur->start;
1728                 last_block = cur->start;
1729                 *last_ret = search_start;
1730                 btrfs_tree_unlock(cur);
1731                 free_extent_buffer(cur);
1732         }
1733         return err;
1734 }
1722
1723 /*
1724  * The leaf data grows from end-to-front in the node.
1725  * this returns the address of the start of the last item,
1726  * which is the stop of the leaf data stack
1727  */
1728 static inline unsigned int leaf_data_end(struct btrfs_root *root,
1729                                          struct extent_buffer *leaf)
1730 {
1731         u32 nr = btrfs_header_nritems(leaf);
1732         if (nr == 0)
1733                 return BTRFS_LEAF_DATA_SIZE(root);
1734         return btrfs_item_offset_nr(leaf, nr - 1);
1735 }
1736
1737
1738 /*
1739  * search for key in the extent_buffer.  The items start at offset p,
1740  * and they are item_size apart.  There are 'max' items in p.
1741  *
1742  * the slot in the array is returned via slot, and it points to
1743  * the place where you would insert key if it is not found in
1744  * the array.
1745  *
1746  * slot may point to max if the key is bigger than all of the keys
1747  */
1748 static noinline int generic_bin_search(struct extent_buffer *eb,
1749                                        unsigned long p,
1750                                        int item_size, struct btrfs_key *key,
1751                                        int max, int *slot)
1752 {
1753         int low = 0;
1754         int high = max;
1755         int mid;
1756         int ret;
1757         struct btrfs_disk_key *tmp = NULL;
1758         struct btrfs_disk_key unaligned;
1759         unsigned long offset;
1760         /* cached kmap of the page currently holding the probed key */
1761         char *kaddr = NULL;
1762         unsigned long map_start = 0;
1763         unsigned long map_len = 0;
1764         int err;
1765
1766         while (low < high) {
1767                 mid = (low + high) / 2;
1768                 offset = p + mid * item_size;
1769
1770                 /* remap only when the key falls outside the cached window */
1771                 if (!kaddr || offset < map_start ||
1772                     (offset + sizeof(struct btrfs_disk_key)) >
1773                     map_start + map_len) {
1774
1775                         err = map_private_extent_buffer(eb, offset,
1776                                                 sizeof(struct btrfs_disk_key),
1777                                                 &kaddr, &map_start, &map_len);
1778
1779                         if (!err) {
1780                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
1781                                                         map_start);
1782                         } else {
1783                                 /*
1784                                  * key straddles a page boundary: copy it out
1785                                  * into an aligned local instead
1786                                  */
1787                                 read_extent_buffer(eb, &unaligned,
1788                                                    offset, sizeof(unaligned));
1789                                 tmp = &unaligned;
1790                         }
1791
1792                 } else {
1793                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
1794                                                         map_start);
1795                 }
1796                 ret = comp_keys(tmp, key);
1797
1798                 if (ret < 0)
1799                         low = mid + 1;
1800                 else if (ret > 0)
1801                         high = mid;
1802                 else {
1803                         /* exact match */
1804                         *slot = mid;
1805                         return 0;
1806                 }
1807         }
1808         /* not found: *slot is where the key would be inserted (may == max) */
1809         *slot = low;
1810         return 1;
1811 }
1804
1805 /*
1806  * simple bin_search frontend that does the right thing for
1807  * leaves vs nodes
1808  */
1809 static int bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1810                       int level, int *slot)
1811 {
1812         if (level == 0)
1813                 return generic_bin_search(eb,
1814                                           offsetof(struct btrfs_leaf, items),
1815                                           sizeof(struct btrfs_item),
1816                                           key, btrfs_header_nritems(eb),
1817                                           slot);
1818         else
1819                 return generic_bin_search(eb,
1820                                           offsetof(struct btrfs_node, ptrs),
1821                                           sizeof(struct btrfs_key_ptr),
1822                                           key, btrfs_header_nritems(eb),
1823                                           slot);
1824 }
1825
/*
 * Public wrapper around bin_search() for callers outside this file.
 * Returns 0 and sets *slot on an exact match, 1 with *slot at the
 * insertion point otherwise.
 */
1826 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
1827                      int level, int *slot)
1828 {
1829         return bin_search(eb, key, level, slot);
1830 }
1831
/*
 * Account 'size' more bytes to the root item's byte usage, under the
 * root's accounting spinlock.
 */
1832 static void root_add_used(struct btrfs_root *root, u32 size)
1833 {
1834         spin_lock(&root->accounting_lock);
1835         btrfs_set_root_used(&root->root_item,
1836                             btrfs_root_used(&root->root_item) + size);
1837         spin_unlock(&root->accounting_lock);
1838 }
1839
/*
 * Remove 'size' bytes from the root item's byte usage, under the
 * root's accounting spinlock.  Counterpart of root_add_used().
 */
1840 static void root_sub_used(struct btrfs_root *root, u32 size)
1841 {
1842         spin_lock(&root->accounting_lock);
1843         btrfs_set_root_used(&root->root_item,
1844                             btrfs_root_used(&root->root_item) - size);
1845         spin_unlock(&root->accounting_lock);
1846 }
1847
1848 /* given a node and slot number, this reads the blocks it points to.  The
1849  * extent buffer is returned with a reference taken (but unlocked).
1850  * NULL is returned on error.
1851  */
1852 static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root,
1853                                    struct extent_buffer *parent, int slot)
1854 {
1855         int level = btrfs_header_level(parent);
1856         struct extent_buffer *eb;
1857
1858         if (slot < 0)
1859                 return NULL;
1860         if (slot >= btrfs_header_nritems(parent))
1861                 return NULL;
1862
1863         BUG_ON(level == 0);
1864
1865         eb = read_tree_block(root, btrfs_node_blockptr(parent, slot),
1866                              btrfs_node_ptr_generation(parent, slot));
1867         if (eb && !extent_buffer_uptodate(eb)) {
1868                 free_extent_buffer(eb);
1869                 eb = NULL;
1870         }
1871
1872         return eb;
1873 }
1874
1875 /*
1876  * node level balancing, used to make sure nodes are in proper order for
1877  * item deletion.  We balance from the top down, so we have to make sure
1878  * that a deletion won't leave an node completely empty later on.
1879  */
1880 static noinline int balance_level(struct btrfs_trans_handle *trans,
1881                          struct btrfs_root *root,
1882                          struct btrfs_path *path, int level)
1883 {
1884         struct extent_buffer *right = NULL;
1885         struct extent_buffer *mid;
1886         struct extent_buffer *left = NULL;
1887         struct extent_buffer *parent = NULL;
1888         int ret = 0;
1889         int wret;
1890         int pslot;
1891         int orig_slot = path->slots[level];
1892         u64 orig_ptr;
1893
1894         if (level == 0)
1895                 return 0;
1896
1897         mid = path->nodes[level];
1898
1899         /* caller must hold a write lock on this level of the path */
1900         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1901                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1902         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1903
1904         /* remember the blockptr we point at, to sanity-check at the end */
1905         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1906
1907         if (level < BTRFS_MAX_LEVEL - 1) {
1908                 parent = path->nodes[level + 1];
1909                 pslot = path->slots[level + 1];
1910         }
1911
1912         /*
1913          * deal with the case where there is only one pointer in the root
1914          * by promoting the node below to a root
1915          */
1916         if (!parent) {
1917                 struct extent_buffer *child;
1918
1919                 if (btrfs_header_nritems(mid) != 1)
1920                         return 0;
1921
1922                 /* promote the child to a root */
1923                 child = read_node_slot(root, mid, 0);
1924                 if (!child) {
1925                         ret = -EROFS;
1926                         btrfs_std_error(root->fs_info, ret);
1927                         goto enospc;
1928                 }
1929
1930                 btrfs_tree_lock(child);
1931                 btrfs_set_lock_blocking(child);
1932                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1933                 if (ret) {
1934                         btrfs_tree_unlock(child);
1935                         free_extent_buffer(child);
1936                         goto enospc;
1937                 }
1938
1939                 tree_mod_log_set_root_pointer(root, child, 1);
1940                 rcu_assign_pointer(root->node, child);
1941
1942                 add_root_to_dirty_list(root);
1943                 btrfs_tree_unlock(child);
1944
1945                 path->locks[level] = 0;
1946                 path->nodes[level] = NULL;
1947                 clean_tree_block(trans, root->fs_info, mid);
1948                 btrfs_tree_unlock(mid);
1949                 /* once for the path */
1950                 free_extent_buffer(mid);
1951
1952                 root_sub_used(root, mid->len);
1953                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1954                 /* once for the root ptr */
1955                 free_extent_buffer_stale(mid);
1956                 return 0;
1957         }
1958         /* nothing to do if mid is still more than a quarter full */
1959         if (btrfs_header_nritems(mid) >
1960             BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
1961                 return 0;
1962
1963         /* grab and COW both siblings so we can shift pointers around */
1964         left = read_node_slot(root, parent, pslot - 1);
1965         if (left) {
1966                 btrfs_tree_lock(left);
1967                 btrfs_set_lock_blocking(left);
1968                 wret = btrfs_cow_block(trans, root, left,
1969                                        parent, pslot - 1, &left);
1970                 if (wret) {
1971                         ret = wret;
1972                         goto enospc;
1973                 }
1974         }
1975         right = read_node_slot(root, parent, pslot + 1);
1976         if (right) {
1977                 btrfs_tree_lock(right);
1978                 btrfs_set_lock_blocking(right);
1979                 wret = btrfs_cow_block(trans, root, right,
1980                                        parent, pslot + 1, &right);
1981                 if (wret) {
1982                         ret = wret;
1983                         goto enospc;
1984                 }
1985         }
1986
1987         /* first, try to make some room in the middle buffer */
1988         if (left) {
1989                 orig_slot += btrfs_header_nritems(left);
1990                 wret = push_node_left(trans, root, left, mid, 1);
1991                 if (wret < 0)
1992                         ret = wret;
1993         }
1994
1995         /*
1996          * then try to empty the right most buffer into the middle
1997          */
1998         if (right) {
1999                 wret = push_node_left(trans, root, mid, right, 1);
2000                 if (wret < 0 && wret != -ENOSPC)
2001                         ret = wret;
2002                 if (btrfs_header_nritems(right) == 0) {
2003                         /* right is now empty: unlink and free it */
2004                         clean_tree_block(trans, root->fs_info, right);
2005                         btrfs_tree_unlock(right);
2006                         del_ptr(root, path, level + 1, pslot + 1);
2007                         root_sub_used(root, right->len);
2008                         btrfs_free_tree_block(trans, root, right, 0, 1);
2009                         free_extent_buffer_stale(right);
2010                         right = NULL;
2011                 } else {
2012                         /* right shrank: refresh its key in the parent */
2013                         struct btrfs_disk_key right_key;
2014                         btrfs_node_key(right, &right_key, 0);
2015                         tree_mod_log_set_node_key(root->fs_info, parent,
2016                                                   pslot + 1, 0);
2017                         btrfs_set_node_key(parent, &right_key, pslot + 1);
2018                         btrfs_mark_buffer_dirty(parent);
2019                 }
2020         }
2021         if (btrfs_header_nritems(mid) == 1) {
2022                 /*
2023                  * we're not allowed to leave a node with one item in the
2024                  * tree during a delete.  A deletion from lower in the tree
2025                  * could try to delete the only pointer in this node.
2026                  * So, pull some keys from the left.
2027                  * There has to be a left pointer at this point because
2028                  * otherwise we would have pulled some pointers from the
2029                  * right
2030                  */
2031                 if (!left) {
2032                         ret = -EROFS;
2033                         btrfs_std_error(root->fs_info, ret);
2034                         goto enospc;
2035                 }
2036                 wret = balance_node_right(trans, root, mid, left);
2037                 if (wret < 0) {
2038                         ret = wret;
2039                         goto enospc;
2040                 }
2041                 if (wret == 1) {
2042                         wret = push_node_left(trans, root, left, mid, 1);
2043                         if (wret < 0)
2044                                 ret = wret;
2045                 }
2046                 BUG_ON(wret == 1);
2047         }
2048         if (btrfs_header_nritems(mid) == 0) {
2049                 /* mid is empty: unlink and free it */
2050                 clean_tree_block(trans, root->fs_info, mid);
2051                 btrfs_tree_unlock(mid);
2052                 del_ptr(root, path, level + 1, pslot);
2053                 root_sub_used(root, mid->len);
2054                 btrfs_free_tree_block(trans, root, mid, 0, 1);
2055                 free_extent_buffer_stale(mid);
2056                 mid = NULL;
2057         } else {
2058                 /* update the parent key to reflect our changes */
2059                 struct btrfs_disk_key mid_key;
2060                 btrfs_node_key(mid, &mid_key, 0);
2061                 tree_mod_log_set_node_key(root->fs_info, parent,
2062                                           pslot, 0);
2063                 btrfs_set_node_key(parent, &mid_key, pslot);
2064                 btrfs_mark_buffer_dirty(parent);
2065         }
2066
2067         /* update the path: follow the slot we started on wherever it went */
2068         if (left) {
2069                 if (btrfs_header_nritems(left) > orig_slot) {
2070                         extent_buffer_get(left);
2071                         /* left was locked after cow */
2072                         path->nodes[level] = left;
2073                         path->slots[level + 1] -= 1;
2074                         path->slots[level] = orig_slot;
2075                         if (mid) {
2076                                 btrfs_tree_unlock(mid);
2077                                 free_extent_buffer(mid);
2078                         }
2079                 } else {
2080                         orig_slot -= btrfs_header_nritems(left);
2081                         path->slots[level] = orig_slot;
2082                 }
2083         }
2084         /* double check we haven't messed things up */
2085         if (orig_ptr !=
2086             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2087                 BUG();
2088 enospc:
2089         /* release whichever siblings the path did not adopt */
2090         if (right) {
2091                 btrfs_tree_unlock(right);
2092                 free_extent_buffer(right);
2093         }
2094         if (left) {
2095                 if (path->nodes[level] != left)
2096                         btrfs_tree_unlock(left);
2097                 free_extent_buffer(left);
2098         }
2099         return ret;
2100 }
2093
2094 /* Node balancing for insertion.  Here we only split or push nodes around
2095  * when they are completely full.  This is also done top down, so we
2096  * have to be pessimistic.
2097  */
2098 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2099                                           struct btrfs_root *root,
2100                                           struct btrfs_path *path, int level)
2101 {
2102         struct extent_buffer *right = NULL;
2103         struct extent_buffer *mid;
2104         struct extent_buffer *left = NULL;
2105         struct extent_buffer *parent = NULL;
2106         int ret = 0;
2107         int wret;
2108         int pslot;
2109         int orig_slot = path->slots[level];
2110
2111         /* returns 1 when the caller still has to split; 0 when pushing
2112          * made enough room; <0 on error */
2113         if (level == 0)
2114                 return 1;
2115
2116         mid = path->nodes[level];
2117         WARN_ON(btrfs_header_generation(mid) != trans->transid);
2118
2119         if (level < BTRFS_MAX_LEVEL - 1) {
2120                 parent = path->nodes[level + 1];
2121                 pslot = path->slots[level + 1];
2122         }
2123
2124         if (!parent)
2125                 return 1;
2126
2127         left = read_node_slot(root, parent, pslot - 1);
2128
2129         /* first, try to make some room in the middle buffer */
2130         if (left) {
2131                 u32 left_nr;
2132
2133                 btrfs_tree_lock(left);
2134                 btrfs_set_lock_blocking(left);
2135
2136                 left_nr = btrfs_header_nritems(left);
2137                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2138                         /* left is itself nearly full: nothing to push */
2139                         wret = 1;
2140                 } else {
2141                         ret = btrfs_cow_block(trans, root, left, parent,
2142                                               pslot - 1, &left);
2143                         if (ret)
2144                                 wret = 1;
2145                         else {
2146                                 wret = push_node_left(trans, root,
2147                                                       left, mid, 0);
2148                         }
2149                 }
2150                 if (wret < 0)
2151                         ret = wret;
2152                 if (wret == 0) {
2153                         /* pointers moved: fix mid's key in the parent and
2154                          * retarget the path at wherever orig_slot ended up */
2155                         struct btrfs_disk_key disk_key;
2156                         orig_slot += left_nr;
2157                         btrfs_node_key(mid, &disk_key, 0);
2158                         tree_mod_log_set_node_key(root->fs_info, parent,
2159                                                   pslot, 0);
2160                         btrfs_set_node_key(parent, &disk_key, pslot);
2161                         btrfs_mark_buffer_dirty(parent);
2162                         if (btrfs_header_nritems(left) > orig_slot) {
2163                                 path->nodes[level] = left;
2164                                 path->slots[level + 1] -= 1;
2165                                 path->slots[level] = orig_slot;
2166                                 btrfs_tree_unlock(mid);
2167                                 free_extent_buffer(mid);
2168                         } else {
2169                                 orig_slot -=
2170                                         btrfs_header_nritems(left);
2171                                 path->slots[level] = orig_slot;
2172                                 btrfs_tree_unlock(left);
2173                                 free_extent_buffer(left);
2174                         }
2175                         return 0;
2176                 }
2177                 btrfs_tree_unlock(left);
2178                 free_extent_buffer(left);
2179         }
2180         right = read_node_slot(root, parent, pslot + 1);
2181
2182         /*
2183          * then try to empty the right most buffer into the middle
2184          */
2185         if (right) {
2186                 u32 right_nr;
2187
2188                 btrfs_tree_lock(right);
2189                 btrfs_set_lock_blocking(right);
2190
2191                 right_nr = btrfs_header_nritems(right);
2192                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) {
2193                         /* right is itself nearly full: nothing to push */
2194                         wret = 1;
2195                 } else {
2196                         ret = btrfs_cow_block(trans, root, right,
2197                                               parent, pslot + 1,
2198                                               &right);
2199                         if (ret)
2200                                 wret = 1;
2201                         else {
2202                                 wret = balance_node_right(trans, root,
2203                                                           right, mid);
2204                         }
2205                 }
2206                 if (wret < 0)
2207                         ret = wret;
2208                 if (wret == 0) {
2209                         /* pointers moved right: fix right's key in the
2210                          * parent and retarget the path */
2211                         struct btrfs_disk_key disk_key;
2212
2213                         btrfs_node_key(right, &disk_key, 0);
2214                         tree_mod_log_set_node_key(root->fs_info, parent,
2215                                                   pslot + 1, 0);
2216                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
2217                         btrfs_mark_buffer_dirty(parent);
2218
2219                         if (btrfs_header_nritems(mid) <= orig_slot) {
2220                                 path->nodes[level] = right;
2221                                 path->slots[level + 1] += 1;
2222                                 path->slots[level] = orig_slot -
2223                                         btrfs_header_nritems(mid);
2224                                 btrfs_tree_unlock(mid);
2225                                 free_extent_buffer(mid);
2226                         } else {
2227                                 btrfs_tree_unlock(right);
2228                                 free_extent_buffer(right);
2229                         }
2230                         return 0;
2231                 }
2232                 btrfs_tree_unlock(right);
2233                 free_extent_buffer(right);
2234         }
2235         /* no room anywhere: caller must split the node */
2236         return 1;
2237 }
2229
2230 /*
2231  * readahead one full node of leaves, finding things that are close
2232  * to the block in 'slot', and triggering ra on them.
2233  */
2234 static void reada_for_search(struct btrfs_root *root,
2235                              struct btrfs_path *path,
2236                              int level, int slot, u64 objectid)
2237 {
2238         struct extent_buffer *node;
2239         struct btrfs_disk_key disk_key;
2240         u32 nritems;
2241         u64 search;
2242         u64 target;
2243         u64 nread = 0;
2244         u64 gen;
2245         int direction = path->reada;
2246         struct extent_buffer *eb;
2247         u32 nr;
2248         u32 blocksize;
2249         u32 nscan = 0;
2250
2251         if (level != 1)
2252                 return;
2253
2254         if (!path->nodes[level])
2255                 return;
2256
2257         node = path->nodes[level];
2258
2259         search = btrfs_node_blockptr(node, slot);
2260         blocksize = root->nodesize;
2261         eb = btrfs_find_tree_block(root->fs_info, search);
2262         if (eb) {
2263                 free_extent_buffer(eb);
2264                 return;
2265         }
2266
2267         target = search;
2268
2269         nritems = btrfs_header_nritems(node);
2270         nr = slot;
2271
2272         while (1) {
2273                 if (direction < 0) {
2274                         if (nr == 0)
2275                                 break;
2276                         nr--;
2277                 } else if (direction > 0) {
2278                         nr++;
2279                         if (nr >= nritems)
2280                                 break;
2281                 }
2282                 if (path->reada < 0 && objectid) {
2283                         btrfs_node_key(node, &disk_key, nr);
2284                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2285                                 break;
2286                 }
2287                 search = btrfs_node_blockptr(node, nr);
2288                 if ((search <= target && target - search <= 65536) ||
2289                     (search > target && search - target <= 65536)) {
2290                         gen = btrfs_node_ptr_generation(node, nr);
2291                         readahead_tree_block(root, search);
2292                         nread += blocksize;
2293                 }
2294                 nscan++;
2295                 if ((nread > 65536 || nscan > 32))
2296                         break;
2297         }
2298 }
2299
/*
 * Readahead the two siblings of the node at 'level' on the path, in
 * preparation for a balance that may need to push into them.  Blocks
 * that are already cached and up to date are skipped.
 */
2300 static noinline void reada_for_balance(struct btrfs_root *root,
2301                                        struct btrfs_path *path, int level)
2302 {
2303         int slot;
2304         int nritems;
2305         struct extent_buffer *parent;
2306         struct extent_buffer *eb;
2307         u64 gen;
2308         u64 block1 = 0;
2309         u64 block2 = 0;
2310
2311         parent = path->nodes[level + 1];
2312         if (!parent)
2313                 return;
2314
2315         nritems = btrfs_header_nritems(parent);
2316         slot = path->slots[level + 1];
2317
2318         /* left sibling, if any */
2319         if (slot > 0) {
2320                 block1 = btrfs_node_blockptr(parent, slot - 1);
2321                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2322                 eb = btrfs_find_tree_block(root->fs_info, block1);
2323                 /*
2324                  * if we get -eagain from btrfs_buffer_uptodate, we
2325                  * don't want to return eagain here.  That will loop
2326                  * forever
2327                  */
2328                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2329                         block1 = 0;
2330                 free_extent_buffer(eb);
2331         }
2332         /* right sibling, if any */
2333         if (slot + 1 < nritems) {
2334                 block2 = btrfs_node_blockptr(parent, slot + 1);
2335                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2336                 eb = btrfs_find_tree_block(root->fs_info, block2);
2337                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2338                         block2 = 0;
2339                 free_extent_buffer(eb);
2340         }
2341
2342         if (block1)
2343                 readahead_tree_block(root, block1);
2344         if (block2)
2345                 readahead_tree_block(root, block2);
2346 }
2345
2346
2347 /*
2348  * when we walk down the tree, it is usually safe to unlock the higher layers
2349  * in the tree.  The exceptions are when our path goes through slot 0, because
2350  * operations on the tree might require changing key pointers higher up in the
2351  * tree.
2352  *
2353  * callers might also have set path->keep_locks, which tells this code to keep
2354  * the lock if the path points to the last slot in the block.  This is part of
2355  * walking through the tree, and selecting the next slot in the higher block.
2356  *
2357  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2358  * if lowest_unlock is 1, level 0 won't be unlocked
2359  */
2360 static noinline void unlock_up(struct btrfs_path *path, int level,
2361                                int lowest_unlock, int min_write_lock_level,
2362                                int *write_lock_level)
2363 {
2364         int i;
2365         /* levels at or below skip_level keep their locks */
2366         int skip_level = level;
2367         /* once set, stop raising skip_level for higher levels */
2368         int no_skips = 0;
2369         struct extent_buffer *t;
2370
2371         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2372                 if (!path->nodes[i])
2373                         break;
2374                 if (!path->locks[i])
2375                         break;
2376                 /* slot 0: key changes here may propagate upward, keep
2377                  * the parent locked too */
2378                 if (!no_skips && path->slots[i] == 0) {
2379                         skip_level = i + 1;
2380                         continue;
2381                 }
2382                 /* keep_locks: also hold the lock when we sit on the
2383                  * last slot of the block */
2384                 if (!no_skips && path->keep_locks) {
2385                         u32 nritems;
2386                         t = path->nodes[i];
2387                         nritems = btrfs_header_nritems(t);
2388                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2389                                 skip_level = i + 1;
2390                                 continue;
2391                         }
2392                 }
2393                 if (skip_level < i && i >= lowest_unlock)
2394                         no_skips = 1;
2395
2396                 t = path->nodes[i];
2397                 if (i >= lowest_unlock && i > skip_level && path->locks[i]) {
2398                         btrfs_tree_unlock_rw(t, path->locks[i]);
2399                         path->locks[i] = 0;
2400                         /* track the highest level still write-locked */
2401                         if (write_lock_level &&
2402                             i > min_write_lock_level &&
2403                             i <= *write_lock_level) {
2404                                 *write_lock_level = i - 1;
2405                         }
2406                 }
2407         }
2408 }
2402
2403 /*
2404  * This releases any locks held in the path starting at level and
2405  * going all the way up to the root.
2406  *
2407  * btrfs_search_slot will keep the lock held on higher nodes in a few
2408  * corner cases, such as COW of the block at slot zero in the node.  This
2409  * ignores those rules, and it should only be called when there are no
2410  * more updates to be done higher up in the tree.
2411  */
2412 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2413 {
2414         int i;
2415
2416         if (path->keep_locks)
2417                 return;
2418
2419         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2420                 if (!path->nodes[i])
2421                         continue;
2422                 if (!path->locks[i])
2423                         continue;
2424                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2425                 path->locks[i] = 0;
2426         }
2427 }
2428
2429 /*
2430  * helper function for btrfs_search_slot.  The goal is to find a block
2431  * in cache without setting the path to blocking.  If we find the block
2432  * we return zero and the path is unchanged.
2433  *
2434  * If we can't find the block, we set the path blocking and do some
2435  * reada.  -EAGAIN is returned and the search must be repeated.
2436  */
static int
read_block_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer **eb_ret, int level, int slot,
		       struct btrfs_key *key, u64 time_seq)
{
	u64 blocknr;
	u64 gen;
	struct extent_buffer *b = *eb_ret;
	struct extent_buffer *tmp;
	int ret;

	/* child block pointer and the generation it must match */
	blocknr = btrfs_node_blockptr(b, slot);
	gen = btrfs_node_ptr_generation(b, slot);

	tmp = btrfs_find_tree_block(root->fs_info, blocknr);
	if (tmp) {
		/* first we do an atomic uptodate check */
		if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
			*eb_ret = tmp;
			return 0;
		}

		/* the pages were up to date, but we failed
		 * the generation number check.  Do a full
		 * read for the generation number that is correct.
		 * We must do this without dropping locks so
		 * we can trust our generation number
		 */
		btrfs_set_path_blocking(p);

		/* now we're allowed to do a blocking uptodate check */
		ret = btrfs_read_buffer(tmp, gen);
		if (!ret) {
			*eb_ret = tmp;
			return 0;
		}
		free_extent_buffer(tmp);
		btrfs_release_path(p);
		return -EIO;
	}

	/*
	 * reduce lock contention at high levels
	 * of the btree by dropping locks before
	 * we read.  Don't release the lock on the current
	 * level because we need to walk this node to figure
	 * out which blocks to read.
	 */
	btrfs_unlock_up_safe(p, level + 1);
	btrfs_set_path_blocking(p);

	free_extent_buffer(tmp);	/* tmp is NULL here; harmless no-op */
	if (p->reada)
		reada_for_search(root, p, level, slot, key->objectid);

	/* drop the whole path: caller must restart after our -EAGAIN */
	btrfs_release_path(p);

	ret = -EAGAIN;
	tmp = read_tree_block(root, blocknr, 0);
	if (tmp) {
		/*
		 * If the read above didn't mark this buffer up to date,
		 * it will never end up being up to date.  Set ret to EIO now
		 * and give up so that our caller doesn't loop forever
		 * on our EAGAINs.
		 */
		if (!btrfs_buffer_uptodate(tmp, 0, 0))
			ret = -EIO;
		free_extent_buffer(tmp);
	}
	return ret;
}
2510
2511 /*
2512  * helper function for btrfs_search_slot.  This does all of the checks
2513  * for node-level blocks and does any balancing required based on
2514  * the ins_len.
2515  *
2516  * If no extra work was required, zero is returned.  If we had to
2517  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2518  * start over
2519  */
static int
setup_nodes_for_search(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *p,
		       struct extent_buffer *b, int level, int ins_len,
		       int *write_lock_level)
{
	int ret;
	/*
	 * inserting (or splitting): if the node is nearly full, split it
	 * now so the descent always has room for a new pointer.
	 */
	if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
	    BTRFS_NODEPTRS_PER_BLOCK(root) - 3) {
		int sret;

		/* splitting requires a write lock one level up as well */
		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(root, p, level);
		sret = split_node(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		BUG_ON(sret > 0);
		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
	} else if (ins_len < 0 && btrfs_header_nritems(b) <
		   BTRFS_NODEPTRS_PER_BLOCK(root) / 2) {
		/* deleting: balance/merge a node that is under half full */
		int sret;

		if (*write_lock_level < level + 1) {
			*write_lock_level = level + 1;
			btrfs_release_path(p);
			goto again;
		}

		btrfs_set_path_blocking(p);
		reada_for_balance(root, p, level);
		sret = balance_level(trans, root, p, level);
		btrfs_clear_path_blocking(p, NULL, 0);

		if (sret) {
			ret = sret;
			goto done;
		}
		b = p->nodes[level];
		/* balance_level may have collapsed this level entirely */
		if (!b) {
			btrfs_release_path(p);
			goto again;
		}
		BUG_ON(btrfs_header_nritems(b) == 1);
	}
	return 0;

again:
	ret = -EAGAIN;
done:
	return ret;
}
2581
/*
 * Debug-only check (CONFIG_BTRFS_ASSERT) that slot 0 of @b really holds
 * @key; used when a cached comparison lets us skip the binary search.
 */
static void key_search_validate(struct extent_buffer *b,
				struct btrfs_key *key,
				int level)
{
#ifdef CONFIG_BTRFS_ASSERT
	struct btrfs_disk_key disk_key;
	unsigned long key_off;

	btrfs_cpu_key_to_disk(&disk_key, key);

	key_off = (level == 0) ?
		offsetof(struct btrfs_leaf, items[0].key) :
		offsetof(struct btrfs_node, ptrs[0].key);

	ASSERT(!memcmp_extent_buffer(b, &disk_key, key_off,
				     sizeof(disk_key)));
#endif
}
2601
/*
 * Find @key's slot in @b.  When *prev_cmp is 0 a previous level already
 * matched exactly, so the answer must be slot 0 (validated in debug
 * builds); otherwise fall back to a real binary search and remember the
 * comparison result in *prev_cmp.
 */
static int key_search(struct extent_buffer *b, struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	if (*prev_cmp == 0) {
		key_search_validate(b, key, level);
		*slot = 0;
		return 0;
	}

	*prev_cmp = bin_search(b, key, level, slot);
	return *prev_cmp;
}
2615
2616 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2617                 u64 iobjectid, u64 ioff, u8 key_type,
2618                 struct btrfs_key *found_key)
2619 {
2620         int ret;
2621         struct btrfs_key key;
2622         struct extent_buffer *eb;
2623
2624         ASSERT(path);
2625         ASSERT(found_key);
2626
2627         key.type = key_type;
2628         key.objectid = iobjectid;
2629         key.offset = ioff;
2630
2631         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2632         if (ret < 0)
2633                 return ret;
2634
2635         eb = path->nodes[0];
2636         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2637                 ret = btrfs_next_leaf(fs_root, path);
2638                 if (ret)
2639                         return ret;
2640                 eb = path->nodes[0];
2641         }
2642
2643         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2644         if (found_key->type != key.type ||
2645                         found_key->objectid != key.objectid)
2646                 return 1;
2647
2648         return 0;
2649 }
2650
2651 /*
2652  * look for key in the tree.  path is filled in with nodes along the way
2653  * if key is found, we return zero and you can find the item in the leaf
2654  * level of the path (level 0)
2655  *
2656  * If the key isn't found, the path points to the slot where it should
2657  * be inserted, and 1 is returned.  If there are other errors during the
2658  * search a negative error number is returned.
2659  *
2660  * if ins_len > 0, nodes and leaves will be split as we walk down the
2661  * tree.  if ins_len < 0, nodes will be merged as we walk down the tree (if
2662  * possible)
2663  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
		      *root, struct btrfs_key *key, struct btrfs_path *p, int
		      ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	int root_lock;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers.  Make sure we keep write
		 * locks for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	if (!cow)
		write_lock_level = -1;

	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	prev_cmp = -1;
	/*
	 * we try very hard to do read locks on the root
	 */
	root_lock = BTRFS_READ_LOCK;
	level = 0;
	if (p->search_commit_root) {
		/*
		 * the commit roots are read only
		 * so we always do read locks
		 */
		if (p->need_commit_sem)
			down_read(&root->fs_info->commit_root_sem);
		b = root->commit_root;
		extent_buffer_get(b);
		level = btrfs_header_level(b);
		if (p->need_commit_sem)
			up_read(&root->fs_info->commit_root_sem);
		if (!p->skip_locking)
			btrfs_tree_read_lock(b);
	} else {
		if (p->skip_locking) {
			b = btrfs_root_node(root);
			level = btrfs_header_level(b);
		} else {
			/* we don't know the level of the root node
			 * until we actually have it read locked
			 */
			b = btrfs_read_lock_root_node(root);
			level = btrfs_header_level(b);
			if (level <= write_lock_level) {
				/* whoops, must trade for write lock */
				btrfs_tree_read_unlock(b);
				free_extent_buffer(b);
				b = btrfs_lock_root_node(root);
				root_lock = BTRFS_WRITE_LOCK;

				/* the level might have changed, check again */
				level = btrfs_header_level(b);
			}
		}
	}
	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;

	/* descend one level per iteration until we reach lowest_level */
	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b))
				goto cow_done;

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			err = btrfs_cow_block(trans, root, b,
					      p->nodes[level + 1],
					      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			/*
			 * key sorts before the key at slot: step back to
			 * the child that can contain it
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && ins_len &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(trans, root, p,
						    &b, level, slot, key, 0);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					/* try the non-blocking lock first */
					err = btrfs_try_tree_write_lock(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_WRITE_LOCK);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					err = btrfs_tree_read_lock_atomic(b);
					if (!err) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
						btrfs_clear_path_blocking(p, b,
								  BTRFS_READ_LOCK);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* level 0: leaf reached, maybe split it for insert */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(root, b) < ins_len) {
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);
				btrfs_clear_path_blocking(p, NULL, 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, &write_lock_level);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
2932
2933 /*
2934  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2935  * current state of the tree together with the operations recorded in the tree
2936  * modification log to search for the key in a previous version of this tree, as
2937  * denoted by the time_seq parameter.
2938  *
2939  * Naturally, there is no support for insert, delete or cow operations.
2940  *
2941  * The resulting path and return value will be set up as if we called
2942  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2943  */
int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;
	int prev_cmp = -1;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	if (p->search_commit_root) {
		/* the commit root is already a frozen view, no rewinding */
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;
		btrfs_clear_path_blocking(p, NULL, 0);

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		/*
		 * Since we can unwind eb's we want to do a real search every
		 * time.
		 */
		prev_cmp = -1;
		ret = key_search(b, key, level, &prev_cmp, &slot);

		if (level != 0) {
			int dec = 0;
			/* key sorts before slot's key: step back one child */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(NULL, root, p, &b, level,
						    slot, key, time_seq);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			level = btrfs_header_level(b);
			err = btrfs_tree_read_lock_atomic(b);
			if (!err) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
				btrfs_clear_path_blocking(p, b,
							  BTRFS_READ_LOCK);
			}
			/* replay tree-mod-log entries to get the old view */
			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
			if (!b) {
				ret = -ENOMEM;
				goto done;
			}
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
		} else {
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
3043
3044 /*
3045  * helper to use instead of search slot if no exact match is needed but
3046  * instead the next or previous item should be returned.
3047  * When find_higher is true, the next higher item is returned, the next lower
3048  * otherwise.
3049  * When return_any and find_higher are both true, and no higher item is found,
3050  * return the next lower instead.
3051  * When return_any is true and find_higher is false, and no lower item is found,
3052  * return the next higher instead.
3053  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3054  * < 0 on error
3055  */
int btrfs_search_slot_for_read(struct btrfs_root *root,
			       struct btrfs_key *key, struct btrfs_path *p,
			       int find_higher, int return_any)
{
	int ret;
	struct extent_buffer *leaf;

again:
	ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
	if (ret <= 0)
		return ret;	/* exact match (0) or error (< 0) */
	/*
	 * a return value of 1 means the path is at the position where the
	 * item should be inserted. Normally this is the next bigger item,
	 * but in case the previous item is the last in a leaf, path points
	 * to the first free slot in the previous leaf, i.e. at an invalid
	 * item.
	 */
	leaf = p->nodes[0];

	if (find_higher) {
		if (p->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, p);
			if (ret <= 0)
				return ret;
			if (!return_any)
				return 1;
			/*
			 * no higher item found, return the next
			 * lower instead
			 */
			return_any = 0;
			find_higher = 0;
			btrfs_release_path(p);
			goto again;
		}
	} else {
		if (p->slots[0] == 0) {
			ret = btrfs_prev_leaf(root, p);
			if (ret < 0)
				return ret;
			if (!ret) {
				leaf = p->nodes[0];
				/* step back from the first free slot */
				if (p->slots[0] == btrfs_header_nritems(leaf))
					p->slots[0]--;
				return 0;
			}
			if (!return_any)
				return 1;
			/*
			 * no lower item found, return the next
			 * higher instead
			 */
			return_any = 0;
			find_higher = 1;
			btrfs_release_path(p);
			goto again;
		} else {
			--p->slots[0];
		}
	}
	return 0;
}
3119
3120 /*
3121  * adjust the pointers going up the tree, starting at level
 * making sure the right key of each node points to 'key'.
3123  * This is used after shifting pointers to the left, so it stops
3124  * fixing up pointers when a given leaf/node is not in slot 0 of the
3125  * higher levels
3126  *
3127  */
3128 static void fixup_low_keys(struct btrfs_fs_info *fs_info,
3129                            struct btrfs_path *path,
3130                            struct btrfs_disk_key *key, int level)
3131 {
3132         int i;
3133         struct extent_buffer *t;
3134
3135         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3136                 int tslot = path->slots[i];
3137                 if (!path->nodes[i])
3138                         break;
3139                 t = path->nodes[i];
3140                 tree_mod_log_set_node_key(fs_info, t, tslot, 1);
3141                 btrfs_set_node_key(t, key, tslot);
3142                 btrfs_mark_buffer_dirty(path->nodes[i]);
3143                 if (tslot != 0)
3144                         break;
3145         }
3146 }
3147
3148 /*
3149  * update item key.
3150  *
3151  * This function isn't completely safe. It's the caller's responsibility
3152  * that the new key won't break the order
3153  */
3154 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3155                              struct btrfs_path *path,
3156                              struct btrfs_key *new_key)
3157 {
3158         struct btrfs_disk_key disk_key;
3159         struct extent_buffer *eb;
3160         int slot;
3161
3162         eb = path->nodes[0];
3163         slot = path->slots[0];
3164         if (slot > 0) {
3165                 btrfs_item_key(eb, &disk_key, slot - 1);
3166                 BUG_ON(comp_keys(&disk_key, new_key) >= 0);
3167         }
3168         if (slot < btrfs_header_nritems(eb) - 1) {
3169                 btrfs_item_key(eb, &disk_key, slot + 1);
3170                 BUG_ON(comp_keys(&disk_key, new_key) <= 0);
3171         }
3172
3173         btrfs_cpu_key_to_disk(&disk_key, new_key);
3174         btrfs_set_item_key(eb, &disk_key, slot);
3175         btrfs_mark_buffer_dirty(eb);
3176         if (slot == 0)
3177                 fixup_low_keys(fs_info, path, &disk_key, 1);
3178 }
3179
3180 /*
3181  * try to push data from one node into the next node left in the
3182  * tree.
3183  *
3184  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3185  * error, and > 0 if there was no room in the left hand block.
3186  */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct btrfs_root *root, struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free pointer slots available in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	/* both buffers must have been COWed in the current transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* not worth shuffling an almost-empty node unless we may empty it */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		push_items = min(src_nritems - 8, push_items);

	/* record the key moves in the tree mod log before touching buffers */
	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, dst_nritems, 0,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}
	/* append src's first push_items pointers after dst's current ones */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * don't call tree_mod_log_eb_move here, key removal was already
		 * fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3251
3252 /*
3253  * try to push data from one node into the next node right in the
3254  * tree.
3255  *
3256  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3257  * error, and > 0 if there was no room in the right hand block.
3258  *
3259  * this will  only push up to 1/2 the contents of the left node over
3260  */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* both buffers must have been COWed in the current transaction */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* free pointer slots available in the destination node */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* not worth balancing a nearly empty source node */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* open a hole at the front of dst, logging the shift first */
	tree_mod_log_eb_move(root->fs_info, dst, push_items, 0, dst_nritems);
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* move src's last push_items pointers into the hole */
	ret = tree_mod_log_eb_copy(root->fs_info, dst, src, 0,
				   src_nritems - push_items, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3317
3318 /*
3319  * helper function to insert a new root level in the tree.
3320  * A new node is allocated, and a single item is inserted to
3321  * point to the existing root
3322  *
3323  * returns zero on success or < 0 on failure.
3324  */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;

	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	/* the first key of the old root becomes the new root's only key */
	lower = path->nodes[level-1];
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
				   &lower_key, level, root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, root->nodesize);

	/* initialize the header of the freshly allocated node */
	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_nritems(c, 1);
	btrfs_set_header_level(c, level);
	btrfs_set_header_bytenr(c, c->start);
	btrfs_set_header_generation(c, trans->transid);
	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(c, root->root_key.objectid);

	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(),
			    BTRFS_FSID_SIZE);

	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
			    btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);

	/* its single pointer references the old root */
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	/*
	 * swing the root pointer over to the new node; readers see either
	 * the old or the new root via RCU.  log_removal is 0 here - see
	 * the comment in split_node() for why the old root is not logged
	 * as removed.
	 */
	old = root->node;
	tree_mod_log_set_root_pointer(root, c, 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* take a ref for the reference we store in the path */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}
3388
3389 /*
3390  * worker function to insert a single pointer in a node.
3391  * the node should have enough room for the pointer already
3392  *
3393  * slot and level indicate where you want the key to go, and
3394  * blocknr is the block the key points to.
3395  */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* the caller must guarantee room for one more pointer */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(root));
	if (slot != nritems) {
		/* shift higher slots up by one to open a hole at @slot */
		if (level)
			tree_mod_log_eb_move(root->fs_info, lower, slot + 1,
					     slot, nritems - slot);
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		/* the tree mod log only tracks changes in internal nodes */
		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
					      MOD_LOG_KEY_ADD, GFP_NOFS);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
3432
3433 /*
3434  * split the node at the specified level in path in two.
3435  * The path is corrected to point to the appropriate node after the split
3436  *
3437  * Before splitting this tries to make some room in the node by pushing
3438  * left and right, if either one works, it returns right away.
3439  *
3440  * returns 0 on success and < 0 on failure
3441  */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* try to avoid the split by pushing into the neighbors */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(root) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, level, c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, root->nodesize);

	/* initialize the new node's header */
	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_level(split, btrfs_header_level(c));
	btrfs_set_header_bytenr(split, split->start);
	btrfs_set_header_generation(split, trans->transid);
	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(split, root->root_key.objectid);
	write_extent_buffer(split, root->fs_info->fsid,
			    btrfs_header_fsid(), BTRFS_FSID_SIZE);
	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
			    btrfs_header_chunk_tree_uuid(split),
			    BTRFS_UUID_SIZE);

	/* move the upper half of the pointers into the new node */
	ret = tree_mod_log_eb_copy(root->fs_info, split, c, 0,
				   mid, c_nritems - mid);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		return ret;
	}
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* link the new node into the parent, right after @c */
	insert_ptr(trans, root, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	/* fix the path to reference whichever node now holds our slot */
	if (path->slots[level] >= mid) {
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3534
3535 /*
3536  * how many bytes are required to store the items in a leaf.  start
3537  * and nr indicate which items in the leaf to check.  This totals up the
3538  * space used both by the item structs and the item data
3539  */
3540 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3541 {
3542         struct btrfs_item *start_item;
3543         struct btrfs_item *end_item;
3544         struct btrfs_map_token token;
3545         int data_len;
3546         int nritems = btrfs_header_nritems(l);
3547         int end = min(nritems, start + nr) - 1;
3548
3549         if (!nr)
3550                 return 0;
3551         btrfs_init_map_token(&token);
3552         start_item = btrfs_item_nr(start);
3553         end_item = btrfs_item_nr(end);
3554         data_len = btrfs_token_item_offset(l, start_item, &token) +
3555                 btrfs_token_item_size(l, start_item, &token);
3556         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3557         data_len += sizeof(struct btrfs_item) * nr;
3558         WARN_ON(data_len < 0);
3559         return data_len;
3560 }
3561
3562 /*
3563  * The space between the end of the leaf items and
3564  * the start of the leaf data.  IOW, how much room
3565  * the leaf has left for both items and data
3566  */
3567 noinline int btrfs_leaf_free_space(struct btrfs_root *root,
3568                                    struct extent_buffer *leaf)
3569 {
3570         int nritems = btrfs_header_nritems(leaf);
3571         int ret;
3572         ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems);
3573         if (ret < 0) {
3574                 btrfs_crit(root->fs_info,
3575                         "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3576                        ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root),
3577                        leaf_space_used(leaf, 0, nritems), nritems);
3578         }
3579         return ret;
3580 }
3581
3582 /*
3583  * min slot controls the lowest index we're willing to push to the
3584  * right.  We'll push up to and including min_slot, but no lower
3585  */
static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
				      struct btrfs_root *root,
				      struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	btrfs_init_map_token(&token);

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* if the new item would land past the end, reserve room for it */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	/* walk from the last item down to @nr, counting how many fit */
	i = left_nritems - 1;
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(root, left);
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* reserve room for the item about to be inserted here */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* total data bytes occupied by the items being pushed */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(root, left);

	/* make room in the right data area */
	data_end = leaf_data_end(root, right);
	memmove_extent_buffer(right,
			      btrfs_leaf_data(right) + data_end - push_space,
			      btrfs_leaf_data(right) + data_end,
			      BTRFS_LEAF_DATA_SIZE(root) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, btrfs_leaf_data(right) +
		     BTRFS_LEAF_DATA_SIZE(root) - push_space,
		     btrfs_leaf_data(left) + leaf_data_end(root, left),
		     push_space);

	/* shift right's item headers to open push_items slots at the front */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(root);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		clean_tree_block(trans, root->fs_info, left);

	btrfs_mark_buffer_dirty(right);

	/* right's first key changed, update the parent's pointer to it */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			clean_tree_block(trans, root->fs_info, path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3724
3725 /*
3726  * push some data in the path leaf to the right, trying to free up at
3727  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3728  *
3729  * returns 1 if the push failed because the other node didn't have enough
3730  * room, 0 if everything worked out and < 0 if there were major errors.
3731  *
3732  * this will push starting from min_slot to the end of the leaf.  It won't
3733  * push any slot lower than min_slot
3734  */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* no parent means our leaf is the root: no right neighbor */
	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	/* already the last child of the parent: no right sibling */
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = read_node_slot(root, upper, slot + 1);
	if (right == NULL)
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking(right);

	free_space = btrfs_leaf_free_space(root, right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	free_space = btrfs_leaf_free_space(root, right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	return __push_leaf_right(trans, root, path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3803
3804 /*
3805  * push some data in the path leaf to the left, trying to free up at
3806  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3807  *
3808  * max_slot can put a limit on how far into the leaf we'll push items.  The
3809  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3810  * items
3811  */
3812 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3813                                      struct btrfs_root *root,
3814                                      struct btrfs_path *path, int data_size,
3815                                      int empty, struct extent_buffer *left,
3816                                      int free_space, u32 right_nritems,
3817                                      u32 max_slot)
3818 {
3819         struct btrfs_disk_key disk_key;
3820         struct extent_buffer *right = path->nodes[0];
3821         int i;
3822         int push_space = 0;
3823         int push_items = 0;
3824         struct btrfs_item *item;
3825         u32 old_left_nritems;
3826         u32 nr;
3827         int ret = 0;
3828         u32 this_item_size;
3829         u32 old_left_item_size;
3830         struct btrfs_map_token token;
3831
3832         btrfs_init_map_token(&token);
3833
3834         if (empty)
3835                 nr = min(right_nritems, max_slot);
3836         else
3837                 nr = min(right_nritems - 1, max_slot);
3838
3839         for (i = 0; i < nr; i++) {
3840                 item = btrfs_item_nr(i);
3841
3842                 if (!empty && push_items > 0) {
3843                         if (path->slots[0] < i)
3844                                 break;
3845                         if (path->slots[0] == i) {
3846                                 int space = btrfs_leaf_free_space(root, right);
3847                                 if (space + push_space * 2 > free_space)
3848                                         break;
3849                         }
3850                 }
3851
3852                 if (path->slots[0] == i)
3853                         push_space += data_size;
3854
3855                 this_item_size = btrfs_item_size(right, item);
3856                 if (this_item_size + sizeof(*item) + push_space > free_space)
3857                         break;
3858
3859                 push_items++;
3860                 push_space += this_item_size + sizeof(*item);
3861         }
3862
3863         if (push_items == 0) {
3864                 ret = 1;
3865                 goto out;
3866         }
3867         WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3868
3869         /* push data from right to left */
3870         copy_extent_buffer(left, right,
3871                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
3872                            btrfs_item_nr_offset(0),
3873                            push_items * sizeof(struct btrfs_item));
3874
3875         push_space = BTRFS_LEAF_DATA_SIZE(root) -
3876                      btrfs_item_offset_nr(right, push_items - 1);
3877
3878         copy_extent_buffer(left, right, btrfs_leaf_data(left) +
3879                      leaf_data_end(root, left) - push_space,
3880                      btrfs_leaf_data(right) +
3881                      btrfs_item_offset_nr(right, push_items - 1),
3882                      push_space);
3883         old_left_nritems = btrfs_header_nritems(left);
3884         BUG_ON(old_left_nritems <= 0);
3885
3886         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3887         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3888                 u32 ioff;
3889
3890                 item = btrfs_item_nr(i);
3891
3892                 ioff = btrfs_token_item_offset(left, item, &token);
3893                 btrfs_set_token_item_offset(left, item,
3894                       ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size),
3895                       &token);
3896         }
3897         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3898
3899         /* fixup right node */
3900         if (push_items > right_nritems)
3901                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3902                        right_nritems);
3903
3904         if (push_items < right_nritems) {
3905                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
3906                                                   leaf_data_end(root, right);
3907                 memmove_extent_buffer(right, btrfs_leaf_data(right) +
3908                                       BTRFS_LEAF_DATA_SIZE(root) - push_space,
3909                                       btrfs_leaf_data(right) +
3910                                       leaf_data_end(root, right), push_space);
3911
3912                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
3913                               btrfs_item_nr_offset(push_items),
3914                              (btrfs_header_nritems(right) - push_items) *
3915                              sizeof(struct btrfs_item));
3916         }
3917         right_nritems -= push_items;
3918         btrfs_set_header_nritems(right, right_nritems);
3919         push_space = BTRFS_LEAF_DATA_SIZE(root);
3920         for (i = 0; i < right_nritems; i++) {
3921                 item = btrfs_item_nr(i);
3922
3923                 push_space = push_space - btrfs_token_item_size(right,
3924                                                                 item, &token);
3925                 btrfs_set_token_item_offset(right, item, push_space, &token);
3926         }
3927
3928         btrfs_mark_buffer_dirty(left);
3929         if (right_nritems)
3930                 btrfs_mark_buffer_dirty(right);
3931         else
3932                 clean_tree_block(trans, root->fs_info, right);
3933
3934         btrfs_item_key(right, &disk_key, 0);
3935         fixup_low_keys(root->fs_info, path, &disk_key, 1);
3936
3937         /* then fixup the leaf pointer in the path */
3938         if (path->slots[0] < push_items) {
3939                 path->slots[0] += old_left_nritems;
3940                 btrfs_tree_unlock(path->nodes[0]);
3941                 free_extent_buffer(path->nodes[0]);
3942                 path->nodes[0] = left;
3943                 path->slots[1] -= 1;
3944         } else {
3945                 btrfs_tree_unlock(left);
3946                 free_extent_buffer(left);
3947                 path->slots[0] -= push_items;
3948         }
3949         BUG_ON(path->slots[0] < 0);
3950         return ret;
3951 out:
3952         btrfs_tree_unlock(left);
3953         free_extent_buffer(left);
3954         return ret;
3955 }
3956
/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
 * items
 */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	/* slot 0 in the parent means there is no left sibling to push into */
	slot = path->slots[1];
	if (slot == 0)
		return 1;
	if (!path->nodes[1])
		return 1;

	/* an empty right leaf has nothing to push */
	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = read_node_slot(root, path->nodes[1], slot - 1);
	/* failing to read the sibling is not fatal; just report "no push" */
	if (left == NULL)
		return 1;

	btrfs_tree_lock(left);
	btrfs_set_lock_blocking(left);

	/* cheap space check before paying for the COW below */
	free_space = btrfs_leaf_free_space(root, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	/* re-check: the COW may have been given a different buffer */
	free_space = btrfs_leaf_free_space(root, left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/*
	 * on success the lock/reference on 'left' is handed over to
	 * __push_leaf_left, which is responsible for dropping it
	 */
	return __push_leaf_left(trans, root, path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
4025
/*
 * helper for split_leaf: move the items [mid, nritems) and their data
 * from leaf 'l' into the freshly allocated (still empty) leaf 'right',
 * insert a pointer to 'right' into the parent node, and fix up the path
 * so path->nodes[0]/slots point at whichever leaf now contains 'slot'.
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	/* number of items that end up in the new right leaf */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);

	/* copy the item headers for [mid, mid + nritems) to slot 0 of right */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* copy the item data, packed against the end of the right leaf */
	copy_extent_buffer(right, l,
		     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
		     data_copy_size, btrfs_leaf_data(l) +
		     leaf_data_end(root, l), data_copy_size);

	/*
	 * the data moved toward the end of the leaf by this much; shift
	 * every copied item's offset accordingly
	 */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
		      btrfs_item_end_nr(l, mid);

	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, root, path, &disk_key, right->start,
		   path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* if the target slot moved to the new leaf, repoint the path at it */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4092
/*
 * double splits happen when we need to insert a big item in the middle
 * of a leaf.  A double split can leave us with 3 mostly empty leaves:
 * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
 *          A                 B                 C
 *
 * We avoid this by trying to push the items on either side of our target
 * into the adjacent leaves.  If all goes well we can avoid the double split
 * completely.
 *
 * Returns 0 when either push made progress or the slot reached a leaf
 * edge, 1 when nothing moved, < 0 on error.
 */
static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path,
					  int data_size)
{
	int ret;
	int progress = 0;
	int slot;
	u32 nritems;
	int space_needed = data_size;

	slot = path->slots[0];
	/* credit the leaf's current free space against what we need */
	if (slot < btrfs_header_nritems(path->nodes[0]))
		space_needed -= btrfs_leaf_free_space(root, path->nodes[0]);

	/*
	 * try to push all the items after our slot into the
	 * right leaf
	 */
	ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * our goal is to get our slot at the start or end of a leaf.  If
	 * we've done so we're done
	 */
	if (path->slots[0] == 0 || path->slots[0] == nritems)
		return 0;

	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
		return 0;

	/* try to push all the items before our slot into the next leaf */
	slot = path->slots[0];
	ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
	if (ret < 0)
		return ret;

	if (ret == 0)
		progress++;

	if (progress)
		return 0;
	return 1;
}
4153
/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
 * returns 0 if all went well and < 0 on failure.
 */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	/*
	 * split == 0: don't move any items, just link in a new empty
	 *             leaf for the insertion target
	 * split == 1: a single split at 'mid' is enough
	 * split == 2: a double split is needed (split, then split again)
	 */
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* an extended item can never grow beyond one whole leaf */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(root, l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(root, l) >= data_size)
			return 0;
	}

	/* splitting the root leaf needs a new root node above it */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	/* pick 'mid' and 'split' so the half holding 'slot' has room */
	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (slot >= nritems) {
				/* appending past the end: new empty leaf */
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(root)) {
			if (!extend && data_size && slot == 0) {
				/* inserting before everything: new empty leaf */
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(root)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* the new leaf's first key: the insertion key, or the item at mid */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
			&disk_key, 0, l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, root->nodesize);

	/* initialize the header of the new (level 0) leaf */
	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
	btrfs_set_header_bytenr(right, right->start);
	btrfs_set_header_generation(right, trans->transid);
	btrfs_set_header_backref_rev(right, BTRFS_MIXED_BACKREF_REV);
	btrfs_set_header_owner(right, root->root_key.objectid);
	btrfs_set_header_level(right, 0);
	write_extent_buffer(right, fs_info->fsid,
			    btrfs_header_fsid(), BTRFS_FSID_SIZE);

	write_extent_buffer(right, fs_info->chunk_tree_uuid,
			    btrfs_header_chunk_tree_uuid(right),
			    BTRFS_UUID_SIZE);

	if (split == 0) {
		/* no items move; link the empty leaf before or after 'l' */
		if (mid <= slot) {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, root, path, &disk_key, right->start,
				   path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, root, path, &disk_key, right->start,
					  path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(fs_info, path, &disk_key, 1);
		}
		btrfs_mark_buffer_dirty(right);
		return ret;
	}

	copy_for_split(trans, root, path, l, right, slot, mid, nritems);

	if (split == 2) {
		/* a double split may only recurse once */
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	/* try pushing neighbors first; best effort, errors fall through */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(root, path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
4325
/*
 * make sure the leaf at path->nodes[0] has at least ins_len bytes free
 * so an item can be split or duplicated in place.  Only EXTENT_DATA and
 * EXTENT_CSUM items are supported.
 *
 * The path may be dropped and re-searched.  Returns 0 with the leaf
 * ready, -EAGAIN when the item changed underneath us (or the re-searched
 * leaf already has room) so the caller must retry, or another negative
 * errno on failure.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(root, leaf) >= ins_len)
		return 0;

	/*
	 * remember the item's size (and extent length for EXTENT_DATA) so
	 * we can detect whether it changed while the path was released
	 */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* ret > 0 means the exact key is gone; treat as a retry */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has  changed, it now has room.  return now */
	if (btrfs_leaf_free_space(root, path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4392
/*
 * split the item at path->slots[0] in place: the first split_offset
 * bytes stay in the original item, the remainder moves into a new item
 * (keyed with new_key) inserted in the following slot of the same leaf.
 *
 * The caller (see setup_leaf_for_split) guarantees room for one more
 * struct btrfs_item in the leaf.  Returns 0 or -ENOMEM.
 */
static noinline int split_item(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path,
			       struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* stash a copy of the item data; it is rewritten in two pieces */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/*
	 * the new item reuses the tail of the original data area, so its
	 * offset is the original offset; the original item shrinks to the
	 * leading split_offset bytes
	 */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(root, leaf) < 0);
	kfree(buf);
	return 0;
}
4463
4464 /*
4465  * This function splits a single item into two items,
4466  * giving 'new_key' to the new item and splitting the
4467  * old one at split_offset (from the start of the item).
4468  *
4469  * The path may be released by this operation.  After
4470  * the split, the path is pointing to the old item.  The
4471  * new item is going to be in the same node as the old one.
4472  *
4473  * Note, the item being split must be smaller enough to live alone on
4474  * a tree block with room for one extra struct btrfs_item
4475  *
4476  * This allows us to split the item in place, keeping a lock on the
4477  * leaf the entire time.
4478  */
4479 int btrfs_split_item(struct btrfs_trans_handle *trans,
4480                      struct btrfs_root *root,
4481                      struct btrfs_path *path,
4482                      struct btrfs_key *new_key,
4483                      unsigned long split_offset)
4484 {
4485         int ret;
4486         ret = setup_leaf_for_split(trans, root, path,
4487                                    sizeof(struct btrfs_item));
4488         if (ret)
4489                 return ret;
4490
4491         ret = split_item(trans, root, path, new_key, split_offset);
4492         return ret;
4493 }
4494
4495 /*
4496  * This function duplicate a item, giving 'new_key' to the new item.
4497  * It guarantees both items live in the same tree leaf and the new item
4498  * is contiguous with the original item.
4499  *
4500  * This allows us to split file extent in place, keeping a lock on the
4501  * leaf the entire time.
4502  */
4503 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4504                          struct btrfs_root *root,
4505                          struct btrfs_path *path,
4506                          struct btrfs_key *new_key)
4507 {
4508         struct extent_buffer *leaf;
4509         int ret;
4510         u32 item_size;
4511
4512         leaf = path->nodes[0];
4513         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4514         ret = setup_leaf_for_split(trans, root, path,
4515                                    item_size + sizeof(struct btrfs_item));
4516         if (ret)
4517                 return ret;
4518
4519         path->slots[0]++;
4520         setup_items_for_insert(root, path, new_key, &item_size,
4521                                item_size, item_size +
4522                                sizeof(struct btrfs_item), 1);
4523         leaf = path->nodes[0];
4524         memcpy_extent_buffer(leaf,
4525                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4526                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4527                              item_size);
4528         return 0;
4529 }
4530
/*
 * make the item pointed to by the path smaller.  new_size indicates
 * how small to make it, and from_end tells us if we just chop bytes
 * off the end of the item or if we shift the item to chop bytes off
 * the front.
 */
void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path,
			 u32 new_size, int from_end)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data_start;
	unsigned int old_size;
	unsigned int size_diff;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	old_size = btrfs_item_size_nr(leaf, slot);
	if (old_size == new_size)
		return;

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	old_data_start = btrfs_item_offset_nr(leaf, slot);

	size_diff = old_size - new_size;

	BUG_ON(slot < 0);
	BUG_ON(slot >= nritems);

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff + size_diff, &token);
	}

	/* shift the data */
	if (from_end) {
		/* keep the item's start; move everything below it up */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start + new_size - data_end);
	} else {
		/*
		 * chopping off the front: the item's data must slide, and
		 * its key offset grows by the number of bytes removed
		 */
		struct btrfs_disk_key disk_key;
		u64 offset;

		btrfs_item_key(leaf, &disk_key, slot);

		if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
			unsigned long ptr;
			struct btrfs_file_extent_item *fi;

			fi = btrfs_item_ptr(leaf, slot,
					    struct btrfs_file_extent_item);
			fi = (struct btrfs_file_extent_item *)(
			     (unsigned long)fi - size_diff);

			/*
			 * an inline extent keeps its header at the item
			 * start; move the header before shifting the data
			 */
			if (btrfs_file_extent_type(leaf, fi) ==
			    BTRFS_FILE_EXTENT_INLINE) {
				ptr = btrfs_item_ptr_offset(leaf, slot);
				memmove_extent_buffer(leaf, ptr,
				      (unsigned long)fi,
				      BTRFS_FILE_EXTENT_INLINE_DATA_START);
			}
		}

		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + size_diff, btrfs_leaf_data(leaf) +
			      data_end, old_data_start - data_end);

		offset = btrfs_disk_key_offset(&disk_key);
		btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
		btrfs_set_item_key(leaf, &disk_key, slot);
		if (slot == 0)
			fixup_low_keys(root->fs_info, path, &disk_key, 1);
	}

	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, new_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4632
/*
 * make the item pointed to by the path bigger, data_size is the added size.
 * The new bytes are added at the end of the item; the caller is
 * responsible for initializing them.  The leaf must already have
 * data_size bytes free.
 */
void btrfs_extend_item(struct btrfs_root *root, struct btrfs_path *path,
		       u32 data_size)
{
	int slot;
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	u32 nritems;
	unsigned int data_end;
	unsigned int old_data;
	unsigned int old_size;
	int i;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	/* growing without enough free space is a caller bug */
	if (btrfs_leaf_free_space(root, leaf) < data_size) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
	slot = path->slots[0];
	old_data = btrfs_item_end_nr(leaf, slot);

	BUG_ON(slot < 0);
	if (slot >= nritems) {
		btrfs_print_leaf(root, leaf);
		btrfs_crit(root->fs_info, "slot %d too large, nritems %d",
		       slot, nritems);
		BUG_ON(1);
	}

	/*
	 * item0..itemN ... dataN.offset..dataN.size .. data0.size
	 */
	/* first correct the data pointers */
	for (i = slot; i < nritems; i++) {
		u32 ioff;
		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(leaf, item, &token);
		btrfs_set_token_item_offset(leaf, item,
					    ioff - data_size, &token);
	}

	/* shift the data */
	memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
		      data_end - data_size, btrfs_leaf_data(leaf) +
		      data_end, old_data - data_end);

	data_end = old_data;
	old_size = btrfs_item_size_nr(leaf, slot);
	item = btrfs_item_nr(slot);
	btrfs_set_item_size(leaf, item, old_size + data_size);
	btrfs_mark_buffer_dirty(leaf);

	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4700
/*
 * this is a helper for btrfs_insert_empty_items, the main goal here is
 * to save stack depth by doing the bulk of the work in a function
 * that doesn't call btrfs_search_slot
 *
 * Opens 'nr' new item slots starting at path->slots[0] in the leaf at
 * path->nodes[0], writing the key, offset and size for each new item.
 * The item data itself is NOT copied here; callers fill it in afterwards.
 * The leaf must already have at least 'total_size' bytes of free space
 * (total_data bytes of item data plus nr item headers).
 */
void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
			    struct btrfs_key *cpu_key, u32 *data_size,
			    u32 total_data, u32 total_size, int nr)
{
	struct btrfs_item *item;
	int i;
	u32 nritems;
	unsigned int data_end;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *leaf;
	int slot;
	struct btrfs_map_token token;

	/*
	 * Inserting at slot 0 changes the lowest key of this leaf, so the
	 * matching keys in the parent nodes must be updated first.
	 */
	if (path->slots[0] == 0) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key);
		fixup_low_keys(root->fs_info, path, &disk_key, 1);
	}
	btrfs_unlock_up_safe(path, 1);

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	slot = path->slots[0];

	nritems = btrfs_header_nritems(leaf);
	data_end = leaf_data_end(root, leaf);

	if (btrfs_leaf_free_space(root, leaf) < total_size) {
		btrfs_print_leaf(root, leaf);
		btrfs_crit(root->fs_info, "not enough freespace need %u have %d",
		       total_size, btrfs_leaf_free_space(root, leaf));
		BUG();
	}

	/* inserting in the middle: shift existing items and data aside */
	if (slot != nritems) {
		unsigned int old_data = btrfs_item_end_nr(leaf, slot);

		if (old_data < data_end) {
			btrfs_print_leaf(root, leaf);
			btrfs_crit(root->fs_info, "slot %d old_data %d data_end %d",
			       slot, old_data, data_end);
			BUG_ON(1);
		}
		/*
		 * item0..itemN ... dataN.offset..dataN.size .. data0.size
		 */
		/* first correct the data pointers */
		for (i = slot; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr( i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff - total_data, &token);
		}
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
			      btrfs_item_nr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_item));

		/* shift the data */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end - total_data, btrfs_leaf_data(leaf) +
			      data_end, old_data - data_end);
		data_end = old_data;
	}

	/* setup the item for the new data */
	for (i = 0; i < nr; i++) {
		btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
		btrfs_set_item_key(leaf, &disk_key, slot + i);
		item = btrfs_item_nr(slot + i);
		/* data grows downward from the end of the leaf */
		btrfs_set_token_item_offset(leaf, item,
					    data_end - data_size[i], &token);
		data_end -= data_size[i];
		btrfs_set_token_item_size(leaf, item, data_size[i], &token);
	}

	btrfs_set_header_nritems(leaf, nritems + nr);
	btrfs_mark_buffer_dirty(leaf);

	/* sanity check: the accounting above must never oversubscribe the leaf */
	if (btrfs_leaf_free_space(root, leaf) < 0) {
		btrfs_print_leaf(root, leaf);
		BUG();
	}
}
4792
4793 /*
4794  * Given a key and some data, insert items into the tree.
4795  * This does all the path init required, making room in the tree if needed.
4796  */
4797 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4798                             struct btrfs_root *root,
4799                             struct btrfs_path *path,
4800                             struct btrfs_key *cpu_key, u32 *data_size,
4801                             int nr)
4802 {
4803         int ret = 0;
4804         int slot;
4805         int i;
4806         u32 total_size = 0;
4807         u32 total_data = 0;
4808
4809         for (i = 0; i < nr; i++)
4810                 total_data += data_size[i];
4811
4812         total_size = total_data + (nr * sizeof(struct btrfs_item));
4813         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4814         if (ret == 0)
4815                 return -EEXIST;
4816         if (ret < 0)
4817                 return ret;
4818
4819         slot = path->slots[0];
4820         BUG_ON(slot < 0);
4821
4822         setup_items_for_insert(root, path, cpu_key, data_size,
4823                                total_data, total_size, nr);
4824         return 0;
4825 }
4826
4827 /*
4828  * Given a key and some data, insert an item into the tree.
4829  * This does all the path init required, making room in the tree if needed.
4830  */
4831 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
4832                       *root, struct btrfs_key *cpu_key, void *data, u32
4833                       data_size)
4834 {
4835         int ret = 0;
4836         struct btrfs_path *path;
4837         struct extent_buffer *leaf;
4838         unsigned long ptr;
4839
4840         path = btrfs_alloc_path();
4841         if (!path)
4842                 return -ENOMEM;
4843         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4844         if (!ret) {
4845                 leaf = path->nodes[0];
4846                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4847                 write_extent_buffer(leaf, data, ptr, data_size);
4848                 btrfs_mark_buffer_dirty(leaf);
4849         }
4850         btrfs_free_path(path);
4851         return ret;
4852 }
4853
/*
 * delete the pointer from a given node.
 *
 * the tree should have been previously balanced so the deletion does not
 * empty a node.
 *
 * Removes the key/blockptr pair at 'slot' from path->nodes[level] and
 * updates parent keys when the node's first key changes.
 */
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
		    int level, int slot)
{
	struct extent_buffer *parent = path->nodes[level];
	u32 nritems;
	int ret;

	nritems = btrfs_header_nritems(parent);
	if (slot != nritems - 1) {
		/*
		 * Removing from the middle: record the move in the tree mod
		 * log (only done for non-leaf blocks, hence the level check),
		 * then slide the following key/ptr pairs down one slot.
		 */
		if (level)
			tree_mod_log_eb_move(root->fs_info, parent, slot,
					     slot + 1, nritems - slot - 1);
		memmove_extent_buffer(parent,
			      btrfs_node_key_ptr_offset(slot),
			      btrfs_node_key_ptr_offset(slot + 1),
			      sizeof(struct btrfs_key_ptr) *
			      (nritems - slot - 1));
	} else if (level) {
		/* removing the last slot: only the key removal is logged */
		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
		BUG_ON(ret < 0);
	}

	nritems--;
	btrfs_set_header_nritems(parent, nritems);
	if (nritems == 0 && parent == root->node) {
		BUG_ON(btrfs_header_level(root->node) != 1);
		/* just turn the root into a leaf and break */
		btrfs_set_header_level(root->node, 0);
	} else if (slot == 0) {
		/* the node's lowest key changed; propagate it upwards */
		struct btrfs_disk_key disk_key;

		btrfs_node_key(parent, &disk_key, 0);
		fixup_low_keys(root->fs_info, path, &disk_key, level + 1);
	}
	btrfs_mark_buffer_dirty(parent);
}
4897
/*
 * a helper function to delete the leaf pointed to by path->slots[1] and
 * path->nodes[1].
 *
 * This deletes the pointer in path->nodes[1] and frees the leaf
 * block extent.  zero is returned if it all worked out, < 0 otherwise.
 *
 * The path must have already been setup for deleting the leaf, including
 * all the proper balancing.  path->nodes[1] must be locked.
 */
static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
				    struct btrfs_root *root,
				    struct btrfs_path *path,
				    struct extent_buffer *leaf)
{
	/* a leaf freed in this transaction should carry its generation */
	WARN_ON(btrfs_header_generation(leaf) != trans->transid);
	del_ptr(root, path, 1, path->slots[1]);

	/*
	 * btrfs_free_extent is expensive, we want to make sure we
	 * aren't holding any locks when we call it
	 */
	btrfs_unlock_up_safe(path, 0);

	root_sub_used(root, leaf->len);

	/*
	 * Take an extra ref so 'leaf' stays valid across the block free,
	 * then drop it via the stale variant so the buffer is not reused.
	 */
	extent_buffer_get(leaf);
	btrfs_free_tree_block(trans, root, leaf, 0, 1);
	free_extent_buffer_stale(leaf);
}
/*
 * delete the item at the leaf level in path.  If that empties
 * the leaf, remove it from the tree
 *
 * Deletes 'nr' consecutive items starting at 'slot' in the leaf at
 * path->nodes[0].  Afterwards the leaf may be merged into neighbours or
 * removed entirely if it became (mostly) empty.  Returns 0 on success or
 * a negative errno from the push helpers.
 */
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		    struct btrfs_path *path, int slot, int nr)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	int last_off;
	int dsize = 0;
	int ret = 0;
	int wret;
	int i;
	u32 nritems;
	struct btrfs_map_token token;

	btrfs_init_map_token(&token);

	leaf = path->nodes[0];
	last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);

	/* total data bytes occupied by the items being removed */
	for (i = 0; i < nr; i++)
		dsize += btrfs_item_size_nr(leaf, slot + i);

	nritems = btrfs_header_nritems(leaf);

	if (slot + nr != nritems) {
		int data_end = leaf_data_end(root, leaf);

		/* close the dsize-byte gap in the data area ... */
		memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) +
			      data_end + dsize,
			      btrfs_leaf_data(leaf) + data_end,
			      last_off - data_end);

		/* ... re-point the surviving items at their shifted data ... */
		for (i = slot + nr; i < nritems; i++) {
			u32 ioff;

			item = btrfs_item_nr(i);
			ioff = btrfs_token_item_offset(leaf, item, &token);
			btrfs_set_token_item_offset(leaf, item,
						    ioff + dsize, &token);
		}

		/* ... and close the gap in the item header array */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
			      btrfs_item_nr_offset(slot + nr),
			      sizeof(struct btrfs_item) *
			      (nritems - slot - nr));
	}
	btrfs_set_header_nritems(leaf, nritems - nr);
	nritems -= nr;

	/* delete the leaf if we've emptied it */
	if (nritems == 0) {
		if (leaf == root->node) {
			/* an empty root just becomes an empty leaf */
			btrfs_set_header_level(leaf, 0);
		} else {
			btrfs_set_path_blocking(path);
			clean_tree_block(trans, root->fs_info, leaf);
			btrfs_del_leaf(trans, root, path, leaf);
		}
	} else {
		int used = leaf_space_used(leaf, 0, nritems);
		if (slot == 0) {
			struct btrfs_disk_key disk_key;

			/* leaf's first key changed; fix the parent keys */
			btrfs_item_key(leaf, &disk_key, 0);
			fixup_low_keys(root->fs_info, path, &disk_key, 1);
		}

		/* delete the leaf if it is mostly empty */
		if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
			/* push_leaf_left fixes the path.
			 * make sure the path still points to our leaf
			 * for possible call to del_ptr below
			 */
			slot = path->slots[1];
			extent_buffer_get(leaf);

			btrfs_set_path_blocking(path);
			wret = push_leaf_left(trans, root, path, 1, 1,
					      1, (u32)-1);
			if (wret < 0 && wret != -ENOSPC)
				ret = wret;

			/* if anything is left, try pushing right as well */
			if (path->nodes[0] == leaf &&
			    btrfs_header_nritems(leaf)) {
				wret = push_leaf_right(trans, root, path, 1,
						       1, 1, 0);
				if (wret < 0 && wret != -ENOSPC)
					ret = wret;
			}

			if (btrfs_header_nritems(leaf) == 0) {
				path->slots[1] = slot;
				btrfs_del_leaf(trans, root, path, leaf);
				free_extent_buffer(leaf);
				ret = 0;
			} else {
				/* if we're still in the path, make sure
				 * we're dirty.  Otherwise, one of the
				 * push_leaf functions must have already
				 * dirtied this buffer
				 */
				if (path->nodes[0] == leaf)
					btrfs_mark_buffer_dirty(leaf);
				free_extent_buffer(leaf);
			}
		} else {
			btrfs_mark_buffer_dirty(leaf);
		}
	}
	return ret;
}
5042
5043 /*
5044  * search the tree again to find a leaf with lesser keys
5045  * returns 0 if it found something or 1 if there are no lesser leaves.
5046  * returns < 0 on io errors.
5047  *
5048  * This may release the path, and so you may lose any locks held at the
5049  * time you call it.
5050  */
5051 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5052 {
5053         struct btrfs_key key;
5054         struct btrfs_disk_key found_key;
5055         int ret;
5056
5057         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5058
5059         if (key.offset > 0) {
5060                 key.offset--;
5061         } else if (key.type > 0) {
5062                 key.type--;
5063                 key.offset = (u64)-1;
5064         } else if (key.objectid > 0) {
5065                 key.objectid--;
5066                 key.type = (u8)-1;
5067                 key.offset = (u64)-1;
5068         } else {
5069                 return 1;
5070         }
5071
5072         btrfs_release_path(path);
5073         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5074         if (ret < 0)
5075                 return ret;
5076         btrfs_item_key(path->nodes[0], &found_key, 0);
5077         ret = comp_keys(&found_key, &key);
5078         /*
5079          * We might have had an item with the previous key in the tree right
5080          * before we released our path. And after we released our path, that
5081          * item might have been pushed to the first slot (0) of the leaf we
5082          * were holding due to a tree balance. Alternatively, an item with the
5083          * previous key can exist as the only element of a leaf (big fat item).
5084          * Therefore account for these 2 cases, so that our callers (like
5085          * btrfs_previous_item) don't miss an existing item with a key matching
5086          * the previous key we computed above.
5087          */
5088         if (ret <= 0)
5089                 return 0;
5090         return 1;
5091 }
5092
/*
 * A helper function to walk down the tree starting at min_key, and looking
 * for nodes or leaves that are have a minimum transaction id.
 * This is used by the btree defrag code, and tree logging
 *
 * This does not cow, but it does stuff the starting key it finds back
 * into min_key, so you can call btrfs_search_slot with cow=1 on the
 * key and get a writable path.
 *
 * This does lock as it descends, and path->keep_locks should be set
 * to 1 by the caller.
 *
 * This honors path->lowest_level to prevent descent past a given level
 * of the tree.
 *
 * min_trans indicates the oldest transaction that you are interested
 * in walking through.  Any nodes or leaves older than min_trans are
 * skipped over (without reading them).
 *
 * returns zero if something useful was found, < 0 on error and 1 if there
 * was nothing in the tree that matched the search criteria.
 */
int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
			 struct btrfs_path *path,
			 u64 min_trans)
{
	struct extent_buffer *cur;
	struct btrfs_key found_key;
	int slot;
	int sret;
	u32 nritems;
	int level;
	int ret = 1;
	/* remember the caller's setting so it can be restored on exit */
	int keep_locks = path->keep_locks;

	path->keep_locks = 1;
again:
	cur = btrfs_read_lock_root_node(root);
	level = btrfs_header_level(cur);
	WARN_ON(path->nodes[level]);
	path->nodes[level] = cur;
	path->locks[level] = BTRFS_READ_LOCK;

	/* whole tree is older than min_trans: nothing to find */
	if (btrfs_header_generation(cur) < min_trans) {
		ret = 1;
		goto out;
	}
	while (1) {
		nritems = btrfs_header_nritems(cur);
		level = btrfs_header_level(cur);
		sret = bin_search(cur, min_key, level, &slot);

		/* at the lowest level, we're done, setup the path and exit */
		if (level == path->lowest_level) {
			if (slot >= nritems)
				goto find_next_key;
			ret = 0;
			path->slots[level] = slot;
			btrfs_item_key_to_cpu(cur, &found_key, slot);
			goto out;
		}
		/* inexact match: step back to the slot covering min_key */
		if (sret && slot > 0)
			slot--;
		/*
		 * check this node pointer against the min_trans parameters.
		 * If it is too old, skip to the next one.
		 */
		while (slot < nritems) {
			u64 gen;

			gen = btrfs_node_ptr_generation(cur, slot);
			if (gen < min_trans) {
				slot++;
				continue;
			}
			break;
		}
find_next_key:
		/*
		 * we didn't find a candidate key in this node, walk forward
		 * and find another one
		 */
		if (slot >= nritems) {
			path->slots[level] = slot;
			btrfs_set_path_blocking(path);
			sret = btrfs_find_next_key(root, path, min_key, level,
						  min_trans);
			if (sret == 0) {
				/* restart the walk from the new min_key */
				btrfs_release_path(path);
				goto again;
			} else {
				goto out;
			}
		}
		/* save our key for returning back */
		btrfs_node_key_to_cpu(cur, &found_key, slot);
		path->slots[level] = slot;
		if (level == path->lowest_level) {
			ret = 0;
			goto out;
		}
		btrfs_set_path_blocking(path);
		cur = read_node_slot(root, cur, slot);
		BUG_ON(!cur); /* -ENOMEM */

		btrfs_tree_read_lock(cur);

		/* descend: record the child and release unneeded upper locks */
		path->locks[level - 1] = BTRFS_READ_LOCK;
		path->nodes[level - 1] = cur;
		unlock_up(path, level, 1, 0, NULL);
		btrfs_clear_path_blocking(path, NULL, 0);
	}
out:
	path->keep_locks = keep_locks;
	if (ret == 0) {
		btrfs_unlock_up_safe(path, path->lowest_level + 1);
		btrfs_set_path_blocking(path);
		/* hand the key we found back to the caller */
		memcpy(min_key, &found_key, sizeof(found_key));
	}
	return ret;
}
5214
5215 static void tree_move_down(struct btrfs_root *root,
5216                            struct btrfs_path *path,
5217                            int *level, int root_level)
5218 {
5219         BUG_ON(*level == 0);
5220         path->nodes[*level - 1] = read_node_slot(root, path->nodes[*level],
5221                                         path->slots[*level]);
5222         path->slots[*level - 1] = 0;
5223         (*level)--;
5224 }
5225
5226 static int tree_move_next_or_upnext(struct btrfs_root *root,
5227                                     struct btrfs_path *path,
5228                                     int *level, int root_level)
5229 {
5230         int ret = 0;
5231         int nritems;
5232         nritems = btrfs_header_nritems(path->nodes[*level]);
5233
5234         path->slots[*level]++;
5235
5236         while (path->slots[*level] >= nritems) {
5237                 if (*level == root_level)
5238                         return -1;
5239
5240                 /* move upnext */
5241                 path->slots[*level] = 0;
5242                 free_extent_buffer(path->nodes[*level]);
5243                 path->nodes[*level] = NULL;
5244                 (*level)++;
5245                 path->slots[*level]++;
5246
5247                 nritems = btrfs_header_nritems(path->nodes[*level]);
5248                 ret = 1;
5249         }
5250         return ret;
5251 }
5252
5253 /*
5254  * Returns 1 if it had to move up and next. 0 is returned if it moved only next
5255  * or down.
5256  */
5257 static int tree_advance(struct btrfs_root *root,
5258                         struct btrfs_path *path,
5259                         int *level, int root_level,
5260                         int allow_down,
5261                         struct btrfs_key *key)
5262 {
5263         int ret;
5264
5265         if (*level == 0 || !allow_down) {
5266                 ret = tree_move_next_or_upnext(root, path, level, root_level);
5267         } else {
5268                 tree_move_down(root, path, level, root_level);
5269                 ret = 0;
5270         }
5271         if (ret >= 0) {
5272                 if (*level == 0)
5273                         btrfs_item_key_to_cpu(path->nodes[*level], key,
5274                                         path->slots[*level]);
5275                 else
5276                         btrfs_node_key_to_cpu(path->nodes[*level], key,
5277                                         path->slots[*level]);
5278         }
5279         return ret;
5280 }
5281
5282 static int tree_compare_item(struct btrfs_root *left_root,
5283                              struct btrfs_path *left_path,
5284                              struct btrfs_path *right_path,
5285                              char *tmp_buf)
5286 {
5287         int cmp;
5288         int len1, len2;
5289         unsigned long off1, off2;
5290
5291         len1 = btrfs_item_size_nr(left_path->nodes[0], left_path->slots[0]);
5292         len2 = btrfs_item_size_nr(right_path->nodes[0], right_path->slots[0]);
5293         if (len1 != len2)
5294                 return 1;
5295
5296         off1 = btrfs_item_ptr_offset(left_path->nodes[0], left_path->slots[0]);
5297         off2 = btrfs_item_ptr_offset(right_path->nodes[0],
5298                                 right_path->slots[0]);
5299
5300         read_extent_buffer(left_path->nodes[0], tmp_buf, off1, len1);
5301
5302         cmp = memcmp_extent_buffer(right_path->nodes[0], tmp_buf, off2, len1);
5303         if (cmp)
5304                 return 1;
5305         return 0;
5306 }
5307
5308 #define ADVANCE 1
5309 #define ADVANCE_ONLY_NEXT -1
5310
5311 /*
5312  * This function compares two trees and calls the provided callback for
5313  * every changed/new/deleted item it finds.
5314  * If shared tree blocks are encountered, whole subtrees are skipped, making
5315  * the compare pretty fast on snapshotted subvolumes.
5316  *
5317  * This currently works on commit roots only. As commit roots are read only,
5318  * we don't do any locking. The commit roots are protected with transactions.
5319  * Transactions are ended and rejoined when a commit is tried in between.
5320  *
5321  * This function checks for modifications done to the trees while comparing.
5322  * If it detects a change, it aborts immediately.
5323  */
5324 int btrfs_compare_trees(struct btrfs_root *left_root,
5325                         struct btrfs_root *right_root,
5326                         btrfs_changed_cb_t changed_cb, void *ctx)
5327 {
5328         int ret;
5329         int cmp;
5330         struct btrfs_path *left_path = NULL;
5331         struct btrfs_path *right_path = NULL;
5332         struct btrfs_key left_key;
5333         struct btrfs_key right_key;
5334         char *tmp_buf = NULL;
5335         int left_root_level;
5336         int right_root_level;
5337         int left_level;
5338         int right_level;
5339         int left_end_reached;
5340         int right_end_reached;
5341         int advance_left;
5342         int advance_right;
5343         u64 left_blockptr;
5344         u64 right_blockptr;
5345         u64 left_gen;
5346         u64 right_gen;
5347
5348         left_path = btrfs_alloc_path();
5349         if (!left_path) {
5350                 ret = -ENOMEM;
5351                 goto out;
5352         }
5353         right_path = btrfs_alloc_path();
5354         if (!right_path) {
5355                 ret = -ENOMEM;
5356                 goto out;
5357         }
5358
5359         tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS);
5360         if (!tmp_buf) {
5361                 ret = -ENOMEM;
5362                 goto out;
5363         }
5364
5365         left_path->search_commit_root = 1;
5366         left_path->skip_locking = 1;
5367         right_path->search_commit_root = 1;
5368         right_path->skip_locking = 1;
5369
5370         /*
5371          * Strategy: Go to the first items of both trees. Then do
5372          *
5373          * If both trees are at level 0
5374          *   Compare keys of current items
5375          *     If left < right treat left item as new, advance left tree
5376          *       and repeat
5377          *     If left > right treat right item as deleted, advance right tree
5378          *       and repeat
5379          *     If left == right do deep compare of items, treat as changed if
5380          *       needed, advance both trees and repeat
5381          * If both trees are at the same level but not at level 0
5382          *   Compare keys of current nodes/leafs
5383          *     If left < right advance left tree and repeat
5384          *     If left > right advance right tree and repeat
5385          *     If left == right compare blockptrs of the next nodes/leafs
5386          *       If they match advance both trees but stay at the same level
5387          *         and repeat
5388          *       If they don't match advance both trees while allowing to go
5389          *         deeper and repeat
5390          * If tree levels are different
5391          *   Advance the tree that needs it and repeat
5392          *
5393          * Advancing a tree means:
5394          *   If we are at level 0, try to go to the next slot. If that's not
5395          *   possible, go one level up and repeat. Stop when we found a level
5396          *   where we could go to the next slot. We may at this point be on a
5397          *   node or a leaf.
5398          *
5399          *   If we are not at level 0 and not on shared tree blocks, go one
5400          *   level deeper.
5401          *
5402          *   If we are not at level 0 and on shared tree blocks, go one slot to
5403          *   the right if possible or go up and right.
5404          */
5405
5406         down_read(&left_root->fs_info->commit_root_sem);
5407         left_level = btrfs_header_level(left_root->commit_root);
5408         left_root_level = left_level;
5409         left_path->nodes[left_level] = left_root->commit_root;
5410         extent_buffer_get(left_path->nodes[left_level]);
5411
5412         right_level = btrfs_header_level(right_root->commit_root);
5413         right_root_level = right_level;
5414         right_path->nodes[right_level] = right_root->commit_root;
5415         extent_buffer_get(right_path->nodes[right_level]);
5416         up_read(&left_root->fs_info->commit_root_sem);
5417
5418         if (left_level == 0)
5419                 btrfs_item_key_to_cpu(left_path->nodes[left_level],
5420                                 &left_key, left_path->slots[left_level]);
5421         else
5422                 btrfs_node_key_to_cpu(left_path->nodes[left_level],
5423                                 &left_key, left_path->slots[left_level]);
5424         if (right_level == 0)
5425                 btrfs_item_key_to_cpu(right_path->nodes[right_level],
5426                                 &right_key, right_path->slots[right_level]);
5427         else
5428                 btrfs_node_key_to_cpu(right_path->nodes[right_level],
5429                                 &right_key, right_path->slots[right_level]);
5430
5431         left_end_reached = right_end_reached = 0;
5432         advance_left = advance_right = 0;
5433
5434         while (1) {
5435                 if (advance_left && !left_end_reached) {
5436                         ret = tree_advance(left_root, left_path, &left_level,
5437                                         left_root_level,
5438                                         advance_left != ADVANCE_ONLY_NEXT,
5439                                         &left_key);
5440                         if (ret < 0)
5441                                 left_end_reached = ADVANCE;
5442                         advance_left = 0;
5443                 }
5444                 if (advance_right && !right_end_reached) {
5445                         ret = tree_advance(right_root, right_path, &right_level,
5446                                         right_root_level,
5447                                         advance_right != ADVANCE_ONLY_NEXT,
5448                                         &right_key);
5449                         if (ret < 0)
5450                                 right_end_reached = ADVANCE;
5451                         advance_right = 0;
5452                 }
5453
5454                 if (left_end_reached && right_end_reached) {
5455                         ret = 0;
5456                         goto out;
5457                 } else if (left_end_reached) {
5458                         if (right_level == 0) {
5459                                 ret = changed_cb(left_root, right_root,
5460                                                 left_path, right_path,
5461                                                 &right_key,
5462                                                 BTRFS_COMPARE_TREE_DELETED,
5463                                                 ctx);
5464                                 if (ret < 0)
5465                                         goto out;
5466                         }
5467                         advance_right = ADVANCE;
5468                         continue;
5469                 } else if (right_end_reached) {
5470                         if (left_level == 0) {
5471                                 ret = changed_cb(left_root, right_root,
5472                                                 left_path, right_path,
5473                                                 &left_key,
5474                                                 BTRFS_COMPARE_TREE_NEW,
5475                                                 ctx);
5476                                 if (ret < 0)
5477                                         goto out;
5478                         }
5479                         advance_left = ADVANCE;
5480                         continue;
5481                 }
5482
5483                 if (left_level == 0 && right_level == 0) {
5484                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5485                         if (cmp < 0) {
5486                                 ret = changed_cb(left_root, right_root,
5487                                                 left_path, right_path,
5488                                                 &left_key,
5489                                                 BTRFS_COMPARE_TREE_NEW,
5490                                                 ctx);
5491                                 if (ret < 0)
5492                                         goto out;
5493                                 advance_left = ADVANCE;
5494                         } else if (cmp > 0) {
5495                                 ret = changed_cb(left_root, right_root,
5496                                                 left_path, right_path,
5497                                                 &right_key,
5498                                                 BTRFS_COMPARE_TREE_DELETED,
5499                                                 ctx);
5500                                 if (ret < 0)
5501                                         goto out;
5502                                 advance_right = ADVANCE;
5503                         } else {
5504                                 enum btrfs_compare_tree_result result;
5505
5506                                 WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
5507                                 ret = tree_compare_item(left_root, left_path,
5508                                                 right_path, tmp_buf);
5509                                 if (ret)
5510                                         result = BTRFS_COMPARE_TREE_CHANGED;
5511                                 else
5512                                         result = BTRFS_COMPARE_TREE_SAME;
5513                                 ret = changed_cb(left_root, right_root,
5514                                                  left_path, right_path,
5515                                                  &left_key, result, ctx);
5516                                 if (ret < 0)
5517                                         goto out;
5518                                 advance_left = ADVANCE;
5519                                 advance_right = ADVANCE;
5520                         }
5521                 } else if (left_level == right_level) {
5522                         cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
5523                         if (cmp < 0) {
5524                                 advance_left = ADVANCE;
5525                         } else if (cmp > 0) {
5526                                 advance_right = ADVANCE;
5527                         } else {
5528                                 left_blockptr = btrfs_node_blockptr(
5529                                                 left_path->nodes[left_level],
5530                                                 left_path->slots[left_level]);
5531                                 right_blockptr = btrfs_node_blockptr(
5532                                                 right_path->nodes[right_level],
5533                                                 right_path->slots[right_level]);
5534                                 left_gen = btrfs_node_ptr_generation(
5535                                                 left_path->nodes[left_level],
5536                                                 left_path->slots[left_level]);
5537                                 right_gen = btrfs_node_ptr_generation(
5538                                                 right_path->nodes[right_level],
5539                                                 right_path->slots[right_level]);
5540                                 if (left_blockptr == right_blockptr &&
5541                                     left_gen == right_gen) {
5542                                         /*
5543                                          * As we're on a shared block, don't
5544                                          * allow to go deeper.
5545                                          */
5546                                         advance_left = ADVANCE_ONLY_NEXT;
5547                                         advance_right = ADVANCE_ONLY_NEXT;
5548                                 } else {
5549                                         advance_left = ADVANCE;
5550                                         advance_right = ADVANCE;
5551                                 }
5552                         }
5553                 } else if (left_level < right_level) {
5554                         advance_right = ADVANCE;
5555                 } else {
5556                         advance_left = ADVANCE;
5557                 }
5558         }
5559
5560 out:
5561         btrfs_free_path(left_path);
5562         btrfs_free_path(right_path);
5563         kfree(tmp_buf);
5564         return ret;
5565 }
5566
5567 /*
5568  * this is similar to btrfs_next_leaf, but does not try to preserve
5569  * and fixup the path.  It looks for and returns the next key in the
5570  * tree based on the current path and the min_trans parameters.
5571  *
5572  * 0 is returned if another key is found, < 0 if there are any errors
5573  * and 1 is returned if there are no higher keys in the tree
5574  *
5575  * path->keep_locks should be set to 1 on the search made before
5576  * calling this function.
5577  */
5578 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5579                         struct btrfs_key *key, int level, u64 min_trans)
5580 {
5581         int slot;
5582         struct extent_buffer *c;
5583
5584         WARN_ON(!path->keep_locks);
5585         while (level < BTRFS_MAX_LEVEL) {
5586                 if (!path->nodes[level])
5587                         return 1;
5588
5589                 slot = path->slots[level] + 1;
5590                 c = path->nodes[level];
5591 next:
5592                 if (slot >= btrfs_header_nritems(c)) {
5593                         int ret;
5594                         int orig_lowest;
5595                         struct btrfs_key cur_key;
5596                         if (level + 1 >= BTRFS_MAX_LEVEL ||
5597                             !path->nodes[level + 1])
5598                                 return 1;
5599
5600                         if (path->locks[level + 1]) {
5601                                 level++;
5602                                 continue;
5603                         }
5604
5605                         slot = btrfs_header_nritems(c) - 1;
5606                         if (level == 0)
5607                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
5608                         else
5609                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
5610
5611                         orig_lowest = path->lowest_level;
5612                         btrfs_release_path(path);
5613                         path->lowest_level = level;
5614                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
5615                                                 0, 0);
5616                         path->lowest_level = orig_lowest;
5617                         if (ret < 0)
5618                                 return ret;
5619
5620                         c = path->nodes[level];
5621                         slot = path->slots[level];
5622                         if (ret == 0)
5623                                 slot++;
5624                         goto next;
5625                 }
5626
5627                 if (level == 0)
5628                         btrfs_item_key_to_cpu(c, key, slot);
5629                 else {
5630                         u64 gen = btrfs_node_ptr_generation(c, slot);
5631
5632                         if (gen < min_trans) {
5633                                 slot++;
5634                                 goto next;
5635                         }
5636                         btrfs_node_key_to_cpu(c, key, slot);
5637                 }
5638                 return 0;
5639         }
5640         return 1;
5641 }
5642
5643 /*
5644  * search the tree again to find a leaf with greater keys
5645  * returns 0 if it found something or 1 if there are no greater leaves.
5646  * returns < 0 on io errors.
5647  */
5648 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5649 {
5650         return btrfs_next_old_leaf(root, path, 0);
5651 }
5652
/*
 * Find the next leaf after the one path currently points to.  If time_seq
 * is non-zero the search replays the tree through the tree mod log at that
 * sequence number instead of using the live tree.
 *
 * Returns 0 if a next leaf was found, 1 if there is none, < 0 on error.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	/* An empty leaf means there is nothing after it. */
	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* Remember the last key of the current leaf; we re-search from it. */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leafs
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/* Walk up until we find a node with a slot to the right of ours. */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				/* Ran off the top of the tree: no next leaf. */
				ret = 1;
				goto done;
			}
			continue;
		}

		/* Drop the buffer held from a previous loop iteration. */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(NULL, root, path, &next, level,
					    slot, &key, 0);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				/* try-lock failed; take a blocking read lock */
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/* Walk back down to the leftmost leaf of the subtree at 'next'. */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		/* Hand the node (and its lock mode) over to the path. */
		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(NULL, root, path, &next, level,
					    0, &key, 0);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
				btrfs_clear_path_blocking(path, next,
							  BTRFS_READ_LOCK);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	/* Drop all locks above the leaf and restore the spinning setting. */
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5825
5826 /*
5827  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5828  * searching until it gets past min_objectid or finds an item of 'type'
5829  *
5830  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5831  */
5832 int btrfs_previous_item(struct btrfs_root *root,
5833                         struct btrfs_path *path, u64 min_objectid,
5834                         int type)
5835 {
5836         struct btrfs_key found_key;
5837         struct extent_buffer *leaf;
5838         u32 nritems;
5839         int ret;
5840
5841         while (1) {
5842                 if (path->slots[0] == 0) {
5843                         btrfs_set_path_blocking(path);
5844                         ret = btrfs_prev_leaf(root, path);
5845                         if (ret != 0)
5846                                 return ret;
5847                 } else {
5848                         path->slots[0]--;
5849                 }
5850                 leaf = path->nodes[0];
5851                 nritems = btrfs_header_nritems(leaf);
5852                 if (nritems == 0)
5853                         return 1;
5854                 if (path->slots[0] == nritems)
5855                         path->slots[0]--;
5856
5857                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5858                 if (found_key.objectid < min_objectid)
5859                         break;
5860                 if (found_key.type == type)
5861                         return 0;
5862                 if (found_key.objectid == min_objectid &&
5863                     found_key.type < type)
5864                         break;
5865         }
5866         return 1;
5867 }
5868
5869 /*
5870  * search in extent tree to find a previous Metadata/Data extent item with
 * min objectid.
5872  *
5873  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5874  */
5875 int btrfs_previous_extent_item(struct btrfs_root *root,
5876                         struct btrfs_path *path, u64 min_objectid)
5877 {
5878         struct btrfs_key found_key;
5879         struct extent_buffer *leaf;
5880         u32 nritems;
5881         int ret;
5882
5883         while (1) {
5884                 if (path->slots[0] == 0) {
5885                         btrfs_set_path_blocking(path);
5886                         ret = btrfs_prev_leaf(root, path);
5887                         if (ret != 0)
5888                                 return ret;
5889                 } else {
5890                         path->slots[0]--;
5891                 }
5892                 leaf = path->nodes[0];
5893                 nritems = btrfs_header_nritems(leaf);
5894                 if (nritems == 0)
5895                         return 1;
5896                 if (path->slots[0] == nritems)
5897                         path->slots[0]--;
5898
5899                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5900                 if (found_key.objectid < min_objectid)
5901                         break;
5902                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5903                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5904                         return 0;
5905                 if (found_key.objectid == min_objectid &&
5906                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5907                         break;
5908         }
5909         return 1;
5910 }