Add the rt linux 4.1.3-rt3 as base
[kvmfornfv.git] / kernel / fs / xfs / libxfs / xfs_btree.c
1 /*
2  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_shared.h"
21 #include "xfs_format.h"
22 #include "xfs_log_format.h"
23 #include "xfs_trans_resv.h"
24 #include "xfs_bit.h"
25 #include "xfs_mount.h"
26 #include "xfs_inode.h"
27 #include "xfs_trans.h"
28 #include "xfs_inode_item.h"
29 #include "xfs_buf_item.h"
30 #include "xfs_btree.h"
31 #include "xfs_error.h"
32 #include "xfs_trace.h"
33 #include "xfs_cksum.h"
34 #include "xfs_alloc.h"
35
/*
 * Cursor allocation zone.  Btree cursors are handed back to this zone
 * with kmem_zone_free() when they are deleted (xfs_btree_del_cursor()).
 */
kmem_zone_t     *xfs_btree_cur_zone;
40
41 /*
42  * Btree magic numbers.
43  */
44 static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
45         { XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
46           XFS_FIBT_MAGIC },
47         { XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC,
48           XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
49 };
50 #define xfs_btree_magic(cur) \
51         xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
52
53
54 STATIC int                              /* error (0 or EFSCORRUPTED) */
55 xfs_btree_check_lblock(
56         struct xfs_btree_cur    *cur,   /* btree cursor */
57         struct xfs_btree_block  *block, /* btree long form block pointer */
58         int                     level,  /* level of the btree block */
59         struct xfs_buf          *bp)    /* buffer for block, if any */
60 {
61         int                     lblock_ok = 1; /* block passes checks */
62         struct xfs_mount        *mp;    /* file system mount point */
63
64         mp = cur->bc_mp;
65
66         if (xfs_sb_version_hascrc(&mp->m_sb)) {
67                 lblock_ok = lblock_ok &&
68                         uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid) &&
69                         block->bb_u.l.bb_blkno == cpu_to_be64(
70                                 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
71         }
72
73         lblock_ok = lblock_ok &&
74                 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
75                 be16_to_cpu(block->bb_level) == level &&
76                 be16_to_cpu(block->bb_numrecs) <=
77                         cur->bc_ops->get_maxrecs(cur, level) &&
78                 block->bb_u.l.bb_leftsib &&
79                 (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK) ||
80                  XFS_FSB_SANITY_CHECK(mp,
81                         be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
82                 block->bb_u.l.bb_rightsib &&
83                 (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK) ||
84                  XFS_FSB_SANITY_CHECK(mp,
85                         be64_to_cpu(block->bb_u.l.bb_rightsib)));
86
87         if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
88                         XFS_ERRTAG_BTREE_CHECK_LBLOCK,
89                         XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
90                 if (bp)
91                         trace_xfs_btree_corrupt(bp, _RET_IP_);
92                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
93                 return -EFSCORRUPTED;
94         }
95         return 0;
96 }
97
98 STATIC int                              /* error (0 or EFSCORRUPTED) */
99 xfs_btree_check_sblock(
100         struct xfs_btree_cur    *cur,   /* btree cursor */
101         struct xfs_btree_block  *block, /* btree short form block pointer */
102         int                     level,  /* level of the btree block */
103         struct xfs_buf          *bp)    /* buffer containing block */
104 {
105         struct xfs_mount        *mp;    /* file system mount point */
106         struct xfs_buf          *agbp;  /* buffer for ag. freespace struct */
107         struct xfs_agf          *agf;   /* ag. freespace structure */
108         xfs_agblock_t           agflen; /* native ag. freespace length */
109         int                     sblock_ok = 1; /* block passes checks */
110
111         mp = cur->bc_mp;
112         agbp = cur->bc_private.a.agbp;
113         agf = XFS_BUF_TO_AGF(agbp);
114         agflen = be32_to_cpu(agf->agf_length);
115
116         if (xfs_sb_version_hascrc(&mp->m_sb)) {
117                 sblock_ok = sblock_ok &&
118                         uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid) &&
119                         block->bb_u.s.bb_blkno == cpu_to_be64(
120                                 bp ? bp->b_bn : XFS_BUF_DADDR_NULL);
121         }
122
123         sblock_ok = sblock_ok &&
124                 be32_to_cpu(block->bb_magic) == xfs_btree_magic(cur) &&
125                 be16_to_cpu(block->bb_level) == level &&
126                 be16_to_cpu(block->bb_numrecs) <=
127                         cur->bc_ops->get_maxrecs(cur, level) &&
128                 (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
129                  be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
130                 block->bb_u.s.bb_leftsib &&
131                 (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
132                  be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
133                 block->bb_u.s.bb_rightsib;
134
135         if (unlikely(XFS_TEST_ERROR(!sblock_ok, mp,
136                         XFS_ERRTAG_BTREE_CHECK_SBLOCK,
137                         XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
138                 if (bp)
139                         trace_xfs_btree_corrupt(bp, _RET_IP_);
140                 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, mp);
141                 return -EFSCORRUPTED;
142         }
143         return 0;
144 }
145
146 /*
147  * Debug routine: check that block header is ok.
148  */
149 int
150 xfs_btree_check_block(
151         struct xfs_btree_cur    *cur,   /* btree cursor */
152         struct xfs_btree_block  *block, /* generic btree block pointer */
153         int                     level,  /* level of the btree block */
154         struct xfs_buf          *bp)    /* buffer containing block, if any */
155 {
156         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
157                 return xfs_btree_check_lblock(cur, block, level, bp);
158         else
159                 return xfs_btree_check_sblock(cur, block, level, bp);
160 }
161
162 /*
163  * Check that (long) pointer is ok.
164  */
165 int                                     /* error (0 or EFSCORRUPTED) */
166 xfs_btree_check_lptr(
167         struct xfs_btree_cur    *cur,   /* btree cursor */
168         xfs_fsblock_t           bno,    /* btree block disk address */
169         int                     level)  /* btree block level */
170 {
171         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
172                 level > 0 &&
173                 bno != NULLFSBLOCK &&
174                 XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
175         return 0;
176 }
177
178 #ifdef DEBUG
179 /*
180  * Check that (short) pointer is ok.
181  */
182 STATIC int                              /* error (0 or EFSCORRUPTED) */
183 xfs_btree_check_sptr(
184         struct xfs_btree_cur    *cur,   /* btree cursor */
185         xfs_agblock_t           bno,    /* btree block disk address */
186         int                     level)  /* btree block level */
187 {
188         xfs_agblock_t           agblocks = cur->bc_mp->m_sb.sb_agblocks;
189
190         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp,
191                 level > 0 &&
192                 bno != NULLAGBLOCK &&
193                 bno != 0 &&
194                 bno < agblocks);
195         return 0;
196 }
197
198 /*
199  * Check that block ptr is ok.
200  */
201 STATIC int                              /* error (0 or EFSCORRUPTED) */
202 xfs_btree_check_ptr(
203         struct xfs_btree_cur    *cur,   /* btree cursor */
204         union xfs_btree_ptr     *ptr,   /* btree block disk address */
205         int                     index,  /* offset from ptr to check */
206         int                     level)  /* btree block level */
207 {
208         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
209                 return xfs_btree_check_lptr(cur,
210                                 be64_to_cpu((&ptr->l)[index]), level);
211         } else {
212                 return xfs_btree_check_sptr(cur,
213                                 be32_to_cpu((&ptr->s)[index]), level);
214         }
215 }
216 #endif
217
218 /*
219  * Calculate CRC on the whole btree block and stuff it into the
220  * long-form btree header.
221  *
222  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
223  * it into the buffer so recovery knows what the last modifcation was that made
224  * it to disk.
225  */
226 void
227 xfs_btree_lblock_calc_crc(
228         struct xfs_buf          *bp)
229 {
230         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
231         struct xfs_buf_log_item *bip = bp->b_fspriv;
232
233         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
234                 return;
235         if (bip)
236                 block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
237         xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
238 }
239
240 bool
241 xfs_btree_lblock_verify_crc(
242         struct xfs_buf          *bp)
243 {
244         if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
245                 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
246
247         return true;
248 }
249
250 /*
251  * Calculate CRC on the whole btree block and stuff it into the
252  * short-form btree header.
253  *
254  * Prior to calculting the CRC, pull the LSN out of the buffer log item and put
255  * it into the buffer so recovery knows what the last modifcation was that made
256  * it to disk.
257  */
258 void
259 xfs_btree_sblock_calc_crc(
260         struct xfs_buf          *bp)
261 {
262         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
263         struct xfs_buf_log_item *bip = bp->b_fspriv;
264
265         if (!xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
266                 return;
267         if (bip)
268                 block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
269         xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
270 }
271
272 bool
273 xfs_btree_sblock_verify_crc(
274         struct xfs_buf          *bp)
275 {
276         if (xfs_sb_version_hascrc(&bp->b_target->bt_mount->m_sb))
277                 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
278
279         return true;
280 }
281
282 /*
283  * Delete the btree cursor.
284  */
285 void
286 xfs_btree_del_cursor(
287         xfs_btree_cur_t *cur,           /* btree cursor */
288         int             error)          /* del because of error */
289 {
290         int             i;              /* btree level */
291
292         /*
293          * Clear the buffer pointers, and release the buffers.
294          * If we're doing this in the face of an error, we
295          * need to make sure to inspect all of the entries
296          * in the bc_bufs array for buffers to be unlocked.
297          * This is because some of the btree code works from
298          * level n down to 0, and if we get an error along
299          * the way we won't have initialized all the entries
300          * down to 0.
301          */
302         for (i = 0; i < cur->bc_nlevels; i++) {
303                 if (cur->bc_bufs[i])
304                         xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[i]);
305                 else if (!error)
306                         break;
307         }
308         /*
309          * Can't free a bmap cursor without having dealt with the
310          * allocated indirect blocks' accounting.
311          */
312         ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
313                cur->bc_private.b.allocated == 0);
314         /*
315          * Free the cursor.
316          */
317         kmem_zone_free(xfs_btree_cur_zone, cur);
318 }
319
320 /*
321  * Duplicate the btree cursor.
322  * Allocate a new one, copy the record, re-get the buffers.
323  */
324 int                                     /* error */
325 xfs_btree_dup_cursor(
326         xfs_btree_cur_t *cur,           /* input cursor */
327         xfs_btree_cur_t **ncur)         /* output cursor */
328 {
329         xfs_buf_t       *bp;            /* btree block's buffer pointer */
330         int             error;          /* error return value */
331         int             i;              /* level number of btree block */
332         xfs_mount_t     *mp;            /* mount structure for filesystem */
333         xfs_btree_cur_t *new;           /* new cursor value */
334         xfs_trans_t     *tp;            /* transaction pointer, can be NULL */
335
336         tp = cur->bc_tp;
337         mp = cur->bc_mp;
338
339         /*
340          * Allocate a new cursor like the old one.
341          */
342         new = cur->bc_ops->dup_cursor(cur);
343
344         /*
345          * Copy the record currently in the cursor.
346          */
347         new->bc_rec = cur->bc_rec;
348
349         /*
350          * For each level current, re-get the buffer and copy the ptr value.
351          */
352         for (i = 0; i < new->bc_nlevels; i++) {
353                 new->bc_ptrs[i] = cur->bc_ptrs[i];
354                 new->bc_ra[i] = cur->bc_ra[i];
355                 bp = cur->bc_bufs[i];
356                 if (bp) {
357                         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
358                                                    XFS_BUF_ADDR(bp), mp->m_bsize,
359                                                    0, &bp,
360                                                    cur->bc_ops->buf_ops);
361                         if (error) {
362                                 xfs_btree_del_cursor(new, error);
363                                 *ncur = NULL;
364                                 return error;
365                         }
366                 }
367                 new->bc_bufs[i] = bp;
368         }
369         *ncur = new;
370         return 0;
371 }
372
373 /*
374  * XFS btree block layout and addressing:
375  *
376  * There are two types of blocks in the btree: leaf and non-leaf blocks.
377  *
 * A leaf block starts with a header, followed by records containing
 * the values.  A non-leaf block also starts with the same header, and
 * then first contains lookup keys followed by an equal number of pointers
 * to the btree blocks at the previous level.
382  *
383  *              +--------+-------+-------+-------+-------+-------+-------+
384  * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
385  *              +--------+-------+-------+-------+-------+-------+-------+
386  *
387  *              +--------+-------+-------+-------+-------+-------+-------+
388  * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
389  *              +--------+-------+-------+-------+-------+-------+-------+
390  *
391  * The header is called struct xfs_btree_block for reasons better left unknown
392  * and comes in different versions for short (32bit) and long (64bit) block
393  * pointers.  The record and key structures are defined by the btree instances
394  * and opaque to the btree core.  The block pointers are simple disk endian
395  * integers, available in a short (32bit) and long (64bit) variant.
396  *
397  * The helpers below calculate the offset of a given record, key or pointer
398  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
399  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
400  * inside the btree block is done using indices starting at one, not zero!
401  */
402
403 /*
404  * Return size of the btree block header for this btree instance.
405  */
406 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
407 {
408         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
409                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
410                         return XFS_BTREE_LBLOCK_CRC_LEN;
411                 return XFS_BTREE_LBLOCK_LEN;
412         }
413         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS)
414                 return XFS_BTREE_SBLOCK_CRC_LEN;
415         return XFS_BTREE_SBLOCK_LEN;
416 }
417
418 /*
419  * Return size of btree block pointers for this btree instance.
420  */
421 static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
422 {
423         return (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
424                 sizeof(__be64) : sizeof(__be32);
425 }
426
427 /*
428  * Calculate offset of the n-th record in a btree block.
429  */
430 STATIC size_t
431 xfs_btree_rec_offset(
432         struct xfs_btree_cur    *cur,
433         int                     n)
434 {
435         return xfs_btree_block_len(cur) +
436                 (n - 1) * cur->bc_ops->rec_len;
437 }
438
439 /*
440  * Calculate offset of the n-th key in a btree block.
441  */
442 STATIC size_t
443 xfs_btree_key_offset(
444         struct xfs_btree_cur    *cur,
445         int                     n)
446 {
447         return xfs_btree_block_len(cur) +
448                 (n - 1) * cur->bc_ops->key_len;
449 }
450
451 /*
452  * Calculate offset of the n-th block pointer in a btree block.
453  */
454 STATIC size_t
455 xfs_btree_ptr_offset(
456         struct xfs_btree_cur    *cur,
457         int                     n,
458         int                     level)
459 {
460         return xfs_btree_block_len(cur) +
461                 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
462                 (n - 1) * xfs_btree_ptr_len(cur);
463 }
464
465 /*
466  * Return a pointer to the n-th record in the btree block.
467  */
468 STATIC union xfs_btree_rec *
469 xfs_btree_rec_addr(
470         struct xfs_btree_cur    *cur,
471         int                     n,
472         struct xfs_btree_block  *block)
473 {
474         return (union xfs_btree_rec *)
475                 ((char *)block + xfs_btree_rec_offset(cur, n));
476 }
477
478 /*
479  * Return a pointer to the n-th key in the btree block.
480  */
481 STATIC union xfs_btree_key *
482 xfs_btree_key_addr(
483         struct xfs_btree_cur    *cur,
484         int                     n,
485         struct xfs_btree_block  *block)
486 {
487         return (union xfs_btree_key *)
488                 ((char *)block + xfs_btree_key_offset(cur, n));
489 }
490
491 /*
492  * Return a pointer to the n-th block pointer in the btree block.
493  */
494 STATIC union xfs_btree_ptr *
495 xfs_btree_ptr_addr(
496         struct xfs_btree_cur    *cur,
497         int                     n,
498         struct xfs_btree_block  *block)
499 {
500         int                     level = xfs_btree_get_level(block);
501
502         ASSERT(block->bb_level != 0);
503
504         return (union xfs_btree_ptr *)
505                 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
506 }
507
508 /*
509  * Get the root block which is stored in the inode.
510  *
511  * For now this btree implementation assumes the btree root is always
512  * stored in the if_broot field of an inode fork.
513  */
514 STATIC struct xfs_btree_block *
515 xfs_btree_get_iroot(
516        struct xfs_btree_cur    *cur)
517 {
518        struct xfs_ifork        *ifp;
519
520        ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, cur->bc_private.b.whichfork);
521        return (struct xfs_btree_block *)ifp->if_broot;
522 }
523
524 /*
525  * Retrieve the block pointer from the cursor at the given level.
526  * This may be an inode btree root or from a buffer.
527  */
528 STATIC struct xfs_btree_block *         /* generic btree block pointer */
529 xfs_btree_get_block(
530         struct xfs_btree_cur    *cur,   /* btree cursor */
531         int                     level,  /* level in btree */
532         struct xfs_buf          **bpp)  /* buffer containing the block */
533 {
534         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
535             (level == cur->bc_nlevels - 1)) {
536                 *bpp = NULL;
537                 return xfs_btree_get_iroot(cur);
538         }
539
540         *bpp = cur->bc_bufs[level];
541         return XFS_BUF_TO_BLOCK(*bpp);
542 }
543
544 /*
545  * Get a buffer for the block, return it with no data read.
546  * Long-form addressing.
547  */
548 xfs_buf_t *                             /* buffer for fsbno */
549 xfs_btree_get_bufl(
550         xfs_mount_t     *mp,            /* file system mount point */
551         xfs_trans_t     *tp,            /* transaction pointer */
552         xfs_fsblock_t   fsbno,          /* file system block number */
553         uint            lock)           /* lock flags for get_buf */
554 {
555         xfs_daddr_t             d;              /* real disk block address */
556
557         ASSERT(fsbno != NULLFSBLOCK);
558         d = XFS_FSB_TO_DADDR(mp, fsbno);
559         return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
560 }
561
562 /*
563  * Get a buffer for the block, return it with no data read.
564  * Short-form addressing.
565  */
566 xfs_buf_t *                             /* buffer for agno/agbno */
567 xfs_btree_get_bufs(
568         xfs_mount_t     *mp,            /* file system mount point */
569         xfs_trans_t     *tp,            /* transaction pointer */
570         xfs_agnumber_t  agno,           /* allocation group number */
571         xfs_agblock_t   agbno,          /* allocation group block number */
572         uint            lock)           /* lock flags for get_buf */
573 {
574         xfs_daddr_t             d;              /* real disk block address */
575
576         ASSERT(agno != NULLAGNUMBER);
577         ASSERT(agbno != NULLAGBLOCK);
578         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
579         return xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize, lock);
580 }
581
582 /*
583  * Check for the cursor referring to the last block at the given level.
584  */
585 int                                     /* 1=is last block, 0=not last block */
586 xfs_btree_islastblock(
587         xfs_btree_cur_t         *cur,   /* btree cursor */
588         int                     level)  /* level to check */
589 {
590         struct xfs_btree_block  *block; /* generic btree block pointer */
591         xfs_buf_t               *bp;    /* buffer containing block */
592
593         block = xfs_btree_get_block(cur, level, &bp);
594         xfs_btree_check_block(cur, block, level, bp);
595         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
596                 return block->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK);
597         else
598                 return block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK);
599 }
600
601 /*
602  * Change the cursor to point to the first record at the given level.
603  * Other levels are unaffected.
604  */
605 STATIC int                              /* success=1, failure=0 */
606 xfs_btree_firstrec(
607         xfs_btree_cur_t         *cur,   /* btree cursor */
608         int                     level)  /* level to change */
609 {
610         struct xfs_btree_block  *block; /* generic btree block pointer */
611         xfs_buf_t               *bp;    /* buffer containing block */
612
613         /*
614          * Get the block pointer for this level.
615          */
616         block = xfs_btree_get_block(cur, level, &bp);
617         xfs_btree_check_block(cur, block, level, bp);
618         /*
619          * It's empty, there is no such record.
620          */
621         if (!block->bb_numrecs)
622                 return 0;
623         /*
624          * Set the ptr value to 1, that's the first record/key.
625          */
626         cur->bc_ptrs[level] = 1;
627         return 1;
628 }
629
630 /*
631  * Change the cursor to point to the last record in the current block
632  * at the given level.  Other levels are unaffected.
633  */
634 STATIC int                              /* success=1, failure=0 */
635 xfs_btree_lastrec(
636         xfs_btree_cur_t         *cur,   /* btree cursor */
637         int                     level)  /* level to change */
638 {
639         struct xfs_btree_block  *block; /* generic btree block pointer */
640         xfs_buf_t               *bp;    /* buffer containing block */
641
642         /*
643          * Get the block pointer for this level.
644          */
645         block = xfs_btree_get_block(cur, level, &bp);
646         xfs_btree_check_block(cur, block, level, bp);
647         /*
648          * It's empty, there is no such record.
649          */
650         if (!block->bb_numrecs)
651                 return 0;
652         /*
653          * Set the ptr value to numrecs, that's the last record/key.
654          */
655         cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
656         return 1;
657 }
658
/*
 * Compute first and last byte offsets for the fields given.
 * Interprets the offsets table, which contains struct field offsets.
 *
 * Bit i of @fields selects the field that starts at offsets[i] and ends
 * just before offsets[i + 1].  *first receives the start of the lowest
 * selected field and *last the final byte of the highest selected field.
 *
 * Only bits 0 .. nbits - 1 are inspected, and both scans are bounded by
 * @nbits.  If none of those bits is set the caller has violated the
 * contract; the function then returns without writing *first or *last
 * instead of scanning without end.
 */
void
xfs_btree_offsets(
	__int64_t	fields,		/* bitmask of fields */
	const short	*offsets,	/* table of field offsets */
	int		nbits,		/* number of bits to inspect */
	int		*first,		/* output: first byte offset */
	int		*last)		/* output: last byte offset */
{
	/* Scan an unsigned copy so no signed value is ever shifted. */
	__uint64_t	mask = (__uint64_t)fields;
	int		i;		/* current bit number */

	ASSERT(fields != 0);
	/*
	 * Find the lowest bit, so the first byte offset.
	 */
	for (i = 0; i < nbits; i++) {
		if (mask & (1ULL << i)) {
			*first = offsets[i];
			break;
		}
	}
	/*
	 * No field bit within range: nothing to report.
	 */
	ASSERT(i < nbits);
	if (i >= nbits)
		return;
	/*
	 * Find the highest bit, so the last byte offset.  The scan above
	 * guarantees that this loop finds a bit before i drops below zero.
	 */
	for (i = nbits - 1; i >= 0; i--) {
		if (mask & (1ULL << i)) {
			*last = offsets[i + 1] - 1;
			break;
		}
	}
}
694
695 /*
696  * Get a buffer for the block, return it read in.
697  * Long-form addressing.
698  */
699 int
700 xfs_btree_read_bufl(
701         struct xfs_mount        *mp,            /* file system mount point */
702         struct xfs_trans        *tp,            /* transaction pointer */
703         xfs_fsblock_t           fsbno,          /* file system block number */
704         uint                    lock,           /* lock flags for read_buf */
705         struct xfs_buf          **bpp,          /* buffer for fsbno */
706         int                     refval,         /* ref count value for buffer */
707         const struct xfs_buf_ops *ops)
708 {
709         struct xfs_buf          *bp;            /* return value */
710         xfs_daddr_t             d;              /* real disk block address */
711         int                     error;
712
713         ASSERT(fsbno != NULLFSBLOCK);
714         d = XFS_FSB_TO_DADDR(mp, fsbno);
715         error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
716                                    mp->m_bsize, lock, &bp, ops);
717         if (error)
718                 return error;
719         if (bp)
720                 xfs_buf_set_ref(bp, refval);
721         *bpp = bp;
722         return 0;
723 }
724
725 /*
726  * Read-ahead the block, don't wait for it, don't return a buffer.
727  * Long-form addressing.
728  */
729 /* ARGSUSED */
730 void
731 xfs_btree_reada_bufl(
732         struct xfs_mount        *mp,            /* file system mount point */
733         xfs_fsblock_t           fsbno,          /* file system block number */
734         xfs_extlen_t            count,          /* count of filesystem blocks */
735         const struct xfs_buf_ops *ops)
736 {
737         xfs_daddr_t             d;
738
739         ASSERT(fsbno != NULLFSBLOCK);
740         d = XFS_FSB_TO_DADDR(mp, fsbno);
741         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
742 }
743
744 /*
745  * Read-ahead the block, don't wait for it, don't return a buffer.
746  * Short-form addressing.
747  */
748 /* ARGSUSED */
749 void
750 xfs_btree_reada_bufs(
751         struct xfs_mount        *mp,            /* file system mount point */
752         xfs_agnumber_t          agno,           /* allocation group number */
753         xfs_agblock_t           agbno,          /* allocation group block number */
754         xfs_extlen_t            count,          /* count of filesystem blocks */
755         const struct xfs_buf_ops *ops)
756 {
757         xfs_daddr_t             d;
758
759         ASSERT(agno != NULLAGNUMBER);
760         ASSERT(agbno != NULLAGBLOCK);
761         d = XFS_AGB_TO_DADDR(mp, agno, agbno);
762         xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
763 }
764
765 STATIC int
766 xfs_btree_readahead_lblock(
767         struct xfs_btree_cur    *cur,
768         int                     lr,
769         struct xfs_btree_block  *block)
770 {
771         int                     rval = 0;
772         xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
773         xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
774
775         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
776                 xfs_btree_reada_bufl(cur->bc_mp, left, 1,
777                                      cur->bc_ops->buf_ops);
778                 rval++;
779         }
780
781         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
782                 xfs_btree_reada_bufl(cur->bc_mp, right, 1,
783                                      cur->bc_ops->buf_ops);
784                 rval++;
785         }
786
787         return rval;
788 }
789
790 STATIC int
791 xfs_btree_readahead_sblock(
792         struct xfs_btree_cur    *cur,
793         int                     lr,
794         struct xfs_btree_block *block)
795 {
796         int                     rval = 0;
797         xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
798         xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
799
800
801         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
802                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
803                                      left, 1, cur->bc_ops->buf_ops);
804                 rval++;
805         }
806
807         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
808                 xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
809                                      right, 1, cur->bc_ops->buf_ops);
810                 rval++;
811         }
812
813         return rval;
814 }
815
816 /*
817  * Read-ahead btree blocks, at the given level.
818  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
819  */
820 STATIC int
821 xfs_btree_readahead(
822         struct xfs_btree_cur    *cur,           /* btree cursor */
823         int                     lev,            /* level in btree */
824         int                     lr)             /* left/right bits */
825 {
826         struct xfs_btree_block  *block;
827
828         /*
829          * No readahead needed if we are at the root level and the
830          * btree root is stored in the inode.
831          */
832         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
833             (lev == cur->bc_nlevels - 1))
834                 return 0;
835
836         if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
837                 return 0;
838
839         cur->bc_ra[lev] |= lr;
840         block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
841
842         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
843                 return xfs_btree_readahead_lblock(cur, lr, block);
844         return xfs_btree_readahead_sblock(cur, lr, block);
845 }
846
847 STATIC xfs_daddr_t
848 xfs_btree_ptr_to_daddr(
849         struct xfs_btree_cur    *cur,
850         union xfs_btree_ptr     *ptr)
851 {
852         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
853                 ASSERT(ptr->l != cpu_to_be64(NULLFSBLOCK));
854
855                 return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
856         } else {
857                 ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
858                 ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
859
860                 return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
861                                         be32_to_cpu(ptr->s));
862         }
863 }
864
865 /*
866  * Readahead @count btree blocks at the given @ptr location.
867  *
868  * We don't need to care about long or short form btrees here as we have a
869  * method of converting the ptr directly to a daddr available to us.
870  */
871 STATIC void
872 xfs_btree_readahead_ptr(
873         struct xfs_btree_cur    *cur,
874         union xfs_btree_ptr     *ptr,
875         xfs_extlen_t            count)
876 {
877         xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
878                           xfs_btree_ptr_to_daddr(cur, ptr),
879                           cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
880 }
881
882 /*
883  * Set the buffer for level "lev" in the cursor to bp, releasing
884  * any previous buffer.
885  */
886 STATIC void
887 xfs_btree_setbuf(
888         xfs_btree_cur_t         *cur,   /* btree cursor */
889         int                     lev,    /* level in btree */
890         xfs_buf_t               *bp)    /* new buffer to set */
891 {
892         struct xfs_btree_block  *b;     /* btree block */
893
894         if (cur->bc_bufs[lev])
895                 xfs_trans_brelse(cur->bc_tp, cur->bc_bufs[lev]);
896         cur->bc_bufs[lev] = bp;
897         cur->bc_ra[lev] = 0;
898
899         b = XFS_BUF_TO_BLOCK(bp);
900         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
901                 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
902                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
903                 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
904                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
905         } else {
906                 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
907                         cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
908                 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
909                         cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
910         }
911 }
912
913 STATIC int
914 xfs_btree_ptr_is_null(
915         struct xfs_btree_cur    *cur,
916         union xfs_btree_ptr     *ptr)
917 {
918         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
919                 return ptr->l == cpu_to_be64(NULLFSBLOCK);
920         else
921                 return ptr->s == cpu_to_be32(NULLAGBLOCK);
922 }
923
924 STATIC void
925 xfs_btree_set_ptr_null(
926         struct xfs_btree_cur    *cur,
927         union xfs_btree_ptr     *ptr)
928 {
929         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
930                 ptr->l = cpu_to_be64(NULLFSBLOCK);
931         else
932                 ptr->s = cpu_to_be32(NULLAGBLOCK);
933 }
934
935 /*
936  * Get/set/init sibling pointers
937  */
938 STATIC void
939 xfs_btree_get_sibling(
940         struct xfs_btree_cur    *cur,
941         struct xfs_btree_block  *block,
942         union xfs_btree_ptr     *ptr,
943         int                     lr)
944 {
945         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
946
947         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
948                 if (lr == XFS_BB_RIGHTSIB)
949                         ptr->l = block->bb_u.l.bb_rightsib;
950                 else
951                         ptr->l = block->bb_u.l.bb_leftsib;
952         } else {
953                 if (lr == XFS_BB_RIGHTSIB)
954                         ptr->s = block->bb_u.s.bb_rightsib;
955                 else
956                         ptr->s = block->bb_u.s.bb_leftsib;
957         }
958 }
959
960 STATIC void
961 xfs_btree_set_sibling(
962         struct xfs_btree_cur    *cur,
963         struct xfs_btree_block  *block,
964         union xfs_btree_ptr     *ptr,
965         int                     lr)
966 {
967         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
968
969         if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
970                 if (lr == XFS_BB_RIGHTSIB)
971                         block->bb_u.l.bb_rightsib = ptr->l;
972                 else
973                         block->bb_u.l.bb_leftsib = ptr->l;
974         } else {
975                 if (lr == XFS_BB_RIGHTSIB)
976                         block->bb_u.s.bb_rightsib = ptr->s;
977                 else
978                         block->bb_u.s.bb_leftsib = ptr->s;
979         }
980 }
981
982 void
983 xfs_btree_init_block_int(
984         struct xfs_mount        *mp,
985         struct xfs_btree_block  *buf,
986         xfs_daddr_t             blkno,
987         __u32                   magic,
988         __u16                   level,
989         __u16                   numrecs,
990         __u64                   owner,
991         unsigned int            flags)
992 {
993         buf->bb_magic = cpu_to_be32(magic);
994         buf->bb_level = cpu_to_be16(level);
995         buf->bb_numrecs = cpu_to_be16(numrecs);
996
997         if (flags & XFS_BTREE_LONG_PTRS) {
998                 buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
999                 buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
1000                 if (flags & XFS_BTREE_CRC_BLOCKS) {
1001                         buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
1002                         buf->bb_u.l.bb_owner = cpu_to_be64(owner);
1003                         uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_uuid);
1004                         buf->bb_u.l.bb_pad = 0;
1005                         buf->bb_u.l.bb_lsn = 0;
1006                 }
1007         } else {
1008                 /* owner is a 32 bit value on short blocks */
1009                 __u32 __owner = (__u32)owner;
1010
1011                 buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
1012                 buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
1013                 if (flags & XFS_BTREE_CRC_BLOCKS) {
1014                         buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
1015                         buf->bb_u.s.bb_owner = cpu_to_be32(__owner);
1016                         uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_uuid);
1017                         buf->bb_u.s.bb_lsn = 0;
1018                 }
1019         }
1020 }
1021
1022 void
1023 xfs_btree_init_block(
1024         struct xfs_mount *mp,
1025         struct xfs_buf  *bp,
1026         __u32           magic,
1027         __u16           level,
1028         __u16           numrecs,
1029         __u64           owner,
1030         unsigned int    flags)
1031 {
1032         xfs_btree_init_block_int(mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1033                                  magic, level, numrecs, owner, flags);
1034 }
1035
1036 STATIC void
1037 xfs_btree_init_block_cur(
1038         struct xfs_btree_cur    *cur,
1039         struct xfs_buf          *bp,
1040         int                     level,
1041         int                     numrecs)
1042 {
1043         __u64 owner;
1044
1045         /*
1046          * we can pull the owner from the cursor right now as the different
1047          * owners align directly with the pointer size of the btree. This may
1048          * change in future, but is safe for current users of the generic btree
1049          * code.
1050          */
1051         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1052                 owner = cur->bc_private.b.ip->i_ino;
1053         else
1054                 owner = cur->bc_private.a.agno;
1055
1056         xfs_btree_init_block_int(cur->bc_mp, XFS_BUF_TO_BLOCK(bp), bp->b_bn,
1057                                  xfs_btree_magic(cur), level, numrecs,
1058                                  owner, cur->bc_flags);
1059 }
1060
1061 /*
1062  * Return true if ptr is the last record in the btree and
1063  * we need to track updates to this record.  The decision
1064  * will be further refined in the update_lastrec method.
1065  */
1066 STATIC int
1067 xfs_btree_is_lastrec(
1068         struct xfs_btree_cur    *cur,
1069         struct xfs_btree_block  *block,
1070         int                     level)
1071 {
1072         union xfs_btree_ptr     ptr;
1073
1074         if (level > 0)
1075                 return 0;
1076         if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
1077                 return 0;
1078
1079         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1080         if (!xfs_btree_ptr_is_null(cur, &ptr))
1081                 return 0;
1082         return 1;
1083 }
1084
1085 STATIC void
1086 xfs_btree_buf_to_ptr(
1087         struct xfs_btree_cur    *cur,
1088         struct xfs_buf          *bp,
1089         union xfs_btree_ptr     *ptr)
1090 {
1091         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
1092                 ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
1093                                         XFS_BUF_ADDR(bp)));
1094         else {
1095                 ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
1096                                         XFS_BUF_ADDR(bp)));
1097         }
1098 }
1099
1100 STATIC void
1101 xfs_btree_set_refs(
1102         struct xfs_btree_cur    *cur,
1103         struct xfs_buf          *bp)
1104 {
1105         switch (cur->bc_btnum) {
1106         case XFS_BTNUM_BNO:
1107         case XFS_BTNUM_CNT:
1108                 xfs_buf_set_ref(bp, XFS_ALLOC_BTREE_REF);
1109                 break;
1110         case XFS_BTNUM_INO:
1111         case XFS_BTNUM_FINO:
1112                 xfs_buf_set_ref(bp, XFS_INO_BTREE_REF);
1113                 break;
1114         case XFS_BTNUM_BMAP:
1115                 xfs_buf_set_ref(bp, XFS_BMAP_BTREE_REF);
1116                 break;
1117         default:
1118                 ASSERT(0);
1119         }
1120 }
1121
1122 STATIC int
1123 xfs_btree_get_buf_block(
1124         struct xfs_btree_cur    *cur,
1125         union xfs_btree_ptr     *ptr,
1126         int                     flags,
1127         struct xfs_btree_block  **block,
1128         struct xfs_buf          **bpp)
1129 {
1130         struct xfs_mount        *mp = cur->bc_mp;
1131         xfs_daddr_t             d;
1132
1133         /* need to sort out how callers deal with failures first */
1134         ASSERT(!(flags & XBF_TRYLOCK));
1135
1136         d = xfs_btree_ptr_to_daddr(cur, ptr);
1137         *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
1138                                  mp->m_bsize, flags);
1139
1140         if (!*bpp)
1141                 return -ENOMEM;
1142
1143         (*bpp)->b_ops = cur->bc_ops->buf_ops;
1144         *block = XFS_BUF_TO_BLOCK(*bpp);
1145         return 0;
1146 }
1147
1148 /*
1149  * Read in the buffer at the given ptr and return the buffer and
1150  * the block pointer within the buffer.
1151  */
1152 STATIC int
1153 xfs_btree_read_buf_block(
1154         struct xfs_btree_cur    *cur,
1155         union xfs_btree_ptr     *ptr,
1156         int                     flags,
1157         struct xfs_btree_block  **block,
1158         struct xfs_buf          **bpp)
1159 {
1160         struct xfs_mount        *mp = cur->bc_mp;
1161         xfs_daddr_t             d;
1162         int                     error;
1163
1164         /* need to sort out how callers deal with failures first */
1165         ASSERT(!(flags & XBF_TRYLOCK));
1166
1167         d = xfs_btree_ptr_to_daddr(cur, ptr);
1168         error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
1169                                    mp->m_bsize, flags, bpp,
1170                                    cur->bc_ops->buf_ops);
1171         if (error)
1172                 return error;
1173
1174         xfs_btree_set_refs(cur, *bpp);
1175         *block = XFS_BUF_TO_BLOCK(*bpp);
1176         return 0;
1177 }
1178
1179 /*
1180  * Copy keys from one btree block to another.
1181  */
1182 STATIC void
1183 xfs_btree_copy_keys(
1184         struct xfs_btree_cur    *cur,
1185         union xfs_btree_key     *dst_key,
1186         union xfs_btree_key     *src_key,
1187         int                     numkeys)
1188 {
1189         ASSERT(numkeys >= 0);
1190         memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1191 }
1192
1193 /*
1194  * Copy records from one btree block to another.
1195  */
1196 STATIC void
1197 xfs_btree_copy_recs(
1198         struct xfs_btree_cur    *cur,
1199         union xfs_btree_rec     *dst_rec,
1200         union xfs_btree_rec     *src_rec,
1201         int                     numrecs)
1202 {
1203         ASSERT(numrecs >= 0);
1204         memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1205 }
1206
1207 /*
1208  * Copy block pointers from one btree block to another.
1209  */
1210 STATIC void
1211 xfs_btree_copy_ptrs(
1212         struct xfs_btree_cur    *cur,
1213         union xfs_btree_ptr     *dst_ptr,
1214         union xfs_btree_ptr     *src_ptr,
1215         int                     numptrs)
1216 {
1217         ASSERT(numptrs >= 0);
1218         memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
1219 }
1220
1221 /*
1222  * Shift keys one index left/right inside a single btree block.
1223  */
1224 STATIC void
1225 xfs_btree_shift_keys(
1226         struct xfs_btree_cur    *cur,
1227         union xfs_btree_key     *key,
1228         int                     dir,
1229         int                     numkeys)
1230 {
1231         char                    *dst_key;
1232
1233         ASSERT(numkeys >= 0);
1234         ASSERT(dir == 1 || dir == -1);
1235
1236         dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1237         memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1238 }
1239
1240 /*
1241  * Shift records one index left/right inside a single btree block.
1242  */
1243 STATIC void
1244 xfs_btree_shift_recs(
1245         struct xfs_btree_cur    *cur,
1246         union xfs_btree_rec     *rec,
1247         int                     dir,
1248         int                     numrecs)
1249 {
1250         char                    *dst_rec;
1251
1252         ASSERT(numrecs >= 0);
1253         ASSERT(dir == 1 || dir == -1);
1254
1255         dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1256         memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1257 }
1258
1259 /*
1260  * Shift block pointers one index left/right inside a single btree block.
1261  */
1262 STATIC void
1263 xfs_btree_shift_ptrs(
1264         struct xfs_btree_cur    *cur,
1265         union xfs_btree_ptr     *ptr,
1266         int                     dir,
1267         int                     numptrs)
1268 {
1269         char                    *dst_ptr;
1270
1271         ASSERT(numptrs >= 0);
1272         ASSERT(dir == 1 || dir == -1);
1273
1274         dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
1275         memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
1276 }
1277
1278 /*
1279  * Log key values from the btree block.
1280  */
1281 STATIC void
1282 xfs_btree_log_keys(
1283         struct xfs_btree_cur    *cur,
1284         struct xfs_buf          *bp,
1285         int                     first,
1286         int                     last)
1287 {
1288         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1289         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1290
1291         if (bp) {
1292                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1293                 xfs_trans_log_buf(cur->bc_tp, bp,
1294                                   xfs_btree_key_offset(cur, first),
1295                                   xfs_btree_key_offset(cur, last + 1) - 1);
1296         } else {
1297                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1298                                 xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1299         }
1300
1301         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1302 }
1303
1304 /*
1305  * Log record values from the btree block.
1306  */
1307 void
1308 xfs_btree_log_recs(
1309         struct xfs_btree_cur    *cur,
1310         struct xfs_buf          *bp,
1311         int                     first,
1312         int                     last)
1313 {
1314         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1315         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1316
1317         xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1318         xfs_trans_log_buf(cur->bc_tp, bp,
1319                           xfs_btree_rec_offset(cur, first),
1320                           xfs_btree_rec_offset(cur, last + 1) - 1);
1321
1322         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1323 }
1324
1325 /*
1326  * Log block pointer fields from a btree block (nonleaf).
1327  */
1328 STATIC void
1329 xfs_btree_log_ptrs(
1330         struct xfs_btree_cur    *cur,   /* btree cursor */
1331         struct xfs_buf          *bp,    /* buffer containing btree block */
1332         int                     first,  /* index of first pointer to log */
1333         int                     last)   /* index of last pointer to log */
1334 {
1335         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1336         XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
1337
1338         if (bp) {
1339                 struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1340                 int                     level = xfs_btree_get_level(block);
1341
1342                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1343                 xfs_trans_log_buf(cur->bc_tp, bp,
1344                                 xfs_btree_ptr_offset(cur, first, level),
1345                                 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1346         } else {
1347                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1348                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1349         }
1350
1351         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1352 }
1353
1354 /*
1355  * Log fields from a btree block header.
1356  */
1357 void
1358 xfs_btree_log_block(
1359         struct xfs_btree_cur    *cur,   /* btree cursor */
1360         struct xfs_buf          *bp,    /* buffer containing btree block */
1361         int                     fields) /* mask of fields: XFS_BB_... */
1362 {
1363         int                     first;  /* first byte offset logged */
1364         int                     last;   /* last byte offset logged */
1365         static const short      soffsets[] = {  /* table of offsets (short) */
1366                 offsetof(struct xfs_btree_block, bb_magic),
1367                 offsetof(struct xfs_btree_block, bb_level),
1368                 offsetof(struct xfs_btree_block, bb_numrecs),
1369                 offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
1370                 offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
1371                 offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
1372                 offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
1373                 offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
1374                 offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
1375                 offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
1376                 XFS_BTREE_SBLOCK_CRC_LEN
1377         };
1378         static const short      loffsets[] = {  /* table of offsets (long) */
1379                 offsetof(struct xfs_btree_block, bb_magic),
1380                 offsetof(struct xfs_btree_block, bb_level),
1381                 offsetof(struct xfs_btree_block, bb_numrecs),
1382                 offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
1383                 offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
1384                 offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
1385                 offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
1386                 offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
1387                 offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
1388                 offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
1389                 offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
1390                 XFS_BTREE_LBLOCK_CRC_LEN
1391         };
1392
1393         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1394         XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
1395
1396         if (bp) {
1397                 int nbits;
1398
1399                 if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
1400                         /*
1401                          * We don't log the CRC when updating a btree
1402                          * block but instead recreate it during log
1403                          * recovery.  As the log buffers have checksums
1404                          * of their own this is safe and avoids logging a crc
1405                          * update in a lot of places.
1406                          */
1407                         if (fields == XFS_BB_ALL_BITS)
1408                                 fields = XFS_BB_ALL_BITS_CRC;
1409                         nbits = XFS_BB_NUM_BITS_CRC;
1410                 } else {
1411                         nbits = XFS_BB_NUM_BITS;
1412                 }
1413                 xfs_btree_offsets(fields,
1414                                   (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
1415                                         loffsets : soffsets,
1416                                   nbits, &first, &last);
1417                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1418                 xfs_trans_log_buf(cur->bc_tp, bp, first, last);
1419         } else {
1420                 xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
1421                         xfs_ilog_fbroot(cur->bc_private.b.whichfork));
1422         }
1423
1424         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1425 }
1426
1427 /*
1428  * Increment cursor by one record at the level.
1429  * For nonzero levels the leaf-ward information is untouched.
1430  */
1431 int                                             /* error */
1432 xfs_btree_increment(
1433         struct xfs_btree_cur    *cur,
1434         int                     level,
1435         int                     *stat)          /* success/failure */
1436 {
1437         struct xfs_btree_block  *block;
1438         union xfs_btree_ptr     ptr;
1439         struct xfs_buf          *bp;
1440         int                     error;          /* error return value */
1441         int                     lev;
1442
1443         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1444         XFS_BTREE_TRACE_ARGI(cur, level);
1445
1446         ASSERT(level < cur->bc_nlevels);
1447
1448         /* Read-ahead to the right at this level. */
1449         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
1450
1451         /* Get a pointer to the btree block. */
1452         block = xfs_btree_get_block(cur, level, &bp);
1453
1454 #ifdef DEBUG
1455         error = xfs_btree_check_block(cur, block, level, bp);
1456         if (error)
1457                 goto error0;
1458 #endif
1459
1460         /* We're done if we remain in the block after the increment. */
1461         if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
1462                 goto out1;
1463
1464         /* Fail if we just went off the right edge of the tree. */
1465         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1466         if (xfs_btree_ptr_is_null(cur, &ptr))
1467                 goto out0;
1468
1469         XFS_BTREE_STATS_INC(cur, increment);
1470
1471         /*
1472          * March up the tree incrementing pointers.
1473          * Stop when we don't go off the right edge of a block.
1474          */
1475         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1476                 block = xfs_btree_get_block(cur, lev, &bp);
1477
1478 #ifdef DEBUG
1479                 error = xfs_btree_check_block(cur, block, lev, bp);
1480                 if (error)
1481                         goto error0;
1482 #endif
1483
1484                 if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
1485                         break;
1486
1487                 /* Read-ahead the right block for the next loop. */
1488                 xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
1489         }
1490
1491         /*
1492          * If we went off the root then we are either seriously
1493          * confused or have the tree root in an inode.
1494          */
1495         if (lev == cur->bc_nlevels) {
1496                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1497                         goto out0;
1498                 ASSERT(0);
1499                 error = -EFSCORRUPTED;
1500                 goto error0;
1501         }
1502         ASSERT(lev < cur->bc_nlevels);
1503
1504         /*
1505          * Now walk back down the tree, fixing up the cursor's buffer
1506          * pointers and key numbers.
1507          */
1508         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1509                 union xfs_btree_ptr     *ptrp;
1510
1511                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1512                 --lev;
1513                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1514                 if (error)
1515                         goto error0;
1516
1517                 xfs_btree_setbuf(cur, lev, bp);
1518                 cur->bc_ptrs[lev] = 1;
1519         }
1520 out1:
1521         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1522         *stat = 1;
1523         return 0;
1524
1525 out0:
1526         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1527         *stat = 0;
1528         return 0;
1529
1530 error0:
1531         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1532         return error;
1533 }
1534
1535 /*
1536  * Decrement cursor by one record at the level.
1537  * For nonzero levels the leaf-ward information is untouched.
1538  */
1539 int                                             /* error */
1540 xfs_btree_decrement(
1541         struct xfs_btree_cur    *cur,
1542         int                     level,
1543         int                     *stat)          /* success/failure */
1544 {
1545         struct xfs_btree_block  *block;
1546         xfs_buf_t               *bp;
1547         int                     error;          /* error return value */
1548         int                     lev;
1549         union xfs_btree_ptr     ptr;
1550
1551         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1552         XFS_BTREE_TRACE_ARGI(cur, level);
1553
1554         ASSERT(level < cur->bc_nlevels);
1555
1556         /* Read-ahead to the left at this level. */
1557         xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
1558
1559         /* We're done if we remain in the block after the decrement. */
1560         if (--cur->bc_ptrs[level] > 0)
1561                 goto out1;
1562
1563         /* Get a pointer to the btree block. */
1564         block = xfs_btree_get_block(cur, level, &bp);
1565
1566 #ifdef DEBUG
1567         error = xfs_btree_check_block(cur, block, level, bp);
1568         if (error)
1569                 goto error0;
1570 #endif
1571
1572         /* Fail if we just went off the left edge of the tree. */
1573         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
1574         if (xfs_btree_ptr_is_null(cur, &ptr))
1575                 goto out0;
1576
1577         XFS_BTREE_STATS_INC(cur, decrement);
1578
1579         /*
1580          * March up the tree decrementing pointers.
1581          * Stop when we don't go off the left edge of a block.
1582          */
1583         for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
1584                 if (--cur->bc_ptrs[lev] > 0)
1585                         break;
1586                 /* Read-ahead the left block for the next loop. */
1587                 xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
1588         }
1589
1590         /*
1591          * If we went off the root then we are seriously confused.
1592          * or the root of the tree is in an inode.
1593          */
1594         if (lev == cur->bc_nlevels) {
1595                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
1596                         goto out0;
1597                 ASSERT(0);
1598                 error = -EFSCORRUPTED;
1599                 goto error0;
1600         }
1601         ASSERT(lev < cur->bc_nlevels);
1602
1603         /*
1604          * Now walk back down the tree, fixing up the cursor's buffer
1605          * pointers and key numbers.
1606          */
1607         for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
1608                 union xfs_btree_ptr     *ptrp;
1609
1610                 ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
1611                 --lev;
1612                 error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
1613                 if (error)
1614                         goto error0;
1615                 xfs_btree_setbuf(cur, lev, bp);
1616                 cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
1617         }
1618 out1:
1619         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1620         *stat = 1;
1621         return 0;
1622
1623 out0:
1624         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1625         *stat = 0;
1626         return 0;
1627
1628 error0:
1629         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1630         return error;
1631 }
1632
1633 STATIC int
1634 xfs_btree_lookup_get_block(
1635         struct xfs_btree_cur    *cur,   /* btree cursor */
1636         int                     level,  /* level in the btree */
1637         union xfs_btree_ptr     *pp,    /* ptr to btree block */
1638         struct xfs_btree_block  **blkp) /* return btree block */
1639 {
1640         struct xfs_buf          *bp;    /* buffer pointer for btree block */
1641         int                     error = 0;
1642
1643         /* special case the root block if in an inode */
1644         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1645             (level == cur->bc_nlevels - 1)) {
1646                 *blkp = xfs_btree_get_iroot(cur);
1647                 return 0;
1648         }
1649
1650         /*
1651          * If the old buffer at this level for the disk address we are
1652          * looking for re-use it.
1653          *
1654          * Otherwise throw it away and get a new one.
1655          */
1656         bp = cur->bc_bufs[level];
1657         if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
1658                 *blkp = XFS_BUF_TO_BLOCK(bp);
1659                 return 0;
1660         }
1661
1662         error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
1663         if (error)
1664                 return error;
1665
1666         xfs_btree_setbuf(cur, level, bp);
1667         return 0;
1668 }
1669
1670 /*
1671  * Get current search key.  For level 0 we don't actually have a key
1672  * structure so we make one up from the record.  For all other levels
1673  * we just return the right key.
1674  */
1675 STATIC union xfs_btree_key *
1676 xfs_lookup_get_search_key(
1677         struct xfs_btree_cur    *cur,
1678         int                     level,
1679         int                     keyno,
1680         struct xfs_btree_block  *block,
1681         union xfs_btree_key     *kp)
1682 {
1683         if (level == 0) {
1684                 cur->bc_ops->init_key_from_rec(kp,
1685                                 xfs_btree_rec_addr(cur, keyno, block));
1686                 return kp;
1687         }
1688
1689         return xfs_btree_key_addr(cur, keyno, block);
1690 }
1691
1692 /*
1693  * Lookup the record.  The cursor is made to point to it, based on dir.
1694  * stat is set to 0 if can't find any such record, 1 for success.
1695  */
1696 int                                     /* error */
1697 xfs_btree_lookup(
1698         struct xfs_btree_cur    *cur,   /* btree cursor */
1699         xfs_lookup_t            dir,    /* <=, ==, or >= */
1700         int                     *stat)  /* success/failure */
1701 {
1702         struct xfs_btree_block  *block; /* current btree block */
1703         __int64_t               diff;   /* difference for the current key */
1704         int                     error;  /* error return value */
1705         int                     keyno;  /* current key number */
1706         int                     level;  /* level in the btree */
1707         union xfs_btree_ptr     *pp;    /* ptr to btree block */
1708         union xfs_btree_ptr     ptr;    /* ptr to btree block */
1709
1710         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1711         XFS_BTREE_TRACE_ARGI(cur, dir);
1712
1713         XFS_BTREE_STATS_INC(cur, lookup);
1714
1715         block = NULL;
1716         keyno = 0;
1717
1718         /* initialise start pointer from cursor */
1719         cur->bc_ops->init_ptr_from_cur(cur, &ptr);
1720         pp = &ptr;
1721
1722         /*
1723          * Iterate over each level in the btree, starting at the root.
1724          * For each level above the leaves, find the key we need, based
1725          * on the lookup record, then follow the corresponding block
1726          * pointer down to the next level.
1727          */
1728         for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
1729                 /* Get the block we need to do the lookup on. */
1730                 error = xfs_btree_lookup_get_block(cur, level, pp, &block);
1731                 if (error)
1732                         goto error0;
1733
1734                 if (diff == 0) {
1735                         /*
1736                          * If we already had a key match at a higher level, we
1737                          * know we need to use the first entry in this block.
1738                          */
1739                         keyno = 1;
1740                 } else {
1741                         /* Otherwise search this block. Do a binary search. */
1742
1743                         int     high;   /* high entry number */
1744                         int     low;    /* low entry number */
1745
1746                         /* Set low and high entry numbers, 1-based. */
1747                         low = 1;
1748                         high = xfs_btree_get_numrecs(block);
1749                         if (!high) {
1750                                 /* Block is empty, must be an empty leaf. */
1751                                 ASSERT(level == 0 && cur->bc_nlevels == 1);
1752
1753                                 cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
1754                                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1755                                 *stat = 0;
1756                                 return 0;
1757                         }
1758
1759                         /* Binary search the block. */
1760                         while (low <= high) {
1761                                 union xfs_btree_key     key;
1762                                 union xfs_btree_key     *kp;
1763
1764                                 XFS_BTREE_STATS_INC(cur, compare);
1765
1766                                 /* keyno is average of low and high. */
1767                                 keyno = (low + high) >> 1;
1768
1769                                 /* Get current search key */
1770                                 kp = xfs_lookup_get_search_key(cur, level,
1771                                                 keyno, block, &key);
1772
1773                                 /*
1774                                  * Compute difference to get next direction:
1775                                  *  - less than, move right
1776                                  *  - greater than, move left
1777                                  *  - equal, we're done
1778                                  */
1779                                 diff = cur->bc_ops->key_diff(cur, kp);
1780                                 if (diff < 0)
1781                                         low = keyno + 1;
1782                                 else if (diff > 0)
1783                                         high = keyno - 1;
1784                                 else
1785                                         break;
1786                         }
1787                 }
1788
1789                 /*
1790                  * If there are more levels, set up for the next level
1791                  * by getting the block number and filling in the cursor.
1792                  */
1793                 if (level > 0) {
1794                         /*
1795                          * If we moved left, need the previous key number,
1796                          * unless there isn't one.
1797                          */
1798                         if (diff > 0 && --keyno < 1)
1799                                 keyno = 1;
1800                         pp = xfs_btree_ptr_addr(cur, keyno, block);
1801
1802 #ifdef DEBUG
1803                         error = xfs_btree_check_ptr(cur, pp, 0, level);
1804                         if (error)
1805                                 goto error0;
1806 #endif
1807                         cur->bc_ptrs[level] = keyno;
1808                 }
1809         }
1810
1811         /* Done with the search. See if we need to adjust the results. */
1812         if (dir != XFS_LOOKUP_LE && diff < 0) {
1813                 keyno++;
1814                 /*
1815                  * If ge search and we went off the end of the block, but it's
1816                  * not the last block, we're in the wrong block.
1817                  */
1818                 xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1819                 if (dir == XFS_LOOKUP_GE &&
1820                     keyno > xfs_btree_get_numrecs(block) &&
1821                     !xfs_btree_ptr_is_null(cur, &ptr)) {
1822                         int     i;
1823
1824                         cur->bc_ptrs[0] = keyno;
1825                         error = xfs_btree_increment(cur, 0, &i);
1826                         if (error)
1827                                 goto error0;
1828                         XFS_WANT_CORRUPTED_RETURN(cur->bc_mp, i == 1);
1829                         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1830                         *stat = 1;
1831                         return 0;
1832                 }
1833         } else if (dir == XFS_LOOKUP_LE && diff > 0)
1834                 keyno--;
1835         cur->bc_ptrs[0] = keyno;
1836
1837         /* Return if we succeeded or not. */
1838         if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
1839                 *stat = 0;
1840         else if (dir != XFS_LOOKUP_EQ || diff == 0)
1841                 *stat = 1;
1842         else
1843                 *stat = 0;
1844         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1845         return 0;
1846
1847 error0:
1848         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1849         return error;
1850 }
1851
1852 /*
1853  * Update keys at all levels from here to the root along the cursor's path.
1854  */
1855 STATIC int
1856 xfs_btree_updkey(
1857         struct xfs_btree_cur    *cur,
1858         union xfs_btree_key     *keyp,
1859         int                     level)
1860 {
1861         struct xfs_btree_block  *block;
1862         struct xfs_buf          *bp;
1863         union xfs_btree_key     *kp;
1864         int                     ptr;
1865
1866         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1867         XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
1868
1869         ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
1870
1871         /*
1872          * Go up the tree from this level toward the root.
1873          * At each level, update the key value to the value input.
1874          * Stop when we reach a level where the cursor isn't pointing
1875          * at the first entry in the block.
1876          */
1877         for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
1878 #ifdef DEBUG
1879                 int             error;
1880 #endif
1881                 block = xfs_btree_get_block(cur, level, &bp);
1882 #ifdef DEBUG
1883                 error = xfs_btree_check_block(cur, block, level, bp);
1884                 if (error) {
1885                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1886                         return error;
1887                 }
1888 #endif
1889                 ptr = cur->bc_ptrs[level];
1890                 kp = xfs_btree_key_addr(cur, ptr, block);
1891                 xfs_btree_copy_keys(cur, kp, keyp, 1);
1892                 xfs_btree_log_keys(cur, bp, ptr, ptr);
1893         }
1894
1895         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1896         return 0;
1897 }
1898
1899 /*
1900  * Update the record referred to by cur to the value in the
1901  * given record. This either works (return 0) or gets an
1902  * EFSCORRUPTED error.
1903  */
1904 int
1905 xfs_btree_update(
1906         struct xfs_btree_cur    *cur,
1907         union xfs_btree_rec     *rec)
1908 {
1909         struct xfs_btree_block  *block;
1910         struct xfs_buf          *bp;
1911         int                     error;
1912         int                     ptr;
1913         union xfs_btree_rec     *rp;
1914
1915         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1916         XFS_BTREE_TRACE_ARGR(cur, rec);
1917
1918         /* Pick up the current block. */
1919         block = xfs_btree_get_block(cur, 0, &bp);
1920
1921 #ifdef DEBUG
1922         error = xfs_btree_check_block(cur, block, 0, bp);
1923         if (error)
1924                 goto error0;
1925 #endif
1926         /* Get the address of the rec to be updated. */
1927         ptr = cur->bc_ptrs[0];
1928         rp = xfs_btree_rec_addr(cur, ptr, block);
1929
1930         /* Fill in the new contents and log them. */
1931         xfs_btree_copy_recs(cur, rp, rec, 1);
1932         xfs_btree_log_recs(cur, bp, ptr, ptr);
1933
1934         /*
1935          * If we are tracking the last record in the tree and
1936          * we are at the far right edge of the tree, update it.
1937          */
1938         if (xfs_btree_is_lastrec(cur, block, 0)) {
1939                 cur->bc_ops->update_lastrec(cur, block, rec,
1940                                             ptr, LASTREC_UPDATE);
1941         }
1942
1943         /* Updating first rec in leaf. Pass new key value up to our parent. */
1944         if (ptr == 1) {
1945                 union xfs_btree_key     key;
1946
1947                 cur->bc_ops->init_key_from_rec(&key, rec);
1948                 error = xfs_btree_updkey(cur, &key, 1);
1949                 if (error)
1950                         goto error0;
1951         }
1952
1953         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
1954         return 0;
1955
1956 error0:
1957         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
1958         return error;
1959 }
1960
1961 /*
1962  * Move 1 record left from cur/level if possible.
1963  * Update cur to reflect the new path.
1964  */
1965 STATIC int                                      /* error */
1966 xfs_btree_lshift(
1967         struct xfs_btree_cur    *cur,
1968         int                     level,
1969         int                     *stat)          /* success/failure */
1970 {
1971         union xfs_btree_key     key;            /* btree key */
1972         struct xfs_buf          *lbp;           /* left buffer pointer */
1973         struct xfs_btree_block  *left;          /* left btree block */
1974         int                     lrecs;          /* left record count */
1975         struct xfs_buf          *rbp;           /* right buffer pointer */
1976         struct xfs_btree_block  *right;         /* right btree block */
1977         int                     rrecs;          /* right record count */
1978         union xfs_btree_ptr     lptr;           /* left btree pointer */
1979         union xfs_btree_key     *rkp = NULL;    /* right btree key */
1980         union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
1981         union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
1982         int                     error;          /* error return value */
1983
1984         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
1985         XFS_BTREE_TRACE_ARGI(cur, level);
1986
1987         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
1988             level == cur->bc_nlevels - 1)
1989                 goto out0;
1990
1991         /* Set up variables for this block as "right". */
1992         right = xfs_btree_get_block(cur, level, &rbp);
1993
1994 #ifdef DEBUG
1995         error = xfs_btree_check_block(cur, right, level, rbp);
1996         if (error)
1997                 goto error0;
1998 #endif
1999
2000         /* If we've got no left sibling then we can't shift an entry left. */
2001         xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2002         if (xfs_btree_ptr_is_null(cur, &lptr))
2003                 goto out0;
2004
2005         /*
2006          * If the cursor entry is the one that would be moved, don't
2007          * do it... it's too complicated.
2008          */
2009         if (cur->bc_ptrs[level] <= 1)
2010                 goto out0;
2011
2012         /* Set up the left neighbor as "left". */
2013         error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2014         if (error)
2015                 goto error0;
2016
2017         /* If it's full, it can't take another entry. */
2018         lrecs = xfs_btree_get_numrecs(left);
2019         if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2020                 goto out0;
2021
2022         rrecs = xfs_btree_get_numrecs(right);
2023
2024         /*
2025          * We add one entry to the left side and remove one for the right side.
2026          * Account for it here, the changes will be updated on disk and logged
2027          * later.
2028          */
2029         lrecs++;
2030         rrecs--;
2031
2032         XFS_BTREE_STATS_INC(cur, lshift);
2033         XFS_BTREE_STATS_ADD(cur, moves, 1);
2034
2035         /*
2036          * If non-leaf, copy a key and a ptr to the left block.
2037          * Log the changes to the left block.
2038          */
2039         if (level > 0) {
2040                 /* It's a non-leaf.  Move keys and pointers. */
2041                 union xfs_btree_key     *lkp;   /* left btree key */
2042                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2043
2044                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2045                 rkp = xfs_btree_key_addr(cur, 1, right);
2046
2047                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2048                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2049 #ifdef DEBUG
2050                 error = xfs_btree_check_ptr(cur, rpp, 0, level);
2051                 if (error)
2052                         goto error0;
2053 #endif
2054                 xfs_btree_copy_keys(cur, lkp, rkp, 1);
2055                 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2056
2057                 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2058                 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2059
2060                 ASSERT(cur->bc_ops->keys_inorder(cur,
2061                         xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2062         } else {
2063                 /* It's a leaf.  Move records.  */
2064                 union xfs_btree_rec     *lrp;   /* left record pointer */
2065
2066                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2067                 rrp = xfs_btree_rec_addr(cur, 1, right);
2068
2069                 xfs_btree_copy_recs(cur, lrp, rrp, 1);
2070                 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2071
2072                 ASSERT(cur->bc_ops->recs_inorder(cur,
2073                         xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2074         }
2075
2076         xfs_btree_set_numrecs(left, lrecs);
2077         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2078
2079         xfs_btree_set_numrecs(right, rrecs);
2080         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2081
2082         /*
2083          * Slide the contents of right down one entry.
2084          */
2085         XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2086         if (level > 0) {
2087                 /* It's a nonleaf. operate on keys and ptrs */
2088 #ifdef DEBUG
2089                 int                     i;              /* loop index */
2090
2091                 for (i = 0; i < rrecs; i++) {
2092                         error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
2093                         if (error)
2094                                 goto error0;
2095                 }
2096 #endif
2097                 xfs_btree_shift_keys(cur,
2098                                 xfs_btree_key_addr(cur, 2, right),
2099                                 -1, rrecs);
2100                 xfs_btree_shift_ptrs(cur,
2101                                 xfs_btree_ptr_addr(cur, 2, right),
2102                                 -1, rrecs);
2103
2104                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2105                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2106         } else {
2107                 /* It's a leaf. operate on records */
2108                 xfs_btree_shift_recs(cur,
2109                         xfs_btree_rec_addr(cur, 2, right),
2110                         -1, rrecs);
2111                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2112
2113                 /*
2114                  * If it's the first record in the block, we'll need a key
2115                  * structure to pass up to the next level (updkey).
2116                  */
2117                 cur->bc_ops->init_key_from_rec(&key,
2118                         xfs_btree_rec_addr(cur, 1, right));
2119                 rkp = &key;
2120         }
2121
2122         /* Update the parent key values of right. */
2123         error = xfs_btree_updkey(cur, rkp, level + 1);
2124         if (error)
2125                 goto error0;
2126
2127         /* Slide the cursor value left one. */
2128         cur->bc_ptrs[level]--;
2129
2130         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2131         *stat = 1;
2132         return 0;
2133
2134 out0:
2135         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2136         *stat = 0;
2137         return 0;
2138
2139 error0:
2140         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2141         return error;
2142 }
2143
2144 /*
2145  * Move 1 record right from cur/level if possible.
2146  * Update cur to reflect the new path.
2147  */
2148 STATIC int                                      /* error */
2149 xfs_btree_rshift(
2150         struct xfs_btree_cur    *cur,
2151         int                     level,
2152         int                     *stat)          /* success/failure */
2153 {
2154         union xfs_btree_key     key;            /* btree key */
2155         struct xfs_buf          *lbp;           /* left buffer pointer */
2156         struct xfs_btree_block  *left;          /* left btree block */
2157         struct xfs_buf          *rbp;           /* right buffer pointer */
2158         struct xfs_btree_block  *right;         /* right btree block */
2159         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2160         union xfs_btree_ptr     rptr;           /* right block pointer */
2161         union xfs_btree_key     *rkp;           /* right btree key */
2162         int                     rrecs;          /* right record count */
2163         int                     lrecs;          /* left record count */
2164         int                     error;          /* error return value */
2165         int                     i;              /* loop counter */
2166
2167         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2168         XFS_BTREE_TRACE_ARGI(cur, level);
2169
2170         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2171             (level == cur->bc_nlevels - 1))
2172                 goto out0;
2173
2174         /* Set up variables for this block as "left". */
2175         left = xfs_btree_get_block(cur, level, &lbp);
2176
2177 #ifdef DEBUG
2178         error = xfs_btree_check_block(cur, left, level, lbp);
2179         if (error)
2180                 goto error0;
2181 #endif
2182
2183         /* If we've got no right sibling then we can't shift an entry right. */
2184         xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2185         if (xfs_btree_ptr_is_null(cur, &rptr))
2186                 goto out0;
2187
2188         /*
2189          * If the cursor entry is the one that would be moved, don't
2190          * do it... it's too complicated.
2191          */
2192         lrecs = xfs_btree_get_numrecs(left);
2193         if (cur->bc_ptrs[level] >= lrecs)
2194                 goto out0;
2195
2196         /* Set up the right neighbor as "right". */
2197         error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2198         if (error)
2199                 goto error0;
2200
2201         /* If it's full, it can't take another entry. */
2202         rrecs = xfs_btree_get_numrecs(right);
2203         if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2204                 goto out0;
2205
2206         XFS_BTREE_STATS_INC(cur, rshift);
2207         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2208
2209         /*
2210          * Make a hole at the start of the right neighbor block, then
2211          * copy the last left block entry to the hole.
2212          */
2213         if (level > 0) {
2214                 /* It's a nonleaf. make a hole in the keys and ptrs */
2215                 union xfs_btree_key     *lkp;
2216                 union xfs_btree_ptr     *lpp;
2217                 union xfs_btree_ptr     *rpp;
2218
2219                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2220                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2221                 rkp = xfs_btree_key_addr(cur, 1, right);
2222                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2223
2224 #ifdef DEBUG
2225                 for (i = rrecs - 1; i >= 0; i--) {
2226                         error = xfs_btree_check_ptr(cur, rpp, i, level);
2227                         if (error)
2228                                 goto error0;
2229                 }
2230 #endif
2231
2232                 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2233                 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2234
2235 #ifdef DEBUG
2236                 error = xfs_btree_check_ptr(cur, lpp, 0, level);
2237                 if (error)
2238                         goto error0;
2239 #endif
2240
2241                 /* Now put the new data in, and log it. */
2242                 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2243                 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2244
2245                 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2246                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2247
2248                 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2249                         xfs_btree_key_addr(cur, 2, right)));
2250         } else {
2251                 /* It's a leaf. make a hole in the records */
2252                 union xfs_btree_rec     *lrp;
2253                 union xfs_btree_rec     *rrp;
2254
2255                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2256                 rrp = xfs_btree_rec_addr(cur, 1, right);
2257
2258                 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2259
2260                 /* Now put the new data in, and log it. */
2261                 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2262                 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2263
2264                 cur->bc_ops->init_key_from_rec(&key, rrp);
2265                 rkp = &key;
2266
2267                 ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
2268                         xfs_btree_rec_addr(cur, 2, right)));
2269         }
2270
2271         /*
2272          * Decrement and log left's numrecs, bump and log right's numrecs.
2273          */
2274         xfs_btree_set_numrecs(left, --lrecs);
2275         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2276
2277         xfs_btree_set_numrecs(right, ++rrecs);
2278         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2279
2280         /*
2281          * Using a temporary cursor, update the parent key values of the
2282          * block on the right.
2283          */
2284         error = xfs_btree_dup_cursor(cur, &tcur);
2285         if (error)
2286                 goto error0;
2287         i = xfs_btree_lastrec(tcur, level);
2288         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
2289
2290         error = xfs_btree_increment(tcur, level, &i);
2291         if (error)
2292                 goto error1;
2293
2294         error = xfs_btree_updkey(tcur, rkp, level + 1);
2295         if (error)
2296                 goto error1;
2297
2298         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2299
2300         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2301         *stat = 1;
2302         return 0;
2303
2304 out0:
2305         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2306         *stat = 0;
2307         return 0;
2308
2309 error0:
2310         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2311         return error;
2312
2313 error1:
2314         XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
2315         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2316         return error;
2317 }
2318
2319 /*
2320  * Split cur/level block in half.
2321  * Return new block number and the key to its first
2322  * record (to be inserted into parent).
2323  */
2324 STATIC int                                      /* error */
2325 __xfs_btree_split(
2326         struct xfs_btree_cur    *cur,
2327         int                     level,
2328         union xfs_btree_ptr     *ptrp,
2329         union xfs_btree_key     *key,
2330         struct xfs_btree_cur    **curp,
2331         int                     *stat)          /* success/failure */
2332 {
2333         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
2334         struct xfs_buf          *lbp;           /* left buffer pointer */
2335         struct xfs_btree_block  *left;          /* left btree block */
2336         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
2337         struct xfs_buf          *rbp;           /* right buffer pointer */
2338         struct xfs_btree_block  *right;         /* right btree block */
2339         union xfs_btree_ptr     rrptr;          /* right-right sibling ptr */
2340         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
2341         struct xfs_btree_block  *rrblock;       /* right-right btree block */
2342         int                     lrecs;
2343         int                     rrecs;
2344         int                     src_index;
2345         int                     error;          /* error return value */
2346 #ifdef DEBUG
2347         int                     i;
2348 #endif
2349
2350         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2351         XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
2352
2353         XFS_BTREE_STATS_INC(cur, split);
2354
2355         /* Set up left block (current one). */
2356         left = xfs_btree_get_block(cur, level, &lbp);
2357
2358 #ifdef DEBUG
2359         error = xfs_btree_check_block(cur, left, level, lbp);
2360         if (error)
2361                 goto error0;
2362 #endif
2363
2364         xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2365
2366         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2367         error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, stat);
2368         if (error)
2369                 goto error0;
2370         if (*stat == 0)
2371                 goto out0;
2372         XFS_BTREE_STATS_INC(cur, alloc);
2373
2374         /* Set up the new block as "right". */
2375         error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
2376         if (error)
2377                 goto error0;
2378
2379         /* Fill in the btree header for the new right block. */
2380         xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);
2381
2382         /*
2383          * Split the entries between the old and the new block evenly.
2384          * Make sure that if there's an odd number of entries now, that
2385          * each new block will have the same number of entries.
2386          */
2387         lrecs = xfs_btree_get_numrecs(left);
2388         rrecs = lrecs / 2;
2389         if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
2390                 rrecs++;
2391         src_index = (lrecs - rrecs + 1);
2392
2393         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2394
2395         /*
2396          * Copy btree block entries from the left block over to the
2397          * new block, the right. Update the right block and log the
2398          * changes.
2399          */
2400         if (level > 0) {
2401                 /* It's a non-leaf.  Move keys and pointers. */
2402                 union xfs_btree_key     *lkp;   /* left btree key */
2403                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2404                 union xfs_btree_key     *rkp;   /* right btree key */
2405                 union xfs_btree_ptr     *rpp;   /* right address pointer */
2406
2407                 lkp = xfs_btree_key_addr(cur, src_index, left);
2408                 lpp = xfs_btree_ptr_addr(cur, src_index, left);
2409                 rkp = xfs_btree_key_addr(cur, 1, right);
2410                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2411
2412 #ifdef DEBUG
2413                 for (i = src_index; i < rrecs; i++) {
2414                         error = xfs_btree_check_ptr(cur, lpp, i, level);
2415                         if (error)
2416                                 goto error0;
2417                 }
2418 #endif
2419
2420                 xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
2421                 xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
2422
2423                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2424                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2425
2426                 /* Grab the keys to the entries moved to the right block */
2427                 xfs_btree_copy_keys(cur, key, rkp, 1);
2428         } else {
2429                 /* It's a leaf.  Move records.  */
2430                 union xfs_btree_rec     *lrp;   /* left record pointer */
2431                 union xfs_btree_rec     *rrp;   /* right record pointer */
2432
2433                 lrp = xfs_btree_rec_addr(cur, src_index, left);
2434                 rrp = xfs_btree_rec_addr(cur, 1, right);
2435
2436                 xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
2437                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2438
2439                 cur->bc_ops->init_key_from_rec(key,
2440                         xfs_btree_rec_addr(cur, 1, right));
2441         }
2442
2443
2444         /*
2445          * Find the left block number by looking in the buffer.
2446          * Adjust numrecs, sibling pointers.
2447          */
2448         xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
2449         xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
2450         xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2451         xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2452
2453         lrecs -= rrecs;
2454         xfs_btree_set_numrecs(left, lrecs);
2455         xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
2456
2457         xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
2458         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
2459
2460         /*
2461          * If there's a block to the new block's right, make that block
2462          * point back to right instead of to left.
2463          */
2464         if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
2465                 error = xfs_btree_read_buf_block(cur, &rrptr,
2466                                                         0, &rrblock, &rrbp);
2467                 if (error)
2468                         goto error0;
2469                 xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
2470                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
2471         }
2472         /*
2473          * If the cursor is really in the right block, move it there.
2474          * If it's just pointing past the last entry in left, then we'll
2475          * insert there, so don't change anything in that case.
2476          */
2477         if (cur->bc_ptrs[level] > lrecs + 1) {
2478                 xfs_btree_setbuf(cur, level, rbp);
2479                 cur->bc_ptrs[level] -= lrecs;
2480         }
2481         /*
2482          * If there are more levels, we'll need another cursor which refers
2483          * the right block, no matter where this cursor was.
2484          */
2485         if (level + 1 < cur->bc_nlevels) {
2486                 error = xfs_btree_dup_cursor(cur, curp);
2487                 if (error)
2488                         goto error0;
2489                 (*curp)->bc_ptrs[level + 1]++;
2490         }
2491         *ptrp = rptr;
2492         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2493         *stat = 1;
2494         return 0;
2495 out0:
2496         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2497         *stat = 0;
2498         return 0;
2499
2500 error0:
2501         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2502         return error;
2503 }
2504
/*
 * Argument bundle for running __xfs_btree_split() from a work item.
 * The first six members mirror the parameters of __xfs_btree_split(); the
 * remaining members carry the outcome and the completion signalling between
 * the worker and whoever queued the work.
 */
2505 struct xfs_btree_split_args {
2506         struct xfs_btree_cur    *cur;           /* cursor passed to the split */
2507         int                     level;          /* level passed to the split */
2508         union xfs_btree_ptr     *ptrp;          /* ptr argument passed to the split */
2509         union xfs_btree_key     *key;           /* key argument passed to the split */
2510         struct xfs_btree_cur    **curp;         /* cursor out-argument passed to the split */
2511         int                     *stat;          /* success/failure */
2512         int                     result;         /* return value of __xfs_btree_split() */
2513         bool                    kswapd; /* allocation in kswapd context */
2514         struct completion       *done;          /* completed by the worker after the split */
2515         struct work_struct      work;           /* work item the worker is invoked with */
2516 };
2517
2518 /*
2519  * Stack switching interfaces for allocation
2520  */
2521 static void
2522 xfs_btree_split_worker(
2523         struct work_struct      *work)
2524 {
2525         struct xfs_btree_split_args     *args = container_of(work,
2526                                                 struct xfs_btree_split_args, work);
2527         unsigned long           pflags;
2528         unsigned long           new_pflags = PF_FSTRANS;
2529
2530         /*
2531          * we are in a transaction context here, but may also be doing work
2532          * in kswapd context, and hence we may need to inherit that state
2533          * temporarily to ensure that we don't block waiting for memory reclaim
2534          * in any way.
2535          */
2536         if (args->kswapd)
2537                 new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
2538
2539         current_set_flags_nested(&pflags, new_pflags);
2540
2541         args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
2542                                          args->key, args->curp, args->stat);
2543         complete(args->done);
2544
2545         current_restore_flags_nested(&pflags, new_pflags);
2546 }
2547
2548 /*
2549  * BMBT split requests often come in with little stack to work on. Push
2550  * them off to a worker thread so there is lots of stack to use. For the other
2551  * btree types, just call directly to avoid the context switch overhead here.
2552  */
2553 STATIC int                                      /* error */
2554 xfs_btree_split(
2555         struct xfs_btree_cur    *cur,
2556         int                     level,
2557         union xfs_btree_ptr     *ptrp,
2558         union xfs_btree_key     *key,
2559         struct xfs_btree_cur    **curp,
2560         int                     *stat)          /* success/failure */
2561 {
2562         struct xfs_btree_split_args     args;
2563         DECLARE_COMPLETION_ONSTACK(done);
2564
2565         if (cur->bc_btnum != XFS_BTNUM_BMAP)
2566                 return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
2567
2568         args.cur = cur;
2569         args.level = level;
2570         args.ptrp = ptrp;
2571         args.key = key;
2572         args.curp = curp;
2573         args.stat = stat;
2574         args.done = &done;
2575         args.kswapd = current_is_kswapd();
2576         INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
2577         queue_work(xfs_alloc_wq, &args.work);
2578         wait_for_completion(&done);
2579         destroy_work_on_stack(&args.work);
2580         return args.result;
2581 }
2582
2583
2584 /*
2585  * Copy the old inode root contents into a real block and make the
2586  * broot point to it.
      *
      * The cursor must belong to a tree whose root lives in the inode (asserted
      * below).  A new block is requested through the ->alloc_block op; if that
      * leaves *stat at 0 the function returns 0 straight away.  Otherwise the
      * root's block header, keys and pointers are copied into the new block,
      * the in-inode root has its level raised by one and is left with a single
      * entry that points at the new block, and the cursor gains a level.
      *
      * On success *stat is set to 1 and XFS_ILOG_CORE plus the fork's broot
      * flag are OR'ed into *logflags.  Returns 0, or the error reported by a
      * helper.
2587  */
2588 int                                             /* error */
2589 xfs_btree_new_iroot(
2590         struct xfs_btree_cur    *cur,           /* btree cursor */
2591         int                     *logflags,      /* logging flags for inode */
2592         int                     *stat)          /* return status - 0 fail */
2593 {
2594         struct xfs_buf          *cbp;           /* buffer for cblock */
2595         struct xfs_btree_block  *block;         /* btree block */
2596         struct xfs_btree_block  *cblock;        /* child btree block */
2597         union xfs_btree_key     *ckp;           /* child key pointer */
2598         union xfs_btree_ptr     *cpp;           /* child ptr pointer */
2599         union xfs_btree_key     *kp;            /* pointer to btree key */
2600         union xfs_btree_ptr     *pp;            /* pointer to block addr */
2601         union xfs_btree_ptr     nptr;           /* new block addr */
2602         int                     level;          /* btree level */
2603         int                     error;          /* error return code */
2604 #ifdef DEBUG
2605         int                     i;              /* loop counter */
2606 #endif
2607
2608         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2609         XFS_BTREE_STATS_INC(cur, newroot);
2610
2611         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
2612
2613         level = cur->bc_nlevels - 1;    /* topmost level before the tree grows */
2614
             /*
              * Note that pp is looked up here, before the root's level and
              * record count and cur->bc_nlevels are modified further down.
              */
2615         block = xfs_btree_get_iroot(cur);
2616         pp = xfs_btree_ptr_addr(cur, 1, block);
2617
2618         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2619         error = cur->bc_ops->alloc_block(cur, pp, &nptr, stat);
2620         if (error)
2621                 goto error0;
2622         if (*stat == 0) {
2623                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2624                 return 0;
2625         }
2626         XFS_BTREE_STATS_INC(cur, alloc);
2627
2628         /* Copy the root into a real block. */
2629         error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
2630         if (error)
2631                 goto error0;
2632
2633         /*
2634          * A plain memcpy() of the root header is not enough for CRC enabled
2635          * btree blocks: the copied bb_blkno must be rewritten so that it
              * holds the new buffer's block number (cbp->b_bn).
2636          */
2637         memcpy(cblock, block, xfs_btree_block_len(cur));
2638         if (cur->bc_flags & XFS_BTREE_CRC_BLOCKS) {
2639                 if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
2640                         cblock->bb_u.l.bb_blkno = cpu_to_be64(cbp->b_bn);
2641                 else
2642                         cblock->bb_u.s.bb_blkno = cpu_to_be64(cbp->b_bn);
2643         }
2644
             /* The in-inode root moves up one level and keeps a single entry. */
2645         be16_add_cpu(&block->bb_level, 1);
2646         xfs_btree_set_numrecs(block, 1);
2647         cur->bc_nlevels++;
2648         cur->bc_ptrs[level + 1] = 1;
2649
             /* Copy every key of the old root into the child block. */
2650         kp = xfs_btree_key_addr(cur, 1, block);
2651         ckp = xfs_btree_key_addr(cur, 1, cblock);
2652         xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));
2653
2654         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
2655 #ifdef DEBUG
2656         for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
2657                 error = xfs_btree_check_ptr(cur, pp, i, level);
2658                 if (error)
2659                         goto error0;
2660         }
2661 #endif
2662         xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
2663
2664 #ifdef DEBUG
2665         error = xfs_btree_check_ptr(cur, &nptr, 0, level);
2666         if (error)
2667                 goto error0;
2668 #endif
             /* Point the root's only remaining entry at the new child block. */
2669         xfs_btree_copy_ptrs(cur, pp, &nptr, 1);
2670
             /* Resize the in-inode root by (1 - number of records copied out). */
2671         xfs_iroot_realloc(cur->bc_private.b.ip,
2672                           1 - xfs_btree_get_numrecs(cblock),
2673                           cur->bc_private.b.whichfork);
2674
2675         xfs_btree_setbuf(cur, level, cbp);
2676
2677         /*
2678          * Do all this logging at the end so that
2679          * the root is at the right level.
2680          */
2681         xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
2682         xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2683         xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
2684
2685         *logflags |=
2686                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork);
2687         *stat = 1;
2688         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2689         return 0;
2690 error0:
2691         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2692         return error;
2693 }
2694
2695 /*
2696  * Allocate a new root block, fill it in.
      *
      * Called once the old root has been split in two.  A block is requested
      * through the ->alloc_block op and installed as the root with ->set_root.
      * The block the cursor points at on the old top level and its sibling are
      * then identified as "left" and "right", and the new root is initialised
      * with two entries: the first key of each of them (built from the first
      * record when they are leaves) together with a pointer to each.
      *
      * On success the cursor gains a level that refers to the new root, with
      * its index set to 1 or 2 depending on which half the cursor was in, and
      * *stat is set to 1.  If the allocation leaves *stat at 0, *stat is 0 on
      * return and 0 is returned.  Otherwise returns the helper's error.
2697  */
2698 STATIC int                              /* error */
2699 xfs_btree_new_root(
2700         struct xfs_btree_cur    *cur,   /* btree cursor */
2701         int                     *stat)  /* success/failure */
2702 {
2703         struct xfs_btree_block  *block; /* one half of the old root block */
2704         struct xfs_buf          *bp;    /* buffer containing block */
2705         int                     error;  /* error return value */
2706         struct xfs_buf          *lbp;   /* left buffer pointer */
2707         struct xfs_btree_block  *left;  /* left btree block */
2708         struct xfs_buf          *nbp;   /* new (root) buffer */
2709         struct xfs_btree_block  *new;   /* new (root) btree block */
2710         int                     nptr;   /* new value for key index, 1 or 2 */
2711         struct xfs_buf          *rbp;   /* right buffer pointer */
2712         struct xfs_btree_block  *right; /* right btree block */
2713         union xfs_btree_ptr     rptr;   /* right block address */
2714         union xfs_btree_ptr     lptr;   /* new root address, then left block address */
2715
2716         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2717         XFS_BTREE_STATS_INC(cur, newroot);
2718
2719         /* initialise our start point from the cursor */
2720         cur->bc_ops->init_ptr_from_cur(cur, &rptr);
2721
2722         /* Allocate the new block. If we can't do it, we're toast. Give up. */
2723         error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, stat);
2724         if (error)
2725                 goto error0;
2726         if (*stat == 0)
2727                 goto out0;
2728         XFS_BTREE_STATS_INC(cur, alloc);
2729
2730         /* Set up the new block. */
2731         error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
2732         if (error)
2733                 goto error0;
2734
2735         /* Set the root in the holding structure, increasing the level by 1. */
2736         cur->bc_ops->set_root(cur, &lptr, 1);
2737
2738         /*
2739          * At the previous root level there are now two blocks: the old root,
2740          * and the new block generated when it was split.  We don't know which
2741          * one the cursor is pointing at, so we set up variables "left" and
2742          * "right" for each case.
2743          */
2744         block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
2745
2746 #ifdef DEBUG
2747         error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
2748         if (error)
2749                 goto error0;
2750 #endif
2751
             /* A non-null right sibling means the cursor sits in the left half. */
2752         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
2753         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
2754                 /* Our block is left, pick up the right block. */
2755                 lbp = bp;
2756                 xfs_btree_buf_to_ptr(cur, lbp, &lptr);
2757                 left = block;
2758                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2759                 if (error)
2760                         goto error0;
2761                 bp = rbp;
2762                 nptr = 1;
2763         } else {
2764                 /* Our block is right, pick up the left block. */
2765                 rbp = bp;
2766                 xfs_btree_buf_to_ptr(cur, rbp, &rptr);
2767                 right = block;
2768                 xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2769                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2770                 if (error)
2771                         goto error0;
2772                 bp = lbp;
2773                 nptr = 2;
2774         }
2775         /* Fill in the new block's btree header and log it. */
2776         xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
2777         xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
2778         ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
2779                         !xfs_btree_ptr_is_null(cur, &rptr));
2780
2781         /*
              * Fill in the key data in the new root: copy the first key of each
              * half, or derive it from the first record when the halves are
              * leaves (level 0).
              */
2782         if (xfs_btree_get_level(left) > 0) {
2783                 xfs_btree_copy_keys(cur,
2784                                 xfs_btree_key_addr(cur, 1, new),
2785                                 xfs_btree_key_addr(cur, 1, left), 1);
2786                 xfs_btree_copy_keys(cur,
2787                                 xfs_btree_key_addr(cur, 2, new),
2788                                 xfs_btree_key_addr(cur, 1, right), 1);
2789         } else {
2790                 cur->bc_ops->init_key_from_rec(
2791                                 xfs_btree_key_addr(cur, 1, new),
2792                                 xfs_btree_rec_addr(cur, 1, left));
2793                 cur->bc_ops->init_key_from_rec(
2794                                 xfs_btree_key_addr(cur, 2, new),
2795                                 xfs_btree_rec_addr(cur, 1, right));
2796         }
2797         xfs_btree_log_keys(cur, nbp, 1, 2);
2798
2799         /* Fill in the pointer data in the new root. */
2800         xfs_btree_copy_ptrs(cur,
2801                 xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
2802         xfs_btree_copy_ptrs(cur,
2803                 xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
2804         xfs_btree_log_ptrs(cur, nbp, 1, 2);
2805
2806         /* Fix up the cursor: attach the new root as one more level on top. */
2807         xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
2808         cur->bc_ptrs[cur->bc_nlevels] = nptr;
2809         cur->bc_nlevels++;
2810         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2811         *stat = 1;
2812         return 0;
2813 error0:
2814         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
2815         return error;
2816 out0:
2817         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2818         *stat = 0;
2819         return 0;
2820 }
2821
2822 STATIC int
2823 xfs_btree_make_block_unfull(
2824         struct xfs_btree_cur    *cur,   /* btree cursor */
2825         int                     level,  /* btree level */
2826         int                     numrecs,/* # of recs in block */
2827         int                     *oindex,/* old tree index */
2828         int                     *index, /* new tree index */
2829         union xfs_btree_ptr     *nptr,  /* new btree ptr */
2830         struct xfs_btree_cur    **ncur, /* new btree cursor */
2831         union xfs_btree_rec     *nrec,  /* new record */
2832         int                     *stat)
2833 {
2834         union xfs_btree_key     key;    /* new btree key value */
2835         int                     error = 0;
2836
2837         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2838             level == cur->bc_nlevels - 1) {
2839                 struct xfs_inode *ip = cur->bc_private.b.ip;
2840
2841                 if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
2842                         /* A root block that can be made bigger. */
2843                         xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
2844                 } else {
2845                         /* A root block that needs replacing */
2846                         int     logflags = 0;
2847
2848                         error = xfs_btree_new_iroot(cur, &logflags, stat);
2849                         if (error || *stat == 0)
2850                                 return error;
2851
2852                         xfs_trans_log_inode(cur->bc_tp, ip, logflags);
2853                 }
2854
2855                 return 0;
2856         }
2857
2858         /* First, try shifting an entry to the right neighbor. */
2859         error = xfs_btree_rshift(cur, level, stat);
2860         if (error || *stat)
2861                 return error;
2862
2863         /* Next, try shifting an entry to the left neighbor. */
2864         error = xfs_btree_lshift(cur, level, stat);
2865         if (error)
2866                 return error;
2867
2868         if (*stat) {
2869                 *oindex = *index = cur->bc_ptrs[level];
2870                 return 0;
2871         }
2872
2873         /*
2874          * Next, try splitting the current block in half.
2875          *
2876          * If this works we have to re-set our variables because we
2877          * could be in a different block now.
2878          */
2879         error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
2880         if (error || *stat == 0)
2881                 return error;
2882
2883
2884         *index = cur->bc_ptrs[level];
2885         cur->bc_ops->init_rec_from_key(&key, nrec);
2886         return 0;
2887 }
2888
2889 /*
2890  * Insert one record/level.  Return information to the caller
2891  * allowing the next level up to proceed if necessary.
2892  */
2893 STATIC int
2894 xfs_btree_insrec(
2895         struct xfs_btree_cur    *cur,   /* btree cursor */
2896         int                     level,  /* level to insert record at */
2897         union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
2898         union xfs_btree_rec     *recp,  /* i/o: record data inserted */
2899         struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
2900         int                     *stat)  /* success/failure */
2901 {
2902         struct xfs_btree_block  *block; /* btree block */
2903         struct xfs_buf          *bp;    /* buffer for block */
2904         union xfs_btree_key     key;    /* btree key */
2905         union xfs_btree_ptr     nptr;   /* new block ptr */
2906         struct xfs_btree_cur    *ncur;  /* new btree cursor */
2907         union xfs_btree_rec     nrec;   /* new record count */
2908         int                     optr;   /* old key/record index */
2909         int                     ptr;    /* key/record index */
2910         int                     numrecs;/* number of records */
2911         int                     error;  /* error return value */
2912 #ifdef DEBUG
2913         int                     i;
2914 #endif
2915
2916         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
2917         XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
2918
2919         ncur = NULL;
2920
2921         /*
2922          * If we have an external root pointer, and we've made it to the
2923          * root level, allocate a new root block and we're done.
2924          */
2925         if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
2926             (level >= cur->bc_nlevels)) {
2927                 error = xfs_btree_new_root(cur, stat);
2928                 xfs_btree_set_ptr_null(cur, ptrp);
2929
2930                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2931                 return error;
2932         }
2933
2934         /* If we're off the left edge, return failure. */
2935         ptr = cur->bc_ptrs[level];
2936         if (ptr == 0) {
2937                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
2938                 *stat = 0;
2939                 return 0;
2940         }
2941
2942         /* Make a key out of the record data to be inserted, and save it. */
2943         cur->bc_ops->init_key_from_rec(&key, recp);
2944
2945         optr = ptr;
2946
2947         XFS_BTREE_STATS_INC(cur, insrec);
2948
2949         /* Get pointers to the btree buffer and block. */
2950         block = xfs_btree_get_block(cur, level, &bp);
2951         numrecs = xfs_btree_get_numrecs(block);
2952
2953 #ifdef DEBUG
2954         error = xfs_btree_check_block(cur, block, level, bp);
2955         if (error)
2956                 goto error0;
2957
2958         /* Check that the new entry is being inserted in the right place. */
2959         if (ptr <= numrecs) {
2960                 if (level == 0) {
2961                         ASSERT(cur->bc_ops->recs_inorder(cur, recp,
2962                                 xfs_btree_rec_addr(cur, ptr, block)));
2963                 } else {
2964                         ASSERT(cur->bc_ops->keys_inorder(cur, &key,
2965                                 xfs_btree_key_addr(cur, ptr, block)));
2966                 }
2967         }
2968 #endif
2969
2970         /*
2971          * If the block is full, we can't insert the new entry until we
2972          * make the block un-full.
2973          */
2974         xfs_btree_set_ptr_null(cur, &nptr);
2975         if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
2976                 error = xfs_btree_make_block_unfull(cur, level, numrecs,
2977                                         &optr, &ptr, &nptr, &ncur, &nrec, stat);
2978                 if (error || *stat == 0)
2979                         goto error0;
2980         }
2981
2982         /*
2983          * The current block may have changed if the block was
2984          * previously full and we have just made space in it.
2985          */
2986         block = xfs_btree_get_block(cur, level, &bp);
2987         numrecs = xfs_btree_get_numrecs(block);
2988
2989 #ifdef DEBUG
2990         error = xfs_btree_check_block(cur, block, level, bp);
2991         if (error)
2992                 return error;
2993 #endif
2994
2995         /*
2996          * At this point we know there's room for our new entry in the block
2997          * we're pointing at.
2998          */
2999         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3000
3001         if (level > 0) {
3002                 /* It's a nonleaf. make a hole in the keys and ptrs */
3003                 union xfs_btree_key     *kp;
3004                 union xfs_btree_ptr     *pp;
3005
3006                 kp = xfs_btree_key_addr(cur, ptr, block);
3007                 pp = xfs_btree_ptr_addr(cur, ptr, block);
3008
3009 #ifdef DEBUG
3010                 for (i = numrecs - ptr; i >= 0; i--) {
3011                         error = xfs_btree_check_ptr(cur, pp, i, level);
3012                         if (error)
3013                                 return error;
3014                 }
3015 #endif
3016
3017                 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3018                 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3019
3020 #ifdef DEBUG
3021                 error = xfs_btree_check_ptr(cur, ptrp, 0, level);
3022                 if (error)
3023                         goto error0;
3024 #endif
3025
3026                 /* Now put the new data in, bump numrecs and log it. */
3027                 xfs_btree_copy_keys(cur, kp, &key, 1);
3028                 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3029                 numrecs++;
3030                 xfs_btree_set_numrecs(block, numrecs);
3031                 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3032                 xfs_btree_log_keys(cur, bp, ptr, numrecs);
3033 #ifdef DEBUG
3034                 if (ptr < numrecs) {
3035                         ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3036                                 xfs_btree_key_addr(cur, ptr + 1, block)));
3037                 }
3038 #endif
3039         } else {
3040                 /* It's a leaf. make a hole in the records */
3041                 union xfs_btree_rec             *rp;
3042
3043                 rp = xfs_btree_rec_addr(cur, ptr, block);
3044
3045                 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3046
3047                 /* Now put the new data in, bump numrecs and log it. */
3048                 xfs_btree_copy_recs(cur, rp, recp, 1);
3049                 xfs_btree_set_numrecs(block, ++numrecs);
3050                 xfs_btree_log_recs(cur, bp, ptr, numrecs);
3051 #ifdef DEBUG
3052                 if (ptr < numrecs) {
3053                         ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3054                                 xfs_btree_rec_addr(cur, ptr + 1, block)));
3055                 }
3056 #endif
3057         }
3058
3059         /* Log the new number of records in the btree header. */
3060         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3061
3062         /* If we inserted at the start of a block, update the parents' keys. */
3063         if (optr == 1) {
3064                 error = xfs_btree_updkey(cur, &key, level + 1);
3065                 if (error)
3066                         goto error0;
3067         }
3068
3069         /*
3070          * If we are tracking the last record in the tree and
3071          * we are at the far right edge of the tree, update it.
3072          */
3073         if (xfs_btree_is_lastrec(cur, block, level)) {
3074                 cur->bc_ops->update_lastrec(cur, block, recp,
3075                                             ptr, LASTREC_INSREC);
3076         }
3077
3078         /*
3079          * Return the new block number, if any.
3080          * If there is one, give back a record value and a cursor too.
3081          */
3082         *ptrp = nptr;
3083         if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3084                 *recp = nrec;
3085                 *curp = ncur;
3086         }
3087
3088         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3089         *stat = 1;
3090         return 0;
3091
3092 error0:
3093         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3094         return error;
3095 }
3096
3097 /*
3098  * Insert the record at the point referenced by cur.
3099  *
3100  * A multi-level split of the tree on insert will invalidate the original
3101  * cursor.  All callers of this function should assume that the cursor is
3102  * no longer valid and revalidate it.
3103  */
3104 int
3105 xfs_btree_insert(
3106         struct xfs_btree_cur    *cur,
3107         int                     *stat)
3108 {
3109         int                     error;  /* error return value */
3110         int                     i;      /* result value, 0 for failure */
3111         int                     level;  /* current level number in btree */
3112         union xfs_btree_ptr     nptr;   /* new block number (split result) */
3113         struct xfs_btree_cur    *ncur;  /* new cursor (split result) */
3114         struct xfs_btree_cur    *pcur;  /* previous level's cursor */
3115         union xfs_btree_rec     rec;    /* record to insert */
3116
3117         level = 0;
3118         ncur = NULL;
3119         pcur = cur;
3120
3121         xfs_btree_set_ptr_null(cur, &nptr);
3122         cur->bc_ops->init_rec_from_cur(cur, &rec);
3123
3124         /*
3125          * Loop going up the tree, starting at the leaf level.
3126          * Stop when we don't get a split block, that must mean that
3127          * the insert is finished with this level.
3128          */
3129         do {
3130                 /*
3131                  * Insert nrec/nptr into this level of the tree.
3132                  * Note if we fail, nptr will be null.
3133                  */
3134                 error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
3135                 if (error) {
3136                         if (pcur != cur)
3137                                 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
3138                         goto error0;
3139                 }
3140
3141                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3142                 level++;
3143
3144                 /*
3145                  * See if the cursor we just used is trash.
3146                  * Can't trash the caller's cursor, but otherwise we should
3147                  * if ncur is a new cursor or we're about to be done.
3148                  */
3149                 if (pcur != cur &&
3150                     (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
3151                         /* Save the state from the cursor before we trash it */
3152                         if (cur->bc_ops->update_cursor)
3153                                 cur->bc_ops->update_cursor(pcur, cur);
3154                         cur->bc_nlevels = pcur->bc_nlevels;
3155                         xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
3156                 }
3157                 /* If we got a new cursor, switch to it. */
3158                 if (ncur) {
3159                         pcur = ncur;
3160                         ncur = NULL;
3161                 }
3162         } while (!xfs_btree_ptr_is_null(cur, &nptr));
3163
3164         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3165         *stat = i;
3166         return 0;
3167 error0:
3168         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3169         return error;
3170 }
3171
3172 /*
3173  * Try to merge a non-leaf block back into the inode root.
3174  *
3175  * Note: the killroot names comes from the fact that we're effectively
3176  * killing the old root block.  But because we can't just delete the
3177  * inode we have to copy the single block it was pointing to into the
3178  * inode.
3179  */
3180 STATIC int
3181 xfs_btree_kill_iroot(
3182         struct xfs_btree_cur    *cur)
3183 {
3184         int                     whichfork = cur->bc_private.b.whichfork;
3185         struct xfs_inode        *ip = cur->bc_private.b.ip;
3186         struct xfs_ifork        *ifp = XFS_IFORK_PTR(ip, whichfork);
3187         struct xfs_btree_block  *block;
3188         struct xfs_btree_block  *cblock;
3189         union xfs_btree_key     *kp;
3190         union xfs_btree_key     *ckp;
3191         union xfs_btree_ptr     *pp;
3192         union xfs_btree_ptr     *cpp;
3193         struct xfs_buf          *cbp;
3194         int                     level;
3195         int                     index;
3196         int                     numrecs;
3197 #ifdef DEBUG
3198         union xfs_btree_ptr     ptr;
3199         int                     i;
3200 #endif
3201
3202         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3203
3204         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
3205         ASSERT(cur->bc_nlevels > 1);
3206
3207         /*
3208          * Don't deal with the root block needs to be a leaf case.
3209          * We're just going to turn the thing back into extents anyway.
3210          */
3211         level = cur->bc_nlevels - 1;
3212         if (level == 1)
3213                 goto out0;
3214
3215         /*
3216          * Give up if the root has multiple children.
3217          */
3218         block = xfs_btree_get_iroot(cur);
3219         if (xfs_btree_get_numrecs(block) != 1)
3220                 goto out0;
3221
3222         cblock = xfs_btree_get_block(cur, level - 1, &cbp);
3223         numrecs = xfs_btree_get_numrecs(cblock);
3224
3225         /*
3226          * Only do this if the next level will fit.
3227          * Then the data must be copied up to the inode,
3228          * instead of freeing the root you free the next level.
3229          */
3230         if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
3231                 goto out0;
3232
3233         XFS_BTREE_STATS_INC(cur, killroot);
3234
3235 #ifdef DEBUG
3236         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
3237         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3238         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
3239         ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
3240 #endif
3241
3242         index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
3243         if (index) {
3244                 xfs_iroot_realloc(cur->bc_private.b.ip, index,
3245                                   cur->bc_private.b.whichfork);
3246                 block = ifp->if_broot;
3247         }
3248
3249         be16_add_cpu(&block->bb_numrecs, index);
3250         ASSERT(block->bb_numrecs == cblock->bb_numrecs);
3251
3252         kp = xfs_btree_key_addr(cur, 1, block);
3253         ckp = xfs_btree_key_addr(cur, 1, cblock);
3254         xfs_btree_copy_keys(cur, kp, ckp, numrecs);
3255
3256         pp = xfs_btree_ptr_addr(cur, 1, block);
3257         cpp = xfs_btree_ptr_addr(cur, 1, cblock);
3258 #ifdef DEBUG
3259         for (i = 0; i < numrecs; i++) {
3260                 int             error;
3261
3262                 error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
3263                 if (error) {
3264                         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3265                         return error;
3266                 }
3267         }
3268 #endif
3269         xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
3270
3271         cur->bc_ops->free_block(cur, cbp);
3272         XFS_BTREE_STATS_INC(cur, free);
3273
3274         cur->bc_bufs[level - 1] = NULL;
3275         be16_add_cpu(&block->bb_level, -1);
3276         xfs_trans_log_inode(cur->bc_tp, ip,
3277                 XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_private.b.whichfork));
3278         cur->bc_nlevels--;
3279 out0:
3280         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3281         return 0;
3282 }
3283
3284 /*
3285  * Kill the current root node, and replace it with it's only child node.
3286  */
3287 STATIC int
3288 xfs_btree_kill_root(
3289         struct xfs_btree_cur    *cur,
3290         struct xfs_buf          *bp,
3291         int                     level,
3292         union xfs_btree_ptr     *newroot)
3293 {
3294         int                     error;
3295
3296         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3297         XFS_BTREE_STATS_INC(cur, killroot);
3298
3299         /*
3300          * Update the root pointer, decreasing the level by 1 and then
3301          * free the old root.
3302          */
3303         cur->bc_ops->set_root(cur, newroot, -1);
3304
3305         error = cur->bc_ops->free_block(cur, bp);
3306         if (error) {
3307                 XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3308                 return error;
3309         }
3310
3311         XFS_BTREE_STATS_INC(cur, free);
3312
3313         cur->bc_bufs[level] = NULL;
3314         cur->bc_ra[level] = 0;
3315         cur->bc_nlevels--;
3316
3317         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3318         return 0;
3319 }
3320
3321 STATIC int
3322 xfs_btree_dec_cursor(
3323         struct xfs_btree_cur    *cur,
3324         int                     level,
3325         int                     *stat)
3326 {
3327         int                     error;
3328         int                     i;
3329
3330         if (level > 0) {
3331                 error = xfs_btree_decrement(cur, level, &i);
3332                 if (error)
3333                         return error;
3334         }
3335
3336         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3337         *stat = 1;
3338         return 0;
3339 }
3340
3341 /*
3342  * Single level of the btree record deletion routine.
3343  * Delete record pointed to by cur/level.
3344  * Remove the record from its block then rebalance the tree.
3345  * Return 0 for error, 1 for done, 2 to go on to the next level.
3346  */
3347 STATIC int                                      /* error */
3348 xfs_btree_delrec(
3349         struct xfs_btree_cur    *cur,           /* btree cursor */
3350         int                     level,          /* level removing record from */
3351         int                     *stat)          /* fail/done/go-on */
3352 {
3353         struct xfs_btree_block  *block;         /* btree block */
3354         union xfs_btree_ptr     cptr;           /* current block ptr */
3355         struct xfs_buf          *bp;            /* buffer for block */
3356         int                     error;          /* error return value */
3357         int                     i;              /* loop counter */
3358         union xfs_btree_key     key;            /* storage for keyp */
3359         union xfs_btree_key     *keyp = &key;   /* passed to the next level */
3360         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3361         struct xfs_buf          *lbp;           /* left buffer pointer */
3362         struct xfs_btree_block  *left;          /* left btree block */
3363         int                     lrecs = 0;      /* left record count */
3364         int                     ptr;            /* key/record index */
3365         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3366         struct xfs_buf          *rbp;           /* right buffer pointer */
3367         struct xfs_btree_block  *right;         /* right btree block */
3368         struct xfs_btree_block  *rrblock;       /* right-right btree block */
3369         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3370         int                     rrecs = 0;      /* right record count */
3371         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3372         int                     numrecs;        /* temporary numrec count */
3373
3374         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3375         XFS_BTREE_TRACE_ARGI(cur, level);
3376
3377         tcur = NULL;
3378
3379         /* Get the index of the entry being deleted, check for nothing there. */
3380         ptr = cur->bc_ptrs[level];
3381         if (ptr == 0) {
3382                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3383                 *stat = 0;
3384                 return 0;
3385         }
3386
3387         /* Get the buffer & block containing the record or key/ptr. */
3388         block = xfs_btree_get_block(cur, level, &bp);
3389         numrecs = xfs_btree_get_numrecs(block);
3390
3391 #ifdef DEBUG
3392         error = xfs_btree_check_block(cur, block, level, bp);
3393         if (error)
3394                 goto error0;
3395 #endif
3396
3397         /* Fail if we're off the end of the block. */
3398         if (ptr > numrecs) {
3399                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3400                 *stat = 0;
3401                 return 0;
3402         }
3403
3404         XFS_BTREE_STATS_INC(cur, delrec);
3405         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3406
3407         /* Excise the entries being deleted. */
3408         if (level > 0) {
3409                 /* It's a nonleaf. operate on keys and ptrs */
3410                 union xfs_btree_key     *lkp;
3411                 union xfs_btree_ptr     *lpp;
3412
3413                 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3414                 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3415
3416 #ifdef DEBUG
3417                 for (i = 0; i < numrecs - ptr; i++) {
3418                         error = xfs_btree_check_ptr(cur, lpp, i, level);
3419                         if (error)
3420                                 goto error0;
3421                 }
3422 #endif
3423
3424                 if (ptr < numrecs) {
3425                         xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3426                         xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3427                         xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3428                         xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3429                 }
3430
3431                 /*
3432                  * If it's the first record in the block, we'll need to pass a
3433                  * key up to the next level (updkey).
3434                  */
3435                 if (ptr == 1)
3436                         keyp = xfs_btree_key_addr(cur, 1, block);
3437         } else {
3438                 /* It's a leaf. operate on records */
3439                 if (ptr < numrecs) {
3440                         xfs_btree_shift_recs(cur,
3441                                 xfs_btree_rec_addr(cur, ptr + 1, block),
3442                                 -1, numrecs - ptr);
3443                         xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3444                 }
3445
3446                 /*
3447                  * If it's the first record in the block, we'll need a key
3448                  * structure to pass up to the next level (updkey).
3449                  */
3450                 if (ptr == 1) {
3451                         cur->bc_ops->init_key_from_rec(&key,
3452                                         xfs_btree_rec_addr(cur, 1, block));
3453                         keyp = &key;
3454                 }
3455         }
3456
3457         /*
3458          * Decrement and log the number of entries in the block.
3459          */
3460         xfs_btree_set_numrecs(block, --numrecs);
3461         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3462
3463         /*
3464          * If we are tracking the last record in the tree and
3465          * we are at the far right edge of the tree, update it.
3466          */
3467         if (xfs_btree_is_lastrec(cur, block, level)) {
3468                 cur->bc_ops->update_lastrec(cur, block, NULL,
3469                                             ptr, LASTREC_DELREC);
3470         }
3471
3472         /*
3473          * We're at the root level.  First, shrink the root block in-memory.
3474          * Try to get rid of the next level down.  If we can't then there's
3475          * nothing left to do.
3476          */
3477         if (level == cur->bc_nlevels - 1) {
3478                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3479                         xfs_iroot_realloc(cur->bc_private.b.ip, -1,
3480                                           cur->bc_private.b.whichfork);
3481
3482                         error = xfs_btree_kill_iroot(cur);
3483                         if (error)
3484                                 goto error0;
3485
3486                         error = xfs_btree_dec_cursor(cur, level, stat);
3487                         if (error)
3488                                 goto error0;
3489                         *stat = 1;
3490                         return 0;
3491                 }
3492
3493                 /*
3494                  * If this is the root level, and there's only one entry left,
3495                  * and it's NOT the leaf level, then we can get rid of this
3496                  * level.
3497                  */
3498                 if (numrecs == 1 && level > 0) {
3499                         union xfs_btree_ptr     *pp;
3500                         /*
3501                          * pp is still set to the first pointer in the block.
3502                          * Make it the new root of the btree.
3503                          */
3504                         pp = xfs_btree_ptr_addr(cur, 1, block);
3505                         error = xfs_btree_kill_root(cur, bp, level, pp);
3506                         if (error)
3507                                 goto error0;
3508                 } else if (level > 0) {
3509                         error = xfs_btree_dec_cursor(cur, level, stat);
3510                         if (error)
3511                                 goto error0;
3512                 }
3513                 *stat = 1;
3514                 return 0;
3515         }
3516
3517         /*
3518          * If we deleted the leftmost entry in the block, update the
3519          * key values above us in the tree.
3520          */
3521         if (ptr == 1) {
3522                 error = xfs_btree_updkey(cur, keyp, level + 1);
3523                 if (error)
3524                         goto error0;
3525         }
3526
3527         /*
3528          * If the number of records remaining in the block is at least
3529          * the minimum, we're done.
3530          */
3531         if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
3532                 error = xfs_btree_dec_cursor(cur, level, stat);
3533                 if (error)
3534                         goto error0;
3535                 return 0;
3536         }
3537
3538         /*
3539          * Otherwise, we have to move some records around to keep the
3540          * tree balanced.  Look at the left and right sibling blocks to
3541          * see if we can re-balance by moving only one record.
3542          */
3543         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
3544         xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
3545
3546         if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
3547                 /*
3548                  * One child of root, need to get a chance to copy its contents
3549                  * into the root and delete it. Can't go up to next level,
3550                  * there's nothing to delete there.
3551                  */
3552                 if (xfs_btree_ptr_is_null(cur, &rptr) &&
3553                     xfs_btree_ptr_is_null(cur, &lptr) &&
3554                     level == cur->bc_nlevels - 2) {
3555                         error = xfs_btree_kill_iroot(cur);
3556                         if (!error)
3557                                 error = xfs_btree_dec_cursor(cur, level, stat);
3558                         if (error)
3559                                 goto error0;
3560                         return 0;
3561                 }
3562         }
3563
3564         ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
3565                !xfs_btree_ptr_is_null(cur, &lptr));
3566
3567         /*
3568          * Duplicate the cursor so our btree manipulations here won't
3569          * disrupt the next level up.
3570          */
3571         error = xfs_btree_dup_cursor(cur, &tcur);
3572         if (error)
3573                 goto error0;
3574
3575         /*
3576          * If there's a right sibling, see if it's ok to shift an entry
3577          * out of it.
3578          */
3579         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
3580                 /*
3581                  * Move the temp cursor to the last entry in the next block.
3582                  * Actually any entry but the first would suffice.
3583                  */
3584                 i = xfs_btree_lastrec(tcur, level);
3585                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3586
3587                 error = xfs_btree_increment(tcur, level, &i);
3588                 if (error)
3589                         goto error0;
3590                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3591
3592                 i = xfs_btree_lastrec(tcur, level);
3593                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3594
3595                 /* Grab a pointer to the block. */
3596                 right = xfs_btree_get_block(tcur, level, &rbp);
3597 #ifdef DEBUG
3598                 error = xfs_btree_check_block(tcur, right, level, rbp);
3599                 if (error)
3600                         goto error0;
3601 #endif
3602                 /* Grab the current block number, for future use. */
3603                 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
3604
3605                 /*
3606                  * If right block is full enough so that removing one entry
3607                  * won't make it too empty, and left-shifting an entry out
3608                  * of right to us works, we're done.
3609                  */
3610                 if (xfs_btree_get_numrecs(right) - 1 >=
3611                     cur->bc_ops->get_minrecs(tcur, level)) {
3612                         error = xfs_btree_lshift(tcur, level, &i);
3613                         if (error)
3614                                 goto error0;
3615                         if (i) {
3616                                 ASSERT(xfs_btree_get_numrecs(block) >=
3617                                        cur->bc_ops->get_minrecs(tcur, level));
3618
3619                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3620                                 tcur = NULL;
3621
3622                                 error = xfs_btree_dec_cursor(cur, level, stat);
3623                                 if (error)
3624                                         goto error0;
3625                                 return 0;
3626                         }
3627                 }
3628
3629                 /*
3630                  * Otherwise, grab the number of records in right for
3631                  * future reference, and fix up the temp cursor to point
3632                  * to our block again (last record).
3633                  */
3634                 rrecs = xfs_btree_get_numrecs(right);
3635                 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3636                         i = xfs_btree_firstrec(tcur, level);
3637                         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3638
3639                         error = xfs_btree_decrement(tcur, level, &i);
3640                         if (error)
3641                                 goto error0;
3642                         XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3643                 }
3644         }
3645
3646         /*
3647          * If there's a left sibling, see if it's ok to shift an entry
3648          * out of it.
3649          */
3650         if (!xfs_btree_ptr_is_null(cur, &lptr)) {
3651                 /*
3652                  * Move the temp cursor to the first entry in the
3653                  * previous block.
3654                  */
3655                 i = xfs_btree_firstrec(tcur, level);
3656                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3657
3658                 error = xfs_btree_decrement(tcur, level, &i);
3659                 if (error)
3660                         goto error0;
3661                 i = xfs_btree_firstrec(tcur, level);
3662                 XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, error0);
3663
3664                 /* Grab a pointer to the block. */
3665                 left = xfs_btree_get_block(tcur, level, &lbp);
3666 #ifdef DEBUG
3667                 error = xfs_btree_check_block(cur, left, level, lbp);
3668                 if (error)
3669                         goto error0;
3670 #endif
3671                 /* Grab the current block number, for future use. */
3672                 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
3673
3674                 /*
3675                  * If left block is full enough so that removing one entry
3676                  * won't make it too empty, and right-shifting an entry out
3677                  * of left to us works, we're done.
3678                  */
3679                 if (xfs_btree_get_numrecs(left) - 1 >=
3680                     cur->bc_ops->get_minrecs(tcur, level)) {
3681                         error = xfs_btree_rshift(tcur, level, &i);
3682                         if (error)
3683                                 goto error0;
3684                         if (i) {
3685                                 ASSERT(xfs_btree_get_numrecs(block) >=
3686                                        cur->bc_ops->get_minrecs(tcur, level));
3687                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3688                                 tcur = NULL;
3689                                 if (level == 0)
3690                                         cur->bc_ptrs[0]++;
3691                                 XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3692                                 *stat = 1;
3693                                 return 0;
3694                         }
3695                 }
3696
3697                 /*
3698                  * Otherwise, grab the number of records in right for
3699                  * future reference.
3700                  */
3701                 lrecs = xfs_btree_get_numrecs(left);
3702         }
3703
3704         /* Delete the temp cursor, we're done with it. */
3705         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
3706         tcur = NULL;
3707
3708         /* If here, we need to do a join to keep the tree balanced. */
3709         ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
3710
3711         if (!xfs_btree_ptr_is_null(cur, &lptr) &&
3712             lrecs + xfs_btree_get_numrecs(block) <=
3713                         cur->bc_ops->get_maxrecs(cur, level)) {
3714                 /*
3715                  * Set "right" to be the starting block,
3716                  * "left" to be the left neighbor.
3717                  */
3718                 rptr = cptr;
3719                 right = block;
3720                 rbp = bp;
3721                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
3722                 if (error)
3723                         goto error0;
3724
3725         /*
3726          * If that won't work, see if we can join with the right neighbor block.
3727          */
3728         } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
3729                    rrecs + xfs_btree_get_numrecs(block) <=
3730                         cur->bc_ops->get_maxrecs(cur, level)) {
3731                 /*
3732                  * Set "left" to be the starting block,
3733                  * "right" to be the right neighbor.
3734                  */
3735                 lptr = cptr;
3736                 left = block;
3737                 lbp = bp;
3738                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
3739                 if (error)
3740                         goto error0;
3741
3742         /*
3743          * Otherwise, we can't fix the imbalance.
3744          * Just return.  This is probably a logic error, but it's not fatal.
3745          */
3746         } else {
3747                 error = xfs_btree_dec_cursor(cur, level, stat);
3748                 if (error)
3749                         goto error0;
3750                 return 0;
3751         }
3752
3753         rrecs = xfs_btree_get_numrecs(right);
3754         lrecs = xfs_btree_get_numrecs(left);
3755
3756         /*
3757          * We're now going to join "left" and "right" by moving all the stuff
3758          * in "right" to "left" and deleting "right".
3759          */
3760         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
3761         if (level > 0) {
3762                 /* It's a non-leaf.  Move keys and pointers. */
3763                 union xfs_btree_key     *lkp;   /* left btree key */
3764                 union xfs_btree_ptr     *lpp;   /* left address pointer */
3765                 union xfs_btree_key     *rkp;   /* right btree key */
3766                 union xfs_btree_ptr     *rpp;   /* right address pointer */
3767
3768                 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
3769                 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
3770                 rkp = xfs_btree_key_addr(cur, 1, right);
3771                 rpp = xfs_btree_ptr_addr(cur, 1, right);
3772 #ifdef DEBUG
3773                 for (i = 1; i < rrecs; i++) {
3774                         error = xfs_btree_check_ptr(cur, rpp, i, level);
3775                         if (error)
3776                                 goto error0;
3777                 }
3778 #endif
3779                 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
3780                 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
3781
3782                 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
3783                 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
3784         } else {
3785                 /* It's a leaf.  Move records.  */
3786                 union xfs_btree_rec     *lrp;   /* left record pointer */
3787                 union xfs_btree_rec     *rrp;   /* right record pointer */
3788
3789                 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
3790                 rrp = xfs_btree_rec_addr(cur, 1, right);
3791
3792                 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
3793                 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
3794         }
3795
3796         XFS_BTREE_STATS_INC(cur, join);
3797
3798         /*
3799          * Fix up the number of records and right block pointer in the
3800          * surviving block, and log it.
3801          */
3802         xfs_btree_set_numrecs(left, lrecs + rrecs);
3803         xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
3804         xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3805         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
3806
3807         /* If there is a right sibling, point it to the remaining block. */
3808         xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
3809         if (!xfs_btree_ptr_is_null(cur, &cptr)) {
3810                 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
3811                 if (error)
3812                         goto error0;
3813                 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
3814                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
3815         }
3816
3817         /* Free the deleted block. */
3818         error = cur->bc_ops->free_block(cur, rbp);
3819         if (error)
3820                 goto error0;
3821         XFS_BTREE_STATS_INC(cur, free);
3822
3823         /*
3824          * If we joined with the left neighbor, set the buffer in the
3825          * cursor to the left block, and fix up the index.
3826          */
3827         if (bp != lbp) {
3828                 cur->bc_bufs[level] = lbp;
3829                 cur->bc_ptrs[level] += lrecs;
3830                 cur->bc_ra[level] = 0;
3831         }
3832         /*
3833          * If we joined with the right neighbor and there's a level above
3834          * us, increment the cursor at that level.
3835          */
3836         else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
3837                    (level + 1 < cur->bc_nlevels)) {
3838                 error = xfs_btree_increment(cur, level + 1, &i);
3839                 if (error)
3840                         goto error0;
3841         }
3842
3843         /*
3844          * Readjust the ptr at this level if it's not a leaf, since it's
3845          * still pointing at the deletion point, which makes the cursor
3846          * inconsistent.  If this makes the ptr 0, the caller fixes it up.
3847          * We can't use decrement because it would change the next level up.
3848          */
3849         if (level > 0)
3850                 cur->bc_ptrs[level]--;
3851
3852         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3853         /* Return value means the next level up has something to do. */
3854         *stat = 2;
3855         return 0;
3856
3857 error0:
3858         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3859         if (tcur)
3860                 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
3861         return error;
3862 }
3863
3864 /*
3865  * Delete the record pointed to by cur.
3866  * The cursor refers to the place where the record was (could be inserted)
3867  * when the operation returns.
3868  */
3869 int                                     /* error */
3870 xfs_btree_delete(
3871         struct xfs_btree_cur    *cur,
3872         int                     *stat)  /* success/failure */
3873 {
3874         int                     error;  /* error return value */
3875         int                     level;
3876         int                     i;
3877
3878         XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
3879
3880         /*
3881          * Go up the tree, starting at leaf level.
3882          *
3883          * If 2 is returned then a join was done; go to the next level.
3884          * Otherwise we are done.
3885          */
3886         for (level = 0, i = 2; i == 2; level++) {
3887                 error = xfs_btree_delrec(cur, level, &i);
3888                 if (error)
3889                         goto error0;
3890         }
3891
3892         if (i == 0) {
3893                 for (level = 1; level < cur->bc_nlevels; level++) {
3894                         if (cur->bc_ptrs[level] == 0) {
3895                                 error = xfs_btree_decrement(cur, level, &i);
3896                                 if (error)
3897                                         goto error0;
3898                                 break;
3899                         }
3900                 }
3901         }
3902
3903         XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
3904         *stat = i;
3905         return 0;
3906 error0:
3907         XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
3908         return error;
3909 }
3910
3911 /*
3912  * Get the data from the pointed-to record.
3913  */
3914 int                                     /* error */
3915 xfs_btree_get_rec(
3916         struct xfs_btree_cur    *cur,   /* btree cursor */
3917         union xfs_btree_rec     **recp, /* output: btree record */
3918         int                     *stat)  /* output: success/failure */
3919 {
3920         struct xfs_btree_block  *block; /* btree block */
3921         struct xfs_buf          *bp;    /* buffer pointer */
3922         int                     ptr;    /* record number */
3923 #ifdef DEBUG
3924         int                     error;  /* error return value */
3925 #endif
3926
3927         ptr = cur->bc_ptrs[0];
3928         block = xfs_btree_get_block(cur, 0, &bp);
3929
3930 #ifdef DEBUG
3931         error = xfs_btree_check_block(cur, block, 0, bp);
3932         if (error)
3933                 return error;
3934 #endif
3935
3936         /*
3937          * Off the right end or left end, return failure.
3938          */
3939         if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
3940                 *stat = 0;
3941                 return 0;
3942         }
3943
3944         /*
3945          * Point to the record and extract its data.
3946          */
3947         *recp = xfs_btree_rec_addr(cur, ptr, block);
3948         *stat = 1;
3949         return 0;
3950 }
3951
/*
 * Change the owner of a btree.
 *
 * The mechanism we use here is ordered buffer logging. Because we don't know
 * how many buffers we are going to need to modify, we don't really want to
 * have to make transaction reservations for the worst case of every buffer in a
 * full size btree as that may be more space than we can fit in the log....
 *
 * We do the btree walk in the most efficient manner possible - we have sibling
 * pointers so we can just walk all the blocks on each level from left to right
 * in a single pass, and then move to the next level and do the same. We can
 * also do readahead on the sibling pointers to get IO moving more quickly,
 * though for slow disks this is unlikely to make much difference to performance
 * as the amount of CPU work we have to do before moving to the next block is
 * relatively small.
 *
 * For each btree block that we load, modify the owner appropriately, set the
 * buffer as an ordered buffer and log it appropriately. We need to ensure that
 * we mark the region we change dirty so that if the buffer is relogged in
 * a subsequent transaction the changes we make here as an ordered buffer are
 * correctly relogged in that transaction.  If we are in recovery context, then
 * just queue the modified buffer as delayed write buffer so the transaction
 * recovery completion writes the changes to disk.
 */
3976 static int
3977 xfs_btree_block_change_owner(
3978         struct xfs_btree_cur    *cur,
3979         int                     level,
3980         __uint64_t              new_owner,
3981         struct list_head        *buffer_list)
3982 {
3983         struct xfs_btree_block  *block;
3984         struct xfs_buf          *bp;
3985         union xfs_btree_ptr     rptr;
3986
3987         /* do right sibling readahead */
3988         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
3989
3990         /* modify the owner */
3991         block = xfs_btree_get_block(cur, level, &bp);
3992         if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
3993                 block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
3994         else
3995                 block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
3996
3997         /*
3998          * If the block is a root block hosted in an inode, we might not have a
3999          * buffer pointer here and we shouldn't attempt to log the change as the
4000          * information is already held in the inode and discarded when the root
4001          * block is formatted into the on-disk inode fork. We still change it,
4002          * though, so everything is consistent in memory.
4003          */
4004         if (bp) {
4005                 if (cur->bc_tp) {
4006                         xfs_trans_ordered_buf(cur->bc_tp, bp);
4007                         xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4008                 } else {
4009                         xfs_buf_delwri_queue(bp, buffer_list);
4010                 }
4011         } else {
4012                 ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
4013                 ASSERT(level == cur->bc_nlevels - 1);
4014         }
4015
4016         /* now read rh sibling block for next iteration */
4017         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4018         if (xfs_btree_ptr_is_null(cur, &rptr))
4019                 return -ENOENT;
4020
4021         return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4022 }
4023
4024 int
4025 xfs_btree_change_owner(
4026         struct xfs_btree_cur    *cur,
4027         __uint64_t              new_owner,
4028         struct list_head        *buffer_list)
4029 {
4030         union xfs_btree_ptr     lptr;
4031         int                     level;
4032         struct xfs_btree_block  *block = NULL;
4033         int                     error = 0;
4034
4035         cur->bc_ops->init_ptr_from_cur(cur, &lptr);
4036
4037         /* for each level */
4038         for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4039                 /* grab the left hand block */
4040                 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4041                 if (error)
4042                         return error;
4043
4044                 /* readahead the left most block for the next level down */
4045                 if (level > 0) {
4046                         union xfs_btree_ptr     *ptr;
4047
4048                         ptr = xfs_btree_ptr_addr(cur, 1, block);
4049                         xfs_btree_readahead_ptr(cur, ptr, 1);
4050
4051                         /* save for the next iteration of the loop */
4052                         lptr = *ptr;
4053                 }
4054
4055                 /* for each buffer in the level */
4056                 do {
4057                         error = xfs_btree_block_change_owner(cur, level,
4058                                                              new_owner,
4059                                                              buffer_list);
4060                 } while (!error);
4061
4062                 if (error != -ENOENT)
4063                         return error;
4064         }
4065
4066         return 0;
4067 }